def catchEco(cursor, conn):
    for href in href_list:
        currenthref = href
        # Start looping through the pages
        count = 0
        while currenthref != '' \
                and currenthref != 'javascript:void(0);' \
                and currenthref != "javascript:alert('没有了');" \
                and count < 200:
            htmlcode = GetHTML.getHtml(currenthref)
            soup = BeautifulSoup(htmlcode, "html.parser")
            detail_list = soup.find_all('h3')
            for h in detail_list:
                GetWebContent.catch(unicode(h.a.get('href')), 'Economy', cursor, conn)
            div_nextpage = soup.find_all('a', class_="a_ts02")
            if len(div_nextpage) != 0:
                currenthref = div_nextpage[0].get('href')
            else:
                currenthref = ''
            count += 1
    # Also pick up economy articles linked from the home pages
    for href in href_home:
        htmlcode = GetHTML.getHtml(href)
        soup = BeautifulSoup(htmlcode, "html.parser")
        list1 = soup.find_all('a', href=re.compile('http://finance.ifeng.com/a/'))
        for item in list1:
            if unicode(item.get("href"))[0] == 'h':
                GetWebContent.catch(unicode(item.get('href')), 'Economy', cursor, conn)
def GetLinkEpisode(soup):
    ep = soup.find("a", {"class": "active"})
    AnimeID = soup.find("input", {"id": "movie_id"})['value']
    # Ask the gogocdn ajax endpoint for the full episode list of this anime
    link = f"https://ajax.gogocdn.net/ajax/load-list-episode?ep_start=0&ep_end=10000&id={AnimeID}"
    soup = GetHTML(link)
    AnimeList = ["https://www19.gogoanime.io" + i['href'].lstrip() for i in soup.find_all("a")]
    return AnimeID, AnimeList
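# Several functions in this section call a GetHTML(link) helper that is not defined here;
# judging from how the return value is used (soup.find / soup.find_all), it fetches a page
# and returns a BeautifulSoup tree. A minimal sketch of that variant, assuming it simply
# wraps requests.get -- the real helper may add headers, retries, or caching. (Other parts
# of this section use different helpers of the same name, e.g. GetHTML.getHtml and
# GetHTML(url, "href", "a").get_links(), which this sketch does not cover.)
import requests
from bs4 import BeautifulSoup

def GetHTML(link):
    # Download the page and parse it; "html.parser" is an assumption, the project may use lxml
    response = requests.get(link, timeout=10)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")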
def get_thread_URL(self):
    """Get the thread URLs listed on the catalog page."""
    thread_links = GetHTML(self.url_catalog, "href", "a").get_links()
    # Keep only links that point to thread pages ("res" in the path)
    for link in thread_links[:]:
        if "res" not in link:
            thread_links.remove(link)
    return thread_links
def GetEpisodeMethodeDICO(link):
    soup = GetHTML(link)
    # Find the inline <script> that defines the "multilinks" variable
    scripts = soup.find_all("script")
    scripts = [str(s) for s in scripts if "multilinks" in str(s)]
    embed = scripts[0]
    # Strip the script tags and the assignment, keeping only the JSON literal
    embed = embed.replace('<script type="text/javascript">', '')
    embed = embed.replace('</script>', '')
    embed = embed.replace('var multilinks = ', '')
    embed = embed.split(";")[0]
    Dico = json.loads(embed)[0]
    return Dico
def catchSports(cursor, conn):
    for href in href_list:
        currenthref = href
        # Start looping through the pages
        count = 0
        while currenthref != '' \
                and currenthref != 'javascript:void(0);' \
                and currenthref != "javascript:alert('没有了');" \
                and count < 50:
            htmlcode = GetHTML.getHtml(currenthref)
            soup = BeautifulSoup(htmlcode, "html.parser")
            title_list = soup.find_all('div', class_='box_list clearfix')
            for t in title_list:
                # "wemedia" articles use a different page layout, so tag them separately
                if unicode(t.a.get('href')).endswith('wemedia.shtml'):
                    GetWebContent.catch(unicode(t.a.get('href')), 'Sports_wemedia', cursor, conn)
                else:
                    GetWebContent.catch(unicode(t.a.get('href')), 'Sports', cursor, conn)
            div_nextpage = soup.find_all('a', id='pagenext')
            if len(div_nextpage) != 0:
                currenthref = div_nextpage[0].get('href')
            else:
                currenthref = ''
            count += 1
def catchSociety(cursor, conn):
    try:
        for href in href_list:
            currenthref = href
            # Start looping through the pages
            count = 0
            while currenthref != '' \
                    and currenthref != 'javascript:void(0);' \
                    and currenthref != "javascript:alert('没有了');" \
                    and count < 100:
                htmlcode = GetHTML.getHtml(currenthref)
                soup = BeautifulSoup(htmlcode, "html.parser")
                li_list = soup.find_all('div', class_='newsList')[0].find_all('li')
                for li in li_list:
                    GetWebContent.catch(unicode(li.a.get('href')), 'Society', cursor, conn)
                div_nextpage = soup.find_all('div', id='backDay')
                if len(div_nextpage) != 0:
                    currenthref = div_nextpage[0].a.get('href')
                else:
                    currenthref = ''
                count += 1
    except BaseException as e:
        print e
def get_saison_anime_dbanimes(PGname):
    link = "https://dbanimes.com/anime/" + escape(PGname)
    soup = GetHTML(link)
    AnimeList = dbanimes.GetEpisodeDB(soup)
    result = dbanimes.GetEpisodeMethodeDB(AnimeList)
    result = sorted(result, key=lambda anime: anime['index'])
    return jsonify({"Page_name": PGname, "Link": link, "Episode": result})
def GetEpisodeAsync(link, key, Dico):
    soup = GetHTML(link)
    embed = soup.find("div", {"class": "anime_muti_link"})
    embed = embed.find_all('li')
    title = soup.find("div", {"class": "title_name"}).text.strip()
    methode = []
    for i in embed:
        link = i.a['data-video']
        # Protocol-relative URLs ("//host/...") need an explicit scheme
        if link[:2] == "//":
            link = "http:" + link
        methode.append({
            "link": link,
            "title": i.a.text.replace("Choose this server", "")
        })
    Dico.append({
        "index": key,
        "title": title,
        "methode": methode
    })
def asyncGet(key, link, Dico):
    soup = GetHTML(link)
    title = soup.find("li", {"class": "breadcrumb-item active"}).text
    methodeDIC = []
    methode = soup.find_all("div", {"class": "player"})
    for i in methode:
        dataURL = i['data-url']
        # Skip empty or placeholder data-url values
        if dataURL not in ("", " ", "None"):
            linkMethode = BeautifulSoup(dataURL, "html.parser").find("iframe")['src']
            # Protocol-relative URLs ("//host/...") need an explicit scheme
            if linkMethode[:2] == "//":
                linkMethode = "http:" + linkMethode
        else:
            linkMethode = "link/methode/not_find"
        methodeDIC.append({
            "link": linkMethode,
            "title": linkMethode.split("/")[2]
        })
    Dico.append({"index": key, "title": title, "methode": methodeDIC})
def catchTech(cursor, conn):
    currenthref = 'http://tech.ifeng.com/listpage/803/1/list.shtml'
    # Start looping through the pages
    count = 0
    while currenthref != '' \
            and currenthref != 'javascript:void(0);' \
            and currenthref != "javascript:alert('没有了');" \
            and count < 70:
        htmlcode = GetHTML.getHtml(currenthref)
        soup = BeautifulSoup(htmlcode, "html.parser")
        title_list = soup.find_all('div', class_='zheng_list pl10 box')
        for t in title_list:
            GetWebContent.catch(unicode(t.a.get('href')), 'Tech', cursor, conn)
        div_nextpage = soup.find_all('a', id='pagenext')
        if len(div_nextpage) != 0:
            currenthref = div_nextpage[0].get('href')
        else:
            currenthref = ''
        count += 1

    currenthref = 'http://tech.ifeng.com/listpage/26334/1/list.shtml'
    # Start looping through the pages of the second tech list
    count = 0
    while currenthref != '' \
            and currenthref != 'javascript:void(0);' \
            and currenthref != "javascript:alert('没有了');" \
            and count < 50:
        htmlcode = GetHTML.getHtml(currenthref)
        soup = BeautifulSoup(htmlcode, "html.parser")
        title_list = soup.find_all('div', class_='box_list clearfix')
        for t in title_list:
            GetWebContent.catch(unicode(t.a.get('href')), 'Tech', cursor, conn)
        div_nextpage = soup.find_all('a', id='pagenext')
        if len(div_nextpage) != 0:
            currenthref = div_nextpage[0].get('href')
        else:
            currenthref = ''
        count += 1
def catch(url, kind, cursor, conn):
    """Download one article page and save its text under the kind's directory."""
    try:
        if url == '':
            return
        htmlcode = GetHTML.getHtml(url)
        soup = BeautifulSoup(htmlcode, "html.parser")
        title_soup = soup.title
        topic = ''
        if title_soup is not None:
            topic = unicode(title_soup.string)
            rstr = r"[\/\\\:\*\?\"\<\>\|]"
            topic = re.sub(rstr, "", topic)  # strip characters that are illegal in file names
        else:
            return
        kind_name = getkindChineseName(kind)
        out_path = 'fenghaungnews'
        filepath = u"../data/%s/%s/" % (out_path, kind_name)
        if not os.path.exists(filepath):
            os.makedirs(filepath)
        content = ''
        if kind == 'Sports_wemedia':
            # "wemedia" sports pages keep the article text inside div.yc_con_txt
            for child in soup.find_all('div', class_="yc_con_txt")[0].descendants:
                if child.name == 'p' and child.string != None:
                    content = content + unicode(child.string)
        else:
            content_list = soup.find_all('div', id='main_content')
            if content_list == []:
                return
            for child in content_list[0].descendants:
                if child.string != None \
                        and unicode(child)[0] != '<' \
                        and type(child.string) != bs4.element.Comment \
                        and unicode(child.string) != '\n':
                    content = content + unicode(child.string) + u'\n'
        # cursor.execute("INSERT INTO web_page(article_class, article_content) VALUES(%s ,%s);",
        #                (kind_name.encode('utf-8'), content.encode('utf-8')))
        # conn.commit()
        with open(filepath + topic + '.txt', 'w') as f:
            f.write(content.encode('utf-8'))
        print kind_name, ' : ', url
        print
    except BaseException as e:
        dealException1(url, kind, e)
def get_thumbnail_URL(self, thread_links):
    """Get a (thread URL, thumbnail URL) pair for each thread."""
    t_url = []
    for link in thread_links[:]:
        link = "http://img.2chan.net/b/" + link
        thumbnail_links = GetHTML(link, "href", "a").get_links()
        # Keep only image links (jpg / png)
        for i in thumbnail_links[:]:
            if "jpg" not in i and "png" not in i:
                thumbnail_links.remove(i)
        # Pair the thread URL with its first thumbnail, once per thumbnail found
        for idx, thumbnail in enumerate(thumbnail_links):
            tpl = (link, "http://img.2chan.net" + thumbnail_links[0])
            t_url.insert(idx, tpl)
    return t_url
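# get_thread_URL and get_thumbnail_URL use yet another GetHTML helper: a class constructed
# with (url, attribute, tag) that exposes get_links(). A minimal sketch, assuming it returns
# the given attribute of every matching tag -- the name and behaviour are inferred from the
# call sites, not confirmed by the source:
import requests
from bs4 import BeautifulSoup

class GetHTML(object):
    def __init__(self, url, attribute, tag):
        self.url = url
        self.attribute = attribute
        self.tag = tag

    def get_links(self):
        # Fetch the page and collect the requested attribute of every matching tag
        soup = BeautifulSoup(requests.get(self.url, timeout=10).text, "html.parser")
        return [t.get(self.attribute) for t in soup.find_all(self.tag) if t.get(self.attribute)]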
def catchGames(cursor, conn):
    for href in href_home:
        htmlcode = GetHTML.getHtml(href)
        soup = BeautifulSoup(htmlcode, "html.parser")
        list1 = soup.find_all('a', href=re.compile('http://games.ifeng.com/a/'))
        for item in list1:
            if unicode(item.get("href"))[0] == 'h':
                GetWebContent.catch(unicode(item.get('href')), 'Games', cursor, conn)
def get_saison_anime_animeultime(PGname):
    link = "https://v5.anime-ultime.net/" + PGname + ".html"
    soup = GetHTML(link)
    AnimeID, ListID = animeultime.GetIdEpisode(soup)
    result = animeultime.jsonRequest(AnimeID, ListID)
    result = sorted(result, key=lambda anime: anime['index'])
    return jsonify({
        "Page_name": PGname,
        "AnimeID": AnimeID,
        "Link": link,
        "Episode": result
    })
def get_saison_anime_gogoanime(PGname):
    link = "https://www19.gogoanime.io/category/" + escape(PGname)
    soup = GetHTML(link)
    AnimeID, AnimeList = gogoanime.GetLinkEpisode(soup)
    result = gogoanime.GetEpisodeMethodeGOGO(AnimeList)
    result = sorted(result, key=lambda anime: anime['index'])
    return jsonify({
        "Page_name": PGname,
        "AnimeID": AnimeID,
        "Link": link,
        "Episode": result
    })
def index():
    username = None
    password = None
    email = None
    form = loginForm()
    if form.validate_on_submit():
        try:
            course_info = GetHTML.htmlHandle(form.username.data, form.password.data)
            timesold = course_info[2]
            names = course_info[1]
            course_info[2] = Formatter.formatTimes(course_info[2])
            sdates = {'M': '26', 'T': '20', 'W': '21', 'R': '22', 'F': '23'}
            # Map each course name to the number of time slots it occupies
            snames = {}
            for i in range(len(timesold)):
                snames[names[i]] = len(timesold[i])
            # Expand that mapping so every individual slot index points at its course name
            dnames = {}
            k = 0
            for j in range(7):
                for i in range(snames[names[j]]):
                    dnames[k] = names[j]
                    k += 1
            filename = 'OliniCalendar.ics'
            iCalCreation.iCalWrite(course_info[2], "201501", "20150430T000000", sdates, dnames, filename)
            ical = open('OliniCalendar.ics', 'r')
            email = form.email.data
            Emailer.iCalCreator(email, ical)
            os.remove('OliniCalendar.ics')
            # return render_template(html_sched)
            form.username.data = ''
            form.password.data = ''
            form.email.data = ''
        except Exception:
            # "Oops" -- errors are silently ignored
            pass
    return render_template('index.html', form=form)
def GetFreeVPNPass(FreeVPNdomain):
    # Import modules
    import os
    import GetHTML
    from bs4 import BeautifulSoup

    # Assign variables
    d = str(FreeVPNdomain)
    urlcat = "https://freevpn." + d + "/accounts/"
    url = GetHTML.GetSource(urlcat)

    # Write files for temporary data storage
    source = open('freevpn.html', 'w')
    source.write(str(url))
    source.close()
    f = open('creds.txt', 'w')

    # Parse the downloaded HTML source code
    with open("freevpn.html") as fp:
        soup = BeautifulSoup(fp, "lxml")

    # Search the parsed code for the login credentials
    for tag in soup.find_all('b'):
        f.write(str(tag.text) + "\n")
        f.write(str(tag.next_sibling) + "\n")

    # Delete the HTML source code
    os.remove('freevpn.html')
    f.close()

    # Read the credentials back and extract the password field
    CredList = open('creds.txt', 'r').readlines()
    VPN_Pass = CredList[5]

    # Clean up the credential list file
    os.remove('creds.txt')

    # Return the password
    return str(VPN_Pass.strip())
def catchArmy(cursor, conn):
    for href in href_list:
        currenthref = href
        # Start looping through the pages
        count = 0
        while currenthref != '' \
                and currenthref != 'javascript:void(0);' \
                and currenthref != "javascript:alert('没有了');" \
                and count < 50:
            htmlcode = GetHTML.getHtml(currenthref)
            soup = BeautifulSoup(htmlcode, "html.parser")
            title_list = soup.find_all('div', class_='comListBox')
            for t in title_list:
                GetWebContent.catch(unicode(t.a.get('href')), 'Army', cursor, conn)
            div_nextpage = soup.find_all('a', id='pagenext')
            if len(div_nextpage) != 0:
                currenthref = div_nextpage[0].get('href')
            else:
                currenthref = ''
            count += 1
def catchInterNation(cursor, conn):
    for href in href_list:
        currenthref = href
        # Start looping through the pages
        count = 0
        while currenthref != '' \
                and currenthref != 'javascript:void(0);' \
                and currenthref != "javascript:alert('没有了');" \
                and count < 100:
            htmlcode = GetHTML.getHtml(currenthref)
            soup = BeautifulSoup(htmlcode, "html.parser")
            li_list = soup.find_all('div', class_='newsList')[0].find_all('li')
            for li in li_list:
                GetWebContent.catch(unicode(li.a.get('href')), 'InterNation', cursor, conn)
            div_nextpage = soup.find_all('div', id='backDay')
            if len(div_nextpage) != 0:
                currenthref = div_nextpage[0].a.get('href')
            else:
                currenthref = ''
            count += 1
def catchEntertain(cursor, conn):
    for href in href_list:
        currenthref = href
        # Start looping through the pages
        count = 0
        while currenthref != '' \
                and currenthref != 'javascript:void(0);' \
                and currenthref != "javascript:alert('没有了');" \
                and count < 500:
            htmlcode = GetHTML.getHtml(currenthref)
            soup = BeautifulSoup(htmlcode, "html.parser")
            h2_list = soup.find_all('h2')
            for h2 in h2_list:
                print 'Entertain:', h2.a.get('href')
                GetWebContent.catch(unicode(h2.a.get('href')), 'Entertain', cursor, conn)
            div_nextpage = soup.find_all('a', id='pagenext')
            if len(div_nextpage) != 0:
                currenthref = div_nextpage[0].get('href')
            else:
                currenthref = ''
            count += 1
def GetEpisodeLink(link):
    soup = GetHTML(link)
    embed = soup.find_all('a', {"class": "btn-default"})
    return embed
def GetSaison(link):
    soup = GetHTML(link)
    title = soup.find_all('div', {"class": "module-title"})
    titleLink = soup.find_all('a', {"title": "Regarder en VOSTFR"})
    return title, titleLink
import os

filename = 'OliniCalendar.ics'
# This is the start of the semester:
yearmonth = '201501'
# This is the end of the semester:
endyearmonthdaytime = '20150430T000000'
USERNAME = str(raw_input("Enter my.olin.edu Username: "))
# ****** (elided in source)
# ..."time slots" are made instead. This just maps each class to its respective slots.
snames = {}
def get_saison_anime_vostfree(PGname):
    link = "https://vostfree.com/" + escape(PGname) + ".html"
    soup = GetHTML(link)
    result = vostfree.GetAllPlayer(soup)
    result = sorted(result, key=lambda anime: anime['index'])
    return jsonify({"Page_name": PGname, "Link": link, "Episode": result})
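# The get_saison_anime_* functions above return flask.jsonify() responses, so they are
# presumably registered as Flask routes elsewhere in the project. A hypothetical
# registration sketch -- the app object and the URL rules below are assumptions, not
# taken from the source:
from flask import Flask

app = Flask(__name__)
app.add_url_rule("/vostfree/<PGname>", view_func=get_saison_anime_vostfree)
app.add_url_rule("/gogoanime/<PGname>", view_func=get_saison_anime_gogoanime)
app.add_url_rule("/dbanimes/<PGname>", view_func=get_saison_anime_dbanimes)
app.add_url_rule("/animeultime/<path:PGname>", view_func=get_saison_anime_animeultime)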