Example #1
def girls_movie(scode, maxpage=1):
    um = UrlManager()
    um.update()
    source_urls = um.show()
    url_to_scrape = source_urls[0].url + '/star/' + scode
    print('url is ' + url_to_scrape)
    html = downloader.get_html(url_to_scrape)
    soup = BeautifulSoup(html, "html.parser")

    results = []

    nextpageurl = 'first_loop'
    page_counter = 0
    while nextpageurl is not None:
        if nextpageurl != 'first_loop':  # the entrance page was already downloaded above
            html = downloader.get_html(nextpageurl)
            soup = BeautifulSoup(html, "html.parser")
        for url in pageparser.parser_homeurl(html):
            results.append(url)
        nextpageurl = pageparser.get_next_page_url(source_urls[0].url, html)
        page_counter += 1
        print('page => ' + str(page_counter))
        if page_counter == maxpage:
            break
    fcodes = []
    for url in results:
        print(url)
        code = (url[len(source_urls[0].url):]).strip('/')
        fcodes.append(code)
    return fcodes
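A minimal calling sketch for the function above; the module name moviespider and the star code are placeholders, not part of the original source:

# Hypothetical driver: collect up to three listing pages for one star code.
from moviespider import girls_movie   # module name is an assumption

if __name__ == '__main__':
    for code in girls_movie('abc123', maxpage=3):   # 'abc123' is a placeholder star code
        print(code)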
Example #2
def main(entrance):
    start_time = time.time()
    
    print "entrance:{}".format(entrance)

    entrance_html = downloader.get_html(entrance)
    teams_url_list = pageparser.get_teams(entrance_html)
    
    for x in teams_url_list:
        print('spider to url:{}'.format(x))
        team_id = int(x.split('/')[-1])
        
        team_html = downloader.get_html(x)
        try:
            team_name,country,create_year,league,coach,city,match_place,website,intro,img_url = pageparser.get_team_info(team_html)
            schedule,match_time,home_team,market,visit_team,score,half_score,result,market_trend,bet_type1,bet_type2,goal_number = pageparser.get_team_history_match(team_id,team_html)

            controler.write_team_data(team_id,team_name,country,create_year,league,coach,city,match_place,website,intro,img_url)
            print('running time:{}s'.format(time.time() - start_time))

        except Exception as e:
            print(e)
            with open('fail_url.txt', 'a') as fd:
                fd.write('{}\n{}\n'.format(e, x))
            continue
Example #3
def get_dict(url):
    """get the dict of the detail page and yield the dict"""

    url_html = downloader.get_html(url)
    for detail_url in pageparser.parser_homeurl(url_html):
        detail_page_html = downloader.get_html(detail_url)
        dict_jav = pageparser.parser_content(detail_page_html)
        # overwrite the URL field with this detail page's URL
        dict_jav['URL'] = detail_url
        yield dict_jav, detail_url
Example #4
def main(entrance):
    # is_censored: 0 when the entrance URL points at the uncensored listings, otherwise 1
    is_censored = 0 if 'uncensored' in entrance else 1
    
    page_crawler(entrance, is_censored)  
    entrance_html = downloader.get_html(entrance)
    next_page_url = pageparser.get_next_page_url(entrance, entrance_html)

    while next_page_url:
        page_crawler(next_page_url,is_censored)
        next_page_html = downloader.get_html(next_page_url)
        next_page_url = pageparser.get_next_page_url(entrance, next_page_html)
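The entrance-then-next-page loop above (and its variant in Example #6) follows one pattern: fetch a page, process it, ask pageparser for the next URL, stop on None. A minimal generator sketch of that pattern, assuming only the downloader.get_html and pageparser.get_next_page_url calls already used here:

def iter_pages(entrance):
    """Yield (url, html) for the entrance page and each following listing page."""
    url = entrance
    while url:
        html = downloader.get_html(url)
        yield url, html
        url = pageparser.get_next_page_url(entrance, html)

With such a helper, the body of main reduces to calling page_crawler(url, is_censored) once per yielded URL.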
Example #5
def get_dict(url):
    """get the dict of the detail page and yield the dict"""

    url_html = downloader.get_html(url)
    for detail_url in pageparser.parser_homeurl(url_html):
        try:
            detail_page_html = downloader.get_html(detail_url)
            dict_jav = pageparser.parser_content(detail_page_html)
        except Exception:
            with open('fail_url.txt', 'a') as fd:
                fd.write('%s\n' % detail_url)
            print("Fail to crawl %s\ncrawl next detail page......" %
                  detail_url)
            continue
        yield dict_jav, detail_url
Example #6
def main(entrance):
    # create the database tables
    controler.create_db()
    # is_uncensored: 1 for uncensored listings, 0 for censored
    is_uncensored = 1 if 'uncensored' in entrance else 0
    join_db(entrance, is_uncensored)

    entrance_html = downloader.get_html(entrance)
    next_page_url = pageparser.get_next_page_url(entrance, entrance_html)
    while next_page_url:
        join_db(next_page_url, is_uncensored)
        next_page_html = downloader.get_html(next_page_url)
        next_page_url = pageparser.get_next_page_url(entrance, next_page_html)
Example #7
def parser_content(html):
    """parser_content(html),parser page's content of every url and yield the dict of content"""

    soup = BeautifulSoup(html, "html.parser")

    categories = {}

    code_name_doc = soup.find('span', text="識別碼:")
    code_name = code_name_doc.parent.contents[2].text if code_name_doc else ''
    categories['Video_ID'] = code_name

    date_issue_doc = soup.find('span', text="發行日期:")
    date_issue = date_issue_doc.parent.contents[1].strip(
    ) if date_issue_doc else ''
    categories['Release_Date'] = date_issue

    duration_doc = soup.find('span', text="長度:")
    duration = duration_doc.parent.contents[1].strip() if duration_doc else ''
    length_match = re.match(r"\d+", duration)
    categories['Length'] = length_match[0] if length_match else ''

    manufacturer_doc = soup.find('span', text="製作商:")
    manufacturer = manufacturer_doc.parent.contents[
        2].text if manufacturer_doc else ''
    categories['Producer'] = manufacturer

    series_doc = soup.find('span', text="系列:")
    series = series_doc.parent.contents[2].text if series_doc else ''
    categories['Series'] = series

    genre_doc = soup.find('p', text="類別:")
    genre = (
        i.text.strip()
        for i in genre_doc.find_next('p').select('span')) if genre_doc else ''
    genre_text = ''
    for tex in genre:
        genre_text += '%s   ' % tex
    categories['Label'] = genre_text

    actors = soup.select('span[onmouseover^="hoverdiv"]')
    list_actor = parser_actor(actors)
    categories['Actors'] = list_actor

    url = soup.select('link[hreflang="zh"]')[0]['href']
    categories['URL'] = url

    magnet_html = downloader.get_html(get_cili_url(soup), Referer_url=url)
    magnet = parser_magnet(magnet_html)
    categories['Magnet'] = magnet

    # publisher_doc = soup.find('span', text="發行商:")
    # publisher = publisher_doc.parent.contents[2].text if publisher_doc else ''
    # categories['發行商'] = publisher

    # director_doc = soup.find('span', text="導演:")
    # director = director_doc.parent.contents[2].text if director_doc else ''
    # categories['導演'] = director

    return categories
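The repeated "find the label span, then read one of its parent's content nodes" lookups above could be factored into a single helper; a minimal sketch under the same markup assumptions (field_text is a hypothetical name, not part of the original code):

def field_text(soup, label, index=2):
    """Return the stripped text of the parent's nth content node after the label span, or ''."""
    doc = soup.find('span', text=label)
    if not doc:
        return ''
    node = doc.parent.contents[index]
    return getattr(node, 'text', str(node)).strip()

For instance, categories['Producer'] = field_text(soup, "製作商:") and categories['Release_Date'] = field_text(soup, "發行日期:", index=1) would replace two of the blocks above.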
Example #8
def get_sub_areas(city):
    url_prefix = "https://www.rew.ca/sitemap/real-estate/"
    city_url = url_prefix + city
    html = downloader.get_html(city_url)

    soup = BeautifulSoup(html, "html.parser")
    sub_areas = soup.select(".gridblock-link")
    if sub_areas:
        for sub_area in sub_areas:
            yield sub_area.get('href').split('/')[3]
    else:
        yield city
Example #9
def get_list(province):
    url_prefix = "https://www.rew.ca/sitemap/real-estate/"
    province_url = url_prefix + province
    html = downloader.get_html(province_url)

    soup = BeautifulSoup(html, "html.parser")
    areas = soup.select(".gridblock-link")
    for area in areas:
        for i in get_sub_areas(area.get('href').split('/')[3]):
            print(i)
            with open(province + '_sub_area_list.txt', 'a') as fd:
                fd.write('%s\n' % i)
Example #10
def subscribe_girl(scode):
    gs = session.query(Girl).filter(Girl.scode == scode).all()
    if len(gs) == 0:
        uu = UrlManager().show()[0].url + '/star/' + scode
        html = downloader.get_html(uu)
        name = pageparser.parser_girlurl(html)
        session.add(Girl(name=str(name), scode=scode, date=now()))
        print(str(name) + ' subscribed!')
        session.commit()
        session.close()
    else:
        print('this girl already subscribed!')
Example #11
def main(entrance):
    
    print "entrance:{}".format(entrance)

    entrance_html = downloader.get_html(entrance)
    specialty_code_spe_dict,specialty_code_url_dict = pageparser.get_specialty(entrance_html)
    for code in specialty_code_url_dict:
        title = specialty_code_spe_dict[code]
        url = specialty_code_url_dict[code]
        print(title, code, url)
        

        spe_url = main_url + url
        spe_html = downloader.get_html(spe_url)
        detail = pageparser.get_specialtyDetail(spe_html)

        subject = detail['subject']
        class_ = detail['class']
        name = detail['name']
        intro = detail['intro']        
        

        controler.write_data(title, code,subject, class_, name, intro)
Example #12
def main(entrance_url):
    current_page = downloader.get_html(entrance_url)
    urls = htmlparser.get_urls(current_page)
    for url in urls:
        if not database.check_url_in_db(url):
            time.sleep(3)
            print("Working on " + url)
            html = downloader.get_html(url)
            property_dict = htmlparser.parse_content(html)
            if property_dict:
                property_dict['url'] = url
                database.write_data(property_dict)
            else:
                print("Nothing here " + url)
        # else:
        #     print("Already have it " + url)

    print("Done with " + entrance_url)

    next_page_url = htmlparser.get_next_page_urls(current_page)
    if next_page_url:
        main(next_page_url)
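The tail call main(next_page_url) adds one stack frame per listing page, so a very long result set could hit Python's recursion limit. An iterative sketch of the same flow, assuming the same downloader, htmlparser, and database interfaces:

def main_iterative(entrance_url):
    # Same crawl as main(), expressed as a loop over listing pages instead of recursion.
    page_url = entrance_url
    while page_url:
        current_page = downloader.get_html(page_url)
        for url in htmlparser.get_urls(current_page):
            if database.check_url_in_db(url):
                continue
            time.sleep(3)
            print("Working on " + url)
            property_dict = htmlparser.parse_content(downloader.get_html(url))
            if property_dict:
                property_dict['url'] = url
                database.write_data(property_dict)
            else:
                print("Nothing here " + url)
        print("Done with " + page_url)
        page_url = htmlparser.get_next_page_urls(current_page)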
Example #13
def run(self):
    # apply(self.target, self.args)
    while not self.task_q.full():
        self.task_q.put(0)
        if lock.acquire():
            url = self.urlmgr.get_one()
            lock.release()
        if url:
            con = downloader.get_html(url)
            self.res_q.put(self.parse_content_fn(con))
            if lock.acquire():
                if self.parse_url_fn:
                    self.urlmgr.add_url(None, self.parse_url_fn(con))
                self.urlmgr.fin_url(url)
                lock.release()
        self.task_q.get()
        self.task_q.task_done()
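The explicit lock.acquire() / lock.release() pairs can also be written with the lock's context manager, which releases the lock even if the guarded code raises; a minimal sketch of the first critical section, assuming lock is a threading.Lock:

with lock:
    url = self.urlmgr.get_one()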
Example #14
def parser_content(html):
    """parser_content(html),parser page's content of every url and yield the dict of content"""

    soup = BeautifulSoup(html, "html.parser")

    categories = {}

    code_name_doc = soup.find('span', text="識別碼:")
    code_name = code_name_doc.parent.contents[2].text if code_name_doc else ''
    categories['識別碼'] = code_name
    #code_name = soup.find('span', text="識別碼:").parent.contents[2].text if soup.find('span', text="識別碼:") else ''

    date_issue_doc = soup.find('span', text="發行日期:")
    date_issue = date_issue_doc.parent.contents[1].strip(
    ) if date_issue_doc else ''
    categories['發行日期'] = date_issue
    #date_issue = soup.find('span', text="發行日期:").parent.contents[1].strip() if soup.find('span', text="發行日期:") else ''

    duration_doc = soup.find('span', text="長度:")
    duration = duration_doc.parent.contents[1].strip() if duration_doc else ''
    categories['長度'] = duration
    #duration = soup.find('span', text="長度:").parent.contents[1].strip() if soup.find('span', text="長度:") else ''

    director_doc = soup.find('span', text="導演:")
    director = director_doc.parent.contents[2].text if director_doc else ''
    categories['導演'] = director
    #director = soup.find('span', text="導演:").parent.contents[2].text if soup.find('span', text="導演:") else ''

    manufacturer_doc = soup.find('span', text="製作商:")
    manufacturer = manufacturer_doc.parent.contents[
        2].text if manufacturer_doc else ''
    categories['製作商'] = manufacturer
    #manufacturer = soup.find('span', text="製作商:").parent.contents[2].text if soup.find('span', text="製作商:") else ''

    publisher_doc = soup.find('span', text="發行商:")
    publisher = publisher_doc.parent.contents[2].text if publisher_doc else ''
    categories['發行商'] = publisher
    #publisher = soup.find('span', text="發行商:").parent.contents[2].text if soup.find('span', text="發行商:") else ''

    series_doc = soup.find('span', text="系列:")
    series = series_doc.parent.contents[2].text if series_doc else ''
    categories['系列'] = series
    #series = soup.find('span', text="系列:").parent.contents[2].text if soup.find('span', text="系列:") else ''

    genre_doc = soup.find('p', text="類別:")
    genre = (
        i.text.strip()
        for i in genre_doc.find_next('p').select('span')) if genre_doc else ''
    #genre =(i.text.strip() for i in soup.find('p', text="類別:").find_next('p').select('span')) if soup.find('p', text="類別:") else ''
    genre_text = ''
    for tex in genre:
        genre_text += '%s   ' % tex
    categories['類別'] = genre_text

    actor_doc = soup.select('span[onmouseover^="hoverdiv"]')
    actor = (i.text.strip() for i in actor_doc) if actor_doc else ''
    #actor = (i.text.strip() for i in soup.select('span[onmouseover^="hoverdiv"]')) if soup.select('span[onmouseover^="hoverdiv"]') else ''
    actor_text = ''
    for tex in actor:
        actor_text += '%s   ' % tex
    categories['演員'] = actor_text

    # add the page URL to the dict
    url = soup.select('link[hreflang="zh"]')[0]['href']
    categories['URL'] = url

    # add the magnet link to the dict
    magnet_html = downloader.get_html(_get_cili_url(soup), Referer_url=url)
    magnet = _parser_magnet(magnet_html)
    categories['磁力链接'] = magnet

    return categories
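The genre and actor concatenation loops above can be written with str.join; a minimal equivalent for the genre field (it omits the trailing separator the original loop appends):

# Join-based variant of the genre loop; same items, no trailing spaces.
genre_text = '   '.join(
    i.text.strip()
    for i in genre_doc.find_next('p').select('span')) if genre_doc else ''
categories['類別'] = genre_text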