def html_WITH_proxy():
    """test random proxy mechanism"""
    url = "http://docs.python-requests.org/"
    spider = Crawler()
    spider.enable_proxy()
    for i in range(10):
        html = spider.html(url)
        print(i, spider.pm.current_proxy)
        if html:
            print("\tSUCCESS")
            spider.pm.update_health(1)
        else:
            print("\tFAILED")
    print(spider)

def get_disease_category_url():
    """taskplan level2: get disease subcategory url"""
    base_url = 'http://www.mayoclinic.org'
    task = load_jt('task.json')
    spider = Crawler()
    for url in task:
        html = spider.html(url)
        if html:
            soup = BS4(html)
            div = soup.find_all('div', id='main_0_left1_0_tertiarynav')[0]
            for a in div.find_all('a'):
                task[url].setdefault(base_url + a['href'],
                                     {'data': {'category': a.text.strip()}})
    dump_jt(task, 'task.json', replace=True)

def get_disease_url():
    """taskplan level1: get disease homepage url"""
    base_url = 'http://www.mayoclinic.org'
    task = load_jt('task.json')
    spider = Crawler()
    for entrance_url in gen_entranceURL():
        html = spider.html(entrance_url)
        if html:
            soup = BS4(html)
            ol = soup.find_all('ol')[1]
            for li in ol.find_all('li'):
                url = base_url + li.a['href']
                task.setdefault(url, {'data': {'disease_name': li.text.strip()}})
    dump_jt(task, 'task.json', replace=True)

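# load_jt and dump_jt are not shown in these snippets. Below is a minimal sketch
# of the JSON task-file helpers assumed by the taskplan functions above; the real
# implementations may differ (in particular, the replace/fastmode flags are only
# accepted, not interpreted, in this sketch):
import json
import os

def load_jt(path):
    """Load a JSON task file, returning an empty dict if it does not exist yet."""
    if not os.path.exists(path):
        return {}
    with open(path, "r") as f:
        return json.load(f)

def dump_jt(data, path, **kwargs):
    """Write the task dict back to disk as JSON."""
    with open(path, "w") as f:
        json.dump(data, f)
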
def zillow_property_detail(address, zipcode):
    url = gen_url(address, zipcode)  # generate the query's http url
    spider = Crawler()
    html = spider.html(url)  # fetch html
    if html:  # good html, analyze it
        try:
            soup = BS4(html)
            dt = soup.find("dt", class_="property-data")
            info = dt.text.strip()
            span = soup.find("span", itemprop="addressLocality")
            city = span.text.strip()
            span = soup.find("span", itemprop="addressRegion")
            state = span.text.strip()
            return address, city, state, zipcode, info
        except Exception:  # something went wrong during extraction, raise ExtractorError
            raise ExtractorError(address, zipcode, url)
    else:  # bad html, raise HttpError
        raise HttpError(address, zipcode, url)

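# ExtractorError and HttpError are raised above but not defined in this snippet.
# A minimal sketch of what they might look like, assuming they only carry the
# query context; the attribute layout is an assumption, not the original code:
class HttpError(Exception):
    """Raised when the HTTP request for a property page fails."""
    def __init__(self, address, zipcode, url):
        super(HttpError, self).__init__(
            "failed to fetch %s (address=%s, zipcode=%s)" % (url, address, zipcode))
        self.address, self.zipcode, self.url = address, zipcode, url

class ExtractorError(Exception):
    """Raised when the fetched page cannot be parsed into property fields."""
    def __init__(self, address, zipcode, url):
        super(ExtractorError, self).__init__(
            "failed to extract data from %s (address=%s, zipcode=%s)" % (url, address, zipcode))
        self.address, self.zipcode, self.url = address, zipcode, url
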
def download_all():
    """crawl them all, disease_name - subcategory"""
    task = load_jt('task.json')
    data = load_jt('data.json')
    spider = Crawler()
    for _, v in task.items():
        disease_name = v['data']['disease_name']
        data.setdefault(disease_name, {})
        for url, v1 in ignore_iteritems(v, ignore=['data']):
            print(url)
            html = spider.html(url)
            if html:
                soup = BS4(html)
                div = soup.find('div', id='main-content')
                data[disease_name].setdefault(v1['data']['category'], str(div))
    dump_jt(data, 'data.json', fastmode=True, replace=True)

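# ignore_iteritems is a helper used above but not defined here. A plausible
# sketch, assuming it simply yields a dict's items while skipping the given
# keys (only the name and call signature come from download_all; the body is
# an assumption):
def ignore_iteritems(d, ignore=None):
    """Yield (key, value) pairs of d, skipping any key listed in ignore."""
    ignore = set(ignore or [])
    for key, value in d.items():
        if key not in ignore:
            yield key, value
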
def property_info(address, zipcode):
    url = gen_url(address, zipcode)
    spider = Crawler()
    html = spider.html(url)
    if html:
        try:
            soup = BS4(html)
            dt = soup.find("dt", class_="property-data")
            info = dt.text.strip()
            span = soup.find("span", itemprop="addressLocality")
            city = span.text.strip()
            span = soup.find("span", itemprop="addressRegion")
            state = span.text.strip()
            return address, city, state, zipcode, info
        except Exception:
            log.write(
                "Failed to analyze address = %s, zipcode = %s" % (address, zipcode),
                "Failed Extraction")
            return None
    else:
        log.write("%s Failed to get http request" % url, "Http Error")

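# A possible call site for property_info, assuming it is used to enrich a list
# of (address, zipcode) pairs; lookup_properties and its records argument are
# hypothetical, only property_info itself comes from the snippet above:
def lookup_properties(records):
    """Return the extracted rows for every record that could be fetched and parsed."""
    rows = []
    for address, zipcode in records:
        result = property_info(address, zipcode)
        if result is not None:  # failures are already logged inside property_info
            rows.append(result)
    return rows
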
def html_WITHOUT_proxy():
    """test normal http request"""
    url = "http://docs.python-requests.org/"
    spider = Crawler()
    html = spider.html(url)
    print(BS4(html).prettify())

def enable_proxy():
    print("{:=^100}".format("enable_proxy"))
    spider = Crawler()
    spider.enable_proxy()
    print(spider)

def set_referer():
    print("{:=^100}".format("set_referer"))
    spider = Crawler()
    print(spider)
    spider.set_referer("https://www.python.org/")
    print(spider)

def UT_crawl_html():
    spider = Crawler()

    # test html download
    url = "https://www.python.org/"
    print(spider.html(url))

    # test save html to .html
    url = "http://www.archives.com/"
    spider.save_html(url, "www.archives.com.html")

    # test file download
    img_url = "https://www.python.org/static/img/python-logo.png"
    spider.download(img_url, "python-logo.png")

    # test download html to .html file
    target_url = "http://www.archives.com/"  # www.archives.com is utf-8 encoded
    # because requests.text returns bytes here, the mode has to be "wb"
    with open("before_login.html", "wb") as f:
        html = spider.html(target_url)
        print(type(html), chardet.detect(html))  # show the type requests.text returns under Python 2 and 3
        f.write(html)

    # test html after login
    spider._login(url="http://www.archives.com/member/",
                  payload={"__uid": "*****@*****.**", "__pwd": "efa2014"})
    with open("after_login.html", "wb") as f:
        f.write(spider.html(target_url))