def html_WITH_proxy():
    """test random proxy mechanism"""
    url = "http://docs.python-requests.org/"
    spider = Crawler()
    spider.enable_proxy()
    for i in range(10):
        html = spider.html(url)
        print(i, spider.pm.current_proxy)
        if html:
            print("\tSUCCESS")
            spider.pm.update_health(1)
        else:
            print("\tFAILED")
    print(spider)

def get_disease_category_url():
    """taskplan level2: get disease subcategory url"""
    base_url = 'http://www.mayoclinic.org'
    task = load_jt('task.json')
    spider = Crawler()
    for url in task:
        html = spider.html(url)
        if html:
            soup = BS4(html)
            div = soup.find_all('div', id='main_0_left1_0_tertiarynav')[0]
            for a in div.find_all('a'):
                task[url].setdefault(base_url + a['href'],
                                     {'data': {'category': a.text.strip()}})
    dump_jt(task, 'task.json', replace=True)

def get_disease_url():
    """taskplan level1: get disease homepage url"""
    base_url = 'http://www.mayoclinic.org'
    task = load_jt('task.json')
    spider = Crawler()
    for entrance_url in gen_entranceURL():
        html = spider.html(entrance_url)
        if html:
            soup = BS4(html)
            ol = soup.find_all('ol')[1]
            for li in ol.find_all('li'):
                url = base_url + li.a['href']
                task.setdefault(url, {'data': {'disease_name': li.text.strip()}})
    dump_jt(task, 'task.json', replace=True)

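# load_jt and dump_jt are not shown in these snippets. Below is a minimal sketch
# of the JSON task-file helpers assumed by the taskplan functions above; the real
# implementations may differ (in particular, the replace/fastmode flags are only
# accepted, not interpreted, in this sketch):
import json
import os

def load_jt(path):
    """Load a JSON task file, returning an empty dict if it does not exist yet."""
    if not os.path.exists(path):
        return {}
    with open(path, "r") as f:
        return json.load(f)

def dump_jt(data, path, **kwargs):
    """Write the task dict back to disk as JSON."""
    with open(path, "w") as f:
        json.dump(data, f)
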
def zillow_property_detail(address, zipcode):
    url = gen_url(address, zipcode)  # generate the query's http url
    spider = Crawler()
    html = spider.html(url)  # fetch html
    if html:  # good html, analyze it
        try:
            soup = BS4(html)
            dt = soup.find("dt", class_="property-data")
            info = dt.text.strip()
            span = soup.find("span", itemprop="addressLocality")
            city = span.text.strip()
            span = soup.find("span", itemprop="addressRegion")
            state = span.text.strip()
            return address, city, state, zipcode, info
        except Exception:  # something went wrong during extraction, raise ExtractorError
            raise ExtractorError(address, zipcode, url)
    else:  # bad html, raise HttpError
        raise HttpError(address, zipcode, url)

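# ExtractorError and HttpError are raised above but not defined in this snippet.
# A minimal sketch of what they might look like, assuming they only carry the
# query context; the attribute layout is an assumption, not the original code:
class HttpError(Exception):
    """Raised when the HTTP request for a property page fails."""
    def __init__(self, address, zipcode, url):
        super(HttpError, self).__init__(
            "failed to fetch %s (address=%s, zipcode=%s)" % (url, address, zipcode))
        self.address, self.zipcode, self.url = address, zipcode, url

class ExtractorError(Exception):
    """Raised when the fetched page cannot be parsed into property fields."""
    def __init__(self, address, zipcode, url):
        super(ExtractorError, self).__init__(
            "failed to extract data from %s (address=%s, zipcode=%s)" % (url, address, zipcode))
        self.address, self.zipcode, self.url = address, zipcode, url
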
def download_all():
    """crawl them all, disease_name - subcategory"""
    task = load_jt('task.json')
    data = load_jt('data.json')
    spider = Crawler()
    for _, v in task.items():
        disease_name = v['data']['disease_name']
        data.setdefault(disease_name, {})
        for url, v1 in ignore_iteritems(v, ignore=['data']):
            print(url)
            html = spider.html(url)
            if html:
                soup = BS4(html)
                div = soup.find('div', id='main-content')
                data[disease_name].setdefault(v1['data']['category'], str(div))
    dump_jt(data, 'data.json', fastmode=True, replace=True)

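# ignore_iteritems is a helper used above but not defined here. A plausible
# sketch, assuming it simply yields a dict's items while skipping the given
# keys (only the name and call signature come from download_all; the body is
# an assumption):
def ignore_iteritems(d, ignore=None):
    """Yield (key, value) pairs of d, skipping any key listed in ignore."""
    ignore = set(ignore or [])
    for key, value in d.items():
        if key not in ignore:
            yield key, value
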
def property_info(address, zipcode):
    url = gen_url(address, zipcode)
    spider = Crawler()
    html = spider.html(url)
    if html:
        try:
            soup = BS4(html)
            dt = soup.find("dt", class_="property-data")
            info = dt.text.strip()
            span = soup.find("span", itemprop="addressLocality")
            city = span.text.strip()
            span = soup.find("span", itemprop="addressRegion")
            state = span.text.strip()
            return address, city, state, zipcode, info
        except Exception:
            log.write(
                "Failed to analyze address = %s, zipcode = %s" % (address, zipcode),
                "Failed Extraction")
            return None
    else:
        log.write("%s Failed to get http request" % url, "Http Error")

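# A possible call site for property_info, assuming it is used to enrich a list
# of (address, zipcode) pairs; lookup_properties and its records argument are
# hypothetical, only property_info itself comes from the snippet above:
def lookup_properties(records):
    """Return the extracted rows for every record that could be fetched and parsed."""
    rows = []
    for address, zipcode in records:
        result = property_info(address, zipcode)
        if result is not None:  # failures are already logged inside property_info
            rows.append(result)
    return rows
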
def html_WITHOUT_proxy():
    """test normal http request"""
    url = "http://docs.python-requests.org/"
    spider = Crawler()
    html = spider.html(url)
    print(BS4(html).prettify())

def enable_proxy():
    print("{:=^100}".format("enable_proxy"))
    spider = Crawler()
    spider.enable_proxy()
    print(spider)

def set_referer():
    print("{:=^100}".format("set_referer"))
    spider = Crawler()
    print(spider)
    spider.set_referer("https://www.python.org/")
    print(spider)

def UT_crawl_html():
    spider = Crawler()

    # test html download
    url = "https://www.python.org/"
    print(spider.html(url))

    # test save html to .html
    url = "http://www.archives.com/"
    spider.save_html(url, "www.archives.com.html")

    # test file download
    img_url = "https://www.python.org/static/img/python-logo.png"
    spider.download(img_url, "python-logo.png")

    # test download html to .html file
    target_url = "http://www.archives.com/"  # www.archives.com is utf-8 encoded
    # because requests.text returns bytes here, the mode has to be "wb"
    with open("before_login.html", "wb") as f:
        html = spider.html(target_url)
        print(type(html), chardet.detect(html))  # show the type requests.text returns under Python 2 and 3
        f.write(html)

    # test html after login
    spider._login(url="http://www.archives.com/member/",
                  payload={"__uid": "*****@*****.**", "__pwd": "efa2014"})
    with open("after_login.html", "wb") as f:
        f.write(spider.html(target_url))