def crawl_one(cls, task):
    """Crawl one url page: download the html, parse it with
    ``task.fetch_data``, and store the result as json or pickle.
    """
    url = task.target_url
    cls.logger.info("crawl %s, %s left ..." % (url, task.left_counter))

    try:
        html = spider.get_html(url)
    except Exception as e:
        cls.logger.info("http error: %s" % e, 1)
        return

    try:
        data = task.fetch_data(html)
        if task.__class__.use_json:
            cls.objects(_id=task._id).update(json=json.dumps(data), status=1)
        else:
            cls.objects(_id=task._id).update(pickle=pickle.dumps(data), status=1)
        cls.logger.info("success! data: %s" % data, 1)
    except Exception as e:
        cls.logger.info("parse error: %s" % e, 1)
        return
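# A minimal sketch of what a task's fetch_data might look like, assuming
# BeautifulSoup; the class name and selector below are hypothetical and only
# illustrate the contract: crawl_one serializes whatever dict is returned
# with json (use_json=True) or pickle, and marks the task done with status=1.
from bs4 import BeautifulSoup

class ExampleTask(object):  # hypothetical task document class

    use_json = True  # store parsed data as json rather than pickle

    @staticmethod
    def fetch_data(html):
        soup = BeautifulSoup(html, "html.parser")
        return {
            # hypothetical selector, for illustration only
            "title": soup.find("h1").text.strip(),
        }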
def get_html(self):
    """Get html from the url. By default this uses crawlib's
    ``spider.get_html(url)``, which auto-detects the text encoding.
    Some sites require a cookie-based login; override this method
    to support that.
    """
    return spider.get_html(self.target_url)
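# A minimal sketch of overriding get_html for a site that needs a cookie
# login, assuming the `requests` library; the base class name, login url
# and form fields below are hypothetical placeholders, not part of this
# project.
import requests

class CookieLoginPage(Page):  # ``Page`` stands for the real base document class

    def get_html(self):
        sess = requests.Session()
        # hypothetical login endpoint and credentials; the session keeps the
        # cookie and reuses it for the actual page request
        sess.post(
            "https://example.com/login",
            data={"username": "myuser", "password": "mypassword"},
        )
        response = sess.get(self.target_url)
        response.raise_for_status()  # surface http errors to the caller
        return response.text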
def get_testdata():
    """Download test data.
    """
    for page, state, county, zipcode, street in testdata:
        url = urlencoder.browse_home_listpage_url(state, county, zipcode, street)
        filepath = Path("testdata", "%s.html" % page)
        if not filepath.exists():
            html = spider.get_html(url, encoding="utf-8")
            textfile.write(html, filepath.abspath)

    for href in zillow_house_url_list:
        url = urlencoder.url_join(href)
        zid = href.split("/")[-2]
        filepath = Path("testdata", "%s.html" % zid)
        if not filepath.exists():
            html = spider.get_html(url, encoding="utf-8")
            textfile.write(html, filepath.abspath)
def fill_state():
    """Insert the 51 states as entry points, scraped from
    http://www.cvs.com/store-locator/cvs-pharmacy-locations.
    """
    data = list()
    url = "http://www.cvs.com/store-locator/cvs-pharmacy-locations"
    html = spider.get_html(url)
    soup = BeautifulSoup(html, "html.parser")
    div = soup.find("div", class_="states")
    for a in div.find_all("a"):
        state = State(_id=a["href"], name=a.text.strip(), status=0)
        data.append(state)
    State.smart_insert(data)
def test_Store_fetch_data():
    url = "http://www.cvs.com/store-locator/cvs-pharmacy-address/415+Monroe+Avenue-Alexandria-VA-22301/storeid=1410"
    html = spider.get_html(url)
    data = Store.fetch_data(html)
    js.pprint(data)
def test_City_fetch_data():
    url = "http://www.cvs.com/store-locator/cvs-pharmacy-locations/Virginia/Alexandria"
    html = spider.get_html(url)
    data = City.fetch_data(html)
    js.pprint(data)
def test_State_fetch_data():
    url = "http://www.cvs.com/store-locator/cvs-pharmacy-locations/Virginia"
    html = spider.get_html(url)
    data = State.fetch_data(html)
    js.pprint(data)