def crawl_item_details(queue, i):
    """Worker loop: pull item ids (nid) off *queue* and crawl each item's detail page.

    Runs forever. On a failed crawl the nid is pushed back onto the queue and the
    proxy IP / browser are rotated so the retry uses a fresh identity.

    :param queue: multiprocessing-style queue of item ids to crawl
    :param i: worker index, used only as a label in log output
    """
    shopall = ShopAllItemDb1()
    agentIp = Utils.GetAgentIp()  # proxy address, e.g. "127.0.0.1:8888"
    driver = PhantomDriver(1, agentIp, 60).return_driver()
    while True:
        try:
            nid = queue.get(block=False)
        except Exception:
            # Queue is empty -- back off briefly instead of busy-spinning the CPU.
            time.sleep(0.1)
            continue
        try:
            item = CrawlItem.crawl(driver, nid)
            if item:
                shopall.insert_or_update_details(item)
            else:
                # Soft failure (page fetched but no item parsed): rotate the proxy
                # AND rebuild the driver with it (the original fetched a new IP but
                # kept driving through the old one), then re-queue the nid.
                agentIp = Utils.GetAgentIp()
                driver.quit()
                driver = PhantomDriver(1, agentIp, 60).return_driver()
                try:
                    queue.put(nid, block=False)
                except Exception:
                    pass  # best-effort re-queue; drop the nid if the queue is full
                print("进程%s抓取%s宝贝详情失败,重试 Ip:%s" % (i, nid, agentIp))
        except Exception as e:  # Py2/Py3-compatible (was `except Exception, e`)
            # Hard failure (driver crashed, timeout, ...): restart the browser on a
            # new proxy and re-queue the nid for another attempt.
            agentIp = Utils.GetAgentIp()
            driver.quit()
            driver = PhantomDriver(1, agentIp, 60).return_driver()
            try:
                queue.put(nid, block=False)
            except Exception:
                pass  # best-effort re-queue
            print("进程%s抓取%s宝贝详情失败:%s,退出浏览器重试" % (i, nid, str(e)))
def crawl_shop_all_item(self):
    """Crawl all items of the shop (ordered by sales) across every result page.

    Downloads page 1 via the normal search URL to learn the page count and the
    shop id, then fetches pages 2..N through the async-search endpoint, and
    persists everything in one batch via self.shopall.

    :returns: the shop id on success, -1 when the first page cannot be downloaded
    """
    agentIp = Utils.GetAgentIp()  # proxy address, e.g. "127.0.0.1:8888"
    driver = PhantomDriver(2, agentIp, 60)
    # Async endpoint used for pages 2..N (page 1 comes from the normal URL).
    parms_url = ("{shop_url}/i/asynSearch.htm?_ksTS={now}569_240&callback=jsonp241"
                 "&mid=w-14766145001-0&wid=14766145001&path=/search.htm&search=y"
                 "&orderType=hotsell_desc&scene=taobao_shop&pageNo={page_num}")
    url = "{shop_url}/search.htm?&search=y&orderType=hotsell_desc&scene=taobao_shop".format(shop_url=self.shop_url)
    print(url)
    result = driver.download_no_quit(url)
    shop_id = -1
    if not result['ok']:
        print("无法获取%s" % agentIp)
        return -1
    source = result['page_source']
    html = etree.HTML(source)
    shop_items = []
    # Default to a single page: the original left `total` unbound (UnboundLocalError
    # at the pagination loop) whenever the page-info span was missing.
    total = 1
    if html is not None and 'page-info' in source and html.xpath("//span[contains(@class,'page-info')]/text()"):
        # page-info text looks like "current/total"; take the total page count.
        total = int(html.xpath("//span[contains(@class,'page-info')]/text()")[0].split('/')[1])
        if html.xpath("//meta[@name='microscope-data']"):
            # The microscope-data meta content is "k=v;k=v;..."; extract shopid.
            for meta in html.xpath("//meta[@name='microscope-data']")[0].get('content').split(';'):
                if 'shopid' in meta.lower():
                    shop_id = meta.split("=")[1]
        self.shopall.format_data(shop_id, False)
        shop_items.extend(self.parse_items(html, shop_id))
    driver = PhantomDriver(1, agentIp, 60)
    for page_num in range(2, total + 1):
        print("page%s" % page_num)
        url = parms_url.format(shop_url=self.shop_url, now=int(time.time()), page_num=page_num)
        result = driver.download_no_quit(url)
        if not result['ok']:
            continue  # skip failed page downloads instead of reusing stale html
        page_source = result['page_source']
        html = etree.HTML(page_source)
        # Bug fix: validate against the CURRENT page's source -- the original
        # tested `'page-info' in source`, i.e. the first page's stale content.
        if html is not None and 'page-info' in page_source and html.xpath("//span[contains(@class,'page-info')]/text()"):
            shop_items.extend(self.parse_items(html, shop_id))
    self.shopall.insert_or_update(shop_items)
    return shop_id