def crawl_item_details(queue, i):
    """Worker loop: pull taobao item nids off *queue* and crawl their detail pages.

    Args:
        queue: multiprocessing-style queue of item nids; ``get(block=False)``
            raises when empty.
        i: worker index, used only in log output.

    Side effects:
        Persists crawled items via ShopAllItemDb.insert_or_update_details;
        re-queues failed nids with a fresh proxy IP; drives a PhantomJS
        browser that is restarted on hard failures and quit on exit.
    """
    shopall = ShopAllItemDb()
    agentIp = Utils.GetAgentIp()
    driver = PhantomDriver(1, agentIp, 60).return_driver()
    while True:
        nid = None
        try:
            nid = queue.get(block=False)
        except Exception:
            # Queue drained: stop instead of busy-spinning forever.  This also
            # makes the final driver.quit() reachable (it was dead code before).
            break
        try:
            if nid:
                item = CrawlItem.crawl(driver, nid)
                if item:
                    print("进程%s抓取%s宝贝详情成功, 使用Ip:%s" % (i, nid, agentIp))
                    shopall.insert_or_update_details(item)
                else:
                    # Crawl returned nothing: rotate the proxy IP and push the
                    # nid back onto the queue for a retry.
                    agentIp = Utils.GetAgentIp()
                    try:
                        queue.put(nid, block=False)
                    except Exception:
                        pass  # best-effort re-queue; a full queue is not fatal
                    print("进程%s抓取%s宝贝详情失败,重试 Ip:%s" % (i, nid, agentIp))
        except Exception as e:
            # Hard failure (e.g. wedged browser): restart PhantomJS with a new
            # proxy and re-queue the nid.
            agentIp = Utils.GetAgentIp()
            driver.quit()
            driver = PhantomDriver(1, agentIp, 60).return_driver()
            try:
                queue.put(nid, block=False)
            except Exception:
                pass  # best-effort re-queue
            print("进程%s抓取%s宝贝详情失败:%s,退出浏览器重试" % (i, nid, e))
    driver.quit()
def crawl_shop_all_item(self):
    """Crawl every item of the shop at ``self.shop_url`` in hot-sell order.

    Downloads the shop's search page, extracts the shop id (from the
    microscope-data meta tag) and the total page count, then walks every
    listing page via the asynSearch endpoint, collecting items with
    ``self.parse_items`` and bulk-upserting them through ``self.shopall``.

    Returns:
        The shop id parsed from page 1, or -1 when the first page could not
        be downloaded (or no shop id / pagination info was found).
    """
    shop_id = -1
    # NOTE(review): the original fetched a proxy via Utils.GetAgentIp() and
    # immediately overwrote it with None, so the driver runs without a proxy.
    # The dead fetch was removed; behavior is unchanged.
    agentIp = None
    driver = PhantomDriver(1, agentIp, 60)
    parms_url = "{shop_url}/i/asynSearch.htm?_ksTS={now}569_240&callback=jsonp241&mid=w-14766145001-0&wid=14766145001&path=/search.htm&search=y&orderType=hotsell_desc&scene=taobao_shop&pageNo={page_num}"
    url = "{shop_url}/search.htm?&search=y&orderType=hotsell_desc&scene=taobao_shop".format(shop_url=self.shop_url)
    result = driver.download_no_quit(url)
    source = result['page_source']
    if result['ok']:
        html = etree.HTML(source)
        shop_items = []
        if html is not None and 'page-info' in source and html.xpath("//span[contains(@class,'page-info')]/text()"):
            # page-info text is "current/total" -> total listing pages.
            total = int(html.xpath("//span[contains(@class,'page-info')]/text()")[0].split('/')[1])
            if html.xpath("//meta[@name='microscope-data']"):
                for meta in html.xpath("//meta[@name='microscope-data']")[0].get('content').split(';'):
                    if 'shopid' in meta.lower():
                        shop_id = meta.split("=")[1]
            self.shopall.format_data(shop_id, False)
            shop_items.extend(self.parse_items(html, shop_id))
            for i in range(1, total):
                page_num = i + 1
                print("page%s" % page_num)
                url = parms_url.format(shop_url=self.shop_url, now=int(time.time()), page_num=page_num)
                result = driver.download_no_quit(url)
                page_source = result['page_source'] if result['ok'] else ''
                if result['ok']:
                    html = etree.HTML(page_source)
                # BUG FIX: the original tested 'page-info' against the stale
                # page-1 source, not the page just downloaded.
                if result['ok'] and 'page-info' in page_source and html.xpath("//span[contains(@class,'page-info')]/text()"):
                    shop_items.extend(self.parse_items(html, shop_id))
                # Throttle between pages to avoid taobao's anti-crawl ban.
                sleep(15)
            self.shopall.insert_or_update(shop_items)
        # Close the browser kept alive across pages (the original leaked it
        # on the success path).
        driver.return_driver().quit()
    else:
        # Download failed: close the webdriver and bail out.
        driver.return_driver().quit()
        print("无法获取%s" % agentIp)
        return -1
    return shop_id
def run(self):
    """Crawl taobao search result pages for ``self.key_word``.

    Walks the paged search results (sale-desc order), extracts the embedded
    ``g_page_config`` JSON from each page and hands it to ``self.crawlNid``.
    The page bound starts at 8 and is tightened to the server-reported
    ``totalPage`` once known.
    """
    banner = 'CrawlNid----->>>>>>>>beginning'  # renamed: original shadowed builtin `str`
    sys.stdout.write(banner + "\r\n")
    sys.stdout.flush()
    agentip = Utils.GetAgentIp()
    search_url = "https://s.taobao.com/search?q={q}&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_{day}&ie=utf8&bcoffset=1&ntoffset=1&p4ppushleft=1%2C48&s={s}&sort=sale-desc"
    day = datetime.now().strftime("%Y%m%d")
    total = 8
    # BUG FIX: a for-range freezes its bound at creation, so shrinking
    # `total` from the response never took effect; a while-loop honors it.
    i = 1
    while i < total:
        t_url = search_url.format(q=self.key_word, day=day, s=i)
        header = {'ip': agentip}
        try:
            ok, response = Html_Downloader.Download_Html(t_url, {}, header)
            if ok:
                html = etree.HTML(response.text)
                matchs = html.xpath("//script[contains(.,'g_page_config')]")
                if len(matchs) > 0:
                    script_text = matchs[0].text.replace("\n\n", "\n").replace("\n", "").replace(" ", "")
                    data = re.compile("g_page_config=(.*)?;g_srp_loadCss").match(script_text)
                    # Guard against a failed match (None) before touching
                    # lastindex -- the original raised AttributeError there.
                    if data is not None and data.lastindex:
                        data = json.loads(data.group(1).encode('utf-8'))
                        if 'mods' in data:  # was dict.has_key (removed in py3)
                            self.crawlNid(data)
                            totalpage = data['mods']['pager']['data']['totalPage']
                            total = totalpage if totalpage < total else total
                        else:
                            print("无法匹配有效的json")
                else:
                    print("无法匹配到宝贝列表")
        except Exception as e:
            print("关键词{p}第{i}页抓取错误{m}".format(p=self.key_word, i=i, m=e))
        i += 1