def crawl_item_details(queue, i):
    shopall = ShopAllItemDb()
    agentIp = Utils.GetAgentIp()
    # agentIp = "127.0.0.1:8888"
    driver = PhantomDriver(1, agentIp, 60).return_driver()
    while True:
        nid = None
        try:
            nid = queue.get(block=False)
        except:
            # Queue is empty; nothing to fetch this round.
            pass
        try:
            if nid:
                item = CrawlItem.crawl(driver, nid)
                if item:
                    print("Process %s crawled details for item %s successfully, using IP: %s" % (i, nid, agentIp))
                    shopall.insert_or_update_details(item)
                else:
                    # On a failed crawl, switch to a new proxy IP and put the nid back on the queue.
                    agentIp = Utils.GetAgentIp()
                    try:
                        queue.put(nid, block=False)
                    except:
                        pass
                    print("Process %s failed to crawl details for item %s, retrying with IP: %s" % (i, nid, agentIp))
        except Exception as e:
            # Unexpected error: restart the browser with a fresh proxy IP and requeue the nid.
            agentIp = Utils.GetAgentIp()
            driver.quit()
            driver = PhantomDriver(1, agentIp, 60).return_driver()
            try:
                queue.put(nid, block=False)
            except:
                pass
            print("Process %s failed to crawl details for item %s: %s, restarted browser for retry" % (i, nid, e))
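
crawl_item_details takes a shared queue of item nids and a worker index, so it is evidently meant to run as a pool of worker processes. Below is a minimal launch sketch assuming the standard multiprocessing module; start_detail_workers, the nid list, and the worker count are illustrative names and values, not part of the original code. The workers loop forever, so they have to be terminated externally.

from multiprocessing import Process, Queue


def start_detail_workers(nids, worker_count=4):
    # Shared queue that crawl_item_details polls with queue.get(block=False).
    queue = Queue()
    for nid in nids:
        queue.put(nid)
    # One worker process per slot; each keeps its own PhantomJS driver and proxy IP.
    workers = [Process(target=crawl_item_details, args=(queue, i)) for i in range(worker_count)]
    for worker in workers:
        worker.start()
    return workers
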
    def crawl_shop_all_item(self):
        agentIp = Utils.GetAgentIp()
        shop_id = -1
        driver = PhantomDriver(1, agentIp, 60)
        parms_url = "{shop_url}/i/asynSearch.htm?_ksTS={now}569_240&callback=jsonp241&mid=w-14766145001-0&wid=14766145001&path=/search.htm&search=y&orderType=hotsell_desc&scene=taobao_shop&pageNo={page_num}"

        url = "{shop_url}/search.htm?&search=y&orderType=hotsell_desc&scene=taobao_shop".format(shop_url=self.shop_url)
        # url = parms_url.format(shop_url=self.shop_url, page_num=1, now=long(time.time()))
        result = driver.download_no_quit(url)
        source = result['page_source']
        # Parse the page only when the download succeeded; otherwise html stays None.
        html = None
        if result['ok']:
            html = etree.HTML(source)

        # ok, result = Html_Downloader.Download_Html(url, {}, {'ip': agentIp}, post=False)
        # if ok:
        #     source=result.text
        #     html = etree.HTML(result.text)
        shop_items = []
        if html is not None and 'page-info' in source and html.xpath("//span[contains(@class,'page-info')]/text()"):
            total = int(html.xpath("//span[contains(@class,'page-info')]/text()")[0].split('/')[1])
            if html.xpath("//meta[@name='microscope-data']"):
                for meta in html.xpath("//meta[@name='microscope-data']")[0].get('content').split(';'):
                    if 'shopid' in meta.lower():
                        shop_id = meta.split("=")[1]
                        self.shopall.format_data(shop_id, False)
                        shop_items.extend(self.parse_items(html, shop_id))
            for i in range(1, total):
                page_num = i + 1
                print("page %s" % page_num)
                url = parms_url.format(shop_url=self.shop_url, now=long(time.time()), page_num=page_num)
                result = driver.download_no_quit(url)
                if result['ok']:
                    source = result['page_source']
                    html = etree.HTML(source)
                    # Only parse items when this page actually contains the pagination marker.
                    if 'page-info' in source and html.xpath("//span[contains(@class,'page-info')]/text()"):
                        shop_items.extend(self.parse_items(html, shop_id))
                sleep(15)
            self.shopall.insert_or_update(shop_items)
        else:
            # On failure, close the webdriver before giving up.
            driver.return_driver().quit()
            print("Unable to fetch the shop page with proxy %s" % agentIp)
            return -1
        return shop_id
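
The first shop page carries the two pieces of metadata crawl_shop_all_item relies on: the total page count inside the span with class page-info (formatted like "1/20"), and the shop id inside the microscope-data meta tag. The helper below is a minimal standalone sketch of that parsing; parse_shop_meta and the sample HTML fragment are illustrative only and not part of the original class.

from lxml import etree

def parse_shop_meta(page_source):
    # Returns (total_pages, shop_id) extracted from a shop search page, or (0, None) if absent.
    html = etree.HTML(page_source)
    total_pages, shop_id = 0, None
    page_info = html.xpath("//span[contains(@class,'page-info')]/text()")
    if page_info:
        # The page-info text looks like "1/20": current page / total pages.
        total_pages = int(page_info[0].split('/')[1])
    metas = html.xpath("//meta[@name='microscope-data']")
    if metas:
        # The content attribute is a ;-separated list such as "...;shopid=12345;...".
        for part in metas[0].get('content').split(';'):
            if 'shopid' in part.lower():
                shop_id = part.split('=')[1]
    return total_pages, shop_id

# Example with a hypothetical fragment:
sample = "<html><head><meta name='microscope-data' content='pid=1;shopid=12345'></head>" \
         "<body><span class='page-info'>1/20</span></body></html>"
print(parse_shop_meta(sample))  # -> (20, '12345')
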
Example #3
    def run(self):
        msg = 'CrawlNid----->>>>>>>>beginning'
        sys.stdout.write(msg + "\r\n")
        sys.stdout.flush()
        agentip = Utils.GetAgentIp()
        search_url = "https://s.taobao.com/search?q={q}&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_{day}&ie=utf8&bcoffset=1&ntoffset=1&p4ppushleft=1%2C48&s={s}&sort=sale-desc"
        day = datetime.now().strftime("%Y%m%d")
        total = 8
        for i in range(1, total):
            t_url = search_url.format(q=self.key_word, day=day, s=i)
            header = {'ip': agentip}
            try:
                ok, response = Html_Downloader.Download_Html(t_url, {}, header)
                if ok:
                    html = etree.HTML(response.text)
                    matchs = html.xpath("//script[contains(.,'g_page_config')]")
                    if len(matchs) > 0:
                        # The result list is embedded as JSON assigned to g_page_config;
                        # strip whitespace and pull it out with a regex.
                        script_text = matchs[0].text.replace("\n\n", "\n").replace("\n", "").replace(" ", "")
                        data = re.compile("g_page_config=(.*)?;g_srp_loadCss").match(script_text)
                        if data and data.lastindex > 0:
                            data = json.loads(data.group(1).encode('utf-8'))
                            if 'mods' in data:
                                self.crawlNid(data)
                                totalpage = data['mods']['pager']['data']['totalPage']
                                total = totalpage if totalpage < total else total
                        else:
                            print("Could not match valid JSON")
                    else:
                        print("Could not match the item list")
            except Exception as e:
                print("Error crawling page {i} for keyword {p}: {m}".format(p=self.key_word, i=i, m=e))
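
The search pages embed their data as a JSON literal assigned to g_page_config inside a script tag, which run() extracts with a regex before reading mods.pager.data.totalPage. Below is a minimal standalone sketch of that extraction against a fabricated script body; extract_page_config and the sample values are illustrative assumptions, not the original API.

import json
import re

def extract_page_config(script_text):
    # Remove whitespace, then capture everything between "g_page_config=" and ";g_srp_loadCss".
    compact = script_text.replace("\n", "").replace(" ", "")
    match = re.compile("g_page_config=(.*)?;g_srp_loadCss").match(compact)
    if match and match.lastindex > 0:
        return json.loads(match.group(1))
    return None

# Example with a hypothetical script body:
sample = 'g_page_config = {"mods": {"pager": {"data": {"totalPage": 100}}}} ;g_srp_loadCss();'
config = extract_page_config(sample)
print(config['mods']['pager']['data']['totalPage'])  # -> 100
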