Example #1
 def crawlMonthSales(self, nid, agentip):
     try:
         month_Sales = ""
         nid_url = "https://mdskip.taobao.com/core/initItemDetail.htm?itemId={nid}"
         refer_url = "https://detail.taobao.com/item.htm?id={nid}"
         nid_Url = nid_url.format(nid=nid)
         nid_refer = refer_url.format(nid=nid)
         cookies = "ab=56; UM_distinctid=15e7a46caf311b-0188d989dac01e-5c153d17-144000-15e7a46caf4552; thw=cn; ali_apache_id=11.131.226.119.1505353641652.239211.1; miid=2033888855982094870; l=AllZcEkSLTy0io2vJcWc-ksY6U4zk02Y; uc2=wuf=https%3A%2F%2Ftrade.tmall.com%2Fdetail%2ForderDetail.htm%3Fbiz_order_id%3D70514222507416230%26forward_action%3D; _cc_=WqG3DMC9EA%3D%3D; tg=0; _uab_collina=150780747345339957932139; hng=CN%7Czh-CN%7CCNY%7C156; mt=ci=0_0; _tb_token_=3e0501668eb3b; x=124660357; uc3=sg2=AVMH%2FcTVYAeWJwo98UZ6Ld9wxpMCVcQb0e1XXZrd%2BhE%3D&nk2=&id2=&lg2=; uss=VW9L9wvPPdgBBh%2BJHeH%2BVW8D%2FgmRg%2B6YCnShUPaOH0CFHrL4%2FVpP4v7d; tracknick=; sn=%E4%BE%9D%E4%BF%8A%E6%9C%8D%E9%A5%B0%3A%E8%BF%90%E8%90%A5; skt=efe1ec1051eec814; v=0; cookie2=1ce9fff7464537de3d45fe012006d49d; unb=2077259956; t=1630b104e4d32df897451d6c96642469; _m_h5_tk=37be146862abddcfc955f9ec15ebb25d_1508307778971; _m_h5_tk_enc=7ab9ef3ea063dd2c4cd6d33cf84ea2a4; cna=dNU/EvGIRjsCAQ4XY4PdDkHN; uc1=cookie14=UoTcBzysjIcUbw%3D%3D&lng=zh_CN; isg=Amxsuy9SGdk0Xg26l9-JufebPUpejRva_jrq6MateJe60Qzb7jXgX2Ljh68S; _umdata=85957DF9A4B3B3E8F872A3094256432F0F1549AE1C92C6CCF1E68B982581686F23BFC13A60CCABD1CD43AD3E795C914C9A2685321202E656A2C4B44241C24328"
         # cookies="_tb_token_=f3fe5d65a6591;cookie2=171e5eb92d66332b1d52d9e2730fed33;t=bf64b0d40d912c08dd434661471b2c98;v=0"
         cookie_dict = {k.strip(): v for k, v in (item.split('=', 1) for item in cookies.split(';'))}
         header = {'ip': agentip, 'Referer': nid_refer,
                   "cookies": cookie_dict,
                   'User-Agent': Html_Downloader.GetUserAgent()}
         ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
         if not ok:
             count = 0
             while (count < 11):
                 sleep(2)
                 agentip = Utils.GetMyAgent()
                 header = {'ip': agentip}
                 ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
                 if ok:
                     break
                 count += 1
                 print "获取月销量第{conut}试错".format(count=count)
         if ok:
             month_Sales = str(re.compile("sellCount\":(.*?)(?=\"success\")").findall(
                 response.text)[0]).replace(",", "").strip()
             if not month_Sales:
                 return self.crawlMonthSales(nid, agentip)
             print "获得月销量为:{month_Sales}".format(month_Sales=month_Sales)
             return month_Sales
     except Exception, e:
         logging.info("月销量爬取错误{m}".format(m=e.message))
Example #2
 def run(self):
     agentip = Utils.GetMyAgent()
     agentipjj = Utils.GetMyAgent()
     day = datetime.now().strftime("%Y%m%d")
     search_url = "https://s.taobao.com/search?q={q}&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_{day}&ie=utf8&bcoffset=0&ntoffset=1&p4ppushleft=%2C44&sort=sale-desc&s={s}"
     page_Url = search_url.format(q=self.key_word, day=day, s=0)
     header = {'ip': agentip}
     total = 3
     totalpage = self.crawlTotalpage(page_Url, header)
     total = totalpage if totalpage < total else total
     total = total + 1
     for i in range(1, total):
         t_url = search_url.format(q=self.key_word, day=day, s=(i - 1) * 44)
         try:
             ok, response = Html_Downloader.Download_Html(t_url, {}, header)
             if not ok:
                 count = 0
                 while (count < 4):
                     sleep(2)
                     agentip = Utils.GetMyAgent()
                     header = {'ip': agentip}
                     ok, response = Html_Downloader.Download_Html(
                         t_url, {}, header)
                     if ok:
                         break
                     count += 1
                     if count == 3:
                         header = {}
             if ok:
                 html = etree.HTML(response.text)
                 matchs = html.xpath(
                     "//script[contains(.,'g_page_config')]")
                 if len(matchs) > 0:
                     data = re.compile(
                         "g_page_config=(.*)?;g_srp_loadCss").match(
                             matchs[0].text.replace("\n\n", "\n").replace(
                                 "\n", "").replace(" ", ""))
                     if data.lastindex > 0:
                         data = json.loads(data.group(1).encode('utf-8'))
                         if data.has_key('mods'):
                             self.crawlNid(data, i, agentip, agentipjj)
                     else:
                         print("无法匹配有效的json")
                 else:
                     print("无法匹配到宝贝列表")
             else:
                 logging.info("关键词{p}第{i}页抓取失败".format(p=self.key_word,
                                                          i=i))
         except Exception, e:
             logging.info("关键词{p}第{i}页抓取错误{m}".format(p=self.key_word,
                                                      i=i,
                                                      m=e.message))
Example #3
 def crawlMonthSales(self, nid, agentip):
     try:
         month_Sales = ""
         nid_url = "https://mdskip.taobao.com/core/initItemDetail.htm?itemId={nid}"
         refer_url = "https://detail.taobao.com/item.htm?id={nid}"
         nid_Url = nid_url.format(nid=nid)
         nid_refer = refer_url.format(nid=nid)
         cookies = "ab=12; UM_distinctid=15e7a46caf311b-0188d989dac01e-5c153d17-144000-15e7a46caf4552; thw=cn; ali_apache_id=11.131.226.119.1505353641652.239211.1; miid=2033888855982094870; l=AllZcEkSLTy0io2vJcWc-ksY6U4zk02Y; _cc_=WqG3DMC9EA%3D%3D; tg=0; _uab_collina=150780747345339957932139; cna=dNU/EvGIRjsCAQ4XY4PdDkHN; _tb_token_=3d73497b6b4b1; ali_ab=14.23.99.131.1510570522194.8; hng=CN%7Czh-CN%7CCNY%7C156; mt=ci=0_0; _m_h5_tk=c690a92415e1684e37a0d852f95c4237_1511139636041; _m_h5_tk_enc=03e0735d1910593631f521e6615c4e4b; x=124660357; uc3=sg2=AVMH%2FcTVYAeWJwo98UZ6Ld9wxpMCVcQb0e1XXZrd%2BhE%3D&nk2=&id2=&lg2=; uss=VAmowkFljKPmUhfhc%2B1GBuXNJWn9cLMEX%2FtIkJ5j0tQgoNppvUlaKrn3; tracknick=; sn=%E4%BE%9D%E4%BF%8A%E6%9C%8D%E9%A5%B0%3A%E8%BF%90%E8%90%A5; skt=53a079a2a620057d; v=0; cookie2=17f5415096176ca88c03d1fed693a1d4; unb=2077259956; t=1630b104e4d32df897451d6c96642469; uc1=cookie14=UoTdev2%2BYyNASg%3D%3D&lng=zh_CN; _umdata=85957DF9A4B3B3E8F872A3094256432F0F1549AE1C92C6CCF1E68B982581686F23BFC13A60CCABD1CD43AD3E795C914C5B383FEA6B5C410F78EAF10A11987746; isg=Au_vsoMX6XTuPe7jEO7aMMjafgM5PEijMRuJ0QF8i95lUA9SCWTTBu2ApHYV"
         # cookies="_tb_token_=f3fe5d65a6591;cookie2=171e5eb92d66332b1d52d9e2730fed33;t=bf64b0d40d912c08dd434661471b2c98;v=0"
         cookie_dict = {k.strip(): v for k, v in (item.split('=', 1) for item in cookies.split(';'))}
         header = {'ip': agentip, 'Referer': nid_refer,
                   "cookies": cookie_dict,
                   'User-Agent': Html_Downloader.GetUserAgent()}
         ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
         if not ok:
             count = 0
             while count < 5:
                 sleep(2)
                 agentip = Utils.GetMyAgent()
                 header = {'ip': agentip, 'Referer': nid_refer,
                           'timeout': '5000',
                           "cookies": cookie_dict,
                           'User-Agent': Html_Downloader.GetUserAgent()}
                 ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
                 if ok:
                     break
                 count += 1
                 print "获取月销量第{count}试错".format(count=count)
         if ok and "sellCount\":" not in response.text:
             count = 0
             while count < 5:
                 sleep(2)
                 agentip = Utils.GetMyAgent()
                 header = {'ip': agentip, 'Referer': nid_refer,
                           'timeout': '5000',
                           "cookies": cookie_dict,
                           'User-Agent': Html_Downloader.GetUserAgent()}
                 if count == 4:
                     header = {}
                 ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
                 if ok and "sellCount\":" in response.text:
                     break
                 count += 1
                 print "sellCount不在反馈中,获取月销量第{count}试错".format(count=count)
         if ok and "sellCount\":" in response.text:
             month_Sales = str(re.compile("sellCount\":(.*?)(?=\"success\")").findall(
                 response.text)[0]).replace(",", "").strip()
             print "获得月销量为:{month_Sales}".format(month_Sales=month_Sales)
             return month_Sales
     except Exception, e:
         logging.info("月销量爬取错误{m}".format(m=e.message))
Example #4
 def testurl(self, url, agentIp):
     print("111")
     header = {'ip': agentIp}
     for i in range(1, 200):
         sleep(1)
         ok, response = Html_Downloader.Download_Html(url, {}, header)
         print(ok)
         if not ok:
             ok, response = Html_Downloader.Download_Html(url, {}, {})
         print(url)
         if ok:
             html = etree.HTML(response.text)
     print("1111")
Example #5
def insertLog(crawl_content,message,shop_id,agentIp,shop_url,start_time,shop_name):
    end_time=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    getData="&shop_id="+shop_id+"&ip_addr="+agentIp+"&shop_name="+shop_name+"&crawl_url="+shop_url+"&start_time="+start_time+"&end_time="+end_time+"&crawl_content="+crawl_content+"&error_info="+message
    log="http://192.168.10.198:8080/pdd/CrawlerLogController/SaveCrawlerLog?"
    logZS="http://syjcapi.da-mai.com/CrawlerLogController/SaveCrawlerLog?"
    logUU=log+getData
    logUUZS=logZS+getData
    ok, result = Html_Downloader.Download_Html(logUU,{},{})
    if ok:
        result_json = json.loads(result.content)
        #result_ok = bool(result_json['status'])
    ok, result = Html_Downloader.Download_Html(logUUZS,{},{})
    if ok:
        result_json = json.loads(result.content)
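insertLog above concatenates its query string by hand, so values such as shop_name, start_time and end_time (which contain spaces, colons and Chinese characters) are sent unescaped. A safer variant, sketched below on the assumption that the same two SaveCrawlerLog endpoints accept URL-encoded parameters, builds the query with urllib.urlencode (Python 2); Html_Downloader and json are assumed imported as in the example, and the name insertLogEncoded is hypothetical.

import time
import urllib

def insertLogEncoded(crawl_content, message, shop_id, agentIp, shop_url, start_time, shop_name):
    # Sketch only: same endpoints as insertLog above, but with URL-encoded parameters.
    end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    getData = urllib.urlencode({
        'shop_id': shop_id, 'ip_addr': agentIp, 'shop_name': shop_name,
        'crawl_url': shop_url, 'start_time': start_time, 'end_time': end_time,
        'crawl_content': crawl_content, 'error_info': message,
    })
    for base in ("http://192.168.10.198:8080/pdd/CrawlerLogController/SaveCrawlerLog?",
                 "http://syjcapi.da-mai.com/CrawlerLogController/SaveCrawlerLog?"):
        ok, result = Html_Downloader.Download_Html(base + getData, {}, {})
        if ok:
            json.loads(result.content)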
Example #6
 def crawl_shop_all_item(self):
     agentIp = Utils.GetAgentIp()
     header = {'ip': agentIp}
     shop_id = -1
     # agentIp=None
     # agentIp = '120.24.171.107:16816'
     url = "{shop_url}/search.htm?&search=y&orderType=hotsell_desc&scene=taobao_shop".format(
         shop_url=self.shop_url)
     url = self.shop_url
     print url
     # data=urllib2.urlopen(url).readlines()
     # soup=BeautifulSoup(''.join(data), fromEncoding='utf8')
     # primary_consumer = soup.find(id="bd")
     ok, response = Html_Downloader.Download_Html(url, {}, header)
     if ok:
         soup = BeautifulSoup(response.text, fromEncoding='utf8')
         nav_header = soup.find(id="J_GlobalNav")
         div = nav_header.text if nav_header is not None else ""
     # print(ok)
     if ok:
         html = etree.HTML(response.text.encode('utf-8'))
         if html is not None and html.xpath("//header[@id='mp-header']"):
             if "shopId" in html.xpath("//header[@id='mp-header']")[0].get(
                     "mdv-cfg").split(':')[0]:
                 shop_id = html.xpath("//header[@id='mp-header']")[0].get(
                     "mdv-cfg").split(':')[1]
                 shop_id = shop_id.replace("\'}", "").replace("\'", "")
     url = "{shop_url}/shop/shop_auction_search.do?sort=d&p=1&page_size=90&from=h5&shop_id={shop_id}&ajson=1&_tm_source=tmallsearch&orderType=hotsell_desc".format(
         shop_url=self.shop_url, shop_id=shop_id)
     print(url)
     # driver = PhantomDriver(2, agentIp, 60)
     # driver.download_no_quit(self.shop_url)
     # sleep(1)
     # for i in range(20):
     #     result = driver.download_no_quit(url)
     #     sleep(3)
     # source = result['page_source']
     # driver.return_driver().quit()
     # if result['ok']:
     #     html = etree.HTML(source)
     ok, response = Html_Downloader.Download_Html(url, {}, header)
     print(ok)
     if not ok:
         ok, response = Html_Downloader.Download_Html(url, {}, {})
     print(url)
     if ok:
         data = json.loads(response.content)  # the ajson=1 endpoint returns JSON directly
         print data
Example #7
def get_total_page(self, url_params, agent_ip, cookie_dict, start, end):
    total_page = 0
    try:
        ok, result = Html_Downloader.Download_Html(self.url, {
            item.split('=')[0]: item.split('=')[1]
            for item in url_params.split('&')
        }, {
            "cookies":
            cookie_dict,
            "ip":
            agent_ip,
            "Referer":
            "https://trade.taobao.com/trade/itemlist/list_sold_items.htm?action=itemlist/SoldQueryAction&event_submit_do_query=1&auctionStatus=SUCCESS&tabCode=success"
        },
                                                   post=True)
        if ok:
            order_json = json.loads(
                result.text.replace("\n", "").replace("\r", ""))
            if order_json.has_key('page'):
                total_page = int(order_json['page']['totalNumber'])
        else:
            print(result)
    except:
        pass
    return total_page
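Several examples here (this one, #11, #14, #22 and #23) turn a query string such as a=1&b=2 into the POST dict with item.split('=')[0]: item.split('=')[1], which silently truncates any value that itself contains '='. A small sketch of a safer conversion is shown below; the helper name params_to_dict is hypothetical, and it splits each pair only on the first '='.

def params_to_dict(url_params):
    # Split each pair only on the first '=', so values containing '=' survive intact.
    return dict(item.split('=', 1) for item in url_params.split('&') if '=' in item)

It can replace the inline dict comprehension, e.g. Html_Downloader.Download_Html(self.url, params_to_dict(url_params), header, post=True).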
Example #8
 def process_request(self, url, data):
     result_ok = False
     ok, result = Html_Downloader.Download_Html(url, data, {'timeout':60}, post=True)
     if ok:
         result_json = json.loads(result.content)
         result_ok = bool(result_json['flag'])
     return result_ok
Example #9
def process_request(url, data):
    result_ok = False
    ok, result = Html_Downloader.Download_Html(url, data, {'timeout': 10}, post=True)
    if ok:
        result_json = json.loads(result.content)
        result_ok = bool(result_json['flag'])
        logging.info("数据存储成功{result_ok}".format(result_ok=result_ok))
    return result_ok
Example #10
def crawl_keyWord(shop_id_url):
    keyword = ""
    try:
        ok, result = Html_Downloader.Download_Html(shop_id_url, {}, {})
        if ok:
            result_json = json.loads(result.content)
            result_ok = bool(result_json['status'])
            keyword = result_json['data'][0]['keyword'].encode('utf-8').strip()
        else:
            print('调用接口获取关键词失败')
    except Exception, e:
        logging.info("获取关键词失败{e}".format(e=e.message))
    return keyword
Example #11
 def process_request(self, url, data):
     result_ok = False
     ok, result = Html_Downloader.Download_Html(url, {
         item.split('=')[0]: item.split('=')[1]
         for item in data.split('&')
     }, {},
                                                post=True)
     if ok:
         result_json = json.loads(result.content)
         result_ok = bool(result_json['flag'])
         self.log_and_print(result_json['message'])
     else:
         self.log_and_print(result)
     return result_ok
Example #12
 def crawlTotalpage(self, search_url, header):
     totalpage = 0
     try:
         ok, response = Html_Downloader.Download_Html(
             search_url, {}, header)
         if not ok:
             count = 0
             while (count < 4):
                 sleep(2)
                 agentip = Utils.GetMyAgent()
                 header = {'ip': agentip}
                 ok, response = Html_Downloader.Download_Html(
                     search_url, {}, header)
                 if ok:
                     break
                 count += 1
                 if count == 3:
                     header = {}
         if ok:
             html = etree.HTML(response.text)
             matchs = html.xpath("//script[contains(.,'g_page_config')]")
             if len(matchs) > 0:
                 data = re.compile(
                     "g_page_config=(.*)?;g_srp_loadCss").match(
                         matchs[0].text.replace("\n\n", "\n").replace(
                             "\n", "").replace(" ", ""))
                 if data.lastindex > 0:
                     data = json.loads(data.group(1).encode('utf-8'))
                     if data.has_key('mods'):
                         totalpage = data['mods']['pager']['data'][
                             'totalPage']
                 else:
                     print("无法匹配有效的json")
             else:
                 print("无法匹配到宝贝列表")
     except Exception, e:
         logging.info("关键词{p}抓取总页数错误{m}".format(p=self.key_word, m=e.message))
     return totalpage
Example #13
 def crawl_yxl(self,auctionId,agentIp):
     yxl=-1
     count =0
     while(count<20):
         agentIp=Utils.GetMyAgent()
         userAgent=Html_Downloader.GetUserAgent()
         header = {'ip': agentIp,'user-agent':userAgent}
         text_detail_url="https://detail.m.tmall.com/item.htm?spm=a320p.7692363.0.0&id={auctionId}".format(auctionId=auctionId)
         ok, response = Html_Downloader.Download_Html(text_detail_url,{}, header)
         if ok:
             matchs = re.compile("sellCount\":(.*?)(?=showShopActivitySize)").findall(response.text)
             if len(matchs) > 0:
                 yxl = matchs[0].encode('utf-8').replace(",\"", "")
                 break
         sleep(3)
         count+=1
     return  yxl
Example #14
    def get_sales_cnt(self, start, end):
        url = GET_SALES_ORDER_CNT_API
        trade_id = []
        post_data = "account_name=%sstart=%s&end=%s" % (self.account_name,
                                                        start, end)

        ok, result = Html_Downloader.Download_Html(url, {
            item.split('=')[0]: item.split('=')[1]
            for item in post_data.split('&')
        }, {},
                                                   post=True)
        if ok:
            result_json = json.loads(result.content)
            if bool(result_json['flag']):
                trade_id = set(result_json['data'])
            self.log_and_print(result_json['message'])
        else:
            self.log_and_print(result)
            self.log_and_print("获取id列表失败")
        return trade_id
Example #15
    def crawl_shop_all_item(self, url):
        agentIp = Utils.GetMyAgent()
        shop_id = self.shop_id
        shop_name = self.shop_name
        userAgent = Html_Downloader.GetUserAgent()
        header = {'ip': agentIp, 'user-agent': userAgent}
        text_detail_url = url
        ok, response = Html_Downloader.Download_Html(text_detail_url, {},
                                                     header)
        if ok:
            jsonArray = json.loads(response.content)  # 获取解析的json
            total_page = jsonArray.get("total_page")
            total_results = jsonArray.get("total_results")
            page_size = jsonArray.get("page_size")
            jsonResult = jsonArray.get("items")
            for item in jsonResult:
                shop_item = {}
                item_id = str(item.get("item_id")).strip()
                shop_item['item_id'] = item_id
                shop_item['title'] = item.get('title').encode('utf-8')
                shop_item['picUrl'] = "http:" + item.get('img')
                #现在的销售价
                shop_item['salePrice'] = item.get('price')
                shop_item['totalSoldQuantity'] = item.get('totalSoldQuantity')
                shop_item['crawl_url'] = item.get('url')
                shop_item['crawl_time'] = long(time.time())
                #接口url 获取宝贝种类(颜色分类)不需要这个接口了,下面那个接口就可以得到颜色分类等信息
                '''
                test_Url="http://d.da-mai.com/index.php?r=itemApi/getItemInfoByItemId&item_id="+item_id
                ok, response = Html_Downloader.Download_Html(test_Url,{}, header)
                if ok:
                   jsonItems=json.loads(response.content)  # 获取解析的json
                '''
                #接口url 获取SKU详细信息()
                shop_item['quantity'] = 0
                getSKU_Url = "http://yj.da-mai.com/index.php?r=itemskus/getSkus&fields=*&num_iids={item_id}".format(
                    item_id=item_id)
                ok, response = Html_Downloader.Download_Html(
                    getSKU_Url, {}, header)
                if ok:
                    jsonItems = json.loads(response.content)
                    total_data = jsonItems.get("data")
                    for date in total_data:
                        quantity = date.get("quantity")
                        shop_item[
                            'quantity'] = shop_item['quantity'] + quantity
                #获取宝贝详情页信息 (第二屏信息)
                getDetail_Url = "http://d.da-mai.com/index.php?r=itemApi/getItemInfoByItemId&item_id={item_id}".format(
                    item_id=item_id)
                ok, response_detail = Html_Downloader.Download_Html(
                    getDetail_Url, {}, header)
                if ok:
                    shop_item['attribute'] = []
                    #jsonDetails = response_detail['data']['data']
                    jsonDetails = json.loads(response_detail.content)
                    properties = jsonDetails['data']['data']['properties']
                    stringName = ""
                    for attri in properties:
                        #string = "{name}:{value}&&||".format(name=attri.get('name'),value=attri.get('value'))
                        name = attri.get('name')
                        value = attri.get('value')
                        if name in stringName:
                            #shop_item['attribute'].append(name)
                            string = "{value} ".format(value=value)
                            shop_item['attribute'].append(string)
                        if name not in stringName:
                            string = "{name}:{value}&&||".format(name=name,
                                                                 value=value)
                            shop_item['attribute'].append(string)
                            stringName = name + stringName

        for page in range(1, int(total_page) + 1):
            #重写json的URL并完成回调函数
            ###!!!!!注意这里店铺的url写死了,应该传参进来!!!!
            getlist_url="https://yiqianny.m.tmall.com/shop/shop_auction_search.do?ajson=1&_tm_source=tmallsearch&" \
                    "spm=a320p.7692171.0.0&sort=d&p={page}&page_size=24&from=h5".format(page=page)
            p = multiprocessing.Process(
                target=self.crawl_shop_all_item, args=(getlist_url, ))
            p.start()
            logging.info("开始多进程爬虫,爬取的json列表为:{url}".format(url=getlist_url))
Example #16
 def parse_items(self,jsonArray, shop_id,agentIp):
     shop_items=[]
     # agentIp=None
     header = {'ip': agentIp}
     for item in jsonArray:
          shop_item = {}
          shop_item['shop_id']=shop_id
          auctionId=item.get('auctionId')
          shop_item['item_id']=auctionId
          shop_item['title']= item.get('title')
          shop_item['picUrl']="http:"+item.get('picUrl')
          # shop_item['picUrl']=re.compile("/([^/]*)(?=_)").findall(item.get('picUrl'))[0]
          shop_item['salePrice']= item.get('salePrice')
          shop_item['reservePrice']= item.get('reservePrice')
          shop_item['quantity']= item.get('quantity')
          shop_item['totalSoldQuantity']= item.get('totalSoldQuantity')
          #获取链接里面的详情
          t_detail_url="https://item.taobao.com/item.htm?spm=a1z10.3-c-s.w4002-14766145001.18.6584ae82X93XhC&id={auctionId}".format(auctionId=auctionId)
          shop_item['crawl_url']=t_detail_url
          print(t_detail_url)
          shop_item['crawl_time'] = long(time.time())
          sold=item.get('sold')
          if "tmall" in self.shop_url:
              sold=""
              sold=self.crawl_yxl(auctionId,agentIp)
          #天猫的月销量另外获取
          shop_item['sold']=sold
          try:
                ok, response = Html_Downloader.Download_Html(t_detail_url,{}, header)
                if not ok:
                    count =0
                    while(count<4):
                         sleep(2)
                         agentip = Utils.GetMyAgent()
                         header = {'ip': agentip}
                         ok, response = Html_Downloader.Download_Html(t_detail_url,{},header)
                         if ok and "category=item" in response.text:
                             break
                         count+=1
                         if count==3:
                             header={}
                if ok and  "category=item" not in response.text:
                    count =0
                    while(count<4):
                         sleep(2)
                         agentip = Utils.GetMyAgent()
                         header = {'ip': agentip}
                         ok, response = Html_Downloader.Download_Html(t_detail_url,{},header)
                         if ok and "category=item" in response.text:
                             break
                         count+=1
                         if count==3:
                             header={}
                if ok and "category=item" in response.text:
                   html = etree.HTML(response.text)
                   # shop_id = ""
                   category_id= re.compile("item%5f(.*?)(?=&)").findall(response.text)[0]
                   shop_item['category_id']=category_id
                   if html.xpath("//dl[contains(@class,'tb-prop')]"):
                       for prop in html.xpath("//dl[contains(@class,'tb-prop')]"):
                             if not prop in html.xpath("//dl[contains(@class,'tb-hidden')]"):
                                 prop_value_id=[]
                                 prop_name = prop.xpath(".//dt/text()")[0].encode('utf-8')
                                 for value in prop.xpath(".//dd/ul/li"):
                                        sub_value_id= []
                                        sku_id = value.get('data-value')
                                        sub_value_id.append(sku_id)
                                        if value.xpath('./a/span/text()'):
                                            sku_name = value.xpath('./a/span/text()')[0].encode('utf-8')
                                            sub_value_id.append(sku_name)
                                            # prop_value_id.append(";".join(sub_value_id))
                                        if value.xpath('./a')[0].get('style') and re.compile("/([^/]*)(?=_!!|_M2)").findall(
                                            value.xpath('./a')[0].get('style')):
                                            sku_img_url=re.compile("/([^/]*)(?=_!!|_M2)").findall(value.xpath('./a')[0].get('style'))[0]
                                            sub_value_id.append(sku_img_url)
                                        prop_value_id.append(";".join(sub_value_id))
                             # shop_item[prop_name] ="&&||".join(prop_value_id)
                                 shop_item[prop_name] =prop_value_id
                   if html.xpath("//ul[@id='J_UlThumb']"):
                         stype_img_id=[]
                         if html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li/div"):
                             for value1 in html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li/div"):
                                      if value1.xpath('./a')[0].xpath('./img')[0].get('data-src') and re.compile("/([^/]*)(?=_!!|_M2)").findall(
                                            value1.xpath('./a')[0].xpath('./img')[0].get('data-src')):
                                            sku_img_id=re.compile("/([^/]*)(?=_!!|_M2)").findall(value1.xpath('./a')[0].xpath('./img')[0].get('data-src'))[0]
                                            stype_img_id.append(sku_img_id)
                         elif html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li"):
                             for value1 in html.xpath("//ul[@id='J_UlThumb']")[0].xpath(".//li"):
                                      if value1.xpath('./a')[0].xpath('./img')[0].get('src') and re.compile("/([^/]*)(?=_!!|_M2)").findall(
                                            value1.xpath('./a')[0].xpath('./img')[0].get('src')):
                                            sku_img_id=re.compile("/([^/]*)(?=_!!|_M2)").findall(value1.xpath('./a')[0].xpath('./img')[0].get('src'))[0]
                                            stype_img_id.append(sku_img_id)
                         shop_item["img_attr"]="&&||".join(stype_img_id)
                   if html.xpath("//ul[@id='J_AttrUL']"):
                       styleliList=[]
                       for styleli in html.xpath("//ul[@id='J_AttrUL']")[0].xpath(".//li"):
                            if styleli.xpath('./text()'):
                                 styleliText= styleli.xpath('./text()')[0].encode('utf-8').strip()
                                 styleliList.append(styleliText)
                   elif html.xpath("//div[@id='attributes']"):
                       styleliList=[]
                       for styleli in html.xpath("//div[@id='attributes']")[0].xpath(".//ul/li"):
                            if styleli.xpath('./text()'):
                                 styleliText= styleli.xpath('./text()')[0].encode('utf-8').strip()
                                 styleliList.append(styleliText)
                   shop_item["attribute"]="&&||".join(styleliList)
          except Exception, e:
              logging.info("----详情抓取错误----".format(e=e.message))
           shop_items.append(shop_item)
      return shop_items
 def crawl_shop_all_item(self):
     agentIp = Utils.GetMyAgent()
     shop_id = self.shop_id
     shop_name = self.shop_name
     userAgent = Html_Downloader.GetUserAgent()
     header = {'ip': agentIp, 'user-agent': userAgent}
     test_detail_url = "{shop_url}shop/shop_auction_search.do?ajson=1&_tm_source=tmallsearch&spm=a320p.7692171.0.0&sort" \
                       "=d&p={page}&page_size={page_size}&from=h5".format(shop_url=self.shop_url, page_size=1,
                                                                          page=1)
     test_detail_url = test_detail_url.replace(".tmall.com", ".m.tmall.com")
     try:
         ok, response = Html_Downloader.Download_Html(
             test_detail_url, {}, header)
         if not ok:
             count = 0
             while (count < 4):
                 sleep(2)
                 agentip = Utils.GetMyAgent()
                 header = {'ip': agentip}
                 ok, response = Html_Downloader.Download_Html(
                     test_detail_url, {}, header)
                 if ok:
                     break
                 count += 1
                 if count == 3:
                     header = {}
         if ok:
             jsonArray = json.loads(response.content)  # 获取解析的json
             total_page = jsonArray.get("total_page")
             total_results = jsonArray.get("total_results")
             page_size = jsonArray.get("page_size")
             logging.info("shopname:" + shop_name + " total_page:" +
                          total_page + " total_results:" + total_results +
                          " page_size:" + page_size)
             print "total_page:" + total_page + "total_results:" + total_results + "page_size:" + page_size
             for i in range(int(total_page)):
                 print i + 1
                 test_detail_url = "{shop_url}shop/shop_auction_search.do?ajson=1&_tm_source=tmallsearch&spm=a320p.7692171.0.0&sort=d&p={page}&page_size={page_size}&from=h5".format(
                     shop_url=self.shop_url,
                     page_size=page_size,
                     page=i + 1)
                 test_detail_url = test_detail_url.replace(
                     ".tmall.com", ".m.tmall.com")
                 '''
                 if int(total_page)==(i+1):
                     lastCount=int(total_results)-i*int(page_size)
                     ok, response = Html_Downloader.Download_Html(test_detail_url,{}, header)
                     if not ok:
                         count =0
                         while(count<11):
                             sleep(2)
                             agentip = Utils.GetMyAgent()
                             header = {'ip': agentip}
                             ok, response = Html_Downloader.Download_Html(test_detail_url,{},header)
                             if ok  and "price" in response.text and lastCount-response.text.count("price")<2:
                                 break
                             count+=1
                             if count==10:
                                 header={}
                     print  response.text.count('price')
                     if ok and  "price" not in response.text:
                        print "111"
                        count =0
                        while(count<11):
                             sleep(2)
                             agentip = Utils.GetMyAgent()
                             header = {'ip': agentip}
                             ok, response = Html_Downloader.Download_Html(test_detail_url,{},header)
                             if ok and "price" in response.text and lastCount-response.text.count("price")<2:
                                 break
                             count+=1
                             if count==10:
                                 header={}
                     if ok and  lastCount-response.text.count("price")>2:
                        while(count<11):
                             sleep(2)
                             agentip = Utils.GetMyAgent()
                             header = {'ip': agentip}
                             ok, response = Html_Downloader.Download_Html(test_detail_url,{},header)
                             if ok and "price" in response.text and lastCount-response.text.count("price")<2:
                                 break
                             count+=1
                             if count==10:
                                 header={}
                     if ok  and lastCount-response.text.count("price")<2:
                         logging.info("成功获取price字符串并开始解析")
                         self.parse_items(response.content,shop_id,agentIp,shop_name,userAgent)
                 else:
                     '''
                 ok, response = Html_Downloader.Download_Html(
                     test_detail_url, {}, header)
                 if not ok:
                     count = 0
                     while (count < 11):
                         sleep(2)
                         agentip = Utils.GetMyAgent()
                         header = {'ip': agentip}
                         ok, response = Html_Downloader.Download_Html(
                             test_detail_url, {}, header)
                         if ok:
                             break
                         count += 1
                         if count == 10:
                             header = {}
                 if ok:
                     # logging.info("成功获取price字符串并开始解析")
                     self.parse_items(response.content, shop_id, agentIp,
                                      shop_name, userAgent)
     except Exception, e:
         logging.error("抓取店铺:{shop_name}失败,店铺id:{shop_id},错误内容{m}".format(
             shop_name=shop_name,
             shop_id=shop_id,
             m=e.message,
         ))
         crawl_content = "抓取列表页有错"
         message = e.message
         start_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                    time.localtime(time.time()))
         insertLog(crawl_content, message, shop_id, agentIp,
                   test_detail_url, start_time, shop_name)
    def parse_items(self, content, shop_id, agentIp, shop_name, userAgent):
        try:
            # start_time2=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            jsonArray = json.loads(content)
            jsonResult = jsonArray.get("items")
            shop_items = []
            header = {'ip': agentIp, 'user-agent': userAgent}
            print "开始解析列表json数据"
            for item in jsonResult:
                shop_item = {}
                shop_item['shop_id'] = str(shop_id)
                shop_item['shop_name'] = shop_name
                item_id = str(item.get("item_id")).strip()
                shop_item['item_id'] = item_id
                shop_item['title'] = item.get('title').encode('utf-8')
                shop_item['picUrl'] = "https:" + item.get('img')
                # print  item.get('price')
                # 现在的销售价
                # shop_item['salePrice'] = item.get('price')
                shop_item['totalSoldQuantity'] = str(
                    item.get('totalSoldQuantity'))
                crawl_url = "https:" + item.get('url')
                shop_item['crawl_url'] = crawl_url.replace(
                    ".m.tmall.com", ".tmall.com")
                shop_item['crawl_time'] = long(time.time())
                # 获取quantity接口url
                # 获取Items接口url
                category_id = ""
                category_id_Url = "http://d.da-mai.com/index.php?r=itemApi/getItemInfoByItemId&item_id=" + item_id
                ok, response = Html_Downloader.Download_Html(
                    category_id_Url, {}, header)
                if not ok:
                    count = 0
                    while count < 4:
                        sleep(1)
                        agentip = Utils.GetMyAgent()
                        header = {'ip': agentip}
                        if count == 3:
                            header = {}
                        ok, response = Html_Downloader.Download_Html(
                            category_id_Url, {}, header)
                        if ok:
                            break
                        count += 1

                if ok:
                    jsonItems = json.loads(response.content)
                    category_id = jsonItems['data']['data']['cid']
                total_quantity = 0
                quantity_Url = "http://yj.da-mai.com/index.php?r=itemskus/getSkus&fields=*&num_iids={item_id}".format(
                    item_id=item_id)
                ok, response = Html_Downloader.Download_Html(
                    quantity_Url, {}, header)
                if not ok:
                    count = 0
                    while (count < 4):
                        sleep(2)
                        agentip = Utils.GetMyAgent()
                        header = {'ip': agentip}
                        ok, response = Html_Downloader.Download_Html(
                            quantity_Url, {}, header)
                        if ok and "quantity" in response.text:
                            break
                        count += 1
                        if count == 3:
                            header = {}
                if ok and "quantity" not in response.text:
                    count = 0
                    while (count < 4):
                        sleep(2)
                        agentip = Utils.GetMyAgent()
                        header = {'ip': agentip}
                        ok, response = Html_Downloader.Download_Html(
                            quantity_Url, {}, header)
                        if ok and "quantity" in response.text:
                            break
                        count += 1
                        if count == 3:
                            header = {}
                if ok and "quantity" in response.text:
                    print "成功获取sku的json字符串并开始解析"
                    jsonItems = json.loads(response.content)  # 获取解析的json
                    total_data = jsonItems.get("data")
                    for date in total_data:
                        quantity = date.get("quantity")
                        total_quantity = total_quantity + quantity
                shop_item['category_id'] = str(category_id)
                shop_item['quantity'] = str(total_quantity)
                agentip = Utils.GetMyAgent()
                shop_item['month_Sales'] = self.crawlMonthSales(
                    item_id, agentip)
                shop_items.append(shop_item)
            post_data = {'data': json.dumps(shop_items)}
            if not self.process_request(SAVE__INSERT_API, post_data):
                sleep(3)
                self.process_request(SAVE__INSERT_API, post_data)
            # if not self.process_request(SAVE__INSERT_API_ZS, post_data):
            #     sleep(3)
            #     self.process_request(SAVE__INSERT_API_ZS, post_data)
        except Exception, e:
            logging.info(
                "抓取店铺:{shop_name}失败,抓取店铺链接:{shop_url},店铺id:{shop_id},错误内容{m}".
                format(shop_name=shop_name, shop_url=self.shop_url,
                       shop_id=shop_id, m=e.message))
            crawl_content = "解析接口数据有误"
            message = e.message
            start_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(time.time()))
            insertLog(crawl_content, message, shop_id, agentIp, "", start_time,
                      shop_name)
 def crawlMonthSales(self, nid, agentip):
     try:
         month_Sales = ""
         nid_url = "https://mdskip.taobao.com/core/initItemDetail.htm?itemId={nid}"
         refer_url = "https://detail.taobao.com/item.htm?id={nid}"
         nid_Url = nid_url.format(nid=nid)
         nid_refer = refer_url.format(nid=nid)
         cookies = "x=__ll%3D-1%26_ato%3D0; l=AhERSU92PmRba9QUgSCkQMF6oRaqOoXt; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; cna=dNU/EvGIRjsCAQ4XY4PdDkHN; _m_h5_tk=7d8d6e65e5c676a6d0a69c26f7436ea1_1510363282671; _m_h5_tk_enc=e32129060738b7ce01e9114c9bec037f; sm4=440100; hng=CN%7Czh-CN%7CCNY%7C156; uc1=cookie14=UoTde95xncLyFQ%3D%3D&lng=zh_CN; uc3=sg2=Vq0THzNyGHIH22DuvMx9ZEwXL5qc2kn7REWHdois6v0%3D&nk2=&id2=&lg2=; uss=AQDPJiEXAu47o41b5k%2BKpKRT3Ckpz9nqnJX2F%2F7kZG6ttuI82ZnQa7ZL; t=1630b104e4d32df897451d6c96642469; unb=2607292494; sn=sitiselected%E6%97%97%E8%88%B0%E5%BA%97%3A%E5%A4%A7%E9%BA%A6; _tb_token_=eef7bd7b7abd6; cookie2=23bb087c638814ce8a8e329ead5332d4; isg=ApqaMZmelJirXxuDoGSRqtW160B8YxWwfLxcMqQTRi34FzpRjFtutWDlkdVw"
         # cookies="_tb_token_=f3fe5d65a6591;cookie2=171e5eb92d66332b1d52d9e2730fed33;t=bf64b0d40d912c08dd434661471b2c98;v=0"
         cookie_dict = {
             k.strip(): v
             for k, v in (item.split('=', 1) for item in cookies.split(';'))
         }
         header = {
             'ip': agentip,
             'Referer': nid_refer,
             "cookies": cookie_dict,
             'User-Agent': Html_Downloader.GetUserAgent()
         }
         ok, response = Html_Downloader.Download_Html(nid_Url, {}, header)
         if not ok:
             count = 0
             while count < 5:
                 sleep(2)
                 agentip = Utils.GetMyAgent()
                 header = {
                     'ip': agentip,
                     'Referer': nid_refer,
                     'timeout': '5000',
                     "cookies": cookie_dict,
                     'User-Agent': Html_Downloader.GetUserAgent()
                 }
                 ok, response = Html_Downloader.Download_Html(
                     nid_Url, {}, header)
                 if ok:
                     break
                 count += 1
                 print "获取月销量第{count}试错".format(count=count)
         if ok and "sellCount\":" not in response.text:
             count = 0
             while count < 10:
                 sleep(2)
                 agentip = Utils.GetMyAgent()
                 header = {
                     'ip': agentip,
                     'Referer': nid_refer,
                     'timeout': '5000',
                     "cookies": cookie_dict,
                     'User-Agent': Html_Downloader.GetUserAgent()
                 }
                 if count == 9:
                     header = {}
                 ok, response = Html_Downloader.Download_Html(
                     nid_Url, {}, header)
                 if ok and "sellCount\":" in response.text:
                     break
                 count += 1
                 print "sellCount不在反馈中,获取月销量第{count}试错".format(count=count)
         if ok and "sellCount\":" in response.text:
             month_Sales = str(
                 re.compile("sellCount\":(.*?)(?=\"success\")").findall(
                     response.text)[0]).replace(",",
                                                "").replace(",",
                                                            "").strip()
             print "获得月销量为:{month_Sales}".format(month_Sales=month_Sales)
             return month_Sales
     except Exception, e:
         logging.info("月销量爬取错误{m}".format(m=e.message))
Example #20
# -*- coding: utf-8 -*-
# 用于抓取指定店铺所有宝贝及宝贝规格属性
#间隔时间小于一天拿item_id判重,间隔时间大于一天直接入库
from config import PROJECT_PATH, SEPARATOR
from lxml import etree
from utils.utils import Utils
from utils.html_downloader import Html_Downloader
import json

fq = open("d:\\1.txt", 'r')
json_str = fq.read()
fq.close()
post_data = {'data': json_str}
url = 'http://192.168.12.91:8080/pdd/competeShopRearController/saveCompeteShopInfo'
ok, result = Html_Downloader.Download_Html(url,
                                           post_data, {'timeout': 60},
                                           post=True)
print result
 def parse_items(self, html, shop_id, agentIp):
     shop_items = []
     if html.xpath("//div[contains(@class,'shop-hesper-bd')]"):
         for item in html.xpath("//div[contains(@class,'shop-hesper-bd')]"
                                )[0].xpath(".//dl"):
             shop_item = {}
             shop_item['shop_id'] = shop_id
             item_id = item.get('data-id').replace("\"",
                                                   "").replace("\\", "")
             shop_item['item_id'] = item_id
             shop_item['item_title'] = item.xpath(
                 './dd[1]/a/text()')[0].encode('utf-8').strip()
             shop_item['item_pic'] = item.xpath(
                 './dt/a/img/@src')[0].strip().replace("\\", "").replace(
                     "//", 'https://')
             shop_item['item_sales'] = int(
                 item.xpath(".//*[contains(@class,'sale-num')]/text()")
                 [0].encode('utf-8'))
             shop_item['item_old_price'] = float(
                 item.xpath(".//*[contains(@class,'s-price')]/text()")
                 [0].encode('utf-8')) if item.xpath(
                     ".//*[contains(@class,'s-price')]/text()") else None
             shop_item['item_new_price'] = float(
                 item.xpath(".//*[contains(@class,'c-price')]/text()")
                 [0].encode('utf-8'))
             shop_item['item_comment'] = int(
                 item.xpath(".//*[contains(@class,'rates')]")[0].xpath(
                     ".//span/text()")[0].encode('utf-8')) if item.xpath(
                         ".//*[contains(@class,'rates')]") else None
             shop_item['crawl_time'] = long(time.time())
             #获取链接里面的详情
             detail_url = "https://item.taobao.com/item.htm?spm=a1z10.3-c-s.w4002-14766145001.18.6584ae82X93XhC&id={item_id}"
             t_detail_url = detail_url.format(item_id=item_id)
             shop_item['crawl_url'] = t_detail_url
             header = {'ip': agentIp}
             try:
                 ok, response = Html_Downloader.Download_Html(
                     t_detail_url, {}, header)
                 print(ok)
                 if not ok:
                     ok, response = Html_Downloader.Download_Html(
                         t_detail_url, {}, {})
                 print(t_detail_url)
                 if ok:
                     html = etree.HTML(response.text)
                     # shop_id = ""
                     category_id = re.compile("item%5f(.*?)(?=&)").findall(
                         response.text)[0]
                     shop_item['category_id'] = category_id
                     # if html.xpath("//meta[@name='microscope-data']"):
                     # for meta in html.xpath("//meta[@name='microscope-data']")[0].get('content').split(';'):
                     # if 'shopid' in meta.lower():
                     # shop_id = meta.split("=")[1]
                     if html.xpath("//dl[contains(@class,'tb-prop')]"):
                         for prop in html.xpath(
                                 "//dl[contains(@class,'tb-prop')]"):
                             if not prop in html.xpath(
                                     "//dl[contains(@class,'tb-hidden')]"):
                                 prop_value_id = []
                                 prop_name = prop.xpath(
                                     ".//dt/text()")[0].encode('utf-8')
                                 for value in prop.xpath(".//dd/ul/li"):
                                     sub_value_id = []
                                     sku_id = value.get('data-value')
                                     sub_value_id.append(sku_id)
                                     if value.xpath('./a/span/text()'):
                                         sku_name = value.xpath(
                                             './a/span/text()')[0].encode(
                                                 'utf-8')
                                         sub_value_id.append(sku_name)
                                     prop_value_id.append(
                                         ";".join(sub_value_id))
                             shop_item[prop_name] = "&&||".join(
                                 prop_value_id)
                     if html.xpath("//ul[@id='J_UlThumb']"):
                         stype_img_id = []
                         if html.xpath("//ul[@id='J_UlThumb']")[0].xpath(
                                 ".//li/div"):
                             for value1 in html.xpath(
                                     "//ul[@id='J_UlThumb']")[0].xpath(
                                         ".//li/div"):
                                 if value1.xpath('./a')[0].xpath('./img')[
                                         0].get('data-src') and re.compile(
                                             "/([^/]*)(?=!!)").findall(
                                                 value1.xpath('./a')
                                                 [0].xpath('./img')[0].get(
                                                     'data-src')):
                                     sku_img_id = re.compile(
                                         "/([^/]*)(?=!!)").findall(
                                             value1.xpath('./a')[0].xpath(
                                                 './img')[0].get(
                                                     'data-src'))[0]
                                     stype_img_id.append(sku_img_id)
                         elif html.xpath("//ul[@id='J_UlThumb']")[0].xpath(
                                 ".//li"):
                             for value1 in html.xpath(
                                     "//ul[@id='J_UlThumb']")[0].xpath(
                                         ".//li"):
                                 if value1.xpath('./a')[0].xpath(
                                         './img'
                                 )[0].get('src') and re.compile(
                                         "/([^/]*)(?=!!)").findall(
                                             value1.xpath('./a')[0].xpath(
                                                 './img')[0].get('src')):
                                     sku_img_id = re.compile(
                                         "/([^/]*)(?=!!)").findall(
                                             value1.xpath('./a')[0].xpath(
                                                 './img')[0].get('src'))[0]
                                     stype_img_id.append(sku_img_id)
                         shop_item["图片属性"] = "&&||".join(stype_img_id)
                     if html.xpath("//ul[@id='J_AttrUL']"):
                         styleliList = []
                         for styleli in html.xpath(
                                 "//ul[@id='J_AttrUL']")[0].xpath(".//li"):
                             if styleli.xpath('./text()'):
                                 styleliText = styleli.xpath(
                                     './text()')[0].encode('utf-8').strip()
                                 styleliList.append(styleliText)
                     elif html.xpath("//div[@id='attributes']"):
                         styleliList = []
                         for styleli in html.xpath("//div[@id='attributes']"
                                                   )[0].xpath(".//ul/li"):
                             if styleli.xpath('./text()'):
                                 styleliText = styleli.xpath(
                                     './text()')[0].encode('utf-8').strip()
                                 styleliList.append(styleliText)
                     shop_item["属性"] = "&&||".join(styleliList)
             except Exception, e:
                 print("----抓取错误----")
             shop_items.append(shop_item)
      self.shopall.insert_or_update(shop_items)
Example #22
    for i in range(totalpage):
        page_num = i + 1
        logandprint("正在抓取第%s页..." % page_num)
        url_params = params.format(page_num=page_num,
                                   page_size=page_size,
                                   start=start,
                                   end=end,
                                   pre_page=1,
                                   query_more='true')
        ok, result = Html_Downloader.Download_Html(url, {
            item.split('=')[0]: item.split('=')[1]
            for item in url_params.split('&')
        }, {
            "cookies":
            cookie_dict,
            "ip":
            agent_ip,
            "Referer":
            "https://trade.taobao.com/trade/itemlist/list_sold_items.htm?action=itemlist/SoldQueryAction&event_submit_do_query=1&auctionStatus=SUCCESS&tabCode=success"
        },
                                                   post=True)
        if ok and "mainOrders" in result.text:
            try:
                order_json = json.loads(
                    result.text.replace("\r", "").replace("\n", ""))
            except Exception, e:
                logandprint("Error:%s,result:%s" % (e.message, result.text))
                continue
            parseSalesJson(order_json, start, end)
        else:
            logandprint(
                "抓取从%s到%s的订单数据第%s页数据失败:%s" %
Example #23
def crawl_sales(self, finish_handler):
    driver = ChromeDriver()
    cookie_dic, cookies = driver.login_an_get(self.account_name,
                                              self.account_pwd)
    sleep(2)
    driver.quite()

    cookie_dict = {
        k.strip(): v
        for k, v in (item.split('=', 1) for item in cookies.split(';'))
    }

    # 时间切分成一个个时间段,切割粒度为小时,
    # 每段时间采之前对比数据库跟服务器的数量差距,
    # 差距在5条以内的不再采这天的数据
    hour_interval = 24  # 24小时检查一次

    # start_date = datetime.datetime.strptime("%s 00:00:00" % "2017-06-01", "%Y-%m-%d 00:00:00")
    # end_date = datetime.datetime.strptime("%s 00:00:00" % "2017-06-03", "%Y-%m-%d 00:00:00")
    # end_date + datetime.timedelta(days=1)

    start = long(time.mktime(self.start_date.timetuple()))
    final_end = long(time.mktime(self.end_date.timetuple()))
    end = start + (hour_interval * 3600)
    agent_ip = None
    while end <= final_end:

        url_params = self.params.format(page_num=1,
                                        page_size=1,
                                        start=start,
                                        end=end,
                                        pre_page=1,
                                        query_more='true')
        page_size = 100
        total = self.get_total_page(url_params, agent_ip, cookie_dict, start,
                                    end)

        sleep(random.randint(1, 2))
        totalpage = 0
        exist_count = self.db.get_order_count_start_end(start, end)
        if total and total > exist_count and (total - exist_count) > 5:
            totalpage = total / page_size
            if total % page_size:
                totalpage += 1
            self.log_and_print(
                "正在抓取从%s到%s的订单数据共%s条,分%s页获取" %
                (datetime.datetime.fromtimestamp(start),
                 datetime.datetime.fromtimestamp(end), total, totalpage))
        elif total:
            self.log_and_print(
                "从%s到%s订单数据已抓取完毕(服务端%s,数据库%s)" %
                (datetime.datetime.fromtimestamp(start),
                 datetime.datetime.fromtimestamp(end), total, exist_count))

        for i in range(totalpage):
            page_num = i + 1
            self.log_and_print("正在抓取第%s页..." % page_num)
            url_params = self.params.format(page_num=page_num,
                                            page_size=page_size,
                                            start=start,
                                            end=end,
                                            pre_page=1,
                                            query_more='true')
            ok, result = Html_Downloader.Download_Html(self.url, {
                item.split('=')[0]: item.split('=')[1]
                for item in url_params.split('&')
            }, {
                "cookies":
                cookie_dict,
                "ip":
                agent_ip,
                "Referer":
                "https://trade.taobao.com/trade/itemlist/list_sold_items.htm?action=itemlist/SoldQueryAction&event_submit_do_query=1&auctionStatus=SUCCESS&tabCode=success"
            },
                                                       post=True)
            if ok and "mainOrders" in result.text:
                try:
                    order_json = json.loads(
                        result.text.replace("\r", "").replace("\n", ""))
                except Exception, e:
                    self.log_and_print("Error:%s,result:%s" %
                                       (e.message, result.text))
                    continue  # order_json was not parsed for this page; skip it
                self.parse_sales_json(order_json, start, end)
            else:
                self.log_and_print(
                    "抓取从%s到%s的订单数据第%s页数据失败:%s" %
                    (datetime.datetime.fromtimestamp(start),
                     datetime.datetime.fromtimestamp(end), page_num, result))
            sec = random.randint(15, 40)
            self.log_and_print("等待%s秒...." % sec)
            sleep(sec)
        self.log_and_print("抓取从%s到%s的订单数据共%s条完成" %
                           (datetime.datetime.fromtimestamp(start),
                            datetime.datetime.fromtimestamp(end), total))
        start = end
        end = start + (hour_interval * 3600)
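
One caveat in the windowing loop above: because it runs while end <= final_end, a trailing window shorter than hour_interval is silently skipped, so the tail of the requested date range is never crawled when the range does not divide evenly into whole windows. Below is a minimal sketch of a window generator that also yields that final partial window; iter_windows is an illustrative name and not part of the original code.

import time
import datetime

def iter_windows(start_date, end_date, hours=24):
    # Yield (start_ts, end_ts) Unix-timestamp pairs covering [start_date, end_date),
    # including a last window shorter than `hours` when the range does not divide evenly.
    start_ts = int(time.mktime(start_date.timetuple()))
    final_ts = int(time.mktime(end_date.timetuple()))
    step = hours * 3600
    while start_ts < final_ts:
        end_ts = min(start_ts + step, final_ts)
        yield start_ts, end_ts
        start_ts = end_ts

# Usage:
# for start, end in iter_windows(datetime.datetime(2017, 6, 1),
#                                datetime.datetime(2017, 6, 3), hours=24):
#     ...crawl one window from start to end...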
Beispiel #24
0
 def crawlNid(self, data, i, agentip, agentipjj):
     items = data['mods']['itemlist']['data']['auctions']
     x = (i - 1) * 44 + 1
     agentip = Utils.GetMyAgent()
     for item in items:
         shop_items = []
         shop_item = {}
         shop_item['keyword'] = self.key_word
         title = item['title']
         isTmall = item['shopcard']['isTmall']
         shop_item['isTmall'] = isTmall
         title = title.replace("<spanclass=H>", "").replace("</span>",
                                                            "").strip()
         shop_item['title'] = title
         nid = item['nid'].strip()
         shop_item['item_id'] = nid
         view_sales = item['view_sales'].strip()
         view_sales = view_sales.replace("人收货,", "").replace("人收货",
                                                             "").strip()
         shop_item['view_sales'] = view_sales
         shop_item['view_price'] = item['view_price'].strip()
         shop_item['picUrl'] = "http:" + item['pic_url'].strip()
         shop_item['idnick'] = item['nick'].strip()
         shop_item['crawl_time'] = long(time.time())
         shop_item['rank'] = x
         print(x)
         if x == 101:
             break
         x += 1
         #if(x<=5):
         # continue
         detail_url = "https://detail.tmall.com/item.htm?spm=a230r.1.14.1.ebb2eb2PXquhm&id={nid}&ns=1&abbucket=20"
         t_detail_url = detail_url.format(nid=nid)
         header = {'ip': agentip}
         shop_id = ""  # initialized here so shop_item['shop_id'] below is defined even if the download fails
         try:
             sleep(2)
             ok, response = Html_Downloader.Download_Html(
                 t_detail_url, {}, header)
             if not ok:
                 count = 0
                 while (count < 4):
                     sleep(2)
                     agentip = Utils.GetMyAgent()
                     header = {'ip': agentip}
                     ok, response = Html_Downloader.Download_Html(
                         t_detail_url, {}, header)
                     if ok:
                         break
                     count += 1
                     if count == 3:
                         header = {}
             if ok:
                 html = etree.HTML(response.text)
                 if not "shopid" in response.text:
                     count = 0
                     while (count < 4):
                         sleep(2)
                         agentip = Utils.GetMyAgent()
                         header = {'ip': agentip}
                         ok, response = Html_Downloader.Download_Html(
                             t_detail_url, {}, header)
                         if ok:
                             html = etree.HTML(response.text)
                             if "shopid" in response.text:
                                 break
                         count += 1
                         if count == 3:
                             header = {}
                 shop_id = ""
                 # user_id=""
                 category_id = re.compile("item%5f(.*?)(?=&)").findall(
                     response.text)[0]
                 shop_item['category_id'] = category_id
                 if html.xpath("//meta[@name='microscope-data']"):
                     for meta in html.xpath(
                             "//meta[@name='microscope-data']")[0].get(
                                 'content').split(';'):
                         if 'shopid' in meta.lower():
                             shop_id = meta.split("=")[1]
                         # if 'userid=' in meta.lower():
                         #   user_id= meta.split("=")[1]
                 if html.xpath("//dl[contains(@class,'tb-prop')]"):
                     for prop in html.xpath(
                             "//dl[contains(@class,'tb-prop')]"):
                         if not prop in html.xpath(
                                 "//dl[contains(@class,'tb-hidden')]"):
                             prop_value_id = []
                             prop_name = prop.xpath(
                                 ".//dt/text()")[0].encode('utf-8')
                             for value in prop.xpath(".//dd/ul/li"):
                                 sub_value_id = []
                                 sku_id = value.get('data-value')
                                 sub_value_id.append(sku_id)
                                 if value.xpath('./a/span/text()'):
                                     sku_name = value.xpath(
                                         './a/span/text()')[0].encode(
                                             'utf-8')
                                     sub_value_id.append(sku_name)
                                 if value.xpath('./a')[0].get(
                                         'style') and re.compile(
                                             "/([^/]*)(?=_!!|_M2)").findall(
                                                 value.xpath('./a')[0].get(
                                                     'style')):
                                     sku_img_url = re.compile(
                                         "/([^/]*)(?=_!!|_M2)").findall(
                                             value.xpath('./a')[0].get(
                                                 'style'))[0]
                                     sub_value_id.append(sku_img_url)
                                 prop_value_id.append(
                                     ";".join(sub_value_id))
                             shop_item[prop_name] = prop_value_id
                 if html.xpath("//ul[@id='J_UlThumb']"):
                     stype_img_id = []
                     if html.xpath("//ul[@id='J_UlThumb']")[0].xpath(
                             ".//li/div"):
                         for value1 in html.xpath("//ul[@id='J_UlThumb']"
                                                  )[0].xpath(".//li/div"):
                             if value1.xpath('./a')[0].xpath(
                                     './img'
                             )[0].get('data-src') and re.compile(
                                     "/([^/]*)(?=_!!|_M2)").findall(
                                         value1.xpath('./a')[0].xpath(
                                             './img')[0].get('data-src')):
                                 sku_img_id = re.compile(
                                     "/([^/]*)(?=_!!|_M2)").findall(
                                         value1.xpath('./a')[0].xpath(
                                             './img')[0].get('data-src'))[0]
                                 stype_img_id.append(sku_img_id)
                     elif html.xpath("//ul[@id='J_UlThumb']")[0].xpath(
                             ".//li"):
                         for value1 in html.xpath(
                                 "//ul[@id='J_UlThumb']")[0].xpath(".//li"):
                             if value1.xpath('./a')[0].xpath(
                                     './img')[0].get('src') and re.compile(
                                         "/([^/]*)(?=_!!|_M2)").findall(
                                             value1.xpath('./a')[0].xpath(
                                                 './img')[0].get('src')):
                                 sku_img_id = re.compile(
                                     "/([^/]*)(?=_!!|_M2)").findall(
                                         value1.xpath('./a')[0].xpath(
                                             './img')[0].get('src'))[0]
                                 stype_img_id.append(sku_img_id)
                     shop_item["attr_img"] = "&&||".join(stype_img_id)
                 if html.xpath("//ul[@id='J_AttrUL']"):
                     styleliList = []
                     # dict={}
                     for styleli in html.xpath(
                             "//ul[@id='J_AttrUL']")[0].xpath(".//li"):
                         if styleli.xpath('./text()'):
                             styleliText = styleli.xpath(
                                 './text()')[0].encode('utf-8').strip()
                             # styleliText=styleliText.replace(":",":")
                             # str1=styleliText.split(":")[0].encode('utf-8').strip()
                             # str2=styleliText.split(":")[1].encode('utf-8').strip().replace("\xc2\xa0"," ").lstrip()
                             # dict[str1] =str2
                             styleliList.append(styleliText)
                 elif html.xpath("//div[@id='attributes']"):
                     styleliList = []
                     # dict={}
                     for styleli in html.xpath("//div[@id='attributes']"
                                               )[0].xpath(".//ul/li"):
                         if styleli.xpath('./text()'):
                             styleliText = styleli.xpath(
                                 './text()')[0].encode('utf-8').strip()
                             # styleliText=styleliText.replace(":",":")
                             # str1=styleliText.split(":")[0].encode('utf-8').strip()
                             # str2=styleliText.split(":")[1].encode('utf-8').strip().replace("\xc2\xa0"," ").lstrip()
                             # dict[str1] =str2
                             styleliList.append(styleliText)
                 shop_item["attribute"] = "&&||".join(styleliList)
                 # shop_item["attribute"]=dict
         except Exception, e:
             logging.info("关键词{p}抓取失败,nid={nid},{m}".format(p=self.key_word,
                                                            nid=nid,
                                                            m=e.message))
         shop_item['crawl_url'] = t_detail_url
         shop_item['shop_id'] = shop_id
         session = Session()
         self.get_total_sales(session, agentip, 1,
                              shop_id)  # the first call is expected to fail; it only primes the session cookies
         total_page = 1
         for i in range(50):
             # stop early once the last page has been reached
             if total_page and i >= total_page:
                 break
             result = self.get_total_sales(session, agentip, (i + 1),
                                           shop_id)
             if not result:
                 result = self.get_total_sales(session, agentip, (i + 1),
                                               shop_id)
             if (result != None):
                 jobj = json.loads(
                     result.replace("mtopjsonp12(",
                                    "").replace("})", "}"))  # strip the JSONP wrapper, then parse the JSON
                 jsonArray = jobj['data']['itemsArray']
                 total_sales = self.parse_total_sales(jsonArray, nid)
                 if total_sales != -1:
                     break
                 if jobj and "SUCCESS" in jobj['ret'][0]:
                     total = int(jobj['data']['totalResults'])
                     total_page = total / 30  # the endpoint returns at most 30 items per page
                     if total % 30:
                         total_page += 1
                 else:
                     print("获取数据失败")
                     break
                 sleep(2)
             else:
                 total_sales = ""
         shop_item['totalSoldQuantity'] = total_sales
         shop_items.append(shop_item)
         post_data = {'data': json.dumps(shop_items)}
         if not self.process_request(SAVE_INSERT_KEYWORD_API, post_data):
             self.process_request(SAVE_INSERT_KEYWORD_API, post_data)
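
A note on the JSONP handling above: the wrapper is stripped by replacing the literal text mtopjsonp12(, so whenever the mtop callback counter is anything other than 12 the json.loads call fails and the whole item is logged as an error. Below is a minimal, hedged sketch of a more general unwrap; strip_jsonp is an illustrative name and not part of the original code.

import json
import re

def strip_jsonp(payload):
    # Accepts plain JSON or a JSONP wrapper such as mtopjsonp12({...});
    # returns the parsed object, or None if the body is not valid JSON.
    match = re.match(r'^\s*[\w$.]+\s*\(\s*(.*)\s*\)\s*;?\s*$', payload, re.S)
    body = match.group(1) if match else payload
    try:
        return json.loads(body)
    except ValueError:
        return None

# Usage with the names from the example above:
# jobj = strip_jsonp(result)
# if jobj and "SUCCESS" in jobj['ret'][0]:
#     jsonArray = jobj['data']['itemsArray']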