Beispiel #1
0
 def crawl_collection(self):
     """Populate ``self.collection`` (plus ``self.browse`` for non-Tmall items).

     Tmall items query the ``count.tbcdn.cn`` counter endpoint and read the
     ``ICCP_1_<num_id>`` value. Other items call the ``counterApi`` URL
     found in ``self.data`` and read both ``ICCP_1_<num_id>`` and
     ``ICVT_7_<num_id>``; nothing is fetched when the page has no
     ``counterApi`` value.

     A Tmall response that is empty or cannot be parsed is logged and
     leaves ``self.collection`` untouched. A non-Tmall response that
     cannot be parsed resets both counters to 0.
     """
     if self.is_tmall:
         counter_key = "ICCP_1_" + self.num_id
         c_url = ("http://count.tbcdn.cn/counter3?callback=jsonp126&keys=" +
                  counter_key)
         collection_data = self.crawl_page(c_url)
         collection = None
         if collection_data:
             try:
                 collection = int(get_num_val(collection_data, counter_key))
             except Exception:
                 # A malformed payload is reported like an empty one instead
                 # of aborting the crawl; KeyboardInterrupt and SystemExit
                 # still propagate.
                 collection = None
         if collection is None:
             logger.warning("Can not parse tmall item collection %s",
                            self.item_id)
         else:
             self.collection = collection
     else:
         counter_api = get_val(self.data, "counterApi")
         if counter_api:
             # The value is stored with "\/" in place of "/".
             counter_api = counter_api.replace(r'\/', "/")
             counter_data = self.crawl_page(
                 counter_api + "&callback=DT.mods.SKU.CountCenter.saveCounts")
             try:
                 self.collection = int(
                     get_num_val(counter_data, 'ICCP_1_' + self.num_id))
                 self.browse = int(
                     get_num_val(counter_data, 'ICVT_7_' + self.num_id))
             except Exception:
                 # Either value failing resets both counters.
                 self.collection = 0
                 self.browse = 0
Beispiel #2
0
    def crawl_volume(self):
        """Populate ``self.volume`` (plus ``self.confirmVolume`` for non-Tmall items).

        Tmall items: fetch the ``initApi`` URL found in ``self.data`` and
        keep the raw response in ``self.tmallInitApi``. When the response
        decodes as JSON it is stored in ``self.tmallInitApijson`` and the
        volume is read from ``defaultModel -> sellCountDO -> sellCount``.
        While ``self.volume`` is still negative, a ``sellCount`` lookup on
        the raw response is tried, and 0 is used if that fails too.

        Other items: fetch the ``apiItemInfo`` URL and read the ``quanity``
        and ``confirmGoods`` values. The volume is 0 when the URL is
        missing or when either value cannot be parsed.
        """
        if self.is_tmall:
            # The value is stored with "\/" in place of "/".
            init_api_url = get_val(self.data, "initApi").replace(r'\/', "/")
            self.tmallInitApi = self.crawl_page(init_api_url)
            try:
                self.tmallInitApijson = loads(
                    self.tmallInitApi.decode('gb18030').encode('utf8'))
            except Exception:
                # Not fatal: self.tmallInitApijson keeps whatever value it
                # had before, and the text fallback below can still apply.
                logger.info("parse tmall api json failed %s : %s",
                            self.item_id, traceback.format_exc())
            if self.tmallInitApijson:
                try:
                    sell_count = self.tmallInitApijson['defaultModel'][
                        'sellCountDO']['sellCount']
                    self.volume = sell_count
                except (KeyError, IndexError, TypeError):
                    logger.warning("try to get volume from api failed %s",
                                   self.item_id)
            if self.volume < 0:
                try:
                    self.volume = int(get_val(self.tmallInitApi, "sellCount"))
                except Exception:
                    self.volume = 0
                    logger.warning("Can not parse tmall item volume %s",
                                   self.item_id)

        else:
            item_info_url = get_val(self.data, "apiItemInfo")
            if item_info_url:
                item_info_data = self.crawl_page(
                    item_info_url.replace(r'\/', "/"))
                try:
                    self.volume = int(get_num_val(item_info_data, 'quanity'))
                    self.confirmVolume = int(
                        get_num_val(item_info_data, 'confirmGoods'))
                except Exception:
                    self.volume = 0
                    logger.warning("Can not parse taobao item volume %s",
                                   self.item_id)
            else:
                self.volume = 0
Beispiel #3
0
 def crawl_collection(self):
     """Populate ``self.collection`` (plus ``self.browse`` for non-Tmall items).

     Tmall items query the ``count.tbcdn.cn`` counter endpoint and read the
     ``ICCP_1_<num_id>`` value. Other items call the ``counterApi`` URL
     found in ``self.data`` and read both ``ICCP_1_<num_id>`` and
     ``ICVT_7_<num_id>``; nothing is fetched when the page has no
     ``counterApi`` value.

     A Tmall response that is empty or cannot be parsed is logged and
     leaves ``self.collection`` untouched. A non-Tmall response that
     cannot be parsed resets both counters to 0.
     """
     if self.is_tmall:
         counter_key = "ICCP_1_" + self.num_id
         c_url = "http://count.tbcdn.cn/counter3?callback=jsonp126&keys=" + counter_key
         collection_data = self.crawl_page(c_url)
         collection = None
         if collection_data:
             try:
                 collection = int(get_num_val(collection_data, counter_key))
             except Exception:
                 # A malformed payload is reported like an empty one instead
                 # of aborting the crawl; KeyboardInterrupt and SystemExit
                 # still propagate.
                 collection = None
         if collection is None:
             logger.warning("Can not parse tmall item collection %s", self.item_id)
         else:
             self.collection = collection
     else:
         counter_api = get_val(self.data, "counterApi")
         if counter_api:
             # The value is stored with "\/" in place of "/".
             counter_api = counter_api.replace(r'\/', "/")
             counter_data = self.crawl_page(counter_api + "&callback=DT.mods.SKU.CountCenter.saveCounts")
             try:
                 self.collection = int(get_num_val(counter_data, 'ICCP_1_' + self.num_id))
                 self.browse = int(get_num_val(counter_data, 'ICVT_7_' + self.num_id))
             except Exception:
                 # Either value failing resets both counters.
                 self.collection = 0
                 self.browse = 0
Beispiel #4
0
    def crawl_volume(self):
        """Populate ``self.volume`` (plus ``self.confirmVolume`` for non-Tmall items).

        Tmall items: fetch the ``initApi`` URL found in ``self.data`` and
        keep the raw response in ``self.tmallInitApi``. When the response
        decodes as JSON it is stored in ``self.tmallInitApijson`` and the
        volume is read from ``defaultModel -> sellCountDO -> sellCount``.
        While ``self.volume`` is still negative, a ``sellCount`` lookup on
        the raw response is tried, and 0 is used if that fails too.

        Other items: fetch the ``apiItemInfo`` URL and read the ``quanity``
        and ``confirmGoods`` values. The volume is 0 when the URL is
        missing or when either value cannot be parsed.
        """
        if self.is_tmall:
            # The value is stored with "\/" in place of "/".
            init_api_url = get_val(self.data, "initApi").replace(r'\/', "/")
            self.tmallInitApi = self.crawl_page(init_api_url)
            try:
                self.tmallInitApijson = loads(
                    self.tmallInitApi.decode('gb18030').encode('utf8'))
            except Exception:
                # Not fatal: self.tmallInitApijson keeps whatever value it
                # had before, and the text fallback below can still apply.
                logger.info("parse tmall api json failed %s : %s",
                            self.item_id, traceback.format_exc())
            if self.tmallInitApijson:
                try:
                    sell_count = self.tmallInitApijson['defaultModel'][
                        'sellCountDO']['sellCount']
                    self.volume = sell_count
                except (KeyError, IndexError, TypeError):
                    logger.warning("try to get volume from api failed %s",
                                   self.item_id)
            if self.volume < 0:
                try:
                    self.volume = int(get_val(self.tmallInitApi, "sellCount"))
                except Exception:
                    self.volume = 0
                    logger.warning("Can not parse tmall item volume %s",
                                   self.item_id)

        else:
            item_info_url = get_val(self.data, "apiItemInfo")
            if item_info_url:
                item_info_data = self.crawl_page(
                    item_info_url.replace(r'\/', "/"))
                try:
                    self.volume = int(get_num_val(item_info_data, 'quanity'))
                    self.confirmVolume = int(
                        get_num_val(item_info_data, 'confirmGoods'))
                except Exception:
                    self.volume = 0
                    logger.warning("Can not parse taobao item volume %s",
                                   self.item_id)
            else:
                self.volume = 0
Beispiel #5
0
 def crawl_stock(self):
     """Populate ``self.stock`` from data already stored on the instance.

     Tmall items read ``icTotalQuantity`` from ``self.tmallInitApi``; other
     items read ``stock`` from ``self.dynamicStockData``. Any failure is
     logged and leaves ``self.stock`` unchanged.
     """
     if self.is_tmall:
         try:
             self.stock = int(get_num_val(self.tmallInitApi, "icTotalQuantity"))
         except Exception:
             logger.warning("Can not parse tmall item stock %s", self.item_id)
         return

     try:
         if self.dynamicStockData:
             self.stock = int(get_val(self.dynamicStockData, "stock").strip())
         else:
             logger.warning("Can not parse taobao item stock %s", self.item_id)
     except Exception:
         # This is the taobao branch; the message previously said "tmall".
         logger.error("Can not parse taobao item stock %s", self.item_id)
Beispiel #6
0
 def crawl_stock(self):
     """Populate ``self.stock`` from data already stored on the instance.

     Tmall items read ``icTotalQuantity`` from ``self.tmallInitApi``; other
     items read ``stock`` from ``self.dynamicStockData``. Any failure is
     logged and leaves ``self.stock`` unchanged.
     """
     if self.is_tmall:
         try:
             self.stock = int(
                 get_num_val(self.tmallInitApi, "icTotalQuantity"))
         except Exception:
             logger.warning("Can not parse tmall item stock %s", self.item_id)
         return

     try:
         if self.dynamicStockData:
             self.stock = int(
                 get_val(self.dynamicStockData, "stock").strip())
         else:
             logger.warning("Can not parse taobao item stock %s",
                            self.item_id)
     except Exception:
         # This is the taobao branch; the message previously said "tmall".
         logger.error("Can not parse taobao item stock %s", self.item_id)
Beispiel #7
0
    def crawl_price(self):
        """Determine the item's current price and store it in ``self.price``.

        Also refreshes ``self.bidPrice``, ``self.originPrice`` and
        ``self.promoteUrl2`` from the page. Sources are tried in this order
        until one yields a truthy price:

        1. Tmall only: ``priceInfo`` inside ``self.tmallInitApijson``.
        2. The promotion endpoint: the ``apiPromoData`` URL from the page,
           or the ``marketing.taobao.com`` promotion list when the page has
           none.
        3. The ``tb-price`` ``<strong>`` / ``<span>`` text of the page.
        4. A ``price:"N.NN"`` token in ``self.dynamicStockData``.
        5. The same token in the ``umpStock.htm`` response.
        6. The page's original price, then its bid price.

        For a value of the form ``a-b`` only the part before the dash is
        kept.

        Raises:
            ValueError: from ``float()`` when every source came up empty.
        """
        self.bidPrice = self.html_obj.xpath(
            "//input[@name='current_price']/@value")
        self.originPrice = self.html_obj.xpath(
            "//strong[@id='J_StrPrice']/em[@class='tb-rmb-num']/text()")
        if not self.originPrice:
            self.originPrice = self.html_obj.xpath(
                "//strong[@class='J_originalPrice']/text()")

        self.promoteUrl2 = get_val(self.data, "apiPromoData")
        if self.promoteUrl2:
            # The value is stored with "\/" in place of "/".
            self.promoteUrl2 = self.promoteUrl2.replace(r'\/', "/")

        price = ""
        if self.is_tmall and self.tmallInitApi and self.tmallInitApijson:
            try:
                price_info = self.tmallInitApijson['defaultModel'][
                    'itemPriceResultDO']['priceInfo']
                if price_info:
                    if 'def' in price_info:
                        default_info = price_info['def']
                    else:
                        # No 'def' entry: use the first entry instead.
                        default_info = price_info[next(iter(price_info))]
                    promotions = default_info.get('promotionList')
                    # 2013-11-22: take the real promotional price rather
                    # than the price after commission has been deducted.
                    if promotions:
                        price = promotions[0]['price']
                    if not price and 'price' in default_info:
                        price = default_info['price']
                    if not price:
                        if 'promPrice' in default_info:
                            price = default_info['promPrice']['price']
                        elif promotions:
                            price = str(
                                min(
                                    float(x.get('price', '100000000.0'))
                                    for x in promotions))
            except Exception:
                logger.warning("Parse tmall json price failed, %s",
                               self.item_id)

        if not price:
            if self.promoteUrl2:
                self.promoteContent = self.crawl_page(self.promoteUrl2)
                # Guard against an empty response, as the other branch does.
                if self.promoteContent:
                    self.promoteContent = self.promoteContent.replace(
                        '&quot;', '"')
                    tag = "low:"
                    tag_at = self.promoteContent.find(tag)
                    if tag_at > 0:
                        start = tag_at + len(tag)
                        end = self.promoteContent.find(',', start)
                        price = self.promoteContent[start:end]
                    if not price:
                        # Normalise a falsy result to "" so the string
                        # handling below stays safe.
                        price = get_num_val(self.promoteContent,
                                            'price') or ""
            else:
                self.promoteUrl = "http://marketing.taobao.com/home/promotion/item_promotion_list.do?itemId=%s" % self.num_id
                self.promoteContent = self.crawl_page(self.promoteUrl)
                if self.promoteContent:
                    self.promoteContent = self.promoteContent.replace(
                        '"', '&quot;')
                    tag = "promPrice&quot;:&quot;"
                    tag_at = self.promoteContent.find(tag)
                    if tag_at > 0:
                        start = tag_at + len(tag)
                        end = self.promoteContent.find('&quot;', start)
                        price = self.promoteContent[start:end]

        if not price:
            tb_price = self.html_obj.xpath(
                "//strong[@class='tb-price']/text()")
            tb_price_alt = self.html_obj.xpath(
                "//span[@class='tb-price']/text()")
            # Use the first non-blank text. The check used to be inverted
            # and could only ever select an empty string.
            if tb_price and tb_price[0].strip():
                price = tb_price[0].strip()
            elif tb_price_alt and tb_price_alt[0].strip():
                price = tb_price_alt[0].strip()

        if price.find("-") > 0:
            price = price.split('-')[0].strip()

        price_token_re = re.compile('price:\"[0-9]+[.][0-9]+\"',
                                    re.IGNORECASE | re.DOTALL)

        # Skip this source when no dynamic stock data is available.
        if not price and self.dynamicStockData:
            found = price_token_re.search(self.dynamicStockData)
            if found:
                price_str = found.group(0).split(":")[1].replace("\"", "")
                price = Decimal(price_str)

        # 2013-09-03: ask the price URL.
        if not price:
            # Slightly fiddly: the response is searched as a plain string.
            price_url = "http://ajax.tbcdn.cn/json/umpStock.htm?itemId=%s&p=1" % self.num_id
            response = download(price_url, self.headers)
            if response:
                found = price_token_re.search(
                    response.decode('gb18030').encode('utf8'))
                if found:
                    price_str = found.group(0).split(":")[1].replace("\"", "")
                    price = Decimal(price_str)

        # No promotional price found: fall back to the original / bid price.
        if not price:
            fallback = self.originPrice or self.bidPrice
            if fallback:
                price = fallback[0].strip()
                if price.find("-") > 0:
                    price = price.split('-')[0].strip()

        self.price = float(price)
        logger.debug("%s price is %s", self.item_id, self.price)
Beispiel #8
0
    def crawl(self):
        """Download the item page and extract its basic attributes.

        Sets ``self.data``, ``self.html_obj``, ``self.detailDiv``,
        ``self.buyButton``, ``self.originPrice``, ``self.bidPrice``,
        ``self.thumbImages``, ``self.cid`` and ``self.volume``. For Tmall
        items it also sets ``self.tmallInitApi`` and, when the response
        parses, ``self.tmallInitApijson``. ``self.is_offline`` is set when
        ``check_offline()`` reports it or the page title carries the resale
        marker. ``self.is_tmall`` is switched on when the page contains a
        ``mallLogo`` element.

        Raises:
            Exception: any error is logged with the ``crawlItemException``
                tag and re-raised unchanged.
        """
        try:
            self.data = self.crawl_page(self.url)
            if FLAGS.debug_parser:
                import pdb
                pdb.set_trace()

            # check tmall: a short taobao page that only redirects to
            # detail.tmall.com is fetched again from the tmall host.
            if (not self.is_tmall and len(self.data) < 256
                    and self.url.find('item.taobao.com') > 0
                    and self.data.find(
                        "window.location.href='http://detail.tmall.com/item.htm'+window.location.search"
                    ) > 0):
                self.data = self.crawl_page(
                    self.url.replace('item.taobao.com', 'detail.tmall.com'))

            if self.check_offline():
                self.is_offline = True

            self.html_obj = parse_html(self.data, encoding="gb18030")

            # A title containing the resale marker also counts as offline.
            title = self.html_obj.xpath("//html/head/title/text()")
            if title and title[0].find(u"转卖") > 0:
                self.is_offline = True

            self.detailDiv = self.html_obj.xpath("//div[@id='detail']")
            self.buyButton = self.html_obj.xpath("//a[@id='J_LinkBuy']")
            self.originPrice = self.html_obj.xpath(
                "//strong[@id='J_StrPrice']/em[@class='tb-rmb-num']/text()")
            if not self.originPrice:
                self.originPrice = self.html_obj.xpath(
                    "//strong[@class='J_originalPrice']/text()")
            self.bidPrice = self.html_obj.xpath(
                "//input[@name='current_price']/@value")
            self.thumbImages = self.html_obj.xpath(
                "//ul[@id='J_UlThumb']//img/@src")
            if not self.thumbImages:
                try:
                    # try load thumb images for tmall page
                    self.thumbImages = [
                        IMAGESTYLE_RE.sub(r'\g<1>', style) for style in
                        self.html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")
                    ]

                    # taobao @src to @data-src
                    if not self.thumbImages:
                        self.thumbImages = self.html_obj.xpath(
                            "//ul[@id='J_UlThumb']//img/@data-src")
                except Exception:
                    logger.warning("No thumbs found %s", self.item_id)

            if not self.is_tmall and self.html_obj.xpath(
                    "//*[@id='mallLogo']"):
                self.is_tmall = True

            if self.is_tmall:
                self.cid = get_val(self.data, "categoryId").split('&')[0]

                # The value is stored with "\/" in place of "/".
                init_api_url = get_val(self.data,
                                       "initApi").replace(r'\/', "/")
                self.tmallInitApi = self.crawl_page(init_api_url)
                try:
                    self.tmallInitApijson = loads(
                        self.tmallInitApi.decode('gb18030').encode('utf8'))
                except Exception:
                    logger.info("parse tmall api json failed %s : %s",
                                self.item_id, traceback.format_exc())
                if self.tmallInitApijson:
                    try:
                        self.volume = self.tmallInitApijson['defaultModel'][
                            'sellCountDO']['sellCount']
                    except (KeyError, IndexError, TypeError):
                        logger.warning("try to get volume from api failed %s",
                                       self.item_id)
                if self.volume < 0:
                    try:
                        self.volume = int(
                            get_val(self.tmallInitApi, "sellCount"))
                    except Exception:
                        logger.warning("Can not parse item volume %s",
                                       self.item_id)

                # Stock: icTotalQuantity.
                # NOTE: review-count crawling via "apiMallReviews" used to
                # sit here as disabled code.
            else:
                self.cid = get_val(self.data, "cid")

                item_info_url = get_val(self.data, "apiItemInfo")
                if item_info_url:
                    item_info_data = self.crawl_page(
                        item_info_url.replace(r'\/', "/"))
                    try:
                        self.volume = int(
                            get_num_val(item_info_data, 'quanity'))
                    except Exception:
                        self.volume = -1
                else:
                    self.volume = -1

                # Stock: see 'valItemInfo' in self.data.
                # NOTE: review-count crawling via "data-commonApi" used to
                # sit here as disabled code.
        except Exception:
            logger.error("crawling %s unknown exception %s",
                         self.item_id,
                         traceback.format_exc(),
                         extra={'tags': [
                             'crawlItemException',
                         ]})
            raise
Beispiel #9
0
    def crawl(self):
        """Download the item page and extract its basic attributes.

        Sets ``self.data``, ``self.html_obj``, ``self.detailDiv``,
        ``self.buyButton``, ``self.originPrice``, ``self.bidPrice``,
        ``self.thumbImages``, ``self.cid`` and ``self.volume``. For Tmall
        items it also sets ``self.tmallInitApi`` and, when the response
        parses, ``self.tmallInitApijson``. ``self.is_offline`` is set when
        ``check_offline()`` reports it or the page title carries the resale
        marker. ``self.is_tmall`` is switched on when the page contains a
        ``mallLogo`` element.

        Raises:
            Exception: any error is logged with the ``crawlItemException``
                tag and re-raised unchanged.
        """
        try:
            self.data = self.crawl_page(self.url)
            if FLAGS.debug_parser:
                import pdb
                pdb.set_trace()

            # check tmall: a short taobao page that only redirects to
            # detail.tmall.com is fetched again from the tmall host.
            if (not self.is_tmall and len(self.data) < 256
                    and self.url.find('item.taobao.com') > 0
                    and self.data.find("window.location.href='http://detail.tmall.com/item.htm'+window.location.search") > 0):
                self.data = self.crawl_page(self.url.replace('item.taobao.com', 'detail.tmall.com'))

            if self.check_offline():
                self.is_offline = True

            self.html_obj = parse_html(self.data, encoding="gb18030")

            # A title containing the resale marker also counts as offline.
            title = self.html_obj.xpath("//html/head/title/text()")
            if title and title[0].find(u"转卖") > 0:
                self.is_offline = True

            self.detailDiv = self.html_obj.xpath("//div[@id='detail']")
            self.buyButton = self.html_obj.xpath("//a[@id='J_LinkBuy']")
            self.originPrice = self.html_obj.xpath("//strong[@id='J_StrPrice']/em[@class='tb-rmb-num']/text()")
            if not self.originPrice:
                self.originPrice = self.html_obj.xpath("//strong[@class='J_originalPrice']/text()")
            self.bidPrice = self.html_obj.xpath("//input[@name='current_price']/@value")
            self.thumbImages = self.html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
            if not self.thumbImages:
                try:
                    # try load thumb images for tmall page
                    thumb_styles = self.html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")
                    self.thumbImages = [IMAGESTYLE_RE.sub(r'\g<1>', style) for style in thumb_styles]

                    # taobao @src to @data-src
                    if not self.thumbImages:
                        self.thumbImages = self.html_obj.xpath("//ul[@id='J_UlThumb']//img/@data-src")
                except Exception:
                    logger.warning("No thumbs found %s", self.item_id)

            if not self.is_tmall and self.html_obj.xpath("//*[@id='mallLogo']"):
                self.is_tmall = True

            if self.is_tmall:
                self.cid = get_val(self.data, "categoryId").split('&')[0]

                # The value is stored with "\/" in place of "/".
                init_api_url = get_val(self.data, "initApi").replace(r'\/', "/")
                self.tmallInitApi = self.crawl_page(init_api_url)
                try:
                    self.tmallInitApijson = loads(self.tmallInitApi.decode('gb18030').encode('utf8'))
                except Exception:
                    logger.info("parse tmall api json failed %s : %s", self.item_id, traceback.format_exc())
                if self.tmallInitApijson:
                    try:
                        self.volume = self.tmallInitApijson['defaultModel']['sellCountDO']['sellCount']
                    except (KeyError, IndexError, TypeError):
                        logger.warning("try to get volume from api failed %s", self.item_id)
                if self.volume < 0:
                    try:
                        self.volume = int(get_val(self.tmallInitApi, "sellCount"))
                    except Exception:
                        logger.warning("Can not parse item volume %s", self.item_id)

                # Stock: icTotalQuantity.
                # NOTE: review-count crawling via "apiMallReviews" used to
                # sit here as disabled code.
            else:
                self.cid = get_val(self.data, "cid")

                item_info_url = get_val(self.data, "apiItemInfo")
                if item_info_url:
                    item_info_data = self.crawl_page(item_info_url.replace(r'\/', "/"))
                    try:
                        self.volume = int(get_num_val(item_info_data, 'quanity'))
                    except Exception:
                        self.volume = -1
                else:
                    self.volume = -1

                # Stock: see 'valItemInfo' in self.data.
                # NOTE: review-count crawling via "data-commonApi" used to
                # sit here as disabled code.
        except Exception:
            logger.error("crawling %s unknown exception %s", self.item_id, traceback.format_exc(), extra={'tags':['crawlItemException',]})
            raise
Beispiel #10
0
    def crawl_price(self):
        """Determine the item's current price and store it in ``self.price``.

        Refreshes ``self.promoteUrl2`` from the page, then tries these
        sources in order until one yields a truthy price:

        1. Tmall only: ``priceInfo`` inside ``self.tmallInitApijson``
           (``promPrice``, else the cheapest ``promotionList`` entry, else
           ``price``).
        2. The promotion endpoint: the ``apiPromoData`` URL from the page,
           or the ``marketing.taobao.com`` promotion list when the page has
           none.
        3. The ``tb-price`` ``<strong>`` / ``<span>`` text of the page.
        4. A ``price:"N.NN"`` token in the ``umpStock.htm`` response.
        5. ``self.originPrice``, then ``self.bidPrice``.

        For a value of the form ``a-b`` only the part before the dash is
        kept.

        Raises:
            ValueError: from ``float()`` when every source came up empty.
        """
        self.promoteUrl2 = get_val(self.data, "apiPromoData")
        if self.promoteUrl2:
            # The value is stored with "\/" in place of "/".
            self.promoteUrl2 = self.promoteUrl2.replace(r'\/', "/")

        price = ""
        if self.is_tmall and self.tmallInitApi and self.tmallInitApijson:
            try:
                price_info = self.tmallInitApijson['defaultModel']['itemPriceResultDO']['priceInfo']
                if price_info:
                    if 'def' in price_info:
                        default_info = price_info['def']
                    else:
                        # No 'def' entry: use the first entry instead.
                        default_info = price_info[next(iter(price_info))]

                    if 'promPrice' in default_info:
                        price = default_info['promPrice']['price']
                    elif default_info.get('promotionList'):
                        price = str(min(float(x.get('price', '100000000.0')) for x in default_info['promotionList']))
                    else:
                        price = default_info['price']
            except Exception:
                logger.warning("Parse tmall json price failed, %s", self.item_id)

        if not price:
            if self.promoteUrl2:
                self.promoteContent = self.crawl_page(self.promoteUrl2)
                # An empty response is skipped rather than dereferenced.
                if self.promoteContent:
                    self.promoteContent = self.promoteContent.replace('&quot;', '"')
                    tag = "low:"
                    tag_at = self.promoteContent.find(tag)
                    if tag_at > 0:
                        start = tag_at + len(tag)
                        end = self.promoteContent.find(',', start)
                        price = self.promoteContent[start:end]
                    if not price:
                        # Normalise a falsy result to "" so the string
                        # handling below stays safe.
                        price = get_num_val(self.promoteContent, 'price') or ""
            else:
                self.promoteUrl = "http://marketing.taobao.com/home/promotion/item_promotion_list.do?itemId=%s" % self.num_id
                self.promoteContent = self.crawl_page(self.promoteUrl)
                if self.promoteContent:
                    self.promoteContent = self.promoteContent.replace('"', '&quot;')
                    tag = "promPrice&quot;:&quot;"
                    tag_at = self.promoteContent.find(tag)
                    if tag_at > 0:
                        start = tag_at + len(tag)
                        end = self.promoteContent.find('&quot;', start)
                        price = self.promoteContent[start:end]

        if not price:
            tb_price = self.html_obj.xpath("//strong[@class='tb-price']/text()")
            tb_price_alt = self.html_obj.xpath("//span[@class='tb-price']/text()")
            # Use the first non-blank text. The check used to be inverted
            # and could only ever select an empty string.
            if tb_price and tb_price[0].strip():
                price = tb_price[0].strip()
            elif tb_price_alt and tb_price_alt[0].strip():
                price = tb_price_alt[0].strip()

        if price.find("-") > 0:
            price = price.split('-')[0].strip()

        # 2013-09-03: ask the price URL.
        if not price:
            # Slightly fiddly: the response is searched as a plain string.
            price_url = "http://ajax.tbcdn.cn/json/umpStock.htm?itemId=%s&p=1" % self.num_id
            response = download(price_url, self.headers)
            if response:
                rg = re.compile('price:\"[0-9]+[.][0-9]+\"', re.IGNORECASE | re.DOTALL)
                found = rg.search(response.decode('gb18030').encode('utf8'))
                if found:
                    price_str = found.group(0).split(":")[1].replace("\"", "")
                    price = Decimal(price_str)

        # No promotional price found: fall back to the original / bid price.
        if not price:
            fallback = self.originPrice or self.bidPrice
            if fallback:
                price = fallback[0].strip()
                if price.find("-") > 0:
                    price = price.split('-')[0].strip()

        self.price = float(price)
        logger.debug("%s price is %s", self.item_id, self.price)