Example #1
0
 def crawl_collection(self):
     if self.is_tmall:
         c_url = "http://count.tbcdn.cn/counter3?callback=jsonp126&keys=ICCP_1_" + self.num_id
         collectionData = self.crawl_page(c_url)
         if collectionData:
             self.collection = int(
                 get_num_val(collectionData, "ICCP_1_" + self.num_id))
         else:
             logger.warn("Can not parse tmall item collection %s",
                         self.item_id)
     else:
         counterApi = get_val(self.data, "counterApi")
         if counterApi:
             counterApi = get_val(self.data,
                                  "counterApi").replace(r'''\/''', "/")
             counterData = self.crawl_page(
                 counterApi +
                 "&callback=DT.mods.SKU.CountCenter.saveCounts")
             try:
                 self.collection = int(
                     get_num_val(counterData, 'ICCP_1_' + self.num_id))
                 self.browse = int(
                     get_num_val(counterData, 'ICVT_7_' + self.num_id))
             except:
                 self.collection = 0
                 self.browse = 0
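Most snippets in this section lean on two helpers, get_val and get_num_val, whose definitions are not shown. A minimal sketch consistent with the call sites above; the regexes are an assumption, not the original implementation:

import re

def get_val(data, key):
    # Assumed behavior: return the raw string value that follows the key
    # in page text, e.g. "counterApi":"http:\/\/count.taobao.com\/..."
    # (callers un-escape the \/ sequences themselves).
    m = re.search(re.escape(key) + r'"?\s*[:=]\s*"([^"]*)"', data)
    return m.group(1) if m else None

def get_num_val(data, key):
    # Assumed behavior: return the digits that follow the key in a JSONP
    # counter payload, e.g. ICCP_1_<num_id>:1234.
    m = re.search(re.escape(key) + r'"?\s*:\s*"?(\d+)', data)
    return m.group(1) if m else None

Callers wrap the result in int() and rely on the surrounding try/except: a missing key makes get_num_val return None, and int(None) raises.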
Example #2
0
    def crawl_volume(self):
        if self.is_tmall:
            apiItemInfoUrl = get_val(self.data, "initApi").replace(r'''\/''', "/")
            self.tmallInitApi = self.crawl_page(apiItemInfoUrl)
            try:
                self.tmallInitApijson = loads(self.tmallInitApi.decode('gb18030').encode('utf8'))
            except:
                logger.info("parse tmall api json failed %s : %s", self.item_id, traceback.format_exc())
            if self.tmallInitApijson:
                try:
                    self.volume = self.tmallInitApijson['defaultModel']['sellCountDO']['sellCount']
                except:
                    logger.warn("try to get volume from api failed %s", self.item_id)
            if self.volume < 0:
                try:
                    self.volume = int(get_val(self.tmallInitApi, "sellCount"))
                except:
                    self.volume = 0
                    logger.warn("Can not parse tmall item volume %s", self.item_id)

        else:
            apiItemInfoVal = get_val(self.data, "apiItemInfo")
            if apiItemInfoVal:
                apiItemInfoUrl = get_val(self.data, "apiItemInfo").replace(r'''\/''', "/")
                itemInfoData = self.crawl_page(apiItemInfoUrl)
                try:
                    self.volume = int(get_num_val(itemInfoData, 'quanity'))
                    self.confirmVolume = int(get_num_val(itemInfoData, 'confirmGoods'))
                except:
                    self.volume = 0
                    logger.warn("Can not parse taobao item volume %s", self.item_id)
            else:
                self.volume = 0
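The tmall branch re-encodes the gb18030 page bytes as UTF-8 before handing them to loads; under Python 2, which this codebase targets, json.loads accepts UTF-8 byte strings. The same step in isolation, with a stand-in payload shaped like the initApi response:

from json import loads

raw = '{"defaultModel": {"sellCountDO": {"sellCount": 42}}}'  # stand-in payload
api_json = loads(raw.decode('gb18030').encode('utf8'))
print(api_json['defaultModel']['sellCountDO']['sellCount'])  # 42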
Example #3
0
    def crawl_title(self):
        try:
            self.data = self.crawl_page(self.url)
            if not self.data:
                logger.warn("download %s %s page failed, possible network connection failure", self.item_id, self.num_id)
                return

            # check tmall
            if not self.is_tmall and len(self.data) < 256 and self.url.find('item.taobao.com') > 0 and self.data.find("window.location.href='http://detail.tmall.com/item.htm'+window.location.search") > 0:
                self.data = self.crawl_page(self.url.replace('item.taobao.com', 'detail.tmall.com'))

            if self.check_offline():
                self.is_offline = True

            self.html_obj = parse_html(self.data, encoding="gb18030")

            title = self.html_obj.xpath("//html/head/title/text()")
            if title and title[0].find(u"转卖") > 0:
                self.is_offline = True
            if title:
                self.title = title[0].encode('utf8').replace("-淘宝网", "").replace("-tmall.com天猫", "")

            #tblogo = self.html_obj.xpath("//*[@id='shop-logo']")
            tmalllogo = self.html_obj.xpath("//*[@id='mallLogo']")
            if not tmalllogo:
                tmalllogo = self.html_obj.xpath("//*[@id='simple-logo']")
            if not self.is_tmall and tmalllogo:
                self.is_tmall = True

            self.thumbImages = self.html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
            if not len(self.thumbImages):
                try:
                    # try load thumb images for tmall page
                    self.thumbImages = [IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in self.html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")]

                    # taobao @src to @data-src
                    if not len(self.thumbImages):
                        self.thumbImages = self.html_obj.xpath("//ul[@id='J_UlThumb']//img/@data-src")
                except:
                    logger.warn("No thumbs found %s", self.item_id)
            if self.is_tmall:
                self.cid = get_val(self.data, "categoryId").split('&')[0]
            else:
                self.cid = get_val(self.data, "cid")

            logger.info("Got %s %s html success", self.item_id, self.num_id)
        except:
            logger.error("crawling %s %s unknown exception %s", self.item_id, self.num_id, traceback.format_exc(), extra={'tags':['crawlItemException',]})
            raise
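crawl_title recovers tmall thumbnails from li/@style attributes through IMAGESTYLE_RE, whose definition is not shown. A plausible pattern, assuming styles of the form background:url(...), would be:

import re

# Assumed pattern: replace the whole style string with the URL captured
# inside url(...).
IMAGESTYLE_RE = re.compile(r'.*url\(([^)]*)\).*')

style = "background:url(http://img.taobaocdn.com/bao/uploaded/x_60x60.jpg) center no-repeat;"  # stand-in
print(IMAGESTYLE_RE.subn(r'\g<1>', style)[0])
# -> http://img.taobaocdn.com/bao/uploaded/x_60x60.jpg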
Example #4
0
 def crawl_desc(self):
     try:
         self.descUrl = get_val(self.data, "apiItemDesc").replace(r'''\/''', "/")
     except:
         try:
             self.descUrl = get_val(self.data, "ItemDesc").replace(r'''\/''', "/")
         except:
             if not self.data.find(u"暂无描述".encode('gb18030')) > 0:
                 self.descUrl = DESCURL_RE.search(self.data).group(0)
     # find http://dsc.taobaocdn.com/[^.]+\.desc.* as descUrl
     if self.descUrl:
         self.descContent = self.crawl_page(self.descUrl)
         logger.debug("Got %s desc details %s", self.item_id, len(self.descContent))
     else:
         logger.warn("%s desc url not found", self.item_id)
Example #5
0
    def crawl_taobao_rate(self, from_rate_id):
        rateListUrlBase = get_val(self.data, "data-listApi")
        if rateListUrlBase:
            rateListUrlBase = rateListUrlBase.replace(r'''\/''', "/")

        maxPage = 1
        page = 1
        total = 0
        while page <= maxPage:
            if page >= FLAGS.mostPage:
                break
            page1Result = self.crawl_taobao_rate_page(rateListUrlBase, page)
            if not page1Result:
                return

            results = page1Result['comments']
            maxPage = page1Result['maxPage']
            if not results:
                break
            self.taobao_comments_to_pb(results)
            page += 1
            total += len(results)
            if from_rate_id:
                if self.cut_comments(from_rate_id):
                    break
            if self.max_comments > 0 and self.max_comments < total:
                break
        logger.debug("Got %s %s comments", self.item_id, len(self.comments))
Example #6
0
    def crawl_taobao_rate(self, from_rate_id):
        rateListUrlBase = get_val(self.data, "data-listApi")
        if rateListUrlBase:
            rateListUrlBase = rateListUrlBase.replace(r'''\/''', "/")

        maxPage = 1
        page = 1
        total = 0
        while page <= maxPage:
            if page >= FLAGS.mostPage:
                break
            page1Result = self.crawl_taobao_rate_page(rateListUrlBase, page)
            if not page1Result:
                return

            results = page1Result['comments']
            maxPage = page1Result['maxPage']
            if not results:
                break
            self.taobao_comments_to_pb(results, page)
            page += 1
            total += len(results)
            if from_rate_id:
                if self.cut_comments(from_rate_id):
                    break
            if self.max_comments > 0 and self.max_comments < total:
                break
        logger.debug("Got %s %s comments", self.item_id, len(self.comments))
Example #7
0
def crawler(sql):
    db = get_db_engine()
    items = list(db.execute(sql))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    for item in items:
        shop_id = item[0]
        shop_type = item[1]
        item_id = item[2]
        url = item[3]

        try:
            htm = get_item_htm(item_id, url, db)
            if shop_type == 1:
                htm_obj = parse_html(htm, encoding='gb18030')
                discount_url = htm_obj.xpath("//div[@id='promote']/@data-default")
                if discount_url and len(discount_url) > 0:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(discount_url[0], item_headers)
                    if disc_content.strip():
                        disc_obj = parse_html(disc_content, encoding='gb18030')
                        content = disc_obj.xpath("//div[@id='J_MjsData']/h3/text()")[0].strip()
                        dates = disc_obj.xpath("//div[@id='J_MjsData']/h3/span[@class='tb-indate']/text()")[0].strip()
                        st = dates.encode('utf-8').replace("--","—").split("—")
                        start_time = datetime.datetime.strptime(st[0].strip().replace('年','-').replace("月","-").replace("日",""),'%Y-%m-%d')
                        end_time = datetime.datetime.strptime(st[1].strip().replace('年','-').replace("月","-").replace("日",""),'%Y-%m-%d')

                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) values (%s,%s,%s,%s,%s,now(),now())",
                                   shop_id, content.encode('utf-8'), start_time, end_time, discount_url[0])
                        logger.info("taobao shop %s get discount success", shop_id)
                    else:
                        logger.warning("taobao shop %s:%s not discount.", shop_id, url)
                else:
                    logger.warning("taobao shop %s:%s not discount.", shop_id, url)
            elif shop_type == 2:
                d_url = get_val(htm, "initApi")
                if d_url:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(d_url, item_headers)
                    cjson = loads(disc_content.decode('gb18030').encode('utf8'))
                    shop_prom = cjson['defaultModel']['itemPriceResultDO']['tmallShopProm']
                    if shop_prom:
                        st = int(shop_prom['startTime'])/1000
                        et = int(shop_prom['endTime'])/1000
                        start_time = time.strftime("%Y-%m-%d", time.localtime(st))
                        end_time = time.strftime("%Y-%m-%d", time.localtime(et))
                        content = shop_prom['promPlan'][0]['msg']
                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) values (%s,%s,%s,%s,%s,now(),now())",
                            shop_id, content.encode('utf-8'), start_time, end_time, d_url)
                        logger.info("tmall shop %s get discount success", shop_id)
                    else:
                        logger.warning("taobao shop %s:%s not discount.", shop_id, url)
        except:
            logger.error("shop %s:%s xpath failed:%s", shop_id, url, traceback.format_exc())
Example #8
0
 def crawl_desc(self):
     try:
         self.descUrl = get_val(self.data,
                                "apiItemDesc").replace(r'''\/''', "/")
     except:
         try:
             self.descUrl = get_val(self.data,
                                    "ItemDesc").replace(r'''\/''', "/")
         except:
             if not self.data.find(u"暂无描述".encode('gb18030')) > 0:
                 self.descUrl = DESCURL_RE.search(self.data).group(0)
     # find http://dsc.taobaocdn.com/[^.]+\.desc.* as descUrl
     if self.descUrl:
         self.descContent = self.crawl_page(self.descUrl)
         logger.info("Got %s %s desc details %s", self.item_id, self.num_id,
                     len(self.descContent))
     else:
         logger.warn("%s desc url not found", self.item_id)
Example #9
0
 def crawl_collection(self):
     if self.is_tmall:
         c_url = "http://count.tbcdn.cn/counter3?callback=jsonp126&keys=ICCP_1_" + self.num_id
         collectionData = self.crawl_page(c_url)
         if collectionData:
             self.collection = int(get_num_val(collectionData, "ICCP_1_" + self.num_id))
         else:
             logger.warn("Can not parse tmall item collection %s", self.item_id)
     else:
         counterApi = get_val(self.data, "counterApi")
         if counterApi:
             counterApi = get_val(self.data, "counterApi").replace(r'''\/''', "/")
             counterData = self.crawl_page(counterApi + "&callback=DT.mods.SKU.CountCenter.saveCounts")
             try:
                 self.collection = int(get_num_val(counterData, 'ICCP_1_' + self.num_id))
                 self.browse = int(get_num_val(counterData, 'ICVT_7_' + self.num_id))
             except:
                 self.collection = 0
                 self.browse = 0
Example #10
0
    def crawl_volume(self):
        if self.is_tmall:
            apiItemInfoUrl = get_val(self.data,
                                     "initApi").replace(r'''\/''', "/")
            self.tmallInitApi = self.crawl_page(apiItemInfoUrl)
            try:
                self.tmallInitApijson = loads(
                    self.tmallInitApi.decode('gb18030').encode('utf8'))
            except:
                logger.info("parse tmall api json failed %s : %s",
                            self.item_id, traceback.format_exc())
            if self.tmallInitApijson:
                try:
                    self.volume = self.tmallInitApijson['defaultModel'][
                        'sellCountDO']['sellCount']
                except:
                    logger.warn("try to get volume from api failed %s",
                                self.item_id)
            if self.volume < 0:
                try:
                    self.volume = int(get_val(self.tmallInitApi, "sellCount"))
                except:
                    self.volume = 0
                    logger.warn("Can not parse tmall item volume %s",
                                self.item_id)

        else:
            apiItemInfoVal = get_val(self.data, "apiItemInfo")
            if apiItemInfoVal:
                apiItemInfoUrl = get_val(self.data, "apiItemInfo").replace(
                    r'''\/''', "/")
                itemInfoData = self.crawl_page(apiItemInfoUrl)
                try:
                    self.volume = int(get_num_val(itemInfoData, 'quanity'))
                    self.confirmVolume = int(
                        get_num_val(itemInfoData, 'confirmGoods'))
                except:
                    self.volume = 0
                    logger.warn("Can not parse taobao item volume %s",
                                self.item_id)
            else:
                self.volume = 0
Example #11
0
 def crawl_stock(self):
     if self.is_tmall:
         try:
             self.stock = int(get_num_val(self.tmallInitApi, "icTotalQuantity"))
         except:
             logger.warn("Can not parse tmall item stock %s", self.item_id)
     else:
         try:
             if self.dynamicStockData:
                 self.stock = int(get_val(self.dynamicStockData, "stock").strip())
             else:
                 logger.warn("Can not parse taobao item stock %s", self.item_id)
         except:
             logger.error("Can not parse tmall item stock %s", self.item_id)
Example #12
0
 def crawl_stock(self):
     if self.is_tmall:
         try:
             self.stock = int(
                 get_num_val(self.tmallInitApi, "icTotalQuantity"))
         except:
             logger.warn("Can not parse tmall item stock %s", self.item_id)
     else:
         try:
             if self.dynamicStockData:
                 self.stock = int(
                     get_val(self.dynamicStockData, "stock").strip())
             else:
                 logger.warn("Can not parse taobao item stock %s",
                             self.item_id)
         except:
             logger.error("Can not parse tmall item stock %s", self.item_id)
Example #13
0
    def crawl_price(self):
        self.bidPrice = self.html_obj.xpath(
            "//input[@name='current_price']/@value")
        self.originPrice = self.html_obj.xpath(
            "//strong[@id='J_StrPrice']/em[@class='tb-rmb-num']/text()")
        if not self.originPrice:
            self.originPrice = self.html_obj.xpath(
                "//strong[@class='J_originalPrice']/text()")

        self.promoteUrl2 = get_val(self.data, "apiPromoData")
        if self.promoteUrl2:
            self.promoteUrl2 = self.promoteUrl2.replace(r'''\/''', "/")

        price = ""
        if self.is_tmall and self.tmallInitApi and self.tmallInitApijson:
            try:
                priceInfo = self.tmallInitApijson['defaultModel'][
                    'itemPriceResultDO']['priceInfo']
                if priceInfo:
                    if priceInfo.has_key('def'):
                        defaultPriceInfo = priceInfo['def']
                    else:
                        defaultPriceInfo = priceInfo[priceInfo.keys()[0]]
                    # 2013-11-22: fetch the real promotion price rather than the price net of commission
                    if defaultPriceInfo.has_key(
                            'promotionList'
                    ) and defaultPriceInfo['promotionList']:
                        price = defaultPriceInfo['promotionList'][0]['price']
                    if not price:
                        if defaultPriceInfo.has_key('price'):
                            price = defaultPriceInfo['price']
                    if not price:
                        if defaultPriceInfo.has_key('promPrice'):
                            price = defaultPriceInfo['promPrice']['price']
                        elif defaultPriceInfo.has_key(
                                'promotionList'
                        ) and defaultPriceInfo['promotionList']:
                            price = str(
                                min([
                                    float(x.get('price', '100000000.0'))
                                    for x in defaultPriceInfo['promotionList']
                                ]))
            except:
                logger.warn("Parse tmall json price failed, %s", self.item_id)

        if not price:
            if self.promoteUrl2:
                self.promoteContent = self.crawl_page(
                    self.promoteUrl2).replace('&quot;', '"')
                tag = "low:"
                if self.promoteContent.find(tag) > 0:
                    pos = self.promoteContent.find(tag) + len(tag)
                    pos2 = self.promoteContent.find(',', pos)
                    price = self.promoteContent[pos:pos2]
                if not price:
                    price = get_num_val(self.promoteContent, 'price')
            else:
                self.promoteUrl = "http://marketing.taobao.com/home/promotion/item_promotion_list.do?itemId=%s" % self.num_id
                self.promoteContent = self.crawl_page(self.promoteUrl)
                if self.promoteContent:
                    self.promoteContent = self.promoteContent.replace(
                        '"', '&quot;')
                    tag = "promPrice&quot;:&quot;"
                    if self.promoteContent.find(tag) > 0:
                        pos = self.promoteContent.find(tag) + len(tag)
                        pos2 = self.promoteContent.find('&quot;', pos)
                        price = self.promoteContent[pos:pos2]

        if not price:
            tbPrice = self.html_obj.xpath("//strong[@class='tb-price']/text()")
            tbPrice1 = self.html_obj.xpath("//span[@class='tb-price']/text()")
            if tbPrice and tbPrice[0].strip():
                price = tbPrice[0].strip()
            elif tbPrice1 and tbPrice1[0].strip():
                price = tbPrice1[0].strip()

        if price.find("-") > 0:
            price = price.split('-')[0].strip()

        if not price:
            rg_m = re.compile('price:\"[0-9]+[.][0-9]+\"', re.IGNORECASE
                              | re.DOTALL).search(self.dynamicStockData)
            if rg_m:
                price_str = rg_m.group(0).split(":")[1].replace("\"", "")
                price = Decimal(price_str)

        # 2013-09-03  get price url
        if not price:
            # Slightly fiddly here: the price is pulled out of the raw response string
            price_url = "http://ajax.tbcdn.cn/json/umpStock.htm?itemId=%s&p=1" % self.num_id
            response = download(price_url, self.headers)
            rg = re.compile('price:\"[0-9]+[.][0-9]+\"',
                            re.IGNORECASE | re.DOTALL)
            m = rg.search(response.decode('gb18030').encode('utf8'))
            if m:
                price_str = m.group(0).split(":")[1].replace("\"", "")
                price = Decimal(price_str)

        # not chuxiao price, set origin price
        if not price:
            if self.originPrice:
                price = self.originPrice[0].strip()
            elif self.bidPrice:
                price = self.bidPrice[0].strip()
            if price.find("-") > 0:
                price = price.split('-')[0].strip()

        self.price = float(price)
        logger.debug("%s price is %s", self.item_id, self.price)
Example #14
0
    def crawl(self):
        try:
            self.data = self.crawl_page(self.url)
            if FLAGS.debug_parser:
                import pdb; pdb.set_trace()

            # check tmall
            if not self.is_tmall and len(self.data) < 256 and self.url.find('item.taobao.com') > 0 and self.data.find("window.location.href='http://detail.tmall.com/item.htm'+window.location.search") > 0:
                self.data = self.crawl_page(self.url.replace('item.taobao.com', 'detail.tmall.com'))

            if self.check_offline():
                self.is_offline = True

            self.html_obj = parse_html(self.data, encoding="gb18030")

            title = self.html_obj.xpath("//html/head/title/text()")
            if title and title[0].find(u"转卖") > 0:
                self.is_offline = True

            self.detailDiv = self.html_obj.xpath("//div[@id='detail']")
            self.buyButton = self.html_obj.xpath("//a[@id='J_LinkBuy']")
            self.originPrice = self.html_obj.xpath("//strong[@id='J_StrPrice']/em[@class='tb-rmb-num']/text()")
            if not self.originPrice:
                self.originPrice = self.html_obj.xpath("//strong[@class='J_originalPrice']/text()")
            #self.bidPrice = self.html_obj.xpath("//li[contains(concat(' ',normalize-space(@class),' '),' detail-price ')]/strong/text()")
            self.bidPrice = self.html_obj.xpath("//input[@name='current_price']/@value")
            self.thumbImages = self.html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
            if not len(self.thumbImages):
                try:
                    # try load thumb images for tmall page
                    self.thumbImages = [IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in self.html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")]

                    # taobao @src to @data-src
                    if not len(self.thumbImages):
                        self.thumbImages = self.html_obj.xpath("//ul[@id='J_UlThumb']//img/@data-src")
                except:
                    logger.warn("No thumbs found %s", self.item_id)

            tblogo = self.html_obj.xpath("//*[@id='shop-logo']")
            tmalllogo = self.html_obj.xpath("//*[@id='mallLogo']")
            if not self.is_tmall and tmalllogo:
                self.is_tmall = True

            if self.is_tmall:
                self.cid = get_val(self.data, "categoryId").split('&')[0]

                apiItemInfoUrl = get_val(self.data, "initApi").replace(r'''\/''', "/")
                self.tmallInitApi = self.crawl_page(apiItemInfoUrl)
                try:
                    self.tmallInitApijson = loads(self.tmallInitApi.decode('gb18030').encode('utf8'))
                except:
                    logger.info("parse tmall api json failed %s : %s", self.item_id, traceback.format_exc())
                if self.tmallInitApijson:
                    try:
                        self.volume = self.tmallInitApijson['defaultModel']['sellCountDO']['sellCount']
                    except:
                        logger.warn("try to get volume from api failed %s", self.item_id)
                if self.volume < 0:
                    try:
                        self.volume = int(get_val(self.tmallInitApi, "sellCount"))
                    except:
                        logger.warn("Can not parse item volume %s", self.item_id)

                # stock: icTotalQuantity
                """
                reviewInfoUrl = get_val(self.data, "apiMallReviews").replace(r'''\/''', "/")
                reviewInfoData = self.crawl_page(reviewInfoUrl)
                m = RATECOUNT_RE.match(reviewInfoData)
                if m:
                    self.reviewCount = m.group(1)
                else:
                    self.reviewCount = None
                """
            else:
                self.cid = get_val(self.data, "cid")

                apiItemInfoVal = get_val(self.data, "apiItemInfo")
                if apiItemInfoVal:
                    apiItemInfoUrl = get_val(self.data, "apiItemInfo").replace(r'''\/''', "/")
                    itemInfoData = self.crawl_page(apiItemInfoUrl)
                    try:
                        self.volume = int(get_num_val(itemInfoData, 'quanity'))
                    except:
                        self.volume = -1
                else:
                    self.volume = -1

                #interval = get_val(data2, 'interval')
                # stock: skudata = get_val(self.data, 'valItemInfo').replace(r'''\/''', "/")
                """
                reviewInfoUrl = get_val(self.data, "data-commonApi").replace(r'''\/''', "/")
                reviewInfoData = self.crawl_page(reviewInfoUrl)
                self.reviewCount = get_val(reviewInfoData, 'total')
                """
        except:
            logger.error("crawling %s unknown exception %s", self.item_id, traceback.format_exc(), extra={'tags':['crawlItemException',]})
            raise
Example #15
0
    def crawl_price(self):
        self.promoteUrl2 = get_val(self.data, "apiPromoData")
        if self.promoteUrl2:
            self.promoteUrl2 = self.promoteUrl2.replace(r'''\/''', "/")

        price = ""
        if self.is_tmall and self.tmallInitApi and self.tmallInitApijson:
            try:
                priceInfo = self.tmallInitApijson['defaultModel']['itemPriceResultDO']['priceInfo']
                if priceInfo:
                    if priceInfo.has_key('def'):
                        defaultPriceInfo = priceInfo['def']
                    else:
                        defaultPriceInfo = priceInfo[priceInfo.keys()[0]]

                    if defaultPriceInfo.has_key('promPrice'):
                        price = defaultPriceInfo['promPrice']['price']
                    elif defaultPriceInfo.has_key('promotionList') and defaultPriceInfo['promotionList']:
                        price = str(min([float(x.get('price','100000000.0')) for x in defaultPriceInfo['promotionList']]))
                    else:
                        price = defaultPriceInfo['price']
            except:
                logger.warn("Parse tmall json price failed, %s", self.item_id)

        if not price:
            if self.promoteUrl2:
                self.promoteContent = self.crawl_page(self.promoteUrl2).replace('&quot;', '"')
                tag = "low:"
                if self.promoteContent.find(tag) > 0:
                    pos = self.promoteContent.find(tag) + len(tag)
                    pos2 = self.promoteContent.find(',', pos)
                    price = self.promoteContent[pos:pos2]
                if not price:
                    price = get_num_val(self.promoteContent, 'price')
            else:
                self.promoteUrl = "http://marketing.taobao.com/home/promotion/item_promotion_list.do?itemId=%s" % self.num_id
                self.promoteContent = self.crawl_page(self.promoteUrl).replace('"', '&quot;')
                tag = "promPrice&quot;:&quot;"
                if self.promoteContent.find(tag) > 0:
                    pos = self.promoteContent.find(tag) + len(tag)
                    pos2 = self.promoteContent.find('&quot;', pos)
                    price = self.promoteContent[pos:pos2]

        if not price:
            tbPrice = self.html_obj.xpath("//strong[@class='tb-price']/text()")
            tbPrice1 = self.html_obj.xpath("//span[@class='tb-price']/text()")
            if tbPrice and tbPrice[0].strip():
                price = tbPrice[0].strip()
            elif tbPrice1 and tbPrice1[0].strip():
                price = tbPrice1[0].strip()

        if price.find("-") > 0:
            price = price.split('-')[0].strip()

        # 2013-09-03  get price url
        if not price:
            # Slightly fiddly here: the price is pulled out of the raw response string
            price_url = "http://ajax.tbcdn.cn/json/umpStock.htm?itemId=%s&p=1" % self.num_id
            response = download(price_url, self.headers)
            rg = re.compile('price:\"[0-9]+[.][0-9]+\"', re.IGNORECASE|re.DOTALL)
            m = rg.search(response.decode('gb18030').encode('utf8'))
            if m:
                price_str = m.group(0).split(":")[1].replace("\"", "")
                price = Decimal(price_str)

        # not chuxiao price, set origin price
        if not price:
            if self.originPrice:
                price = self.originPrice[0].strip()
            elif self.bidPrice:
                price = self.bidPrice[0].strip()
            if price.find("-") > 0:
                price = price.split('-')[0].strip()

        self.price = float(price)
        logger.debug("%s price is %s", self.item_id, self.price)
Example #16
0
    def crawl_title(self):
        try:
            self.data = self.crawl_page(self.url)
            if not self.data:
                logger.warn(
                    "download %s %s page failed, possible network connection failure",
                    self.item_id, self.num_id)
                return

            # check tmall
            if not self.is_tmall and len(self.data) < 256 and self.url.find(
                    'item.taobao.com'
            ) > 0 and self.data.find(
                    "window.location.href='http://detail.tmall.com/item.htm'+window.location.search"
            ) > 0:
                self.data = self.crawl_page(
                    self.url.replace('item.taobao.com', 'detail.tmall.com'))

            if self.check_offline():
                self.is_offline = True

            self.html_obj = parse_html(self.data, encoding="gb18030")

            title = self.html_obj.xpath("//html/head/title/text()")
            if title and title[0].find(u"转卖") > 0:
                self.is_offline = True
            if title:
                self.title = title[0].encode('utf8').replace(
                    "-淘宝网", "").replace("-tmall.com天猫", "")

            #tblogo = self.html_obj.xpath("//*[@id='shop-logo']")
            tmalllogo = self.html_obj.xpath("//*[@id='mallLogo']")
            if not tmalllogo:
                tmalllogo = self.html_obj.xpath("//*[@id='simple-logo']")
            if not self.is_tmall and tmalllogo:
                self.is_tmall = True

            self.thumbImages = self.html_obj.xpath(
                "//ul[@id='J_UlThumb']//img/@src")
            if not len(self.thumbImages):
                try:
                    # try load thumb images for tmall page
                    self.thumbImages = [
                        IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in
                        self.html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")
                    ]

                    # taobao @src to @data-src
                    if not len(self.thumbImages):
                        self.thumbImages = self.html_obj.xpath(
                            "//ul[@id='J_UlThumb']//img/@data-src")
                except:
                    logger.warn("No thumbs found %s", self.item_id)
            if self.is_tmall:
                self.cid = get_val(self.data, "categoryId").split('&')[0]
            else:
                self.cid = get_val(self.data, "cid")

            logger.info("Got %s %s html success", self.item_id, self.num_id)
        except:
            logger.error("crawling %s %s unknown exception %s",
                         self.item_id,
                         self.num_id,
                         traceback.format_exc(),
                         extra={'tags': [
                             'crawlItemException',
                         ]})
            raise
Example #17
0
    def crawl(self):
        try:
            self.data = self.crawl_page(self.url)
            if FLAGS.debug_parser:
                import pdb
                pdb.set_trace()

            # check tmall
            if not self.is_tmall and len(self.data) < 256 and self.url.find(
                    'item.taobao.com'
            ) > 0 and self.data.find(
                    "window.location.href='http://detail.tmall.com/item.htm'+window.location.search"
            ) > 0:
                self.data = self.crawl_page(
                    self.url.replace('item.taobao.com', 'detail.tmall.com'))

            if self.check_offline():
                self.is_offline = True

            self.html_obj = parse_html(self.data, encoding="gb18030")

            title = self.html_obj.xpath("//html/head/title/text()")
            if title and title[0].find(u"转卖") > 0:
                self.is_offline = True

            self.detailDiv = self.html_obj.xpath("//div[@id='detail']")
            self.buyButton = self.html_obj.xpath("//a[@id='J_LinkBuy']")
            self.originPrice = self.html_obj.xpath(
                "//strong[@id='J_StrPrice']/em[@class='tb-rmb-num']/text()")
            if not self.originPrice:
                self.originPrice = self.html_obj.xpath(
                    "//strong[@class='J_originalPrice']/text()")
            #self.bidPrice = self.html_obj.xpath("//li[contains(concat(' ',normalize-space(@class),' '),' detail-price ')]/strong/text()")
            self.bidPrice = self.html_obj.xpath(
                "//input[@name='current_price']/@value")
            self.thumbImages = self.html_obj.xpath(
                "//ul[@id='J_UlThumb']//img/@src")
            if not len(self.thumbImages):
                try:
                    # try load thumb images for tmall page
                    self.thumbImages = [
                        IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in
                        self.html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")
                    ]

                    # taobao @src to @data-src
                    if not len(self.thumbImages):
                        self.thumbImages = self.html_obj.xpath(
                            "//ul[@id='J_UlThumb']//img/@data-src")
                except:
                    logger.warn("No thumbs found %s", self.item_id)

            tblogo = self.html_obj.xpath("//*[@id='shop-logo']")
            tmalllogo = self.html_obj.xpath("//*[@id='mallLogo']")
            if not self.is_tmall and tmalllogo:
                self.is_tmall = True

            if self.is_tmall:
                self.cid = get_val(self.data, "categoryId").split('&')[0]

                apiItemInfoUrl = get_val(self.data,
                                         "initApi").replace(r'''\/''', "/")
                self.tmallInitApi = self.crawl_page(apiItemInfoUrl)
                try:
                    self.tmallInitApijson = loads(
                        self.tmallInitApi.decode('gb18030').encode('utf8'))
                except:
                    logger.info("parse tmall api json failed %s : %s",
                                self.item_id, traceback.format_exc())
                if self.tmallInitApijson:
                    try:
                        self.volume = self.tmallInitApijson['defaultModel'][
                            'sellCountDO']['sellCount']
                    except:
                        logger.warn("try to get volume from api failed %s",
                                    self.item_id)
                if self.volume < 0:
                    try:
                        self.volume = int(
                            get_val(self.tmallInitApi, "sellCount"))
                    except:
                        logger.warn("Can not parse item volume %s",
                                    self.item_id)

                # stock: icTotalQuantity
                """
                reviewInfoUrl = get_val(self.data, "apiMallReviews").replace(r'''\/''', "/")
                reviewInfoData = self.crawl_page(reviewInfoUrl)
                m = RATECOUNT_RE.match(reviewInfoData)
                if m:
                    self.reviewCount = m.group(1)
                else:
                    self.reviewCount = None
                """
            else:
                self.cid = get_val(self.data, "cid")

                apiItemInfoVal = get_val(self.data, "apiItemInfo")
                if apiItemInfoVal:
                    apiItemInfoUrl = get_val(self.data, "apiItemInfo").replace(
                        r'''\/''', "/")
                    itemInfoData = self.crawl_page(apiItemInfoUrl)
                    try:
                        self.volume = int(get_num_val(itemInfoData, 'quanity'))
                    except:
                        self.volume = -1
                else:
                    self.volume = -1

                #interval = get_val(data2, 'interval')
                # stock: skudata = get_val(self.data, 'valItemInfo').replace(r'''\/''', "/")
                """
                reviewInfoUrl = get_val(self.data, "data-commonApi").replace(r'''\/''', "/")
                reviewInfoData = self.crawl_page(reviewInfoUrl)
                self.reviewCount = get_val(reviewInfoData, 'total')
                """
        except:
            logger.error("crawling %s unknown exception %s",
                         self.item_id,
                         traceback.format_exc(),
                         extra={'tags': [
                             'crawlItemException',
                         ]})
            raise
Example #18
0
def crawler(sql):
    db = get_db_engine()
    shops = list(db.execute(sql))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    for shop in shops:
        shop_id = shop[0]
        url = str(shop[1])
        type = shop[2]
        if url[-1] != '/':
            url += "/"
        try:
            shop_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
            dongtai_url = url + "dongtai.htm"
            dongtai_data = download(dongtai_url, shop_headers)
            if dongtai_data:
                dongtai_obj = parse_html(dongtai_data, encoding="gb18030")
                dongtai_title = dongtai_obj.xpath("//title/text()")[0].encode('utf-8')
                if '店铺动态' in dongtai_title:
                    microscope_data = dongtai_obj.xpath("//*[@name='microscope-data']/@content")
                    userId = get_val(str(microscope_data), "userId")

                    if userId:
                        dongtai_headers = {'Referer': dongtai_url, 'User-Agent': DEFAULT_UA}
                        promotion_url = "http://shuo.taobao.com/feed/v_front_feeds.htm?_input_charset=utf-8&page=1" \
                                        "&userId=%s&vfeedTabId=115" % userId
                        promotion_data = download(promotion_url, dongtai_headers)

                        if promotion_data:
                            promotion_obj = parse_html(promotion_data, encoding="gb18030")
                            i = 0
                            while i < 10:
                                feedInfo = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-text']//span[@class='J_FeedInfo']/text()")[i].encode('utf-8')
                                if '店铺促销中' in feedInfo or '限时折扣' in feedInfo or '折扣限时' in feedInfo:
                                    #title = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dt//a/text()")[i]
                                    link = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//a[@class='fd-view-detail']/@href")[i]
                                    promotion_price = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price']/strong/text()")[i]
                                    price = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price g_price-original']/strong/text()")[i]
                                    promotion_time = promotion_obj.xpath(u"//div[@class='fd-item show-detail']//div[@class='fd-container']//dd[contains(text(),'起止日期')]/text()")[i]
                                    pt = promotion_time.encode('utf-8').replace("起止日期:","").split(" - ")
                                    start_time = pt[0].replace(".", "-")
                                    end_time = pt[1].replace(".", "-")
                                    if '2013' not in pt[1] and '2014' not in pt[1]:
                                        end_time = '2013-' + end_time

                                    if start_time > end_time:
                                        end_time = end_time.replace("2013", "2014")

                                    num_id = get_numiid(link, dongtai_headers)
                                    if num_id:
                                        sql = "select id from shop_promotion where shop_id=%s and num_id=%s" % (shop_id, num_id)
                                        re = list(db.execute(sql))
                                        if not re:
                                            db.execute("insert into shop_promotion (shop_id, num_id, price, "
                                                       "promotion_price, start_time, end_time, create_time, "
                                                       "last_update_time) values (%s,'%s',%s,%s,'%s','%s',now(),now())"
                                                       % (shop_id, num_id, price.replace(',', ''), promotion_price.replace(',', ''), start_time, end_time))
                                    else:
                                        logger.error("shop %s:%s crawler num_id failed", shop_id, url)

                                i += 1
                                logger.info("shop %s:%s crawler promotiom item num=%s", shop_id, url, i)

                        else:
                            logger.warning("shop %s:%s not promotion info", shop_id, url)
                    else:
                        logger.error("shop %s:%s crawler userId failed", shop_id, url)
                else:
                    logger.error("shop %s:%s not dongtai page", shop_id, url)
        except:
            logger.error("shop %s:%s crawler failed %s", shop_id, url, traceback.format_exc())
Example #19
0
def crawler(sql):
    db = get_db_engine()
    shops = list(db.execute(sql))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    for shop in shops:
        shop_id = shop[0]
        url = str(shop[1])
        type = shop[2]
        if url[-1] != '/':
            url += "/"
        try:
            shop_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
            dongtai_url = url + "dongtai.htm"
            dongtai_data = download(dongtai_url, shop_headers)
            if dongtai_data:
                dongtai_obj = parse_html(dongtai_data, encoding="gb18030")
                dongtai_title = dongtai_obj.xpath("//title/text()")[0].encode(
                    'utf-8')
                if '店铺动态' in dongtai_title:
                    microscope_data = dongtai_obj.xpath(
                        "//*[@name='microscope-data']/@content")
                    userId = get_val(str(microscope_data), "userId")

                    if userId:
                        dongtai_headers = {
                            'Referer': dongtai_url,
                            'User-Agent': DEFAULT_UA
                        }
                        promotion_url = "http://shuo.taobao.com/feed/v_front_feeds.htm?_input_charset=utf-8&page=1" \
                                        "&userId=%s&vfeedTabId=115" % userId
                        promotion_data = download(promotion_url,
                                                  dongtai_headers)

                        if promotion_data:
                            promotion_obj = parse_html(promotion_data,
                                                       encoding="gb18030")
                            i = 0
                            while i < 10:
                                feedInfo = promotion_obj.xpath(
                                    "//div[@class='fd-item show-detail']//div[@class='fd-text']//span[@class='J_FeedInfo']/text()"
                                )[i].encode('utf-8')
                                if '店铺促销中' in feedInfo or '限时折扣' in feedInfo or '折扣限时' in feedInfo:
                                    #title = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dt//a/text()")[i]
                                    link = promotion_obj.xpath(
                                        "//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//a[@class='fd-view-detail']/@href"
                                    )[i]
                                    promotion_price = promotion_obj.xpath(
                                        "//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price']/strong/text()"
                                    )[i]
                                    price = promotion_obj.xpath(
                                        "//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price g_price-original']/strong/text()"
                                    )[i]
                                    promotion_time = promotion_obj.xpath(
                                        u"//div[@class='fd-item show-detail']//div[@class='fd-container']//dd[contains(text(),'起止日期')]/text()"
                                    )[i]
                                    pt = promotion_time.encode(
                                        'utf-8').replace("起止日期:",
                                                         "").split(" - ")
                                    start_time = pt[0].replace(".", "-")
                                    end_time = pt[1].replace(".", "-")
                                    if '2013' not in pt[1] and '2014' not in pt[1]:
                                        end_time = '2013-' + end_time

                                    if start_time > end_time:
                                        end_time = end_time.replace(
                                            "2013", "2014")

                                    num_id = get_numiid(link, dongtai_headers)
                                    if num_id:
                                        sql = "select id from shop_promotion where shop_id=%s and num_id=%s" % (
                                            shop_id, num_id)
                                        re = list(db.execute(sql))
                                        if not re:
                                            db.execute(
                                                "insert into shop_promotion (shop_id, num_id, price, "
                                                "promotion_price, start_time, end_time, create_time, "
                                                "last_update_time) values (%s,'%s',%s,%s,'%s','%s',now(),now())"
                                                % (shop_id, num_id,
                                                   price.replace(',', ''),
                                                   promotion_price.replace(
                                                       ',', ''), start_time,
                                                   end_time))
                                    else:
                                        logger.error(
                                            "shop %s:%s crawler num_id failed",
                                            shop_id, url)

                                i += 1
                                logger.info(
                                    "shop %s:%s crawler promotiom item num=%s",
                                    shop_id, url, i)

                        else:
                            logger.warning("shop %s:%s not promotion info",
                                           shop_id, url)
                    else:
                        logger.error("shop %s:%s crawler userId failed",
                                     shop_id, url)
                else:
                    logger.error("shop %s:%s not dongtai page", shop_id, url)
        except:
            logger.error("shop %s:%s crawler failed %s", shop_id, url,
                         traceback.format_exc())