def crawl_collection(self):
    if self.is_tmall:
        c_url = "http://count.tbcdn.cn/counter3?callback=jsonp126&keys=ICCP_1_" + self.num_id
        collectionData = self.crawl_page(c_url)
        if collectionData:
            self.collection = int(get_num_val(collectionData, "ICCP_1_" + self.num_id))
        else:
            logger.warn("Can not parse tmall item collection %s", self.item_id)
    else:
        counterApi = get_val(self.data, "counterApi")
        if counterApi:
            counterApi = counterApi.replace(r'\/', "/")
            counterData = self.crawl_page(counterApi + "&callback=DT.mods.SKU.CountCenter.saveCounts")
            try:
                self.collection = int(get_num_val(counterData, 'ICCP_1_' + self.num_id))
                self.browse = int(get_num_val(counterData, 'ICVT_7_' + self.num_id))
            except:
                self.collection = 0
                self.browse = 0
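# The get_val / get_num_val helpers used throughout these snippets are not
# shown. A minimal sketch of what they plausibly do, regex extraction of a
# key's value from raw page text, is given below. The regexes and the
# None-on-miss behaviour are assumptions; only the names and call shapes come
# from the code above. The counter endpoint, for instance, answers with JSONP
# like jsonp126({"ICCP_1_12345":678});
import re

def get_val(data, key):
    # assumed helper: pull the quoted value that follows `key` out of raw page
    # text; returns None on a miss (call sites either test for a falsy result
    # or let the subsequent .replace()/int() fail inside a try block). Values
    # keep their JS escaping, which is why callers unescape r'\/' themselves.
    m = re.search(r'[\'"]?%s[\'"]?\s*[:=]\s*[\'"]([^\'"]*)[\'"]' % re.escape(key), data)
    return m.group(1) if m else None

def get_num_val(data, key):
    # assumed helper: same idea, but for unquoted numeric values such as the
    # counter payload above
    m = re.search(r'[\'"]?%s[\'"]?\s*[:=]\s*([0-9.]+)' % re.escape(key), data)
    return m.group(1) if m else None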
def crawl_volume(self):
    if self.is_tmall:
        apiItemInfoUrl = get_val(self.data, "initApi").replace(r'\/', "/")
        self.tmallInitApi = self.crawl_page(apiItemInfoUrl)
        try:
            self.tmallInitApijson = loads(self.tmallInitApi.decode('gb18030').encode('utf8'))
        except:
            logger.info("parse tmall api json failed %s : %s", self.item_id, traceback.format_exc())
        if self.tmallInitApijson:
            try:
                self.volume = self.tmallInitApijson['defaultModel']['sellCountDO']['sellCount']
            except:
                logger.warn("try to get volume from api failed %s", self.item_id)
        if self.volume < 0:
            try:
                self.volume = int(get_val(self.tmallInitApi, "sellCount"))
            except:
                self.volume = 0
                logger.warn("Can not parse tmall item volume %s", self.item_id)
    else:
        apiItemInfoVal = get_val(self.data, "apiItemInfo")
        if apiItemInfoVal:
            apiItemInfoUrl = apiItemInfoVal.replace(r'\/', "/")
            itemInfoData = self.crawl_page(apiItemInfoUrl)
            try:
                self.volume = int(get_num_val(itemInfoData, 'quanity'))
                self.confirmVolume = int(get_num_val(itemInfoData, 'confirmGoods'))
            except:
                self.volume = 0
                logger.warn("Can not parse taobao item volume %s", self.item_id)
        else:
            self.volume = 0
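# The nested lookups above and in crawl_price expect the tmall initApi JSON to
# be shaped roughly like the fragment below. This is a hedged reconstruction
# from the keys this code reads, not a documented schema; real payloads carry
# many more fields.
from json import loads

sample = loads('''{
  "defaultModel": {
    "sellCountDO": {"sellCount": 1024},
    "itemPriceResultDO": {
      "priceInfo": {
        "def": {
          "price": "99.00",
          "promPrice": {"price": "89.00"},
          "promotionList": [{"price": "79.00"}]
        }
      },
      "tmallShopProm": {
        "startTime": "1385049600000",
        "endTime": "1385568000000",
        "promPlan": [{"msg": "shop promotion"}]
      }
    }
  }
}''')

assert sample['defaultModel']['sellCountDO']['sellCount'] == 1024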
def crawl_title(self):
    try:
        self.data = self.crawl_page(self.url)
        if not self.data:
            logger.warn("download %s %s page failed, possible network connection failure", self.item_id, self.num_id)
            return
        # check tmall: taobao may answer with a tiny JS redirect stub, follow it
        if not self.is_tmall and len(self.data) < 256 and self.url.find('item.taobao.com') > 0 \
                and self.data.find("window.location.href='http://detail.tmall.com/item.htm'+window.location.search") > 0:
            self.data = self.crawl_page(self.url.replace('item.taobao.com', 'detail.tmall.com'))
        if self.check_offline():
            self.is_offline = True
        self.html_obj = parse_html(self.data, encoding="gb18030")
        title = self.html_obj.xpath("//html/head/title/text()")
        if title and title[0].find(u"转卖") > 0:
            self.is_offline = True
        if title:
            self.title = title[0].encode('utf8').replace("-淘宝网", "").replace("-tmall.com天猫", "")
        #tblogo = self.html_obj.xpath("//*[@id='shop-logo']")
        tmalllogo = self.html_obj.xpath("//*[@id='mallLogo']")
        if not tmalllogo:
            tmalllogo = self.html_obj.xpath("//*[@id='simple-logo']")
        if not self.is_tmall and tmalllogo:
            self.is_tmall = True
        self.thumbImages = self.html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
        if not len(self.thumbImages):
            try:
                # try load thumb images for tmall page
                self.thumbImages = [IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in self.html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")]
                # taobao @src to @data-src
                if not len(self.thumbImages):
                    self.thumbImages = self.html_obj.xpath("//ul[@id='J_UlThumb']//img/@data-src")
            except:
                logger.warn("No thumbs found %s", self.item_id)
        if self.is_tmall:
            self.cid = get_val(self.data, "categoryId").split('&')[0]
        else:
            self.cid = get_val(self.data, "cid")
        logger.info("Got %s %s html success", self.item_id, self.num_id)
    except:
        logger.error("crawling %s %s unknown exception %s", self.item_id, self.num_id,
                     traceback.format_exc(), extra={'tags': ['crawlItemException', ]})
        raise
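# IMAGESTYLE_RE is not defined in these snippets. On tmall pages the thumb
# <li> carries the image in an inline style attribute, so a plausible
# definition consistent with the subn(r'\g<1>', ...) call above is the
# following; the exact pattern is an assumption.
import re

IMAGESTYLE_RE = re.compile(r'.*url\(([^)]*)\).*')

style = "background:url(http://img.example.com/thumb.jpg) no-repeat;"
assert IMAGESTYLE_RE.subn(r'\g<1>', style)[0] == "http://img.example.com/thumb.jpg"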
def crawl_desc(self):
    try:
        self.descUrl = get_val(self.data, "apiItemDesc").replace(r'\/', "/")
    except:
        try:
            self.descUrl = get_val(self.data, "ItemDesc").replace(r'\/', "/")
        except:
            if not self.data.find(u"暂无描述".encode('gb18030')) > 0:
                # find http://dsc.taobaocdn.com/[^.]+\.desc.* as descUrl
                self.descUrl = DESCURL_RE.search(self.data).group(0)
    if self.descUrl:
        self.descContent = self.crawl_page(self.descUrl)
        logger.debug("Got %s desc details %s", self.item_id, len(self.descContent))
    else:
        logger.warn("%s desc url not found", self.item_id)
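# DESCURL_RE is reconstructed from the inline comment in crawl_desc
# ("find http://dsc.taobaocdn.com/[^.]+\.desc.* as descUrl"); the exact
# pattern, in particular where the match should stop, is an assumption.
import re

DESCURL_RE = re.compile(r'http://dsc\.taobaocdn\.com/[^.]+\.desc[^\s\'"]*')

page = 'var desc = "http://dsc.taobaocdn.com/i3/abcdef.desc|var1";'
assert DESCURL_RE.search(page).group(0) == 'http://dsc.taobaocdn.com/i3/abcdef.desc|var1'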
def crawl_taobao_rate(self, from_rate_id):
    rateListUrlBase = get_val(self.data, "data-listApi")
    if rateListUrlBase:
        rateListUrlBase = rateListUrlBase.replace(r'\/', "/")
    maxPage = 1
    page = 1
    total = 0
    while page <= maxPage:
        if page >= FLAGS.mostPage:
            break
        page1Result = self.crawl_taobao_rate_page(rateListUrlBase, page)
        if not page1Result:
            return
        results = page1Result['comments']
        maxPage = page1Result['maxPage']
        if not results:
            break
        self.taobao_comments_to_pb(results)
        page += 1
        total += len(results)
        if from_rate_id:
            if self.cut_comments(from_rate_id):
                break
        if self.max_comments > 0 and self.max_comments < total:
            break
    logger.debug("Got %s %s comments", self.item_id, len(self.comments))
# Variant of crawl_taobao_rate above; the only difference is that it also
# forwards the page number to taobao_comments_to_pb.
def crawl_taobao_rate(self, from_rate_id):
    rateListUrlBase = get_val(self.data, "data-listApi")
    if rateListUrlBase:
        rateListUrlBase = rateListUrlBase.replace(r'\/', "/")
    maxPage = 1
    page = 1
    total = 0
    while page <= maxPage:
        if page >= FLAGS.mostPage:
            break
        page1Result = self.crawl_taobao_rate_page(rateListUrlBase, page)
        if not page1Result:
            return
        results = page1Result['comments']
        maxPage = page1Result['maxPage']
        if not results:
            break
        self.taobao_comments_to_pb(results, page)
        page += 1
        total += len(results)
        if from_rate_id:
            if self.cut_comments(from_rate_id):
                break
        if self.max_comments > 0 and self.max_comments < total:
            break
    logger.debug("Got %s %s comments", self.item_id, len(self.comments))
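# Both variants above assume crawl_taobao_rate_page returns either a falsy
# value (abort the crawl) or a dict with 'comments' and 'maxPage'. The stub
# below only illustrates that contract; the page parameter name and the
# parse_rate_json step are hypothetical, not taken from the real code.
def crawl_taobao_rate_page(self, rateListUrlBase, page):
    data = self.crawl_page("%s&currentPageNum=%s" % (rateListUrlBase, page))  # param name assumed
    if not data:
        return None                           # caller returns immediately
    parsed = parse_rate_json(data)            # hypothetical parse step
    return {
        'comments': parsed.get('comments', []),   # empty list stops paging
        'maxPage': int(parsed.get('maxPage', 1)),
    }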
def crawler(sql):
    db = get_db_engine()
    items = list(db.execute(sql))
    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()
    for item in items:
        shop_id = item[0]
        shop_type = item[1]
        item_id = item[2]
        url = item[3]
        try:
            htm = get_item_htm(item_id, url, db)
            if shop_type == 1:
                htm_obj = parse_html(htm, encoding='gb18030')
                discount_url = htm_obj.xpath("//div[@id='promote']/@data-default")
                if discount_url and len(discount_url) > 0:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(discount_url[0], item_headers)
                    if disc_content.strip():
                        disc_obj = parse_html(disc_content, encoding='gb18030')
                        content = disc_obj.xpath("//div[@id='J_MjsData']/h3/text()")[0].strip()
                        dates = disc_obj.xpath("//div[@id='J_MjsData']/h3/span[@class='tb-indate']/text()")[0].strip()
                        # normalize "--" to an em dash, then split the range and
                        # convert "YYYY年MM月DD日" into "YYYY-MM-DD"
                        st = dates.encode('utf-8').replace("--", "—").split("—")
                        start_time = datetime.datetime.strptime(st[0].strip().replace('年', '-').replace("月", "-").replace("日", ""), '%Y-%m-%d')
                        end_time = datetime.datetime.strptime(st[1].strip().replace('年', '-').replace("月", "-").replace("日", ""), '%Y-%m-%d')
                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) "
                                   "values (%s,%s,%s,%s,%s,now(),now())",
                                   shop_id, content.encode('utf-8'), start_time, end_time, discount_url[0])
                        logger.info("taobao shop %s get discount success", shop_id)
                    else:
                        logger.warning("taobao shop %s:%s no discount.", shop_id, url)
                else:
                    logger.warning("taobao shop %s:%s no discount.", shop_id, url)
            elif shop_type == 2:
                d_url = get_val(htm, "initApi")
                if d_url:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(d_url, item_headers)
                    cjson = loads(disc_content.decode('gb18030').encode('utf8'))
                    shop_prom = cjson['defaultModel']['itemPriceResultDO']['tmallShopProm']
                    if shop_prom:
                        # timestamps are epoch milliseconds; convert to seconds
                        st = int(shop_prom['startTime']) / 1000
                        et = int(shop_prom['endTime']) / 1000
                        start_time = time.strftime("%Y-%m-%d", time.localtime(st))
                        end_time = time.strftime("%Y-%m-%d", time.localtime(et))
                        content = shop_prom['promPlan'][0]['msg']
                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) "
                                   "values (%s,%s,%s,%s,%s,now(),now())",
                                   shop_id, content.encode('utf-8'), start_time, end_time, d_url)
                        logger.info("tmall shop %s get discount success", shop_id)
                    else:
                        logger.warning("tmall shop %s:%s no discount.", shop_id, url)
        except:
            logger.error("shop %s:%s xpath failed:%s", shop_id, url, traceback.format_exc())
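# Hypothetical invocation of the discount crawler above. The table and column
# names are assumptions inferred from the tuple unpacking
# (shop_id, shop_type, item_id, url); only the row shape matters.
crawler("select s.id, s.type, i.id, i.detail_url "
        "from shop s join item i on i.shop_id = s.id "
        "where s.type in (1, 2)")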
# Variant of crawl_desc above with a more verbose success log line.
def crawl_desc(self):
    try:
        self.descUrl = get_val(self.data, "apiItemDesc").replace(r'\/', "/")
    except:
        try:
            self.descUrl = get_val(self.data, "ItemDesc").replace(r'\/', "/")
        except:
            if not self.data.find(u"暂无描述".encode('gb18030')) > 0:
                # find http://dsc.taobaocdn.com/[^.]+\.desc.* as descUrl
                self.descUrl = DESCURL_RE.search(self.data).group(0)
    if self.descUrl:
        self.descContent = self.crawl_page(self.descUrl)
        logger.info("Got %s %s desc details %s", self.item_id, self.num_id, len(self.descContent))
    else:
        logger.warn("%s desc url not found", self.item_id)
def crawl_stock(self):
    if self.is_tmall:
        try:
            self.stock = int(get_num_val(self.tmallInitApi, "icTotalQuantity"))
        except:
            logger.warn("Can not parse tmall item stock %s", self.item_id)
    else:
        try:
            if self.dynamicStockData:
                self.stock = int(get_val(self.dynamicStockData, "stock").strip())
            else:
                logger.warn("Can not parse taobao item stock %s", self.item_id)
        except:
            # this is the taobao branch; the original log message said "tmall"
            logger.error("Can not parse taobao item stock %s", self.item_id)
def crawl_price(self):
    self.bidPrice = self.html_obj.xpath("//input[@name='current_price']/@value")
    self.originPrice = self.html_obj.xpath("//strong[@id='J_StrPrice']/em[@class='tb-rmb-num']/text()")
    if not self.originPrice:
        self.originPrice = self.html_obj.xpath("//strong[@class='J_originalPrice']/text()")
    self.promoteUrl2 = get_val(self.data, "apiPromoData")
    if self.promoteUrl2:
        self.promoteUrl2 = self.promoteUrl2.replace(r'\/', "/")
    price = ""
    if self.is_tmall and self.tmallInitApi and self.tmallInitApijson:
        try:
            priceInfo = self.tmallInitApijson['defaultModel']['itemPriceResultDO']['priceInfo']
            if priceInfo:
                if priceInfo.has_key('def'):
                    defaultPriceInfo = priceInfo['def']
                else:
                    defaultPriceInfo = priceInfo[priceInfo.keys()[0]]
                # 2013-11-22 changed to take the real promotion price instead of
                # the price after commission is deducted
                if defaultPriceInfo.has_key('promotionList') and defaultPriceInfo['promotionList']:
                    price = defaultPriceInfo['promotionList'][0]['price']
                if not price:
                    if defaultPriceInfo.has_key('price'):
                        price = defaultPriceInfo['price']
                if not price:
                    if defaultPriceInfo.has_key('promPrice'):
                        price = defaultPriceInfo['promPrice']['price']
                    elif defaultPriceInfo.has_key('promotionList') and defaultPriceInfo['promotionList']:
                        price = str(min([float(x.get('price', '100000000.0')) for x in defaultPriceInfo['promotionList']]))
        except:
            logger.warn("Parse tmall json price failed, %s", self.item_id)
    if not price:
        if self.promoteUrl2:
            # the page HTML-escapes quotes, so undo &quot; before searching
            self.promoteContent = self.crawl_page(self.promoteUrl2).replace('&quot;', '"')
            tag = "low:"
            if self.promoteContent.find(tag) > 0:
                pos = self.promoteContent.find(tag) + len(tag)
                pos2 = self.promoteContent.find(',', pos)
                price = self.promoteContent[pos:pos2]
            if not price:
                price = get_num_val(self.promoteContent, 'price')
        else:
            self.promoteUrl = "http://marketing.taobao.com/home/promotion/item_promotion_list.do?itemId=%s" % self.num_id
            self.promoteContent = self.crawl_page(self.promoteUrl)
            if self.promoteContent:
                self.promoteContent = self.promoteContent.replace('&quot;', '"')
                tag = '"promPrice":"'
                if self.promoteContent.find(tag) > 0:
                    pos = self.promoteContent.find(tag) + len(tag)
                    pos2 = self.promoteContent.find('"', pos)
                    price = self.promoteContent[pos:pos2]
    if not price:
        tbPrice = self.html_obj.xpath("//strong[@class='tb-price']/text()")
        tbPrice1 = self.html_obj.xpath("//span[@class='tb-price']/text()")
        # original code tested `not tbPrice[0].strip()`, which can never assign
        # a non-empty price; use the element only when its text is non-empty
        if tbPrice and tbPrice[0].strip():
            price = tbPrice[0].strip()
        elif tbPrice1 and tbPrice1[0].strip():
            price = tbPrice1[0].strip()
        if price.find("-") > 0:
            price = price.split('-')[0].strip()
    if not price:
        rg_m = re.compile(r'price:"[0-9]+[.][0-9]+"', re.IGNORECASE | re.DOTALL).search(self.dynamicStockData)
        if rg_m:
            price_str = rg_m.group(0).split(":")[1].replace('"', "")
            price = Decimal(price_str)
    # 2013-09-03 get price url
    if not price:
        # a bit fiddly here: this branch is mostly string munging
        price_url = "http://ajax.tbcdn.cn/json/umpStock.htm?itemId=%s&p=1" % self.num_id
        response = download(price_url, self.headers)
        rg = re.compile(r'price:"[0-9]+[.][0-9]+"', re.IGNORECASE | re.DOTALL)
        m = rg.search(response.decode('gb18030').encode('utf8'))
        if m:
            price_str = m.group(0).split(":")[1].replace('"', "")
            price = Decimal(price_str)
    # no chuxiao (promotion) price found, fall back to the original price
    if not price:
        if self.originPrice:
            price = self.originPrice[0].strip()
        elif self.bidPrice:
            price = self.bidPrice[0].strip()
        if price.find("-") > 0:
            price = price.split('-')[0].strip()
    self.price = float(price)
    logger.debug("%s price is %s", self.item_id, self.price)
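# Self-contained check of the last-resort regex used above, which scrapes
# price:"12.34" out of a JS fragment. The fragment contents are made up; only
# the pattern and the split/strip handling come from crawl_price.
import re
from decimal import Decimal

rg = re.compile(r'price:"[0-9]+[.][0-9]+"', re.IGNORECASE | re.DOTALL)
fragment = 'skuMap:{},price:"58.00",stock:12'
m = rg.search(fragment)
assert m and Decimal(m.group(0).split(":")[1].replace('"', "")) == Decimal("58.00")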
def crawl(self):
    try:
        self.data = self.crawl_page(self.url)
        if FLAGS.debug_parser:
            import pdb
            pdb.set_trace()
        # check tmall: taobao may answer with a tiny JS redirect stub, follow it
        if not self.is_tmall and len(self.data) < 256 and self.url.find('item.taobao.com') > 0 \
                and self.data.find("window.location.href='http://detail.tmall.com/item.htm'+window.location.search") > 0:
            self.data = self.crawl_page(self.url.replace('item.taobao.com', 'detail.tmall.com'))
        if self.check_offline():
            self.is_offline = True
        self.html_obj = parse_html(self.data, encoding="gb18030")
        title = self.html_obj.xpath("//html/head/title/text()")
        if title and title[0].find(u"转卖") > 0:
            self.is_offline = True
        self.detailDiv = self.html_obj.xpath("//div[@id='detail']")
        self.buyButton = self.html_obj.xpath("//a[@id='J_LinkBuy']")
        self.originPrice = self.html_obj.xpath("//strong[@id='J_StrPrice']/em[@class='tb-rmb-num']/text()")
        if not self.originPrice:
            self.originPrice = self.html_obj.xpath("//strong[@class='J_originalPrice']/text()")
        #self.bidPrice = self.html_obj.xpath("//li[contains(concat(' ',normalize-space(@class),' '),' detail-price ')]/strong/text()")
        self.bidPrice = self.html_obj.xpath("//input[@name='current_price']/@value")
        self.thumbImages = self.html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
        if not len(self.thumbImages):
            try:
                # try load thumb images for tmall page
                self.thumbImages = [IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in self.html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")]
                # taobao @src to @data-src
                if not len(self.thumbImages):
                    self.thumbImages = self.html_obj.xpath("//ul[@id='J_UlThumb']//img/@data-src")
            except:
                logger.warn("No thumbs found %s", self.item_id)
        tblogo = self.html_obj.xpath("//*[@id='shop-logo']")
        tmalllogo = self.html_obj.xpath("//*[@id='mallLogo']")
        if not self.is_tmall and tmalllogo:
            self.is_tmall = True
        if self.is_tmall:
            self.cid = get_val(self.data, "categoryId").split('&')[0]
            apiItemInfoUrl = get_val(self.data, "initApi").replace(r'\/', "/")
            self.tmallInitApi = self.crawl_page(apiItemInfoUrl)
            try:
                self.tmallInitApijson = loads(self.tmallInitApi.decode('gb18030').encode('utf8'))
            except:
                logger.info("parse tmall api json failed %s : %s", self.item_id, traceback.format_exc())
            if self.tmallInitApijson:
                try:
                    self.volume = self.tmallInitApijson['defaultModel']['sellCountDO']['sellCount']
                except:
                    logger.warn("try to get volume from api failed %s", self.item_id)
            if self.volume < 0:
                try:
                    self.volume = int(get_val(self.tmallInitApi, "sellCount"))
                except:
                    logger.warn("Can not parse item volume %s", self.item_id)
            # stock: icTotalQuantity
            """
            reviewInfoUrl = get_val(self.data, "apiMallReviews").replace(r'\/', "/")
            reviewInfoData = self.crawl_page(reviewInfoUrl)
            m = RATECOUNT_RE.match(reviewInfoData)
            if m:
                self.reviewCount = m.group(1)
            else:
                self.reviewCount = None
            """
        else:
            self.cid = get_val(self.data, "cid")
            apiItemInfoVal = get_val(self.data, "apiItemInfo")
            if apiItemInfoVal:
                apiItemInfoUrl = apiItemInfoVal.replace(r'\/', "/")
                itemInfoData = self.crawl_page(apiItemInfoUrl)
                try:
                    self.volume = int(get_num_val(itemInfoData, 'quanity'))
                except:
                    self.volume = -1
            else:
                self.volume = -1
            #interval = get_val(data2, 'interval')
            # stock
            skudata = get_val(self.data, 'valItemInfo').replace(r'\/', "/")
            """
            reviewInfoUrl = get_val(self.data, "data-commonApi").replace(r'\/', "/")
            reviewInfoData = self.crawl_page(reviewInfoUrl)
            self.reviewCount = get_val(reviewInfoData, 'total')
            """
    except:
        logger.error("crawling %s unknown exception %s", self.item_id, traceback.format_exc(),
                     extra={'tags': ['crawlItemException', ]})
        raise
# Earlier variant of crawl_price above; it lacks the dynamicStockData branch
# and prefers promPrice over the promotion list.
def crawl_price(self):
    self.promoteUrl2 = get_val(self.data, "apiPromoData")
    if self.promoteUrl2:
        self.promoteUrl2 = self.promoteUrl2.replace(r'\/', "/")
    price = ""
    if self.is_tmall and self.tmallInitApi and self.tmallInitApijson:
        try:
            priceInfo = self.tmallInitApijson['defaultModel']['itemPriceResultDO']['priceInfo']
            if priceInfo:
                if priceInfo.has_key('def'):
                    defaultPriceInfo = priceInfo['def']
                else:
                    defaultPriceInfo = priceInfo[priceInfo.keys()[0]]
                if defaultPriceInfo.has_key('promPrice'):
                    price = defaultPriceInfo['promPrice']['price']
                elif defaultPriceInfo.has_key('promotionList') and defaultPriceInfo['promotionList']:
                    price = str(min([float(x.get('price', '100000000.0')) for x in defaultPriceInfo['promotionList']]))
                else:
                    price = defaultPriceInfo['price']
        except:
            logger.warn("Parse tmall json price failed, %s", self.item_id)
    if not price:
        if self.promoteUrl2:
            # the page HTML-escapes quotes, so undo &quot; before searching
            self.promoteContent = self.crawl_page(self.promoteUrl2).replace('&quot;', '"')
            tag = "low:"
            if self.promoteContent.find(tag) > 0:
                pos = self.promoteContent.find(tag) + len(tag)
                pos2 = self.promoteContent.find(',', pos)
                price = self.promoteContent[pos:pos2]
            if not price:
                price = get_num_val(self.promoteContent, 'price')
        else:
            self.promoteUrl = "http://marketing.taobao.com/home/promotion/item_promotion_list.do?itemId=%s" % self.num_id
            self.promoteContent = self.crawl_page(self.promoteUrl).replace('&quot;', '"')
            tag = '"promPrice":"'
            if self.promoteContent.find(tag) > 0:
                pos = self.promoteContent.find(tag) + len(tag)
                pos2 = self.promoteContent.find('"', pos)
                price = self.promoteContent[pos:pos2]
    if not price:
        tbPrice = self.html_obj.xpath("//strong[@class='tb-price']/text()")
        tbPrice1 = self.html_obj.xpath("//span[@class='tb-price']/text()")
        # original code tested `not tbPrice[0].strip()`, which can never assign
        # a non-empty price; use the element only when its text is non-empty
        if tbPrice and tbPrice[0].strip():
            price = tbPrice[0].strip()
        elif tbPrice1 and tbPrice1[0].strip():
            price = tbPrice1[0].strip()
        if price.find("-") > 0:
            price = price.split('-')[0].strip()
    # 2013-09-03 get price url
    if not price:
        # a bit fiddly here: this branch is mostly string munging
        price_url = "http://ajax.tbcdn.cn/json/umpStock.htm?itemId=%s&p=1" % self.num_id
        response = download(price_url, self.headers)
        rg = re.compile(r'price:"[0-9]+[.][0-9]+"', re.IGNORECASE | re.DOTALL)
        m = rg.search(response.decode('gb18030').encode('utf8'))
        if m:
            price_str = m.group(0).split(":")[1].replace('"', "")
            price = Decimal(price_str)
    # no chuxiao (promotion) price found, fall back to the original price
    if not price:
        if self.originPrice:
            price = self.originPrice[0].strip()
        elif self.bidPrice:
            price = self.bidPrice[0].strip()
        if price.find("-") > 0:
            price = price.split('-')[0].strip()
    self.price = float(price)
    logger.debug("%s price is %s", self.item_id, self.price)
def crawler(sql):
    db = get_db_engine()
    shops = list(db.execute(sql))
    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()
    for shop in shops:
        shop_id = shop[0]
        url = str(shop[1])
        type = shop[2]
        if url[-1] != '/':
            url += "/"
        try:
            shop_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
            dongtai_url = url + "dongtai.htm"
            dongtai_data = download(dongtai_url, shop_headers)
            if dongtai_data:
                dongtai_obj = parse_html(dongtai_data, encoding="gb18030")
                dongtai_title = dongtai_obj.xpath("//title/text()")[0].encode('utf-8')
                if '店铺动态' in dongtai_title:
                    microscope_data = dongtai_obj.xpath("//*[@name='microscope-data']/@content")
                    userId = get_val(str(microscope_data), "userId")
                    if userId:
                        dongtai_headers = {'Referer': dongtai_url, 'User-Agent': DEFAULT_UA}
                        promotion_url = "http://shuo.taobao.com/feed/v_front_feeds.htm?_input_charset=utf-8&page=1" \
                                        "&userId=%s&vfeedTabId=115" % userId
                        promotion_data = download(promotion_url, dongtai_headers)
                        if promotion_data:
                            promotion_obj = parse_html(promotion_data, encoding="gb18030")
                            i = 0
                            # scan at most the first 10 feed entries; fewer than 10
                            # raises IndexError, which the outer except swallows
                            while i < 10:
                                feedInfo = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-text']//span[@class='J_FeedInfo']/text()")[i].encode('utf-8')
                                if '店铺促销中' in feedInfo or '限时折扣' in feedInfo or '折扣限时' in feedInfo:
                                    #title = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dt//a/text()")[i]
                                    link = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//a[@class='fd-view-detail']/@href")[i]
                                    promotion_price = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price']/strong/text()")[i]
                                    price = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price g_price-original']/strong/text()")[i]
                                    promotion_time = promotion_obj.xpath(u"//div[@class='fd-item show-detail']//div[@class='fd-container']//dd[contains(text(),'起止日期')]/text()")[i]
                                    pt = promotion_time.encode('utf-8').replace("起止日期:", "").split(" - ")
                                    start_time = pt[0].replace(".", "-")
                                    end_time = pt[1].replace(".", "-")
                                    # the feed can omit the year on the end date; assume 2013
                                    # and roll over to 2014 if the range would be inverted
                                    # (original used `or`, which always prepended the year)
                                    if '2013' not in pt[1] and '2014' not in pt[1]:
                                        end_time = '2013-' + end_time
                                    if start_time > end_time:
                                        end_time = end_time.replace("2013", "2014")
                                    num_id = get_numiid(link, dongtai_headers)
                                    if num_id:
                                        sql = "select id from shop_promotion where shop_id=%s and num_id=%s" % (shop_id, num_id)
                                        rows = list(db.execute(sql))  # renamed from `re` to avoid shadowing the re module
                                        if not rows:
                                            db.execute("insert into shop_promotion (shop_id, num_id, price, "
                                                       "promotion_price, start_time, end_time, create_time, "
                                                       "last_update_time) values (%s,'%s',%s,%s,'%s','%s',now(),now())" % (
                                                           shop_id, num_id, price.replace(',', ''),
                                                           promotion_price.replace(',', ''), start_time, end_time))
                                    else:
                                        logger.error("shop %s:%s crawl num_id failed", shop_id, url)
                                i += 1
                            logger.info("shop %s:%s crawled promotion item num=%s", shop_id, url, i)
                        else:
                            logger.warning("shop %s:%s no promotion info", shop_id, url)
                    else:
                        logger.error("shop %s:%s crawl userId failed", shop_id, url)
                else:
                    logger.error("shop %s:%s not a dongtai page", shop_id, url)
        except:
            logger.error("shop %s:%s crawler failed %s", shop_id, url, traceback.format_exc())
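# Worked example of the date normalization above, assuming the feed shows the
# year on the start date but omits it on the end date, e.g.
# "起止日期:2013.12.28 - 01.05" for a promotion crossing New Year.
pt = ["2013.12.28", "01.05"]
start_time = pt[0].replace(".", "-")             # "2013-12-28"
end_time = pt[1].replace(".", "-")               # "01-05"
if '2013' not in pt[1] and '2014' not in pt[1]:
    end_time = '2013-' + end_time                # "2013-01-05"
if start_time > end_time:                        # zero-padded dates compare lexicographically
    end_time = end_time.replace("2013", "2014")  # "2014-01-05"
assert end_time == "2014-01-05"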