def download_image(self):
    headers = {
        'Referer': str(self.url),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
        'User-Agent': "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"
    }
    big_path = "%s/%s/big/%s" % (FLAGS.path, self.shop_id, self.local_pic_url)
    mid2_path = "%s/%s/mid2/%s" % (FLAGS.path, self.shop_id, self.local_pic_url)
    mid_path = "%s/%s/mid/%s" % (FLAGS.path, self.shop_id, self.local_pic_url)
    sma_path = "%s/%s/sma/%s" % (FLAGS.path, self.shop_id, self.local_pic_url)
    small2_path = "%s/%s/small2/%s" % (FLAGS.path, self.shop_id, self.local_pic_url)
    small3_path = "%s/%s/small3/%s" % (FLAGS.path, self.shop_id, self.local_pic_url)

    # error handling (try/except) is left to the caller so it can keep stats
    data = download(self.pic_url, headers)
    if not data:
        # retry once after a short pause
        time.sleep(2)
        data = download(self.pic_url, headers)
    self.save_image(big_path, data)
    self.imagemagick_resize(300, 300, big_path, mid2_path)
    self.imagemagick_resize(210, 210, big_path, mid_path)
    self.imagemagick_resize(60, 60, big_path, sma_path)
    self.imagemagick_resize(100, 100, big_path, small2_path)
    self.imagemagick_resize(145, 145, big_path, small3_path)
    return self.get_image_size(big_path)
def download_image(pic_url, shop_id, item_id, local_pic_url, fdfs_client):
    big_path = "%s/%s/big/%s" % (FLAGS.path, shop_id, local_pic_url)
    mid2_path = "%s/%s/mid2/%s" % (FLAGS.path, shop_id, local_pic_url)
    mid_path = "%s/%s/mid/%s" % (FLAGS.path, shop_id, local_pic_url)
    sma_path = "%s/%s/sma/%s" % (FLAGS.path, shop_id, local_pic_url)
    sma2_path = "%s/%s/small2/%s" % (FLAGS.path, shop_id, local_pic_url)
    sma3_path = "%s/%s/small3/%s" % (FLAGS.path, shop_id, local_pic_url)
    headers = {
        'Referer': "http://www.j.cn/product/%s.htm" % item_id,
        'User-Agent': "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"
    }
    # if pic_url contains "http://" fetch it over HTTP, otherwise treat it as a FastDFS file id
    if "http://" in pic_url:
        data = download(pic_url, headers)
        save_image(big_path, data)
    else:
        try:
            if not os.path.exists(os.path.dirname(big_path)):
                make_dirs_for_file(big_path)
            fdfs_client.download_to_file(big_path, pic_url)
        except (ConnectionError, ResponseError, DataError), e:
            # FastDFS fetch failed, fall back to downloading the image over HTTP
            fastdfs_filename = "http://image2.guang.j.cn/images/%s/big/%s" % (shop_id, local_pic_url)
            data = download(fastdfs_filename, headers)
            save_image(big_path, data)
            logger.info("%s:%s fdfs get failed: %s, falling back to http download", item_id, pic_url, e)
def crawler(sql):
    db = get_db_engine()
    items = list(db.execute(sql))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    for item in items:
        shop_id = item[0]
        shop_type = item[1]
        item_id = item[2]
        url = item[3]
        try:
            htm = get_item_htm(item_id, url, db)
            if shop_type == 1:
                htm_obj = parse_html(htm, encoding='gb18030')
                discount_url = htm_obj.xpath("//div[@id='promote']/@data-default")
                if discount_url and len(discount_url) > 0:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(discount_url[0], item_headers)
                    if disc_content.strip():
                        disc_obj = parse_html(disc_content, encoding='gb18030')
                        content = disc_obj.xpath("//div[@id='J_MjsData']/h3/text()")[0].strip()
                        dates = disc_obj.xpath("//div[@id='J_MjsData']/h3/span[@class='tb-indate']/text()")[0].strip()
                        st = dates.encode('utf-8').replace("--", "—").split("—")
                        start_time = datetime.datetime.strptime(st[0].strip().replace('年', '-').replace("月", "-").replace("日", ""), '%Y-%m-%d')
                        end_time = datetime.datetime.strptime(st[1].strip().replace('年', '-').replace("月", "-").replace("日", ""), '%Y-%m-%d')
                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) values (%s,%s,%s,%s,%s,now(),now())",
                                   shop_id, content.encode('utf-8'), start_time, end_time, discount_url[0])
                        logger.info("taobao shop %s get discount success", shop_id)
                    else:
                        logger.warning("taobao shop %s:%s no discount.", shop_id, url)
                else:
                    logger.warning("taobao shop %s:%s no discount.", shop_id, url)
            elif shop_type == 2:
                d_url = get_val(htm, "initApi")
                if d_url:
                    item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
                    disc_content = download(d_url, item_headers)
                    cjson = loads(disc_content.decode('gb18030').encode('utf8'))
                    shop_prom = cjson['defaultModel']['itemPriceResultDO']['tmallShopProm']
                    if shop_prom:
                        st = int(shop_prom['startTime']) / 1000
                        et = int(shop_prom['endTime']) / 1000
                        start_time = time.strftime("%Y-%m-%d", time.localtime(st))
                        end_time = time.strftime("%Y-%m-%d", time.localtime(et))
                        content = shop_prom['promPlan'][0]['msg']
                        db.execute("replace into shop_discount (shop_id,content,start_time,end_time,discount_url,create_time,last_update_time) values (%s,%s,%s,%s,%s,now(),now())",
                                   shop_id, content.encode('utf-8'), start_time, end_time, d_url)
                        logger.info("tmall shop %s get discount success", shop_id)
                    else:
                        logger.warning("tmall shop %s:%s no discount.", shop_id, url)
        except:
            logger.error("shop %s:%s xpath failed:%s", shop_id, url, traceback.format_exc())
def download_image(self, shop_id, num_id, pic_url, local_pic_url):
    headers = {
        'Referer': "http://item.taobao.com/item.htm?id=%s" % num_id,
        'User-Agent': DEFAULT_UA
    }
    big_path = "%s/%s/big/%s" % (FLAGS.path, shop_id, local_pic_url)
    mid2_path = "%s/%s/mid2/%s" % (FLAGS.path, shop_id, local_pic_url)
    mid_path = "%s/%s/mid/%s" % (FLAGS.path, shop_id, local_pic_url)
    sma_path = "%s/%s/sma/%s" % (FLAGS.path, shop_id, local_pic_url)
    small2_path = "%s/%s/small2/%s" % (FLAGS.path, shop_id, local_pic_url)
    small3_path = "%s/%s/small3/%s" % (FLAGS.path, shop_id, local_pic_url)
    try:
        data = download(pic_url, headers)
    except KeyboardInterrupt:
        raise
    except:
        logger.error("download %s:%s failed reason: %s", num_id, pic_url, traceback.format_exc())
        # give up on this image if the download failed
        return
    self.save_image(big_path, data)
    self.imagemagick_resize(300, 300, big_path, mid2_path)
    self.imagemagick_resize(210, 210, big_path, mid_path)
    self.imagemagick_resize(60, 60, big_path, sma_path)
    self.imagemagick_resize(100, 100, big_path, small2_path)
    self.imagemagick_resize(145, 145, big_path, small3_path)
    return self.get_image_size(big_path)
def convert_taobaoke_widget(items, fn_join_iids=lambda x: ','.join(x),
                            batch_size=40, calllimit=60, expire=600,
                            outer_code='jcn', appkey=TAOBAOKE_APPKEY,
                            appsec=TAOBAOKE_APPSECRET):
    ts = int(time.time() * 1000)
    msg = appsec + 'app_key' + str(appkey) + "timestamp" + str(ts) + appsec
    sign = hmac.HMAC(appsec, msg).hexdigest().upper()
    headers = {'User-Agent': DEFAULT_UA, 'Referer': "http://www.j.cn/"}
    for chunk in waitlimit(calllimit, 60.0, chunks(items, batch_size)):  # at most calllimit calls per minute
        params = {
            'app_key': appkey,
            '_t_sys': 'args=4',
            'method': 'taobao.taobaoke.widget.items.convert',
            'sign': sign,
            'timestamp': ts,
            'fields': "num_iid,nick,price,click_url,commission,commission_rate,commission_num,commission_volume,shop_click_url,seller_credit_score",
            'callback': 'TOP.io.jsonpCbs.t%s' % md5(str(random.random())).hexdigest()[:13],
            'partner_id': 'top-sdk-js-20120801',
        }
        params['num_iids'] = fn_join_iids(chunk)
        if outer_code:
            params['outer_code'] = outer_code
        url = "http://gw.api.taobao.com/widget/rest?%s" % urllib.urlencode(params)
        results = download(url, headers=headers)
        if results:
            Statsd.increment('guang.taobaoapi.widget_succ')
        else:
            Statsd.increment('guang.taobaoapi.widget_err')
        #logger.debug('Calling %s(%s) -> %s', request.method_name, request.api_params, results)
        yield (chunk, results)
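A minimal usage sketch for the generator above. The iid list, the JSONP-unwrapping step, and the use of simplejson for parsing are illustrative assumptions, not part of the original code.

# hypothetical numeric item ids to convert
num_iids = ["12345678", "23456789"]
for chunk, results in convert_taobaoke_widget(num_iids, batch_size=40):
    if not results:
        continue
    # results is a JSONP payload (TOP.io.jsonpCbs.tXXXX({...})); strip the wrapper before parsing
    body = results[results.find('(') + 1:results.rfind(')')]
    data = simplejson.loads(body)
    # ... use the click_url / commission fields from data here ...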
def download_with_referer(url, referer):
    """When crawling shop extension info the site requires a Referer header;
    pass referer=None if it is not needed."""
    if referer:
        headers = {
            'Referer': referer,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
            'User-Agent': "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"
        }
    else:
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
            'User-Agent': "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)"
        }
    return download(url, headers)
def download_image(kwargs):
    item_id = kwargs['item_id']
    num_id = kwargs['num_id']
    shop_id = kwargs['shop_id']
    crawl_path = kwargs['crawl_path']
    image_name = kwargs['image_name']
    pic_url = kwargs['pic_url']

    # download the main image first and save it under the big/ directory
    headers = {
        'Referer': "http://item.taobao.com/item.htm?id=%s" % num_id,
        'User-Agent': DEFAULT_UA
    }
    try:
        data = download(pic_url, headers)
    except KeyboardInterrupt:
        raise
    except:
        logger.info("download %s:%s failed reason: %s", item_id, pic_url, traceback.format_exc())
        # give up on this image if the download failed
        return

    shop_image_path = "%s/%s" % (crawl_path, shop_id)
    if not os.path.exists(shop_image_path):
        os.mkdir(shop_image_path)
    shop_image_big_path = "%s/big" % shop_image_path
    if not os.path.exists(shop_image_big_path):
        os.mkdir(shop_image_big_path)

    big_image_fullpath = "%s/%s" % (shop_image_big_path, image_name)
    f = open(big_image_fullpath, "w")
    f.write(data)
    f.close()
    try:
        image = Image.open(cStringIO.StringIO(open(big_image_fullpath).read()))
    except IOError, e:
        logger.info("Open image failed %s:%s %s", item_id, pic_url, e.message)
def download_page(self, url, max_retry_count=5):
    result = download(url,
                      max_retry=max_retry_count,
                      fn_is_banned=lambda data: data.find(u"您的访问受到限制".encode('gbk')) > 0,
                      throw_on_banned=True)
    return result
def GET(self):
    params = web.input(term_id=74, start=0, rows=120,
                       sortby='region_ctr_0111_4', edismax=False,
                       bfs='sum(mul(__SORT__,2.5),mul(__SIMI__,5.0))',
                       qf='item_title^1.4 shop_name^0.5', boost=None,
                       xks=12, wd='', debugQuery='on')
    bfs = boost = ''
    tagmatch = get_xks_tagmatch(params.xks)
    if params.edismax and params.bfs:
        bfs = replace_meta(params.bfs, params.sortby, tagmatch)
    if params.edismax and params.boost:
        boost = replace_meta(params.boost, params.sortby, tagmatch)
    url = build_solr_qs(params.term_id, params.start, params.rows,
                        params.sortby, params.edismax, params.qf,
                        bfs.split('|'), boost, params.debugQuery, params.wd)
    logger.debug('fetching %s', url)
    results = simplejson.loads(download(url))
    #import pdb; pdb.set_trace()
    return render_html("list.htm", {'results': results,
                                    'solrurl': url,
                                    'xksinfo': 'xks %s : tagmatch %s' % (params.xks, tagmatch),
                                    'params': params,
                                    })
def crawl_tao123(shops): base_url = "http://dianpu.tao123.com/nvzhuang/%s.php" end = 22 for i in range(1, end+1): url = base_url % i html = download(url) html_obj = parse_html(html) shops.update(html_obj.xpath("//div[@class='cg_shop_info']//a/@href"))
def crawl_tao123(shops): base_url = "http://www.meilishuo.com/shop/top/0/%s" end = 203 for i in range(end): logger.debug("processing %s", i) url = base_url % i html = download(url) html_obj = parse_html(html) shops.update(html_obj.xpath("//div[@class='shop_item']//a/@href"))
def crawl_dirtbshop(shops):
    base_url = "http://dirtbshop.com/list_shop_%s_1_1.html"
    end = 251
    for i in range(1, end + 1):
        url = base_url % i
        html = download(url)
        html_obj = parse_html(html)
        import pdb; pdb.set_trace()
        urls = html_obj.xpath("//span[@class='grebtn_in']/a/@href")
def get_dynamicStock(self):
    # extract the dynamic-stock URL from the inline script tag and assemble it,
    # e.g. http://detailskip.taobao.com/json/sib.htm?...
    s = self.html_obj.xpath("//script[contains(text(),'var b=')]/text()")
    s_re = re.compile("b=\"([^<>]*)\",a=")
    dynamicStock_url = s_re.search(str(s)).group(1)
    if dynamicStock_url:
        dynamicStock_url += "&ref=" + urllib.quote(self.url)
        self.dynamicStockData = download(dynamicStock_url, self.headers)
def check_graphite(server, target, n, warnv=0.0, errorv=0.0, gt=True, since="-1days", until="-"):
    url = "http://%s/render?format=json&from=%s&until=%s&target=%s" % (server, since, until, target)
    logger.debug("Fetching %s", url)
    data = download(url)
    json_data = simplejson.loads(data)
    data_points = json_data[0]['datapoints']
    # keep the last FLAGS.lastn data points, skipping points with an empty value
    lastn_datapoints = list(takelastn(data_points, FLAGS.lastn, lambda x: not x[0]))
    logger.debug("Last n data point %s", lastn_datapoints)
    is_warn = all_matched(lambda x: not ((x[0] > warnv) ^ gt), lastn_datapoints)
    is_error = all_matched(lambda x: not ((x[0] > errorv) ^ gt), lastn_datapoints)
    return is_warn, is_error, lastn_datapoints
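A minimal sketch of how check_graphite above might be called from an alerting script. The Graphite host, target, and thresholds are illustrative assumptions, and FLAGS.lastn is assumed to be configured elsewhere.

# hypothetical host, target and thresholds
is_warn, is_error, points = check_graphite(
    "graphite.example.com", "stats.guang.crawl.image_err",
    n=5, warnv=10.0, errorv=50.0, gt=True)
if is_error:
    logger.error("graphite check error: last points %s all above %s", points, 50.0)
elif is_warn:
    logger.warning("graphite check warn: last points %s all above %s", points, 10.0)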
def crawl_tao123(shops):
    for line in open(FLAGS.path):
        try:
            line = line.strip()
            url = "http://www.meilishuo.com%s" % line
            html = download(url)
            html_obj = parse_html(html)
            shop_url = html_obj.xpath("//div[@class='shop_summary']/a/@href")
            logger.debug("processing %s -> %s", line, shop_url)
            shops.update(shop_url)
        except:
            logger.error("processing %s failed", line)
def get_item_htm(id, url, db):
    sql = "select html,last_modified from crawl_html where item_id=%s" % id
    item_htm = list(db.execute(sql))
    last_modified = item_htm[0][1]
    now = datetime.datetime.now()
    days = now - last_modified
    if days > datetime.timedelta(days=7):
        # cached copy is older than a week: re-download the page and refresh the cache
        item_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
        item_htm = download(url, item_headers)
        db.execute("update crawl_html set html=%s,last_modified=now() where item_id=%s",
                   item_htm.decode('gb18030').encode('utf8'), id)
        return item_htm
    else:
        return item_htm[0][0]
def main():
    url = "http://%s:7080%s" % (FLAGS.solr_host, SOLR_URL)
    #import pdb; pdb.set_trace()
    results = simplejson.loads(download(url))
    db = get_db_engine()
    counts = []
    for doc in results['response']['docs']:
        item_id = doc['item_id']
        count = db.execute("select count(id) from favourite where itemid=%s and acttime>'2012-12-01' and favstatus=1 and firstchoose=0;" % item_id)
        if count.rowcount:
            counts.append(list(count)[0][0])
        else:
            counts.append(0)
    cs = Series(counts)
    logger.info(cs.describe())
def GET(self):
    params = web.input(term_id=74, start=0, rows=120,
                       sortby='region_ctr_0111_4', xks=12, wd='',
                       debugQuery='on')
    tagmatch = get_xks_tagmatch(params.xks)
    url = build_solr_custom_qs(params.term_id, params.start, params.rows,
                               params.sortby, params.debugQuery, params.wd,
                               tagmatch)
    logger.debug('fetching %s', url)
    results = simplejson.loads(download(url))
    #import pdb; pdb.set_trace()
    return render_html("custlist.htm", {'results': results,
                                        'solrurl': url,
                                        'xksinfo': 'xks %s : tagmatch %s' % (params.xks, tagmatch),
                                        'params': params,
                                        })
def crawl_main():
    for host in open(FLAGS.path):
        url = "http://%s" % (host.strip())
        try:
            html = download(url)
            #import pdb; pdb.set_trace()
            html_obj = parse_html(html, 'gbk')
            if url.find('tmall.com') > 0:
                shop_url = html_obj.xpath("//h3[@class='shop-title']/a/@href")[0]
                shop_name = html_obj.xpath("//h3[@class='shop-title']/a/text()")[0]
                print shop_url, shop_name.encode('utf8')
            else:
                shop_url = html_obj.xpath("//div[@class='shop-info-simple']/a/@href")[0]
                shop_name = html_obj.xpath("//div[@class='shop-info-simple']/a/text()")[0]
                shop_rank = html_obj.xpath("//span[@class='shop-rank']//img/@src")[0]
                #good_rate = html_obj.xpath("//li[@class='goodrate']/text()")[0]
                print shop_url, shop_name.encode('utf8'), shop_rank
        except KeyboardInterrupt:
            raise
        except:
            logger.warn("processing %s failed, %s", url, traceback.format_exc())
def download_page(url, headers, max_retry_count=5):
    return download(url, headers, max_retry=max_retry_count, throw_on_banned=True)
def crawler(sql):
    db = get_db_engine()
    shops = list(db.execute(sql))

    # debug
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    for shop in shops:
        shop_id = shop[0]
        url = str(shop[1])
        type = shop[2]
        if url[-1] != '/':
            url += "/"
        try:
            shop_headers = {'Referer': url, 'User-Agent': DEFAULT_UA}
            dongtai_url = url + "dongtai.htm"
            dongtai_data = download(dongtai_url, shop_headers)
            if dongtai_data:
                dongtai_obj = parse_html(dongtai_data, encoding="gb18030")
                dongtai_title = dongtai_obj.xpath("//title/text()")[0].encode('utf-8')
                if '店铺动态' in dongtai_title:
                    microscope_data = dongtai_obj.xpath("//*[@name='microscope-data']/@content")
                    userId = get_val(str(microscope_data), "userId")
                    if userId:
                        dongtai_headers = {'Referer': dongtai_url, 'User-Agent': DEFAULT_UA}
                        promotion_url = "http://shuo.taobao.com/feed/v_front_feeds.htm?_input_charset=utf-8&page=1" \
                                        "&userId=%s&vfeedTabId=115" % userId
                        promotion_data = download(promotion_url, dongtai_headers)
                        if promotion_data:
                            promotion_obj = parse_html(promotion_data, encoding="gb18030")
                            i = 0
                            while i < 10:
                                feedInfo = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-text']//span[@class='J_FeedInfo']/text()")[i].encode('utf-8')
                                if '店铺促销中' in feedInfo or '限时折扣' in feedInfo or '折扣限时' in feedInfo:
                                    #title = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dt//a/text()")[i]
                                    link = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//a[@class='fd-view-detail']/@href")[i]
                                    promotion_price = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price']/strong/text()")[i]
                                    price = promotion_obj.xpath("//div[@class='fd-item show-detail']//div[@class='fd-container']//dd//span[@class='g_price g_price-original']/strong/text()")[i]
                                    promotion_time = promotion_obj.xpath(u"//div[@class='fd-item show-detail']//div[@class='fd-container']//dd[contains(text(),'起止日期')]/text()")[i]
                                    pt = promotion_time.encode('utf-8').replace("起止日期:", "").split(" - ")
                                    start_time = pt[0].replace(".", "-")
                                    end_time = pt[1].replace(".", "-")
                                    # dates without a year get the current year prepended
                                    if '2013' not in pt[1] and '2014' not in pt[1]:
                                        end_time = '2013-' + end_time
                                    if start_time > end_time:
                                        end_time = end_time.replace("2013", "2014")
                                    num_id = get_numiid(link, dongtai_headers)
                                    if num_id:
                                        sql = "select id from shop_promotion where shop_id=%s and num_id=%s" % (shop_id, num_id)
                                        re = list(db.execute(sql))
                                        if not re:
                                            db.execute("insert into shop_promotion (shop_id, num_id, price, "
                                                       "promotion_price, start_time, end_time, create_time, "
                                                       "last_update_time) values (%s,'%s',%s,%s,'%s','%s',now(),now())" % (
                                                           shop_id, num_id, price.replace(',', ''),
                                                           promotion_price.replace(',', ''), start_time, end_time))
                                    else:
                                        logger.error("shop %s:%s crawler num_id failed", shop_id, url)
                                i += 1
                            logger.info("shop %s:%s crawler promotion item num=%s", shop_id, url, i)
                        else:
                            logger.warning("shop %s:%s no promotion info", shop_id, url)
                    else:
                        logger.error("shop %s:%s crawler userId failed", shop_id, url)
                else:
                    logger.error("shop %s:%s not dongtai page", shop_id, url)
        except:
            logger.error("shop %s:%s crawler failed %s", shop_id, url, traceback.format_exc())
def crawl_price(self):
    self.bidPrice = self.html_obj.xpath("//input[@name='current_price']/@value")
    self.originPrice = self.html_obj.xpath("//strong[@id='J_StrPrice']/em[@class='tb-rmb-num']/text()")
    if not self.originPrice:
        self.originPrice = self.html_obj.xpath("//strong[@class='J_originalPrice']/text()")
    self.promoteUrl2 = get_val(self.data, "apiPromoData")
    if self.promoteUrl2:
        self.promoteUrl2 = self.promoteUrl2.replace(r'''\/''', "/")
    price = ""
    if self.is_tmall and self.tmallInitApi and self.tmallInitApijson:
        try:
            priceInfo = self.tmallInitApijson['defaultModel']['itemPriceResultDO']['priceInfo']
            if priceInfo:
                if priceInfo.has_key('def'):
                    defaultPriceInfo = priceInfo['def']
                else:
                    defaultPriceInfo = priceInfo[priceInfo.keys()[0]]
                # 2013-11-22: take the real promotion price instead of the price with commission deducted
                if defaultPriceInfo.has_key('promotionList') and defaultPriceInfo['promotionList']:
                    price = defaultPriceInfo['promotionList'][0]['price']
                if not price:
                    if defaultPriceInfo.has_key('price'):
                        price = defaultPriceInfo['price']
                if not price:
                    if defaultPriceInfo.has_key('promPrice'):
                        price = defaultPriceInfo['promPrice']['price']
                    elif defaultPriceInfo.has_key('promotionList') and defaultPriceInfo['promotionList']:
                        price = str(min([float(x.get('price', '100000000.0')) for x in defaultPriceInfo['promotionList']]))
        except:
            logger.warn("Parse tmall json price failed, %s", self.item_id)
    if not price:
        if self.promoteUrl2:
            self.promoteContent = self.crawl_page(self.promoteUrl2).replace('&quot;', '"')
            tag = "low:"
            if self.promoteContent.find(tag) > 0:
                pos = self.promoteContent.find(tag) + len(tag)
                pos2 = self.promoteContent.find(',', pos)
                price = self.promoteContent[pos:pos2]
            if not price:
                price = get_num_val(self.promoteContent, 'price')
        else:
            self.promoteUrl = "http://marketing.taobao.com/home/promotion/item_promotion_list.do?itemId=%s" % self.num_id
            self.promoteContent = self.crawl_page(self.promoteUrl)
            if self.promoteContent:
                self.promoteContent = self.promoteContent.replace('&quot;', '"')
                tag = '"promPrice":"'
                if self.promoteContent.find(tag) > 0:
                    pos = self.promoteContent.find(tag) + len(tag)
                    pos2 = self.promoteContent.find('"', pos)
                    price = self.promoteContent[pos:pos2]
    if not price:
        tbPrice = self.html_obj.xpath("//strong[@class='tb-price']/text()")
        tbPrice1 = self.html_obj.xpath("//span[@class='tb-price']/text()")
        if tbPrice and tbPrice[0].strip():
            price = tbPrice[0].strip()
        elif tbPrice1 and tbPrice1[0].strip():
            price = tbPrice1[0].strip()
        if price.find("-") > 0:
            price = price.split('-')[0].strip()
    if not price:
        rg_m = re.compile('price:\"[0-9]+[.][0-9]+\"', re.IGNORECASE | re.DOTALL).search(self.dynamicStockData)
        if rg_m:
            price_str = rg_m.group(0).split(":")[1].replace("\"", "")
            price = Decimal(price_str)
    # 2013-09-03 get price url
    if not price:
        # this part is a bit tricky: the price has to be parsed out of the raw response string
        pirce_url = "http://ajax.tbcdn.cn/json/umpStock.htm?itemId=%s&p=1" % self.num_id
        response = download(pirce_url, self.headers)
        rg = re.compile('price:\"[0-9]+[.][0-9]+\"', re.IGNORECASE | re.DOTALL)
        m = rg.search(response.decode('gb18030').encode('utf8'))
        if m:
            price_str = m.group(0).split(":")[1].replace("\"", "")
            price = Decimal(price_str)
    # no promotion (chuxiao) price found, fall back to the original price
    if not price:
        if self.originPrice:
            price = self.originPrice[0].strip()
        elif self.bidPrice:
            price = self.bidPrice[0].strip()
        if price.find("-") > 0:
            price = price.split('-')[0].strip()
    self.price = float(price)
    logger.debug("%s price is %s", self.item_id, self.price)
def crawl_page(self, url):
    result = download(url, self.headers)
    return result
def download_image(self, url):
    t = time.time()
    data = download(url, headers=self.headers)
    spent = time.time() - t
    # report the image download latency in milliseconds
    Statsd.timing("guang.crawl.image", spent * 1000, host=self.statshost, port=self.statsport)
    return data
def crawl_price(self):
    self.promoteUrl2 = get_val(self.data, "apiPromoData")
    if self.promoteUrl2:
        self.promoteUrl2 = self.promoteUrl2.replace(r'''\/''', "/")
    price = ""
    if self.is_tmall and self.tmallInitApi and self.tmallInitApijson:
        try:
            priceInfo = self.tmallInitApijson['defaultModel']['itemPriceResultDO']['priceInfo']
            if priceInfo:
                if priceInfo.has_key('def'):
                    defaultPriceInfo = priceInfo['def']
                else:
                    defaultPriceInfo = priceInfo[priceInfo.keys()[0]]
                if defaultPriceInfo.has_key('promPrice'):
                    price = defaultPriceInfo['promPrice']['price']
                elif defaultPriceInfo.has_key('promotionList') and defaultPriceInfo['promotionList']:
                    price = str(min([float(x.get('price', '100000000.0')) for x in defaultPriceInfo['promotionList']]))
                else:
                    price = defaultPriceInfo['price']
        except:
            logger.warn("Parse tmall json price failed, %s", self.item_id)
    if not price:
        if self.promoteUrl2:
            self.promoteContent = self.crawl_page(self.promoteUrl2).replace('&quot;', '"')
            tag = "low:"
            if self.promoteContent.find(tag) > 0:
                pos = self.promoteContent.find(tag) + len(tag)
                pos2 = self.promoteContent.find(',', pos)
                price = self.promoteContent[pos:pos2]
            if not price:
                price = get_num_val(self.promoteContent, 'price')
        else:
            self.promoteUrl = "http://marketing.taobao.com/home/promotion/item_promotion_list.do?itemId=%s" % self.num_id
            self.promoteContent = self.crawl_page(self.promoteUrl).replace('&quot;', '"')
            tag = '"promPrice":"'
            if self.promoteContent.find(tag) > 0:
                pos = self.promoteContent.find(tag) + len(tag)
                pos2 = self.promoteContent.find('"', pos)
                price = self.promoteContent[pos:pos2]
    if not price:
        tbPrice = self.html_obj.xpath("//strong[@class='tb-price']/text()")
        tbPrice1 = self.html_obj.xpath("//span[@class='tb-price']/text()")
        if tbPrice and tbPrice[0].strip():
            price = tbPrice[0].strip()
        elif tbPrice1 and tbPrice1[0].strip():
            price = tbPrice1[0].strip()
        if price.find("-") > 0:
            price = price.split('-')[0].strip()
    # 2013-09-03 get price url
    if not price:
        # this part is a bit tricky: the price has to be parsed out of the raw response string
        pirce_url = "http://ajax.tbcdn.cn/json/umpStock.htm?itemId=%s&p=1" % self.num_id
        response = download(pirce_url, self.headers)
        rg = re.compile('price:\"[0-9]+[.][0-9]+\"', re.IGNORECASE | re.DOTALL)
        m = rg.search(response.decode('gb18030').encode('utf8'))
        if m:
            price_str = m.group(0).split(":")[1].replace("\"", "")
            price = Decimal(price_str)
    # no promotion (chuxiao) price found, fall back to the original price
    if not price:
        if self.originPrice:
            price = self.originPrice[0].strip()
        elif self.bidPrice:
            price = self.bidPrice[0].strip()
        if price.find("-") > 0:
            price = price.split('-')[0].strip()
    self.price = float(price)
    logger.debug("%s price is %s", self.item_id, self.price)