def update_vip_shop(shop_id, db=None):
    """Convert item detail urls of VIP shops into taobaoke (affiliate) urls.

    For every active item of the given shop (or of all FLAGS.vipshopids when
    shop_id is falsy), calls the taobaoke widget API in batches and upserts the
    converted click url into item_re.  A second pass inserts a plain taobao url
    for items that still have no item_re row.

    shop_id -- shop primary key, or falsy to process FLAGS.vipshopids
    db      -- db engine; created via get_db_engine() when not given
    """
    if not db:
        db = get_db_engine()
    limitsql = ""
    if FLAGS.limit:
        limitsql += " limit " + str(FLAGS.limit)
    if shop_id:
        shop_str = " shop.id = %s and " % shop_id
    else:
        shop_str = " shop.id in (%s) and " % ','.join(map(str, FLAGS.vipshopids))
    if FLAGS.interval > 0:
        # only touch rows created/modified within the last FLAGS.interval days
        from_date = datetime.datetime.strftime(datetime.datetime.now() - datetime.timedelta(FLAGS.interval), "%Y-%m-%d %H:%M:%S")
        shop_str += " (created > '%s' or modified > '%s') and " % (from_date, from_date)
    # NOTE(review): SQL is built by string interpolation; safe only while the
    # interpolated values come from our own config/db, not user input.
    sql = "select item.id,item.num_id,shop.type,item.detail_url,item_re.detail_url from shop,item left join item_re on item.id=item_re.item_id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id" % shop_str
    results = db.connect().execute(sql + limitsql)
    total = results.rowcount
    logger.debug("Processing %s result %s", sql, total)
    if total == 0:
        logger.info("nothing to do with shop %s", shop_str)
        return
    pos = 0
    converted = 0
    # rewrites an existing spm= query parameter, keeping prefix/suffix
    SPMRE = re.compile("^(.*spm=)([^&]*)(.*)$")
    for input, outputstr in convert_taobaoke_widget(list(filter_tbk_items(results)), fn_join_iids=join_iids, calllimit=60, outer_code=None, appkey='21315963', appsec='549d623e612832df7720101f83f951b9'):
        if not outputstr:
            logger.debug("Converted failed %s null %s progress %s/%s/%s -> in %s" % (input, shop_id, converted, pos, total, len(input)))
            continue
        output = extract_json_from_jsonp(outputstr)
        pos += len(input)
        if not output:
            logger.debug("Converted failed empty %s %s progress %s/%s/%s -> in %s" % (input, shop_id, converted, pos, total, len(input)))
            continue
        if output['total_results'] == 0 or not output['taobaoke_items']:
            logger.debug("No output %s %s %s/%s/%s", input, shop_id, converted, pos, total)
            continue
        succ_len = len(output['taobaoke_items']['taobaoke_item'])
        logger.info("Converted shop %s progress %s/%s/%s -> in %s out %s %s" % (shop_id, converted, pos, total, len(input), output['total_results'], succ_len))
        converted += succ_len
        Statsd.update_stats('guang.taobaoapi.convert', delta=succ_len)
        try:
            # map taobao numeric id back to our item id for this batch
            numid2id = dict([(int(num_id), id) for id, num_id, shop_type, jn_url, re_url in input])
            for result in output['taobaoke_items']['taobaoke_item']:
                isql = ""
                try:
                    num_iid = result['num_iid']
                    click_url = result['click_url'] + "&u=re_UCTRAC_CLK_&unid=re_UCTRAC_CLK_"
                    # convert spm to xtao tracking code
                    if click_url.find('spm=') > 0:
                        click_url = SPMRE.subn(r'\g<1>2014.21315963.1.0\g<3>', click_url)[0]
                    else:
                        click_url += '&spm=2014.21315963.1.0'
                    id = numid2id[num_iid]
                    isql = "insert into item_re (item_id, detail_url) values (%s, '%s') on duplicate key update detail_url='%s'" % (id, click_url, click_url)
                    logger.debug("process %s %s/%s -> %s", shop_id, pos, total, isql)
                    if not FLAGS.dryrun:
                        # escape % so the driver does not treat it as a bind marker
                        db.execute(isql.replace('%', '%%'))
                except KeyboardInterrupt:
                    raise
                except Exception, e:
                    logger.debug("in %s out %s" % (numid2id, result))
                    logger.warn("convert failed %s %s" % (isql, traceback.format_exc()))
        except KeyboardInterrupt:
            raise
        except:
            logger.warn("process failed %s %s reason %s" % (input, output, traceback.format_exc()))
    logger.info("Convert result %s - %s", converted, total)
    # retry pass: give unconverted items a plain taobao url with our spm code
    results = db.connect().execute(sql + limitsql)
    for row in filter_tbk_items(results):
        id, num_id, shop_type, jn_url, re_url = row
        if not re_url:
            sql = "insert into item_re (item_id, detail_url) values (%s, '%s')" % (id, 'http://item.taobao.com/item.htm?id=%s&spm=2014.21315963.1.0' % num_id)
            db.execute(sql)
def crawl_item2(kwargs):
    """Download thumbnail and description images for one item.

    Reads the previously-stored page html from crawl_html, extracts the
    gallery and description image urls, and hands them to ItemCrawler.

    kwargs -- dict work unit: 'item' (row: [0]=item id, [1]=numeric taobao
              id), 'is_commit', 'crawl_path', 'server_path',
              'org_server_path', 'is_remove', 'statshost'/'statsport',
              plus db settings consumed by get_db_engine(**kwargs).

    Returns ((item_id, summary),) where summary is a dict of download
    counters ({'suc1','count1','suc','count'} initially, or
    ItemCrawler.summary after a crawl).
    """
    #signal.signal(signal.SIGINT, signal.SIG_IGN)
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    crawl_path = kwargs['crawl_path']
    server_path = kwargs['server_path']
    org_server_path = kwargs['org_server_path']
    is_remove = kwargs['is_remove']
    item_id = item[0]
    num_id = item[1]
    is_success = False
    crawl_result = ((item_id, {'suc1': 0, 'count1': 0, 'suc': 0, 'count': 0}),)
    # BUGFIX: connect before the try so a failed connect cannot leave `conn`
    # unbound when the finally clause runs (previously raised NameError).
    conn = get_db_engine(**kwargs).connect()
    try:
        try:
            # NOTE(review): item_id interpolated into SQL; acceptable only
            # while it is an int from our own db.
            items = conn.execute("select html, desc_content from crawl_html where crawl_html.item_id=%s;" % item_id)
            result = list(items)
            if len(result) == 1:
                html = result[0][0]
                desc_content = result[0][1]
                html_obj = parse_html(html)
                # gallery thumbs: try img/@src, then li/@style, then @data-src
                thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
                if len(thumbImages) == 0:
                    thumbImages = [IMAGESTYLE_RE.subn(r'\g<1>', x)[0] for x in html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")]
                    # taobao moved @src to @data-src for lazy loading
                    if not len(thumbImages):
                        thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@data-src")
                if len(thumbImages) == 0:
                    logger.error("crawl item %s %s not found thumb images html size %s", item_id, num_id, len(html), extra={'tags':['crawl_thumb_empty',]})
                    return crawl_result
                # strips the "var desc='...';" JS wrapper around the desc html
                r = re.compile("(var desc='|)(.*)(\\\\|';)", re.M|re.S)
                # thumb urls end in _<w>x<h>.jpg; group 1 is the original image
                tr = re.compile("(.*)_\d+x\d+\.jpg$")
                tr_new = re.compile("(.+\.(jpg|png|gif))[^$]*.jpg$")
                desc_thumbs = desc_table_thumbs = lazy_desc_thumbs = []
                if desc_content:
                    desc_html = r.subn(r'\2', desc_content)[0]
                    desc_html_obj = parse_html(desc_html)
                    if desc_html_obj is not None:
                        desc_table_thumbs = desc_html_obj.xpath("//table/@background")
                        desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img[not(@data-ks-lazyload)]/@src")
                        lazy_desc_thumbs = desc_html_obj.xpath("//*[not(@href)]/img/@data-ks-lazyload")
                else:
                    logger.warn("crawl item %s %s desc content is empty!", item_id, num_id, extra={'tags':['crawl_nodesc',]})
                # (url, position, kind): 1=gallery, 2=description, 3=lazy desc
                images = []
                pos = 1
                for url in thumbImages:
                    if tr.match(url):
                        ori_url = tr.sub(r'\1', url)
                    elif tr_new.match(url):
                        ori_url = tr_new.sub(r'\1', url)
                    else:
                        # BUGFIX: an unparseable url used to be appended as
                        # (None, pos, 1); now it is logged and skipped.
                        logger.error("crawl item %s %s thumb image urls can not be parsed!", item_id, num_id, extra={'tags':['crawl_exception',]})
                        continue
                    images.append((ori_url, pos, 1))
                    pos += 1
                for url in desc_table_thumbs:
                    images.append((url, pos, 2))
                    pos += 1
                for url in desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 2))
                    pos += 1
                for url in lazy_desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 3))
                    pos += 1
                logger.debug("crawling %s %s %s", item_id, num_id, images)
                item_crawler = ItemCrawler(item_id, num_id, crawl_path, server_path, org_server_path, kwargs['statshost'], kwargs['statsport'])
                item_crawler.crawl(images, ((710,10000),), is_commit, conn, is_remove)
                is_success = item_crawler.success
                crawl_result = ((item_id, item_crawler.summary),)
        except Exception:
            logger.error("crawl item %s %s got exception %s", item_id, num_id, traceback.format_exc(), extra={'tags':['crawl_exception',]})
    finally:
        conn.close()
    Statsd.update_stats("guang.crawl.downimgcount", crawl_result[0][1]['suc1'] + crawl_result[0][1]['suc'], host=kwargs['statshost'], port=kwargs['statsport'])
    if is_success:
        logger.info("crawl item %s %s success %s", item_id, num_id, crawl_result)
        Statsd.increment('guang.crawl.itemimg.succ', host=kwargs['statshost'], port=kwargs['statsport'])
    else:
        logger.warn("crawl item %s %s failed %s", item_id, num_id, crawl_result, extra={'tags':['crawl_failed',]})
        Statsd.increment('guang.crawl.itemimg.failed', host=kwargs['statshost'], port=kwargs['statsport'])
    # BUGFIX: previously only the thumb-empty early-exit returned a value;
    # now every path returns the result tuple, matching the sibling variant.
    return crawl_result
def update_shop(shop_id, db):
    """Rewrite item.detail_url of a shop to taobaoke (affiliate) click urls.

    Selects active items (optionally skipping ones already pointing at
    s.click.taobao.com unless FLAGS.force), converts them in batches via the
    taobaoke widget API, updates item.detail_url and clears the retry row in
    tbk_item_convert; failed batches get their retry counters bumped.

    shop_id -- shop primary key, or falsy to process all shops
    db      -- db engine; created via get_db_engine() when falsy
    """
    if not db:
        db = get_db_engine()
    tbk = list(db.execute("select * from tbk where shop_id=%s" % shop_id))
    # NOTE(review): tbk_pid is computed here but not used in this function —
    # presumably consumed elsewhere or dead; confirm before removing.
    if tbk:
        tbk_pid = str(tbk[0][1])
    else:
        tbk_pid = FLAGS.pid
    limitsql = ""
    if FLAGS.limit:
        limitsql += " limit " + str(FLAGS.limit)
    if shop_id:
        shop_str = " shop.id = %s and " % shop_id
    else:
        shop_str = ""
    if FLAGS.interval > 0:
        # only touch rows created/modified within the last FLAGS.interval days
        from_date = datetime.datetime.strftime(datetime.datetime.now() - datetime.timedelta(FLAGS.interval), "%Y-%m-%d %H:%M:%S")
        shop_str += " (created > '%s' or modified > '%s') and " % (from_date, from_date)
    if not FLAGS.force:
        # '%%%%' collapses to a literal '%' wildcard after the two rounds of
        # %-formatting (here and in the driver)
        sql = "select item.id,item.num_id,tbk_item_convert.failed_count,tbk_item_convert.last_time from shop,item left join tbk_item_convert on tbk_item_convert.item_id=item.id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id and item.detail_url not like '%%%%s.click.taobao.com%%%%'" % shop_str
        results = db.connect().execute(sql + limitsql)
    else:
        sql = "select item.id,item.num_id,tbk_item_convert.failed_count,tbk_item_convert.last_time from shop,item left join tbk_item_convert on tbk_item_convert.item_id=item.id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id" % shop_str
        results = db.connect().execute(sql + limitsql)
    total = results.rowcount
    if total == 0:
        logger.info("nothing to do with shop %s", shop_id)
        return
    pos = 0
    converted = 0
    # rewrites an existing spm= query parameter, keeping prefix/suffix
    SPMRE = re.compile("^(.*spm=)([^&]*)(.*)$")
    for input, outputstr in convert_taobaoke_widget(list(filter_retry_items(results)), fn_join_iids=join_iids, calllimit=60, outer_code=None):
        if not outputstr:
            logger.debug("Converted failed null %s progress %s/%s/%s -> in %s" % (shop_id, converted, pos, total, len(input)))
            continue
        output = extract_json_from_jsonp(outputstr)
        pos += len(input)
        if not output:
            logger.debug("Converted failed empty %s progress %s/%s/%s -> in %s" % (shop_id, converted, pos, total, len(input)))
            continue
        if output['total_results'] == 0 or not output['taobaoke_items']:
            # whole batch failed: bump per-item retry counters
            logger.debug("No output %s %s/%s/%s", shop_id, converted, pos, total)
            for row in input:
                if not FLAGS.dryrun:
                    db.execute("insert into tbk_item_convert(item_id, failed_count, last_time) values(%s, 1, now()) on duplicate key update failed_count=failed_count+1, last_time=now()" % row[0])
            continue
        succ_len = len(output['taobaoke_items']['taobaoke_item'])
        logger.info("Converted shop %s progress %s/%s/%s -> in %s out %s %s" % (shop_id, converted, pos, total, len(input), output['total_results'], succ_len))
        converted += succ_len
        Statsd.update_stats('guang.taobaoapi.convert', delta=succ_len)
        try:
            # map taobao numeric id back to our item id for this batch
            numid2id = dict([(int(num_id), id) for id, num_id, failed_count, last_time in input])
            for result in output['taobaoke_items']['taobaoke_item']:
                sql = ""
                try:
                    num_iid = result['num_iid']
                    click_url = result['click_url'] + "&u=jn_UCTRAC_CLK_&unid=jn_UCTRAC_CLK_"
                    # convert spm to xtao tracking code
                    if click_url.find('spm=') > 0:
                        click_url = SPMRE.subn(r'\g<1>2014.12669715.1.0\g<3>', click_url)[0]
                    else:
                        click_url += '&spm=2014.12669715.1.0'
                    id = numid2id[num_iid]
                    sql = "update item set detail_url='%s' where id=%s" % (click_url, id)
                    logger.debug("process %s %s/%s -> %s", shop_id, pos, total, sql)
                    if not FLAGS.dryrun:
                        # escape % so the driver does not treat it as a bind marker
                        db.execute(sql.replace('%', '%%'))
                        db.execute("delete from tbk_item_convert where item_id=%s" % id)
                except KeyboardInterrupt:
                    raise
                except Exception, e:
                    logger.debug("in %s out %s" % (numid2id, result))
                    logger.warn("convert failed %s %s" % (sql, traceback.format_exc()))
        except KeyboardInterrupt:
            raise
        except:
            logger.warn("process failed %s %s reason %s" % (input, output, traceback.format_exc()))
    logger.info("Convert result %s - %s", converted, total)
def update_shop(shop_id, db):
    """Rewrite item.detail_url of a shop to taobaoke (affiliate) click urls.

    Duplicate of the sibling update_shop; kept as-is.  Selects active items
    (skipping ones already on s.click.taobao.com unless FLAGS.force), converts
    them in batches via the taobaoke widget API, updates item.detail_url and
    clears tbk_item_convert; failed batches get retry counters bumped.

    shop_id -- shop primary key, or falsy to process all shops
    db      -- db engine; created via get_db_engine() when falsy
    """
    if not db:
        db = get_db_engine()
    tbk = list(db.execute("select * from tbk where shop_id=%s" % shop_id))
    # NOTE(review): tbk_pid is unused in this function body — confirm intent.
    if tbk:
        tbk_pid = str(tbk[0][1])
    else:
        tbk_pid = FLAGS.pid
    limitsql = ""
    if FLAGS.limit:
        limitsql += " limit " + str(FLAGS.limit)
    if shop_id:
        shop_str = " shop.id = %s and " % shop_id
    else:
        shop_str = ""
    if FLAGS.interval > 0:
        # restrict to rows created/modified within the last FLAGS.interval days
        from_date = datetime.datetime.strftime(
            datetime.datetime.now() - datetime.timedelta(FLAGS.interval),
            "%Y-%m-%d %H:%M:%S")
        shop_str += " (created > '%s' or modified > '%s') and " % (from_date, from_date)
    if not FLAGS.force:
        # '%%%%' collapses to a literal '%' LIKE wildcard after both rounds
        # of %-formatting (here and in the db driver)
        sql = "select item.id,item.num_id,tbk_item_convert.failed_count,tbk_item_convert.last_time from shop,item left join tbk_item_convert on tbk_item_convert.item_id=item.id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id and item.detail_url not like '%%%%s.click.taobao.com%%%%'" % shop_str
        results = db.connect().execute(sql + limitsql)
    else:
        sql = "select item.id,item.num_id,tbk_item_convert.failed_count,tbk_item_convert.last_time from shop,item left join tbk_item_convert on tbk_item_convert.item_id=item.id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id" % shop_str
        results = db.connect().execute(sql + limitsql)
    total = results.rowcount
    if total == 0:
        logger.info("nothing to do with shop %s", shop_id)
        return
    pos = 0
    converted = 0
    # rewrites an existing spm= query parameter, keeping prefix/suffix
    SPMRE = re.compile("^(.*spm=)([^&]*)(.*)$")
    for input, outputstr in convert_taobaoke_widget(list(
            filter_retry_items(results)), fn_join_iids=join_iids, calllimit=60,
            outer_code=None):
        if not outputstr:
            logger.debug(
                "Converted failed null %s progress %s/%s/%s -> in %s" %
                (shop_id, converted, pos, total, len(input)))
            continue
        output = extract_json_from_jsonp(outputstr)
        pos += len(input)
        if not output:
            logger.debug(
                "Converted failed empty %s progress %s/%s/%s -> in %s" %
                (shop_id, converted, pos, total, len(input)))
            continue
        if output['total_results'] == 0 or not output['taobaoke_items']:
            # whole batch failed: bump per-item retry counters
            logger.debug("No output %s %s/%s/%s", shop_id, converted, pos, total)
            for row in input:
                if not FLAGS.dryrun:
                    db.execute(
                        "insert into tbk_item_convert(item_id, failed_count, last_time) values(%s, 1, now()) on duplicate key update failed_count=failed_count+1, last_time=now()"
                        % row[0])
            continue
        succ_len = len(output['taobaoke_items']['taobaoke_item'])
        logger.info("Converted shop %s progress %s/%s/%s -> in %s out %s %s" %
                    (shop_id, converted, pos, total, len(input),
                     output['total_results'], succ_len))
        converted += succ_len
        Statsd.update_stats('guang.taobaoapi.convert', delta=succ_len)
        try:
            # map taobao numeric id back to our item id for this batch
            numid2id = dict([(int(num_id), id)
                             for id, num_id, failed_count, last_time in input])
            for result in output['taobaoke_items']['taobaoke_item']:
                sql = ""
                try:
                    num_iid = result['num_iid']
                    click_url = result[
                        'click_url'] + "&u=jn_UCTRAC_CLK_&unid=jn_UCTRAC_CLK_"
                    # convert spm to xtao tracking code
                    if click_url.find('spm=') > 0:
                        click_url = SPMRE.subn(r'\g<1>2014.12669715.1.0\g<3>',
                                               click_url)[0]
                    else:
                        click_url += '&spm=2014.12669715.1.0'
                    id = numid2id[num_iid]
                    sql = "update item set detail_url='%s' where id=%s" % (
                        click_url, id)
                    logger.debug("process %s %s/%s -> %s", shop_id, pos, total, sql)
                    if not FLAGS.dryrun:
                        # escape % so the driver does not treat it as a bind marker
                        db.execute(sql.replace('%', '%%'))
                        db.execute(
                            "delete from tbk_item_convert where item_id=%s" % id)
                except KeyboardInterrupt:
                    raise
                except Exception, e:
                    logger.debug("in %s out %s" % (numid2id, result))
                    logger.warn("convert failed %s %s" %
                                (sql, traceback.format_exc()))
        except KeyboardInterrupt:
            raise
        except:
            logger.warn("process failed %s %s reason %s" %
                        (input, output, traceback.format_exc()))
    logger.info("Convert result %s - %s", converted, total)
def update_vip_shop(shop_id, db=None):
    """Convert item detail urls of VIP shops into taobaoke (affiliate) urls.

    Duplicate of the sibling update_vip_shop; kept as-is.  Converts items in
    batches via the taobaoke widget API and upserts the click url into
    item_re; a retry pass inserts a plain taobao url for items still missing
    an item_re row.

    shop_id -- shop primary key, or falsy to process FLAGS.vipshopids
    db      -- db engine; created via get_db_engine() when not given
    """
    if not db:
        db = get_db_engine()
    limitsql = ""
    if FLAGS.limit:
        limitsql += " limit " + str(FLAGS.limit)
    if shop_id:
        shop_str = " shop.id = %s and " % shop_id
    else:
        shop_str = " shop.id in (%s) and " % ','.join(
            map(str, FLAGS.vipshopids))
    if FLAGS.interval > 0:
        # restrict to rows created/modified within the last FLAGS.interval days
        from_date = datetime.datetime.strftime(
            datetime.datetime.now() - datetime.timedelta(FLAGS.interval),
            "%Y-%m-%d %H:%M:%S")
        shop_str += " (created > '%s' or modified > '%s') and " % (from_date,
                                                                   from_date)
    # NOTE(review): SQL built by string interpolation; safe only while inputs
    # come from our own config/db.
    sql = "select item.id,item.num_id,shop.type,item.detail_url,item_re.detail_url from shop,item left join item_re on item.id=item_re.item_id where %s shop.type <= 2 and shop.status = 1 and item.status = 1 and item.shop_id = shop.id" % shop_str
    results = db.connect().execute(sql + limitsql)
    total = results.rowcount
    logger.debug("Processing %s result %s", sql, total)
    if total == 0:
        logger.info("nothing to do with shop %s", shop_str)
        return
    pos = 0
    converted = 0
    # rewrites an existing spm= query parameter, keeping prefix/suffix
    SPMRE = re.compile("^(.*spm=)([^&]*)(.*)$")
    for input, outputstr in convert_taobaoke_widget(
            list(filter_tbk_items(results)), fn_join_iids=join_iids,
            calllimit=60, outer_code=None, appkey='21315963',
            appsec='549d623e612832df7720101f83f951b9'):
        if not outputstr:
            logger.debug(
                "Converted failed %s null %s progress %s/%s/%s -> in %s" %
                (input, shop_id, converted, pos, total, len(input)))
            continue
        output = extract_json_from_jsonp(outputstr)
        pos += len(input)
        if not output:
            logger.debug(
                "Converted failed empty %s %s progress %s/%s/%s -> in %s" %
                (input, shop_id, converted, pos, total, len(input)))
            continue
        if output['total_results'] == 0 or not output['taobaoke_items']:
            logger.debug("No output %s %s %s/%s/%s", input, shop_id, converted,
                         pos, total)
            continue
        succ_len = len(output['taobaoke_items']['taobaoke_item'])
        logger.info("Converted shop %s progress %s/%s/%s -> in %s out %s %s" %
                    (shop_id, converted, pos, total, len(input),
                     output['total_results'], succ_len))
        converted += succ_len
        Statsd.update_stats('guang.taobaoapi.convert', delta=succ_len)
        try:
            # map taobao numeric id back to our item id for this batch
            numid2id = dict([(int(num_id), id)
                             for id, num_id, shop_type, jn_url, re_url in input
                             ])
            for result in output['taobaoke_items']['taobaoke_item']:
                isql = ""
                try:
                    num_iid = result['num_iid']
                    click_url = result[
                        'click_url'] + "&u=re_UCTRAC_CLK_&unid=re_UCTRAC_CLK_"
                    # convert spm to xtao tracking code
                    if click_url.find('spm=') > 0:
                        click_url = SPMRE.subn(r'\g<1>2014.21315963.1.0\g<3>',
                                               click_url)[0]
                    else:
                        click_url += '&spm=2014.21315963.1.0'
                    id = numid2id[num_iid]
                    isql = "insert into item_re (item_id, detail_url) values (%s, '%s') on duplicate key update detail_url='%s'" % (
                        id, click_url, click_url)
                    logger.debug("process %s %s/%s -> %s", shop_id, pos, total,
                                 isql)
                    if not FLAGS.dryrun:
                        # escape % so the driver does not treat it as a bind marker
                        db.execute(isql.replace('%', '%%'))
                except KeyboardInterrupt:
                    raise
                except Exception, e:
                    logger.debug("in %s out %s" % (numid2id, result))
                    logger.warn("convert failed %s %s" %
                                (isql, traceback.format_exc()))
        except KeyboardInterrupt:
            raise
        except:
            logger.warn("process failed %s %s reason %s" %
                        (input, output, traceback.format_exc()))
    logger.info("Convert result %s - %s", converted, total)
    # retry pass: give unconverted items a plain taobao url with our spm code
    results = db.connect().execute(sql + limitsql)
    for row in filter_tbk_items(results):
        id, num_id, shop_type, jn_url, re_url = row
        if not re_url:
            sql = "insert into item_re (item_id, detail_url) values (%s, '%s')" % (
                id,
                'http://item.taobao.com/item.htm?id=%s&spm=2014.21315963.1.0' %
                num_id)
            db.execute(sql)
def crawl_item2(kwargs):
    """Download thumbnail and description images for one item.

    Older sibling of the tr_new variant; kept as-is.  Reads stored page html
    from crawl_html, extracts gallery/description image urls, and hands them
    to ItemCrawler.  Success counters are reported to Statsd.

    kwargs -- dict work unit: 'item' ([0]=item id, [1]=numeric taobao id),
              'is_commit', 'crawl_path', 'server_path', 'org_server_path',
              'is_remove', 'statshost'/'statsport', plus db settings for
              get_db_engine(**kwargs).

    Returns ((item_id, summary),) only on the thumb-empty early exit;
    other paths return None implicitly.
    """
    #signal.signal(signal.SIGINT, signal.SIG_IGN)
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    crawl_path = kwargs['crawl_path']
    server_path = kwargs['server_path']
    org_server_path = kwargs['org_server_path']
    is_remove = kwargs['is_remove']
    item_id = item[0]
    num_id = item[1]
    is_success = False
    crawl_result = ((item_id, {
        'suc1': 0,
        'count1': 0,
        'suc': 0,
        'count': 0
    }), )
    try:
        conn = get_db_engine(**kwargs).connect()
        try:
            items = conn.execute(
                "select html, desc_content from crawl_html where crawl_html.item_id=%s;"
                % item_id)
            result = list(items)
            if len(result) == 1:
                html = result[0][0]
                desc_content = result[0][1]
                html_obj = parse_html(html)
                # gallery thumbs: try img/@src, then li/@style, then @data-src
                thumbImages = html_obj.xpath("//ul[@id='J_UlThumb']//img/@src")
                if len(thumbImages) == 0:
                    thumbImages = [
                        IMAGESTYLE_RE.subn(r'\g<1>', x)[0]
                        for x in html_obj.xpath("//ul[@id='J_UlThumb']//li/@style")
                    ]
                    # taobao moved @src to @data-src for lazy loading
                    if not len(thumbImages):
                        thumbImages = html_obj.xpath(
                            "//ul[@id='J_UlThumb']//img/@data-src")
                if len(thumbImages) == 0:
                    logger.error(
                        "crawl item %s %s not found thumb images html size %s",
                        item_id, num_id, len(html),
                        extra={'tags': [
                            'crawl_thumb_empty',
                        ]})
                    return crawl_result
                # strips the "var desc='...';" JS wrapper around the desc html
                r = re.compile("(var desc='|)(.*)(\\\\|';)", re.M | re.S)
                # thumb urls end in _<w>x<h>.jpg; group 1 is the original image
                tr = re.compile("(.*)_\d+x\d+\.jpg$")
                desc_thumbs = lazy_desc_thumbs = []
                if desc_content:
                    desc_html = r.subn(r'\2', desc_content)[0]
                    desc_html_obj = parse_html(desc_html)
                    if desc_html_obj is not None:
                        desc_thumbs = desc_html_obj.xpath(
                            "//*[not(@href)]/img[not(@data-ks-lazyload)]/@src")
                        lazy_desc_thumbs = desc_html_obj.xpath(
                            "//*[not(@href)]/img/@data-ks-lazyload")
                else:
                    logger.warn("crawl item %s %s desc content is empty!",
                                item_id, num_id,
                                extra={'tags': [
                                    'crawl_nodesc',
                                ]})
                # (url, position, kind): 1=gallery, 2=description, 3=lazy desc
                images = []
                pos = 1
                for url in thumbImages:
                    images.append((tr.sub(r'\1', url), pos, 1))
                    pos += 1
                for url in desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 2))
                    pos += 1
                for url in lazy_desc_thumbs:
                    if "js/ckeditor" not in url:
                        images.append((url, pos, 3))
                    pos += 1
                logger.debug("crawling %s %s %s", item_id, num_id, images)
                item_crawler = ItemCrawler(item_id, num_id, crawl_path,
                                           server_path, org_server_path,
                                           kwargs['statshost'],
                                           kwargs['statsport'])
                item_crawler.crawl(images, ((710, 10000), ), is_commit, conn,
                                   is_remove)
                is_success = item_crawler.success
                crawl_result = ((item_id, item_crawler.summary), )
        except Exception, e:
            logger.error("crawl item %s %s got exception %s", item_id, num_id,
                         traceback.format_exc(),
                         extra={'tags': [
                             'crawl_exception',
                         ]})
    finally:
        conn.close()
    Statsd.update_stats("guang.crawl.downimgcount",
                        crawl_result[0][1]['suc1'] + crawl_result[0][1]['suc'],
                        host=kwargs['statshost'], port=kwargs['statsport'])
    if is_success:
        logger.info("crawl item %s %s success %s", item_id, num_id,
                    crawl_result)
        Statsd.increment('guang.crawl.itemimg.succ', host=kwargs['statshost'],
                         port=kwargs['statsport'])
    else:
        logger.warn("crawl item %s %s failed %s", item_id, num_id,
                    crawl_result, extra={'tags': [
                        'crawl_failed',
                    ]})
        Statsd.increment('guang.crawl.itemimg.failed',
                         host=kwargs['statshost'], port=kwargs['statsport'])
def crawl_item2(kwargs):
    """Crawl a live Taobao item page and persist price/volume/html/comments.

    Fetches the page via TaobaoHtml, then (when is_commit) marks offline
    items, tracks price/volume changes, optionally re-stores the page html
    (FLAGS.update_main/commit_html) and replaces the item's comment list in
    redis (FLAGS.update_comments).  Outcomes are counted in Statsd.

    kwargs -- dict work unit: 'item' ([0]=item id, [2]=numeric taobao id),
              'is_commit', 'max_comments', 'i'/'total' progress counters.

    Returns ((item_id, (html_len, promo_len, desc_len, n_thumbs, n_buybtn,
    price, n_comments)),); zeros when the crawl failed.
    """
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    is_success = False
    item_id = item[0]
    num_id = item[2]
    crawl_result = ((item_id, (0, 0, 0, 0, 0, 0.0, 0)), )
    tb = TaobaoHtml(item_id, num_id, max_comments=kwargs['max_comments'])
    db = None
    if is_commit:
        db = get_db_engine()
    try:
        logger.info("progress %s/%s id %s iid %s", kwargs['i'],
                    kwargs['total'], item_id, num_id)
        tb.crawl()
        if tb.is_offline and is_commit:
            db.execute(
                "update item set status=2, modified=now() where id=%s" %
                item_id)
        if tb.detailDiv and not tb.is_offline:
            tb.crawl_price()
            if is_commit:
                # check old price and volume
                pv = list(
                    db.execute("select price, volume from item where id=%s",
                               item_id))
                price = pv[0][0]
                volume = pv[0][1]
                # ignore near-zero prices (parse failures)
                if tb.price != price and tb.price > 0.001:
                    is_price_update = True
                else:
                    is_price_update = False
                if tb.volume > 0 and tb.volume != volume:
                    is_volume_update = True
                else:
                    is_volume_update = False
                if is_price_update:
                    db.execute(
                        "insert into price_update_track (item_id,time) values (%s,now()) on duplicate key update time=now()"
                        % item_id)
                    if is_volume_update:
                        db.execute(
                            "update item set price=%s, volume=%s where id=%s",
                            tb.price, tb.volume, item_id)
                    else:
                        db.execute("update item set price=%s where id=%s",
                                   tb.price, item_id)
                elif is_volume_update:
                    db.execute("update item set volume=%s where id=%s",
                               tb.volume, item_id)
                if is_price_update:
                    Statsd.increment("taobao.crawl.price_update")
                if is_volume_update:
                    Statsd.increment("taobao.crawl.volume_update")
            if FLAGS.update_main:
                tb.crawl_desc()
                if len(tb.thumbImages) > 0 and is_commit and FLAGS.commit_html:
                    # replace, not update: delete then insert the stored html
                    db.execute("delete from crawl_html where item_id=%s" %
                               item_id)
                    db.execute(
                        "insert into crawl_html (item_id,desc_url,promo_url,html,desc_content,promo_content,result,reason) values (%s, %s, %s, %s, %s, %s, %s, %s)",
                        item_id, tb.descUrl, tb.promoteUrl,
                        tb.data.decode('gb18030').encode('utf8'),
                        tb.descContent.decode('gb18030').encode('utf8'),
                        tb.promoteContent.decode('gb18030').encode('utf8'), 1,
                        "")
                    db.execute(
                        "update item set crawl_status=1 where id=%s" % item_id)
                    Statsd.increment("taobao.crawl.html_update")
            ############### processing comments ###########
            if FLAGS.update_comments:
                rediscli = get_redis(FLAGS.redishost, FLAGS.redisport)
                key = "guang:rate:%s" % item_id
                l = rediscli.llen(key)
                tb.crawl_rate()
                logger.info("replace comments %s %s -> %s", item_id, l,
                            len(tb.comments))
                #rediscli.lrange(key, 0, l)
                rediscli.delete(key)
                for c in tb.comments:
                    rediscli.rpush(key, c.SerializeToString())
                    # if limit size
                    #p = rediscli.pipeline()
                    #p.rpush(key, c.SerializeToString())
                    #p.ltrim(0, 99)
                    #p.execute()
                Statsd.increment("taobao.crawl.comments_update")
                Statsd.update_stats("taobao.crawl.comments_update_total",
                                    len(tb.comments))
            is_success = True
            crawl_result = ((item_id,
                             (len(tb.data), len(tb.promoteContent),
                              len(tb.descContent), len(tb.thumbImages),
                              len(tb.buyButton), tb.price,
                              len(tb.comments))), )
        else:
            logger.warn("crawl %s failed, no detail content or is_offline=%s",
                        item_id, tb.is_offline)
            crawl_result = ((item_id, (len(tb.data), 0, 0, 0, 0, 0.0, 0)), )
    except:
        logger.error("crawling %s unknown exception %s", item_id,
                     traceback.format_exc(),
                     extra={'tags': [
                         'crawlItemException',
                     ]})
    logger.info("crawling %s result %s - %s", item_id, is_success,
                crawl_result)
    if is_success:
        Statsd.increment("taobao.crawl.itemhtml.succ")
    else:
        Statsd.increment("taobao.crawl.itemhtml.failed")
    return crawl_result
def crawl_item2(kwargs):
    """Crawl a live Taobao item page and persist price/volume/html/comments.

    Duplicate of the sibling variant; kept as-is.  Fetches the page via
    TaobaoHtml, marks offline items, tracks price/volume changes, optionally
    re-stores the page html and replaces the comment list in redis.

    kwargs -- dict work unit: 'item' ([0]=item id, [2]=numeric taobao id),
              'is_commit', 'max_comments', 'i'/'total' progress counters.

    Returns ((item_id, (html_len, promo_len, desc_len, n_thumbs, n_buybtn,
    price, n_comments)),); zeros when the crawl failed.
    """
    item = kwargs['item']
    is_commit = kwargs['is_commit']
    is_success = False
    item_id = item[0]
    num_id = item[2]
    crawl_result = ((item_id, (0,0,0,0,0,0.0,0)),)
    tb = TaobaoHtml(item_id, num_id, max_comments=kwargs['max_comments'])
    db = None
    if is_commit:
        db = get_db_engine()
    try:
        logger.info("progress %s/%s id %s iid %s", kwargs['i'], kwargs['total'], item_id, num_id)
        tb.crawl()
        if tb.is_offline and is_commit:
            db.execute("update item set status=2, modified=now() where id=%s" % item_id)
        if tb.detailDiv and not tb.is_offline:
            tb.crawl_price()
            if is_commit:
                # check old price and volume
                pv = list(db.execute("select price, volume from item where id=%s", item_id))
                price = pv[0][0]
                volume = pv[0][1]
                # ignore near-zero prices (parse failures)
                if tb.price != price and tb.price > 0.001:
                    is_price_update = True
                else:
                    is_price_update = False
                if tb.volume > 0 and tb.volume != volume:
                    is_volume_update = True
                else:
                    is_volume_update = False
                if is_price_update:
                    db.execute("insert into price_update_track (item_id,time) values (%s,now()) on duplicate key update time=now()" % item_id)
                    if is_volume_update:
                        db.execute("update item set price=%s, volume=%s where id=%s", tb.price, tb.volume, item_id)
                    else:
                        db.execute("update item set price=%s where id=%s", tb.price, item_id)
                elif is_volume_update:
                    db.execute("update item set volume=%s where id=%s", tb.volume, item_id)
                if is_price_update:
                    Statsd.increment("taobao.crawl.price_update")
                if is_volume_update:
                    Statsd.increment("taobao.crawl.volume_update")
            if FLAGS.update_main:
                tb.crawl_desc()
                if len(tb.thumbImages) > 0 and is_commit and FLAGS.commit_html:
                    # replace, not update: delete then insert the stored html
                    db.execute("delete from crawl_html where item_id=%s" % item_id)
                    db.execute("insert into crawl_html (item_id,desc_url,promo_url,html,desc_content,promo_content,result,reason) values (%s, %s, %s, %s, %s, %s, %s, %s)", item_id, tb.descUrl, tb.promoteUrl, tb.data.decode('gb18030').encode('utf8'), tb.descContent.decode('gb18030').encode('utf8'), tb.promoteContent.decode('gb18030').encode('utf8'), 1, "")
                    db.execute("update item set crawl_status=1 where id=%s" % item_id)
                    Statsd.increment("taobao.crawl.html_update")
            ############### processing comments ###########
            if FLAGS.update_comments:
                rediscli = get_redis(FLAGS.redishost, FLAGS.redisport)
                key = "guang:rate:%s" % item_id
                l = rediscli.llen(key)
                tb.crawl_rate()
                logger.info("replace comments %s %s -> %s", item_id, l, len(tb.comments))
                #rediscli.lrange(key, 0, l)
                rediscli.delete(key)
                for c in tb.comments:
                    rediscli.rpush(key, c.SerializeToString())
                    # if limit size
                    #p = rediscli.pipeline()
                    #p.rpush(key, c.SerializeToString())
                    #p.ltrim(0, 99)
                    #p.execute()
                Statsd.increment("taobao.crawl.comments_update")
                Statsd.update_stats("taobao.crawl.comments_update_total", len(tb.comments))
            is_success = True
            crawl_result = ((item_id, (len(tb.data),len(tb.promoteContent),len(tb.descContent),len(tb.thumbImages),len(tb.buyButton),tb.price,len(tb.comments))),)
        else:
            logger.warn("crawl %s failed, no detail content or is_offline=%s", item_id, tb.is_offline)
            crawl_result = ((item_id, (len(tb.data),0,0,0,0,0.0,0)),)
    except:
        logger.error("crawling %s unknown exception %s", item_id, traceback.format_exc(), extra={'tags':['crawlItemException',]})
    logger.info("crawling %s result %s - %s", item_id, is_success, crawl_result)
    if is_success:
        Statsd.increment("taobao.crawl.itemhtml.succ")
    else:
        Statsd.increment("taobao.crawl.itemhtml.failed")
    return crawl_result