def update_item(sql):
    """Refresh stored item titles and pictures from Taobao.

    Executes ``sql`` to pick the items to check.  Each selected row is read
    positionally as ``(id, num_id, shop_id, title, pic_url, local_pic_url)``.
    The rows are sent to Taobao in batches through ``get_taobao_items`` and,
    for every item that came back with a response:

    * a changed title is written back to the ``item`` table;
    * a changed picture URL triggers ``download_image`` and the new
      ``pic_url`` / ``pic_width`` / ``pic_height`` are written back.

    When ``FLAGS.forcibly`` is set, both updates are performed regardless of
    whether the values differ.  A failure on one item is logged and does not
    stop the run.  The elapsed time (in milliseconds) is logged at the end.

    :param sql: SELECT statement yielding the row layout described above.
    """
    t = time.time()
    db = get_db_engine()
    rows = db.execute(sql)

    results = get_taobao_items(
        get_top(), rows,
        fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]),
        calllimit=60)

    for batch_item in results:
        for _iid, item in batch_item.items.items():
            try:
                req = item['req']
                item_id = req[0]
                item_iid = req[1]
                shop_id = req[2]
                item_title = req[3]
                item_picurl = req[4]
                # Reuse the file name already stored in the database
                # (e.g. "18142957186_28924096.jpg") instead of generating
                # a new one.
                local_pic_url = req[5]

                resp = item['resp']
                if not resp:
                    # Nothing came back for this item; leave it untouched.
                    continue

                taobao_title = resp['title']
                taobao_picurl = resp['pic_url']

                # FLAGS.forcibly forces both updates; otherwise update only
                # what actually differs from the stored values.
                is_title_update = bool(
                    FLAGS.forcibly or item_title != taobao_title)
                is_picurl_update = bool(
                    FLAGS.forcibly or item_picurl != taobao_picurl)

                if not (is_title_update or is_picurl_update):
                    continue

                if is_picurl_update:
                    # The picture URL changed: fetch the image again so the
                    # stored dimensions match (the original note says the
                    # file is also saved to DFS by download_image).
                    width, height = download_image({
                        'item_id': item_id,
                        'num_id': item_iid,
                        'shop_id': shop_id,
                        'pic_url': taobao_picurl,
                        'image_name': local_pic_url,
                        'crawl_path': FLAGS.crawl_path,
                    })

                if is_title_update and is_picurl_update:
                    db.execute(
                        "update item set modified=now(), title=%s, pic_url=%s, pic_width=%s, pic_height=%s where id=%s",
                        taobao_title, taobao_picurl, width, height, item_id)
                    logger.info(
                        "item %s num_id %s update title from %s to %s, pic_url from %s to %s",
                        item_id, item_iid, item_title, taobao_title,
                        item_picurl, taobao_picurl)
                elif is_title_update:
                    db.execute(
                        "update item set modified=now(), title=%s where id=%s",
                        taobao_title, item_id)
                    logger.info(
                        "item %s update title from %s to %s",
                        item_id, item_title, taobao_title)
                else:
                    db.execute(
                        "update item set modified=now(), pic_url=%s, pic_width=%s, pic_height=%s where id=%s",
                        taobao_picurl, width, height, item_id)
                    logger.info(
                        "item %s num_id %s update pic_url from %s to %s",
                        item_id, item_iid, item_picurl, taobao_picurl)
            except Exception:
                # Best effort per item: log and move on.  Deliberately not a
                # bare ``except`` so Ctrl-C / SystemExit still stop the run.
                logger.error("update failed %s", traceback.format_exc())

    spent = time.time() - t
    logger.info("update_item_title_image use time : %s", spent*1000)
def crawl_main():
    """Quick-check the hottest items against Taobao and sync status/price.

    Selects up to ``FLAGS.limit`` active items (joined through
    ``item_hotest`` and ``shop``), fetches their current Taobao data in
    batches and, for each item that came back with a response:

    * marks it off-shelf (``status=2``) when ``approve_status`` is not
      ``'onsale'``;
    * otherwise counts a price change when the price moved by more than 20%
      of the stored price or by more than 2.0 in absolute terms, and writes
      the new price only when ``FLAGS.commit_price`` is set.

    A failure on one item is logged and does not stop the run.  A summary
    line with the counters is logged at the end.
    """
    write_db, read_db = get_db_engines(dbconnstrs=FLAGS.xdbconnstrs)

    sql = "select item.id, item.num_id, item.price, item.pic_url, item.volume from item_hotest, item, shop where item_hotest.item_id = item.id and item.status = 1 and item.shop_id = shop.id and shop.type <= 2 and shop.status=1 limit %s" % FLAGS.limit
    rows = read_db.execute(sql)

    counter = 0
    off_counter = 0
    change_counter = 0
    # Never incremented in this function; kept because the summary log
    # below reports it.
    vol_change_counter = 0
    total = rows.rowcount

    results = get_taobao_items(
        get_top(), rows,
        fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]),
        calllimit=300)

    for batch_item in results:
        for _iid, item in batch_item.items.items():
            try:
                counter += 1
                item_id = item['req'][0]
                item_iid = item['req'][1]
                item_price = item['req'][2]

                resp = item['resp']
                if not resp:
                    continue

                if resp['approve_status'] != 'onsale':
                    logger.debug("Item %s/%s %s %s is offshelf",
                                 counter, total, item_id, item_iid)
                    off_counter += 1
                    # Values are passed as bind parameters rather than
                    # formatted into the SQL text.
                    write_db.execute(
                        "update item set status=2, modified=now() where id=%s",
                        item_id)
                    continue

                price = float(resp['price'])
                delta = abs(item_price - price)
                # The small epsilon avoids dividing by zero when the stored
                # price is 0.
                if delta / (item_price + 0.0000001) > 0.2 or delta > 2.0:
                    change_counter += 1
                    logger.debug("Item %s/%s %s %s price %s -> %s",
                                 counter, total, item_id, item_iid,
                                 item_price, price)
                    if FLAGS.commit_price:
                        write_db.execute(
                            "update item set price=%s where id=%s",
                            price, item_id)
                    # NOTE(review): the nesting level of this debug line was
                    # not recoverable from the collapsed source -- confirm it
                    # belongs to the price-change branch.
                    logger.debug("req %s resp %s", item['req'], resp)
            except Exception:
                # Best effort per item: log and move on.  Deliberately not a
                # bare ``except`` so Ctrl-C / SystemExit still stop the run.
                logger.error("update failed %s", traceback.format_exc())

    logger.info(
        "Taobao quickupdate, total %s, off %s, price change %s, volume change %s",
        total, off_counter, change_counter, vol_change_counter)
def doCrawl(shop_id, numids_set):
    """Fetch Taobao item data for the given numeric item ids.

    ``get_taobao_items`` takes a sequence of rows as its second argument and
    the join function used here reads index 1 of each row, so every id is
    wrapped in a ``(shop_id, num_id)`` tuple.  That wrapping exists only to
    satisfy this calling convention; it imitates a database result and has
    no further meaning.

    :param shop_id: placed at index 0 of every pseudo row.
    :param numids_set: iterable of item numeric ids to look up.
    :return: list with a plain ``dict`` copy of each non-empty response.
    """
    pseudo_rows = [(shop_id, num_id) for num_id in numids_set]

    def join_num_ids(batch_rows):
        # Comma-separated list of the numeric ids (index 1 of each row).
        return ','.join([str(row[1]) for row in batch_rows])

    responses = []
    batches = get_taobao_items(get_top(), pseudo_rows,
                               fn_join_iids=join_num_ids)
    for batch in batches:
        for _iid, entry in batch.items.iteritems():
            resp = entry['resp']
            if not resp:
                continue
            responses.append(dict(resp))
    return responses
def doCrawl(shop_id, numids_set):
    """Fetch item data through the Taobaoke client for the given item ids.

    Each id is wrapped in a ``(shop_id, num_id)`` tuple only because
    ``get_taobao_items`` takes row-like entries as its second argument and
    the join function used here reads index 1; the tuples imitate a database
    result and carry no other meaning.

    Sleeps 3 seconds before issuing the request.

    :param shop_id: placed at index 0 of every pseudo row.
    :param numids_set: iterable of item numeric ids to look up.
    :return: list with a plain ``dict`` copy of each non-empty response.
    """
    pseudo_rows = [(shop_id, num_id) for num_id in numids_set]

    # The taobao.tbk.items.detail.get API is time-limited, so wait before
    # calling it.
    time.sleep(3)

    def join_num_ids(batch_rows):
        # Comma-separated list of the numeric ids (index 1 of each row).
        return ",".join([str(row[1]) for row in batch_rows])

    batches = get_taobao_items(get_taobaoke_top(), pseudo_rows,
                               fn_join_iids=join_num_ids)
    return [
        dict(entry["resp"])
        for batch in batches
        for _iid, entry in batch.items.iteritems()
        if entry["resp"]
    ]
def crawl_main():
    """Quick-check the hottest items against Taobao and sync status/price.

    Selects up to ``FLAGS.limit`` active items (joined through
    ``item_hotest`` and ``shop``), fetches their current Taobao data in
    batches and, for each item that came back with a response:

    * marks it off-shelf (``status=2``) when ``approve_status`` is not
      ``'onsale'``;
    * otherwise counts a price change when the price moved by more than 20%
      of the stored price or by more than 2.0 in absolute terms, and writes
      the new price only when ``FLAGS.commit_price`` is set.

    A failure on one item is logged and does not stop the run.  A summary
    line with the counters is logged at the end.
    """
    write_db, read_db = get_db_engines(dbconnstrs=FLAGS.xdbconnstrs)

    sql = "select item.id, item.num_id, item.price, item.pic_url, item.volume from item_hotest, item, shop where item_hotest.item_id = item.id and item.status = 1 and item.shop_id = shop.id and shop.type <= 2 and shop.status=1 limit %s" % FLAGS.limit
    rows = read_db.execute(sql)

    counter = 0
    off_counter = 0
    change_counter = 0
    # Never incremented in this function; kept because the summary log
    # below reports it.
    vol_change_counter = 0
    total = rows.rowcount

    results = get_taobao_items(
        get_top(), rows,
        fn_join_iids=lambda x: ','.join([str(i[1]) for i in x]),
        calllimit=300)

    for batch_item in results:
        for _iid, item in batch_item.items.items():
            try:
                counter += 1
                item_id = item['req'][0]
                item_iid = item['req'][1]
                item_price = item['req'][2]

                resp = item['resp']
                if not resp:
                    continue

                if resp['approve_status'] != 'onsale':
                    logger.debug("Item %s/%s %s %s is offshelf",
                                 counter, total, item_id, item_iid)
                    off_counter += 1
                    # Values are passed as bind parameters rather than
                    # formatted into the SQL text.
                    write_db.execute(
                        "update item set status=2, modified=now() where id=%s",
                        item_id)
                    continue

                price = float(resp['price'])
                delta = abs(item_price - price)
                # The small epsilon avoids dividing by zero when the stored
                # price is 0.
                if delta / (item_price + 0.0000001) > 0.2 or delta > 2.0:
                    change_counter += 1
                    logger.debug("Item %s/%s %s %s price %s -> %s",
                                 counter, total, item_id, item_iid,
                                 item_price, price)
                    if FLAGS.commit_price:
                        write_db.execute(
                            "update item set price=%s where id=%s",
                            price, item_id)
                    # NOTE(review): the nesting level of this debug line was
                    # not recoverable from the collapsed source -- confirm it
                    # belongs to the price-change branch.
                    logger.debug("req %s resp %s", item['req'], resp)
            except Exception:
                # Best effort per item: log and move on.  Deliberately not a
                # bare ``except`` so Ctrl-C / SystemExit still stop the run.
                logger.error("update failed %s", traceback.format_exc())

    logger.info(
        "Taobao quickupdate, total %s, off %s, price change %s, volume change %s",
        total, off_counter, change_counter, vol_change_counter)