def _normalize_spm(detail_url):
    """Return *detail_url* with the value of its ``spm`` parameter replaced.

    The previous code called ``str.replace`` with a regular expression as the
    search text; ``str.replace`` only matches literal substrings, so the URL
    was never changed.
    """
    import re  # local import so this block does not depend on file-level imports

    # NOTE(review): the original pattern only covered digits and dots
    # ("spm=(\.|\d)*"); the whole parameter value is replaced here so that
    # values containing letters are not left half-rewritten.
    return re.sub(r"\bspm=[^&#]*", "spm=2014.12669715.0.0", detail_url)


def _split_item_ids(rows):
    """Flatten rows holding comma-separated ``itemids`` into a set of id strings.

    A shop may have several rows; empty values and empty tokens (for example
    from a trailing comma) are skipped.
    """
    num_ids = set()
    for row in rows:
        if not row[0]:
            continue
        for part in row[0].split(','):
            part = part.strip()
            if part:
                num_ids.add(part)
    return num_ids


def _reactivate_item(db, shop_id, num_id, row, tb_title, tb_price, tb_pic_url):
    """Put an item that already has a DB row back online.

    *row* is ``(id, title, pic_url, local_pic_url, price, manual_set, status)``.
    Returns ``False`` without touching the item when it is blacklisted,
    ``True`` after ``item.db_update(db)`` otherwise.
    """
    db_item_id = int(row[0])
    db_title = row[1]
    db_pic_url = row[2]
    db_local_pic_url = row[3]
    db_price = float(row[4])
    db_manual_set = int(row[5])
    db_status = int(row[6])

    if db_status == ITEM_STATUS_BLACKLIST:
        return False

    item = TaobaoItem(shop_id, db_item_id, num_id)
    # Mark the item active first, then check which other attributes changed.
    item.status = ITEM_STATUS_ACTIVE

    if db_manual_set == 1:
        # Title and picture were set by hand: keep them, only re-download the
        # picture when the local file is gone, and still follow the price.
        if not imgExists(shop_id, db_local_pic_url):
            item.local_pic_url = db_local_pic_url
            item.setPicUrl(tb_pic_url)
        if tb_price != db_price and quickUpdatePrice(db_item_id, db):
            item.price = tb_price
    else:
        if tb_title != db_title:
            item.title = tb_title
        if tb_price != db_price and quickUpdatePrice(db_item_id, db):
            item.price = tb_price
        # Re-download when the picture URL changed or the local file is gone.
        if tb_pic_url != db_pic_url or not imgExists(shop_id, db_local_pic_url):
            item.local_pic_url = db_local_pic_url
            item.setPicUrl(tb_pic_url)

    # TODO: the item was offline, so its terms may need to be matched again.
    # Going offline does not delete the item_term rows, but another channel
    # might; handle that when the need arises.
    item.db_update(db)
    return True


def _create_item(db, shop_id, num_id, dict_item, tb_category, term_factory,
                 shop_termLimits, defaultCampaign):
    """Build a ``TaobaoItem`` for a num_id without a DB row and insert it."""
    tb_pic_url = str(dict_item['pic_url'])

    item = TaobaoItem(shop_id, 0, num_id)
    item.title = dict_item['title']
    item.detail_url = _normalize_spm(str(dict_item['detail_url']))
    item.price = float(dict_item['price'])
    item.pic_url = tb_pic_url
    item.volume = dict_item.get('volume', 0)
    item.category = tb_category.getCategoryPath(dict_item['cid'])
    item.termIds = item.matchTaobaoTerms(term_factory, str(shop_termLimits))
    item.setPicUrl(tb_pic_url)
    item.setCampaign(defaultCampaign)
    item.status = ITEM_STATUS_ACTIVE
    item.db_create(db)


def crawl_one_shop(shop, tb_category, term_factory, db):
    """Crawl the new items of one shop and take vanished items offline.

    Steps:
      1. set the shop's ``crawl_status`` to ``SHOP_CRAWL_STATUS_CRAWLING``;
      2. crawl the num_ids reported as new by ``filterNumIds`` and either
         re-activate the existing DB row or create a new item;
      3. set ``status=2`` on the items reported as off the shelf;
      4. set ``crawl_status`` back to ``SHOP_CRAWL_STATUS_NONE``; this now
         also happens when step 2 or 3 raises.

    Items that are already known and still listed (``common_numids_set``) are
    deliberately not re-crawled here.

    Args:
        shop: row whose columns 0, 2 and 3 are the shop id, the shop URL and
            the term limits (column 1, the shop type, is not used).
        tb_category: object providing ``getCategoryPath(cid)``.
        term_factory: passed through to ``TaobaoItem.matchTaobaoTerms``.
        db: engine/connection providing ``execute(sql, *params)``.

    Returns:
        None. Returns early, without touching ``crawl_status``, when the shop
        has no default campaign.
    """
    shop_id = shop[0]
    shop_url = shop[2]
    shop_termLimits = shop[3]

    # The whitelist mode (shop.mode) is not used yet.

    defaultCampaign = list(db.execute(
        "select id, default_uctrac_price from campaign "
        "where shop_id=%s and system_status = 1 and delete_flag = 0", shop_id))
    if not defaultCampaign:
        logger.error("can not get the default campaign for shop: %s", shop_id)
        return

    db.execute("update shop set crawl_status=%s where id=%s", SHOP_CRAWL_STATUS_CRAWLING, shop_id)
    try:
        # All num_ids of the shop, taken from the crawled new-arrivals list;
        # there may be more than one row per shop.
        tb_numids_set = _split_item_ids(
            db.execute("SELECT itemids FROM tb_shop_item WHERE shopid = %s", shop_id))
        logger.info("crawling shop: %s %s, taobao online num %s", shop_id, shop_url, len(tb_numids_set))

        # Split into new / off the shelf / already known.
        new_numids_set, offShelf_numids_set, common_numids_set = filterNumIds(db, shop_id, tb_numids_set)
        logger.info("stat taobao shop %s: new_num:%s, offline_num:%s, common_num:%s",
                    shop_id, len(new_numids_set), len(offShelf_numids_set), len(common_numids_set))

        new_num = 0
        off2On_num = 0
        black_num = 0

        new_item_list = doCrawl(shop_id, new_numids_set) if new_numids_set else None
        for dict_item in new_item_list or ():
            num_id = str(dict_item['num_iid'])
            try:
                # A row may already exist when the item is being put back on sale.
                db_item = list(db.execute(
                    "select id, title, pic_url, local_pic_url, price, manual_set, status "
                    "from item where shop_id=%s and num_id=%s", shop_id, num_id))
                if db_item:
                    reactivated = _reactivate_item(
                        db, shop_id, num_id, db_item[0],
                        dict_item['title'], float(dict_item['price']), str(dict_item['pic_url']))
                    if reactivated:
                        off2On_num += 1
                    else:
                        black_num += 1
                else:
                    _create_item(db, shop_id, num_id, dict_item, tb_category,
                                 term_factory, shop_termLimits, defaultCampaign)
                    new_num += 1
            except Exception:
                # One broken item must not abort the rest of the shop.
                logger.error("%s: %s creating failed %s", shop_id, num_id, traceback.format_exc())
        logger.info("shop %s crawler: new %s, back on line %s, black %s", shop_id, new_num, off2On_num, black_num)

        if offShelf_numids_set:
            # Take items that disappeared from the shop offline.
            # NOTE(review): the update is not restricted to shop_id, as before.
            off_ids = [str(s) for s in offShelf_numids_set]
            placeholders = ', '.join(['%s'] * len(off_ids))
            db.execute("update item set status=2 where num_id in (%s)" % placeholders, *off_ids)
            logger.info("shop %s crawler: offline %s", shop_id, len(offShelf_numids_set))
    finally:
        # Always release the "crawling" marker, even after a failure.
        db.execute("update shop set crawl_status=%s where id=%s", SHOP_CRAWL_STATUS_NONE, shop_id)
def crawl_shop(sql):
    """Walk every shop returned by *sql* and build the items a crawl would touch.

    This function issues no INSERT/UPDATE itself: the places where the DB
    would be written are still marked TODO. For each shop it

      * crawls the num_ids reported as new by ``filterNumIds`` and builds a
        ``TaobaoItem`` for each (existing row -> update, no row -> add);
      * logs how many items went off the shelf;
      * crawls the already known num_ids and builds a ``TaobaoItem`` carrying
        the changed volume / price / title / picture.

    Args:
        sql: query whose rows have the shop id, shop URL and term limits in
            columns 0, 2 and 3.
    """
    db = get_db_engine()
    shops = db.execute(sql)

    # Optional interactive debugging.
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    # Shared by all shops.
    tb_category = TaobaoCategory()
    term_factory = TermFactory()

    # Bound parameters instead of string interpolation; num_id is sent as a
    # string, as crawl_one_shop does with its quoted '%s'.
    item_sql = ("select id, title, pic_url, local_pic_url, price, manual_set, status "
                "from item where shop_id=%s and num_id=%s")

    for shop in shops:
        shop_id = shop[0]
        shop_url = shop[2]
        shop_termLimits = shop[3]

        defaultCampaign = list(db.execute(
            "select id, default_uctrac_price from campaign "
            "where shop_id=%s and system_status = 1 and delete_flag = 0", shop_id))
        if not defaultCampaign:
            logger.error("can not get the default campaign for shop: %s", shop_id)
            continue

        # Intended life cycle (the status update is still disabled here):
        #   1. set shop crawl_status=2
        #   2. crawl
        #   3. set shop crawl_status=0

        # All num_ids of the shop; empty tokens are skipped.
        tb_numids_set = set()
        for ids in db.execute("SELECT itemids FROM tb_shop_item WHERE shopid = %s", shop_id):
            for part in (ids[0] or '').split(','):
                part = part.strip()
                if part:
                    tb_numids_set.add(part)

        # Split into new / off the shelf / already known.
        new_numids_set, offShelf_numids_set, common_numids_set = filterNumIds(db, shop_id, tb_numids_set)

        # Was `== 0`, which only crawled when there was nothing to crawl.
        if new_numids_set:
            logger.info("crawling shop: %s %s", shop_id, shop_url)
            new_item_list = doCrawl(shop_id, new_numids_set)
            if new_item_list:
                new_num = 0
                new_numiid = []
                update_num = 0
                update_numiid = []
                for dict_item in new_item_list:
                    num_id = str(dict_item['num_iid'])
                    n_cid = dict_item['cid']
                    tb_title = dict_item['title']
                    tb_detail_url = str(dict_item['detail_url'])
                    tb_price = float(dict_item['price'])
                    tb_pic_url = str(dict_item['pic_url'])
                    volume = dict_item.get('volume', 0)

                    # A row may already exist when the item is back on sale.
                    db_item = list(db.execute(item_sql, shop_id, num_id))
                    if db_item:
                        (db_item_id, db_title, db_pic_url, db_local_pic_url,
                         db_price, db_manual_set, db_status) = db_item[0][:7]
                        # update
                        if db_status == ITEM_STATUS_BLACKLIST:
                            continue
                        if db_manual_set == 1:
                            # Title and picture were set by hand.
                            if not imgExists(shop_id, db_local_pic_url):
                                # The old picture is gone and must be downloaded again.
                                # NOTE(review): setPicUrl takes two arguments here but
                                # one in crawl_one_shop - confirm the signature.
                                tb_item = TaobaoItem(shop_id, db_item_id, num_id)
                                tb_item.setPicUrl(tb_pic_url, db_pic_url)
                                if tb_price != db_price:
                                    tb_item.price = tb_price
                                # TODO db update
                        else:
                            tb_item = TaobaoItem(shop_id, db_item_id, num_id,
                                                 dict_item['title'], dict_item['detail_url'],
                                                 dict_item['price'], dict_item['pic_url'], volume)
                            tb_item.setPicUrl(dict_item['pic_url'], db_pic_url)
                            update_numiid.append(num_id)
                            # TODO db update
                        update_num += 1
                    else:
                        # add
                        tb_item = TaobaoItem(shop_id, 0, num_id, tb_title, tb_detail_url,
                                             tb_price, dict_item['pic_url'], volume)
                        tb_item.category = tb_category.getCategoryPath(n_cid)
                        tb_item.termIds = tb_item.matchTaobaoTerms(term_factory, str(shop_termLimits))
                        tb_item.setPicUrl(dict_item['pic_url'], "")
                        tb_item.setCampaign(defaultCampaign)
                        new_numiid.append(num_id)
                        # TODO db add
                        new_num += 1
                logger.info("shop %s new item num=%s,update item num=%s", shop_id, new_num, update_num)

        if offShelf_numids_set:
            # offline; the status update itself is not enabled yet
            logger.info("shop %s off shelf item num=%s", shop_id, len(offShelf_numids_set))

        if common_numids_set:
            # Validate price / title / picture of the already known items.
            common_item_list = doCrawl(shop_id, common_numids_set)
            if common_item_list:
                for dict_item in common_item_list:
                    num_id = str(dict_item['num_iid'])
                    title = dict_item['title']
                    price = float(dict_item['price'])
                    pic_url = str(dict_item['pic_url'])
                    db_item = list(db.execute(item_sql, shop_id, num_id))
                    if not db_item:
                        continue
                    row = db_item[0]
                    if row[6] == ITEM_STATUS_BLACKLIST:
                        continue
                    db_item_id = int(row[0])
                    tb_item = TaobaoItem(shop_id, db_item_id, num_id)
                    if 'volume' in dict_item:
                        tb_item.volume = int(dict_item['volume'])
                    if price != row[4]:
                        tb_item.price = price
                    if title != row[1]:
                        tb_item.title = title
                    if pic_url != row[2]:
                        tb_item.setPicUrl(pic_url, row[2])
                    # TODO db update
            logger.info("shop %s common item num=%s ", shop_id, len(common_numids_set))
# NOTE(review): the statements down to `return result` are the tail of a
# method whose `def` line lies outside this chunk (it reads `self` and `item`);
# the indentation below is reconstructed - confirm against the full file.
#
# Evaluates self.postFixExp as a postfix boolean expression using a stack:
#   ','  pops two values and pushes `b1 and b2`
#   '|'  pops two values and pushes `b1 or b2`
#   any other token is evaluated with self.matchTerm(token, item) and pushed.
# The value computed last is returned; it stays False when postFixExp is empty.
result = False
stack = []
for i in range(0, len(self.postFixExp)):
    pfx = self.postFixExp[i].encode("utf-8")
    if pfx == ',':
        b1 = stack.pop()
        b2 = stack.pop()
        result = b1 and b2
        stack.append(result)
    elif pfx == '|':
        b1 = stack.pop()
        b2 = stack.pop()
        result = b1 or b2
        stack.append(result)
    else:
        result = self.matchTerm(pfx, item)
        stack.append(result)
return result


# Manual smoke test: build one item with a fixed category and title and print
# what matchTaobaoTerms returns for the term limits "2,81,296".
if __name__ == "__main__":
    log_init("CrawlLogger", "sqlalchemy.*")
    db = get_db_engine()
    item = TaobaoItem(1624, 2000278, '18381030933')
    item.category = '蕾丝衫/雪纺衫,女装/女士精品,'
    item.title = '七格格 OTHERMIX 夏装新款 个性印花无袖雪纺衫 女中长款3MR2024P'
    term_factory = TermFactory(db)
    print item.matchTaobaoTerms(term_factory, "2,81,296")
def crawl_shop(sql):
    """Walk every shop returned by *sql* and build the items a crawl would touch.

    This function issues no INSERT/UPDATE itself: the places where the DB
    would be written are still marked TODO. For each shop it

      * crawls the num_ids reported as new by ``filterNumIds`` and builds a
        ``TaobaoItem`` for each (existing row -> update, no row -> add);
      * logs how many items went off the shelf;
      * crawls the already known num_ids and builds a ``TaobaoItem`` carrying
        the changed volume / price / title / picture.

    Args:
        sql: query whose rows have the shop id, shop URL and term limits in
            columns 0, 2 and 3.
    """
    db = get_db_engine()
    shops = db.execute(sql)

    # Optional interactive debugging.
    if FLAGS.debug_parser:
        import pdb
        pdb.set_trace()

    # Shared by all shops.
    tb_category = TaobaoCategory()
    term_factory = TermFactory()

    # Bound parameters instead of string interpolation; num_id is sent as a
    # string, as crawl_one_shop does with its quoted '%s'.
    item_sql = ("select id, title, pic_url, local_pic_url, price, manual_set, status "
                "from item where shop_id=%s and num_id=%s")

    for shop in shops:
        shop_id = shop[0]
        shop_url = shop[2]
        shop_termLimits = shop[3]

        defaultCampaign = list(db.execute(
            "select id, default_uctrac_price from campaign "
            "where shop_id=%s and system_status = 1 and delete_flag = 0", shop_id))
        if not defaultCampaign:
            logger.error("can not get the default campaign for shop: %s", shop_id)
            continue

        # Intended life cycle (the status update is still disabled here):
        #   1. set shop crawl_status=2
        #   2. crawl
        #   3. set shop crawl_status=0

        # All num_ids of the shop; empty tokens are skipped.
        tb_numids_set = set()
        for ids in db.execute("SELECT itemids FROM tb_shop_item WHERE shopid = %s", shop_id):
            for part in (ids[0] or '').split(','):
                part = part.strip()
                if part:
                    tb_numids_set.add(part)

        # Split into new / off the shelf / already known.
        new_numids_set, offShelf_numids_set, common_numids_set = filterNumIds(db, shop_id, tb_numids_set)

        # Was `== 0`, which only crawled when there was nothing to crawl.
        if new_numids_set:
            logger.info("crawling shop: %s %s", shop_id, shop_url)
            new_item_list = doCrawl(shop_id, new_numids_set)
            if new_item_list:
                new_num = 0
                new_numiid = []
                update_num = 0
                update_numiid = []
                for dict_item in new_item_list:
                    num_id = str(dict_item['num_iid'])
                    n_cid = dict_item['cid']
                    tb_title = dict_item['title']
                    tb_detail_url = str(dict_item['detail_url'])
                    tb_price = float(dict_item['price'])
                    tb_pic_url = str(dict_item['pic_url'])
                    volume = dict_item.get('volume', 0)

                    # A row may already exist when the item is back on sale.
                    db_item = list(db.execute(item_sql, shop_id, num_id))
                    if db_item:
                        (db_item_id, db_title, db_pic_url, db_local_pic_url,
                         db_price, db_manual_set, db_status) = db_item[0][:7]
                        # update
                        if db_status == ITEM_STATUS_BLACKLIST:
                            continue
                        if db_manual_set == 1:
                            # Title and picture were set by hand.
                            if not imgExists(shop_id, db_local_pic_url):
                                # The old picture is gone and must be downloaded again.
                                # NOTE(review): setPicUrl takes two arguments here but
                                # one in crawl_one_shop - confirm the signature.
                                tb_item = TaobaoItem(shop_id, db_item_id, num_id)
                                tb_item.setPicUrl(tb_pic_url, db_pic_url)
                                if tb_price != db_price:
                                    tb_item.price = tb_price
                                # TODO db update
                        else:
                            tb_item = TaobaoItem(shop_id, db_item_id, num_id,
                                                 dict_item['title'], dict_item['detail_url'],
                                                 dict_item['price'], dict_item['pic_url'], volume)
                            tb_item.setPicUrl(dict_item['pic_url'], db_pic_url)
                            update_numiid.append(num_id)
                            # TODO db update
                        update_num += 1
                    else:
                        # add
                        tb_item = TaobaoItem(shop_id, 0, num_id, tb_title, tb_detail_url,
                                             tb_price, dict_item['pic_url'], volume)
                        tb_item.category = tb_category.getCategoryPath(n_cid)
                        tb_item.termIds = tb_item.matchTaobaoTerms(term_factory, str(shop_termLimits))
                        tb_item.setPicUrl(dict_item['pic_url'], "")
                        tb_item.setCampaign(defaultCampaign)
                        new_numiid.append(num_id)
                        # TODO db add
                        new_num += 1
                logger.info("shop %s new item num=%s,update item num=%s", shop_id, new_num, update_num)

        if offShelf_numids_set:
            # offline; the status update itself is not enabled yet
            logger.info("shop %s off shelf item num=%s", shop_id, len(offShelf_numids_set))

        if common_numids_set:
            # Validate price / title / picture of the already known items.
            common_item_list = doCrawl(shop_id, common_numids_set)
            if common_item_list:
                for dict_item in common_item_list:
                    num_id = str(dict_item['num_iid'])
                    title = dict_item['title']
                    price = float(dict_item['price'])
                    pic_url = str(dict_item['pic_url'])
                    db_item = list(db.execute(item_sql, shop_id, num_id))
                    if not db_item:
                        continue
                    row = db_item[0]
                    if row[6] == ITEM_STATUS_BLACKLIST:
                        continue
                    db_item_id = int(row[0])
                    tb_item = TaobaoItem(shop_id, db_item_id, num_id)
                    if 'volume' in dict_item:
                        tb_item.volume = int(dict_item['volume'])
                    if price != row[4]:
                        tb_item.price = price
                    if title != row[1]:
                        tb_item.title = title
                    if pic_url != row[2]:
                        tb_item.setPicUrl(pic_url, row[2])
                    # TODO db update
            logger.info("shop %s common item num=%s ", shop_id, len(common_numids_set))