def update_worker(goods_list, page):
    """Refresh every goods record in ``goods_list`` that is older than one hour.

    For each stale record the title is searched through ``_super_search``;
    when a result with the same ``num_id`` is found the fresh data is saved,
    otherwise the record is deleted.

    :param goods_list: iterable of goods dicts carrying ``title``, ``num_id``
        and optionally ``update_time`` (compared against epoch milliseconds).
    :param page: page number, used only for logging.
    """
    start = time.time()
    LOG.info("page: %s, start: %s", page, start)
    for goods in goods_list:
        now = time.time() * 1000
        update_time = goods.get("update_time")
        # Skip records refreshed within the last hour (3600000 ms).
        if update_time and now - update_time < 3600000:
            continue
        title = goods['title']
        _id = goods['num_id']
        sp = SearchParams()
        sp.page = 1
        sp.count = 100
        sp.keyword = title
        for candidate in _super_search(sp):
            goods_data = _ship_goods_supers(candidate)
            # crawler() treats a falsy conversion result as "skip this item";
            # do the same here instead of crashing on goods_data['num_id'].
            if not goods_data:
                continue
            if goods_data['num_id'] == _id:
                TbkGoods(**goods_data).save()
                break
        else:
            # No search result matched this id: the goods is gone upstream.
            TbkGoods(num_id=_id).delete()
            LOG.info("delete id: %s", _id)
    LOG.info("page: %s process ok: %s", page, time.time() - start)
def crawler_similar(goods_id):
    """Fetch goods recommended for ``goods_id``, store them and cross-link them.

    Every recommended item that ``_search_by_id`` can resolve is saved with
    ``source='similar'``. Each saved item then gets its ``similar_goods``
    field set to the union of the newly found ids and whatever ids it
    already had stored.

    :param goods_id: id passed to ``client.tbk_goods_recommend``.
    :return: list of the newly found similar ids, or ``None`` when the
        recommend response is empty or carries no ``results``.
    """
    res = client.tbk_goods_recommend(goods_id) or {}
    # The ``or {}`` above is meant to tolerate an empty response, so the
    # envelope key must be read with .get() rather than indexed.
    response = res.get('tbk_item_recommend_get_response') or {}
    results = response.get("results")
    if results is None:
        return None
    goods_list = results.get('n_tbk_item', [])

    similar_ids = []
    for goods in goods_list:
        num_iid = goods['num_iid']
        similar_goods = _search_by_id(num_iid, goods['title'])
        if similar_goods is None:
            continue
        similar_ids.append(num_iid)
        similar_goods.update({'source': 'similar'})
        TbkGoods(**similar_goods).save()

    for num_iid in similar_ids:
        goods_instance = TbkGoods(num_id=num_iid)
        goods_info = goods_instance.find_goods_by_id()
        # Start from the fresh ids for every item so that one item's stored
        # ids do not leak into the next item's list.
        merged = set(similar_ids)
        if goods_info:
            merged.update(goods_info.get("similar_goods") or [])
        goods_instance.update({'similar_goods': list(merged)})
    return similar_ids
def update_goods(keyword, num_id, table):
    """Re-fetch one goods item and sync both the store and the search index.

    When ``_search_by_id`` returns data, the record is saved and indexed
    (with ``table`` attached to the indexed document). Otherwise the record
    is disabled and removed from the index.

    :param keyword: search keyword forwarded to ``_search_by_id``.
    :param num_id: id of the goods to refresh.
    :param table: table name written into the indexed document.
    """
    fresh = _search_by_id(num_id, keyword)
    if not fresh:
        stale = TbkGoods(num_id=num_id)
        stale.disabled_goods_by_id()
        searcher.delete_index(num_id)
        return
    # The model instance is built before 'table' is added to the dict.
    record = TbkGoods(**fresh)
    fresh['table'] = table
    searcher.update_index(fresh)
    record.save()
def update_main():
    """Walk the goods store batch by batch, logging each batch.

    Each pass calls ``find_goods_by_cond`` with an ``_id > last_id`` filter
    (no filter on the first pass) and remembers the ``_id`` of the last
    document seen. The walk ends, printing ``done``, as soon as a batch
    yields no documents.

    NOTE(review): the ``update_worker`` dispatch is commented out, so the
    thread pool is created and joined without receiving any work.
    NOTE(review): the page number starts at 20 and is still incremented
    alongside the ``_id`` filter -- confirm this is intended.
    """
    page = 20
    batch_size = 2000
    pool = ThreadPool(16)
    goods_obj = TbkGoods()
    last_id = ''
    while True:
        cond = {'_id': {'$gt': last_id}} if last_id else {}
        goods_list = goods_obj.find_goods_by_cond(
            cond, page, batch_size, ['title', 'num_id', 'update_time'])
        # Drain the batch just to learn the final _id.
        last_id = ''
        for goods in goods_list:
            last_id = goods['_id']
        if not last_id:
            print("done")
            break
        LOG.info("page: %s ok", page)
        # pool.apply_async(update_worker, (goods_list, page))
        page += 1
    pool.close()
    pool.join()
def local_search(params):
    """Search goods for the mini-app, preferring locally indexed data.

    Flow:
      1. If ``find_goods_info`` resolves the keyword to a single goods item,
         that item is saved and returned alone.
      2. ``tid == 9`` delegates to ``super_search_miniapp``.
      3. Otherwise the local index is searched and the hits are hydrated
         from the store, keeping the index ordering.

    :param params: mapping with ``keyword`` and optional ``page`` (default
        1), ``count`` (default 20) and ``tid`` sort selector (default 1).
    :return: ``{'errcode': 0, 'data': [...]}`` or whatever
        ``super_search_miniapp`` returns for ``tid == 9``.
    """
    keyword = params.get("keyword")
    page = int(params.get("page", 1))
    count = int(params.get("count", 20))
    sort = int(params.get('tid', 1))
    if isinstance(keyword, unicode):
        keyword = keyword.encode("utf-8")

    share_text_info = find_goods_info(keyword)
    if share_text_info:
        save_goods_info(share_text_info['num_id'], share_text_info)
        return {'errcode': 0, 'data': [_ship_miniapp(share_text_info)]}

    if sort == 9:
        super_params = {
            'keyword': keyword,
            'page': page,
            'count': count,
            'yq': 0,
            'tid': 0
        }
        return super_search_miniapp(super_params)

    # tid -> sort specification; any other tid searches without a sort.
    sort_fields = {
        8: {'coupon_amount': -1},
        6: {'sales': -1},
        7: {'coupon_fee': 1},
    }
    sort_dict = sort_fields.get(sort, {})
    data = searcher.search(keyword, sort_dict, page=page, count=count)
    LOG.info('keyword: %s, ret: %s', keyword, len(data))

    # Group hit ids by table, remembering the overall hit order.
    table_dict = {}
    ordered_id = []
    for item in data:
        goods_id = int(item['id'])
        table_dict.setdefault(item.get("table", 'goods'), []).append(goods_id)
        ordered_id.append(goods_id)

    goods_obj = TbkGoods()
    data_dict = {}
    for table, ids in table_dict.items():
        goods_obj.__table__ = table
        cond = {'num_id': {'$in': ids}}
        # Ask for as many rows as ids were requested; a fixed cap of 100
        # dropped hits whenever the caller's count exceeded 100.
        goods_list = goods_obj.find_goods_by_cond(cond, 1, count=len(ids))
        for goods in goods_list:
            if goods.get('num_id') is None:
                LOG.info(goods['_id'])
                continue
            data_dict[goods['num_id']] = _ship_miniapp(goods)

    # Re-apply the index ordering; hits missing from the store are skipped.
    result = []
    for _id in ordered_id:
        tmp = data_dict.get(_id)
        if tmp:
            result.append(tmp)
    return {'errcode': 0, 'data': result}
def crawler(keyword, page, count, cat_list=''):
    """Crawl one page of goods, store categories and goods, and index them.

    :param keyword: search keyword; also recorded as the goods ``source``
        (``'crawler'`` when empty).
    :param page: page number forwarded to ``_crawler``.
    :param count: page size forwarded to ``_crawler``.
    :param cat_list: list of category strings, joined with commas; any
        non-list or empty value means "no category filter".
    :return: list of the goods dicts that passed ``check_save``.
    """
    use_cats = cat_list and isinstance(cat_list, list)
    cat = ','.join(cat_list) if use_cats else ''
    goods_list = _crawler(keyword=keyword, page=page, count=count, cat=cat)
    if goods_list is None:
        return []

    source = keyword or 'crawler'
    result = []
    for raw in goods_list:
        tmp = _ship_goods_supers(raw)
        if not tmp:
            continue
        tmp.update({'table': 'goods'})

        Category(id=tmp['category_id'],
                 name=tmp['category_name']).save_category()
        if tmp.get("sub_category_id"):
            SubCategory(id=tmp['sub_category_id'],
                        name=tmp.get('sub_category_name', ''),
                        parent=tmp['category_id']).save_category()

        tmp.update({'source': source})
        goods_instance = TbkGoods(**tmp)
        if not goods_instance.check_save():
            continue
        # Similar goods are only crawled for goods not stored yet.
        if not goods_instance.find_goods_by_id():
            goods_instance.similar_goods = crawler_similar(tmp['num_id'])
        result.append(tmp)
        ret = goods_instance.save()
        searcher.update_index(tmp)
        LOG.debug(ret)
    return result
def crawler_one_page(link, table, mid):
    """Fetch one listing page and store/index every goods item it names.

    The per-domain entry in ``DATA_FIELD`` tells which response field holds
    the goods list (``res_data``) and which item field holds the id
    (``id``). Links whose domain has no entry are logged and skipped.

    :param link: URL of the listing page.
    :param table: table name for the stored goods; unicode values are
        encoded to UTF-8 bytes.
    :param mid: value written into each goods dict under ``mid``.
    """
    domain = urlparse(link).netloc
    config = DATA_FIELD.get(domain)
    if not config:
        LOG.info("domain: %s not config", domain)
        return
    res_data_field = config.get("res_data")
    id_field = config.get("id")

    # The encoding does not depend on the item, so do it once up front.
    if isinstance(table, unicode):
        table = table.encode("utf-8")

    start = time.time()
    # Tolerate an empty response and a null payload field.
    res = HttpClient().get(link) or {}
    goods_list = res.get(res_data_field) or []
    for goods in goods_list:
        tmp = _ship_goods(goods.get(id_field))
        if not tmp:
            continue
        tmp.update({'mid': mid})
        tmp.update({'table': table})
        searcher.update_index(tmp)
        goods_obj = TbkGoods(**tmp)
        goods_obj.__table__ = table
        goods_obj.save()
    LOG.info("link: %s takes: %s", link, time.time() - start)
def goods_detail(goods_id):
    """Look up one goods item and return it in display form.

    ``get_goods_info_by_id`` is tried first, then the ``TbkGoods`` store.

    :param goods_id: id of the goods to look up.
    :return: ``(True, shipped_goods)`` on success, ``(False, message)``
        when neither source has the goods.
    """
    goods = (get_goods_info_by_id(goods_id)
             or TbkGoods(num_id=goods_id).find_goods_by_id())
    if goods:
        return True, _ship_db_goods(goods)
    return False, u"找不到该商品"
def list_goods(cid=None, page=1, count=20):
    """List goods with ``coupon_expire == 0``, optionally within one category.

    :param cid: category id filter; falsy means all categories.
    :param page: page number forwarded to ``find_goods_by_cond``.
    :param count: page size forwarded to ``find_goods_by_cond``.
    :return: ``{'goods': [...]}`` with each item passed through
        ``_ship_db_goods``.
    """
    cond = {'coupon_expire': 0}
    if cid:
        cond['category_id'] = cid
    rows = TbkGoods().find_goods_by_cond(cond, page, count)
    return {'goods': [_ship_db_goods(row) for row in rows]}
def update_similar(start_page=6, count=100):
    """Fill in ``similar_goods`` for stored goods that do not have it yet.

    Pages through the goods store until a page comes back empty. Goods that
    already have a non-empty ``similar_goods`` are skipped, as are goods for
    which ``crawler_similar`` returns ``None``.

    :param start_page: first page to process; defaults to 6, the value that
        used to be hard-coded.
    :param count: page size; defaults to the previous constant, 100.
    """
    page = start_page
    goods_obj = TbkGoods()
    have_data = True
    while have_data:
        have_data = False
        goods_list = goods_obj.find_goods_by_cond({}, page, count)
        for goods in goods_list:
            have_data = True
            if goods.get('similar_goods'):
                continue
            _id = goods['num_id']
            similar_ids = crawler_similar(_id)
            if similar_ids is None:
                continue
            TbkGoods(num_id=_id).update({'similar_goods': similar_ids})
        page += 1
        # Parenthesised form prints the same text under Python 2 and is
        # also valid Python 3 syntax.
        print(page)
def get_data(self, params):
    """Return one page of goods from the ``haitao`` table.

    :param params: request parameters, checked by ``self.validate``.
    :return: the validation result when its ``errcode`` is non-zero,
        otherwise ``{'errcode': 0, 'data': rows}`` where ``rows`` is what
        ``find_goods_by_cond`` returned (sorted when a sort field is set).
    """
    checked = self.validate(params)
    if checked['errcode'] != 0:
        return checked

    goods_obj = TbkGoods()
    goods_obj.__table__ = 'haitao'
    sort_field = self.get_sort_field()
    rows = goods_obj.find_goods_by_cond({}, self.page, self.count)
    if sort_field:
        rows.sort(sort_field)
    return {'errcode': 0, 'data': rows}
def get_data(self, params):
    """Return one page of goods matching the condition built from the request.

    :param params: request parameters, checked by ``self.validate``.
    :return: the validation result when its ``errcode`` is non-zero,
        otherwise ``{'errcode': 0, 'data': rows}`` where ``rows`` is what
        ``find_goods_by_cond`` returned (sorted when a sort field is set).
    """
    checked = self.validate(params)
    if checked['errcode'] != 0:
        return checked

    cond = self.build_condition()
    sort_field = self.get_sort_field()
    LOG.info("cond: %s, sort: %s", cond, sort_field)
    rows = TbkGoods().find_goods_by_cond(cond, self.page, self.count)
    if sort_field:
        rows.sort(sort_field)
    return {'errcode': 0, 'data': rows}
def similar_goods(goods_id):
    """Return up to 20 goods related to ``goods_id``, best sellers first.

    When the goods has stored ``similar_goods`` ids the query is limited to
    them; otherwise any goods with ``coupon_expire == 0`` qualifies. Each
    result has ``is_tmall`` rewritten to ``'inline'`` or ``'none'``.

    :param goods_id: id of the reference goods.
    :return: ``(True, [])`` when the goods is unknown, else
        ``(True, list_of_shipped_goods)``.
    """
    goods = get_goods_info_by_id(goods_id)
    if not goods:
        goods = TbkGoods(num_id=goods_id).find_goods_by_id()
    if not goods:
        return True, []

    cond = {'coupon_expire': 0}
    related_ids = goods.get("similar_goods", [])
    if related_ids:
        cond['num_id'] = {'$in': related_ids}

    rows = TbkGoods().find_goods_by_cond(cond, 1, 100)
    rows = rows.sort([('sales', -1)]).limit(20)

    result = []
    for row in rows:
        shipped = _ship_db_goods(row)
        shipped['is_tmall'] = 'inline' if shipped['is_tmall'] else 'none'
        result.append(shipped)
    return True, result
def miniapp_goods_detail(gid, mid):
    """Return the mini-app detail payload for one goods item.

    ``get_goods_info_by_id`` is tried first. On a miss the ``TbkGoods``
    store is queried, using the table that ``get_table_by_mid`` returns
    when ``mid`` is an all-digit string.

    :param gid: goods id; must be convertible with ``int()``.
    :param mid: optional string passed to ``get_table_by_mid``.
    :return: ``{'errcode': 0, 'data': ...}`` on success, or
        ``{'errcode': -1, 'errmsg': ...}`` when the id is not a valid
        integer or the goods cannot be found.
    """
    res = {'errcode': -1}
    try:
        gid = int(gid)
    except (TypeError, ValueError):
        # A malformed id cannot match any goods: answer with the usual
        # "not found" payload instead of raising to the caller.
        res.update({'errmsg': u"找不到商品"})
        return res

    goods = get_goods_info_by_id(gid)
    if not goods:
        goods_obj = TbkGoods(num_id=gid)
        if mid and mid.isdigit():
            table = get_table_by_mid(mid)
            if table:
                goods_obj.__table__ = table
        goods = goods_obj.find_goods_by_id()
    if not goods:
        res.update({'errmsg': u"找不到商品"})
        return res

    res.update({'errcode': 0, 'data': _ship_miniapp_detail(goods)})
    return res
def get_send_goods(cat_id=None):
    """Pick one goods item and build the promotional message for it.

    The chosen goods is flagged ``sended = 1`` before the token is checked,
    so it is marked even when the token could not be generated.

    :param cat_id: optional category id forwarded to ``get_one_goods``.
    :return: ``{'pic_url', 'msg', 'goods'}`` dict, or ``None`` when no
        goods was found or ``generate_tpw`` returned nothing.
    """
    goods = get_one_goods(cat_id)
    if not goods:
        return

    title, price = goods['title'], goods['price']
    coupon_amount = goods['coupon_amount']
    coupon_url = goods['coupon_share_url']
    pic_url = goods['pic_url']

    tpw = generate_tpw(coupon_url, title, pic_url)
    TbkGoods(num_id=goods['num_id']).update({"sended": 1})
    if not tpw:
        return

    discounted = float(price) - coupon_amount
    new_price = round(float(discounted), 2)
    rate = goods['commssion_rate']
    fee = round(new_price * float(rate) * 0.2, 2)
    fields = (title, price, coupon_amount, new_price, fee, tpw)
    return {'pic_url': pic_url, 'msg': TEXT_MSG % fields, 'goods': goods}
def update_one_by_one(table):
    """Refresh every goods record in ``table`` that is older than one hour.

    Pages through the table until a page comes back empty, calling
    ``update_goods`` for each stale record, and finally prints how many
    records were refreshed.

    :param table: table name to walk; also forwarded to ``update_goods``.
    """
    page = 1
    count = 1000
    update_count = 0
    goods_obj = TbkGoods()
    goods_obj.__table__ = table
    LOG.info(table)

    have_data = True
    while have_data:
        have_data = False
        goods_list = goods_obj.find_goods_by_cond({}, page, count)
        now = int(time.time() * 1000)
        for goods in goods_list:
            have_data = True
            update_time = goods.get('update_time')
            # Skip records refreshed within the last hour (3600000 ms).
            if update_time and now - update_time < 3600000:
                continue
            update_goods(goods['title'], goods['num_id'], table)
            # The counter used to stay at 0 because it was never advanced.
            update_count += 1
        page += 1
        LOG.info("page: %s", page)
    print(update_count)
def get_one_goods(cat=None):
    """Pick a random unsent goods item from a category.

    Candidates must have a coupon above 5, be created within the last 8
    days, have more than 3000 sales, have no ``sended`` field and have
    ``coupon_expire == 0``.

    :param cat: category id; when ``None`` a random category is drawn from
        ``Category(recommend=1).all_category()``.
    :return: one goods dict, or ``{}`` when no category or no goods
        qualifies.
    """
    if cat is None:
        cats = Category(recommend=1).all_category()
        # A separate loop name keeps the ``cat`` parameter intact.
        cat_list = [int(item['id']) for item in cats]
        if not cat_list:
            # random.choice raises IndexError on an empty sequence.
            return {}
        cat_id = random.choice(cat_list)
    else:
        cat_id = cat

    start = time.time() - 8 * 86400
    cond = {
        "coupon_amount": {
            '$gt': 5
        },
        "created": {
            '$gt': start * 1000
        },
        "sales": {
            '$gt': 3000
        },
        'category_id': cat_id,
        "sended": {
            '$exists': False
        },
        "coupon_expire": 0
    }
    LOG.debug(cond)
    goods_list = list(TbkGoods().find_goods_by_cond(cond, 1, count=20))
    if not goods_list:
        return {}
    return random.choice(goods_list)
def goods_tpw(goods_id):
    """Return the coupon payload for a goods item, caching it for an hour.

    Lookup order: coupon cache, ``get_goods_info_by_id``, the ``TbkGoods``
    store, and finally ``search_coupon_by_id``.

    :param goods_id: goods id; also used as the cache key.
    :return: ``{'errcode': 0, 'data': ...}`` on success, otherwise
        ``{'errcode': -1, 'errmsg': ...}``.
    """
    cache_key = goods_id
    cached = get_cache_coupon(cache_key)
    if cached:
        return {"errcode": 0, 'data': cached}

    goods = get_goods_info_by_id(goods_id)
    if not goods:
        goods = TbkGoods(num_id=goods_id).find_goods_by_id()

    if goods:
        msg = _ship_coupon_goods(goods)
    else:
        search_res = search_coupon_by_id(goods_id)
        if search_res.get("errcode") != 0:
            return {'errcode': -1, 'errmsg': u"找不到该商品"}
        msg = search_res['data']

    cache_coupon_info(cache_key, msg, 3600)
    return {'errcode': 0, 'data': msg}
def _save(goods_info):
    """Save a goods dict with its source set to ``'search'`` and log the result.

    :param goods_info: goods fields passed to ``TbkGoods``; must contain
        ``num_id`` for the log line.
    """
    record = TbkGoods(**goods_info)
    record.source = 'search'
    outcome = record.save()
    LOG.info("save goods: %s, ret: %s", goods_info['num_id'], outcome)
def delete_goods(goods_id):
    """Delete one goods record.

    :param goods_id: id of the goods to delete.
    :return: ``True`` when the delete result reports ``n == 1``, else
        ``False``.
    """
    outcome = TbkGoods(num_id=goods_id).delete()
    return outcome.get("n") == 1