def crawler_one_page(link, table, mid):
    # Look up the per-domain crawl config: which response field holds the
    # goods list, and which per-item field is the goods id.
    parse_ret = urlparse(link)
    domain = parse_ret.netloc
    config = DATA_FIELD.get(domain)
    if not config:
        LOG.info("domain: %s not configured", domain)
        return
    res_data_field = config.get("res_data")
    id_field = config.get("id")
    start = time.time()
    client = HttpClient()
    res = client.get(link)
    goods_list = res.get(res_data_field, [])
    for goods in goods_list:
        num_id = goods.get(id_field)
        tmp = _ship_goods(num_id)
        if not tmp:
            continue
        tmp.update({'mid': mid})
        # Encode the table name to a byte string (Python 2 unicode handling).
        if isinstance(table, unicode):
            table = table.encode("utf-8")
        tmp.update({'table': table})
        searcher.update_index(tmp)
        goods_obj = TbkGoods(**tmp)
        goods_obj.__table__ = table
        goods_obj.save()
    LOG.info("link: %s takes: %s", link, time.time() - start)
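# Usage sketch for crawler_one_page (hypothetical link/table/mid; assumes
# DATA_FIELD maps the link's domain to {"res_data": <list field in the JSON
# response>, "id": <per-item id field>}):
#
#     crawler_one_page("http://api.example.com/goods?page=1", "goods", mid=1)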
def crawler_similar(goods_id):
    # `client` here is the module-level API client (unlike crawler_one_page,
    # which builds its own HttpClient).
    res = client.tbk_goods_recommend(goods_id) or {}
    # Use .get() so an empty response cannot raise KeyError.
    response = res.get('tbk_item_recommend_get_response') or {}
    if response.get("results") is None:
        return
    goods_list = response['results'].get('n_tbk_item', [])
    similar_ids = []
    # Save every recommended goods we can resolve, collecting its id.
    for goods in goods_list:
        num_iid = goods['num_iid']
        title = goods['title']
        similar_goods = _search_by_id(num_iid, title)
        if similar_goods is None:
            continue
        similar_ids.append(num_iid)
        similar_goods.update({'source': 'similar'})
        goods_instance = TbkGoods(**similar_goods)
        goods_instance.save()
    # Cross-link: merge the new batch into each goods' stored similar_goods,
    # resetting similar_ids to the fresh batch after every merge.
    loop_ids = copy.deepcopy(similar_ids)
    for num_iid in loop_ids:
        goods_instance = TbkGoods(num_id=num_iid)
        goods_info = goods_instance.find_goods_by_id()
        if goods_info:
            ori_similar_ids = goods_info.get("similar_goods", [])
            if ori_similar_ids is not None:
                similar_ids.extend(ori_similar_ids)
        goods_instance.update({'similar_goods': list(set(similar_ids))})
        similar_ids = copy.deepcopy(loop_ids)
    return similar_ids
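# Worked example of the cross-linking above (hypothetical ids): if the
# recommend API returns goods 101 and 102, and goods 101 already stores
# similar_goods=[103], then 101 is updated with {101, 102, 103} while 102 is
# updated with only {101, 102}, because similar_ids is reset to the fresh
# batch (loop_ids) after each merge.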
def update_worker(goods_list, page):
    start = time.time()
    LOG.info("page: %s, start: %s", page, start)
    for goods in goods_list:
        # Skip goods refreshed within the last hour (timestamps are in ms).
        now = time.time() * 1000
        update_time = goods.get("update_time")
        if update_time and now - update_time < 3600000:
            continue
        title = goods['title']
        _id = goods['num_id']
        sp = SearchParams()
        sp.page = 1
        sp.count = 100
        sp.keyword = title
        data = _super_search(sp)
        # Re-save the goods if the search still returns it; otherwise it is
        # gone upstream, so delete it locally.
        ok = 0
        for g in data:
            goods_data = _ship_goods_supers(g)
            if not goods_data:
                continue
            if goods_data['num_id'] == _id:
                ok = 1
                goods_obj = TbkGoods(**goods_data)
                goods_obj.save()
                break
        if not ok:
            goods_obj = TbkGoods(num_id=_id)
            goods_obj.delete()
            LOG.info("delete id: %s", _id)
    del goods_list
    LOG.info("page: %s process ok: %s", page, time.time() - start)
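# Usage sketch for update_worker (hypothetical paging helper; the real query
# API for stored goods may differ):
#
#     for page in range(1, total_pages + 1):
#         goods_list = fetch_goods_page(page)  # hypothetical: one page of stored goods dicts
#         update_worker(goods_list, page)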
def update_goods(keyword, num_id, table):
    goods_info = _search_by_id(num_id, keyword)
    if goods_info:
        goods_instance = TbkGoods(**goods_info)
        goods_info.update({'table': table})
        searcher.update_index(goods_info)
        goods_instance.save()
    else:
        goods_instance = TbkGoods(num_id=num_id)
        goods_instance.disabled_goods_by_id()
        searcher.delete_index(num_id)
def crawler(keyword, page, count, cat_list=''):
    # `cat` is a comma-separated category-id string; empty means no filter.
    if cat_list and isinstance(cat_list, list):
        cat = ','.join(cat_list)
    else:
        cat = ''
    goods_list = _crawler(keyword=keyword, page=page, count=count, cat=cat)
    if goods_list is None:
        return []
    result = []
    for goods in goods_list:
        tmp = _ship_goods_supers(goods)
        if not tmp:
            continue
        tmp.update({'table': 'goods'})
        # Persist the category (and sub-category) taxonomy alongside the goods.
        cat_obj = Category(id=tmp['category_id'], name=tmp['category_name'])
        cat_obj.save_category()
        if tmp.get("sub_category_id"):
            cat_obj = SubCategory(id=tmp['sub_category_id'],
                                  name=tmp.get('sub_category_name', ''),
                                  parent=tmp['category_id'])
            cat_obj.save_category()
        source = keyword if keyword else 'crawler'
        tmp.update({'source': source})
        goods_instance = TbkGoods(**tmp)
        if goods_instance.check_save():
            # First time we see this goods: also crawl its recommendations.
            goods_info = goods_instance.find_goods_by_id()
            if not goods_info:
                similar_ids = crawler_similar(tmp['num_id'])
                goods_instance.similar_goods = similar_ids
            result.append(tmp)
            ret = goods_instance.save()
            searcher.update_index(tmp)
            LOG.debug(ret)
    return result
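# Usage sketch for crawler (hypothetical keyword and category ids; cat_list,
# when given, must be a list of category-id strings):
#
#     saved = crawler("dress", page=1, count=100, cat_list=['16', '30'])
#     LOG.info("crawled %s goods", len(saved))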
def _save(goods_info):
    goods_obj = TbkGoods(**goods_info)
    goods_obj.source = 'search'
    ret = goods_obj.save()
    LOG.info("save goods: %s, ret: %s", goods_info['num_id'], ret)