Ejemplo n.º 1
0
def crawler_one_page(link, table, mid):
    """Crawl one listing page at *link* and persist each goods item.

    Looks up the crawl config for the link's domain in DATA_FIELD; if the
    domain is not configured, logs and returns without doing anything.
    Each goods entry is shipped through _ship_goods(), tagged with *mid*
    and *table*, pushed into the search index, and saved as a TbkGoods row.

    :param link: page URL to fetch (its netloc selects the config entry)
    :param table: target table name (str or py2 unicode)
    :param mid: merchant/module id stored on every saved record
    :return: None
    """
    parse_ret = urlparse(link)
    domain = parse_ret.netloc
    config = DATA_FIELD.get(domain)
    if not config:
        LOG.info("domain: %s not config", domain)
        return
    res_data_field = config.get("res_data")
    id_field = config.get("id")
    start = time.time()
    client = HttpClient()
    res = client.get(link)
    goods_list = res.get(res_data_field, [])
    # Hoisted out of the loop: *table* is loop-invariant, so normalize it to
    # a utf-8 byte string once instead of re-checking on every iteration.
    if isinstance(table, unicode):
        table = table.encode("utf-8")
    for goods in goods_list:
        num_id = goods.get(id_field)
        tmp = _ship_goods(num_id)
        if not tmp:
            # _ship_goods returned nothing usable for this id; skip it.
            continue
        tmp.update({'mid': mid, 'table': table})
        searcher.update_index(tmp)
        goods_obj = TbkGoods(**tmp)
        goods_obj.__table__ = table
        goods_obj.save()
    LOG.info("link: %s takes: %s", link, time.time() - start)
Ejemplo n.º 2
0
def crawler_similar(goods_id):
    """Fetch TBK-recommended goods similar to *goods_id* and persist them.

    Each recommended item found by _search_by_id() is saved with
    source='similar', and every saved goods record has its 'similar_goods'
    field updated to the merged set of new + previously stored similar ids.

    :param goods_id: the goods id to fetch recommendations for
    :return: list of similar goods ids, or None when the API response
             carries no results
    """
    res = client.tbk_goods_recommend(goods_id) or {}
    # BUG FIX: when the API call returned a falsy value, *res* is {} and the
    # original subscript res['tbk_item_recommend_get_response'] raised
    # KeyError; use .get() with an empty-dict default instead.
    response = res.get('tbk_item_recommend_get_response') or {}
    if response.get("results") is None:
        return
    goods_list = response['results'].get('n_tbk_item', [])
    similar_ids = []
    for goods in goods_list:
        num_iid = goods['num_iid']
        title = goods['title']
        similar_goods = _search_by_id(num_iid, title)
        if similar_goods is None:
            continue
        similar_ids.append(num_iid)
        similar_goods.update({'source': 'similar'})
        goods_instance = TbkGoods(**similar_goods)
        goods_instance.save()
    # Shallow copies are enough here: the lists hold scalar ids only
    # (deepcopy was pure overhead).
    loop_ids = list(similar_ids)
    for num_iid in loop_ids:
        goods_instance = TbkGoods(num_id=num_iid)
        goods_info = goods_instance.find_goods_by_id()
        if goods_info:
            ori_similar_ids = goods_info.get("similar_goods", [])
            # Stored value may be an explicit None; guard before extending.
            if ori_similar_ids is not None:
                similar_ids.extend(ori_similar_ids)
        goods_instance.update({'similar_goods': list(set(similar_ids))})
        # Reset to the freshly crawled ids for the next record's merge.
        similar_ids = list(loop_ids)
    return similar_ids
Ejemplo n.º 3
0
def update_worker(goods_list, page):
    """Refresh every goods record in *goods_list* against super-search.

    Records updated within the last hour are skipped. For each remaining
    record, a keyword search is run on its title: if the same num_id shows
    up in the results the record is re-saved with fresh data, otherwise it
    is deleted as no longer available upstream.

    :param goods_list: iterable of goods dicts (needs 'title', 'num_id',
                       optionally 'update_time' in epoch milliseconds)
    :param page: page number, used for logging only
    :return: None
    """
    started = time.time()
    LOG.info("page: %s, start: %s", page, started)
    for goods in goods_list:
        now_ms = time.time() * 1000
        last_update = goods.get("update_time")
        # Skip anything refreshed less than an hour ago (timestamps in ms).
        if last_update and now_ms - last_update < 3600000:
            continue
        title = goods['title']
        goods_id = goods['num_id']
        params = SearchParams()
        params.page = 1
        params.count = 100
        params.keyword = title
        candidates = _super_search(params)
        for candidate in candidates:
            goods_data = _ship_goods_supers(candidate)
            if goods_data['num_id'] == goods_id:
                # Still listed upstream: persist the refreshed data.
                refreshed = TbkGoods(**goods_data)
                refreshed.save()
                break
        else:
            # Title search no longer returns this id: treat as delisted.
            stale = TbkGoods(num_id=goods_id)
            stale.delete()
            LOG.info("delete id: %s", goods_id)
    del goods_list
    LOG.info("page: %s process ok: %s", page, time.time() - started)
Ejemplo n.º 4
0
def update_goods(keyword, num_id, table):
    """Re-sync one goods record and its search-index entry.

    Looks the goods up by id/keyword: if found, saves the record and pushes
    the (table-tagged) data into the search index; otherwise disables the
    record and removes it from the index.

    :param keyword: search keyword used for the id lookup
    :param num_id: goods numeric id
    :param table: table name attached to the indexed document
    :return: None
    """
    goods_info = _search_by_id(num_id, keyword)
    if not goods_info:
        # Not found upstream any more: disable locally and drop the index.
        instance = TbkGoods(num_id=num_id)
        instance.disabled_goods_by_id()
        searcher.delete_index(num_id)
        return
    # Construct the model from the raw data first; 'table' is only added to
    # the indexed document, not to the saved record's constructor kwargs.
    instance = TbkGoods(**goods_info)
    goods_info.update({'table': table})
    searcher.update_index(goods_info)
    instance.save()
Ejemplo n.º 5
0
def crawler(keyword, page, count, cat_list=''):
    """Crawl goods for *keyword* and persist/index what comes back.

    Categories and sub-categories referenced by the crawled goods are saved
    as a side effect. Goods that pass check_save() are saved and indexed;
    goods not previously stored additionally get their similar-goods ids
    crawled and are collected into the returned list.

    :param keyword: search keyword (also used as the 'source' tag; falls
                    back to 'crawler' when empty)
    :param page: result page to crawl
    :param count: results per page
    :param cat_list: optional list of category ids to restrict the crawl
    :return: list of newly-seen goods dicts ([] when the crawl fails)
    """
    if cat_list and isinstance(cat_list, list):
        cat = ','.join(cat_list)
    else:
        cat = ''
    goods_list = _crawler(keyword=keyword, page=page, count=count, cat=cat)
    if goods_list is None:
        return []
    # Loop-invariant: same source tag for every record of this crawl.
    source = keyword if keyword else 'crawler'
    result = []
    for raw_goods in goods_list:
        record = _ship_goods_supers(raw_goods)
        if not record:
            continue
        record.update({'table': 'goods'})
        main_cat = Category(id=record['category_id'],
                            name=record['category_name'])
        main_cat.save_category()
        if record.get("sub_category_id"):
            sub_cat = SubCategory(id=record['sub_category_id'],
                                  name=record.get('sub_category_name', ''),
                                  parent=record['category_id'])
            sub_cat.save_category()
        record.update({'source': source})
        instance = TbkGoods(**record)
        if not instance.check_save():
            continue
        if not instance.find_goods_by_id():
            # First time we see this goods: also crawl its similar ids.
            instance.similar_goods = crawler_similar(record['num_id'])
            result.append(record)
        ret = instance.save()
        searcher.update_index(record)
        LOG.debug(ret)
    return result
Ejemplo n.º 6
0
def _save(goods_info):
    """Persist *goods_info* as a TbkGoods record tagged with source='search'.

    :param goods_info: goods data dict; must contain 'num_id'
    :return: None (the save result is only logged)
    """
    record = TbkGoods(**goods_info)
    record.source = 'search'
    result = record.save()
    LOG.info("save goods: %s, ret: %s", goods_info['num_id'], result)