Example 1
def crawler(keyword, page, count, cat_list=''):
    # The upstream API expects categories as one comma-separated string;
    # str() guards against integer ids in the list.
    if cat_list and isinstance(cat_list, list):
        cat = ','.join(str(c) for c in cat_list)
    else:
        cat = ''
    goods_list = _crawler(keyword=keyword, page=page, count=count, cat=cat)
    if goods_list is None:
        return []
    result = []
    for goods in goods_list:
        tmp = _ship_goods_supers(goods)
        if not tmp:
            continue
        tmp.update({'table': 'goods'})
        # Persist the category (and sub-category, when present) before the goods record.
        cat_obj = Category(id=tmp['category_id'], name=tmp['category_name'])
        cat_obj.save_category()
        if tmp.get("sub_category_id"):
            cat_obj = SubCategory(id=tmp['sub_category_id'],
                                  name=tmp.get('sub_category_name', ''),
                                  parent=tmp['category_id'])
            cat_obj.save_category()
        source = keyword if keyword else 'crawler'
        tmp.update({'source': source})
        goods_instance = TbkGoods(**tmp)
        if goods_instance.check_save():
            goods_info = goods_instance.find_goods_by_id()
            if not goods_info:
                # First sighting: also crawl ids of similar goods and
                # report this item back to the caller.
                similar_ids = crawler_similar(tmp['num_id'])
                goods_instance.similar_goods = similar_ids
                result.append(tmp)
            ret = goods_instance.save()
            searcher.update_index(tmp)
            LOG.debug(ret)
    return result
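A minimal invocation sketch, assuming the surrounding module supplies _crawler and the model classes; the keyword, paging values, and category ids below are placeholders:

# All argument values here are illustrative placeholders.
new_goods = crawler('dress', page=1, count=40, cat_list=['16', '1801'])
for item in new_goods:
    print(item['num_id'], item['source'])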
Example 2
import time

from lxml.html import fromstring  # assumed: fromstring comes from lxml's HTML parser


def get_shangxi_content(shangxi_id):
    url = 'https://so.gushiwen.org/shiwen2017/ajaxshangxi.aspx'
    params = {'id': shangxi_id}
    # Throttle: wait before each request so the site is not hammered.
    time.sleep(10)
    client = HttpClient()
    page_content = client.get(url, params=params)
    shangxi = ''
    if page_content:
        # unicode() was Python 2 only; decode explicitly under Python 3.
        if isinstance(page_content, bytes):
            page_content = page_content.decode('utf-8')
        dom = fromstring(page_content)
        elements = dom.xpath("//div[@class='contyishang']/p")
        for element in elements:
            # string(.) flattens each paragraph to its plain text content.
            tmp = element.xpath("string(.)")
            tmp = tmp.replace(u"▲", "")
            shangxi += tmp
            shangxi += '\n'
    else:
        LOG.debug("down page error: %s, params: %s", url, params)
    return shangxi
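A hedged usage sketch; the id below is a placeholder, and the returned text depends on the live page:

# '4741' is a made-up id; the function returns one paragraph per line.
text = get_shangxi_content('4741')
if text:
    print(text)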
Example 3
def crawler_author_record(author):
    # Walk the author's poetry listing page by page and persist each poem.
    next_page = author['poetry_link']
    author_id = author['id']
    count = 0
    while next_page:
        detail_links, next_page = detail_crawler(next_page)
        for poetry_link in detail_links:
            try:
                poetry_data = get_detail_url(poetry_link, author_id)
                poetry_id = save_crawled_poetry(poetry_data)
                if poetry_id:
                    count += 1
                LOGGER.debug("save poetry: %s, authorid: %s", poetry_id,
                             author_id)
            except Exception as ex:
                # Log and continue: one bad detail page must not stop the crawl.
                LOGGER.error("link: %s, ex: %s",
                             poetry_link,
                             ex,
                             exc_info=True)
            # time.sleep(random.randint(6, 10))
        # Note: next_page already points at the upcoming page here.
        LOGGER.info("page: %s, save: %s", next_page, count)
        count = 0  # reset the per-page save counter
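A sketch of how this might be driven, assuming author records come from some query helper; find_authors below is hypothetical, and each record must carry 'id' and 'poetry_link':

# find_authors() is a hypothetical helper returning author dicts
# with 'id' and 'poetry_link' keys.
for author in find_authors():
    crawler_author_record(author)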
Example 4
import random
import time


def get_one_goods(cat=None):
    # No category given: pick one at random from the recommended categories.
    if cat is None:
        cat_obj = Category(recommend=1)
        cats = cat_obj.all_category()
        cat_list = []
        for item in cats:
            cat_list.append(int(item['id']))
        # cat_list = [1801, 16, 30, 50002766, 50006843, 122952001]
        cat_id = random.choice(cat_list)
    else:
        cat_id = cat
    # Only goods created in the last 8 days; "created" is stored in milliseconds.
    start = time.time() - 8 * 86400
    cond = {
        "coupon_amount": {'$gt': 5},
        "created": {'$gt': start * 1000},
        "sales": {'$gt': 3000},
        'category_id': cat_id,
        "sended": {'$exists': False},  # only records never marked as sent
        "coupon_expire": 0,
    }
    LOG.debug(cond)
    goods_obj = TbkGoods()
    goods = goods_obj.find_goods_by_cond(cond, 1, count=20)
    goods_list = list(goods)
    if not goods_list:
        return {}
    # Return one random item from the (at most 20) matches.
    return random.choice(goods_list)
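A brief usage sketch; without an argument the category is drawn at random from the recommended set, while passing an id (1801 is taken from the commented-out list above) pins a specific category:

goods = get_one_goods()           # random recommended category
pinned = get_one_goods(cat=1801)  # explicit category id
if pinned:
    LOG.debug("picked: %s", pinned.get('num_id'))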