Exemple #1
0
def crawler_author_poetry(author_id=None):
    """Crawl the poetry of stored authors, one page of authors at a time.

    Authors are read through ``Author.find_authors`` in pages of 100 until a
    page comes back empty. Each author is handed to ``crawler_author_record``;
    an exception raised for one author is logged and the loop continues with
    the next author.

    Args:
        author_id: when given, the query filter is ``{'id': {'=': author_id}}``;
            when ``None`` the filter ``{"id": {">": 1229}}`` is used.
    """
    page = 1
    count = 100
    author_obj = Author()
    while True:
        # The filter is rebuilt on every iteration so find_authors always
        # receives a fresh dict, exactly as before.
        if author_id is None:
            # NOTE(review): 1229 looks like a hard-coded resume point - confirm.
            condition = {"id": {">": 1229}}
        else:
            condition = {'id': {'=': author_id}}
        authors = author_obj.find_authors(condition, page, count)
        # Guard the length: a falsy result (e.g. None) must end the loop
        # below instead of raising TypeError inside the log call.
        LOGGER.info("type: %s, len: %s", type(authors),
                    len(authors) if authors else 0)
        if not authors:
            break
        for author in authors:
            try:
                LOGGER.info("start crawler author: %s", author['name'])
                crawler_author_record(author)
                LOGGER.info(author)
            except Exception as ex:
                # Best effort: one failing author must not stop the crawl.
                LOGGER.error("author: %s, ex: %s",
                             author['name'],
                             ex,
                             exc_info=True)
        page += 1
Exemple #2
0
def get_detail_url(detail_url, author_id):
    """Download a poetry detail page and extract its fields.

    Args:
        detail_url: URL of the detail page; also stored as ``plink``.
        author_id: value stored unchanged under ``author_id``.

    Returns:
        A dict with the keys ``title``, ``dynasty``, ``author``, ``content``,
        ``tags``, ``likes``, ``author_id``, ``translate``, ``shangxi`` and
        ``plink``; an empty dict when the download yields no content.

    Raises:
        IndexError: when the title, dynasty, author or content node is not
            found in the page (the first match is taken unconditionally).
    """
    page_content = HttpClient().get(detail_url)
    if not page_content:
        LOG.error("download url: %s, error", detail_url)
        return {}

    dom = fromstring(page_content)
    # First "sons" block of the left column; most fields are read from it.
    main_block = ('//div[@class="main3"]/div[@class="left"]/'
                  'div[@class="sons"][1]')

    title = dom.xpath("//h1/text()")
    dynasty = dom.xpath(main_block + '/div[@class="cont"]/p/a[1]/text()')
    author = dom.xpath(main_block + '/div[@class="cont"]/p/a[2]/text()')
    content_nodes = dom.xpath(
        main_block + '/div[@class="cont"]/div[@class="contson"]')
    content = split_content(content_nodes[0])

    # Tags are stored as a single '&'-separated string.
    tags = '&'.join(dom.xpath(main_block + '/div[@class="tag"]/a/text()'))

    like_texts = dom.xpath(main_block + '//div[@class="good"]/a/span/text()')
    likes = match_number(like_texts[0]) if len(like_texts) >= 1 else 0

    # Translation: fetched by id when a "fanyi..." div exists, otherwise read
    # from the paragraphs of the second "sons" block.
    fanyi_ids = dom.xpath("//div[starts-with(@id, 'fanyi')][1]/@id")
    if fanyi_ids:
        fanyi_con = get_fanyi_content(match_number(fanyi_ids[0]))
    else:
        inline_fanyi = dom.xpath(
            "//div[@class='left']/div[@class='sons'][2]"
            "/div[@class='contyishang']/p/text()")
        fanyi_con = '\n'.join(inline_fanyi) if inline_fanyi else ''

    # Appreciation text: only available through a "shangxi..." div id.
    shangxi_ids = dom.xpath("//div[starts-with(@id, 'shangxi')][1]/@id")
    if shangxi_ids:
        shangxi_con = get_shangxi_content(match_number(shangxi_ids[0]))
    else:
        shangxi_con = ''

    if not shangxi_con:
        LOG.info("url: %s no shangxi", detail_url)
    if not fanyi_con:
        LOG.info("url: %s no fanyi", detail_url)

    return {
        'title': title[0],
        'dynasty': dynasty[0],
        'author': author[0],
        'content': content,
        'tags': tags,
        'likes': likes,
        'author_id': author_id,
        'translate': fanyi_con,
        'shangxi': shangxi_con,
        'plink': detail_url
    }
Exemple #3
0
def crawler_poetry_record(link, author_id):
    """Crawl one poetry detail page and save the extracted record.

    Args:
        link: URL of the poetry detail page, passed to ``get_detail_url``.
        author_id: author identifier passed along to ``get_detail_url``.

    Any exception raised while crawling or saving is logged with its
    traceback and swallowed; nothing is returned.
    """
    try:
        poetry_data = get_detail_url(link, author_id)
        poetry_id = save_crawled_poetry(poetry_data)
        if poetry_id:
            LOGGER.info("link: %s, author: %s ok", link, author_id)
        else:
            # The link must be passed explicitly: without arguments the
            # logger emits the format string verbatim ("link: %s, ...").
            LOGGER.info("link: %s, not save", link)
    except Exception as ex:
        LOGGER.error("link: %s, ex: %s", link, ex, exc_info=True)
Exemple #4
0
def _ship_goods_supers(item):
    """Map one search-result item onto the goods record layout.

    Args:
        item: mapping describing a single item; read with ``item[...]`` and
            ``item.get(...)``.

    Returns:
        A dict of goods fields. If any step of the mapping raises, the error
        is logged and the partially built dict is discarded; the result then
        holds only ``sub_category_id`` / ``sub_category_name`` when the item
        provides truthy ``category_id`` / ``category_name`` values.
    """
    coupon_re = re.compile(COUPON_PATTERN)
    goods = {}
    try:
        # Prefer the coupon share link, fall back to the plain item url.
        share_url = item.get('coupon_share_url') or item['url']
        if share_url and not share_url.startswith(('http', 'https')):
            share_url = 'https:' + share_url

        goods['category_id'] = item['level_one_category_id']
        goods['sub_category_id'] = item['category_id']
        goods['small_images'] = item.get("small_images", {}).get("string", [])
        goods['is_tmall'] = item['user_type']
        goods['coupon_id'] = item['coupon_id']
        goods['coupon_share_url'] = share_url
        goods['sales'] = int(item['volume'])
        goods['coupon_info'] = item['coupon_info']

        # Threshold and amount come from the coupon text when it matches
        # COUPON_PATTERN; both default to 0 otherwise.
        coupon_match = coupon_re.search(item['coupon_info'])
        if coupon_match is None:
            goods['coupon_start'] = 0
            goods['coupon_amount'] = 0
        else:
            goods['coupon_start'] = float(coupon_match.group("coupon_start"))
            goods['coupon_amount'] = float(
                coupon_match.group("coupon_amount"))

        goods['commssion_rate'] = float(item['commission_rate']) / 10000.0
        goods['coupon_total_count'] = int(item['coupon_total_count'])
        goods['shop_id'] = item.get("seller_id", 0)
        goods['shop_title'] = item.get("nick", '')
        goods['category_name'] = item.get('level_one_category_name', '')
        goods['end'] = item.get('coupon_end_time', '')
        goods['start'] = item.get('coupon_start_time', '')
        goods['price'] = round(float(item['zk_final_price']), 2)
        goods['coupon_fee'] = goods['price'] - goods['coupon_amount']
        goods['num_id'] = item['num_iid']
        # Both timestamps are in milliseconds.
        goods['created'] = int(time.time() * 1000)
        goods['update_time'] = int(time.time() * 1000)
        goods['pic_url'] = item['pict_url']
        goods['title'] = item['title']
        goods['coupon_remain'] = int(item['coupon_remain_count'])
    except Exception as ex:
        LOG.error("crawler item: %s, error: %s", item, ex, exc_info=True)
        goods = {}

    # NOTE(review): these run even after a failure above, so a failed item
    # still yields a non-empty dict - confirm callers expect that.
    sub_category_id = item.get("category_id")
    if sub_category_id:
        goods["sub_category_id"] = sub_category_id
    sub_category_name = item.get("category_name")
    if sub_category_name:
        goods["sub_category_name"] = sub_category_name
    return goods
Exemple #5
0
def _crawler(**kwargs):
    """Run one ``client.super_search`` call and return its item list.

    Only the recognised keyword arguments with truthy values are copied into
    the ``SearchParams`` object; everything else is ignored.

    Returns:
        The ``map_data`` entry of the response's ``result_list``, or ``None``
        when the request or the lookup raises (the error is logged).
    """
    accepted = ("keyword", "page", "count", "platform", "is_overseas",
                "is_tmall", "sort", "has_coupon", "need_free_shipment", "cat")
    params = SearchParams()
    for name in accepted:
        value = kwargs.get(name)
        if value:
            params[name] = value
    try:
        response = client.super_search(params)
        result_list = response['tbk_dg_material_optional_response'][
            'result_list']
        return result_list['map_data']
    except Exception as ex:
        LOG.error("ex: %s", ex, exc_info=True)
    return None
Exemple #6
0
def crawler_author_record(author):
    """Crawl and save every poetry reachable from an author's listing pages.

    Starting at ``author['poetry_link']``, each listing page is passed to
    ``detail_crawler``, which returns the detail links of that page and the
    next listing page. Every detail link is crawled with ``get_detail_url``
    and stored with ``save_crawled_poetry``. The loop stops once
    ``detail_crawler`` returns a falsy next page.

    Args:
        author: mapping providing at least ``poetry_link`` and ``id``.

    A failure on a single poetry link is logged and does not interrupt the
    rest of the page.
    """
    next_page = author['poetry_link']
    author_id = author['id']
    while next_page:
        # Remember the page being processed: next_page is overwritten by
        # detail_crawler, and the summary below must name the page the
        # count belongs to, not the following one.
        current_page = next_page
        detail_links, next_page = detail_crawler(current_page)
        count = 0
        for poetry_link in detail_links:
            try:
                poetry_data = get_detail_url(poetry_link, author_id)
                poetry_id = save_crawled_poetry(poetry_data)
                if poetry_id:
                    count += 1
                LOGGER.debug("save poetry: %s, authorid: %s", poetry_id,
                             author_id)
            except Exception as ex:
                LOGGER.error("link: %s, ex: %s",
                             poetry_link,
                             ex,
                             exc_info=True)
        LOGGER.info("page: %s, save: %s", current_page, count)
Exemple #7
0
def save_centence(centence, source, c, t):
    """Store a sentence together with the poetry it is quoted from.

    Args:
        centence: the sentence text, saved as ``content``.
        source: attribution text expected to look like ``author《title》``.
        c: first tag component.
        t: second tag component; ``c`` and ``t`` are joined with ``&``.

    When ``source`` does not match the expected form the call is logged and
    nothing is saved. When no poetry is found for the title, the sentence is
    still saved with ``author_id`` and ``poetry_id`` set to 0.
    """
    found = re.search(u"(?P<author>.*)《(?P<title>.*)》", source)
    if found is None:
        LOG.info("cent: %s, source: %s error", centence, source)
        return

    author, title = found.group("author", "title")
    poetry = Poetry(title=title, author=author).find_poetry_by_title()
    if not poetry:
        LOG.error("title: %s, author: %s found error", title, author)
        poetry = {}

    Sentence(
        title=title,
        content=centence,
        tags='&'.join([c, t]),
        author_id=poetry.get('author_id', 0),
        author=author,
        poetry_id=poetry.get('id', 0),
    ).save()
def _do_send_template(user):
    """Send a recommended poetry to one user as a template message.

    Args:
        user: mapping with at least an ``openid`` entry; passed as-is to
            ``send_template_poetry``.

    Nothing is sent when no recommendation is available. An exception from
    the send call is logged and treated like a ``None`` result, which is
    reported as a failed send.
    """
    poetry_data = get_recommend_poetry(user['openid'])
    if not poetry_data:
        LOG.error("recommend failed: %s", user['openid'])
        return

    try:
        res = send_template_poetry(user, poetry_data)
    except Exception as ex:
        LOG.error("openid: %s, ex: %s", user['openid'], ex)
        res = None

    if res is not None:
        LOG.info("openid: %s, res: %s", user['openid'], res)
        return
    LOG.error("openid: %s, send failed", user['openid'])