コード例 #1
0
ファイル: poetry_detail.py プロジェクト: shiguofu2012/diandi
def get_detail_url(detail_url, author_id):
    """Download one poem detail page and extract its fields.

    Scrapes title, dynasty, author, body, tags, like count, translation
    ("fanyi") and appreciation ("shangxi") sections from a gushiwen.org
    detail page.

    :param detail_url: absolute URL of the poem detail page.
    :param author_id: numeric author id stored on the resulting record.
    :returns: dict of poem fields, or {} when the page download failed.
    """
    client = HttpClient()
    page_content = client.get(detail_url)
    if page_content:
        dom = fromstring(page_content)
        # First "sons" box inside the main/left column holds the poem itself.
        cont_xpath = '//div[@class="main3"]/div[@class="left"]/'\
            'div[@class="sons"][1]'
        title = dom.xpath("//h1/text()")
        # First link in the cont paragraph is the dynasty, second the author.
        dynasty = dom.xpath(cont_xpath + '/div[@class="cont"]/p/a[1]/text()')
        author = dom.xpath(cont_xpath + '/div[@class="cont"]/p/a[2]/text()')
        content = dom.xpath(cont_xpath +
                            '/div[@class="cont"]/div[@class="contson"]')
        content = split_content(content[0])
        keywords = dom.xpath(cont_xpath + '/div[@class="tag"]/a/text()')
        keywords = '&'.join(keywords)
        likes = dom.xpath(cont_xpath + '//div[@class="good"]/a/span/text()')
        if len(likes) >= 1:
            likes = match_number(likes[0])
        else:
            likes = 0
        # The translation may be loaded via AJAX (a div whose id starts with
        # "fanyi" carries the numeric id) or be inlined in the second "sons"
        # box; try the AJAX route first.
        fanyi = dom.xpath("//div[starts-with(@id, 'fanyi')][1]/@id")
        if fanyi:
            fanyi_id = match_number(fanyi[0])
            fanyi_con = get_fanyi_content(fanyi_id)
        else:
            fanyi_xpath = "//div[@class='left']/div[@class='sons'][2]/div[@class='contyishang']/p/text()"
            fanyi_con = dom.xpath(fanyi_xpath)
            if fanyi_con:
                fanyi_con = '\n'.join(fanyi_con)
            else:
                fanyi_con = ''
        # Appreciation section is AJAX-only; no inline fallback exists.
        shangxi = dom.xpath("//div[starts-with(@id, 'shangxi')][1]/@id")
        if shangxi:
            shangxi_id = match_number(shangxi[0])
            shangxi_con = get_shangxi_content(shangxi_id)
        else:
            shangxi_con = ''

        if not shangxi_con:
            LOG.info("url: %s no shangxi", detail_url)
        if not fanyi_con:
            LOG.info("url: %s no fanyi", detail_url)

        poetry_data = {
            'title': title[0],
            'dynasty': dynasty[0],
            'author': author[0],
            'content': content,
            'tags': keywords,
            'likes': likes,
            'author_id': author_id,
            'translate': fanyi_con,
            'shangxi': shangxi_con,
            'plink': detail_url
        }
        return poetry_data
    else:
        LOG.error("download url: %s, error", detail_url)
        return {}
コード例 #2
0
ファイル: link_crawler.py プロジェクト: shiguofu2012/diandi
def crawler_one_page(link, table, mid):
    """Fetch one listing page and persist every goods item it contains.

    The link's domain selects a parsing config from DATA_FIELD; each goods
    id found is re-shipped via _ship_goods, indexed, and saved into *table*.

    :param link: listing URL to download.
    :param table: destination table name (utf-8 encoded if unicode).
    :param mid: id stamped onto every saved goods record.
    """
    parse_ret = urlparse(link)
    domain = parse_ret.netloc
    config = DATA_FIELD.get(domain)
    if not config:
        LOG.info("domain: %s not config", domain)
        return
    res_data_field = config.get("res_data")
    id_field = config.get("id")
    start = time.time()
    client = HttpClient()
    res = client.get(link)
    # Bug fix: a failed download returns a falsy result; previously the code
    # went straight to res.get(...) and crashed with AttributeError.
    if not res:
        LOG.error("download link: %s error", link)
        return
    goods_list = res.get(res_data_field, [])
    for goods in goods_list:
        num_id = goods.get(id_field)
        tmp = _ship_goods(num_id)
        if not tmp:
            continue
        tmp.update({'mid': mid})
        if isinstance(table, unicode):
            table = table.encode("utf-8")
        tmp.update({'table': table})
        searcher.update_index(tmp)
        goods_obj = TbkGoods(**tmp)
        goods_obj.__table__ = table
        goods_obj.save()
    LOG.info("link: %s takes: %s", link, time.time() - start)
コード例 #3
0
ファイル: poetry_detail.py プロジェクト: shiguofu2012/diandi
def get_fanyi_content(fanyi_id):
    """Fetch the AJAX translation ("fanyi") fragment and flatten it to text.

    Walks the children of each <p> in the contyishang div: <strong> headers
    are skipped, <a> text is appended (after dropping the preceding newline),
    and the tail text after each <br> becomes one line.

    :param fanyi_id: numeric id extracted from the page's fanyi div.
    :returns: the concatenated translation text ('' on download failure).
    """
    url = 'https://so.gushiwen.org/shiwen2017/ajaxfanyi.aspx'
    params = {'id': fanyi_id}
    time.sleep(10)  # throttle: be polite to the remote site
    client = HttpClient()
    page_content = client.get(url, params=params)
    fanyi = ''
    if page_content:
        page_content = unicode(page_content, 'utf-8')
        dom = fromstring(page_content)
        elements = dom.xpath("//div[@class='contyishang']/p")
        for element in elements:
            for sub in element:
                tag = sub.tag
                # Bug fix: reset tmp for every child. Previously an
                # unrecognized tag reused the previous child's tmp value
                # (duplicating text) and the very first child could raise
                # NameError when tmp had never been assigned.
                tmp = None
                if tag == 'strong':
                    continue
                elif tag == 'a':
                    # Drop the newline appended for the previous <br>.
                    fanyi = fanyi[:-1]
                    tmp = sub.text
                elif tag == 'br':
                    tmp = sub.tail
                    if tmp is None:
                        continue
                    tmp += '\n'
                if tmp:
                    tmp = tmp.replace(u"▲", "")
                    fanyi += tmp
    else:
        LOG.info("down page error: %s, params: %s", url, params)
    return fanyi
コード例 #4
0
def update_main():
    """Page through all goods records in _id order until none remain."""
    page = 20
    count = 2000
    pool = ThreadPool(16)
    goods_obj = TbkGoods()
    last_id = ''
    while True:
        # Cursor-style pagination: resume strictly after the last seen _id.
        cond = {'_id': {'$gt': last_id}} if last_id else {}
        batch = goods_obj.find_goods_by_cond(
            cond, page, count, ['title', 'num_id', 'update_time'])
        last_id = ''
        for record in batch:
            last_id = record['_id']
        # An empty batch leaves last_id unset — we are done.
        if not last_id:
            print("done")
            break
        LOG.info("page: %s ok", page)
        page += 1
    pool.close()
    pool.join()
コード例 #5
0
ファイル: tbk_crawler.py プロジェクト: shiguofu2012/diandi
def crawler_worker(keyword):
    """Crawl ten result pages for *keyword* and log how many items saved."""
    started = time.time()
    saved_total = 0
    for page_no in range(1, 11):
        saved_total += len(crawler(keyword, page_no, 100))
    LOG.info("keyword: %s, crawler: %s, takes: %s", keyword, saved_total,
             time.time() - started)
コード例 #6
0
ファイル: start_crawler.py プロジェクト: shiguofu2012/diandi
def crawler_poetry_record(link, author_id):
    """Crawl a single poem detail page and persist it.

    :param link: detail-page URL passed to get_detail_url.
    :param author_id: author id stored on the crawled record.
    """
    try:
        poetry_data = get_detail_url(link, author_id)
        poetry_id = save_crawled_poetry(poetry_data)
        if poetry_id:
            LOGGER.info("link: %s, author: %s ok", link, author_id)
        else:
            # Bug fix: the format string had a %s placeholder but no
            # argument, which raises inside the logging machinery.
            LOGGER.info("link: %s, not save", link)
    except Exception as ex:
        LOGGER.error("link: %s, ex: %s", link, ex, exc_info=True)
コード例 #7
0
ファイル: start_crawler.py プロジェクト: shiguofu2012/diandi
def check():
    """Scan every author and re-crawl poetry for authors with none saved."""
    page = 1
    count = 100
    author_obj = Author()
    while True:
        authors = author_obj.find_authors({}, page, count)
        LOGGER.info("type: %s, len: %s", type(authors), len(authors))
        if not authors:
            break
        for record in authors:
            author_id = record['id']
            # Probe for a single poem; an empty result means this author
            # has nothing stored yet and needs a crawl.
            probe = Poetry(author_id=author_id).find_poetry_by_author_id(1, 1)
            if len(probe) == 0:
                crawler_author_poetry(author_id)
        page += 1
コード例 #8
0
def _do_send_template(user):
    """Send the recommended-poetry template message to a single user."""
    openid = user['openid']
    poetry_data = get_recommend_poetry(openid)
    if not poetry_data:
        LOG.error("recommend failed: %s", openid)
        return
    res = None
    try:
        res = send_template_poetry(user, poetry_data)
    except Exception as ex:
        # Best-effort send: log and fall through to the failure branch.
        LOG.error("openid: %s, ex: %s", openid, ex)
    if res is None:
        LOG.error("openid: %s, send failed", openid)
    else:
        LOG.info("openid: %s, res: %s", openid, res)
コード例 #9
0
ファイル: tbk_crawler.py プロジェクト: shiguofu2012/diandi
def update_one_by_one(table):
    """Refresh every goods row of *table*, skipping recently-updated rows.

    Pages through the table; rows whose update_time is within the last hour
    are left alone, all others are passed to update_goods.

    :param table: goods table name to iterate.
    """
    page = 1
    count = 1000
    have_data = True
    update_count = 0
    goods_obj = TbkGoods()
    goods_obj.__table__ = table
    LOG.info(table)
    while have_data:
        have_data = False
        goods_list = goods_obj.find_goods_by_cond({}, page, count)
        now = int(time.time() * 1000)
        for goods in goods_list:
            have_data = True
            update_time = goods.get('update_time')
            # Skip rows refreshed within the last hour (3600000 ms).
            if update_time and now - update_time < 3600000:
                continue
            update_goods(goods['title'], goods['num_id'], table)
            # Bug fix: update_count was never incremented, so the final
            # print below always reported 0.
            update_count += 1
        page += 1
        LOG.info("page: %s" % page)
    print(update_count)
コード例 #10
0
ファイル: start_crawler.py プロジェクト: shiguofu2012/diandi
def crawler_author_poetry(author_id=None):
    """Crawl poetry for one author, or for all authors with id > 1229.

    :param author_id: crawl only this author when given; otherwise sweep
        every author whose id exceeds 1229.
    """
    page = 1
    count = 100
    author_obj = Author()
    while True:
        if author_id is None:
            cond = {"id": {">": 1229}}
        else:
            cond = {'id': {'=': author_id}}
        authors = author_obj.find_authors(cond, page, count)
        LOGGER.info("type: %s, len: %s", type(authors), len(authors))
        if not authors:
            break
        for author in authors:
            try:
                LOGGER.info("start crawler author: %s", author['name'])
                crawler_author_record(author)
                LOGGER.info(author)
            except Exception as ex:
                # One bad author must not abort the whole sweep.
                LOGGER.error("author: %s, ex: %s", author['name'], ex,
                             exc_info=True)
        page += 1
コード例 #11
0
def update_worker(goods_list, page):
    """Re-validate a page of goods against live search; delete stale items.

    Each goods item (unless refreshed within the last hour) is looked up by
    title; if its num_id still appears in the results it is re-saved,
    otherwise the record is deleted.
    """
    started = time.time()
    LOG.info("page: %s, start: %s", page, started)
    for goods in goods_list:
        now_ms = time.time() * 1000
        refreshed_at = goods.get("update_time")
        # Items updated within the last hour (3600000 ms) are skipped.
        if refreshed_at and now_ms - refreshed_at < 3600000:
            continue
        num_id = goods['num_id']
        params = SearchParams()
        params.page = 1
        params.count = 100
        params.keyword = goods['title']
        found = False
        for candidate in _super_search(params):
            shipped = _ship_goods_supers(candidate)
            if shipped['num_id'] == num_id:
                found = True
                TbkGoods(**shipped).save()
                break
        if not found:
            # Item vanished from search results — purge it.
            TbkGoods(num_id=num_id).delete()
            LOG.info("delete id: %s", num_id)
    del goods_list
    LOG.info("page: %s process ok: %s", page, time.time() - started)
コード例 #12
0
ファイル: start_crawler.py プロジェクト: shiguofu2012/diandi
def crawler_author_record(author):
    """Walk an author's poetry listing pages and persist every poem found.

    :param author: dict with at least 'poetry_link' (first listing URL)
        and 'id' (author id stamped on each poem).
    """
    next_page = author['poetry_link']
    author_id = author['id']
    saved = 0
    while next_page:
        detail_links, next_page = detail_crawler(next_page)
        for link in detail_links:
            try:
                record = get_detail_url(link, author_id)
                poetry_id = save_crawled_poetry(record)
                if poetry_id:
                    saved += 1
                LOGGER.debug("save poetry: %s, authorid: %s", poetry_id,
                             author_id)
            except Exception as ex:
                # A single broken detail page must not stop the listing walk.
                LOGGER.error("link: %s, ex: %s", link, ex, exc_info=True)
        LOGGER.info("page: %s, save: %s", next_page, saved)
        saved = 0
コード例 #13
0
def save_centence(centence, source, c, t):
    """Save a famous sentence, linked to its poem when one can be found.

    :param centence: the sentence text.
    :param source: attribution string shaped like "author《title》".
    :param c: first tag component.
    :param t: second tag component.
    """
    match = re.search(u"(?P<author>.*)《(?P<title>.*)》", source)
    if match is None:
        LOG.info("cent: %s, source: %s error", centence, source)
        return
    author = match.group("author")
    title = match.group("title")
    poetry = Poetry(title=title, author=author).find_poetry_by_title()
    if not poetry:
        # Poem not in our DB: still save the sentence with zeroed links.
        LOG.error("title: %s, author: %s found error", title, author)
        poetry = {}
    sentence_obj = Sentence(
        title=title,
        content=centence,
        tags='&'.join([c, t]),
        author_id=poetry.get('author_id', 0),
        author=author,
        poetry_id=poetry.get('id', 0))
    sentence_obj.save()
コード例 #14
0
ファイル: tbk_crawler.py プロジェクト: shiguofu2012/diandi
 parser = argparse.ArgumentParser()
 parser.add_argument('--type',
                     '-t',
                     type=str,
                     required=True,
                     help='the type of the timed')
 parser.add_argument('--keyword',
                     '-k',
                     type=str,
                     required=False,
                     help='crawler words')
 args = parser.parse_args()
 start = time.time()
 type_ = args.type
 if type_ == 'update':
     LOG.info("update start: %s", start)
     # update_by_category()
     tables = ['haitao', 'jiukjiu', 'juhuasuan', 'xiaoliangbang', 'goods']
     pool = ThreadPool(len(tables))
     for table in tables:
         pool.apply_async(update_one_by_one, (table, ))
     pool.close()
     pool.join()
     # update_one_by_one()
     LOG.info("update takes: %ss", time.time() - start)
 elif type_ == 'crawler':
     LOG.info("crawler start: %s", start)
     crawler_main()
     LOG.info("crawler takes: %s", time.time() - start)
 elif type_ == 'word':
     if args.keyword:
コード例 #15
0
ファイル: service_script.py プロジェクト: shiguofu2012/diandi
def _save(goods_info):
    """Persist a goods record, tagging it as originating from search."""
    record = TbkGoods(**goods_info)
    record.source = 'search'
    result = record.save()
    LOG.info("save goods: %s, ret: %s", goods_info['num_id'], result)