Example #1
def _do_find(url, max_trys, printer, rss_proxy_url, rss_proxy_token):
    def message_handler(msg):
        print(msg)

    finder = FeedFinder(
        url,
        max_trys=max_trys,
        rss_proxy_url=rss_proxy_url,
        rss_proxy_token=rss_proxy_token,
        message_handler=message_handler,
    )
    with finder:
        found = finder.find()
    if found:
        response, raw_result = found
        printer('-> {}'.format(response))
        result = FeedParser().parse(raw_result)
        printer("-> {}".format(result))
        printer('-' * 79)
        printer(pretty_format_json(result.feed))
        for i, story in enumerate(result.storys):
            printer('{:03d}{}'.format(i, '-' * 76))
            story['content'] = shorten(story['content'], 60)
            story['summary'] = shorten(story['summary'], 60)
            printer(pretty_format_json(story))
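Every example in this listing caps field lengths with a shorten helper whose implementation is not shown. A minimal sketch, assuming it simply truncates text to a target width (the standard-library textwrap.shorten behaves similarly, but collapses whitespace and cuts on word boundaries):

def shorten(text, width=200):
    # Hypothetical stand-in for rssant's shorten: return the text
    # unchanged when it fits, otherwise truncate and mark the cut.
    if not text or len(text) <= width:
        return text
    return text[:max(width - 3, 0)] + '...'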
Example #2
def _parse_found(parsed):
    feed = AttrDict()
    res = parsed.response
    feed.use_proxy = parsed.use_proxy
    feed.url = _get_url(res)
    feed.content_length = len(res.content)
    feed.content_hash_base64 = compute_hash_base64(res.content)
    parsed_feed = parsed.feed
    feed.title = shorten(parsed_feed["title"], 200)
    link = parsed_feed["link"]
    if not link.startswith('http'):
        # some feeds put a non-URL value in the link attribute;
        # fall back to the author_detail href instead,
        # e.g. 'http://www.cnblogs.com/grenet/'
        author_detail = parsed_feed['author_detail']
        if author_detail:
            link = author_detail['href']
    if not link.startswith('http'):
        link = feed.url
    feed.link = link
    feed.author = shorten(parsed_feed["author"], 200)
    feed.icon = parsed_feed["icon"] or parsed_feed["logo"]
    feed.description = parsed_feed["description"] or parsed_feed["subtitle"]
    feed.dt_updated = _get_dt_updated(parsed_feed)
    feed.etag = _get_etag(res)
    feed.last_modified = _get_last_modified(res)
    feed.encoding = res.encoding
    feed.version = shorten(parsed.version, 200)
    entries = list(parsed.entries)  # entries will be modified by _get_storys
    del parsed, res, parsed_feed  # release memory in advance
    feed.storys = _get_storys(entries)
    return validate_feed(feed)
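Example #2 fingerprints the response body with compute_hash_base64, and Example #4 later calls it with several string arguments. A plausible sketch, assuming it hashes the concatenated inputs and returns a base64 digest (the hash algorithm and encoding details are assumptions):

import base64
import hashlib

def compute_hash_base64(*parts):
    # Hash all parts in order and return a URL-safe base64 digest.
    h = hashlib.sha1()
    for part in parts:
        if part is None:
            part = ''
        if isinstance(part, str):
            part = part.encode('utf-8')
        h.update(part)
    return base64.urlsafe_b64encode(h.digest()).decode('ascii')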
Example #3
def do_process_story_webpage(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
    url: T.url,
    text: T.str.maxlen(_MAX_STORY_HTML_LENGTH),
    num_sub_sentences: T.int.optional,
) -> SCHEMA_FETCH_STORY_RESULT:
    # https://github.com/dragnet-org/dragnet
    # https://github.com/misja/python-boilerpipe
    # https://github.com/dalab/web2text
    # https://github.com/grangier/python-goose
    # https://github.com/buriy/python-readability
    # https://github.com/codelucas/newspaper
    DEFAULT_RESULT = dict(feed_id=feed_id, offset=offset, url=url)
    text = text.strip()
    if not text:
        return DEFAULT_RESULT
    text = story_html_clean(text)
    content = story_readability(text)
    content = process_story_links(content, url)
    content_info = StoryContentInfo(content)
    text_content = shorten(content_info.text, width=_MAX_STORY_CONTENT_LENGTH)
    num_sentences = len(split_sentences(text_content))
    if len(content) > _MAX_STORY_CONTENT_LENGTH:
        msg = 'too large story#%s,%s size=%s url=%r, will only save plain text'
        LOG.warning(msg, feed_id, offset, len(content), url)
        content = text_content
    # if the fetched content is shorter than the RSS content, it is not the real full text
    if num_sub_sentences is not None:
        if not is_fulltext_content(content_info):
            if num_sentences <= num_sub_sentences:
                msg = 'fetched story#%s,%s url=%s num_sentences=%s less than num_sub_sentences=%s'
                LOG.info(msg, feed_id, offset, url, num_sentences,
                         num_sub_sentences)
                return DEFAULT_RESULT
    summary = shorten(text_content, width=_MAX_STORY_SUMMARY_LENGTH)
    if not summary:
        return DEFAULT_RESULT
    result = dict(
        **DEFAULT_RESULT,
        content=content,
        summary=summary,
        sentence_count=num_sentences,
    )
    if not ctx.message.is_ask:
        ctx.hope('harbor_rss.update_story', result)
    return result
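The fetched-vs-RSS comparison in Example #3 hinges on split_sentences. A naive sketch, assuming it splits on sentence-ending punctuation; the real helper likely handles CJK punctuation and more edge cases:

import re

_SENTENCE_END = re.compile(r'[.!?。！？]+')

def split_sentences(text):
    # Split on common sentence terminators and drop empty fragments.
    parts = _SENTENCE_END.split(text or '')
    return [p.strip() for p in parts if p.strip()]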
Example #4
def _get_storys(entries: list):
    storys = deque(maxlen=300)  # limit num storys
    while entries:
        data = entries.pop()
        story = {}
        content = ''
        if data["content"]:
            # both content and summary may appear in the content list; pick the longest
            for x in data["content"]:
                value = x["value"]
                if value and len(value) > len(content):
                    content = value
        if not content:
            content = data["description"]
        if not content:
            content = data["summary"]
        story['has_mathjax'] = story_has_mathjax(content)
        link = normlize_url(data["link"])
        valid_link = ''
        if link:
            try:
                valid_link = validate_url(link)
            except Invalid:
                LOG.warning(f'invalid story link {link!r}')
        story['link'] = valid_link
        content = story_html_clean(content)
        if len(content) >= 1024 * 1024:
            msg = 'too large story link=%r content length=%s, will only save plain text!'
            LOG.warning(msg, link, len(content))
            content = story_html_to_text(content)
        content = process_story_links(content, valid_link)
        story['content'] = content
        summary = data["summary"]
        if not summary:
            summary = content
        summary = shorten(story_html_to_text(summary), width=300)
        story['summary'] = summary
        title = shorten(data["title"] or link or summary, 200)
        unique_id = shorten(data['id'] or link or title, 200)
        content_hash_base64 = compute_hash_base64(content, summary, title)
        story['title'] = title
        story['content_hash_base64'] = content_hash_base64
        story['unique_id'] = unique_id
        story['author'] = shorten(data["author"], 200)
        story['dt_published'] = _get_dt_published(data)
        story['dt_updated'] = _get_dt_updated(data)
        storys.append(story)
    return list(storys)
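Example #4 flags stories that need MathJax via story_has_mathjax. A rough sketch, assuming it only looks for common TeX delimiters in the raw HTML (the actual detector may use different rules):

import re

_MATHJAX_PATTERN = re.compile(r'\$\$.+?\$\$|\\\(.+?\\\)|\\\[.+?\\\]', re.S)

def story_has_mathjax(content):
    # True when the content contains $$...$$, \(...\) or \[...\] blocks.
    return bool(content) and bool(_MATHJAX_PATTERN.search(content))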
Example #5
def do_process_story_webpage(
        ctx: ActorContext,
        story_id: T.int,
        url: T.url,
        text: T.str.maxlen(5 * 1024 * 1024),
):
    # https://github.com/dragnet-org/dragnet
    # https://github.com/misja/python-boilerpipe
    # https://github.com/dalab/web2text
    # https://github.com/grangier/python-goose
    # https://github.com/buriy/python-readability
    # https://github.com/codelucas/newspaper
    text = text.strip()
    if not text:
        return
    content = story_readability(text)
    content = process_story_links(content, url)
    summary = shorten(story_html_to_text(content), width=300)
    if not summary:
        return
    ctx.hope(
        'harbor_rss.update_story',
        dict(
            story_id=story_id,
            content=content,
            summary=summary,
            url=url,
        ))
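Both webpage handlers rewrite in-story links with process_story_links. A simplified sketch, assuming its core job is resolving relative href/src attributes against the story URL; the real version most likely works on a parsed HTML tree rather than a regex:

import re
from urllib.parse import urljoin

_LINK_ATTR = re.compile(r'\b(href|src)="([^"]+)"')

def process_story_links(content, base_url):
    # Resolve relative links in href/src attributes against base_url.
    if not content or not base_url:
        return content
    def _absolutize(match):
        return '{}="{}"'.format(match.group(1), urljoin(base_url, match.group(2)))
    return _LINK_ATTR.sub(_absolutize, content)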
Example #6
 def _parse_story(self, story: dict, feed_url: str):
     ident = story['ident'][:200]
     title = story_html_to_text(story['title'])[:200]
     url = normlize_url(story['url'] or story['ident'], base_url=feed_url)
     try:
         valid_url = validate_url(url)
     except Invalid:
         valid_url = None
     base_url = valid_url or feed_url
     image_url = normlize_url(story['image_url'], base_url=base_url)
     author_name = story_html_to_text(story['author_name'])[:100]
     author_url = normlize_url(story['author_url'], base_url=base_url)
     author_avatar_url = normlize_url(story['author_avatar_url'],
                                      base_url=base_url)
     content = self._process_content(story['content'], link=base_url)
     summary = story_html_clean(story['summary'])
     summary = shorten(story_html_to_text(summary), width=300)
     has_mathjax = story_has_mathjax(content)
     return dict(
         ident=ident,
         title=title,
         url=valid_url,
         content=content,
         summary=summary,
         has_mathjax=has_mathjax,
         image_url=image_url,
         dt_published=story['dt_published'],
         dt_updated=story['dt_updated'],
         author_name=author_name,
         author_url=author_url,
         author_avatar_url=author_avatar_url,
     )
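Example #6 validates URLs with validate_url and catches Invalid; the T.int / T.url schema syntax elsewhere suggests these come from the validr library, but that is an inference. A self-contained stand-in for experimenting with these snippets:

import re

class Invalid(ValueError):
    """Raised when a value fails validation (stand-in for the real exception)."""

_URL_PATTERN = re.compile(r'^https?://[^\s<>"]+$')

def validate_url(url):
    # Accept only absolute http(s) URLs; raise Invalid otherwise.
    if not url or not _URL_PATTERN.match(url):
        raise Invalid('invalid url: {!r}'.format(url))
    return url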
Example #7
def _do_parse(
    url: str,
    printer,
    checksum,
    save_checksum,
    proxy_url,
    rss_proxy_url,
    rss_proxy_token,
):
    if not url.startswith('http://') and not url.startswith('https://'):
        response_file = FeedResponseFile(url)
        response = response_file.read()
    else:
        reader = FeedReader(
            proxy_url=proxy_url,
            rss_proxy_url=rss_proxy_url,
            rss_proxy_token=rss_proxy_token,
        )
        with reader:
            response = reader.read(url, use_proxy=reader.has_proxy)
    print('-> {}'.format(response))
    if not response.ok:
        return
    if checksum:
        with open(checksum, 'rb') as f:
            data = f.read()
        checksum = FeedChecksum.load(data)
        print('-> {}'.format(checksum))
    else:
        checksum = None
    raw_result = RawFeedParser().parse(response)
    if raw_result.warnings:
        print('Warning: ' + '; '.join(raw_result.warnings))
    result = FeedParser(checksum=checksum).parse(raw_result)
    print("-> {}".format(result))
    printer('-' * 79)
    printer(pretty_format_json(result.feed))
    for i, story in enumerate(result.storys):
        printer('{:03d}{}'.format(i, '-' * 76))
        story['content'] = shorten(story['content'], 60)
        story['summary'] = shorten(story['summary'], 60)
        printer(pretty_format_json(story))
    if save_checksum:
        print('-> save {}'.format(save_checksum))
        data = result.checksum.dump()
        with open(save_checksum, 'wb') as f:
            f.write(data)
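The CLI helpers in Examples #1 and #7 print results through pretty_format_json. A likely shape, assuming it is a thin wrapper over json.dumps (the indentation and ASCII handling are guesses):

import json

def pretty_format_json(data):
    # Dump dict-like results as readable JSON without escaping non-ASCII;
    # default=str keeps datetime fields printable.
    return json.dumps(data, ensure_ascii=False, indent=4, default=str)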
Example #8
def do_process_story_webpage(
    ctx: ActorContext,
    story_id: T.int,
    url: T.url,
    text: T.str.maxlen(_MAX_STORY_HTML_LENGTH),
    num_sub_sentences: T.int.optional,
):
    # https://github.com/dragnet-org/dragnet
    # https://github.com/misja/python-boilerpipe
    # https://github.com/dalab/web2text
    # https://github.com/grangier/python-goose
    # https://github.com/buriy/python-readability
    # https://github.com/codelucas/newspaper
    text = text.strip()
    if not text:
        return
    text = story_html_clean(text)
    content = story_readability(text)
    content = process_story_links(content, url)
    if len(content) > _MAX_STORY_CONTENT_LENGTH:
        msg = 'too large story#%s size=%s url=%r, will only save plain text'
        LOG.warning(msg, story_id, len(content), url)
        content = shorten(story_html_to_text(content),
                          width=_MAX_STORY_CONTENT_LENGTH)
    # if the fetched content is shorter than the RSS content, it is not the real full text
    if num_sub_sentences is not None:
        if not is_fulltext_content(content):
            num_sentences = len(split_sentences(story_html_to_text(content)))
            if num_sentences <= num_sub_sentences:
                msg = 'fetched story#%s url=%s num_sentences=%s less than num_sub_sentences=%s'
                LOG.info(msg, story_id, url, num_sentences, num_sub_sentences)
                return
    summary = shorten(story_html_to_text(content),
                      width=_MAX_STORY_SUMMARY_LENGTH)
    if not summary:
        return
    ctx.hope(
        'harbor_rss.update_story',
        dict(
            story_id=story_id,
            content=content,
            summary=summary,
            url=url,
        ))
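Example #8 (and Example #3, via a StoryContentInfo wrapper) gates the full-text decision on is_fulltext_content. A deliberately crude stand-in, assuming a bare length threshold; the real heuristic in rssant is certainly more involved than this:

def is_fulltext_content(content, min_length=2000):
    # Treat sufficiently long extracted text as "full text". Accepts either
    # a plain string (Example #8) or an object with a .text attribute
    # (Example #3's StoryContentInfo).
    text = content if isinstance(content, str) else getattr(content, 'text', '') or ''
    return len(text) >= min_length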
Example #9
 def _parse_story(self, story: dict, feed_url: str):
     ident = story['ident'][:200]
     title = story_html_to_text(story['title'])[:200]
     if not title:
         title = ident  # when title is empty after clean
     url = normalize_url(story['url'] or story['ident'], base_url=feed_url)
     try:
         valid_url = validate_url(url)
     except Invalid:
         valid_url = None
     base_url = valid_url or feed_url
     image_url = normalize_url(story['image_url'], base_url=base_url)
     audio_url = normalize_url(story['audio_url'], base_url=base_url)
     author_name = story_html_to_text(story['author_name'])[:100]
     author_url = normalize_url(story['author_url'], base_url=base_url)
     author_avatar_url = normalize_url(story['author_avatar_url'],
                                       base_url=base_url)
     iframe_url = None
     content, attach = self._process_content(story['content'],
                                             link=base_url)
     if attach:
         iframe_url = attach.iframe_url
         if (not audio_url) and attach.audio_url:
             audio_url = attach.audio_url
         if (not image_url) and attach.image_url:
             image_url = attach.image_url
     if story['summary']:
         summary = story_html_clean(story['summary'])
     else:
         summary = content
     summary = shorten(story_html_to_text(summary),
                       width=_MAX_SUMMARY_LENGTH)
     # TODO: summary with links
     has_mathjax = story_has_mathjax(content)
     return dict(
         ident=ident,
         title=title,
         url=valid_url,
         content=content,
         summary=summary,
         has_mathjax=has_mathjax,
         image_url=image_url,
         iframe_url=iframe_url,
         audio_url=audio_url,
         dt_published=story['dt_published'],
         dt_updated=story['dt_updated'],
         author_name=author_name,
         author_url=author_url,
         author_avatar_url=author_avatar_url,
     )
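Examples #6 and #9 normalize every URL-like field against a base URL (normlize_url in the older snippet, normalize_url in the newer one). A minimal sketch, assuming the core job is joining relative URLs and trimming whitespace:

from urllib.parse import urljoin

def normalize_url(url, base_url=None):
    # Resolve a possibly-relative URL against base_url; return falsy
    # input unchanged so callers can pass missing fields straight through.
    if not url:
        return url
    url = url.strip()
    if base_url:
        url = urljoin(base_url, url)
    return url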
Example #10
File: rss.py Project: XZYCR7/rssant
def _get_storys(entries: list):
    storys = deque(maxlen=300)  # limit num storys
    while entries:
        data = entries.pop()
        story = {}
        story['unique_id'] = shorten(_get_story_unique_id(data), 200)
        content = ''
        if data["content"]:
            # both content and summary may appear in the content list; pick the longest
            for x in data["content"]:
                value = x["value"]
                if value and len(value) > len(content):
                    content = value
        if not content:
            content = data["description"]
        if not content:
            content = data["summary"]
        story['has_mathjax'] = story_has_mathjax(content)
        content = story_html_clean(content)
        content = process_story_links(content, data["link"])
        story['content'] = content
        summary = data["summary"]
        if not summary:
            summary = content
        # TODO: performance
        summary = shorten(story_html_to_text(summary), width=300)
        story['summary'] = summary
        story['link'] = data["link"]
        title = shorten(data["title"] or story['link'] or story['unique_id'], 200)
        content_hash_base64 = compute_hash_base64(content, summary, title)
        story['title'] = title
        story['content_hash_base64'] = content_hash_base64
        story['author'] = shorten(data["author"], 200)
        story['dt_published'] = _get_dt_published(data)
        story['dt_updated'] = _get_dt_updated(data)
        storys.append(story)
    return list(storys)