Beispiel #1
0
    def test_delete_by_retention(self):
        """delete_by_retention removes at most `limit` storys per call,
        and never touches the feed's total story counter."""
        # Seed offsets 0-30 through the old Story storage (no story infos).
        first_batch = self.storys[:30]
        saved = Story.bulk_save_by_feed(
            self.feed_id, first_batch, batch_size=10)
        self.assertEqual(len(saved), 30)
        self.assert_feed_total_storys(30)
        self.assert_total_story_infos(0)

        # Overlap 20-30 and extend to 50 via the new service:
        # 20 new storys, 20 new story infos.
        second_batch = self.storys[20:50]
        saved = STORY_SERVICE.bulk_save_by_feed(
            self.feed_id, second_batch, batch_size=10)
        self.assertEqual(len(saved), 20)
        self.assert_feed_total_storys(50)
        self.assert_total_story_infos(20)

        # First pass is capped by limit=10.
        num_deleted = STORY_SERVICE.delete_by_retention(
            self.feed_id, retention=10, limit=10)
        self.assertEqual(num_deleted, 10)
        self.assert_feed_total_storys(50)
        self.assert_total_story_infos(20)

        # Second pass removes the rest down to the retention count;
        # total_storys stays at 50 while infos drop to 10.
        num_deleted = STORY_SERVICE.delete_by_retention(
            self.feed_id, retention=10, limit=50)
        self.assertEqual(num_deleted, 30)
        self.assert_feed_total_storys(50)
        self.assert_total_story_infos(10)
Beispiel #2
0
def do_update_story(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
    content: T.str,
    summary: T.str,
    has_mathjax: T.bool.optional,
    url: T.url,
):
    """Store fetched content on a story if it qualifies as fulltext,
    then kick off image detection for the story."""
    story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    if not story:
        LOG.error('story#%s,%s not found', feed_id, offset)
        return
    if not is_fulltext_content(content):
        # Content does not look like fulltext on its own; accept it only
        # when the existing story content reads as a summary of it.
        old_text = processor.story_html_to_text(story.content)
        new_text = processor.story_html_to_text(content)
        if not is_summary(old_text, new_text):
            msg = 'fetched story#%s,%s url=%r is not fulltext of feed story content'
            LOG.info(msg, feed_id, offset, url)
            return
    STORY_SERVICE.update_story(feed_id, offset, dict(
        link=url,
        content=content,
        summary=summary,
        has_mathjax=has_mathjax,
    ))
    _detect_story_images(ctx, story)
Beispiel #3
0
def _update_story(
    story: CommonStory,
    story_content_info: StoryContentInfo,
    content: str,
    summary: str,
    url: str,
    has_mathjax: bool = None,
    sentence_count: int = None
) -> FulltextAcceptStrategy:
    """Apply fetched fulltext to *story* according to the accept strategy.

    Returns the strategy decided for the new content; on REJECT the
    story is left untouched.
    """
    accept = decide_accept_fulltext(StoryContentInfo(content), story_content_info)
    if accept == FulltextAcceptStrategy.REJECT:
        msg = 'fetched story#%s,%s url=%r is not fulltext of feed story content'
        LOG.info(msg, story.feed_id, story.offset, url)
    else:
        if accept == FulltextAcceptStrategy.APPEND:
            # Keep the old content and append the new one below a rule.
            content = (story.content or '') + '\n<hr/>\n' + (content or '')
        STORY_SERVICE.update_story(story.feed_id, story.offset, dict(
            link=url,
            content=content,
            summary=summary,
            has_mathjax=has_mathjax,
            sentence_count=sentence_count,
        ))
    return accept
Beispiel #4
0
    def test_story_dt_and_content_length(self):
        """Verify dt_created and dt_published are preserved across updates
        while dt_updated and content_length track the newest saved content."""
        dt = timezone.datetime(2019, 6, 1, 12, 12, 12, tzinfo=timezone.utc)
        # fix: dropped useless f-prefix on literals without placeholders (F541)
        story = {
            'unique_id': 'blog.example.com/1',
            'title': 'test story 1',
            'dt_published': dt,
            'dt_updated': dt,
        }
        modified = Story.bulk_save_by_feed(self.feed_id,
                                           [validate_story(story)],
                                           batch_size=10)
        self.assertEqual(len(modified), 1)
        self.assert_feed_total_storys(1)
        self.assert_total_story_infos(0)
        # Remember first-save timestamps; later updates must not change them.
        dt_created = modified[0].dt_created
        dt_published = modified[0].dt_published
        assert modified[0].dt_updated == dt

        # Update content one day later via the new service.
        dt = dt + timezone.timedelta(days=1)
        updated_content = 'updated_content 1'
        story.update(
            content=updated_content,
            content_hash_base64=compute_hash_base64(updated_content),
            dt_published=dt,
            dt_updated=dt,
        )
        modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id,
                                                   [validate_story(story)],
                                                   batch_size=10)
        self.assertEqual(len(modified), 1)
        self.assert_feed_total_storys(1)
        self.assert_total_story_infos(1)
        assert modified[0].dt_created == dt_created
        assert modified[0].dt_published == dt_published
        assert modified[0].dt_updated == dt
        assert modified[0].content_length == len(updated_content)

        # Second update with longer content: same invariants hold.
        dt = dt + timezone.timedelta(days=2)
        updated_content = 'updated_content 22'
        story.update(
            content=updated_content,
            content_hash_base64=compute_hash_base64(updated_content),
            dt_published=dt,
            dt_updated=dt,
        )
        modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id,
                                                   [validate_story(story)],
                                                   batch_size=10)
        self.assertEqual(len(modified), 1)
        self.assert_feed_total_storys(1)
        self.assert_total_story_infos(1)
        assert modified[0].dt_created == dt_created
        assert modified[0].dt_published == dt_published
        assert modified[0].dt_updated == dt
        assert modified[0].content_length == len(updated_content)
Beispiel #5
0
    def test_update_story(self):
        """update_story accepts both a full payload and a content-only one."""
        # Seed offsets 0-20 via the new service.
        saved = STORY_SERVICE.bulk_save_by_feed(
            self.feed_id, self.storys[:20], batch_size=10)
        self.assertEqual(len(saved), 20)
        self.assert_feed_total_storys(20)
        self.assert_total_story_infos(20)

        # Update story at offset 10 with content, summary and dt_published.
        source = self.updated_storys[10]
        full_data = {key: source[key] for key in ['content', 'summary', 'dt_published']}
        STORY_SERVICE.update_story(self.feed_id, 10, full_data)
        # A partial payload with only content must also be accepted.
        STORY_SERVICE.update_story(self.feed_id, 10, {'content': full_data['content']})
Beispiel #6
0
def do_clean_by_retention(ctx: ActorContext):
    """Trim storys of feeds that exceed the configured retention.

    Processes up to 50 feeds per run; each feed is trimmed down to
    `feed_story_retention` storys by STORY_SERVICE.delete_by_retention.
    """
    retention = CONFIG.feed_story_retention
    feeds = Feed.take_retention_feeds(retention=retention, limit=50)
    # fix: lazy %-style logging args instead of eager .format()/f-string,
    # consistent with the other LOG calls in this module.
    LOG.info('found %s feeds need clean by retention', len(feeds))
    for feed in feeds:
        feed_id = feed['feed_id']
        url = feed['url']
        n = STORY_SERVICE.delete_by_retention(feed_id, retention=retention)
        LOG.info('deleted %s storys of feed#%s %s by retention', n, feed_id, url)
Beispiel #7
0
def do_sync_story_fulltext(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
) -> T.dict(
        feed_id=T.int,
        offset=T.int.min(0),
        use_proxy=T.bool,
        url=T.url,
        response_status=T.int,
        accept=T_ACCEPT,
):
    """Fetch a story's fulltext via worker_rss and store it if accepted.

    Returns a dict with the story identity, whether a proxy was used,
    the response status, and the FulltextAcceptStrategy value that was
    applied (REJECT when nothing was saved).
    """
    with log_django_context_metric('harbor_rss.sync_story_fulltext:read'):
        feed = Feed.get_by_pk(feed_id, detail='+use_proxy')
        story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    assert story, f'story#{feed_id},{offset} not found'
    story_content_info = StoryContentInfo(story.content)
    # Sentence count of the current content, passed to the worker so it
    # can compare the fetched page against what we already have.
    num_sub_sentences = len(split_sentences(story_content_info.text))
    # Default result: accept=REJECT until the fetched content is accepted.
    ret = dict(
        feed_id=feed_id,
        offset=offset,
        url=story.link,
        use_proxy=feed.use_proxy,
        accept=FulltextAcceptStrategy.REJECT.value,
    )
    try:
        result = ctx.ask(
            'worker_rss.fetch_story',
            dict(
                url=story.link,
                use_proxy=feed.use_proxy,
                feed_id=feed_id,
                offset=offset,
                num_sub_sentences=num_sub_sentences,
            ))
    except _TIMEOUT_ERRORS as ex:
        # Report ask() timeouts as a connection timeout on the story fetch.
        LOG.error(f'Ask worker_rss.fetch_story timeout: {ex}')
        ret.update(response_status=FeedResponseStatus.CONNECTION_TIMEOUT)
        return ret
    else:
        ret.update(
            response_status=result['response_status'],
            use_proxy=result['use_proxy'],
        )
        if not result['content']:
            # Nothing was fetched; return the status with accept=REJECT.
            return ret
    with log_django_context_metric('harbor_rss.sync_story_fulltext:write'):
        accept = _update_story(
            story=story,
            story_content_info=story_content_info,
            content=result['content'],
            summary=None,  # no need to update the summary here
            url=result['url'],
            sentence_count=result['sentence_count'],
        )
        ret.update(accept=accept.value)
    return ret
Beispiel #8
0
def _replace_story_images(feed_id, offset):
    """Rewrite referer-denied image urls in a story to go through the
    image proxy endpoint, then save the processed content."""
    story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    image_processor = StoryImageProcessor(story.link, story.content)
    image_indexs = image_processor.parse()
    image_urls = _image_urls_of_indexs(image_indexs)
    if not image_urls:
        # No images in the story content, nothing to do.
        return
    image_statuses = ImageInfo.batch_detect_images(image_urls)
    image_replaces = {}
    for detected_url, detected_status in image_statuses.items():
        if detected_status not in IMAGE_REFERER_DENY_STATUS:
            continue
        # Route referer-denied images through our proxy endpoint.
        new_url_data = encode_image_url(detected_url, story.link)
        image_replaces[detected_url] = '/api/v1/image/{}?{}'.format(new_url_data, RSSANT_IMAGE_TAG)
    LOG.info(f'story#{feed_id},{offset} {story.link} '
             f'replace {len(image_replaces)} referer deny images')
    # process() both fixes relative urls and applies the replacements,
    # so it runs even when image_replaces is empty.
    content = image_processor.process(image_indexs, image_replaces)
    STORY_SERVICE.update_story(feed_id, offset, {'content': content})
Beispiel #9
0
def _create_test_story(
    feed: Feed,
    ident: str,
    title: str,
    content: str,
    summary: str = None,
    **kwargs,
):
    """Build a story entry for *feed* and persist it via STORY_SERVICE.

    When *summary* is omitted, the first 80 characters of the content
    are used instead.
    """
    if summary is None:
        summary = content[:80]
    entry = dict(
        ident=ident,
        title=title,
        url=urljoin(feed.url, f'/story/{ident}'),
        content=content,
        summary=summary,
        **kwargs,
    )
    story = get_story_of_feed_entry(validate_story(entry))
    STORY_SERVICE.bulk_save_by_feed(feed.id, [story], is_refresh=True)
Beispiel #10
0
    def test_bulk_save_by_feed_refresh(self):
        """Re-saving identical storys is a no-op unless is_refresh is set."""
        batch = self.storys[:20]

        # Initial save: all 20 storys are new.
        saved = STORY_SERVICE.bulk_save_by_feed(
            self.feed_id, batch, batch_size=10)
        self.assertEqual(len(saved), 20)
        self.assert_feed_total_storys(20)
        self.assert_total_story_infos(20)

        # Same data again: nothing counts as modified.
        saved = STORY_SERVICE.bulk_save_by_feed(
            self.feed_id, batch, batch_size=10)
        self.assertEqual(len(saved), 0)
        self.assert_feed_total_storys(20)
        self.assert_total_story_infos(20)

        # With is_refresh=True every story is rewritten even if unchanged.
        saved = STORY_SERVICE.bulk_save_by_feed(
            self.feed_id, batch, batch_size=10, is_refresh=True)
        self.assertEqual(len(saved), 20)
        self.assert_feed_total_storys(20)
        self.assert_total_story_infos(20)
Beispiel #11
0
    def test_new_bulk_save_by_feed(self):
        """bulk_save_by_feed counts only new or changed storys as modified."""
        # Offsets 0-30: everything is new.
        saved = STORY_SERVICE.bulk_save_by_feed(
            self.feed_id, self.storys[:30], batch_size=10)
        self.assertEqual(len(saved), 30)
        self.assert_feed_total_storys(30)
        self.assert_total_story_infos(30)

        # Offsets 20-50: 20-30 are unchanged, only 30-50 count as modified.
        saved = STORY_SERVICE.bulk_save_by_feed(
            self.feed_id, self.storys[20:50], batch_size=10)
        self.assertEqual(len(saved), 20)
        self.assert_feed_total_storys(50)
        self.assert_total_story_infos(50)

        # Offsets 30-50 with changed content: all modified, totals stable.
        saved = STORY_SERVICE.bulk_save_by_feed(
            self.feed_id, self.updated_storys[30:50], batch_size=10)
        self.assertEqual(len(saved), 20)
        self.assert_feed_total_storys(50)
        self.assert_total_story_infos(50)
Beispiel #12
0
    def test_mix_bulk_save_by_feed(self):
        """Old Story storage and STORY_SERVICE can be mixed on one feed."""
        # Seed offsets 0-30 through the old storage path (no story infos).
        saved = Story.bulk_save_by_feed(
            self.feed_id, self.storys[:30], batch_size=10)
        self.assertEqual(len(saved), 30)
        self.assert_feed_total_storys(30)
        self.assert_total_story_infos(0)

        # 10-30 updated plus 30-50 new: all 40 modified via the service.
        mixed_batch = self.updated_storys[10:30] + self.storys[30:50]
        saved = STORY_SERVICE.bulk_save_by_feed(
            self.feed_id, mixed_batch, batch_size=10)
        self.assertEqual(len(saved), 40)
        self.assert_feed_total_storys(50)
        self.assert_total_story_infos(40)

        # 40-60: only 50-60 are new; story infos grow to 50.
        saved = STORY_SERVICE.bulk_save_by_feed(
            self.feed_id, self.storys[40:60], batch_size=10)
        self.assertEqual(len(saved), 10)
        self.assert_feed_total_storys(60)
        self.assert_total_story_infos(50)
Beispiel #13
0
def do_update_story(ctx: ActorContext, feed_id: T.int, offset: T.int,
                    content: T.str, summary: T.str,
                    has_mathjax: T.bool.optional, url: T.url,
                    response_status: T.int.optional,
                    sentence_count: T.int.min(0).optional):
    """Load the story at (feed_id, offset) and apply the fetched content
    to it through the shared _update_story helper."""
    story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    if not story:
        # Nothing to update; the story may have been deleted meanwhile.
        LOG.error('story#%s,%s not found', feed_id, offset)
        return
    _update_story(
        story=story,
        story_content_info=StoryContentInfo(story.content),
        url=url,
        content=content,
        summary=summary,
        sentence_count=sentence_count,
        has_mathjax=has_mathjax,
    )
Beispiel #14
0
def do_update_feed(
        ctx: ActorContext,
        feed_id: T.int,
        feed: FeedSchema,
        is_refresh: T.bool.default(False),
):
    """Persist a fetched feed and its storys, then schedule fulltext fetches.

    Feed fields are updated inside one transaction; storys are saved
    afterwards because bulk_save_by_feed has its own transaction.
    """
    with transaction.atomic():
        feed_dict = feed
        # Storys are handled separately after the feed row is saved.
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            # FIXME: feed merge cannot correctly handle feed redirects.
            # For that case, keep the old feed for now; solve it properly later.
            # if target_feed:
            #     LOG.info(f'merge feed#{feed.id} url={feed.url} into '
            #              f'feed#{target_feed.id} url={target_feed.url}')
            #     target_feed.merge(feed)
            #     return
            if target_feed:
                LOG.warning(
                    f'FIXME: redirect feed#{feed.id} url={feed.url!r} into '
                    f'feed#{target_feed.id} url={target_feed.url!r}')
                # Keep the old url to avoid colliding with target_feed.
                feed_dict.pop('url')
        # only update dt_updated if has storys or feed fields updated
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            # Skip empty values, except 'warnings' which may be cleared.
            if (v != '' and v is not None) or k in {'warnings'}:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                    setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now, not trust rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.reverse_url = reverse_url(feed.url)
        feed.status = FeedStatus.READY
        feed.save()
    # save storys, bulk_save_by_feed has standalone transaction
    for s in storys:
        if not s['dt_updated']:
            s['dt_updated'] = now
        if not s['dt_published']:
            # set dt_published to now - 30d to avoid these storys
            # take over mushroom page, i.e. Story.query_recent_by_user
            s['dt_published'] = now_sub_30d
    modified_storys = STORY_SERVICE.bulk_save_by_feed(feed.id,
                                                      storys,
                                                      is_refresh=is_refresh)
    LOG.info('feed#%s save storys total=%s num_modified=%s', feed.id,
             len(storys), len(modified_storys))
    feed = Feed.get_by_pk(feed_id)
    # Unfreeze the feed when new storys arrived and it is currently frozen.
    is_freezed = feed.freeze_level is None or feed.freeze_level > 1
    if modified_storys and is_freezed:
        Feed.unfreeze_by_id(feed_id)
    need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
    for story in modified_storys:
        if not story.link:
            continue
        # Ask the worker to fetch fulltext for storys that look truncated.
        if need_fetch_story and (not _is_fulltext_story(story)):
            text = processor.story_html_to_text(story.content)
            num_sub_sentences = len(split_sentences(text))
            ctx.tell(
                'worker_rss.fetch_story',
                dict(
                    url=story.link,
                    use_proxy=feed.use_proxy,
                    feed_id=story.feed_id,
                    offset=story.offset,
                    num_sub_sentences=num_sub_sentences,
                ))