Example 1
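A one-off migration script: inside a single transaction it collects every feed id, then re-saves each feed's old storys through Story.bulk_save_by_feed, with a tqdm progress bar.
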
# Assumed imports; Feed, Story and query_old_storys_by_feed come from the app.
import logging
import tqdm
from django.db import transaction

LOG = logging.getLogger(__name__)

def run():
    with transaction.atomic():
        feed_ids = [feed.id for feed in Feed.objects.only('id').all()]
        LOG.info('total %s feeds', len(feed_ids))
        for feed_id in tqdm.tqdm(feed_ids, ncols=80, ascii=True):
            storys = query_old_storys_by_feed(feed_id)
            Story.bulk_save_by_feed(feed_id, storys)
Example 2
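A test of STORY_SERVICE.delete_by_retention. It saves 50 storys through both the legacy Story path and the newer STORY_SERVICE path, then checks that each delete_by_retention call removes at most limit of the oldest storys beyond the newest retention and returns how many it removed.
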
    def test_delete_by_retention(self):
        storys_0_30 = self.storys[:30]
        modified = Story.bulk_save_by_feed(self.feed_id,
                                           storys_0_30,
                                           batch_size=10)
        self.assertEqual(len(modified), 30)
        self.assert_feed_total_storys(30)
        self.assert_total_story_infos(0)

        storys_20_50 = self.storys[20:50]
        modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id,
                                                   storys_20_50,
                                                   batch_size=10)
        self.assertEqual(len(modified), 20)
        self.assert_feed_total_storys(50)
        self.assert_total_story_infos(20)

        n = STORY_SERVICE.delete_by_retention(self.feed_id,
                                              retention=10,
                                              limit=10)
        self.assertEqual(n, 10)
        self.assert_feed_total_storys(50)
        self.assert_total_story_infos(20)

        n = STORY_SERVICE.delete_by_retention(self.feed_id,
                                              retention=10,
                                              limit=50)
        self.assertEqual(n, 30)
        self.assert_feed_total_storys(50)
        self.assert_total_story_infos(10)
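
The numbers in this test follow one simple rule. Below is a minimal pure-Python model of it, inferred from the assertions alone rather than the project's implementation; it assumes storys are ordered oldest-first and that deleting old storys does not shrink the feed's total story count:

def delete_by_retention_model(storys, retention, limit):
    # Keep the newest `retention` storys; delete at most `limit` of the
    # older ones, oldest first, and return how many were deleted.
    num_deletable = max(0, len(storys) - retention)
    num_deleted = min(num_deletable, limit)
    del storys[:num_deleted]
    return num_deleted

remaining = list(range(50))  # 50 storys saved above
assert delete_by_retention_model(remaining, 10, 10) == 10
assert delete_by_retention_model(remaining, 10, 50) == 30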
Example 3
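A test of timestamp and content-length handling across repeated saves of the same story: dt_created and dt_published keep the values from the first save, while dt_updated and content_length follow the latest content.
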
    def test_story_dt_and_content_length(self):
        dt = timezone.datetime(2019, 6, 1, 12, 12, 12, tzinfo=timezone.utc)
        story = {
            'unique_id': 'blog.example.com/1',
            'title': 'test story 1',
            'dt_published': dt,
            'dt_updated': dt,
        }
        modified = Story.bulk_save_by_feed(self.feed_id,
                                           [validate_story(story)],
                                           batch_size=10)
        self.assertEqual(len(modified), 1)
        self.assert_feed_total_storys(1)
        self.assert_total_story_infos(0)
        dt_created = modified[0].dt_created
        dt_published = modified[0].dt_published
        assert modified[0].dt_updated == dt

        dt = dt + timezone.timedelta(days=1)
        updated_content = 'updated_content 1'
        story.update(
            content=updated_content,
            content_hash_base64=compute_hash_base64(updated_content),
            dt_published=dt,
            dt_updated=dt,
        )
        modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id,
                                                   [validate_story(story)],
                                                   batch_size=10)
        self.assertEqual(len(modified), 1)
        self.assert_feed_total_storys(1)
        self.assert_total_story_infos(1)
        assert modified[0].dt_created == dt_created
        assert modified[0].dt_published == dt_published
        assert modified[0].dt_updated == dt
        assert modified[0].content_length == len(updated_content)

        dt = dt + timezone.timedelta(days=2)
        updated_content = 'updated_content 22'
        story.update(
            content=updated_content,
            content_hash_base64=compute_hash_base64(updated_content),
            dt_published=dt,
            dt_updated=dt,
        )
        modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id,
                                                   [validate_story(story)],
                                                   batch_size=10)
        self.assertEqual(len(modified), 1)
        self.assert_feed_total_storys(1)
        self.assert_total_story_infos(1)
        assert modified[0].dt_created == dt_created
        assert modified[0].dt_published == dt_published
        assert modified[0].dt_updated == dt
        assert modified[0].content_length == len(updated_content)
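
The assertions encode a merge rule for repeated saves of the same unique_id. A minimal sketch of that rule, inferred from this test rather than taken from the project source:

def merge_story_model(old, new):
    # dt_created and dt_published keep the values from the first save;
    # dt_updated and content_length always track the newest content.
    merged = {**old, **new}
    merged['dt_created'] = old['dt_created']
    merged['dt_published'] = old['dt_published']
    merged['content_length'] = len(new.get('content') or '')
    return merged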
Example 4
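An actor handler that applies a freshly fetched FeedSchema to an existing Feed: if the feed url changed and another feed already owns the new url, the two are merged; otherwise non-empty fields are copied onto the feed, missing story dates are back-filled, the storys are bulk-saved, and each modified story is either sent to worker_rss.fetch_story for full text or scanned for images.
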
def do_update_feed(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedSchema,
    is_refresh: T.bool.default(False).desc('Deprecated'),
):
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        for k, v in feed_dict.items():
            if v != '' and v is not None:
                setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if not feed.dt_updated:
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d so these storys don't take
                # over the mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(feed.id, storys)
        LOG.info(
            'feed#%s save storys total=%s num_modified=%s',
            feed.id, len(storys), len(modified_storys)
        )
    feed.refresh_from_db()
    if modified_storys:
        feed.unfreeze()
    need_fetch_story = _is_feed_need_fetch_storys(feed)
    for story in modified_storys:
        if not story.link:
            continue
        if need_fetch_story and (not is_fulltext_story(feed, story)):
            ctx.tell('worker_rss.fetch_story', dict(
                url=story.link,
                story_id=str(story.id)
            ))
        else:
            _detect_story_images(ctx, story)
Example 5
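A test mixing the two storage paths: storys first saved via Story.bulk_save_by_feed can later be updated via STORY_SERVICE.bulk_save_by_feed, and only new or changed storys are reported as modified.
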
    def test_mix_bulk_save_by_feed(self):
        storys_0_30 = self.storys[:30]
        modified = Story.bulk_save_by_feed(self.feed_id,
                                           storys_0_30,
                                           batch_size=10)
        self.assertEqual(len(modified), 30)
        self.assert_feed_total_storys(30)
        self.assert_total_story_infos(0)

        storys_10_50 = self.updated_storys[10:30] + self.storys[30:50]
        modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id,
                                                   storys_10_50,
                                                   batch_size=10)
        self.assertEqual(len(modified), 40)
        self.assert_feed_total_storys(50)
        self.assert_total_story_infos(40)

        storys_40_60 = self.storys[40:60]
        modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id,
                                                   storys_40_60,
                                                   batch_size=10)
        self.assertEqual(len(modified), 10)
        self.assert_feed_total_storys(60)
        self.assert_total_story_infos(50)
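
The modified counts across both storage paths follow a single upsert rule. Here is a minimal in-memory model that reproduces this test's numbers; change detection via content_hash_base64 is an assumption inferred from the other examples, and the real implementation persists to the database in batches of batch_size:

_TABLE = {}  # (feed_id, unique_id) -> story dict

def bulk_save_by_feed_model(feed_id, storys, is_refresh=False):
    modified = []
    for story in storys:
        key = (feed_id, story['unique_id'])
        old = _TABLE.get(key)
        # a story counts as modified when it is new, its content hash
        # changed (assumed criterion), or a refresh is forced
        changed = old is None or \
            old.get('content_hash_base64') != story.get('content_hash_base64')
        if is_refresh or changed:
            _TABLE[key] = dict(story)
            modified.append(story)
    return modified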
Example 6
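A more elaborate revision of the do_update_feed handler from Example 4: it bumps dt_updated only when the feed actually changed, forwards is_refresh to bulk_save_by_feed, and sends use_proxy plus a sentence count along with each fetch_story message.
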
def do_update_feed(
        ctx: ActorContext,
        feed_id: T.int,
        feed: FeedSchema,
        is_refresh: T.bool.default(False),
):
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        # only update dt_updated if there are new storys or feed fields changed
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            if (v != '' and v is not None) or k in {'warnings'}:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                    setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now; don't trust the rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d so these storys don't take
                # over the mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(feed.id,
                                                  storys,
                                                  is_refresh=is_refresh)
        LOG.info('feed#%s save storys total=%s num_modified=%s', feed.id,
                 len(storys), len(modified_storys))
    feed.refresh_from_db()
    if modified_storys:
        feed.unfreeze()
    need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
    for story in modified_storys:
        if not story.link:
            continue
        if need_fetch_story and (not is_fulltext_story(story)):
            text = processor.story_html_to_text(story.content)
            num_sub_sentences = len(split_sentences(text))
            ctx.tell(
                'worker_rss.fetch_story',
                dict(
                    url=story.link,
                    use_proxy=feed.use_proxy,
                    story_id=str(story.id),
                    num_sub_sentences=num_sub_sentences,
                ))
        else:
            _detect_story_images(ctx, story)
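
Taken together, the examples pin down a consistent contract for bulk_save_by_feed: it upserts validated storys for one feed, optionally in batches of batch_size, and returns only the storys it actually created or changed. The same method is exposed both on the legacy Story model and on the newer STORY_SERVICE, and the is_refresh flag in Example 6 presumably forces even unchanged storys to be re-saved.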