Example #1
def run():
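    """Re-save every feed's old storys through Story.bulk_save_by_feed, all in one transaction."""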
    with transaction.atomic():
        feed_ids = [feed.id for feed in Feed.objects.only('id').all()]
        LOG.info('total %s feeds', len(feed_ids))
        for feed_id in tqdm.tqdm(feed_ids, ncols=80, ascii=True):
            storys = query_old_storys_by_feed(feed_id)
            Story.bulk_save_by_feed(feed_id, storys)
Example #2
File: rss.py Project: sun816/rssant
def update_story_is_user_marked():
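    """Set the user-marked flag on every story that a user has watched or favorited."""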
    user_storys = list(
        UserStory.objects.exclude(is_watched=False, is_favorited=False).all())
    LOG.info('total %s user marked storys', len(user_storys))
    if not user_storys:
        return
    for user_story in tqdm.tqdm(user_storys, ncols=80, ascii=True):
        Story.set_user_marked_by_id(user_story.story_id)
Example #3
def update_feed_story_publish_period(feeds=None):
    """
    Deprecated since v3.1
    """
    with transaction.atomic():
        feed_ids = _get_feed_ids(feeds)
        LOG.info('total %s feeds', len(feed_ids))
        for feed_id in tqdm.tqdm(feed_ids, ncols=80, ascii=True):
            Story.update_feed_story_publish_period(feed_id)
Example #4
File: rss.py Project: sun816/rssant
def update_feed_dryness(feeds=None):
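    """Recompute each feed's dryness, refreshing the monthly story count first if it is missing."""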
    feed_ids = _get_feed_ids(feeds)
    LOG.info('total %s feeds', len(feed_ids))
    for feed_id in tqdm.tqdm(feed_ids, ncols=80, ascii=True):
        with transaction.atomic():
            feed = Feed.get_by_pk(feed_id)
            if feed.total_storys <= 0:
                continue
            cnt = feed.monthly_story_count
            if not cnt:
                Story.refresh_feed_monthly_story_count(feed_id)
            feed.refresh_from_db()
            feed.dryness = feed.monthly_story_count.dryness()
            feed.save()
Example #5
File: rss.py Project: sun816/rssant
def fix_feed_total_storys(dry_run=False):
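    """Report feeds whose total_storys is wrong; unless dry_run, fix them in one transaction."""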
    incorrect_feeds = Story.query_feed_incorrect_total_storys()
    LOG.info('total %s incorrect feeds', len(incorrect_feeds))
    header = ['feed_id', 'total_storys', 'correct_total_storys']
    click.echo(format_table(incorrect_feeds, header=header))
    if dry_run:
        return
    with transaction.atomic():
        num_corrected = 0
        for feed_id, *__ in tqdm.tqdm(incorrect_feeds, ncols=80, ascii=True):
            fixed = Story.fix_feed_total_storys(feed_id)
            if fixed:
                num_corrected += 1
        LOG.info('corrected %s feeds', num_corrected)
Example #6
    def test_delete_by_retention(self):
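        # Save storys 0..50, then delete beyond retention=10: each call removes
        # at most `limit` of the storys that exceed the retention window.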
        storys_0_30 = self.storys[:30]
        modified = Story.bulk_save_by_feed(self.feed_id,
                                           storys_0_30,
                                           batch_size=10)
        self.assertEqual(len(modified), 30)
        self.assert_feed_total_storys(30)
        self.assert_total_story_infos(0)

        storys_20_50 = self.storys[20:50]
        modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id,
                                                   storys_20_50,
                                                   batch_size=10)
        self.assertEqual(len(modified), 20)
        self.assert_feed_total_storys(50)
        self.assert_total_story_infos(20)

        n = STORY_SERVICE.delete_by_retention(self.feed_id,
                                              retention=10,
                                              limit=10)
        self.assertEqual(n, 10)
        self.assert_feed_total_storys(50)
        self.assert_total_story_infos(20)

        n = STORY_SERVICE.delete_by_retention(self.feed_id,
                                              retention=10,
                                              limit=50)
        self.assertEqual(n, 30)
        self.assert_feed_total_storys(50)
        self.assert_total_story_infos(10)
Example #7
def do_clean_by_retention(ctx: ActorContext):
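    """Actor task: delete storys beyond the configured retention for each feed due for cleaning."""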
    retention = CONFIG.feed_story_retention
    feeds = Feed.take_retention_feeds(retention=retention)
    LOG.info('found %s feeds to clean by retention', len(feeds))
    for feed in feeds:
        feed_id = feed['feed_id']
        url = feed['url']
        n = Story.delete_by_retention(feed_id, retention=retention)
        LOG.info(f'deleted {n} storys of feed#{feed_id} {url} by retention')
Example #8
def fix_story_offset(feeds=None):
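    """Reallocate story offsets for every feed and count how many feeds needed fixing."""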
    with transaction.atomic():
        feed_ids = _get_feed_ids(feeds)
        LOG.info('total %s feeds', len(feed_ids))
        num_fixed = 0
        for feed_id in tqdm.tqdm(feed_ids, ncols=80, ascii=True):
            num_reallocate = Story.reallocate_offset(feed_id)
            if num_reallocate > 0:
                num_fixed += 1
        LOG.info('fixed %s feeds', num_fixed)
Example #9
    def test_story_dt_and_content_length(self):
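        # dt_created and dt_published must keep their original values across
        # re-saves, while dt_updated and content_length follow each update.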
        dt = timezone.datetime(2019, 6, 1, 12, 12, 12, tzinfo=timezone.utc)
        story = {
            'unique_id': 'blog.example.com/1',
            'title': 'test story 1',
            'dt_published': dt,
            'dt_updated': dt,
        }
        modified = Story.bulk_save_by_feed(self.feed_id,
                                           [validate_story(story)],
                                           batch_size=10)
        self.assertEqual(len(modified), 1)
        self.assert_feed_total_storys(1)
        self.assert_total_story_infos(0)
        dt_created = modified[0].dt_created
        dt_published = modified[0].dt_published
        assert modified[0].dt_updated == dt

        dt = dt + timezone.timedelta(days=1)
        updated_content = 'updated_content 1'
        story.update(
            content=updated_content,
            content_hash_base64=compute_hash_base64(updated_content),
            dt_published=dt,
            dt_updated=dt,
        )
        modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id,
                                                   [validate_story(story)],
                                                   batch_size=10)
        self.assertEqual(len(modified), 1)
        self.assert_feed_total_storys(1)
        self.assert_total_story_infos(1)
        assert modified[0].dt_created == dt_created
        assert modified[0].dt_published == dt_published
        assert modified[0].dt_updated == dt
        assert modified[0].content_length == len(updated_content)

        dt = dt + timezone.timedelta(days=2)
        updated_content = 'updated_content 22'
        story.update(
            content=updated_content,
            content_hash_base64=compute_hash_base64(updated_content),
            dt_published=dt,
            dt_updated=dt,
        )
        modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id,
                                                   [validate_story(story)],
                                                   batch_size=10)
        self.assertEqual(len(modified), 1)
        self.assert_feed_total_storys(1)
        self.assert_total_story_infos(1)
        assert modified[0].dt_created == dt_created
        assert modified[0].dt_published == dt_published
        assert modified[0].dt_updated == dt
        assert modified[0].content_length == len(updated_content)
Example #10
def do_update_feed(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedSchema,
    is_refresh: T.bool.default(False).desc('Deprecated'),
):
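    """Merge the feed into an existing one when its url changed; otherwise update
    its fields, bulk-save the fetched storys, and dispatch story fetching tasks."""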
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        for k, v in feed_dict.items():
            if v != '' and v is not None:
                setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if not feed.dt_updated:
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d to keep these storys from
                # taking over the mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(feed.id, storys)
        LOG.info(
            'feed#%s save storys total=%s num_modified=%s',
            feed.id, len(storys), len(modified_storys)
        )
    feed.refresh_from_db()
    if modified_storys:
        feed.unfreeze()
    need_fetch_story = _is_feed_need_fetch_storys(feed)
    for story in modified_storys:
        if not story.link:
            continue
        if need_fetch_story and (not is_fulltext_story(feed, story)):
            ctx.tell('worker_rss.fetch_story', dict(
                url=story.link,
                story_id=str(story.id)
            ))
        else:
            _detect_story_images(ctx, story)
Example #11
File: rss.py Project: sun816/rssant
def update_feed_dt_first_story_published(feeds=None):
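    """Backfill dt_first_story_published from each feed's story at offset 0."""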
    feed_ids = _get_feed_ids(feeds)
    LOG.info('total %s feeds', len(feed_ids))
    for feed_id in tqdm.tqdm(feed_ids, ncols=80, ascii=True):
        with transaction.atomic():
            feed = Feed.get_by_pk(feed_id)
            if feed.dt_first_story_published:
                continue
            if feed.total_storys <= 0:
                continue
            try:
                story = Story.get_by_offset(feed_id, 0, detail=True)
            except Story.DoesNotExist:
                LOG.warning(f'story feed_id={feed_id} offset=0 does not exist')
                continue
            feed.dt_first_story_published = story.dt_published
            feed.save()
Example #12
    def test_mix_bulk_save_by_feed(self):
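        # bulk_save_by_feed returns only new or changed storys; storys already
        # saved with identical content are skipped.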
        storys_0_30 = self.storys[:30]
        modified = Story.bulk_save_by_feed(self.feed_id,
                                           storys_0_30,
                                           batch_size=10)
        self.assertEqual(len(modified), 30)
        self.assert_feed_total_storys(30)
        self.assert_total_story_infos(0)

        storys_10_50 = self.updated_storys[10:30] + self.storys[30:50]
        modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id,
                                                   storys_10_50,
                                                   batch_size=10)
        self.assertEqual(len(modified), 40)
        self.assert_feed_total_storys(50)
        self.assert_total_story_infos(40)

        storys_40_60 = self.storys[40:60]
        modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id,
                                                   storys_40_60,
                                                   batch_size=10)
        self.assertEqual(len(modified), 10)
        self.assert_feed_total_storys(60)
        self.assert_total_story_infos(50)
Example #13
File: rss.py Project: sun816/rssant
def update_feed_monthly_story_count(feeds=None):
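    """Refresh the cached monthly story count of each feed, one transaction per feed."""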
    feed_ids = _get_feed_ids(feeds)
    LOG.info('total %s feeds', len(feed_ids))
    for feed_id in tqdm.tqdm(feed_ids, ncols=80, ascii=True):
        with transaction.atomic():
            Story.refresh_feed_monthly_story_count(feed_id)
Example #14
def do_update_feed(
        ctx: ActorContext,
        feed_id: T.int,
        feed: FeedSchema,
        is_refresh: T.bool.default(False),
):
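    """Update a feed from fetched data: merge on url change, update only changed
    fields, bulk-save storys (honoring is_refresh), then dispatch story fetching."""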
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        # only update dt_updated when storys arrived or feed fields changed
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            if (v != '' and v is not None) or k in {'warnings'}:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                    setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now; don't trust the rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d to keep these storys from
                # taking over the mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(feed.id,
                                                  storys,
                                                  is_refresh=is_refresh)
        LOG.info('feed#%s save storys total=%s num_modified=%s', feed.id,
                 len(storys), len(modified_storys))
    feed.refresh_from_db()
    if modified_storys:
        feed.unfreeze()
    need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
    for story in modified_storys:
        if not story.link:
            continue
        if need_fetch_story and (not is_fulltext_story(story)):
            text = processor.story_html_to_text(story.content)
            num_sub_sentences = len(split_sentences(text))
            ctx.tell(
                'worker_rss.fetch_story',
                dict(
                    url=story.link,
                    use_proxy=feed.use_proxy,
                    story_id=str(story.id),
                    num_sub_sentences=num_sub_sentences,
                ))
        else:
            _detect_story_images(ctx, story)