Example #1
def synchronize_statsd_articles_gauges(full=False):
    """ synchronize all articles-related gauges on our statsd server. """

    with benchmark('synchronize statsd gauges for Article.*'):

        empty = Article.objects.empty()
        # empty_pending       = empty.filter(content_error='', url_error='')
        # empty_content_error = empty.filter(content_error__ne='')
        # empty_url_error     = empty.filter(url_error__ne='')

        parsed = Article.objects.parsed()
        html = parsed.filter(content_type=CONTENT_TYPES.HTML)
        markdown = parsed.filter(content_type=CONTENT_TYPES.MARKDOWN)

        absolutes = Article.objects.absolute()
        duplicates = Article.objects.duplicate()
        orphaned = Article.objects.orphaned().master()
        content_errors = Article.objects.exclude(content_error=None)
        url_errors = Article.objects.exclude(url_error=None)

        with statsd.pipeline() as spipe:
            spipe.gauge('articles.counts.total', Article.objects.all().count())
            spipe.gauge('articles.counts.markdown', markdown.count())
            spipe.gauge('articles.counts.html', html.count())
            spipe.gauge('articles.counts.empty', empty.count())
            spipe.gauge('articles.counts.content_errors',
                        content_errors.count())
            spipe.gauge('articles.counts.url_errors', url_errors.count())

            if full:
                spipe.gauge('articles.counts.orphaned', orphaned.count())
                spipe.gauge('articles.counts.absolutes', absolutes.count())
                spipe.gauge('articles.counts.duplicates', duplicates.count())
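
A note on the pattern above: all counters are pushed through a module-level statsd client, and a pipeline is used so the gauges go out in a single batch instead of one packet per metric. The snippet below is a minimal, self-contained sketch of that pattern, assuming the `statsd` PyPI client; the host, port, prefix and metric values are illustrative, not 1flow's actual configuration.

import statsd

# Illustrative client configuration (assumed, not taken from 1flow).
client = statsd.StatsClient('localhost', 8125, prefix='oneflow')


def push_article_counts(counts):
    """ Send several gauges in one batch instead of one packet each. """

    with client.pipeline() as spipe:
        for name, value in counts.items():
            spipe.gauge('articles.counts.{0}'.format(name), value)
    # The pipeline flushes automatically when the `with` block exits.


push_article_counts({'total': 1500, 'html': 900, 'markdown': 400})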
Example #2
def synchronize_statsd_subscriptions_gauges(full=False):
    """ synchronize all subscription-related gauges on our statsd server. """

    with benchmark('synchronize statsd gauges for Subscription.*'):

        statsd.gauge('subscriptions.counts.total',
                     Subscription.objects.all().count())
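
Every example on this page wraps its work in a `benchmark(...)` context manager, which is not itself listed here. The following is only a plausible sketch of what such a helper could look like, assuming it simply logs the elapsed wall-clock time of the wrapped block; the real 1flow implementation may differ.

import logging
import time
from contextlib import contextmanager

LOGGER = logging.getLogger(__name__)


@contextmanager
def benchmark(message):
    """ Log how long the wrapped block took to run (assumed behaviour). """

    start = time.time()

    try:
        yield

    finally:
        LOGGER.info(u'%s: done in %.3fs.', message, time.time() - start)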
Example #3
def synchronize_mongodb_statsd_articles_gauges(full=False):
    """ synchronize all articles-related gauges on our statsd server. """

    with benchmark('synchronize statsd gauges for Article.*'):

        empty = Article.objects(content_type=0).no_cache()
        # empty_pending       = empty.filter(content_error='', url_error='')
        # empty_content_error = empty.filter(content_error__ne='')
        # empty_url_error     = empty.filter(url_error__ne='')

        parsed = Article.objects(content_type__ne=CONTENT_TYPES.NONE)
        html = parsed.filter(content_type=CONTENT_TYPES.HTML)
        markdown = parsed.filter(content_type=CONTENT_TYPES.MARKDOWN)

        absolutes = Article.objects(url_absolute=True).no_cache()
        duplicates = Article.objects(duplicate_of__ne=None).no_cache()
        orphaned = Article.objects(orphaned=True).no_cache()
        content_errors = Article.objects(content_error__ne='').no_cache()
        url_errors = Article.objects(url_error__ne='').no_cache()

        statsd.gauge('mongo.articles.counts.total',
                     Article._get_collection().count())
        statsd.gauge('mongo.articles.counts.markdown', markdown.count())
        statsd.gauge('mongo.articles.counts.html', html.count())
        statsd.gauge('mongo.articles.counts.empty', empty.count())
        statsd.gauge('mongo.articles.counts.content_errors',
                     content_errors.count())
        statsd.gauge('mongo.articles.counts.url_errors', url_errors.count())

        if full:
            statsd.gauge('mongo.articles.counts.orphaned', orphaned.count())
            statsd.gauge('mongo.articles.counts.absolutes', absolutes.count())
            statsd.gauge('mongo.articles.counts.duplicates',
                         duplicates.count())
Example #4
def synchronize_mongodb_statsd_articles_gauges(full=False):
    """ synchronize all articles-related gauges on our statsd server. """

    with benchmark('synchronize statsd gauges for Article.*'):

        empty               = Article.objects(content_type=0).no_cache()
        # empty_pending       = empty.filter(content_error='', url_error='')
        # empty_content_error = empty.filter(content_error__ne='')
        # empty_url_error     = empty.filter(url_error__ne='')

        parsed             = Article.objects(
            content_type__ne=CONTENT_TYPES.NONE)
        html               = parsed.filter(content_type=CONTENT_TYPES.HTML)
        markdown           = parsed.filter(content_type=CONTENT_TYPES.MARKDOWN)

        absolutes          = Article.objects(url_absolute=True).no_cache()
        duplicates         = Article.objects(duplicate_of__ne=None).no_cache()
        orphaned           = Article.objects(orphaned=True).no_cache()
        content_errors     = Article.objects(content_error__ne='').no_cache()
        url_errors         = Article.objects(url_error__ne='').no_cache()

        statsd.gauge('mongo.articles.counts.total',
                     Article._get_collection().count())
        statsd.gauge('mongo.articles.counts.markdown', markdown.count())
        statsd.gauge('mongo.articles.counts.html', html.count())
        statsd.gauge('mongo.articles.counts.empty', empty.count())
        statsd.gauge('mongo.articles.counts.content_errors',
                     content_errors.count())
        statsd.gauge('mongo.articles.counts.url_errors', url_errors.count())

        if full:
            statsd.gauge('mongo.articles.counts.orphaned', orphaned.count())
            statsd.gauge('mongo.articles.counts.absolutes', absolutes.count())
            statsd.gauge('mongo.articles.counts.duplicates', duplicates.count())
Example #5
def synchronize_statsd_gauges(full=False, force=False):
    """ Synchronize all counters to statsd. """

    # from oneflow.core.stats import (
    #     synchronize_mongodb_statsd_articles_gauges,
    #     synchronize_mongodb_statsd_tags_gauges,
    #     synchronize_mongodb_statsd_websites_gauges,
    #     synchronize_mongodb_statsd_authors_gauges,
    # )

    from oneflow.core.dbstats import (
        synchronize_statsd_articles_gauges,
        synchronize_statsd_tags_gauges,
        synchronize_statsd_websites_gauges,
        synchronize_statsd_authors_gauges,
        synchronize_statsd_feeds_gauges,
        synchronize_statsd_subscriptions_gauges,
        synchronize_statsd_reads_gauges,
    )

    my_lock = RedisExpiringLock(SYNCHRONIZE_STATSD_LOCK_NAME, expire_time=3600)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing statsd gauges synchronization…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'synchronize_statsd_gauges() is already locked, '
                           u'aborting.')
            return

    # with benchmark('synchronize_mongodb_statsd_gauges()'):
    #     try:
    #         synchronize_mongodb_statsd_articles_gauges(full=full)
    #         synchronize_mongodb_statsd_tags_gauges(full=full)
    #         synchronize_mongodb_statsd_websites_gauges(full=full)
    #         synchronize_mongodb_statsd_authors_gauges(full=full)
    #     except:
    #         LOGGER.exception(u'MongoDB stats failed at some point')

    with benchmark('synchronize_statsd_gauges()'):

        try:
            synchronize_statsd_articles_gauges(full=full)
            synchronize_statsd_tags_gauges(full=full)
            synchronize_statsd_websites_gauges(full=full)
            synchronize_statsd_authors_gauges(full=full)
            synchronize_statsd_feeds_gauges(full=full)
            synchronize_statsd_subscriptions_gauges(full=full)
            synchronize_statsd_reads_gauges(full=full)

        finally:
            my_lock.release()
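
Example #5 above, and several tasks further down, guard themselves with a RedisExpiringLock and the same acquire-or-abort / force-reacquire pattern. The class is not listed on this page; the sketch below only illustrates the interface the examples rely on (acquire(), release(), expire_time), built on redis-py's SET NX EX, and is an assumption rather than the project's actual implementation.

import redis


class RedisExpiringLock(object):

    """ Best-effort lock that expires on its own, so a crashed task
        cannot block the next scheduled run forever (assumed design). """

    def __init__(self, name, expire_time=3600, connection=None):
        self.name = 'lock:' + name
        self.expire_time = expire_time
        self.redis = connection or redis.StrictRedis()

    def acquire(self):
        # SET key value NX EX <seconds>: succeeds only if the key does
        # not already exist, i.e. nobody else currently holds the lock.
        return bool(self.redis.set(self.name, '1', nx=True,
                                   ex=self.expire_time))

    def release(self):
        self.redis.delete(self.name)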
Example #6
def clean_obsolete_intermediate_versions():
    """ Purge cleaned HTML & markdown versions of external articles.

    Versions of internal articles (written by users) are not impacted
    by this function.
    """

    recent_delta = timedelta(days=config.CLEANUP_FEEDS_OLD_NON_HTML_DELTA)

    external_articles_ids = Article.objects.filter(
        feeds__is_internal=False).values_list('id', flat=True).iterator()

    old_versions = HistoricalArticle.objects.filter(
        history_date__lte=now() - recent_delta)

    old_cleaned_html_versions = old_versions.filter(
        content_type=CONTENT_TYPES.CLEANED_HTML)

    if old_cleaned_html_versions.exists():
        count = old_cleaned_html_versions.count()

        with benchmark('Purge {0} obsolete cleaned HTML versions'.format(
                       count)):
            old_cleaned_html_versions.delete()

        message_admins(u'{} cleaned-HTML version(s) purged'.format(count))

    else:
        LOGGER.info(u'No obsolete cleaned HTML versions purged.')

    external_recent_markdown_versions = old_versions.filter(
        content_type=CONTENT_TYPES.MARKDOWN).filter(
            id__in=external_articles_ids)

    if external_recent_markdown_versions.exists():
        count = external_recent_markdown_versions.count()

        with benchmark('Purge {0} obsolete Markdown versions'.format(
                       count)):
            external_recent_markdown_versions.delete()

        message_admins(u'{} markdown version(s) purged'.format(count))

    else:
        LOGGER.info(u'No obsolete Markdown versions purged.')
Example #7
def synchronize_statsd_websites_gauges(full=False):

    with benchmark('synchronize statsd gauges for WebSite.*'):

        statsd.gauge('websites.counts.total', WebSite._get_collection().count())

        if full:
            duplicates = WebSite.objects(duplicate_of__ne=None).no_cache()
            statsd.gauge('websites.counts.duplicates', duplicates.count())
Example #8
def synchronize_statsd_authors_gauges(full=False):

    with benchmark('synchronize statsd gauges for Author.*'):

        statsd.gauge('authors.counts.total', Author._get_collection().count())

        if full:
            duplicates = Author.objects(duplicate_of__ne=None).no_cache()
            statsd.gauge('authors.counts.duplicates', duplicates.count())
Example #9
def synchronize_statsd_websites_gauges(full=False):
    """ synchronize all website-related gauges on our statsd server. """

    with benchmark('synchronize statsd gauges for WebSite.*'):

        statsd.gauge('websites.counts.total', WebSite.objects.all().count())

        if full:
            duplicates = WebSite.objects.exclude(duplicate_of=None)
            statsd.gauge('websites.counts.duplicates', duplicates.count())
Example #10
def synchronize_statsd_tags_gauges(full=False):
    """ synchronize all tag-related gauges on our statsd server. """

    with benchmark('synchronize statsd gauges for Tag.*'):

        statsd.gauge('tags.counts.total', Tag.objects.all().count())

        if full:
            duplicates = Tag.objects.exclude(duplicate_of=None)
            statsd.gauge('tags.counts.duplicates', duplicates.count())
Example #11
def synchronize_statsd_authors_gauges(full=False):
    """ synchronize all author-related gauges on our statsd server. """

    with benchmark('synchronize statsd gauges for Author.*'):

        statsd.gauge('authors.counts.total', Author.objects.all().count())

        if full:
            duplicates = Author.objects.exclude(duplicate_of=None)
            statsd.gauge('authors.counts.duplicates', duplicates.count())
Example #12
def synchronize_mongodb_statsd_tags_gauges(full=False):
    """ synchronize all tag-related gauges on our statsd server. """

    with benchmark('synchronize statsd gauges for Tag.*'):

        statsd.gauge('mongo.tags.counts.total', Tag._get_collection().count())

        if full:
            duplicates = Tag.objects(duplicate_of__ne=None).no_cache()
            statsd.gauge('mongo.tags.counts.duplicates', duplicates.count())
Example #13
def clean_empty_versions():
    """ Remove empty historical articles from database. """

    empty_versions = HistoricalArticle.objects.filter(content=None)

    if empty_versions.exists():
        count = empty_versions.count()

        with benchmark('Purge {0} empty versions'.format(count)):
            empty_versions.delete()

        message_admins(u'{} empty version(s) purged'.format(count))

    else:
        LOGGER.info(u'No empty versions purged.')
Example #14
def refresh_all_mailaccounts(force=False):
    """ Check all unusable e-mail accounts. """

    if config.MAIL_ACCOUNT_REFRESH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'E-mail accounts check disabled in configuration.')
        return

    accounts = MailAccount.objects.unusable()

    my_lock = RedisExpiringLock(REFRESH_ALL_MAILACCOUNTS_LOCK_NAME,
                                expire_time=30 * (accounts.count() + 2))

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing check of email accounts…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'refresh_all_mailaccounts() is already locked, '
                           u'aborting.')
            return

    with benchmark('refresh_all_mailaccounts()'):

        try:
            for account in accounts:
                try:
                    account.test_connection()
                    account.update_mailboxes()

                except:
                    pass

        finally:
            my_lock.release()

        LOGGER.info(
            u'Launched %s checks on unusable accounts out of %s total.',
            accounts.count(),
            MailAccount.objects.all().count())
Example #15
def synchronize_statsd_feeds_gauges(full=False):
    """ synchronize all feed-related gauges on our statsd server. """

    with benchmark('synchronize statsd gauges for BaseFeed.*'):

        all_feeds = BaseFeed.objects.all()

        statsd.gauge('feeds.counts.total', all_feeds.count())
        statsd.gauge('feeds.counts.open', BaseFeed.objects.active().count())

        if full:
            statsd.gauge('feeds.counts.mail', all_feeds.mail().count())
            statsd.gauge('feeds.counts.twitter', all_feeds.twitter().count())

            duplicates = BaseFeed.objects.exclude(duplicate_of=None)
            statsd.gauge('feeds.counts.duplicates', duplicates.count())

Example #16
def refresh_all_mailaccounts(force=False):
    """ Check all unusable e-mail accounts. """

    if config.MAIL_ACCOUNT_REFRESH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'E-mail accounts check disabled in configuration.')
        return

    accounts = MailAccount.objects.unusable()

    my_lock = RedisExpiringLock(REFRESH_ALL_MAILACCOUNTS_LOCK_NAME,
                                expire_time=30 * (accounts.count() + 2))

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing check of email accounts…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'refresh_all_mailaccounts() is already locked, '
                           u'aborting.')
            return

    with benchmark('refresh_all_mailaccounts()'):

        try:
            for account in accounts:
                try:
                    account.test_connection()
                    account.update_mailboxes()

                except:
                    pass

        finally:
            my_lock.release()

        LOGGER.info(u'Launched %s checks on unusable accounts out of %s total.',
                    accounts.count(), MailAccount.objects.all().count())
Example #17
def do_whatever_SQL(query, qargs, pretty_name, commit=True):
    """ Go ahead, man. """

    with benchmark(pretty_name):
        cursor = connection.cursor()

        cursor.execute(query, qargs)

        try:
            result = cursor.fetchone()[0]

        except:
            result = None

        if commit:
            cursor.execute('COMMIT;')

        cursor.close()

        return result
Example #18
def synchronize_statsd_reads_gauges(full=False):
    """ synchronize all read-related gauges on our statsd server. """

    with benchmark('synchronize statsd gauges for Read.*'):

        count = Read.objects.all().count()
        good = Read.objects.good().count()
        bad = Read.objects.bad().count()

        with statsd.pipeline() as spipe:
            spipe.gauge('reads.counts.total', count)
            spipe.gauge('reads.counts.good', good)
            spipe.gauge('reads.counts.bad', bad)

        # Am I paranoid?!? No, I come from two years of MongoDB.
        # Sorry PostgreSQL, I'm still healing.
        if bad != (count - good):
            LOGGER.warning(
                u'Bad count (%s) is different from total-good (%s)!', bad,
                count - good)
Example #19
def check_one_user(user, extended_check=False, force=False, verbose=False):
    u""" Completely check a user account and its “things”.

    E.g. user feeds, user subscriptions, user counters. Extended check:

    - recompute user counters (can be very long).

    :param extended_check: default ``False``, check more things.
    :param force: default ``False``, currently ignored.
    :param verbose: default ``False``, currently ignored.
    """

    try:
        user_feeds = user.user_feeds

    except:
        user_feeds = UserFeeds(user=user)
        user_feeds.save()

    user_feeds.check()

    try:
        user_subscriptions = user.user_subscriptions

    except:
        user_subscriptions = UserSubscriptions(user=user)
        user_subscriptions.save()

    user_subscriptions.check()

    try:
        user_counters = user.user_counters

    except:
        user_counters = UserCounters(user=user)
        user_counters.save()

    if extended_check:
        with benchmark(u'Recomputing cached descriptors'):
            user.user_counters.compute_cached_descriptors()
Example #20
    def repair_missing_authors_migration_201411(cls):

        # from oneflow.core.tasks.migration import vacuum_analyze

        articles = Article.objects.filter(
            authors=None,
            date_created__gt=datetime(2014, 10, 31))

        count = articles.count()
        done = 0

        LOGGER.info(u'Starting repairing %s missing authors @%s', count, now())

        with benchmark(u'Fix missing authors on rel-DB fetched content…'):

            for article in articles:
                article.postprocess_original_data(force=True)

                # if done % 25000 == 0:
                #     vacuum_analyze()

                done += 1
Example #21
def go(limit=None, all_hint=None):
    """ Do the dirty things, fast. """

    TO_CLEAN = ('url', 'content', )

    URL_CLEANING_QUERY = """
UPDATE core_article SET {0}_error = NULL
WHERE core_article.baseitem_ptr_id IN (
    SELECT baseitem_ptr_id
    FROM core_article
    WHERE {0}_error = ''
    LIMIT {1}
);
"""

    COUNT_QUERY = """
SELECT COUNT(*)
FROM core_article
WHERE {0}_error = '';
"""

    def one_line(a_string):
        return re.sub(u'  +', u' ', u' '.join(a_string.splitlines()))

    if limit is None:
        limit = 10000

    if all_hint is None:
        all_hint = 7000000

    LOGGER.info(u'Starting to fix the world @ %s', now())

    with benchmark(u'Fix everything'):

        for to_clean in TO_CLEAN:

            done = 0

            with benchmark(u'Fixing %s' % to_clean):
                while True:
                    do_whatever_SQL(
                        one_line(URL_CLEANING_QUERY).format(
                            to_clean, limit
                        ),
                        [],
                        u'Fixing %s, round %s' % (to_clean, done)
                    )

                    done += 1

                    # if done % 10 == 0:
                    #     vacuum_analyze('at %s' % (done * 50000))

                    if done > (all_hint / limit):
                        count = do_whatever_SQL(
                            one_line(COUNT_QUERY).format(to_clean),
                            [],
                            u'Counting things',
                            commit=False
                        )
                        if count == 0:
                            break

                    time.sleep(20)
Example #22
def global_orphaned_checker(limit=None,
                            extended_check=False,
                            force=False,
                            verbose=False,
                            break_on_exception=False):
    """ Check all orphaned articles and delete them.

    They will be deleted only if they are duplicate of other orphaned ones,
    and only if the duplication replacement process finished successfully.
    If it failed, the orphan is left in place, to be able to re-run the
    operation later.

    Can be disabled by ``config.CHECK_ORPHANED_DISABLED`` directive.

    :param limit: integer, the maximum number of users to check.
        Default: none.
    :param extended_check: boolean, default ``False``. Forwarded
        to :func:`check_one_user`.
    :param force: boolean, default ``False``, allows bypassing and
        reacquiring the lock.
    :param verbose: boolean, default ``False``. Forwarded
        to :func:`check_one_user`.
    :param break_on_exception: boolean, default ``False``, currently
        ignored in this function.
    """

    if config.CHECK_ORPHANED_DISABLED:
        LOGGER.warning(u'Orphaned check disabled in configuration.')
        return

    # This task runs twice a day. Acquire the lock for just a
    # little more time (13h, because Redis doesn't like floats)
    # to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_orphaned', expire_time=3600 * 13)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing orphaned check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_orphaned_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_ORPHANED_LIMIT

    orphaned_items = Article.objects.orphaned().master()
    orphaned_items_count = orphaned_items.count()
    processed_orphans = 0
    changed_orphans = 0
    deleted_orphans = 0
    skipped_orphans = 0

    with benchmark(u"Check {0}/{1} orphans".format(limit or u'all',
                                                   orphaned_items_count)):
        try:
            for orphan in orphaned_items.iterator():
                processed_orphans += 1

                if limit and changed_orphans >= limit:
                    break

                old_url = orphan.url

                new_url = ARTICLE_ORPHANED_BASE + generate_orphaned_hash(
                    orphan.name, orphan.feeds.all())

                if new_url != old_url:
                    orphan.url = new_url
                    orphan.url_absolute = True

                else:
                    if not orphan.url_absolute:
                        changed_orphans += 1
                        orphan.url_absolute = True
                        orphan.save()

                    continue

                try:
                    orphan.save()

                except IntegrityError:
                    master = Article.objects.get(url=orphan.url)

                    # We have to put back the original URL, else the
                    # duplicate registration process will fail.
                    orphan.url = old_url

                    # Register the duplicate right here and now (not in the
                    # background), so we can check its status and delete the
                    # orphan right away.
                    master.register_duplicate(orphan,
                                              force=force,
                                              background=False)

                    # Reload the orphan to get the refreshed duplicate status.
                    orphan = Article.objects.get(id=orphan.id)

                    if orphan.duplicate_status == DUPLICATE_STATUS.FINISHED:
                        orphan.delete()
                        deleted_orphans += 1

                        if verbose:
                            LOGGER.info(u'Deleted duplicate orphan %s', orphan)

                except:
                    skipped_orphans += 1
                    LOGGER.exception(u'Unhandled exception while checking %s',
                                     orphan)

                else:
                    changed_orphans += 1

        finally:
            my_lock.release()

    LOGGER.info(
        u'global_orphaned_checker(): %s/%s orphans processed '
        u'(%.2f%%), %s corrected (%.2f%%), %s deleted (%.2f%%), '
        u'%s skipped (%.2f%%).', processed_orphans, orphaned_items_count,
        processed_orphans * 100.0 / orphaned_items_count, changed_orphans,
        changed_orphans * 100.0 / processed_orphans, deleted_orphans,
        deleted_orphans * 100.0 / processed_orphans, skipped_orphans,
        skipped_orphans * 100.0 / processed_orphans)
Example #23
def global_users_checker(limit=None,
                         extended_check=False,
                         force=False,
                         verbose=False,
                         break_on_exception=False):
    """ Check all Users and their dependancies.

    Can be disabled by ``config.CHECK_USERS_DISABLED`` directive.

    :param limit: integer, the maximum number of users to check.
        Default: none.
    :param extended_check: boolean, default ``False``. Forwarded
        to :func:`check_one_user`.
    :param force: boolean, default ``False``, allows bypassing and
        reacquiring the lock.
    :param verbose: boolean, default ``False``. Forwarded
        to :func:`check_one_user`.
    :param break_on_exception: boolean, default ``False``, currently
        ignored in this function.
    """

    if config.CHECK_USERS_DISABLED:
        LOGGER.warning(u'Users check disabled in configuration.')
        return

    # This task runs twice a day. Acquire the lock for just a
    # little more time (13h, because Redis doesn't like floats)
    # to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_users', expire_time=3600 * 13)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing users check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_users_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_USERS_LIMIT

    active_users = User.objects.filter(is_active=True)
    total_users_count = active_users.count()
    processed_users = 0
    changed_users = 0
    skipped_count = 0

    with benchmark(u"Check {0}/{1} users".format(limit or u'all',
                                                 total_users_count)):
        try:
            for user in active_users.iterator():

                processed_users += 1

                if limit and changed_users >= limit:
                    break

                check_one_user(user,
                               extended_check=extended_check,
                               force=force,
                               verbose=verbose)

        finally:
            my_lock.release()

    LOGGER.info(
        u'global_users_checker(): %s/%s users processed '
        u'(%.2f%%), %s corrected (%.2f%%), %s skipped (%.2f%%).',
        processed_users, total_users_count,
        processed_users * 100.0 / total_users_count, changed_users,
        changed_users * 100.0 / processed_users, skipped_count,
        skipped_count * 100.0 / processed_users)
Example #24
def reprocess_failed_articles(failed=None,
                              expiry=None,
                              limit=None,
                              force=False,
                              reprocessing_type=None):
    u""" Reprocess articles that failed absolutization.

    In case there was a temporary error, this could lead to more good articles.
    """

    if config.ARTICLE_REPROCESSING_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Articles reprocess disabled in configuration.')
        return

    if failed is None:
        raise RuntimeError(u'Need a queryset of failed items to reprocess.')

    # TODO: as the celery tasks expires,
    # the lock is probably not needed anymore.

    my_lock = RedisExpiringLock('reprocess_failed_articles_' + str(expiry),
                                expire_time=expiry)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing failed articles reprocessing…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'reprocess_failed_articles() is already locked, '
                           u'aborting.')
            return

    failed_count = failed.count()

    with benchmark((u'Reprocess_failed_articles(expiry=%s): %s '
                    u'processing chains relaunched.') %
                   (naturaldelta(expiry), failed_count)):

        try:
            for article in failed.iterator():

                if reprocessing_type is None:
                    article.url_error = None
                    article.save()

                    article_post_create_task.apply(args=(article.id, ),
                                                   kwargs={'apply_now': True})

                elif reprocessing_type == 'standard':
                    article.process()

        finally:
            # HEADS UP: in case the system is overloaded, we intentionally
            #           don't release the lock, to avoid re-launched
            #           global tasks flooding the queue with useless
            #           duplicate individual tasks.
            #
            # my_lock.release()
            pass
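
Since this task refuses to run without a queryset of failed items, a caller has to build one first. The invocation below is purely hypothetical (the real schedulers are not part of these examples); it reuses the url_error queryset already shown in Example #1 and an arbitrary one-day expiry.

# Hypothetical invocation, for illustration only.
failed_urls = Article.objects.exclude(url_error=None)

reprocess_failed_articles(failed=failed_urls,
                          expiry=3600 * 24,
                          reprocessing_type='standard')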
Example #25
def global_reads_checker(limit=None,
                         extended_check=False,
                         force=False,
                         verbose=False,
                         break_on_exception=False):
    """ Check all reads and their dependants.

    Will activate reads that are currently bad, but whose article is OK
    to display.

    This task is one of the most expensive things in 1flow.
    It can run for hours because it scans all the bad reads and their
    articles, but will not kill the database with massive updates, it
    does them one by one.

    Can be disabled by ``config.CHECK_READS_DISABLED`` directive.

    :param limit: integer, the maximum number of duplicates to check.
        Default: none.
    :param extended_check: boolean, default ``False``.
        Runs :meth:`Read.set_subscriptions` if ``True`` and checked read
        has no subscription.
    :param force: boolean, default ``False``, allows bypassing and
        reacquiring the lock.
    :param verbose: boolean, default ``False``, display (more)
        informative messages.
    :param break_on_exception: boolean, default ``False``, stop processing
        at the first encountered exception. Whatever it is, the exception
        will be logged to sentry.
    """

    if config.CHECK_READS_DISABLED:
        LOGGER.warning(u'Reads check disabled in configuration.')
        return

    # This task runs twice a day. Acquire the lock for just a
    # little more time (13h, because Redis doesn't like floats)
    # to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_reads', expire_time=3600 * 13)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing reads check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_reads_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_READS_LIMIT

    bad_reads = Read.objects.bad()

    total_reads_count = bad_reads.count()
    processed_reads = 0
    wiped_reads_count = 0
    changed_reads_count = 0
    skipped_count = 0

    with benchmark(u"Check {0}/{1} reads".format(limit or u'all',
                                                 total_reads_count)):
        try:
            for read in bad_reads.iterator():

                processed_reads += 1

                if limit and changed_reads_count >= limit:
                    break

                if read.is_good:
                    # This read has been activated via another
                    # checked one, attached to the same article.
                    changed_reads_count += 1
                    continue

                try:
                    article = read.item

                except:
                    LOGGER.critical(u'Could not get read.item for %s', read)
                    continue

                if extended_check:
                    try:
                        if read.subscriptions.all().exists():

                            # TODO: remove this
                            #       check_set_subscriptions_131004_done
                            #       transient check.
                            if read.check_set_subscriptions_131004_done:
                                read.check_subscriptions()

                            else:
                                read.check_set_subscriptions_131004()

                        else:
                            read.set_subscriptions()

                    except:
                        skipped_count += 1
                        LOGGER.exception(
                            u'Could not set subscriptions on '
                            u'read #%s, from article #%s, for '
                            u'user #%s. Skipping.', read.id, article.id,
                            read.user.id)
                        continue

                try:
                    if article.is_good:
                        changed_reads_count += 1

                        if verbose:
                            LOGGER.info(
                                u'Bad read %s has a good article, '
                                u'fixing…', read)

                        article.activate_reads(extended_check=extended_check)

                except:
                    LOGGER.exception(
                        u'Could not activate reads from '
                        u'article %s of read %s.', article, read)
                    if break_on_exception:
                        break

        finally:
            my_lock.release()

    LOGGER.info(
        u'global_reads_checker(): %s/%s reads processed '
        u'(%.2f%%), %s corrected (%.2f%%), %s deleted (%.2f%%), '
        u'%s skipped (%.2f%%).', processed_reads, total_reads_count,
        processed_reads * 100.0 / total_reads_count, changed_reads_count,
        changed_reads_count * 100.0 / processed_reads, wiped_reads_count,
        wiped_reads_count * 100.0 / processed_reads, skipped_count,
        skipped_count * 100.0 / processed_reads)
Example #26
def archive_articles(limit=None):
    """ Archive articles that pollute the production database. """

    raise NotImplementedError('REVIEW for RELDB.')

    # cf. https://docs.djangoproject.com/en/dev/topics/db/multi-db/#selecting-a-database-to-delete-from  # NOQA

    counts = {
        'duplicates': 0,
        'orphaned': 0,
        'bad_articles': 0,
        'archived_dupes': 0,
    }

    if limit is None:
        limit = config.ARTICLE_ARCHIVE_BATCH_SIZE

    with no_dereference(Article) as ArticleOnly:
        if config.ARTICLE_ARCHIVE_OLDER_THAN > 0:
            older_than = now() - timedelta(
                days=config.ARTICLE_ARCHIVE_OLDER_THAN)

            duplicates = ArticleOnly.objects(
                duplicate_of__ne=None,
                date_published__lt=older_than).limit(limit)
            orphaned = ArticleOnly.objects(
                orphaned=True, date_published__lt=older_than).limit(limit)

        else:
            duplicates = ArticleOnly.objects(
                duplicate_of__ne=None).limit(limit)
            orphaned = ArticleOnly.objects(orphaned=True).limit(limit)

    duplicates.no_cache()
    orphaned.no_cache()

    counts['duplicates'] = duplicates.count()
    counts['orphaned'] = orphaned.count()

    if counts['duplicates']:
        current = 0
        LOGGER.info(u'Archiving of %s duplicate article(s) started.',
                    counts['duplicates'])

        with benchmark('Archiving of %s duplicate article(s)' %
                       counts['duplicates']):
            for article in duplicates:
                archive_article_one_internal(article, counts)
                current += 1
                if current % 50 == 0:
                    LOGGER.info(u'Archived %s/%s duplicate articles so far.',
                                current, counts['duplicates'])

    if counts['orphaned']:
        current = 0
        LOGGER.info(u'Archiving of %s orphaned article(s) started.',
                    counts['orphaned'])

        with benchmark('Archiving of %s orphaned article(s)' %
                       counts['orphaned']):
            for article in orphaned:
                archive_article_one_internal(article, counts)
                current += 1
                if current % 50 == 0:
                    LOGGER.info(u'Archived %s/%s orphaned articles so far.',
                                current, counts['orphaned'])

    if counts['duplicates'] or counts['orphaned']:
        synchronize_statsd_articles_gauges(full=True)

        LOGGER.info(
            '%s already archived and %s bad articles were found '
            u'during the operation.', counts['archived_dupes'],
            counts['bad_articles'])

    else:
        LOGGER.info(u'No article to archive.')
Example #27
def global_subscriptions_checker(force=False,
                                 limit=None,
                                 from_feeds=True,
                                 from_users=False,
                                 extended_check=False):
    """ A conditionned version of :meth:`Feed.check_subscriptions`. """

    if config.CHECK_SUBSCRIPTIONS_DISABLED:
        LOGGER.warning(u'Subscriptions checks disabled in configuration.')
        return

    # This task runs once a day. Acquire the lock for just a
    # little more time to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_subscriptions',
                                expire_time=3600 * 25)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing subscriptions checks…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_subscriptions_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_SUBSCRIPTIONS_LIMIT

    assert int(limit) >= 0

    try:
        if from_feeds:
            with benchmark("Check all subscriptions from feeds"):

                # We check ALL feeds (including inactive ones) to be
                # sure all subscriptions / reads are up-to-date.
                feeds = BaseFeed.objects.all()
                feeds_count = feeds.count()
                processed_count = 0
                checked_count = 0

                for feed in feeds.iterator():

                    if limit and checked_count > limit:
                        break

                    if extended_check:
                        feed.compute_cached_descriptors()
                        # all=True, good=True, bad=True

                    # Do not extended_check=True, this would double-do
                    # the subscription.check_reads() already called below.
                    feed.check_subscriptions()

                    for subscription in feed.subscriptions.all().iterator():

                        processed_count += 1

                        if subscription.all_items_count \
                                != feed.good_items_count:

                            checked_count += 1

                            LOGGER.info(
                                u'Subscription %s (#%s) has %s reads '
                                u'whereas its feed has %s good '
                                u'articles; checking…', subscription.name,
                                subscription.id, subscription.all_items_count,
                                feed.good_items_count)

                            subscription.check_reads(
                                extended_check=extended_check, force=True)

                LOGGER.info(
                    u'%s/%s (limit:%s) feeds processed, %s '
                    u'checked (%.2f%%).', processed_count, feeds_count, limit,
                    checked_count, checked_count * 100.0 / processed_count)

        if from_users:
            with benchmark("Check all subscriptions from users"):

                users = User.objects.filter(is_active=True)
                users_count = users.count()
                processed_count = 0

                for user in users:

                    # Do not extended_check=True, this would double-do
                    # the subscription.check_reads() already called below.
                    user.check_subscriptions()

                    if extended_check:
                        user.user_counters.compute_cached_descriptors()
                        # all=True, unread=True, starred=True, bookmarked=True

                        for subscription in user.subscriptions.all().iterator(
                        ):
                            processed_count += 1

                            subscription.check_reads(extended_check=True,
                                                     force=True)

                LOGGER.info(
                    u'%s users %sprocessed. '
                    u'All were checked.', users_count,
                    u'and {0} subscriptions '.format(processed_count)
                    if extended_check else u'')

    finally:
        my_lock.release()
Example #28
def archive_articles(limit=None):
    """ Archive articles that pollute the production database. """

    raise NotImplementedError('REVIEW for RELDB.')

    # cf. https://docs.djangoproject.com/en/dev/topics/db/multi-db/#selecting-a-database-to-delete-from  # NOQA

    counts = {
        'duplicates': 0,
        'orphaned': 0,
        'bad_articles': 0,
        'archived_dupes': 0,
    }

    if limit is None:
        limit = config.ARTICLE_ARCHIVE_BATCH_SIZE

    with no_dereference(Article) as ArticleOnly:
        if config.ARTICLE_ARCHIVE_OLDER_THAN > 0:
            older_than = now() - timedelta(
                days=config.ARTICLE_ARCHIVE_OLDER_THAN)

            duplicates = ArticleOnly.objects(
                duplicate_of__ne=None,
                date_published__lt=older_than).limit(limit)
            orphaned   = ArticleOnly.objects(
                orphaned=True,
                date_published__lt=older_than).limit(limit)

        else:
            duplicates = ArticleOnly.objects(duplicate_of__ne=None
                                             ).limit(limit)
            orphaned   = ArticleOnly.objects(orphaned=True).limit(limit)

    duplicates.no_cache()
    orphaned.no_cache()

    counts['duplicates'] = duplicates.count()
    counts['orphaned']   = orphaned.count()

    if counts['duplicates']:
        current = 0
        LOGGER.info(u'Archiving of %s duplicate article(s) started.',
                    counts['duplicates'])

        with benchmark('Archiving of %s duplicate article(s)'
                       % counts['duplicates']):
            for article in duplicates:
                archive_article_one_internal(article, counts)
                current += 1
                if current % 50 == 0:
                    LOGGER.info(u'Archived %s/%s duplicate articles so far.',
                                current, counts['duplicates'])

    if counts['orphaned']:
        current = 0
        LOGGER.info(u'Archiving of %s orphaned article(s) started.',
                    counts['orphaned'])

        with benchmark('Archiving of %s orphaned article(s)'
                       % counts['orphaned']):
            for article in orphaned:
                archive_article_one_internal(article, counts)
                current += 1
                if current % 50 == 0:
                    LOGGER.info(u'Archived %s/%s orphaned articles so far.',
                                current, counts['orphaned'])

    if counts['duplicates'] or counts['orphaned']:
        synchronize_statsd_articles_gauges(full=True)

        LOGGER.info('%s already archived and %s bad articles were found '
                    u'during the operation.', counts['archived_dupes'],
                    counts['bad_articles'])

    else:
        LOGGER.info(u'No article to archive.')
Example #29
def global_reads_checker(limit=None, extended_check=False, force=False,
                         verbose=False, break_on_exception=False):
    """ Check all reads and their dependants.

    Will activate reads that are currently bad, but whose article is OK
    to display.

    This task is one of the most expensive things in 1flow.
    It can run for hours because it scans all the bad reads and their
    articles, but will not kill the database with massive updates, it
    does them one by one.

    Can be disabled by ``config.CHECK_READS_DISABLED`` directive.

    :param limit: integer, the maximum number of duplicates to check.
        Default: none.
    :param extended_check: boolean, default ``False``.
        Runs :meth:`Read.set_subscriptions` if ``True`` and checked read
        has no subscription.
    :param force: boolean, default ``False``, allows bypassing and
        reacquiring the lock.
    :param verbose: boolean, default ``False``, display (more)
        informative messages.
    :param break_on_exception: boolean, default ``False``, stop processing
        at the first encountered exception. Whatever it is, the exception
        will be logged to sentry.
    """

    if config.CHECK_READS_DISABLED:
        LOGGER.warning(u'Reads check disabled in configuration.')
        return

    # This task runs twice a day. Acquire the lock for just a
    # little more time (13h, because Redis doesn't like floats)
    # to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_reads', expire_time=3600 * 13)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing reads check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_reads_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_READS_LIMIT

    bad_reads = Read.objects.bad()

    total_reads_count   = bad_reads.count()
    processed_reads     = 0
    wiped_reads_count   = 0
    changed_reads_count = 0
    skipped_count       = 0

    with benchmark(u"Check {0}/{1} reads".format(limit or u'all',
                   total_reads_count)):
        try:
            for read in bad_reads.iterator():

                processed_reads += 1

                if limit and changed_reads_count >= limit:
                    break

                if read.is_good:
                    # This read has been activated via another
                    # checked one, attached to the same article.
                    changed_reads_count += 1
                    continue

                try:
                    article = read.item

                except:
                    LOGGER.critical(u'Could not get read.item for %s', read)
                    continue

                if extended_check:
                    try:
                        if read.subscriptions.all().exists():

                            # TODO: remove this
                            #       check_set_subscriptions_131004_done
                            #       transient check.
                            if read.check_set_subscriptions_131004_done:
                                read.check_subscriptions()

                            else:
                                read.check_set_subscriptions_131004()

                        else:
                            read.set_subscriptions()

                    except:
                        skipped_count += 1
                        LOGGER.exception(u'Could not set subscriptions on '
                                         u'read #%s, from article #%s, for '
                                         u'user #%s. Skipping.', read.id,
                                         article.id, read.user.id)
                        continue

                try:
                    if article.is_good:
                        changed_reads_count += 1

                        if verbose:
                            LOGGER.info(u'Bad read %s has a good article, '
                                        u'fixing…', read)

                        article.activate_reads(extended_check=extended_check)

                except:
                    LOGGER.exception(u'Could not activate reads from '
                                     u'article %s of read %s.',
                                     article, read)
                    if break_on_exception:
                        break

        finally:
            my_lock.release()

    LOGGER.info(u'global_reads_checker(): %s/%s reads processed '
                u'(%.2f%%), %s corrected (%.2f%%), %s deleted (%.2f%%), '
                u'%s skipped (%.2f%%).',
                processed_reads, total_reads_count,
                processed_reads * 100.0 / total_reads_count,
                changed_reads_count,
                changed_reads_count * 100.0 / processed_reads,
                wiped_reads_count,
                wiped_reads_count * 100.0 / processed_reads,
                skipped_count,
                skipped_count * 100.0 / processed_reads)
Example #30
def refresh_all_mongo_feeds(limit=None, force=False):
    u""" Refresh all MongoEngine feeds (RSS).

    .. note:: this task should vanish when
        MongoDB → PostgreSQL migration is done.
    """

    if config.FEED_FETCH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Feed refresh disabled in configuration.')
        return

    # Be sure two refresh operations don't overlap, but don't hold the
    # lock too long if something goes wrong. In production conditions
    # as of 20130812, refreshing all feeds takes only a moment:
    # [2013-08-12 09:07:02,028: INFO/MainProcess] Task
    #    oneflow.core.tasks.refresh_all_mongo_feeds succeeded in 1.99886608124s.
    my_lock = RedisExpiringLock(
        'refresh_all_mongo_feeds',
        expire_time=config.FEED_GLOBAL_REFRESH_INTERVAL * 180 - 1)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing all feed refresh…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'refresh_all_mongo_feeds() is already '
                           u'locked, aborting.')
            return

    feeds = MongoFeed.objects.filter(closed__ne=True, is_internal__ne=True)

    if limit:
        feeds = feeds.limit(limit)

    # No need for caching and cluttering CPU/memory for a one-shot thing.
    feeds.no_cache()

    with benchmark('refresh_all_mongo_feeds()'):

        try:
            count = 0
            mynow = now()

            for feed in feeds:

                if feed.refresh_lock.is_locked():
                    LOGGER.info(u'Feed %s already locked, skipped.', feed)
                    continue

                interval = timedelta(seconds=feed.fetch_interval)

                if feed.last_fetch is None:

                    mongo_feed_refresh_task.delay(feed.id)

                    LOGGER.info(u'Launched immediate refresh of feed %s which '
                                u'has never been refreshed.', feed)

                elif force or feed.last_fetch + interval < mynow:

                    how_late = feed.last_fetch + interval - mynow
                    how_late = how_late.days * 86400 + how_late.seconds

                    countdown = 0
                    mongo_feed_refresh_task.delay(feed.id, force)

                    LOGGER.info(u'%s refresh of feed %s %s (%s late).',
                                u'Scheduled randomized'
                                if countdown else u'Launched',
                                feed,
                                u' in {0}'.format(naturaldelta(countdown))
                                if countdown else u'in the background',
                                naturaldelta(how_late))
                    count += 1

        finally:
            # HEADS UP: see core.tasks.refresh_all_feeds() for note.
            # my_lock.release()
            pass

        LOGGER.info(u'Launched %s refreshes out of %s feed(s) checked.',
                    count, feeds.count())
Example #31
    def check_temporary_defect(cls, defect_name, limit=None, progress=None):
        """ TODO. """

        if limit is None:
            # Don't let "Cursor … invalid at server" errors stop us.
            limit = 10000

        if progress is None:
            progress = 100

        if hasattr(cls, defect_name) and hasattr(cls, defect_name + '_done'):

            Q1_params    = {defect_name + '_done__exists': False}
            Q2_params    = {defect_name + '_done': False}
            done_count   = 0
            failed_count = 0
            failed_ids   = []

            def get_count():
                return cls.objects(Q(**Q1_params)
                                   | Q(**Q2_params)).no_cache().count()

            def get_documents_with_limit():
                return cls.objects(Q(**Q1_params)
                                   | Q(**Q2_params)).limit(limit).no_cache()

            LOGGER.info(u'Counting initial `%s` defects on %s…',
                        defect_name, cls.__name__)

            count = get_count()

            if count:
                LOGGER.info(u'Starting check of %s %s against `%s` '
                            u'(each star: %s done)…', count, cls.__name__,
                            defect_name, progress)

            with benchmark(u'Check %s %s against %s' % (
                           count, cls.__name__, defect_name)):

                while count > failed_count:
                    with benchmark(u'Sub-check %s %s against `%s`' % (limit,
                                   cls.__name__, defect_name)):
                        for document in get_documents_with_limit():
                            try:
                                getattr(document, defect_name)()

                            except:
                                # Keep rolling: one failure will not stop us.

                                if document.id in failed_ids:
                                    # Don't "continue": done_count still needs
                                    # updating, and progress stars still need
                                    # to be output.
                                    pass

                                else:
                                    failed_ids.append(document.id)
                                    failed_count += 1
                                    sys.stderr.write(u'\n')
                                    LOGGER.exception(u'SKIP: self.%s() failed '
                                                     u'on %s #%s', defect_name,
                                                     cls.__name__, document.id)

                            done_count += 1

                            if done_count % progress == 0:
                                sys.stderr.write(u'*')
                                sys.stderr.flush()

                                if done_count % limit == 0:
                                    sys.stderr.write(u'\n')

                        count = get_count()

                        if done_count % limit != 0:
                            # Last line deserves a newline.
                            sys.stderr.write(u'\n')

        else:
            LOGGER.error(u'Defect `%s` does not have the required class '
                         u'attributes on %s.', defect_name, cls.__name__)
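
# Illustrative sketch (not from the original code base): check_temporary_defect()
# expects each document class to expose a `<defect_name>()` repair method plus a
# `<defect_name>_done` flag recording completion. The plain-Python stand-in below
# shows that naming convention; `fix_empty_content` is a hypothetical defect name.
class FakeDocument(object):

    def __init__(self):
        self.fix_empty_content_done = False

    def fix_empty_content(self):
        # Repair work would go here; mark the defect as handled afterwards.
        self.fix_empty_content_done = True


_doc = FakeDocument()
assert hasattr(_doc, 'fix_empty_content')
assert hasattr(_doc, 'fix_empty_content' + '_done')
_doc.fix_empty_content()
assert _doc.fix_empty_content_done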
Beispiel #34
0
def refresh_all_feeds(limit=None, force=False):
    u""" Refresh all feeds (RSS/Mail/Twitter…). """

    if config.FEED_FETCH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Feed refresh disabled in configuration.')
        return

    # As FEED_GLOBAL_REFRESH_INTERVAL is dynamically modifiable,
    # we should re-evaluate it each time we run.
    this_round_expire_time = (config.FEED_GLOBAL_REFRESH_INTERVAL * 60 -
                              config.FEED_GLOBAL_REFRESH_INTERVAL)

    # Be sure two refresh operations don't overlap, but don't hold the
    # lock too long if something goes wrong. In production conditions
    # as of 20130812, refreshing all feeds takes only a moment:
    # [2013-08-12 09:07:02,028: INFO/MainProcess] Task
    #       oneflow.core.tasks.refresh_all_feeds succeeded in 1.99886608124s.
    #
    my_lock = RedisExpiringLock(REFRESH_ALL_FEEDS_LOCK_NAME,
                                expire_time=this_round_expire_time)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing all feed refresh…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'refresh_all_feeds() is already locked, aborting.')
            return

    # This should bring us a Polymorphic Query to refresh all feed types.
    feeds = BaseFeed.objects.filter(
        is_active=True, is_internal=False).order_by('date_last_fetch')

    if limit:
        feeds = feeds[:limit]

    with benchmark('refresh_all_feeds()'):

        try:
            count = 0
            mynow = now()

            for feed in feeds:

                if feed.refresh_lock.is_locked():
                    # The refresh task launched before its expiration and is
                    # still (perhaps long-) running while we want to launch
                    # another. Skip it: the new one would exit immediately
                    # because date_last_fetch is too recent.
                    LOGGER.debug(u'Feed %s already locked, skipped.', feed)
                    continue

                if feed.date_last_fetch is None:

                    basefeed_refresh_task.apply_async(
                        args=(feed.id, ),

                        # in `this_round_expire_time`, we will relaunch it
                        # anyway, so don't clutter the queue with double work.
                        expire=this_round_expire_time,
                    )

                    LOGGER.info(
                        u'Launched immediate refresh of feed %s which '
                        u'has never been refreshed.', feed)
                    count += 1
                    continue

                if feed.fetch_interval > 86399:
                    interval_days = feed.fetch_interval / 86400
                    interval_seconds = feed.fetch_interval - (interval_days *
                                                              86400)

                    interval = timedelta(days=interval_days,
                                         seconds=interval_seconds)

                else:
                    interval = timedelta(seconds=feed.fetch_interval)

                if force or feed.date_last_fetch + interval < mynow:

                    how_late = feed.date_last_fetch + interval - mynow
                    how_late = how_late.days * 86400 + how_late.seconds

                    late = feed.date_last_fetch + interval < mynow

                    basefeed_refresh_task.apply_async(
                        args=(feed.id, ),
                        kwargs={'force': force},
                        expire=this_round_expire_time,
                    )

                    LOGGER.info(u'Launched refresh of feed %s (%s %s).', feed,
                                naturaldelta(how_late),
                                u'late' if late else u'earlier')
                    count += 1

        finally:
            # HEADS UP: in case the system is overloaded and feeds refresh()
            #           tasks don't complete fast enough, the current task
            #           will overload it even more. Thus, we intentionally
            #           don't release the lock, to avoid over-relaunched
            #           global tasks feeding the refresh queue with useless
            #           double/triple/N-uple individual tasks.
            #
            # my_lock.release()
            pass

        LOGGER.info(u'Launched %s refreshes out of %s feed(s) checked.', count,
                    feeds.count())
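
# Illustrative sketch (not from the original module): the day/second split used
# above for large fetch_interval values is equivalent to handing the raw second
# count straight to timedelta(), which normalizes overflowing seconds into days
# on its own. `fetch_interval` below is a hypothetical value.
from datetime import timedelta

fetch_interval = 3 * 86400 + 1234            # anything above 86399 seconds
interval_days = fetch_interval // 86400
interval_seconds = fetch_interval - interval_days * 86400

assert timedelta(days=interval_days, seconds=interval_seconds) \
    == timedelta(seconds=fetch_interval)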
Beispiel #35
0
def global_subscriptions_checker(force=False, limit=None, from_feeds=True,
                                 from_users=False, extended_check=False):
    """ A conditionned version of :meth:`Feed.check_subscriptions`. """

    if config.CHECK_SUBSCRIPTIONS_DISABLED:
        LOGGER.warning(u'Subscriptions checks disabled in configuration.')
        return

    # This task runs once a day. Acquire the lock for just a
    # little more time to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_subscriptions',
                                expire_time=3600 * 25)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing subscriptions checks…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_subscriptions_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_SUBSCRIPTIONS_LIMIT

    assert int(limit) >= 0

    try:
        if from_feeds:
            with benchmark("Check all subscriptions from feeds"):

                # We check ALL feeds (including inactive ones) to be
                # sure all subscriptions / reads are up-to-date.
                feeds           = BaseFeed.objects.all()
                feeds_count     = feeds.count()
                processed_count = 0
                checked_count   = 0

                for feed in feeds.iterator():

                    if limit and checked_count > limit:
                        break

                    if extended_check:
                        feed.compute_cached_descriptors()
                        # all=True, good=True, bad=True

                    # Do not pass extended_check=True here: it would duplicate
                    # the subscription.check_reads() already called below.
                    feed.check_subscriptions()

                    for subscription in feed.subscriptions.all().iterator():

                        processed_count += 1

                        if subscription.all_items_count \
                                != feed.good_items_count:

                            checked_count += 1

                            LOGGER.info(u'Subscription %s (#%s) has %s reads '
                                        u'whereas its feed has %s good '
                                        u'articles; checking…',
                                        subscription.name, subscription.id,
                                        subscription.all_items_count,
                                        feed.good_items_count)

                            subscription.check_reads(
                                extended_check=extended_check, force=True)

                LOGGER.info(u'%s/%s (limit:%s) feeds processed, %s '
                            u'checked (%.2f%%).',
                            processed_count, feeds_count, limit,
                            checked_count, checked_count
                            * 100.0 / processed_count)

        if from_users:
            with benchmark("Check all subscriptions from users"):

                users           = User.objects.filter(is_active=True)
                users_count     = users.count()
                processed_count = 0

                for user in users:

                    # Do not pass extended_check=True here: it would duplicate
                    # the subscription.check_reads() already called below.
                    user.check_subscriptions()

                    if extended_check:
                        user.user_counters.compute_cached_descriptors()
                        # all=True, unread=True, starred=True, bookmarked=True

                        for subscription in user.subscriptions.all().iterator():
                            processed_count += 1

                            subscription.check_reads(
                                extended_check=True, force=True)

                LOGGER.info(u'%s users %sprocessed. '
                            u'All were checked.', users_count,
                            u'and {0} subscriptions '.format(processed_count)
                            if extended_check else u'')

    finally:
        my_lock.release()
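
# Illustrative sketch (not part of the original code base): the acquire-or-force
# dance repeated at the top of every task above could be factored into a single
# helper. The lock only needs the .acquire()/.release() API already used in this
# module; `acquire_or_force` and its parameters are hypothetical names.
def acquire_or_force(lock, force, logger, task_name):
    """ Return True if the task may run, False if it should abort. """

    if lock.acquire():
        return True

    if force:
        # Steal the lock: release whatever is held, then take it again.
        lock.release()
        lock.acquire()
        logger.warning(u'Forcing %s…', task_name)
        return True

    logger.warning(u'%s is already locked, aborting.', task_name)
    return False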
Beispiel #36
0
def global_duplicates_checker(limit=None, force=False):
    """ Check that duplicate articles have no more Reads anywhere.

    Fix them if they still do, and update all counters accordingly.

    :param limit: integer, the maximum number of duplicates to check.
        Default: none.
    :param force: boolean, default ``False``, allows bypassing and
        reacquiring the lock.
    """

    if config.CHECK_DUPLICATES_DISABLED:
        LOGGER.warning(u'Duplicates check disabled in configuration.')
        return

    # This task runs once a day. Acquire the lock for just a
    # little more time to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_duplicates', expire_time=3600 * 25)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing duplicates check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_duplicates_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_DUPLICATES_LIMIT

    start_time = pytime.time()
    duplicates = BaseItem.objects.duplicate()

    total_dupes_count = duplicates.count()
    total_reads_count = 0
    processed_dupes = 0
    done_dupes_count = 0
    purged_dupes_count = 0

    purge_after_weeks_count = max(1, config.CHECK_DUPLICATES_PURGE_AFTER_WEEKS)
    purge_after_weeks_count = min(52, purge_after_weeks_count)

    purge_before_date = now() - timedelta(days=purge_after_weeks_count * 7)

    LOGGER.info(
        u'Done counting (took %s of pure SQL joy), starting procedure.',
        naturaldelta(pytime.time() - start_time))

    with benchmark(u"Check {0}/{1} duplicates".format(limit or u'all',
                                                      total_dupes_count)):

        try:
            for duplicate in duplicates.iterator():
                reads = duplicate.reads.all()

                processed_dupes += 1

                if reads.exists():
                    done_dupes_count += 1
                    reads_count = reads.count()
                    total_reads_count += reads_count

                    LOGGER.info(
                        u'Duplicate %s #%s still has %s reads, fixing…',
                        duplicate._meta.model.__name__, duplicate.id,
                        reads_count)

                    duplicate.duplicate_of.register_duplicate(
                        duplicate,
                        force=duplicate.duplicate_status ==
                        DUPLICATE_STATUS.FINISHED)

                if duplicate.duplicate_status == DUPLICATE_STATUS.FINISHED:
                    #
                    # TODO: check that no new dependencies appeared (race
                    #       conditions) between the moment the duplicate
                    #       was marked as such and now.

                    if duplicate.date_created < purge_before_date:
                        try:
                            with transaction.atomic():
                                duplicate.delete()
                        except:
                            LOGGER.exception(
                                u'Exception while deleting '
                                u'duplicate %s #%s',
                                duplicate._meta.model.__name__, duplicate.id)
                        else:
                            # Count and log the purge only when the delete
                            # actually succeeded.
                            purged_dupes_count += 1
                            LOGGER.info(
                                u'Purged duplicate %s #%s from database.',
                                duplicate._meta.model.__name__,
                                duplicate.id)

                elif duplicate.duplicate_status in (
                        DUPLICATE_STATUS.NOT_REPLACED,
                        DUPLICATE_STATUS.FAILED):
                    # Something went wrong; perhaps the
                    # task was purged before being run.
                    duplicate.duplicate_of.register_duplicate(duplicate)
                    done_dupes_count += 1

                elif duplicate.duplicate_status is None:
                    # Something went very wrong. If the article is a known
                    # duplicate, its status field should have been set to
                    # at least NOT_REPLACED.
                    duplicate.duplicate_of.register_duplicate(duplicate)
                    done_dupes_count += 1

                    LOGGER.error(
                        u'Corrected duplicate %s #%s found with no '
                        u'status.', duplicate._meta.model.__name__,
                        duplicate.id)

                if limit and processed_dupes >= limit:
                    break

        finally:
            my_lock.release()

    LOGGER.info(
        u'global_duplicates_checker(): %s/%s duplicates processed '
        u'(%.2f%%; limit: %s), %s corrected (%.2f%%), '
        u'%s purged (%.2f%%); %s reads altered.', processed_dupes,
        total_dupes_count, processed_dupes * 100.0 / total_dupes_count, limit
        or u'none', done_dupes_count,
        (done_dupes_count * 100.0 /
         processed_dupes) if processed_dupes else 0.0, purged_dupes_count,
        (purged_dupes_count * 100.0 /
         processed_dupes) if processed_dupes else 0.0, total_reads_count)
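
# Illustrative sketch (configuration value is hypothetical): the max()/min() pair
# above merely clamps the configured purge window into the 1..52 weeks range
# before deriving the cut-off date.
from datetime import datetime, timedelta


def clamp_purge_weeks(configured_weeks):
    return min(52, max(1, configured_weeks))


assert clamp_purge_weeks(0) == 1
assert clamp_purge_weeks(104) == 52
purge_before_date = datetime.utcnow() - timedelta(days=clamp_purge_weeks(8) * 7)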
Beispiel #37
0
def global_duplicates_checker(limit=None, force=False):
    """ Check that duplicate articles have no more Reads anywhere.

    Fix them if they still do, and update all counters accordingly.

    :param limit: integer, the maximum number of duplicates to check.
        Default: none.
    :param force: boolean, default ``False``, allows bypassing and
        reacquiring the lock.
    """

    if config.CHECK_DUPLICATES_DISABLED:
        LOGGER.warning(u'Duplicates check disabled in configuration.')
        return

    # This task runs once a day. Acquire the lock for just a
    # little more time to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_duplicates', expire_time=3600 * 25)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing duplicates check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_duplicates_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_DUPLICATES_LIMIT

    start_time = pytime.time()
    duplicates = BaseItem.objects.duplicate()

    total_dupes_count  = duplicates.count()
    total_reads_count  = 0
    processed_dupes    = 0
    done_dupes_count   = 0
    purged_dupes_count = 0

    purge_after_weeks_count = max(1, config.CHECK_DUPLICATES_PURGE_AFTER_WEEKS)
    purge_after_weeks_count = min(52, purge_after_weeks_count)

    purge_before_date = now() - timedelta(days=purge_after_weeks_count * 7)

    LOGGER.info(u'Done counting (took %s of pure SQL joy), starting procedure.',
                naturaldelta(pytime.time() - start_time))

    with benchmark(u"Check {0}/{1} duplicates".format(limit or u'all',
                   total_dupes_count)):

        try:
            for duplicate in duplicates.iterator():
                reads = duplicate.reads.all()

                processed_dupes += 1

                if reads.exists():
                    done_dupes_count  += 1
                    reads_count        = reads.count()
                    total_reads_count += reads_count

                    LOGGER.info(u'Duplicate %s #%s still has %s reads, fixing…',
                                duplicate._meta.model.__name__,
                                duplicate.id, reads_count)

                    duplicate.duplicate_of.register_duplicate(
                        duplicate, force=duplicate.duplicate_status
                        == DUPLICATE_STATUS.FINISHED)

                if duplicate.duplicate_status == DUPLICATE_STATUS.FINISHED:
                    #
                    # TODO: check that no new dependencies appeared (race
                    #       conditions) between the moment the duplicate
                    #       was marked as such and now.

                    if duplicate.date_created < purge_before_date:
                        try:
                            with transaction.atomic():
                                duplicate.delete()
                        except:
                            LOGGER.exception(u'Exception while deleting '
                                             u'duplicate %s #%s',
                                             duplicate._meta.model.__name__,
                                             duplicate.id)
                        else:
                            # Count and log the purge only when the delete
                            # actually succeeded.
                            purged_dupes_count += 1
                            LOGGER.info(
                                u'Purged duplicate %s #%s from database.',
                                duplicate._meta.model.__name__,
                                duplicate.id)

                elif duplicate.duplicate_status in (
                        DUPLICATE_STATUS.NOT_REPLACED,
                        DUPLICATE_STATUS.FAILED):
                    # Something went wrong; perhaps the
                    # task was purged before being run.
                    duplicate.duplicate_of.register_duplicate(duplicate)
                    done_dupes_count += 1

                elif duplicate.duplicate_status is None:
                    # Something went very wrong. If the article is a known
                    # duplicate, its status field should have been set to
                    # at least NOT_REPLACED.
                    duplicate.duplicate_of.register_duplicate(duplicate)
                    done_dupes_count += 1

                    LOGGER.error(u'Corrected duplicate %s #%s found with no '
                                 u'status.', duplicate._meta.model.__name__,
                                 duplicate.id)

                if limit and processed_dupes >= limit:
                    break

        finally:
            my_lock.release()

    LOGGER.info(u'global_duplicates_checker(): %s/%s duplicates processed '
                u'(%.2f%%; limit: %s), %s corrected (%.2f%%), '
                u'%s purged (%.2f%%); %s reads altered.',
                processed_dupes, total_dupes_count,
                processed_dupes * 100.0 / total_dupes_count,
                limit or u'none',
                done_dupes_count,
                (done_dupes_count * 100.0 / processed_dupes)
                if processed_dupes else 0.0,
                purged_dupes_count,
                (purged_dupes_count * 100.0 / processed_dupes)
                if processed_dupes else 0.0,
                total_reads_count)
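
# Illustrative sketch (not from the original module): the summary above guards
# two of its percentages with `if processed_dupes else 0.0`, but divides by
# total_dupes_count unguarded, which would raise ZeroDivisionError when there
# are no duplicates at all. A tiny helper makes every ratio safe; the name is
# hypothetical.
def safe_percentage(part, whole):
    return (part * 100.0 / whole) if whole else 0.0


assert safe_percentage(5, 20) == 25.0
assert safe_percentage(5, 0) == 0.0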
Beispiel #38
0
def global_users_checker(limit=None, extended_check=False, force=False,
                         verbose=False, break_on_exception=False):
    """ Check all Users and their dependancies.

    Can be disabled by ``config.CHECK_USERS_DISABLED`` directive.

    :param limit: integer, the maximum number of users to check.
        Default: none.
    :param extended_check: boolean, default ``False``. Forwarded
        to :func:`check_one_user`.
    :param force: boolean, default ``False``, allows bypassing and
        reacquiring the lock.
    :param verbose: boolean, default ``False``. Forwarded
        to :func:`check_one_user`.
    :param break_on_exception: boolean, default ``False``, currently
        ignored in this function.
    """

    if config.CHECK_USERS_DISABLED:
        LOGGER.warning(u'Users check disabled in configuration.')
        return

    # This task runs twice a day. Acquire the lock for just a
    # little more time (13h, because Redis doesn't like floats)
    # to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_users', expire_time=3600 * 13)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing users check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_users_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_USERS_LIMIT

    active_users      = User.objects.filter(is_active=True)
    total_users_count = active_users.count()
    processed_users   = 0
    changed_users     = 0
    skipped_count     = 0

    with benchmark(u"Check {0}/{1} users".format(limit or u'all',
                   total_users_count)):
        try:
            for user in active_users.iterator():

                processed_users += 1

                if limit and processed_users > limit:
                    break

                check_one_user(user, extended_check=extended_check,
                               force=force, verbose=verbose)

        finally:
            my_lock.release()

    LOGGER.info(u'global_users_checker(): %s/%s users processed '
                u'(%.2f%%), %s corrected (%.2f%%), %s skipped (%.2f%%).',
                processed_users, total_users_count,
                processed_users * 100.0 / total_users_count,
                changed_users,
                changed_users * 100.0 / processed_users,
                skipped_count,
                skipped_count * 100.0 / processed_users)
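
# Illustrative sketch (standard library only, names hypothetical): a hard limit
# on a lazy queryset iterator can also be expressed with itertools.islice
# instead of a manual counter-and-break, which sidesteps the risk of testing a
# counter that is never incremented.
from itertools import islice


def limited(iterable, limit=None):
    return islice(iterable, limit) if limit else iterable


assert list(limited(range(10), 3)) == [0, 1, 2]
assert list(limited(range(3))) == [0, 1, 2]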
Beispiel #39
0
def refresh_all_feeds(limit=None, force=False):
    u""" Refresh all feeds (RSS/Mail/Twitter…). """

    if config.FEED_FETCH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Feed refresh disabled in configuration.')
        return

    # As FEED_GLOBAL_REFRESH_INTERVAL is dynamically modifiable,
    # we should re-evaluate it each time we run.
    this_round_expire_time = (
        config.FEED_GLOBAL_REFRESH_INTERVAL * 60
        - config.FEED_GLOBAL_REFRESH_INTERVAL
    )

    # Be sure two refresh operations don't overlap, but don't hold the
    # lock too long if something goes wrong. In production conditions
    # as of 20130812, refreshing all feeds takes only a moment:
    # [2013-08-12 09:07:02,028: INFO/MainProcess] Task
    #       oneflow.core.tasks.refresh_all_feeds succeeded in 1.99886608124s.
    #
    my_lock = RedisExpiringLock(
        REFRESH_ALL_FEEDS_LOCK_NAME,
        expire_time=this_round_expire_time
    )

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing all feed refresh…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'refresh_all_feeds() is already locked, aborting.')
            return

    # This should bring us a Polymorphic Query to refresh all feed types.
    feeds = BaseFeed.objects.filter(is_active=True,
                                    is_internal=False).order_by(
                                        'date_last_fetch')

    if limit:
        feeds = feeds[:limit]

    with benchmark('refresh_all_feeds()'):

        try:
            count = 0
            mynow = now()

            for feed in feeds:

                if feed.refresh_lock.is_locked():
                    # The refresh task launched before its expiration and is
                    # still (perhaps long-) running while we want to launch
                    # another. Skip it: the new one would exit immediately
                    # because date_last_fetch is too recent.
                    LOGGER.debug(u'Feed %s already locked, skipped.', feed)
                    continue

                if feed.date_last_fetch is None:

                    basefeed_refresh_task.apply_async(
                        args=(feed.id, ),

                        # in `this_round_expire_time`, we will relaunch it
                        # anyway, so don't clutter the queue with double work.
                        expire=this_round_expire_time,
                    )

                    LOGGER.info(u'Launched immediate refresh of feed %s which '
                                u'has never been refreshed.', feed)
                    count += 1
                    continue

                if feed.fetch_interval > 86399:
                    interval_days = feed.fetch_interval / 86400
                    interval_seconds = feed.fetch_interval - (
                        interval_days * 86400)

                    interval = timedelta(days=interval_days,
                                         seconds=interval_seconds)

                else:
                    interval = timedelta(seconds=feed.fetch_interval)

                if force or feed.date_last_fetch + interval < mynow:

                    how_late = feed.date_last_fetch + interval - mynow
                    how_late = how_late.days * 86400 + how_late.seconds

                    late = feed.date_last_fetch + interval < mynow

                    basefeed_refresh_task.apply_async(
                        args=(feed.id, ),
                        kwargs={'force': force},
                        expire=this_round_expire_time,
                    )

                    LOGGER.info(u'Launched refresh of feed %s (%s %s).',
                                feed, naturaldelta(how_late),
                                u'late' if late else u'earlier')
                    count += 1

        finally:
            # HEADS UP: in case the system is overloaded and feeds refresh()
            #           tasks don't complete fast enough, the current task
            #           will overload it even more. Thus, we intentionally
            #           don't release the lock, to avoid over-relaunched
            #           global tasks feeding the refresh queue with useless
            #           double/triple/N-uple individual tasks.
            #
            # my_lock.release()
            pass

        LOGGER.info(u'Launched %s refreshes out of %s feed(s) checked.',
                    count, feeds.count())
Beispiel #40
0
def refresh_all_mongo_feeds(limit=None, force=False):
    u""" Refresh all MongoEngine feeds (RSS).

    .. note:: this task should vanish when
        MongoDB → PostgreSQL migration is done.
    """

    if config.FEED_FETCH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Feed refresh disabled in configuration.')
        return

    # Be sure two refresh operations don't overlap, but don't hold the
    # lock too long if something goes wrong. In production conditions
    # as of 20130812, refreshing all feeds takes only a moment:
    # [2013-08-12 09:07:02,028: INFO/MainProcess] Task
    #    oneflow.core.tasks.refresh_all_mongo_feeds succeeded in 1.99886608124s.
    my_lock = RedisExpiringLock(
        'refresh_all_mongo_feeds',
        expire_time=config.FEED_GLOBAL_REFRESH_INTERVAL * 180 - 1)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing all feed refresh…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'refresh_all_mongo_feeds() is already '
                           u'locked, aborting.')
            return

    feeds = MongoFeed.objects.filter(closed__ne=True, is_internal__ne=True)

    if limit:
        feeds = feeds.limit(limit)

    # No need for caching and cluttering CPU/memory for a one-shot thing.
    feeds = feeds.no_cache()

    with benchmark('refresh_all_mongo_feeds()'):

        try:
            count = 0
            mynow = now()

            for feed in feeds:

                if feed.refresh_lock.is_locked():
                    LOGGER.info(u'Feed %s already locked, skipped.', feed)
                    continue

                interval = timedelta(seconds=feed.fetch_interval)

                if feed.last_fetch is None:

                    mongo_feed_refresh_task.delay(feed.id)

                    LOGGER.info(
                        u'Launched immediate refresh of feed %s which '
                        u'has never been refreshed.', feed)

                elif force or feed.last_fetch + interval < mynow:

                    how_late = feed.last_fetch + interval - mynow
                    how_late = how_late.days * 86400 + how_late.seconds

                    countdown = 0
                    mongo_feed_refresh_task.delay(feed.id, force)

                    LOGGER.info(
                        u'%s refresh of feed %s %s (%s late).',
                        u'Scheduled randomized' if countdown else u'Launched',
                        feed, u' in {0}'.format(naturaldelta(countdown))
                        if countdown else u'in the background',
                        naturaldelta(how_late))
                    count += 1

        finally:
            # HEADS UP: see core.tasks.refresh_all_feeds() for note.
            # my_lock.release()
            pass

        LOGGER.info(u'Launched %s refreshes out of %s feed(s) checked.', count,
                    feeds.count())
Beispiel #41
0
def global_orphaned_checker(limit=None, extended_check=False, force=False,
                            verbose=False, break_on_exception=False):
    """ Check all orphaned articles and delete them.

    They will be deleted only if they are duplicates of other orphaned ones,
    and only if the duplication replacement process finished successfully.
    If it failed, the orphan is left in place, to be able to re-run the
    operation later.

    Can be disabled by ``config.CHECK_ORPHANED_DISABLED`` directive.

    :param limit: integer, the maximum number of orphaned articles to check.
        Default: none.
    :param extended_check: boolean, default ``False``, currently
        unused in this function.
    :param force: boolean, default ``False``, allows bypassing and
        reacquiring the lock; also forwarded to ``register_duplicate()``.
    :param verbose: boolean, default ``False``. When ``True``, each deleted
        duplicate orphan is logged.
    :param break_on_exception: boolean, default ``False``, currently
        ignored in this function.
    """

    if config.CHECK_ORPHANED_DISABLED:
        LOGGER.warning(u'Orphaned check disabled in configuration.')
        return

    # This task runs twice a day. Acquire the lock for just a
    # little more time (13h, because Redis doesn't like floats)
    # to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_orphaned', expire_time=3600 * 13)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing orphaned check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_orphaned_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_ORPHANED_LIMIT

    orphaned_items       = Article.objects.orphaned().master()
    orphaned_items_count = orphaned_items.count()
    processed_orphans    = 0
    changed_orphans      = 0
    deleted_orphans      = 0
    skipped_orphans      = 0

    with benchmark(u"Check {0}/{1} orphans".format(limit or u'all',
                   orphaned_items_count)):
        try:
            for orphan in orphaned_items.iterator():
                processed_orphans += 1

                if limit and changed_orphans >= limit:
                    break

                old_url = orphan.url

                new_url = ARTICLE_ORPHANED_BASE + generate_orphaned_hash(
                    orphan.name, orphan.feeds.all())

                if new_url != old_url:
                    orphan.url = new_url
                    orphan.url_absolute = True

                else:
                    if not orphan.url_absolute:
                        changed_orphans += 1
                        orphan.url_absolute = True
                        orphan.save()

                    continue

                try:
                    orphan.save()

                except IntegrityError:
                    master = Article.objects.get(url=orphan.url)

                    # We have to put back the original URL, else the
                    # duplicate registration process will fail.
                    orphan.url = old_url

                    # Register the duplicate right here and now, to be able
                    # to delete the orphan immediately if the replacement
                    # finishes successfully.
                    master.register_duplicate(orphan, force=force,
                                              background=False)

                    # Reload the orphan to get the refreshed duplicate status.
                    orphan = Article.objects.get(id=orphan.id)

                    if orphan.duplicate_status == DUPLICATE_STATUS.FINISHED:
                        orphan.delete()
                        deleted_orphans += 1

                        if verbose:
                            LOGGER.info(u'Deleted duplicate orphan %s', orphan)

                except:
                    skipped_orphans += 1
                    LOGGER.exception(u'Unhandled exception while checking %s',
                                     orphan)

                else:
                    changed_orphans += 1

        finally:
            my_lock.release()

    LOGGER.info(u'global_orphaned_checker(): %s/%s orphans processed '
                u'(%.2f%%), %s corrected (%.2f%%), %s deleted (%.2f%%), '
                u'%s skipped (%.2f%%).',
                processed_orphans, orphaned_items_count,
                processed_orphans * 100.0 / orphaned_items_count,
                changed_orphans,
                changed_orphans * 100.0 / processed_orphans,
                deleted_orphans,
                deleted_orphans * 100.0 / processed_orphans,
                skipped_orphans,
                skipped_orphans * 100.0 / processed_orphans)
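
# Illustrative sketch (pure Python, every name hypothetical): the orphan rewrite
# above leans on the database unique constraint to detect that the regenerated
# URL already exists, then merges the orphan into the existing master. The toy
# version below models the same control flow with a dict playing the role of
# the unique index.
_articles_by_url = {u'orphan://abc': u'master article'}


def rehash_orphan(old_url, new_url):
    if new_url in _articles_by_url:
        # Same situation as the IntegrityError branch above: keep the old URL
        # and merge into the already-existing article.
        return (u'merged into', _articles_by_url[new_url])

    _articles_by_url[new_url] = _articles_by_url.pop(old_url, u'orphan article')
    return (u'renamed to', new_url)


assert rehash_orphan(u'orphan://xyz', u'orphan://abc')[0] == u'merged into'
assert rehash_orphan(u'orphan://xyz', u'orphan://def')[0] == u'renamed to'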
Beispiel #42
0
def reprocess_failed_articles(failed=None, expiry=None,
                              limit=None, force=False,
                              reprocessing_type=None):
    u""" Reprocess articles that failed absolutization.

    In case there was a temporary error, this could lead to more good articles.
    """

    if config.ARTICLE_REPROCESSING_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Articles reprocess disabled in configuration.')
        return

    if failed is None:
        raise RuntimeError(u'Need a queryset of failed items to reprocess.')

    # TODO: as the celery tasks expire,
    # the lock is probably not needed anymore.

    my_lock = RedisExpiringLock(
        'reprocess_failed_articles_' + str(expiry),
        expire_time=expiry
    )

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing failed articles reprocessing…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'reprocess_failed_articles() is already locked, '
                           u'aborting.')
            return

    failed_count = failed.count()

    with benchmark((u'reprocess_failed_articles(expiry=%s): %s '
                    u'processing chains relaunched.')
                   % (naturaldelta(expiry), failed_count)):

        try:
            for article in failed.iterator():

                if reprocessing_type is None:
                    article.url_error = None
                    article.save()

                    article_post_create_task.apply(args=(article.id, ),
                                                   kwargs={'apply_now': True})

                elif reprocessing_type == 'standard':
                    article.process()

        finally:
            # HEADS UP: in case the system is overloaded, we intentionally
            #           don't release the lock, to avoid over-relaunched
            #           global tasks flooding the queue with useless
            #           double/triple/N-uple individual tasks.
            #
            # my_lock.release()
            pass
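
# Illustrative usage sketch: the failure queryset and expiry below are
# assumptions, not taken from the original callers. reprocess_failed_articles()
# only needs a queryset of failed items (here: articles whose URL absolutization
# errored) plus an expiry roughly matching the period of the scheduler that
# re-enqueues the task. Article and reprocess_failed_articles are assumed to be
# importable from the surrounding project.
failed_absolutization = Article.objects.exclude(url_error=None)

reprocess_failed_articles(
    failed=failed_absolutization,
    expiry=3600 * 24,      # hypothetical: retry window of one day
    limit=None,
    force=False,
)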