Example #1
def archive_documents(limit=None, force=False):
    """ Archive all kind of documents that need archiving. """

    if config.DOCUMENTS_ARCHIVING_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Document archiving disabled in configuration.')
        return

    # Be sure two archiving operations don't overlap: this is a very costly
    # operation for the database, and it can make the system very sluggish.
    # The whole operation can take very long, so we lock for a long time.
    my_lock = RedisExpiringLock('archive_documents', expire_time=3600 * 24)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'archive_documents() force unlock/re-acquire, '
                           u'be careful with that.')

        else:
            LOGGER.warning(u'archive_documents() is already locked, aborting.')
            return

    # these are tasks, but we run them sequentially in this global archive job
    # to avoid hammering the production database with multiple archive jobs.
    archive_articles(limit=limit)

    my_lock.release()
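
All the tasks in this listing repeat the same guard: try to acquire the expiring lock, steal it when force=True, abort otherwise, and release it once the work is done. Below is a minimal sketch of that pattern as a reusable helper, assuming only the acquire()/release() API visible in the examples; the helper name acquire_or_force is hypothetical and not part of the 1flow code base.

def acquire_or_force(lock, force=False, task_name=u'task'):
    """ Return True when the lock is held, False when the caller should abort. """

    if lock.acquire():
        return True

    if force:
        # Steal the stale lock: release it, then re-acquire it.
        lock.release()
        lock.acquire()
        LOGGER.warning(u'%s: forced unlock/re-acquire, be careful with that.',
                       task_name)
        return True

    LOGGER.warning(u'%s is already locked, aborting.', task_name)
    return False

With such a helper, archive_documents() could start with a single
"if not acquire_or_force(my_lock, force, u'archive_documents()'): return".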
Example #2
def synchronize_statsd_gauges(full=False, force=False):
    """ Synchronize all counters to statsd. """

    # from oneflow.core.stats import (
    #     synchronize_mongodb_statsd_articles_gauges,
    #     synchronize_mongodb_statsd_tags_gauges,
    #     synchronize_mongodb_statsd_websites_gauges,
    #     synchronize_mongodb_statsd_authors_gauges,
    # )

    from oneflow.core.dbstats import (
        synchronize_statsd_articles_gauges,
        synchronize_statsd_tags_gauges,
        synchronize_statsd_websites_gauges,
        synchronize_statsd_authors_gauges,
        synchronize_statsd_feeds_gauges,
        synchronize_statsd_subscriptions_gauges,
        synchronize_statsd_reads_gauges,
    )

    my_lock = RedisExpiringLock(SYNCHRONIZE_STATSD_LOCK_NAME, expire_time=3600)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing statsd gauges synchronization…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'synchronize_statsd_gauges() is already locked, '
                           u'aborting.')
            return

    # with benchmark('synchronize_mongodb_statsd_gauges()'):
    #     try:
    #         synchronize_mongodb_statsd_articles_gauges(full=full)
    #         synchronize_mongodb_statsd_tags_gauges(full=full)
    #         synchronize_mongodb_statsd_websites_gauges(full=full)
    #         synchronize_mongodb_statsd_authors_gauges(full=full)
    #     except:
    #         LOGGER.exception(u'MongoDB stats failed at some point')

    with benchmark('synchronize_statsd_gauges()'):

        try:
            synchronize_statsd_articles_gauges(full=full)
            synchronize_statsd_tags_gauges(full=full)
            synchronize_statsd_websites_gauges(full=full)
            synchronize_statsd_authors_gauges(full=full)
            synchronize_statsd_feeds_gauges(full=full)
            synchronize_statsd_subscriptions_gauges(full=full)
            synchronize_statsd_reads_gauges(full=full)

        finally:
            my_lock.release()
Example #3
def archive_documents(limit=None, force=False):
    """ Archive all kind of documents that need archiving. """

    if config.DOCUMENTS_ARCHIVING_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Document archiving disabled in configuration.')
        return

    # Be sure two archiving operations don't overlap: this is a very costly
    # operation for the database, and it can make the system very sluggish.
    # The whole operation can take very long, so we lock for a long time.
    my_lock = RedisExpiringLock('archive_documents', expire_time=3600 * 24)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'archive_documents() force unlock/re-acquire, '
                           u'be careful with that.')

        else:
            LOGGER.warning(u'archive_documents() is already locked, aborting.')
            return

    # these are tasks, but we run them sequentially in this global archive job
    # to avoid hammering the production database with multiple archive jobs.
    archive_articles(limit=limit)

    my_lock.release()
Example #4
    def update_recent_items_count(self, force=False):
        """ This task is protected to run only once per day,
            even if is called more. """

        urac_lock = RedisExpiringLock(self,
                                      lock_name='urac',
                                      expire_time=86100)

        if urac_lock.acquire() or force:
            self.recent_items_count = self.recent_items.count()

        elif not force:
            LOGGER.warning(
                u'No more than one update_recent_items_count '
                u'per day (feed %s).', self)
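
The 86100-second expiry is what actually enforces the "once per day" promise of the docstring: it is 23 hours and 55 minutes, just short of a full day, so the lock frees up a few minutes before the next daily run. A quick check of that arithmetic:

# 86100 seconds = 23 hours + 55 minutes, slightly less than one day.
assert 86100 == 23 * 3600 + 55 * 60
assert 86400 - 86100 == 300  # the lock expires 5 minutes early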
Example #5
    def refresh_lock(self):
        try:
            return self.__refresh_lock

        except AttributeError:
            self.__refresh_lock = RedisExpiringLock(
                self,
                lock_name='fetch',
                expire_time=self.REFRESH_LOCK_INTERVAL or self.fetch_interval)
            return self.__refresh_lock
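
This example and the two that follow use the same lazy-initialization idiom: the lock is built on first access, cached on the instance, and every later property read returns the same RedisExpiringLock. The sketch below is an equivalent rewrite using getattr() with a default, under the assumption that the RedisExpiringLock constructor signature is the one shown above; the cache attribute name is hypothetical.

    def refresh_lock(self):
        # Equivalent lazy caching written with getattr(); behaves like the
        # try/except AttributeError version above.
        lock = getattr(self, '_refresh_lock_cache', None)

        if lock is None:
            lock = RedisExpiringLock(
                self,
                lock_name='fetch',
                expire_time=self.REFRESH_LOCK_INTERVAL or self.fetch_interval)
            self._refresh_lock_cache = lock

        return lock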
Example #6
    def refresh_lock(self):
        try:
            return self.__refresh_lock

        except AttributeError:
            self.__refresh_lock = RedisExpiringLock(
                self,
                lock_name='account_fetch',
            )
            return self.__refresh_lock
Example #7
    def sync_lock(self):

        try:
            return self._sync_lock_

        except AttributeError:
            self._sync_lock_ = RedisExpiringLock(self,
                                                 lock_name='sync',
                                                 expire_time=86100)
            return self._sync_lock_
Example #8
def refresh_all_mailaccounts(force=False):
    """ Check all unusable e-mail accounts. """

    if config.MAIL_ACCOUNT_REFRESH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'E-mail accounts check disabled in configuration.')
        return

    accounts = MailAccount.objects.unusable()

    my_lock = RedisExpiringLock(REFRESH_ALL_MAILACCOUNTS_LOCK_NAME,
                                expire_time=30 * (accounts.count() + 2))

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing check of email accounts…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'refresh_all_mailaccounts() is already locked, '
                           u'aborting.')
            return

    with benchmark('refresh_all_mailaccounts()'):

        try:
            for account in accounts:
                try:
                    account.test_connection()
                    account.update_mailboxes()

                except:
                    pass

        finally:
            my_lock.release()

        LOGGER.info(
            u'Launched %s checks on unusable accounts out of %s total.',
            accounts.count(),
            MailAccount.objects.all().count())
Example #9
def refresh_all_mailaccounts(force=False):
    """ Check all unusable e-mail accounts. """

    if config.MAIL_ACCOUNT_REFRESH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'E-mail accounts check disabled in configuration.')
        return

    accounts = MailAccount.objects.unusable()

    my_lock = RedisExpiringLock(REFRESH_ALL_MAILACCOUNTS_LOCK_NAME,
                                expire_time=30 * (accounts.count() + 2))

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing check of email accounts…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'refresh_all_mailaccounts() is already locked, '
                           u'aborting.')
            return

    with benchmark('refresh_all_mailaccounts()'):

        try:
            for account in accounts:
                try:
                    account.test_connection()
                    account.update_mailboxes()

                except:
                    pass

        finally:
            my_lock.release()

        LOGGER.info(u'Launched %s checks on unusable accounts out of %s total.',
                    accounts.count(), MailAccount.objects.all().count())
Example #10
def global_duplicates_checker(limit=None, force=False):
    """ Check that duplicate articles have no more Reads anywhere.

    Fix it if not, and update all counters accordingly.

    :param limit: integer, the maximum number of duplicates to check.
        Default: none.
    :param force: boolean, default ``False``, allows bypassing and
        reacquiring the lock.
    """

    if config.CHECK_DUPLICATES_DISABLED:
        LOGGER.warning(u'Duplicates check disabled in configuration.')
        return

    # This task runs once a day. Acquire the lock for just a
    # little more time to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_duplicates', expire_time=3600 * 25)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing duplicates check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_duplicates_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_DUPLICATES_LIMIT

    start_time = pytime.time()
    duplicates = BaseItem.objects.duplicate()

    total_dupes_count = duplicates.count()
    total_reads_count = 0
    processed_dupes = 0
    done_dupes_count = 0
    purged_dupes_count = 0

    purge_after_weeks_count = max(1, config.CHECK_DUPLICATES_PURGE_AFTER_WEEKS)
    purge_after_weeks_count = min(52, purge_after_weeks_count)

    purge_before_date = now() - timedelta(days=purge_after_weeks_count * 7)

    LOGGER.info(
        u'Done counting (took %s of pure SQL joy), starting procedure.',
        naturaldelta(pytime.time() - start_time))

    with benchmark(u"Check {0}/{1} duplicates".format(limit or u'all',
                                                      total_dupes_count)):

        try:
            for duplicate in duplicates.iterator():
                reads = duplicate.reads.all()

                processed_dupes += 1

                if reads.exists():
                    done_dupes_count += 1
                    reads_count = reads.count()
                    total_reads_count += reads_count

                    LOGGER.info(
                        u'Duplicate %s #%s still has %s reads, fixing…',
                        duplicate._meta.model.__name__, duplicate.id,
                        reads_count)

                    duplicate.duplicate_of.register_duplicate(
                        duplicate,
                        force=duplicate.duplicate_status ==
                        DUPLICATE_STATUS.FINISHED)

                if duplicate.duplicate_status == DUPLICATE_STATUS.FINISHED:
                    #
                    # TODO: check we didn't get new race-condition
                    #       dependencies between the moment the duplicate
                    #       was marked as such and now.

                    if duplicate.date_created < purge_before_date:
                        try:
                            with transaction.atomic():
                                duplicate.delete()
                        except:
                            LOGGER.exception(
                                u'Exception while deleting '
                                u'duplicate %s #%s',
                                duplicate._meta.model.__name__, duplicate.id)

                        else:
                            purged_dupes_count += 1
                            LOGGER.info(
                                u'Purged duplicate %s #%s from database.',
                                duplicate._meta.model.__name__,
                                duplicate.id)

                elif duplicate.duplicate_status in (
                        DUPLICATE_STATUS.NOT_REPLACED,
                        DUPLICATE_STATUS.FAILED):
                    # Something went wrong; perhaps the
                    # task was purged before being run.
                    duplicate.duplicate_of.register_duplicate(duplicate)
                    done_dupes_count += 1

                elif duplicate.duplicate_status is None:
                    # Something went very wrong. If the article is a known
                    # duplicate, its status field should have been set to
                    # at least NOT_REPLACED.
                    duplicate.duplicate_of.register_duplicate(duplicate)
                    done_dupes_count += 1

                    LOGGER.error(
                        u'Corrected duplicate %s #%s found with no '
                        u'status.', duplicate._meta.model.__name__,
                        duplicate.id)

                if limit and processed_dupes >= limit:
                    break

        finally:
            my_lock.release()

    LOGGER.info(
        u'global_duplicates_checker(): %s/%s duplicates processed '
        u'(%.2f%%; limit: %s), %s corrected (%.2f%%), '
        u'%s purged (%.2f%%); %s reads altered.', processed_dupes,
        total_dupes_count, processed_dupes * 100.0 / total_dupes_count, limit
        or u'none', done_dupes_count,
        (done_dupes_count * 100.0 /
         processed_dupes) if processed_dupes else 0.0, purged_dupes_count,
        (purged_dupes_count * 100.0 /
         processed_dupes) if processed_dupes else 0.0, total_reads_count)
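
One caveat with the closing log line: the divisions by processed_dupes are guarded, but the division by total_dupes_count is not, so the call would raise ZeroDivisionError when there are no duplicates at all. A defensive variant of that first percentage, as a sketch rather than a fix taken from the project:

# Hypothetical guard: avoid dividing by zero when no duplicates exist.
processed_pct = (processed_dupes * 100.0 / total_dupes_count) \
    if total_dupes_count else 0.0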
Example #11
def global_subscriptions_checker(force=False,
                                 limit=None,
                                 from_feeds=True,
                                 from_users=False,
                                 extended_check=False):
    """ A conditionned version of :meth:`Feed.check_subscriptions`. """

    if config.CHECK_SUBSCRIPTIONS_DISABLED:
        LOGGER.warning(u'Subscriptions checks disabled in configuration.')
        return

    # This task runs once a day. Acquire the lock for just a
    # little more time to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_subscriptions',
                                expire_time=3600 * 25)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing subscriptions checks…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_subscriptions_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_SUBSCRIPTIONS_LIMIT

    assert int(limit) >= 0

    try:
        if from_feeds:
            with benchmark("Check all subscriptions from feeds"):

                # We check ALL feeds (including inactive ones) to be
                # sure all subscriptions / reads are up-to-date.
                feeds = BaseFeed.objects.all()
                feeds_count = feeds.count()
                processed_count = 0
                checked_count = 0

                for feed in feeds.iterator():

                    if limit and checked_count > limit:
                        break

                    if extended_check:
                        feed.compute_cached_descriptors()
                        # all=True, good=True, bad=True

                    # Do not pass extended_check=True here: it would duplicate
                    # the subscription.check_reads() already called below.
                    feed.check_subscriptions()

                    for subscription in feed.subscriptions.all().iterator():

                        processed_count += 1

                        if subscription.all_items_count \
                                != feed.good_items_count:

                            checked_count += 1

                            LOGGER.info(
                                u'Subscription %s (#%s) has %s reads '
                                u'whereas its feed has %s good '
                                u'articles; checking…', subscription.name,
                                subscription.id, subscription.all_items_count,
                                feed.good_items_count)

                            subscription.check_reads(
                                extended_check=extended_check, force=True)

                LOGGER.info(
                    u'%s/%s (limit:%s) feeds processed, %s '
                    u'checked (%.2f%%).', processed_count, feeds_count, limit,
                    checked_count, checked_count * 100.0 / processed_count)

        if from_users:
            with benchmark("Check all subscriptions from users"):

                users = User.objects.filter(is_active=True)
                users_count = users.count()
                processed_count = 0

                for user in users:

                    # Do not pass extended_check=True here: it would duplicate
                    # the subscription.check_reads() already called below.
                    user.check_subscriptions()

                    if extended_check:
                        user.user_counters.compute_cached_descriptors()
                        # all=True, unread=True, starred=True, bookmarked=True

                        for subscription in \
                                user.subscriptions.all().iterator():
                            processed_count += 1

                            subscription.check_reads(extended_check=True,
                                                     force=True)

                LOGGER.info(
                    u'%s users %sprocessed. '
                    u'All were checked.', users_count,
                    u'and {0} subscriptions '.format(processed_count)
                    if extended_check else u'')

    finally:
        my_lock.release()
Example #12
def refresh_all_feeds(limit=None, force=False):
    u""" Refresh all feeds (RSS/Mail/Twitter…). """

    if config.FEED_FETCH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Feed refresh disabled in configuration.')
        return

    # As FEED_GLOBAL_REFRESH_INTERVAL is dynamically modifiable,
    # we should re-evaluate it each time we run.
    this_round_expire_time = (
        config.FEED_GLOBAL_REFRESH_INTERVAL * 60
        - config.FEED_GLOBAL_REFRESH_INTERVAL
    )

    # Be sure two refresh operations don't overlap, but don't hold the
    # lock too long if something goes wrong. In production conditions
    # as of 20130812, refreshing all feeds takes only a moment:
    # [2013-08-12 09:07:02,028: INFO/MainProcess] Task
    #       oneflow.core.tasks.refresh_all_feeds succeeded in 1.99886608124s.
    #
    my_lock = RedisExpiringLock(
        REFRESH_ALL_FEEDS_LOCK_NAME,
        expire_time=this_round_expire_time
    )

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing all feed refresh…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'refresh_all_feeds() is already locked, aborting.')
            return

    # This should bring us a Polymorphic Query to refresh all feeds types.
    feeds = BaseFeed.objects.filter(is_active=True,
                                    is_internal=False).order_by(
                                        'date_last_fetch')

    if limit:
        feeds = feeds[:limit]

    with benchmark('refresh_all_feeds()'):

        try:
            count = 0
            mynow = now()

            for feed in feeds:

                if feed.refresh_lock.is_locked():
                    # The refresh task launched before its expiration and is
                    # still [long] running while we want to launch another.
                    # Avoid this, because the new one would exit immediately
                    # since date_last_fetch is too recent.
                    LOGGER.debug(u'Feed %s already locked, skipped.', feed)
                    continue

                if feed.date_last_fetch is None:

                    basefeed_refresh_task.apply_async(
                        args=(feed.id, ),

                        # in `this_round_expire_time`, we will relaunch it
                        # anyway, so don't clutter the queue with double work.
                        expire=this_round_expire_time,
                    )

                    LOGGER.info(u'Launched immediate refresh of feed %s which '
                                u'has never been refreshed.', feed)
                    count += 1
                    continue

                if feed.fetch_interval > 86399:
                    interval_days = feed.fetch_interval / 86400
                    interval_seconds = feed.fetch_interval - (
                        interval_days * 86400)

                    interval = timedelta(days=interval_days,
                                         seconds=interval_seconds)

                else:
                    interval = timedelta(seconds=feed.fetch_interval)

                if force or feed.date_last_fetch + interval < mynow:

                    how_late = feed.date_last_fetch + interval - mynow
                    how_late = how_late.days * 86400 + how_late.seconds

                    late = feed.date_last_fetch + interval < mynow

                    basefeed_refresh_task.apply_async(
                        args=(feed.id, ),
                        kwargs={'force': force},
                        expire=this_round_expire_time,
                    )

                    LOGGER.info(u'Launched refresh of feed %s (%s %s).',
                                feed, naturaldelta(how_late),
                                u'late' if late else u'earlier')
                    count += 1

        finally:
            # HEADS UP: in case the system is overloaded and feeds refresh()
            #           tasks don't complete fast enough, the current task
            #           will overload it even more. Thus, we intentionally
            #           don't release the lock, to avoid over-re-launched
            #           global tasks feeding the refresh queue with useless
            #           double-triple-Nble individual tasks.
            #
            # my_lock.release()
            pass

        LOGGER.info(u'Launched %s refreshes out of %s feed(s) checked.',
                    count, feeds.count())
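
The day/second split applied when fetch_interval exceeds 86399 is only cosmetic: datetime.timedelta normalizes large second counts into days by itself, so passing the raw value straight to timedelta(seconds=...) gives the same interval. A small standard-library check:

from datetime import timedelta

# timedelta normalizes seconds into days, so both spellings are equal.
assert timedelta(seconds=172800) == timedelta(days=2)
assert timedelta(days=1, seconds=3600) == timedelta(seconds=90000)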
Example #13
def throttle_feed_refresh(force=False):
    u""" Be sure we don't overflow queues uselessly. """

    if config.FEED_FETCH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Feed refresh disabled in configuration.')
        return

    my_lock = RedisExpiringLock(
        THROTTLE_REFRESH_LOCK_NAME,
        expire_time=58
    )

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing feed refresh throttling…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'throttle_feed_refresh() is already locked, '
                           u'aborting.')
            return

    queues = {
        q['name']: q['backing_queue_status']['len']
        for q in rabbitmq_queues()
    }

    relations = {
        r[0]: r[1] for r in postgresql_relations_sizes()
    }

    feed_qitems = queues['refresh']

    feeds_count = relations[BaseFeed._meta.db_table]

    low_limit = feeds_count / 5

    try:
        if feed_qitems > feeds_count:

            try:
                refresh_all_feeds.lock.release()

            except:
                pass

            refresh_all_feeds.lock.acquire()

            LOGGER.warning(u'Throttled feed refresh because the number of '
                           u'queue items is getting too high (%s > %s).',
                           feed_qitems, feeds_count)

        elif feed_qitems < low_limit:
            # Unleash the kraken!

            try:
                refresh_all_feeds.lock.release()

            except:
                pass

            LOGGER.info(u'Unthrottled feed refreshes, the number of queue '
                        u'items is low enough (%s for %s feeds).',
                        feed_qitems, feeds_count)

        else:
            LOGGER.debug(u'Not throttled, %s < items(%s) <= feeds(%s).',
                         low_limit, feed_qitems, feeds_count)

    finally:
        my_lock.release()
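
throttle_feed_refresh() acquires and releases refresh_all_feeds.lock, an expiring lock hung directly on the task function; Example #23 shows the same trick applied to throttle_feed_refresh itself. The line below mirrors that pattern to show how refresh_all_feeds presumably gets its lock at module level; it is an assumption, not a verified excerpt from the project.

# Assumed to sit at module level next to the task definition, mirroring
# the `throttle_feed_refresh.lock = ...` line shown in Example #23.
refresh_all_feeds.lock = RedisExpiringLock(REFRESH_ALL_FEEDS_LOCK_NAME)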
Example #14
File: mongo.py Project: 1flow/1flow
def refresh_all_mongo_feeds(limit=None, force=False):
    u""" Refresh all MongoEngine feeds (RSS).

    .. note:: this task should vanish when
        MongoDB → PostgreSQL migration is done.
    """

    if config.FEED_FETCH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Feed refresh disabled in configuration.')
        return

    # Be sure two refresh operations don't overlap, but don't hold the
    # lock too long if something goes wrong. In production conditions
    # as of 20130812, refreshing all feeds takes only a moment:
    # [2013-08-12 09:07:02,028: INFO/MainProcess] Task
    #    oneflow.core.tasks.refresh_all_mongo_feeds succeeded in 1.99886608124s.
    my_lock = RedisExpiringLock(
        'refresh_all_mongo_feeds',
        expire_time=config.FEED_GLOBAL_REFRESH_INTERVAL * 180 - 1

    )

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing all feed refresh…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'refresh_all_mongo_feeds() is already '
                           u'locked, aborting.')
            return

    feeds = MongoFeed.objects.filter(closed__ne=True, is_internal__ne=True)

    if limit:
        feeds = feeds.limit(limit)

    # No need for caching and cluttering CPU/memory for a one-shot thing.
    feeds.no_cache()

    with benchmark('refresh_all_mongo_feeds()'):

        try:
            count = 0
            mynow = now()

            for feed in feeds:

                if feed.refresh_lock.is_locked():
                    LOGGER.info(u'Feed %s already locked, skipped.', feed)
                    continue

                interval = timedelta(seconds=feed.fetch_interval)

                if feed.last_fetch is None:

                    mongo_feed_refresh_task.delay(feed.id)

                    LOGGER.info(u'Launched immediate refresh of feed %s which '
                                u'has never been refreshed.', feed)

                elif force or feed.last_fetch + interval < mynow:

                    how_late = feed.last_fetch + interval - mynow
                    how_late = how_late.days * 86400 + how_late.seconds

                    countdown = 0
                    mongo_feed_refresh_task.delay(feed.id, force)

                    LOGGER.info(u'%s refresh of feed %s %s (%s late).',
                                u'Scheduled randomized'
                                if countdown else u'Launched',
                                feed,
                                u' in {0}'.format(naturaldelta(countdown))
                                if countdown else u'in the background',
                                naturaldelta(how_late))
                    count += 1

        finally:
            # HEADS UP: see core.tasks.refresh_all_feeds() for note.
            # my_lock.release()
            pass

        LOGGER.info(u'Launched %s refreshes out of %s feed(s) checked.',
                    count, feeds.count())
Example #15
def reprocess_failed_articles(failed=None,
                              expiry=None,
                              limit=None,
                              force=False,
                              reprocessing_type=None):
    u""" Reprocess articles that failed absolutization.

    In case there was a temporary error, this could lead to more good articles.
    """

    if config.ARTICLE_REPROCESSING_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Article reprocessing disabled in configuration.')
        return

    if failed is None:
        raise RuntimeError(u'Need a queryset of failed items to reprocess.')

    # TODO: as the Celery task expires,
    # the lock is probably not needed anymore.

    my_lock = RedisExpiringLock('reprocess_failed_articles_' + str(expiry),
                                expire_time=expiry)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing failed articles reprocessing…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'reprocess_failed_articles() is already locked, '
                           u'aborting.')
            return

    failed_count = failed.count()

    with benchmark((u'Reprocess_failed_articles(expiry=%s): %s '
                    u' processing chains relaunched.') %
                   (naturaldelta(expiry), failed_count)):

        try:
            for article in failed.iterator():

                if reprocessing_type is None:
                    article.url_error = None
                    article.save()

                    article_post_create_task.apply(args=(article.id, ),
                                                   kwargs={'apply_now': True})

                elif reprocessing_type == 'standard':
                    article.process()

        finally:
            # HEADS UP: in case the system is overloaded, we intentionally
            #           don't release the lock, to avoid over-re-launched
            #           global tasks flooding the queue with useless
            #           double-triple-Nble individual tasks.
            #
            # my_lock.release()
            pass
Example #16
def refresh_all_mongo_feeds(limit=None, force=False):
    u""" Refresh all MongoEngine feeds (RSS).

    .. note:: this task should vanish when
        MongoDB → PostgreSQL migration is done.
    """

    if config.FEED_FETCH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Feed refresh disabled in configuration.')
        return

    # Be sure two refresh operations don't overlap, but don't hold the
    # lock too long if something goes wrong. In production conditions
    # as of 20130812, refreshing all feeds takes only a moment:
    # [2013-08-12 09:07:02,028: INFO/MainProcess] Task
    #    oneflow.core.tasks.refresh_all_mongo_feeds succeeded in 1.99886608124s.
    my_lock = RedisExpiringLock(
        'refresh_all_mongo_feeds',
        expire_time=config.FEED_GLOBAL_REFRESH_INTERVAL * 180 - 1)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing all feed refresh…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'refresh_all_mongo_feeds() is already '
                           u'locked, aborting.')
            return

    feeds = MongoFeed.objects.filter(closed__ne=True, is_internal__ne=True)

    if limit:
        feeds = feeds.limit(limit)

    # No need for caching and cluttering CPU/memory for a one-shot thing.
    feeds.no_cache()

    with benchmark('refresh_all_mongo_feeds()'):

        try:
            count = 0
            mynow = now()

            for feed in feeds:

                if feed.refresh_lock.is_locked():
                    LOGGER.info(u'Feed %s already locked, skipped.', feed)
                    continue

                interval = timedelta(seconds=feed.fetch_interval)

                if feed.last_fetch is None:

                    mongo_feed_refresh_task.delay(feed.id)

                    LOGGER.info(
                        u'Launched immediate refresh of feed %s which '
                        u'has never been refreshed.', feed)

                elif force or feed.last_fetch + interval < mynow:

                    how_late = feed.last_fetch + interval - mynow
                    how_late = how_late.days * 86400 + how_late.seconds

                    countdown = 0
                    mongo_feed_refresh_task.delay(feed.id, force)

                    LOGGER.info(
                        u'%s refresh of feed %s %s (%s late).',
                        u'Scheduled randomized' if countdown else u'Launched',
                        feed, u' in {0}'.format(naturaldelta(countdown))
                        if countdown else u'in the background',
                        naturaldelta(how_late))
                    count += 1

        finally:
            # HEADS UP: see core.tasks.refresh_all_feeds() for note.
            # my_lock.release()
            pass

        LOGGER.info(u'Launched %s refreshes out of %s feed(s) checked.', count,
                    feeds.count())
Example #17
def global_duplicates_checker(limit=None, force=False):
    """ Check that duplicate articles have no more Reads anywhere.

    Fix it if not, and update all counters accordingly.

    :param limit: integer, the maximum number of duplicates to check.
        Default: none.
    :param force: boolean, default ``False``, allows bypassing and
        reacquiring the lock.
    """

    if config.CHECK_DUPLICATES_DISABLED:
        LOGGER.warning(u'Duplicates check disabled in configuration.')
        return

    # This task runs once a day. Acquire the lock for just a
    # little more time to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_duplicates', expire_time=3600 * 25)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing duplicates check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_duplicates_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_DUPLICATES_LIMIT

    start_time = pytime.time()
    duplicates = BaseItem.objects.duplicate()

    total_dupes_count  = duplicates.count()
    total_reads_count  = 0
    processed_dupes    = 0
    done_dupes_count   = 0
    purged_dupes_count = 0

    purge_after_weeks_count = max(1, config.CHECK_DUPLICATES_PURGE_AFTER_WEEKS)
    purge_after_weeks_count = min(52, purge_after_weeks_count)

    purge_before_date = now() - timedelta(days=purge_after_weeks_count * 7)

    LOGGER.info(u'Done counting (took %s of pure SQL joy), starting procedure.',
                naturaldelta(pytime.time() - start_time))

    with benchmark(u"Check {0}/{1} duplicates".format(limit or u'all',
                   total_dupes_count)):

        try:
            for duplicate in duplicates.iterator():
                reads = duplicate.reads.all()

                processed_dupes += 1

                if reads.exists():
                    done_dupes_count  += 1
                    reads_count        = reads.count()
                    total_reads_count += reads_count

                    LOGGER.info(u'Duplicate %s #%s still has %s reads, fixing…',
                                duplicate._meta.model.__name__,
                                duplicate.id, reads_count)

                    duplicate.duplicate_of.register_duplicate(
                        duplicate, force=duplicate.duplicate_status
                        == DUPLICATE_STATUS.FINISHED)

                if duplicate.duplicate_status == DUPLICATE_STATUS.FINISHED:
                    #
                    # TODO: check we didn't get new race-condition
                    #       dependencies between the moment the duplicate
                    #       was marked as such and now.

                    if duplicate.date_created < purge_before_date:
                        try:
                            with transaction.atomic():
                                duplicate.delete()
                        except:
                            LOGGER.exception(u'Exception while deleting '
                                             u'duplicate %s #%s',
                                             duplicate._meta.model.__name__,
                                             duplicate.id)

                        else:
                            purged_dupes_count += 1
                            LOGGER.info(
                                u'Purged duplicate %s #%s from database.',
                                duplicate._meta.model.__name__,
                                duplicate.id)

                elif duplicate.duplicate_status in (
                    DUPLICATE_STATUS.NOT_REPLACED,
                        DUPLICATE_STATUS.FAILED):
                    # Something went wrong; perhaps the
                    # task was purged before being run.
                    duplicate.duplicate_of.register_duplicate(duplicate)
                    done_dupes_count += 1

                elif duplicate.duplicate_status is None:
                    # Something went very wrong. If the article is a known
                    # duplicate, its status field should have been set to
                    # at least NOT_REPLACED.
                    duplicate.duplicate_of.register_duplicate(duplicate)
                    done_dupes_count += 1

                    LOGGER.error(u'Corrected duplicate %s #%s found with no '
                                 u'status.', duplicate._meta.model.__name__,
                                 duplicate.id)

                if limit and processed_dupes >= limit:
                    break

        finally:
            my_lock.release()

    LOGGER.info(u'global_duplicates_checker(): %s/%s duplicates processed '
                u'(%.2f%%; limit: %s), %s corrected (%.2f%%), '
                u'%s purged (%.2f%%); %s reads altered.',

                processed_dupes, total_dupes_count,
                processed_dupes * 100.0 / total_dupes_count,

                limit or u'none',

                done_dupes_count,
                (done_dupes_count * 100.0 / processed_dupes)
                if processed_dupes else 0.0,

                purged_dupes_count,
                (purged_dupes_count * 100.0 / processed_dupes)
                if processed_dupes else 0.0,

                total_reads_count)
Example #18
def global_users_checker(limit=None, extended_check=False, force=False,
                         verbose=False, break_on_exception=False):
    """ Check all Users and their dependancies.

    Can be disabled by ``config.CHECK_USERS_DISABLED`` directive.

    :param limit: integer, the maximum number of users to check.
        Default: none.
    :param extended_check: boolean, default ``False``. Forwarded
        to :func:`check_one_user`.
    :param force: boolean, default ``False``, allows bypassing and
        reacquiring the lock.
    :param verbose: boolean, default ``False``. Forwarded
        to :func:`check_one_user`.
    :param break_on_exception: boolean, default ``False``, currently
        ignored in this function.
    """

    if config.CHECK_USERS_DISABLED:
        LOGGER.warning(u'Users check disabled in configuration.')
        return

    # This task runs twice a day. Acquire the lock for just a
    # little more time (13h, because Redis doesn't like floats)
    # to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_users', expire_time=3600 * 13)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing users check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_users_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_USERS_LIMIT

    active_users      = User.objects.filter(is_active=True)
    total_users_count = active_users.count()
    processed_users   = 0
    changed_users     = 0
    skipped_count     = 0

    with benchmark(u"Check {0}/{1} users".format(limit or u'all',
                   total_users_count)):
        try:
            for user in active_users.iterator():

                processed_users += 1

                if limit and changed_users >= limit:
                    break

                check_one_user(user, extended_check=extended_check,
                               force=force, verbose=verbose)

        finally:
            my_lock.release()

    LOGGER.info(u'global_users_checker(): %s/%s users processed '
                u'(%.2f%%), %s corrected (%.2f%%), %s skipped (%.2f%%).',
                processed_users, total_users_count,
                processed_users * 100.0 / total_users_count,
                changed_users,
                changed_users * 100.0 / processed_users,
                skipped_count,
                skipped_count * 100.0 / processed_users)
Example #19
def global_orphaned_checker(limit=None,
                            extended_check=False,
                            force=False,
                            verbose=False,
                            break_on_exception=False):
    """ Check all orphaned articles and delete them.

    They will be deleted only if they are duplicates of other orphaned ones,
    and only if the duplicate replacement process finished successfully.
    If it failed, the orphan is left in place, so that the operation can be
    re-run later.

    Can be disabled by ``config.CHECK_ORPHANED_DISABLED`` directive.

    :param limit: integer, the maximum number of orphans to check.
        Default: none.
    :param extended_check: boolean, default ``False``. Forwarded
        to :func:`check_one_user`.
    :param force: boolean, default ``False``, allows bypassing and
        reacquiring the lock.
    :param verbose: boolean, default ``False``. Forwarded
        to :func:`check_one_user`.
    :param break_on_exception: boolean, default ``False``, currently
        ignored in this function.
    """

    if config.CHECK_ORPHANED_DISABLED:
        LOGGER.warning(u'Orphaned check disabled in configuration.')
        return

    # This task runs twice a day. Acquire the lock for just a
    # little more time (13h, because Redis doesn't like floats)
    # to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_orphaned', expire_time=3600 * 13)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing orphaned check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_orphaned_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_ORPHANED_LIMIT

    orphaned_items = Article.objects.orphaned().master()
    orphaned_items_count = orphaned_items.count()
    processed_orphans = 0
    changed_orphans = 0
    deleted_orphans = 0
    skipped_orphans = 0

    with benchmark(u"Check {0}/{1} orphans".format(limit or u'all',
                                                   orphaned_items_count)):
        try:
            for orphan in orphaned_items.iterator():
                processed_orphans += 1

                if limit and changed_orphans >= limit:
                    break

                old_url = orphan.url

                new_url = ARTICLE_ORPHANED_BASE + generate_orphaned_hash(
                    orphan.name, orphan.feeds.all())

                if new_url != old_url:
                    orphan.url = new_url
                    orphan.url_absolute = True

                else:
                    if not orphan.url_absolute:
                        changed_orphans += 1
                        orphan.url_absolute = True
                        orphan.save()

                    continue

                try:
                    orphan.save()

                except IntegrityError:
                    master = Article.objects.get(url=orphan.url)

                    # We have to put back the original URL, else the
                    # duplicate registration process will fail.
                    orphan.url = old_url

                    # Register the duplicate right here and now (not in the
                    # background), so we can check its status afterwards and
                    # delete the orphan immediately.
                    master.register_duplicate(orphan,
                                              force=force,
                                              background=False)

                    # Reload the orphan to get the refreshed duplicate status.
                    orphan = Article.objects.get(id=orphan.id)

                    if orphan.duplicate_status == DUPLICATE_STATUS.FINISHED:
                        orphan.delete()
                        deleted_orphans += 1

                        if verbose:
                            LOGGER.info(u'Deleted duplicate orphan %s', orphan)

                except:
                    skipped_orphans += 1
                    LOGGER.exception(u'Unhandled exception while checking %s',
                                     orphan)

                else:
                    changed_orphans += 1

        finally:
            my_lock.release()

    LOGGER.info(
        u'global_orphaned_checker(): %s/%s orphans processed '
        u'(%.2f%%), %s corrected (%.2f%%), %s deleted (%.2f%%), '
        u'%s skipped (%.2f%%).', processed_orphans, orphaned_items_count,
        processed_orphans * 100.0 / orphaned_items_count, changed_orphans,
        changed_orphans * 100.0 / processed_orphans, deleted_orphans,
        deleted_orphans * 100.0 / processed_orphans, skipped_orphans,
        skipped_orphans * 100.0 / processed_orphans)
Example #20
# ————————————————————————————————————————————————————————————————— start ghost

if config.FEED_FETCH_GHOST_ENABLED:
    try:
        import ghost
    except:
        ghost = None  # NOQA
    else:
        GHOST_BROWSER = ghost.Ghost()

else:
    ghost = None  # NOQA

# Until we patch Ghost to use more than one Xvfb at the same time,
# we have to ensure there is only one running at a time.
global_ghost_lock = RedisExpiringLock('__ghost.py__')

# ——————————————————————————————————————————————————————————— QuerySet patching


def BaseItemQuerySet_empty_method(self):
    """ Patch BaseItemQuerySet to know how to return empty content. """

    return self.filter(content_type__in=[None, CONTENT_TYPES.NONE])


def BaseItemQuerySet_parsed_method(self):
    """ Patch BaseItemQuerySet to know how to return parsed content. """

    return self.filter(content_type__in=CONTENT_TYPES_FINAL)
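
global_ghost_lock exists to serialize every Ghost/Xvfb use, as the comment above it explains. The sketch below shows how a caller would presumably wrap a Ghost fetch with that lock, assuming the classic Ghost.py open() API and the acquire()/release() methods shown in the other examples; the helper name is hypothetical.

def ghost_fetch(url):
    """ Hypothetical helper: fetch `url` with Ghost, one run at a time. """

    if ghost is None or not global_ghost_lock.acquire():
        # Ghost is disabled, or another Xvfb/Ghost run is in progress.
        return None

    try:
        # Classic Ghost.py API: open() returns (page, extra_resources).
        page, resources = GHOST_BROWSER.open(url)
        return page

    finally:
        global_ghost_lock.release()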
Example #21
def User_share_lock_property_get(self):
    """ Return a redis expiring lock to avoid sharing to same user too much. """

    return RedisExpiringLock(self, 'share')
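
The _property_get suffix suggests this getter is grafted onto the User model as a read-only property, in the same monkey-patching spirit as the QuerySet methods of Example #20. The wiring below is an assumption based on that naming convention, not a verified excerpt.

# Assumed wiring: attach the getter as a read-only property on User.
User.share_lock = property(User_share_lock_property_get)

# A caller could then rate-limit shares per target user, for instance:
#   if target_user.share_lock.acquire():
#       ...  # proceed with the share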
Example #22
def global_reads_checker(limit=None,
                         extended_check=False,
                         force=False,
                         verbose=False,
                         break_on_exception=False):
    """ Check all reads and their dependants.

    Will activate reads that are currently bad, but whose article is OK
    to display.

    This task is one of the most expensive things in 1flow.
    It can run for hours because it scans all the bad reads and their
    articles, but it will not kill the database with massive updates:
    it does them one by one.

    Can be disabled by ``config.CHECK_READS_DISABLED`` directive.

    :param limit: integer, the maximum number of duplicates to check.
        Default: none.
    :param extended_check: boolean, default ``False``.
        Runs :meth:`Read.set_subscriptions` if ``True`` and checked read
        has no subscription.
    :param force: boolean, default ``False``, allows bypassing and
        reacquiring the lock.
    :param verbose: boolean, default ``False``, display (more)
        informative messages.
    :param break_on_exception: boolean, default ``False``, stop processing
        at the first encountered exception. Whatever it is, the exception
        will be logged to sentry.
    """

    if config.CHECK_READS_DISABLED:
        LOGGER.warning(u'Reads check disabled in configuration.')
        return

    # This task runs twice a day. Acquire the lock for just a
    # little more time (13h, because Redis doesn't like floats)
    # to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_reads', expire_time=3600 * 13)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing reads check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_reads_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_READS_LIMIT

    bad_reads = Read.objects.bad()

    total_reads_count = bad_reads.count()
    processed_reads = 0
    wiped_reads_count = 0
    changed_reads_count = 0
    skipped_count = 0

    with benchmark(u"Check {0}/{1} reads".format(limit or u'all',
                                                 total_reads_count)):
        try:
            for read in bad_reads.iterator():

                processed_reads += 1

                if limit and changed_reads_count >= limit:
                    break

                if read.is_good:
                    # This read has been activated via another
                    # checked one, attached to the same article.
                    changed_reads_count += 1
                    continue

                try:
                    article = read.item

                except:
                    LOGGER.critical(u'Could not get read.item for %s', read)
                    continue

                if extended_check:
                    try:
                        if read.subscriptions.all().exists():

                            # TODO: remove this
                            #       check_set_subscriptions_131004_done
                            #       transient check.
                            if read.check_set_subscriptions_131004_done:
                                read.check_subscriptions()

                            else:
                                read.check_set_subscriptions_131004()

                        else:
                            read.set_subscriptions()

                    except:
                        skipped_count += 1
                        LOGGER.exception(
                            u'Could not set subscriptions on '
                            u'read #%s, from article #%s, for '
                            u'user #%s. Skipping.', read.id, article.id,
                            read.user.id)
                        continue

                try:
                    if article.is_good:
                        changed_reads_count += 1

                        if verbose:
                            LOGGER.info(
                                u'Bad read %s has a good article, '
                                u'fixing…', read)

                        article.activate_reads(extended_check=extended_check)

                except:
                    LOGGER.exception(
                        u'Could not activate reads from '
                        u'article %s of read %s.', article, read)
                    if break_on_exception:
                        break

        finally:
            my_lock.release()

    LOGGER.info(
        u'global_reads_checker(): %s/%s reads processed '
        u'(%.2f%%), %s corrected (%.2f%%), %s deleted (%.2f%%), '
        u'%s skipped (%.2f%%).', processed_reads, total_reads_count,
        processed_reads * 100.0 / total_reads_count, changed_reads_count,
        changed_reads_count * 100.0 / processed_reads, wiped_reads_count,
        wiped_reads_count * 100.0 / processed_reads, skipped_count,
        skipped_count * 100.0 / processed_reads)
Example #23
            refresh_all_feeds.lock.acquire()

            LOGGER.warning(u'Throttled feed refresh because the number of '
                           u'queue items is getting too high (%s > %s).',
                           feed_qitems, feeds_count)

        elif feed_qitems < low_limit:
            # Unleash the kraken!

            try:
                refresh_all_feeds.lock.release()

            except:
                pass

            LOGGER.info(u'Unthrottled feed refreshes, the number of queue '
                        u'items is low enough (%s for %s feeds).',
                        feed_qitems, feeds_count)

        else:
            LOGGER.debug(u'Not throttled, %s < items(%s) <= feeds(%s).',
                         low_limit, feed_qitems, feeds_count)

    finally:
        my_lock.release()


# Allow the lock to be released manually for testing purposes.
throttle_feed_refresh.lock = RedisExpiringLock(THROTTLE_REFRESH_LOCK_NAME)
Example #24
def global_users_checker(limit=None,
                         extended_check=False,
                         force=False,
                         verbose=False,
                         break_on_exception=False):
    """ Check all Users and their dependancies.

    Can be disabled by ``config.CHECK_USERS_DISABLED`` directive.

    :param limit: integer, the maximum number of users to check.
        Default: none.
    :param extended_check: boolean, default ``False``. Forwarded
        to :func:`check_one_user`.
    :param force: boolean, default ``False``, allows bypassing and
        reacquiring the lock.
    :param verbose: boolean, default ``False``. Forwarded
        to :func:`check_one_user`.
    :param break_on_exception: boolean, default ``False``, currently
        ignored in this function.
    """

    if config.CHECK_USERS_DISABLED:
        LOGGER.warning(u'Users check disabled in configuration.')
        return

    # This task runs twice a day. Acquire the lock for just a
    # little more time (13h, because Redis doesn't like floats)
    # to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_users', expire_time=3600 * 13)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing users check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_users_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_USERS_LIMIT

    active_users = User.objects.filter(is_active=True)
    total_users_count = active_users.count()
    processed_users = 0
    changed_users = 0
    skipped_count = 0

    with benchmark(u"Check {0}/{1} users".format(limit or u'all',
                                                 total_users_count)):
        try:
            for user in active_users.iterator():

                processed_users += 1

                if limit and changed_users >= limit:
                    break

                check_one_user(user,
                               extended_check=extended_check,
                               force=force,
                               verbose=verbose)

        finally:
            my_lock.release()

    LOGGER.info(
        u'global_users_checker(): %s/%s users processed '
        u'(%.2f%%), %s corrected (%.2f%%), %s skipped (%.2f%%).',
        processed_users, total_users_count,
        processed_users * 100.0 / total_users_count, changed_users,
        changed_users * 100.0 / processed_users, skipped_count,
        skipped_count * 100.0 / processed_users)
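
The acquire / force-release-reacquire / release choreography above is repeated by nearly every global task in this module. As a purely illustrative sketch (``global_task_lock`` is a hypothetical helper, not part of 1flow, and it assumes the module-level ``RedisExpiringLock`` and ``LOGGER`` already available), the pattern could be factored into a context manager:

from contextlib import contextmanager


@contextmanager
def global_task_lock(name, expire_time, force=False):
    """ Hypothetical helper wrapping the recurring lock pattern above. """

    lock = RedisExpiringLock(name, expire_time=expire_time)

    if not lock.acquire():
        if force:
            lock.release()
            lock.acquire()
            LOGGER.warning(u'Forcing %s…', name)

        else:
            LOGGER.warning(u'%s is already locked, aborting.', name)
            # Yield None so the caller can test and abort early.
            yield None
            return

    try:
        yield lock

    finally:
        lock.release()

A task body would then start with ``with global_task_lock('check_all_users', 3600 * 13, force=force) as lock:`` and return immediately when ``lock`` is ``None``.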
Ejemplo n.º 25
0
        try:
            synchronize_statsd_articles_gauges(full=full)
            synchronize_statsd_tags_gauges(full=full)
            synchronize_statsd_websites_gauges(full=full)
            synchronize_statsd_authors_gauges(full=full)
            synchronize_statsd_feeds_gauges(full=full)
            synchronize_statsd_subscriptions_gauges(full=full)
            synchronize_statsd_reads_gauges(full=full)

        finally:
            my_lock.release()


# Allow releasing the lock manually for testing purposes.
synchronize_statsd_gauges.lock = RedisExpiringLock(
    SYNCHRONIZE_STATSD_LOCK_NAME)


@beat_init.connect()
def clear_all_locks(conf=None, **kwargs):
    """ Clear all expiring locks when celery beat starts. """

    for key, value in globals().items():
        if hasattr(value, 'lock'):
            getattr(value, 'lock').release()

            LOGGER.info(u'Released %s() lock.', key)

    locked_count = 0

    for feed in BaseFeed.objects.filter(is_active=True, is_internal=False):
Ejemplo n.º 26
0
def global_subscriptions_checker(force=False, limit=None, from_feeds=True,
                                 from_users=False, extended_check=False):
    """ A conditionned version of :meth:`Feed.check_subscriptions`. """

    if config.CHECK_SUBSCRIPTIONS_DISABLED:
        LOGGER.warning(u'Subscriptions checks disabled in configuration.')
        return

    # This task runs once a day. Acquire the lock for just a
    # little more time to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_subscriptions',
                                expire_time=3600 * 25)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing subscriptions checks…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_subscriptions_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_SUBSCRIPTIONS_LIMIT

    assert int(limit) >= 0
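
    # Note: a limit of 0 (or None) effectively means "no limit": the truth
    # test on `limit` in the loop below then never triggers the early break.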

    try:
        if from_feeds:
            with benchmark("Check all subscriptions from feeds"):

                # We check ALL feeds (including inactive ones) to be
                # sure all subscriptions / reads are up-to-date.
                feeds           = BaseFeed.objects.all()
                feeds_count     = feeds.count()
                processed_count = 0
                checked_count   = 0

                for feed in feeds.iterator():

                    if limit and checked_count > limit:
                        break

                    if extended_check:
                        feed.compute_cached_descriptors()
                        # all=True, good=True, bad=True

                    # Do not pass extended_check=True here: it would duplicate
                    # the subscription.check_reads() already called below.
                    feed.check_subscriptions()

                    for subscription in feed.subscriptions.all().iterator():

                        processed_count += 1
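
                        # A subscription is expected to hold exactly one read
                        # per good article of its feed; any count mismatch
                        # means reads are missing or stale, so re-check them.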

                        if subscription.all_items_count \
                                != feed.good_items_count:

                            checked_count += 1

                            LOGGER.info(u'Subscription %s (#%s) has %s reads '
                                        u'whereas its feed has %s good '
                                        u'articles; checking…',
                                        subscription.name, subscription.id,
                                        subscription.all_items_count,
                                        feed.good_items_count)

                            subscription.check_reads(
                                extended_check=extended_check, force=True)

                LOGGER.info(u'%s/%s (limit:%s) feeds processed, %s '
                            u'checked (%.2f%%).',
                            processed_count, feeds_count, limit,
                            checked_count, checked_count
                            * 100.0 / processed_count)

        if from_users:
            with benchmark("Check all subscriptions from users"):

                users           = User.objects.filter(is_active=True)
                users_count     = users.count()
                processed_count = 0

                for user in users:

                    # Do not pass extended_check=True here: it would duplicate
                    # the subscription.check_reads() already called below.
                    user.check_subscriptions()

                    if extended_check:
                        user.user_counters.compute_cached_descriptors()
                        # all=True, unread=True, starred=True, bookmarked=True

                        for subscription in user.subscriptions.all().iterator():
                            processed_count += 1

                            subscription.check_reads(extended_check=True,
                                                     force=True)

                LOGGER.info(u'%s users %sprocessed. '
                            u'All were checked.', users_count,
                            u'and {0} subscriptions '.format(processed_count)
                            if extended_check else u'')

    finally:
        my_lock.release()
Ejemplo n.º 27
0
            # HEADS UP: if the system is overloaded and the feed refresh()
            #           tasks don't complete fast enough, the current task
            #           would overload it even more. Thus, we intentionally
            #           don't release the lock, to avoid re-launched global
            #           tasks flooding the refresh queue with useless
            #           duplicate (or triplicate…) individual tasks.
            #
            # my_lock.release()
            pass

        LOGGER.info(u'Launched %s refreshes out of %s feed(s) checked.', count,
                    feeds.count())


# Allow releasing the lock manually for testing purposes.
refresh_all_feeds.lock = RedisExpiringLock(REFRESH_ALL_FEEDS_LOCK_NAME)


@task(name='oneflow.core.tasks.refresh_all_mailaccounts', queue='refresh')
def refresh_all_mailaccounts(force=False):
    """ Check all unusable e-mail accounts. """

    if config.MAIL_ACCOUNT_REFRESH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'E-mail accounts check disabled in configuration.')
        return

    accounts = MailAccount.objects.unusable()
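
    # Budget roughly 30 seconds per unusable account, plus a small safety
    # margin, for the lock lifetime, so that overlapping runs stay excluded
    # for the whole duration of the check.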

    my_lock = RedisExpiringLock(REFRESH_ALL_MAILACCOUNTS_LOCK_NAME,
                                expire_time=30 * (accounts.count() + 2))
Ejemplo n.º 28
0
def global_reads_checker(limit=None, extended_check=False, force=False,
                         verbose=False, break_on_exception=False):
    """ Check all reads and their dependants.

    Will activate reads that are currently bad, but whose article is OK
    to display.

    This task is one of the most expensive things in 1flow.
    It can run for hours because it scans all the bad reads and their
    articles, but it will not kill the database with massive updates:
    it performs them one by one.

    Can be disabled by ``config.CHECK_READS_DISABLED`` directive.

    :param limit: integer, the maximum number of reads to correct.
        Default: none.
    :param extended_check: boolean, default ``False``.
        Runs :meth:`Read.set_subscriptions` if ``True`` and checked read
        has no subscription.
    :param force: boolean, default ``False``, allows bypassing and
        reacquiring the lock.
    :param verbose: boolean, default ``False``, display (more)
        informative messages.
    :param break_on_exception: boolean, default ``False``, stop processing
        at the first encountered exception. Whatever it is, the exception
        will be logged to sentry.
    """

    if config.CHECK_READS_DISABLED:
        LOGGER.warning(u'Reads check disabled in configuration.')
        return

    # This task runs twice a day. Acquire the lock for just a
    # little more time (13h, because Redis doesn't like floats)
    # to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_reads', expire_time=3600 * 13)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing reads check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_reads_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_READS_LIMIT

    bad_reads = Read.objects.bad()

    total_reads_count   = bad_reads.count()
    processed_reads     = 0
    wiped_reads_count   = 0
    changed_reads_count = 0
    skipped_count       = 0

    with benchmark(u"Check {0}/{1} reads".format(limit or u'all',
                   total_reads_count)):
        try:
            for read in bad_reads.iterator():

                processed_reads += 1

                if limit and changed_reads_count >= limit:
                    break

                if read.is_good:
                    # This read has been activated via another
                    # checked one, attached to the same article.
                    changed_reads_count += 1
                    continue

                try:
                    article = read.item

                except:
                    LOGGER.critical(u'Could not get read.item for %s', read)
                    continue

                if extended_check:
                    try:
                        if read.subscriptions.all().exists():

                            # TODO: remove this
                            #       check_set_subscriptions_131004_done
                            #       transient check.
                            if read.check_set_subscriptions_131004_done:
                                read.check_subscriptions()

                            else:
                                read.check_set_subscriptions_131004()

                        else:
                            read.set_subscriptions()

                    except:
                        skipped_count += 1
                        LOGGER.exception(u'Could not set subscriptions on '
                                         u'read #%s, from article #%s, for '
                                         u'user #%s. Skipping.', read.id,
                                         article.id, read.user.id)
                        continue

                try:
                    if article.is_good:
                        changed_reads_count += 1

                        if verbose:
                            LOGGER.info(u'Bad read %s has a good article, '
                                        u'fixing…', read)

                        article.activate_reads(extended_check=extended_check)

                except:
                    LOGGER.exception(u'Could not activate reads from '
                                     u'article %s of read %s.',
                                     article, read)
                    if break_on_exception:
                        break

        finally:
            my_lock.release()

    LOGGER.info(u'global_reads_checker(): %s/%s reads processed '
                u'(%.2f%%), %s corrected (%.2f%%), %s deleted (%.2f%%), '
                u'%s skipped (%.2f%%).',
                processed_reads, total_reads_count,
                processed_reads * 100.0 / total_reads_count,
                changed_reads_count,
                changed_reads_count * 100.0 / processed_reads,
                wiped_reads_count,
                wiped_reads_count * 100.0 / processed_reads,
                skipped_count,
                skipped_count * 100.0 / processed_reads)
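
For reference, a hypothetical manual invocation of the checker above (for example from a Django shell); the import path is assumed, only the keyword arguments match the signature above:

# Hypothetical invocation; import path assumed, keyword values illustrative.
from oneflow.core.tasks import global_reads_checker

global_reads_checker(limit=500,            # correct at most 500 bad reads
                     extended_check=True,  # also (re)set read subscriptions
                     verbose=True)         # log each read that gets fixed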
Ejemplo n.º 29
0
def refresh_all_feeds(limit=None, force=False):
    u""" Refresh all feeds (RSS/Mail/Twitter…). """

    if config.FEED_FETCH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Feed refresh disabled in configuration.')
        return

    # As FEED_GLOBAL_REFRESH_INTERVAL is dynamically modifiable,
    # we should re-evaluate it each time we run.
    this_round_expire_time = (config.FEED_GLOBAL_REFRESH_INTERVAL * 60 -
                              config.FEED_GLOBAL_REFRESH_INTERVAL)

    # Be sure two refresh operations don't overlap, but don't hold the
    # lock too long if something goes wrong. In production conditions
    # as of 20130812, refreshing all feeds takes only a moment:
    # [2013-08-12 09:07:02,028: INFO/MainProcess] Task
    #       oneflow.core.tasks.refresh_all_feeds succeeded in 1.99886608124s.
    #
    my_lock = RedisExpiringLock(REFRESH_ALL_FEEDS_LOCK_NAME,
                                expire_time=this_round_expire_time)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing all feed refresh…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'refresh_all_feeds() is already locked, aborting.')
            return

    # This should bring us a Polymorphic Query to refresh all feed types.
    feeds = BaseFeed.objects.filter(
        is_active=True, is_internal=False).order_by('date_last_fetch')

    if limit:
        feeds = feeds[:limit]

    with benchmark('refresh_all_feeds()'):

        try:
            count = 0
            mynow = now()

            for feed in feeds:

                if feed.refresh_lock.is_locked():
                    # The refresh task launched before its lock expired and is
                    # still (possibly long-) running while we want to launch
                    # another one. Skip it: the new task would exit immediately
                    # anyway, because date_last_fetch is too recent.
                    LOGGER.debug(u'Feed %s already locked, skipped.', feed)
                    continue

                if feed.date_last_fetch is None:

                    basefeed_refresh_task.apply_async(
                        args=(feed.id, ),

                        # in `this_round_expire_time`, we will relaunch it
                        # anyway, so don't clutter the queue with double work.
                        expires=this_round_expire_time,
                    )

                    LOGGER.info(
                        u'Launched immediate refresh of feed %s which '
                        u'has never been refreshed.', feed)
                    count += 1
                    continue

                if feed.fetch_interval > 86399:
                    interval_days = feed.fetch_interval / 86400
                    interval_seconds = feed.fetch_interval - (interval_days *
                                                              86400)

                    interval = timedelta(days=interval_days,
                                         seconds=interval_seconds)

                else:
                    interval = timedelta(seconds=feed.fetch_interval)

                if force or feed.date_last_fetch + interval < mynow:

                    how_late = feed.date_last_fetch + interval - mynow
                    how_late = how_late.days * 86400 + how_late.seconds

                    late = feed.date_last_fetch + interval < mynow

                    basefeed_refresh_task.apply_async(
                        args=(feed.id, ),
                        kwargs={'force': force},
                        expires=this_round_expire_time,
                    )

                    LOGGER.info(u'Launched refresh of feed %s (%s %s).', feed,
                                naturaldelta(how_late),
                                u'late' if late else u'earlier')
                    count += 1

        finally:
            # HEADS UP: if the system is overloaded and the feed refresh()
            #           tasks don't complete fast enough, the current task
            #           would overload it even more. Thus, we intentionally
            #           don't release the lock, to avoid re-launched global
            #           tasks flooding the refresh queue with useless
            #           duplicate (or triplicate…) individual tasks.
            #
            # my_lock.release()
            pass

        LOGGER.info(u'Launched %s refreshes out of %s feed(s) checked.', count,
                    feeds.count())
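
A quick worked example of the interval split above, under the Python 2 integer-division semantics this code base uses (pure arithmetic, no 1flow dependencies). Note that ``timedelta`` also accepts large second counts directly, so the split mostly serves readability:

from datetime import timedelta

fetch_interval   = 90000                     # seconds: one day + one hour
interval_days    = fetch_interval / 86400    # 1 under Python 2 int division
interval_seconds = fetch_interval - interval_days * 86400   # 3600

assert timedelta(days=interval_days, seconds=interval_seconds) \
    == timedelta(seconds=fetch_interval)     # both equal "1 day, 1:00:00"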
Ejemplo n.º 30
0
def global_orphaned_checker(limit=None, extended_check=False, force=False,
                            verbose=False, break_on_exception=False):
    """ Check all orphaned articles and delete them.

    They will be deleted only if they are duplicate of other orphaned ones,
    and only if the duplication replacement process finished successfully.
    If it failed, the orphan is left in place, to be able to re-run the
    operation later.

    Can be disabled by ``config.CHECK_ORPHANED_DISABLED`` directive.

    :param limit: integer, the maximum number of orphans to correct.
        Default: none.
    :param extended_check: boolean, default ``False``, currently
        unused in this function.
    :param force: boolean, default ``False``, allows bypassing and
        reacquiring the lock.
    :param verbose: boolean, default ``False``, display (more)
        informative messages.
    :param break_on_exception: boolean, default ``False``, currently
        ignored in this function.
    """

    if config.CHECK_ORPHANED_DISABLED:
        LOGGER.warning(u'Orphaned check disabled in configuration.')
        return

    # This task runs twice a day. Acquire the lock for just a
    # little more time (13h, because Redis doesn't like floats)
    # to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_orphaned', expire_time=3600 * 13)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing orphaned check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_orphaned_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_ORPHANED_LIMIT

    orphaned_items       = Article.objects.orphaned().master()
    orphaned_items_count = orphaned_items.count()
    processed_orphans    = 0
    changed_orphans      = 0
    deleted_orphans      = 0
    skipped_orphans      = 0

    with benchmark(u"Check {0}/{1} orphans".format(limit or u'all',
                   orphaned_items_count)):
        try:
            for orphan in orphaned_items.iterator():
                processed_orphans += 1

                if limit and changed_orphans >= limit:
                    break

                old_url = orphan.url

                new_url = ARTICLE_ORPHANED_BASE + generate_orphaned_hash(
                    orphan.name, orphan.feeds.all())

                if new_url != old_url:
                    orphan.url = new_url
                    orphan.url_absolute = True

                else:
                    if not orphan.url_absolute:
                        changed_orphans += 1
                        orphan.url_absolute = True
                        orphan.save()

                    continue

                try:
                    orphan.save()

                except IntegrityError:
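                    # The computed canonical URL is already taken by another
                    # article: that one becomes the master, and the current
                    # orphan gets registered as its duplicate just below.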
                    master = Article.objects.get(url=orphan.url)

                    # We have to put back the original URL, else the
                    # duplicate registration process will fail.
                    orphan.url = old_url

                    # Register the duplicate right here and now (not in the
                    # background), to be able to check its status just below.
                    master.register_duplicate(orphan, force=force,
                                              background=False)

                    # Reload the orphan to get the refreshed duplicate status.
                    orphan = Article.objects.get(id=orphan.id)

                    if orphan.duplicate_status == DUPLICATE_STATUS.FINISHED:
                        orphan.delete()
                        deleted_orphans += 1

                        if verbose:
                            LOGGER.info(u'Deleted duplicate orphan %s', orphan)

                except:
                    skipped_orphans += 1
                    LOGGER.exception(u'Unhandled exception while checking %s',
                                     orphan)

                else:
                    changed_orphans += 1

        finally:
            my_lock.release()

    LOGGER.info(u'global_orphaned_checker(): %s/%s orphans processed '
                u'(%.2f%%), %s corrected (%.2f%%), %s deleted (%.2f%%), '
                u'%s skipped (%.2f%%).',
                processed_orphans, orphaned_items_count,
                processed_orphans * 100.0 / orphaned_items_count,
                changed_orphans,
                changed_orphans * 100.0 / processed_orphans,
                deleted_orphans,
                deleted_orphans * 100.0 / processed_orphans,
                skipped_orphans,
                skipped_orphans * 100.0 / processed_orphans)
Ejemplo n.º 31
0
def reprocess_failed_articles(failed=None, expiry=None,
                              limit=None, force=False,
                              reprocessing_type=None):
    u""" Reprocess articles that failed absolutization.

    In case there was a temporary error, this could lead to more good articles.
    """

    if config.ARTICLE_REPROCESSING_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Articles reprocess disabled in configuration.')
        return

    if failed is None:
        raise RuntimeError(u'Need a queryset of failed items to reprocess.')

    # TODO: as the celery tasks expire,
    # the lock is probably not needed anymore.

    my_lock = RedisExpiringLock(
        'reprocess_failed_articles_' + str(expiry),
        expire_time=expiry
    )

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing failed articles reprocessing…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'reprocess_failed_articles() is already locked, '
                           u'aborting.')
            return

    failed_count = failed.count()

    with benchmark((u'reprocess_failed_articles(expiry=%s): %s '
                   u'processing chains relaunched.')
                   % (naturaldelta(expiry), failed_count)):

        try:
            for article in failed.iterator():
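
                # Two strategies, selected by `reprocessing_type`: the default
                # (None) clears the URL error and replays the whole post-create
                # processing chain synchronously; 'standard' simply re-runs the
                # article's own process() method.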

                if reprocessing_type is None:
                    article.url_error = None
                    article.save()

                    article_post_create_task.apply(args=(article.id, ),
                                                   kwargs={'apply_now': True})

                elif reprocessing_type == 'standard':
                    article.process()

        finally:
            # HEADS UP: in case the system is overloaded, we intentionally
            #           don't release the lock, to avoid re-launched global
            #           tasks flooding the queue with useless duplicate
            #           (or triplicate…) individual tasks.
            #
            # my_lock.release()
            pass