def archive_documents(limit=None, force=False):
    """ Archive all kinds of documents that need archiving. """

    if config.DOCUMENTS_ARCHIVING_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Document archiving disabled in configuration.')
        return

    # Be sure two archiving operations don't overlap, this is a very costly
    # operation for the database, and it can make the system very sluggish.
    # The whole operation can be very long, we lock for a long time.
    my_lock = RedisExpiringLock('archive_documents', expire_time=3600 * 24)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'archive_documents() force unlock/re-acquire, '
                           u'be careful with that.')

        else:
            LOGGER.warning(u'archive_documents() is already locked, aborting.')
            return

    # These are tasks, but we run them sequentially in this global archive job
    # to avoid hammering the production database with multiple archive jobs.
    archive_articles(limit=limit)

    my_lock.release()
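The same lock-handling pattern recurs throughout the snippets below. Here is a minimal, hedged sketch of it, assuming only the acquire()/release() calls of RedisExpiringLock used above; the task name and body are illustrative placeholders, not taken from the project.

# Minimal sketch of the recurring "scheduled task under an expiring lock"
# pattern, under the assumptions stated above.
def run_exclusive_task(force=False):
    """ Hypothetical task protected by a RedisExpiringLock. """

    # The lock auto-expires, so a crashed run cannot block forever.
    my_lock = RedisExpiringLock('run_exclusive_task', expire_time=3600)

    if not my_lock.acquire():
        if force:
            # Steal the (possibly stale) lock, then proceed.
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'run_exclusive_task() force unlock/re-acquire.')

        else:
            LOGGER.warning(u'run_exclusive_task() is already locked, aborting.')
            return

    try:
        do_the_actual_work()  # illustrative placeholder

    finally:
        my_lock.release()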
def synchronize_statsd_gauges(full=False, force=False):
    """ Synchronize all counters to statsd. """

    # from oneflow.core.stats import (
    #     synchronize_mongodb_statsd_articles_gauges,
    #     synchronize_mongodb_statsd_tags_gauges,
    #     synchronize_mongodb_statsd_websites_gauges,
    #     synchronize_mongodb_statsd_authors_gauges,
    # )

    from oneflow.core.dbstats import (
        synchronize_statsd_articles_gauges,
        synchronize_statsd_tags_gauges,
        synchronize_statsd_websites_gauges,
        synchronize_statsd_authors_gauges,
        synchronize_statsd_feeds_gauges,
        synchronize_statsd_subscriptions_gauges,
        synchronize_statsd_reads_gauges,
    )

    my_lock = RedisExpiringLock(SYNCHRONIZE_STATSD_LOCK_NAME, expire_time=3600)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing statsd gauges synchronization…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'synchronize_statsd_gauges() is already locked, '
                           u'aborting.')
            return

    # with benchmark('synchronize_mongodb_statsd_gauges()'):
    #     try:
    #         synchronize_mongodb_statsd_articles_gauges(full=full)
    #         synchronize_mongodb_statsd_tags_gauges(full=full)
    #         synchronize_mongodb_statsd_websites_gauges(full=full)
    #         synchronize_mongodb_statsd_authors_gauges(full=full)
    #     except:
    #         LOGGER.exception(u'MongoDB stats failed at some point')

    with benchmark('synchronize_statsd_gauges()'):
        try:
            synchronize_statsd_articles_gauges(full=full)
            synchronize_statsd_tags_gauges(full=full)
            synchronize_statsd_websites_gauges(full=full)
            synchronize_statsd_authors_gauges(full=full)
            synchronize_statsd_feeds_gauges(full=full)
            synchronize_statsd_subscriptions_gauges(full=full)
            synchronize_statsd_reads_gauges(full=full)

        finally:
            my_lock.release()
def update_recent_items_count(self, force=False):
    """ This task is protected to run only once per day,
        even if it is called more often. """

    urac_lock = RedisExpiringLock(self, lock_name='urac', expire_time=86100)

    if urac_lock.acquire() or force:
        self.recent_items_count = self.recent_items.count()

    elif not force:
        LOGGER.warning(u'No more than one update_recent_items_count '
                       u'per day (feed %s).', self)
def refresh_lock(self):
    try:
        return self.__refresh_lock

    except AttributeError:
        self.__refresh_lock = RedisExpiringLock(
            self, lock_name='fetch',
            expire_time=self.REFRESH_LOCK_INTERVAL or self.fetch_interval)

        return self.__refresh_lock
def refresh_lock(self):
    try:
        return self.__refresh_lock

    except AttributeError:
        self.__refresh_lock = RedisExpiringLock(
            self, lock_name='account_fetch',
        )

        return self.__refresh_lock
def sync_lock(self):
    try:
        return self._sync_lock_

    except AttributeError:
        self._sync_lock_ = RedisExpiringLock(self, lock_name='sync',
                                             expire_time=86100)

        return self._sync_lock_
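The three properties above lazily create and cache a RedisExpiringLock on the instance. Below is a hedged sketch of how such a cached lock property is typically consumed; the synchronize() method and do_synchronization() call are illustrative assumptions, only the sync_lock property itself comes from the snippet above.

# Illustrative consumer of the cached sync_lock property (assumes the function
# above is exposed as a `sync_lock` property on the model).
def synchronize(self, force=False):

    if not self.sync_lock.acquire() and not force:
        LOGGER.warning(u'Synchronization already locked for %s, aborting.',
                       self)
        return

    try:
        self.do_synchronization()  # illustrative placeholder

    finally:
        self.sync_lock.release()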
def refresh_all_mailaccounts(force=False):
    """ Check all unusable e-mail accounts. """

    if config.MAIL_ACCOUNT_REFRESH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'E-mail accounts check disabled in configuration.')
        return

    accounts = MailAccount.objects.unusable()

    my_lock = RedisExpiringLock(REFRESH_ALL_MAILACCOUNTS_LOCK_NAME,
                                expire_time=30 * (accounts.count() + 2))

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing check of email accounts…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'refresh_all_mailaccounts() is already locked, '
                           u'aborting.')
            return

    with benchmark('refresh_all_mailaccounts()'):
        try:
            for account in accounts:
                try:
                    account.test_connection()
                    account.update_mailboxes()

                except:
                    pass

        finally:
            my_lock.release()

    LOGGER.info(u'Launched %s checks on unusable accounts out of %s total.',
                accounts.count(), MailAccount.objects.all().count())
def global_duplicates_checker(limit=None, force=False):
    """ Check that duplicate articles have no more Reads anywhere.

    Fix it if not, and update all counters accordingly.

    :param limit: integer, the maximum number of duplicates to check.
        Default: none.
    :param force: boolean, default ``False``, allows to bypass and
        reacquire the lock.
    """

    if config.CHECK_DUPLICATES_DISABLED:
        LOGGER.warning(u'Duplicates check disabled in configuration.')
        return

    # This task runs once a day. Acquire the lock for just a
    # little more time to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_duplicates', expire_time=3600 * 25)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing duplicates check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_duplicates_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_DUPLICATES_LIMIT

    start_time = pytime.time()
    duplicates = BaseItem.objects.duplicate()

    total_dupes_count = duplicates.count()
    total_reads_count = 0
    processed_dupes = 0
    done_dupes_count = 0
    purged_dupes_count = 0

    purge_after_weeks_count = max(1, config.CHECK_DUPLICATES_PURGE_AFTER_WEEKS)
    purge_after_weeks_count = min(52, purge_after_weeks_count)

    purge_before_date = now() - timedelta(days=purge_after_weeks_count * 7)

    LOGGER.info(u'Done counting (took %s of pure SQL joy), starting procedure.',
                naturaldelta(pytime.time() - start_time))

    with benchmark(u"Check {0}/{1} duplicates".format(limit or u'all',
                   total_dupes_count)):
        try:
            for duplicate in duplicates.iterator():
                reads = duplicate.reads.all()

                processed_dupes += 1

                if reads.exists():
                    done_dupes_count += 1
                    reads_count = reads.count()
                    total_reads_count += reads_count

                    LOGGER.info(u'Duplicate %s #%s still has %s reads, fixing…',
                                duplicate._meta.model.__name__,
                                duplicate.id, reads_count)

                    duplicate.duplicate_of.register_duplicate(
                        duplicate,
                        force=(duplicate.duplicate_status ==
                               DUPLICATE_STATUS.FINISHED))

                if duplicate.duplicate_status == DUPLICATE_STATUS.FINISHED:
                    #
                    # TODO: check we didn't get some new race-condition
                    #       dependencies between the moment the duplicate
                    #       was marked duplicate and now.

                    if duplicate.date_created < purge_before_date:
                        try:
                            with transaction.atomic():
                                duplicate.delete()

                        except:
                            LOGGER.exception(u'Exception while deleting '
                                             u'duplicate %s #%s',
                                             duplicate._meta.model.__name__,
                                             duplicate.id)

                        purged_dupes_count += 1
                        LOGGER.info(u'Purged duplicate %s #%s from database.',
                                    duplicate._meta.model.__name__,
                                    duplicate.id)

                elif duplicate.duplicate_status in (
                        DUPLICATE_STATUS.NOT_REPLACED,
                        DUPLICATE_STATUS.FAILED):
                    # Something went wrong, perhaps the
                    # task was purged before being run.
                    duplicate.duplicate_of.register_duplicate(duplicate)
                    done_dupes_count += 1

                elif duplicate.duplicate_status is None:
                    # Something went very wrong. If the article is a known
                    # duplicate, its status field should have been set to
                    # at least NOT_REPLACED.
                    duplicate.duplicate_of.register_duplicate(duplicate)
                    done_dupes_count += 1

                    LOGGER.error(u'Corrected duplicate %s #%s found with no '
                                 u'status.',
                                 duplicate._meta.model.__name__,
                                 duplicate.id)

                if limit and processed_dupes >= limit:
                    break

        finally:
            my_lock.release()

    LOGGER.info(u'global_duplicates_checker(): %s/%s duplicates processed '
                u'(%.2f%%; limit: %s), %s corrected (%.2f%%), '
                u'%s purged (%.2f%%); %s reads altered.',
                processed_dupes, total_dupes_count,
                processed_dupes * 100.0 / total_dupes_count,
                limit or u'none',
                done_dupes_count,
                (done_dupes_count * 100.0 / processed_dupes)
                if processed_dupes else 0.0,
                purged_dupes_count,
                (purged_dupes_count * 100.0 / processed_dupes)
                if processed_dupes else 0.0,
                total_reads_count)
def global_subscriptions_checker(force=False, limit=None, from_feeds=True,
                                 from_users=False, extended_check=False):
    """ A conditioned version of :meth:`Feed.check_subscriptions`. """

    if config.CHECK_SUBSCRIPTIONS_DISABLED:
        LOGGER.warning(u'Subscriptions checks disabled in configuration.')
        return

    # This task runs once a day. Acquire the lock for just a
    # little more time to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_subscriptions',
                                expire_time=3600 * 25)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing subscriptions checks…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_subscriptions_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_SUBSCRIPTIONS_LIMIT

    assert int(limit) >= 0

    try:
        if from_feeds:
            with benchmark("Check all subscriptions from feeds"):

                # We check ALL feeds (including inactive ones) to be
                # sure all subscriptions / reads are up-to-date.
                feeds = BaseFeed.objects.all()
                feeds_count = feeds.count()
                processed_count = 0
                checked_count = 0

                for feed in feeds.iterator():

                    if limit and checked_count > limit:
                        break

                    if extended_check:
                        feed.compute_cached_descriptors()
                        # all=True, good=True, bad=True

                    # Do not extended_check=True, this would double-do
                    # the subscription.check_reads() already called below.
                    feed.check_subscriptions()

                    for subscription in feed.subscriptions.all().iterator():

                        processed_count += 1

                        if subscription.all_items_count \
                                != feed.good_items_count:

                            checked_count += 1

                            LOGGER.info(u'Subscription %s (#%s) has %s reads '
                                        u'whereas its feed has %s good '
                                        u'articles; checking…',
                                        subscription.name, subscription.id,
                                        subscription.all_items_count,
                                        feed.good_items_count)

                            subscription.check_reads(
                                extended_check=extended_check, force=True)

                LOGGER.info(u'%s/%s (limit:%s) feeds processed, %s '
                            u'checked (%.2f%%).',
                            processed_count, feeds_count, limit,
                            checked_count,
                            checked_count * 100.0 / processed_count)

        if from_users:
            with benchmark("Check all subscriptions from users"):

                users = User.objects.filter(is_active=True)
                users_count = users.count()
                processed_count = 0

                for user in users:

                    # Do not extended_check=True, this would double-do
                    # the subscription.check_reads() already called below.
                    user.check_subscriptions()

                    if extended_check:
                        user.user_counters.compute_cached_descriptors()
                        # all=True, unread=True, starred=True, bookmarked=True

                        for subscription in user.subscriptions.all().iterator():

                            processed_count += 1
                            subscription.check_reads(extended_check=True,
                                                     force=True)

                LOGGER.info(u'%s users %sprocessed. '
                            u'All were checked.', users_count,
                            u'and {0} subscriptions '.format(processed_count)
                            if extended_check else u'')

    finally:
        my_lock.release()
def refresh_all_feeds(limit=None, force=False):
    u""" Refresh all feeds (RSS/Mail/Twitter…). """

    if config.FEED_FETCH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Feed refresh disabled in configuration.')
        return

    # As FEED_GLOBAL_REFRESH_INTERVAL is dynamically modifiable,
    # we should re-evaluate it each time we run.
    this_round_expire_time = (
        config.FEED_GLOBAL_REFRESH_INTERVAL * 60
        - config.FEED_GLOBAL_REFRESH_INTERVAL
    )

    # Be sure two refresh operations don't overlap, but don't hold the
    # lock too long if something goes wrong. In production conditions
    # as of 20130812, refreshing all feeds takes only a moment:
    # [2013-08-12 09:07:02,028: INFO/MainProcess] Task
    #   oneflow.core.tasks.refresh_all_feeds succeeded in 1.99886608124s.
    #
    my_lock = RedisExpiringLock(
        REFRESH_ALL_FEEDS_LOCK_NAME,
        expire_time=this_round_expire_time
    )

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing all feed refresh…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'refresh_all_feeds() is already locked, aborting.')
            return

    # This should bring us a Polymorphic Query to refresh all feed types.
    feeds = BaseFeed.objects.filter(is_active=True,
                                    is_internal=False).order_by(
                                        'date_last_fetch')

    if limit:
        feeds = feeds[:limit]

    with benchmark('refresh_all_feeds()'):
        try:
            count = 0
            mynow = now()

            for feed in feeds:

                if feed.refresh_lock.is_locked():
                    # The refresh task launched before its expiration, and is
                    # still [long] running while we want to launch another.
                    # Avoid, because the new one would exit immediately on
                    # date_last_fetch too recent.
                    LOGGER.debug(u'Feed %s already locked, skipped.', feed)
                    continue

                if feed.date_last_fetch is None:

                    basefeed_refresh_task.apply_async(
                        args=(feed.id, ),

                        # in `this_round_expire_time`, we will relaunch it
                        # anyway, so don't clutter the queue with double work.
                        expire=this_round_expire_time,
                    )

                    LOGGER.info(u'Launched immediate refresh of feed %s which '
                                u'has never been refreshed.', feed)
                    count += 1
                    continue

                if feed.fetch_interval > 86399:
                    interval_days = feed.fetch_interval / 86400
                    interval_seconds = feed.fetch_interval - (
                        interval_days * 86400)

                    interval = timedelta(days=interval_days,
                                         seconds=interval_seconds)

                else:
                    interval = timedelta(seconds=feed.fetch_interval)

                if force or feed.date_last_fetch + interval < mynow:

                    how_late = feed.date_last_fetch + interval - mynow
                    how_late = how_late.days * 86400 + how_late.seconds

                    late = feed.date_last_fetch + interval < mynow

                    basefeed_refresh_task.apply_async(
                        args=(feed.id, ),
                        kwargs={'force': force},
                        expire=this_round_expire_time,
                    )

                    LOGGER.info(u'Launched refresh of feed %s (%s %s).',
                                feed, naturaldelta(how_late),
                                u'late' if late else u'earlier')
                    count += 1

        finally:
            # HEADS UP: in case the system is overloaded and feeds refresh()
            #           tasks don't complete fast enough, the current task
            #           will overload it even more. Thus, we intentionally
            #           don't release the lock, to avoid over-re-launched
            #           global tasks feeding the refresh queue with useless
            #           double-triple-Nble individual tasks.
            #
            # my_lock.release()
            pass

    LOGGER.info(u'Launched %s refreshes out of %s feed(s) checked.',
                count, feeds.count())
def throttle_feed_refresh(force=False):
    u""" Be sure we don't overflow queues uselessly. """

    if config.FEED_FETCH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Feed refresh disabled in configuration.')
        return

    my_lock = RedisExpiringLock(
        THROTTLE_REFRESH_LOCK_NAME,
        expire_time=58
    )

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing feed refresh throttling…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'throttle_feed_refresh() is already locked, '
                           u'aborting.')
            return

    queues = {
        q['name']: q['backing_queue_status']['len']
        for q in rabbitmq_queues()
    }

    relations = {
        r[0]: r[1] for r in postgresql_relations_sizes()
    }

    feed_qitems = queues['refresh']
    feeds_count = relations[BaseFeed._meta.db_table]

    low_limit = feeds_count / 5

    try:
        if feed_qitems > feeds_count:
            try:
                refresh_all_feeds.lock.release()

            except:
                pass

            refresh_all_feeds.lock.acquire()

            LOGGER.warning(u'Throttled feed refresh because queue items '
                           u'is going too high (%s > %s)',
                           feed_qitems, feeds_count)

        elif feed_qitems < low_limit:
            # Unleash the kraken!
            try:
                refresh_all_feeds.lock.release()

            except:
                pass

            LOGGER.info(u'Unthrottled feed refreshes, queue items number '
                        u'is low enough (%s for %s feeds).',
                        feed_qitems, feeds_count)

        else:
            LOGGER.debug(u'Not throttled, %s < items(%s) <= feeds(%s).',
                         low_limit, feed_qitems, feeds_count)

    finally:
        my_lock.release()
def refresh_all_mongo_feeds(limit=None, force=False):
    u""" Refresh all MongoEngine feeds (RSS).

    .. note:: this task should vanish when MongoDB → PostgreSQL
        migration is done.
    """

    if config.FEED_FETCH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Feed refresh disabled in configuration.')
        return

    # Be sure two refresh operations don't overlap, but don't hold the
    # lock too long if something goes wrong. In production conditions
    # as of 20130812, refreshing all feeds takes only a moment:
    # [2013-08-12 09:07:02,028: INFO/MainProcess] Task
    #   oneflow.core.tasks.refresh_all_mongo_feeds succeeded in 1.99886608124s.
    my_lock = RedisExpiringLock(
        'refresh_all_mongo_feeds',
        expire_time=config.FEED_GLOBAL_REFRESH_INTERVAL * 180 - 1
    )

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing all feed refresh…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'refresh_all_mongo_feeds() is already '
                           u'locked, aborting.')
            return

    feeds = MongoFeed.objects.filter(closed__ne=True, is_internal__ne=True)

    if limit:
        feeds = feeds.limit(limit)

    # No need for caching and cluttering CPU/memory for a one-shot thing.
    feeds.no_cache()

    with benchmark('refresh_all_mongo_feeds()'):
        try:
            count = 0
            mynow = now()

            for feed in feeds:

                if feed.refresh_lock.is_locked():
                    LOGGER.info(u'Feed %s already locked, skipped.', feed)
                    continue

                interval = timedelta(seconds=feed.fetch_interval)

                if feed.last_fetch is None:

                    mongo_feed_refresh_task.delay(feed.id)

                    LOGGER.info(u'Launched immediate refresh of feed %s which '
                                u'has never been refreshed.', feed)

                elif force or feed.last_fetch + interval < mynow:

                    how_late = feed.last_fetch + interval - mynow
                    how_late = how_late.days * 86400 + how_late.seconds

                    countdown = 0
                    mongo_feed_refresh_task.delay(feed.id, force)

                    LOGGER.info(u'%s refresh of feed %s %s (%s late).',
                                u'Scheduled randomized'
                                if countdown else u'Launched',
                                feed,
                                u' in {0}'.format(naturaldelta(countdown))
                                if countdown else u'in the background',
                                naturaldelta(how_late))

                count += 1

        finally:
            # HEADS UP: see core.tasks.refresh_all_feeds() for note.
            # my_lock.release()
            pass

    LOGGER.info(u'Launched %s refreshes out of %s feed(s) checked.',
                count, feeds.count())
def reprocess_failed_articles(failed=None, expiry=None, limit=None,
                              force=False, reprocessing_type=None):
    u""" Reprocess articles that failed absolutization.

    In case there was a temporary error, this could lead to more good
    articles.
    """

    if config.ARTICLE_REPROCESSING_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Articles reprocess disabled in configuration.')
        return

    if failed is None:
        raise RuntimeError(u'Need a queryset of failed items to reprocess.')

    # TODO: as the celery task expires,
    #       the lock is probably not needed anymore.
    my_lock = RedisExpiringLock('reprocess_failed_articles_' + str(expiry),
                                expire_time=expiry)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing failed articles reprocessing…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'reprocess_failed_articles() is already locked, '
                           u'aborting.')
            return

    failed_count = failed.count()

    with benchmark((u'Reprocess_failed_articles(expiry=%s): %s '
                    u'processing chains relaunched.') % (
                        naturaldelta(expiry), failed_count)):
        try:
            for article in failed.iterator():

                if reprocessing_type is None:
                    article.url_error = None
                    article.save()

                    article_post_create_task.apply(args=(article.id, ),
                                                   kwargs={'apply_now': True})

                elif reprocessing_type == 'standard':
                    article.process()

        finally:
            # HEADS UP: in case the system is overloaded, we intentionally
            #           don't release the lock, to avoid over-re-launched
            #           global tasks flooding the queue with useless
            #           double-triple-Nble individual tasks.
            #
            # my_lock.release()
            pass
def global_users_checker(limit=None, extended_check=False, force=False,
                         verbose=False, break_on_exception=False):
    """ Check all Users and their dependencies.

    Can be disabled by the ``config.CHECK_USERS_DISABLED`` directive.

    :param limit: integer, the maximum number of users to check.
        Default: none.
    :param extended_check: boolean, default ``False``. Forwarded
        to :func:`check_one_user`.
    :param force: boolean, default ``False``, allows to bypass and
        reacquire the lock.
    :param verbose: boolean, default ``False``. Forwarded
        to :func:`check_one_user`.
    :param break_on_exception: boolean, default ``False``, currently
        ignored in this function.
    """

    if config.CHECK_USERS_DISABLED:
        LOGGER.warning(u'Users check disabled in configuration.')
        return

    # This task runs twice a day. Acquire the lock for just a
    # little more time (13h, because Redis doesn't like floats)
    # to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_users', expire_time=3600 * 13)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing users check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_users_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_USERS_LIMIT

    active_users = User.objects.filter(is_active=True)
    total_users_count = active_users.count()

    processed_users = 0
    changed_users = 0
    skipped_count = 0

    with benchmark(u"Check {0}/{1} users".format(limit or u'all',
                   total_users_count)):
        try:
            for user in active_users.iterator():

                processed_users += 1

                if limit and changed_users >= limit:
                    break

                check_one_user(user,
                               extended_check=extended_check,
                               force=force,
                               verbose=verbose)

        finally:
            my_lock.release()

    LOGGER.info(u'global_users_checker(): %s/%s users processed '
                u'(%.2f%%), %s corrected (%.2f%%), %s skipped (%.2f%%).',
                processed_users, total_users_count,
                processed_users * 100.0 / total_users_count,
                changed_users,
                changed_users * 100.0 / processed_users,
                skipped_count,
                skipped_count * 100.0 / processed_users)
def global_orphaned_checker(limit=None, extended_check=False, force=False,
                            verbose=False, break_on_exception=False):
    """ Check all orphaned articles and delete them.

    They will be deleted only if they are duplicates of other orphaned
    ones, and only if the duplication replacement process finished
    successfully. If it failed, the orphan is left in place, to be able
    to re-run the operation later.

    Can be disabled by the ``config.CHECK_ORPHANED_DISABLED`` directive.

    :param limit: integer, the maximum number of orphaned articles to
        check. Default: none.
    :param extended_check: boolean, default ``False``. Forwarded
        to :func:`check_one_user`.
    :param force: boolean, default ``False``, allows to bypass and
        reacquire the lock.
    :param verbose: boolean, default ``False``. Forwarded
        to :func:`check_one_user`.
    :param break_on_exception: boolean, default ``False``, currently
        ignored in this function.
    """

    if config.CHECK_ORPHANED_DISABLED:
        LOGGER.warning(u'Orphaned check disabled in configuration.')
        return

    # This task runs twice a day. Acquire the lock for just a
    # little more time (13h, because Redis doesn't like floats)
    # to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_orphaned', expire_time=3600 * 13)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing orphaned check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_orphaned_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_ORPHANED_LIMIT

    orphaned_items = Article.objects.orphaned().master()
    orphaned_items_count = orphaned_items.count()

    processed_orphans = 0
    changed_orphans = 0
    deleted_orphans = 0
    skipped_orphans = 0

    with benchmark(u"Check {0}/{1} orphans".format(limit or u'all',
                   orphaned_items_count)):
        try:
            for orphan in orphaned_items.iterator():

                processed_orphans += 1

                if limit and changed_orphans >= limit:
                    break

                old_url = orphan.url

                new_url = ARTICLE_ORPHANED_BASE + generate_orphaned_hash(
                    orphan.name, orphan.feeds.all())

                if new_url != old_url:
                    orphan.url = new_url
                    orphan.url_absolute = True

                else:
                    if not orphan.url_absolute:
                        changed_orphans += 1
                        orphan.url_absolute = True
                        orphan.save()

                    continue

                try:
                    orphan.save()

                except IntegrityError:
                    master = Article.objects.get(url=orphan.url)

                    # We have to put back the original URL, else the
                    # duplicate registration process will fail.
                    orphan.url = old_url

                    # Register the duplicate right here and now, to be able
                    # to delete the orphan immediately if the replacement
                    # process finishes successfully.
                    master.register_duplicate(orphan, force=force,
                                              background=False)

                    # Reload the orphan to get the refreshed duplicate status.
                    orphan = Article.objects.get(id=orphan.id)

                    if orphan.duplicate_status == DUPLICATE_STATUS.FINISHED:
                        orphan.delete()
                        deleted_orphans += 1

                        if verbose:
                            LOGGER.info(u'Deleted duplicate orphan %s', orphan)

                except:
                    skipped_orphans += 1
                    LOGGER.exception(u'Unhandled exception while checking %s',
                                     orphan)

                else:
                    changed_orphans += 1

        finally:
            my_lock.release()

    LOGGER.info(u'global_orphans_checker(): %s/%s orphans processed '
                u'(%.2f%%), %s corrected (%.2f%%), %s deleted (%.2f%%), '
                u'%s skipped (%.2f%%).',
                processed_orphans, orphaned_items_count,
                processed_orphans * 100.0 / orphaned_items_count,
                changed_orphans,
                changed_orphans * 100.0 / processed_orphans,
                deleted_orphans,
                deleted_orphans * 100.0 / processed_orphans,
                skipped_orphans,
                skipped_orphans * 100.0 / processed_orphans)
# ————————————————————————————————————————————————————————————————— start ghost


if config.FEED_FETCH_GHOST_ENABLED:
    try:
        import ghost

    except:
        ghost = None  # NOQA

    else:
        GHOST_BROWSER = ghost.Ghost()

else:
    ghost = None  # NOQA


# Until we patch Ghost to use more than one Xvfb at the same time,
# we are tied to ensure there is only one running at a time.
global_ghost_lock = RedisExpiringLock('__ghost.py__')


# ——————————————————————————————————————————————————————————— QuerySet patching


def BaseItemQuerySet_empty_method(self):
    """ Patch BaseItemQuerySet to know how to return empty content. """

    return self.filter(content_type__in=[None, CONTENT_TYPES.NONE])


def BaseItemQuerySet_parsed_method(self):
    """ Patch BaseItemQuerySet to know how to return parsed content. """

    return self.filter(content_type__in=CONTENT_TYPES_FINAL)
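The two patch methods above are presumably bound onto the queryset class afterwards. A hedged sketch of that binding follows; the exact attribute names (`empty`, `parsed`) are assumptions based on the docstrings, not shown in the snippet.

# Hedged sketch: attach the patch methods to the queryset class so they can be
# chained like built-in queryset methods (attribute names are assumptions).
BaseItemQuerySet.empty = BaseItemQuerySet_empty_method
BaseItemQuerySet.parsed = BaseItemQuerySet_parsed_method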
def User_share_lock_property_get(self):
    """ Return a redis expiring lock, to avoid sharing to the same user
        too often. """

    return RedisExpiringLock(self, 'share')
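A hedged usage sketch for the share lock above; share_with_user() and item.share() are illustrative assumptions, only the property getter comes from the snippet.

# Illustrative guard against sharing with the same user too often. Assumes the
# getter above is exposed as a `share_lock` property on the User model.
def share_with_user(item, recipient):

    if not recipient.share_lock.acquire():
        LOGGER.warning(u'Recently shared something with %s, skipping.',
                       recipient)
        return

    # No release(): the lock expires on its own, which is what rate-limits us.
    item.share(recipient)  # illustrative placeholder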
def global_reads_checker(limit=None, extended_check=False, force=False,
                         verbose=False, break_on_exception=False):
    """ Check all reads and their dependants.

    Will activate reads that are currently bad, but whose article is OK
    to display. This task is one of the most expensive things in 1flow.
    It can run for hours, because it scans all the bad reads and their
    articles, but it will not kill the database with massive updates; it
    does them one by one.

    Can be disabled by the ``config.CHECK_READS_DISABLED`` directive.

    :param limit: integer, the maximum number of reads to check.
        Default: none.
    :param extended_check: boolean, default ``False``. Runs
        :meth:`Read.set_subscriptions` if ``True`` and the checked read
        has no subscription.
    :param force: boolean, default ``False``, allows to bypass and
        reacquire the lock.
    :param verbose: boolean, default ``False``, display (more)
        informative messages.
    :param break_on_exception: boolean, default ``False``, stop
        processing at the first encountered exception. Whatever it is,
        the exception will be logged to sentry.
    """

    if config.CHECK_READS_DISABLED:
        LOGGER.warning(u'Reads check disabled in configuration.')
        return

    # This task runs twice a day. Acquire the lock for just a
    # little more time (13h, because Redis doesn't like floats)
    # to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_reads', expire_time=3600 * 13)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing reads check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_reads_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_READS_LIMIT

    bad_reads = Read.objects.bad()

    total_reads_count = bad_reads.count()
    processed_reads = 0
    wiped_reads_count = 0
    changed_reads_count = 0
    skipped_count = 0

    with benchmark(u"Check {0}/{1} reads".format(limit or u'all',
                   total_reads_count)):
        try:
            for read in bad_reads.iterator():

                processed_reads += 1

                if limit and changed_reads_count >= limit:
                    break

                if read.is_good:
                    # This read has been activated via another
                    # checked one, attached to the same article.
                    changed_reads_count += 1
                    continue

                try:
                    article = read.item

                except:
                    LOGGER.critical(u'Could not get read.item for %s', read)
                    continue

                if extended_check:
                    try:
                        if read.subscriptions.all().exists():

                            # TODO: remove this
                            #       check_set_subscriptions_131004_done
                            #       transient check.
                            if read.check_set_subscriptions_131004_done:
                                read.check_subscriptions()

                            else:
                                read.check_set_subscriptions_131004()

                        else:
                            read.set_subscriptions()

                    except:
                        skipped_count += 1
                        LOGGER.exception(u'Could not set subscriptions on '
                                         u'read #%s, from article #%s, for '
                                         u'user #%s. Skipping.',
                                         read.id, article.id, read.user.id)
                        continue

                try:
                    if article.is_good:
                        changed_reads_count += 1

                        if verbose:
                            LOGGER.info(u'Bad read %s has a good article, '
                                        u'fixing…', read)

                        article.activate_reads(extended_check=extended_check)

                except:
                    LOGGER.exception(u'Could not activate reads from '
                                     u'article %s of read %s.',
                                     article, read)

                    if break_on_exception:
                        break

        finally:
            my_lock.release()

    LOGGER.info(u'global_reads_checker(): %s/%s reads processed '
                u'(%.2f%%), %s corrected (%.2f%%), %s deleted (%.2f%%), '
                u'%s skipped (%.2f%%).',
                processed_reads, total_reads_count,
                processed_reads * 100.0 / total_reads_count,
                changed_reads_count,
                changed_reads_count * 100.0 / processed_reads,
                wiped_reads_count,
                wiped_reads_count * 100.0 / processed_reads,
                skipped_count,
                skipped_count * 100.0 / processed_reads)
            refresh_all_feeds.lock.acquire()

            LOGGER.warning(u'Throttled feed refresh because queue items '
                           u'is going too high (%s > %s)',
                           feed_qitems, feeds_count)

        elif feed_qitems < low_limit:
            # Unleash the kraken!
            try:
                refresh_all_feeds.lock.release()

            except:
                pass

            LOGGER.info(u'Unthrottled feed refreshes, queue items number '
                        u'is low enough (%s for %s feeds).',
                        feed_qitems, feeds_count)

        else:
            LOGGER.debug(u'Not throttled, %s < items(%s) <= feeds(%s).',
                         low_limit, feed_qitems, feeds_count)

    finally:
        my_lock.release()


# Allow to release the lock manually for testing purposes.
throttle_feed_refresh.lock = RedisExpiringLock(THROTTLE_REFRESH_LOCK_NAME)
        try:
            synchronize_statsd_articles_gauges(full=full)
            synchronize_statsd_tags_gauges(full=full)
            synchronize_statsd_websites_gauges(full=full)
            synchronize_statsd_authors_gauges(full=full)
            synchronize_statsd_feeds_gauges(full=full)
            synchronize_statsd_subscriptions_gauges(full=full)
            synchronize_statsd_reads_gauges(full=full)

        finally:
            my_lock.release()


# Allow to release the lock manually for testing purposes.
synchronize_statsd_gauges.lock = RedisExpiringLock(
    SYNCHRONIZE_STATSD_LOCK_NAME)


@beat_init.connect()
def clear_all_locks(conf=None, **kwargs):
    """ Clear all expiring locks when celery beat starts. """

    for key, value in globals().items():
        if hasattr(value, 'lock'):
            getattr(value, 'lock').release()
            LOGGER.info(u'Released %s() lock.', key)

    locked_count = 0

    for feed in BaseFeed.objects.filter(is_active=True, is_internal=False):
            # HEADS UP: in case the system is overloaded and feeds refresh()
            #           tasks don't complete fast enough, the current task
            #           will overload it even more. Thus, we intentionally
            #           don't release the lock, to avoid over-re-launched
            #           global tasks feeding the refresh queue with useless
            #           double-triple-Nble individual tasks.
            #
            # my_lock.release()
            pass

    LOGGER.info(u'Launched %s refreshes out of %s feed(s) checked.',
                count, feeds.count())


# Allow to release the lock manually for testing purposes.
refresh_all_feeds.lock = RedisExpiringLock(REFRESH_ALL_FEEDS_LOCK_NAME)


@task(name='oneflow.core.tasks.refresh_all_mailaccounts', queue='refresh')
def refresh_all_mailaccounts(force=False):
    """ Check all unusable e-mail accounts. """

    if config.MAIL_ACCOUNT_REFRESH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'E-mail accounts check disabled in configuration.')
        return

    accounts = MailAccount.objects.unusable()

    my_lock = RedisExpiringLock(REFRESH_ALL_MAILACCOUNTS_LOCK_NAME,
                                expire_time=30 * (accounts.count() + 2))