def to_string(cls, results):
    """ Transform results into strings.

    Results must be the value returned by a call of :meth:`classify`.
    """

    errors = results.get(u'error_types')

    output = (
        u'>> %s error types: %s distinct on %s instances, '
        u'computed in %s\n' % (cls.__name__[:-15],
                               len(errors),
                               results.get(u'seen_objects'),
                               naturaldelta(results.get(u'duration'))))

    stored = results.get('stored_instances')

    output += u'\n'.join(
        u'%s: %s' % (k, v) for k, v in sorted(
            errors.items(),
            key=operator.itemgetter(1),
            reverse=True)) + u'\n\n'

    output += u'>> to get them, stored by error kind:\n'
    output += u'\n'.join(u'results.get("stored_instances").get("%s")' % s
                         for s in stored)

    return output
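# Hypothetical usage sketch for to_string() above, not taken from the original
# code base: it assumes to_string() is a module-level function that gets bound
# as a classmethod on a classifier whose name ends with the 15-character
# suffix "ErrorClassifier" (hence the __name__[:-15] slice), and that
# classify() returns a dict with the keys consumed above.

import operator                      # needed by to_string()

from humanize import naturaldelta    # needed by to_string()


class DemoErrorClassifier(object):

    to_string = classmethod(to_string)

    @classmethod
    def classify(cls):
        # Fake results, shaped like what to_string() expects.
        return {
            u'error_types': {u'timeout': 120, u'http-404': 35},
            u'seen_objects': 155,
            u'duration': 12.3,  # seconds
            'stored_instances': {u'timeout': [], u'http-404': []},
        }


print DemoErrorClassifier.to_string(DemoErrorClassifier.classify())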
def reading_time_abstracted(self):

    rtm = self.reading_time

    if rtm is None:
        return u''

    inum = 1
    icon = u'∎'  # u'<i class="icon-time"></i>'
    tmpl = _(u'<span class="popover-top" data-toggle="tooltip" '
             u'title="Reading time: {0}">{1}</span>')
    time = naturaldelta(timedelta(seconds=rtm * 60))

    if rtm > 8:
        inum = 4

    elif rtm > 3:
        inum = 3

    elif rtm > 1:
        inum = 2

    elif rtm == 0:
        # HEADS UP: patch/hack; non-breakable spaces everywhere.
        time = _(u'very quick (<1 min)')

    return tmpl.format(time, inum * icon)
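# Minimal sketch of the icon-count thresholds above, standalone so they can be
# checked without the template/i18n machinery; reading_time is assumed to be
# expressed in minutes, as suggested by the `rtm * 60` conversion.

def _icon_count(rtm):
    if rtm > 8:
        return 4
    if rtm > 3:
        return 3
    if rtm > 1:
        return 2
    return 1  # covers both 0 < rtm <= 1 and the rtm == 0 "very quick" case


assert [_icon_count(m) for m in (0, 1, 2, 3, 4, 8, 9, 60)] == \
    [1, 1, 2, 2, 3, 3, 4, 4]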
def fetch_interval_display(self, obj):
    """ Display the humanized fetch interval of an active feed, or a dash. """

    if obj.is_active:
        with django_language():
            return naturaldelta(obj.fetch_interval)

    return u'—'
def fetch_interval_display(self, obj):
    """ Display the humanized fetch interval of a feed, or a dash if closed. """

    if obj.closed:
        return u'—'

    with django_language():
        return naturaldelta(obj.fetch_interval)
def format_quota(quota):

    if quota['remaining'] is None:
        return u' (no quota information)'

    if quota['remaining']:
        return u'; quota: %s call(s) remaining' % quota['remaining']

    else:
        return u'; quota exhausted, reset in %s' % (
            naturaldelta(now() - quota['reset'].replace(tzinfo=utc)))
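# Usage sketch for format_quota() above; the quota dict shape is inferred from
# the function body, not documented in the original code: 'remaining' is None
# when the API exposes no quota data, a positive count while calls are left,
# and 0 when exhausted, 'reset' then being a naive UTC datetime of the next
# reset.

import datetime

print format_quota({'remaining': None})   # u' (no quota information)'
print format_quota({'remaining': 42})     # u'; quota: 42 call(s) remaining'
print format_quota({
    'remaining': 0,
    'reset': datetime.datetime.utcnow() + datetime.timedelta(hours=1),
})                                        # u'; quota exhausted, reset in …'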
def clean_obsolete_redis_keys():
    """ Call in turn all redis-related cleaners. """

    start_time = pytime.time()

    if today() <= (config.GR_END_DATE + timedelta(days=1)):
        clean_gri_keys()

    LOGGER.info(u'clean_obsolete_redis_keys(): finished in %s.',
                naturaldelta(pytime.time() - start_time))
def reading_time_display(self):

    rtm = self.reading_time

    if rtm is None:
        return u''

    if rtm == 0:
        return _(u'a quick read')

    return _(u'{0} read').format(naturaldelta(timedelta(seconds=rtm * 60)))
def run(self):
    """ Run the import. """

    #
    # NOTE: we don't care if the import was already running, finished,
    #       whatever. This class is able to recover and re-run itself
    #       over and over without doing bad things in the database.
    #

    is_retrying = self.status == IMPORT_STATUS.RETRY

    self.status = IMPORT_STATUS.RUNNING
    self.date_started = now()
    self.save()

    try:
        return self.run_internal()

    except:
        LOGGER.exception(u'User import %s failed', self.id)

        if is_retrying:
            message_user(self.user,
                         _(u'Your import #{0} failed to run after a '
                           u'retry. Please review it before relaunching '
                           u'it manually again.').format(self.id),
                         constants.ERROR)

            self.status = IMPORT_STATUS.FAILED

        else:
            countdown = randrange(1800, 3600)
            delta_cd = naturaldelta(timedelta(seconds=countdown))

            message_user(self.user,
                         _(u'Your import #{0} failed to run. It will '
                           u'be automatically retried in {1}').format(
                               self.id, delta_cd),
                         constants.WARNING)

            globals()['userimport_run_task'].apply_async(
                (self.id, ), countdown=countdown)

            self.status = IMPORT_STATUS.RETRY

        self.save()
def feed_toggle_is_active(request, feed_id):
    """ Toggle a feed active or not (open or closed) from anywhere.

    If called via ajax, send a message to the user and call
    the notification template.
    """

    feed = BaseFeed.objects.get(id=feed_id)

    LOGGER.info('%s %s', feed.is_active, feed)

    if feed.is_active:
        feed.close(u'Closed manually by {0}'.format(request.user.username))

        messages.warning(
            request,
            _(u'Closed {0} <strong>{1}</strong>.').format(
                feed._meta.verbose_name, feed.name),
            extra_tags='safe'
        )

    else:
        feed.reopen()

        messages.success(
            request,
            _(u'Re-opened {0} <strong>{1}</strong>. '
              u'Next fetch in {2}.').format(
                feed._meta.verbose_name, feed.name,
                naturaldelta(feed.fetch_interval)),
            extra_tags='safe'
        )

    if request.is_ajax():
        return render(request, 'snippets/feed/is-active-toggle.html')

    # Standard request (admin or POST)
    if request.user.is_staff_or_superuser_and_enabled:
        fallback_referer = reverse('admin:index')

    else:
        fallback_referer = reverse('home')

    return HttpResponseRedirect(request.META.get('HTTP_REFERER',
                                                 fallback_referer))
def feed_distribution_by_last_fetch_display(results=None):
    """ Display feeds by last_fetch. """

    if results is None:
        results = feed_distribution_by_last_fetch()

    meta = results.get('meta')
    output = u''

    for loop_count in xrange(meta.get('loop_count')):
        feeds, count, percent, lower_value, upper_value, avg_fi = \
            results.get(loop_count)

        output += u'%s feeds (%.1f%%) fetched ' % (count, float(percent))

        if lower_value is None:
            output += u'less than %s ago' % naturaldelta(upper_value)

        elif upper_value is None:
            output += u'more than %s ago' % naturaldelta(lower_value)

        else:
            output += u'between %s and %s ago' % (naturaldelta(lower_value),
                                                  naturaldelta(upper_value))

        output += (u', avg fetch interval: %s' % naturaldelta(avg_fi)
                   + (u' — in results[%s]\n' % loop_count))

    if meta['fetched_feeds'] == meta['open_feeds_count']:
        output += u'\n>>> All open feeds are being fetched.'

    else:
        output += (u'%s total feeds fetched, out of %s open feeds.\n'
                   u'[computed in %s]') % (meta['fetched_feeds'],
                                           meta['open_feeds_count'],
                                           naturaldelta(meta['duration']))

    return results, output
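# Assumed shape of the `results` dict consumed above, inferred from the loop
# (integer keys 0..loop_count-1 mapping to 6-tuples, plus a 'meta' entry);
# the exact value types are a guess, not documented in the original code.

from datetime import timedelta

example_results = {
    'meta': {
        'loop_count': 2,          # number of buckets below
        'fetched_feeds': 120,     # feeds fetched at least once
        'open_feeds_count': 150,  # currently open feeds
        'duration': 0.42,         # seconds spent computing the distribution
    },
    # (feeds, count, percent, lower_value, upper_value, avg_fetch_interval)
    0: ([], 80, 66.7, None, timedelta(hours=1), 1800),
    1: ([], 40, 33.3, timedelta(hours=1), None, 43200),
}

# results, output = feed_distribution_by_last_fetch_display(example_results)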
def global_feeds_checker():
    """ Check all RSS feeds and their dependants. Close them if needed.

    No parameter.
    """

    def pretty_print_feed(feed):

        return (u'- %s,\n'
                u' - admin url: http://%s%s\n'
                u' - public url: %s\n'
                u' - %s\n'
                u' - reason: %s\n'
                u' - last error: %s') % (
            feed,

            settings.SITE_DOMAIN,

            reverse('admin:%s_%s_change' % (feed._meta.app_label,
                    feed._meta.module_name), args=[feed.id]),

            # Only RSS/Atom feeds have an URL…
            feed.url if hasattr(feed, 'url') else '(NO URL)',

            (u'closed on %s' % feed.date_closed)
            if feed.date_closed else u'(no closing date)',

            feed.closed_reason
            or u'none (or manually closed from the admin interface)',

            feed.errors[0] if len(feed.errors) else u'(no error recorded)')

    def pretty_print_feed_list(feed_list):

        return '\n\n'.join(pretty_print_feed(feed) for feed in feed_list)

    dtnow = now()
    limit_days = config.FEED_CLOSED_WARN_LIMIT
    closed_limit = dtnow - timedelta(days=limit_days)
    closed_tested = 0
    reopened_list = []

    # ———————————————————————————————— See if old closed feeds can be reopened.

    old_closed_feeds = BaseFeed.objects.filter(is_active=False).filter(
        date_closed__lt=closed_limit)

    for feed in old_closed_feeds:
        # check all closed feeds monthly, on their closing date anniversary.
        if feed.date_closed.day == dtnow.day:
            if feed.check_old_closed():
                reopened_list.append(feed)
            closed_tested += 1

    # ——————————————————————————————————————————— Report recently closed feeds.

    recently_closed_feeds = BaseFeed.objects.filter(is_active=False).filter(
        Q(date_closed=None) | Q(date_closed__gte=closed_limit))

    if not recently_closed_feeds.exists():
        LOGGER.info(u'No feed was closed in the last %s days; %s already-closed '
                    u'feeds were checked for a possible come-back, of which '
                    u'%s were reopened.',
                    limit_days, closed_tested, len(reopened_list))
        return

    count = recently_closed_feeds.count()

    mail_managers(_(u'Reminder: {0} feed(s) closed in last '
                    u'{1} day(s), {2} automatically reopened').format(
                        count, limit_days, len(reopened_list)),
                  FEED_CHECK_TEMPLATE_TXT.format(
                      feed_list=pretty_print_feed_list(recently_closed_feeds),
                      closed_tested=closed_tested,
                      reopened_count=len(reopened_list),
                      reopened_list=pretty_print_feed_list(reopened_list)),
                  )

    start_time = pytime.time()

    # Close the feeds, but only after sending the mail,
    # so that the initial reason is displayed at least
    # once to a real human.
    for feed in recently_closed_feeds:
        if feed.date_closed is None:
            feed.close('Automatic close by periodic checker task')

    LOGGER.info('Closed %s feeds in %s.', count,
                naturaldelta(pytime.time() - start_time))
def global_duplicates_checker(limit=None, force=False):
    """ Check that duplicate articles have no more Reads anywhere.

    Fix it if not, and update all counters accordingly.

    :param limit: integer, the maximum number of duplicates to check.
        Default: none.
    :param force: boolean, default ``False``, allows to bypass and
        re-acquire the lock.
    """

    if config.CHECK_DUPLICATES_DISABLED:
        LOGGER.warning(u'Duplicates check disabled in configuration.')
        return

    # This task runs once a day. Acquire the lock for just a
    # little more time to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_duplicates', expire_time=3600 * 25)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing duplicates check…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_subscriptions_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_DUPLICATES_LIMIT

    start_time = pytime.time()
    duplicates = BaseItem.objects.duplicate()

    total_dupes_count = duplicates.count()
    total_reads_count = 0
    processed_dupes = 0
    done_dupes_count = 0
    purged_dupes_count = 0

    purge_after_weeks_count = max(1, config.CHECK_DUPLICATES_PURGE_AFTER_WEEKS)
    purge_after_weeks_count = min(52, purge_after_weeks_count)
    purge_before_date = now() - timedelta(days=purge_after_weeks_count * 7)

    LOGGER.info(u'Done counting (took %s of pure SQL joy), starting procedure.',
                naturaldelta(pytime.time() - start_time))

    with benchmark(u"Check {0}/{1} duplicates".format(limit or u'all',
                   total_dupes_count)):

        try:
            for duplicate in duplicates.iterator():
                reads = duplicate.reads.all()

                processed_dupes += 1

                if reads.exists():
                    done_dupes_count += 1
                    reads_count = reads.count()
                    total_reads_count += reads_count

                    LOGGER.info(u'Duplicate %s #%s still has %s reads, fixing…',
                                duplicate._meta.model.__name__,
                                duplicate.id, reads_count)

                    duplicate.duplicate_of.register_duplicate(
                        duplicate,
                        force=duplicate.duplicate_status ==
                        DUPLICATE_STATUS.FINISHED)

                if duplicate.duplicate_status == DUPLICATE_STATUS.FINISHED:
                    #
                    # TODO: check we didn't pick up new race-condition
                    #       dependencies between the moment the duplicate
                    #       was marked duplicate and now.

                    if duplicate.date_created < purge_before_date:
                        try:
                            with transaction.atomic():
                                duplicate.delete()

                        except:
                            LOGGER.exception(u'Exception while deleting '
                                             u'duplicate %s #%s',
                                             duplicate._meta.model.__name__,
                                             duplicate.id)

                        purged_dupes_count += 1
                        LOGGER.info(u'Purged duplicate %s #%s from database.',
                                    duplicate._meta.model.__name__,
                                    duplicate.id)

                elif duplicate.duplicate_status in (
                        DUPLICATE_STATUS.NOT_REPLACED,
                        DUPLICATE_STATUS.FAILED):
                    # Something went wrong, perhaps the
                    # task was purged before being run.
                    duplicate.duplicate_of.register_duplicate(duplicate)
                    done_dupes_count += 1

                elif duplicate.duplicate_status is None:
                    # Something went very wrong. If the article is a known
                    # duplicate, its status field should have been set to
                    # at least NOT_REPLACED.
                    duplicate.duplicate_of.register_duplicate(duplicate)
                    done_dupes_count += 1

                    LOGGER.error(u'Corrected duplicate %s #%s found with no '
                                 u'status.',
                                 duplicate._meta.model.__name__,
                                 duplicate.id)

                if limit and processed_dupes >= limit:
                    break

        finally:
            my_lock.release()

    LOGGER.info(u'global_duplicates_checker(): %s/%s duplicates processed '
                u'(%.2f%%; limit: %s), %s corrected (%.2f%%), '
                u'%s purged (%.2f%%); %s reads altered.',

                processed_dupes, total_dupes_count,
                processed_dupes * 100.0 / total_dupes_count,
                limit or u'none',

                done_dupes_count,
                (done_dupes_count * 100.0 / processed_dupes)
                if processed_dupes else 0.0,

                purged_dupes_count,
                (purged_dupes_count * 100.0 / processed_dupes)
                if processed_dupes else 0.0,

                total_reads_count)
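# Clamping of the purge window above, shown standalone: whatever the
# CHECK_DUPLICATES_PURGE_AFTER_WEEKS setting says, the horizon stays between
# one week and one year.

assert min(52, max(1, 0)) == 1      # zero/absurdly low config → 1 week
assert min(52, max(1, 8)) == 8      # sane value passes through
assert min(52, max(1, 500)) == 52   # absurdly high value capped at 52 weeks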
def refresh_all_mongo_feeds(limit=None, force=False):
    u""" Refresh all MongoEngine feeds (RSS).

    .. note:: this task should vanish when MongoDB → PostgreSQL
        migration is done.
    """

    if config.FEED_FETCH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Feed refresh disabled in configuration.')
        return

    # Be sure two refresh operations don't overlap, but don't hold the
    # lock too long if something goes wrong. In production conditions
    # as of 20130812, refreshing all feeds takes only a moment:
    # [2013-08-12 09:07:02,028: INFO/MainProcess] Task
    #   oneflow.core.tasks.refresh_all_mongo_feeds succeeded in 1.99886608124s.
    my_lock = RedisExpiringLock(
        'refresh_all_mongo_feeds',
        expire_time=config.FEED_GLOBAL_REFRESH_INTERVAL * 180 - 1)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing all feed refresh…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'refresh_all_mongo_feeds() is already '
                           u'locked, aborting.')
            return

    feeds = MongoFeed.objects.filter(closed__ne=True, is_internal__ne=True)

    if limit:
        feeds = feeds.limit(limit)

    # No need for caching and cluttering CPU/memory for a one-shot thing.
    feeds.no_cache()

    with benchmark('refresh_all_mongo_feeds()'):
        try:
            count = 0
            mynow = now()

            for feed in feeds:

                if feed.refresh_lock.is_locked():
                    LOGGER.info(u'Feed %s already locked, skipped.', feed)
                    continue

                interval = timedelta(seconds=feed.fetch_interval)

                if feed.last_fetch is None:

                    mongo_feed_refresh_task.delay(feed.id)

                    LOGGER.info(u'Launched immediate refresh of feed %s which '
                                u'has never been refreshed.', feed)

                elif force or feed.last_fetch + interval < mynow:

                    how_late = feed.last_fetch + interval - mynow
                    how_late = how_late.days * 86400 + how_late.seconds

                    countdown = 0
                    mongo_feed_refresh_task.delay(feed.id, force)

                    LOGGER.info(u'%s refresh of feed %s %s (%s late).',
                                u'Scheduled randomized'
                                if countdown else u'Launched',
                                feed,
                                u' in {0}'.format(naturaldelta(countdown))
                                if countdown else u'in the background',
                                naturaldelta(how_late))

                count += 1

        finally:
            # HEADS UP: see core.tasks.refresh_all_feeds() for note.
            # my_lock.release()
            pass

    LOGGER.info(u'Launched %s refreshes out of %s feed(s) checked.',
                count, feeds.count())
def refresh_all_feeds(limit=None, force=False):
    u""" Refresh all feeds (RSS/Mail/Twitter…). """

    if config.FEED_FETCH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Feed refresh disabled in configuration.')
        return

    # As FEED_GLOBAL_REFRESH_INTERVAL is dynamically modifiable,
    # we should re-evaluate it each time we run.
    this_round_expire_time = (
        config.FEED_GLOBAL_REFRESH_INTERVAL * 60
        - config.FEED_GLOBAL_REFRESH_INTERVAL
    )

    # Be sure two refresh operations don't overlap, but don't hold the
    # lock too long if something goes wrong. In production conditions
    # as of 20130812, refreshing all feeds takes only a moment:
    # [2013-08-12 09:07:02,028: INFO/MainProcess] Task
    #   oneflow.core.tasks.refresh_all_feeds succeeded in 1.99886608124s.
    #
    my_lock = RedisExpiringLock(
        REFRESH_ALL_FEEDS_LOCK_NAME,
        expire_time=this_round_expire_time
    )

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing all feed refresh…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'refresh_all_feeds() is already locked, aborting.')
            return

    # This should bring us a Polymorphic Query to refresh all feeds types.
    feeds = BaseFeed.objects.filter(is_active=True,
                                    is_internal=False).order_by(
                                        'date_last_fetch')

    if limit:
        feeds = feeds[:limit]

    with benchmark('refresh_all_feeds()'):
        try:
            count = 0
            mynow = now()

            for feed in feeds:

                if feed.refresh_lock.is_locked():
                    # The refresh task launched before its expiration, and is
                    # still [long] running while we want to launch another.
                    # Avoid, because the new one would exit immediately on
                    # date_last_fetch too recent.
                    LOGGER.debug(u'Feed %s already locked, skipped.', feed)
                    continue

                if feed.date_last_fetch is None:

                    basefeed_refresh_task.apply_async(
                        args=(feed.id, ),

                        # in `this_round_expire_time`, we will relaunch it
                        # anyway, so don't clutter the queue with double work.
                        expire=this_round_expire_time,
                    )

                    LOGGER.info(u'Launched immediate refresh of feed %s which '
                                u'has never been refreshed.', feed)

                    count += 1
                    continue

                if feed.fetch_interval > 86399:
                    interval_days = feed.fetch_interval / 86400
                    interval_seconds = feed.fetch_interval - (
                        interval_days * 86400)

                    interval = timedelta(days=interval_days,
                                         seconds=interval_seconds)

                else:
                    interval = timedelta(seconds=feed.fetch_interval)

                if force or feed.date_last_fetch + interval < mynow:

                    how_late = feed.date_last_fetch + interval - mynow
                    how_late = how_late.days * 86400 + how_late.seconds

                    late = feed.date_last_fetch + interval < mynow

                    basefeed_refresh_task.apply_async(
                        args=(feed.id, ),
                        kwargs={'force': force},
                        expire=this_round_expire_time,
                    )

                    LOGGER.info(u'Launched refresh of feed %s (%s %s).',
                                feed, naturaldelta(how_late),
                                u'late' if late else u'earlier')

                    count += 1

        finally:
            # HEADS UP: in case the system is overloaded and feeds refresh()
            #           tasks don't complete fast enough, the current task
            #           will overload it even more. Thus, we intentionally
            #           don't release the lock, to avoid over-re-launched
            #           global tasks feeding the refresh queue with useless
            #           double-triple-Nble individual tasks.
            #
            # my_lock.release()
            pass

    LOGGER.info(u'Launched %s refreshes out of %s feed(s) checked.',
                count, feeds.count())
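# Worked example of the scheduling math above, assuming
# config.FEED_GLOBAL_REFRESH_INTERVAL is a number of minutes.

FEED_GLOBAL_REFRESH_INTERVAL = 5  # assumed value, in minutes

this_round_expire_time = (FEED_GLOBAL_REFRESH_INTERVAL * 60
                          - FEED_GLOBAL_REFRESH_INTERVAL)
assert this_round_expire_time == 295  # expires just before the next global run

# The fetch_interval branch splits seconds into days + seconds only past one
# day; timedelta(seconds=n) would also accept the large value directly.
from datetime import timedelta
assert timedelta(days=1, seconds=3600) == timedelta(seconds=90000)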
def reprocess_failed_articles(failed=None, expiry=None, limit=None,
                              force=False, reprocessing_type=None):
    u""" Reprocess articles that failed absolutization.

    In case there was a temporary error, this could lead to more
    good articles.
    """

    if config.ARTICLE_REPROCESSING_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Articles reprocess disabled in configuration.')
        return

    if failed is None:
        raise RuntimeError(u'Need a queryset of failed items to reprocess.')

    # TODO: as the celery tasks expire,
    # the lock is probably not needed anymore.
    my_lock = RedisExpiringLock('reprocess_failed_articles_' + str(expiry),
                                expire_time=expiry)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing failed articles reprocessing…')

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'reprocess_failed_articles() is already locked, '
                           u'aborting.')
            return

    failed_count = failed.count()

    with benchmark((u'Reprocess_failed_articles(expiry=%s): %s '
                    u'processing chains relaunched.')
                   % (naturaldelta(expiry), failed_count)):

        try:
            for article in failed.iterator():
                if reprocessing_type is None:
                    article.url_error = None
                    article.save()

                    article_post_create_task.apply(args=(article.id, ),
                                                   kwargs={'apply_now': True})

                elif reprocessing_type == 'standard':
                    article.process()

        finally:
            # HEADS UP: in case the system is overloaded, we intentionally
            #           don't release the lock, to avoid over-re-launched
            #           global tasks flooding the queue with useless
            #           double-triple-Nble individual tasks.
            #
            # my_lock.release()
            pass
def import_google_reader_begin(user_id, access_token):

    auth = OAuth2Method(settings.GOOGLE_OAUTH2_CLIENT_ID,
                        settings.GOOGLE_OAUTH2_CLIENT_SECRET)
    auth.authFromAccessToken(access_token)
    reader = GoogleReader(auth)

    django_user, mongo_user = get_user_from_dbs(user_id)
    username = django_user.username

    try:
        user_infos = reader.getUserInfo()

    except TypeError:
        LOGGER.exception(u'Could not start Google Reader import for user %s.',
                         username)
        # Don't refresh, it's now done by a dedicated periodic task.
        # If we failed, it means the problem is quite serious.
        #       import_google_reader_trigger(user_id, refresh=True)
        return

    GR_MAX_FEEDS = config.GR_MAX_FEEDS

    LOGGER.info(u'Starting Google Reader import for user %s.', username)

    gri = GoogleReaderImport(user_id)

    # take note of user information now that we have it.
    gri.start(user_infos=user_infos)

    reader.buildSubscriptionList()

    total_reads, reg_date = reader.totalReadItems(without_date=False)
    total_starred, star1_date = reader.totalStarredItems(without_date=False)
    total_feeds = len(reader.feeds) + 1  # +1 for 'starred'

    gri.reg_date(pytime.mktime(reg_date.timetuple()))
    gri.star1_date(pytime.mktime(star1_date.timetuple()))
    gri.total_reads(total_reads)
    gri.total_starred(total_starred)

    LOGGER.info(u'Google Reader import for user %s: %s feed(s) and %s read '
                u'article(s) to go…', username, total_feeds, total_reads)

    if total_feeds > GR_MAX_FEEDS and not settings.DEBUG:
        mail_admins('User {0} has more than {1} feeds: {2}!'.format(
                    username, GR_MAX_FEEDS, total_feeds),
                    u"\n\nThe GR import will be incomplete.\n\n"
                    u"Just for you to know…\n\n")

    # We launch the starred feed import first. Launching it after the
    # standard feeds would delay it until the world's end.
    reader.makeSpecialFeeds()
    starred_feed = reader.getSpecialFeed(ReaderUrl.STARRED_LIST)
    import_google_reader_starred.apply_async((user_id, username, starred_feed),
                                             queue='low')

    processed_feeds = 1
    feeds_to_import = []

    for gr_feed in reader.feeds[:GR_MAX_FEEDS]:

        try:
            feed = create_feed(gr_feed, mongo_user)

        except Feed.DoesNotExist:
            LOGGER.exception(u'Could not create feed “%s” for user %s, '
                             u'skipped.', gr_feed.title, username)
            continue

        processed_feeds += 1
        feeds_to_import.append((user_id, username, gr_feed, feed))

        LOGGER.info(u'Imported feed “%s” (%s/%s) for user %s…',
                    gr_feed.title, processed_feeds, total_feeds, username)

    # We need to clamp the total, else the task won't finish in
    # the case where the user has more feeds than allowed.
    #
    gri.total_feeds(min(processed_feeds, GR_MAX_FEEDS))

    for feed_args in feeds_to_import:
        import_google_reader_articles.apply_async(feed_args, queue='low')

    LOGGER.info(u'Imported %s/%s feeds in %s. Articles import already '
                u'started with limits: date: %s, %s waves of %s articles, '
                u'max articles: %s, reads: %s, starred: %s.',
                processed_feeds, total_feeds,
                naturaldelta(now() - gri.start()),
                naturaltime(max([gri.reg_date(), GR_OLDEST_DATE])),
                config.GR_WAVE_LIMIT, config.GR_LOAD_LIMIT,
                config.GR_MAX_ARTICLES, total_reads, total_starred)