def mark_usable(self, commit=True, verbose=True):
    """ Mark the account usable and clear any connection error.

    :param commit: call ``self.save()`` when ``True`` (default).
    :param verbose: log an informational message when ``True`` (default).
    """

    if verbose:
        LOGGER.info(u'%s is now considered usable.', self)

    # Trigger the start task only when the account transitions
    # from unusable to usable, not on every call.
    start_task = not self.is_usable

    self.date_last_conn = now()
    self.conn_error = None
    self.is_usable = True

    if commit:
        self.save()

    # BUGFIX: was a bare ``except:`` that swallowed everything
    # (including KeyboardInterrupt); narrowed to AttributeError,
    # the only expected failure when the model does not define
    # a ``usable_start_task``.
    try:
        usable_start_task = self._meta.model.usable_start_task

    except AttributeError:
        pass

    else:
        if start_task:
            usable_start_task.delay(self.id)
def mark_all_read(self, latest_displayed_read=None):
    """ Mark every unread item of this subscription as read.

    The heavy database work is delegated to a background task so this
    call returns immediately.
    """

    if self.unread_items_count == 0:
        return

    # count = self.unread_items_count
    # self.unread_items_count = 0

    # for folder in self.folders:
    #     folder.unread_items_count -= count

    # self.user.unread_items_count -= count

    # Marking all read is not a database-friendly operation,
    # thus it's run via a task to be able to return now immediately,
    # with cache numbers updated.
    #
    # HEADS UP: this task name will be registered later
    # by the register_task_method() call.

    if latest_displayed_read is None:
        latest_date = now()

    else:
        # TRICK: we use self.user.reads for 2 reasons:
        #       - avoid importing `Read`, which would create a loop.
        #       - in case of a folder/global initiated mark_all_read(),
        #         the ID can be one of a read in another subscription
        #         and in this case, self.reads.get() will fail.
        latest_date = latest_displayed_read.date_created

    globals()['subscription_mark_all_read_in_database_task'].delay(
        self.id, latest_date)
def older_than_two_weeks(self):
    """ Return items created more than 14 days ago. """

    cutoff = now() - timedelta(days=14)

    return self.filter(date_created__lte=cutoff)
def older_than_one_day(self):
    """ Return items created more than 24 hours ago. """

    cutoff = now() - timedelta(days=1)

    return self.filter(date_created__lte=cutoff)
def mark_usable(self, commit=True, verbose=True):
    """ Mark the account usable and clear any connection error.

    :param commit: call ``self.save()`` when ``True`` (default).
    :param verbose: log an informational message when ``True`` (default).
    """

    if verbose:
        LOGGER.info(u'%s is now considered usable.', self)

    # Trigger the start task only when the account transitions
    # from unusable to usable, not on every call.
    start_task = not self.is_usable

    self.date_last_conn = now()
    self.conn_error = None
    self.is_usable = True

    if commit:
        self.save()

    # BUGFIX: was a bare ``except:`` that swallowed everything
    # (including KeyboardInterrupt); narrowed to AttributeError,
    # the only expected failure when the model does not define
    # a ``usable_start_task``.
    try:
        usable_start_task = self._meta.model.usable_start_task

    except AttributeError:
        pass

    else:
        if start_task:
            usable_start_task.delay(self.id)
def older_than_one_week(self):
    """ Return items created more than 7 days ago. """

    cutoff = now() - timedelta(days=7)

    return self.filter(date_created__lte=cutoff)
def older_than_one_month(self):
    """ Return items created more than 31 days ago. """

    cutoff = now() - timedelta(days=31)

    return self.filter(date_created__lte=cutoff)
def older_than_one_month(self):
    """ Return items created more than 31 days ago. """

    cutoff = now() - timedelta(days=31)

    return self.filter(date_created__lte=cutoff)
def older_than_two_weeks(self):
    """ Return items created more than 14 days ago. """

    cutoff = now() - timedelta(days=14)

    return self.filter(date_created__lte=cutoff)
def older_than_one_day(self):
    """ Return items created more than 24 hours ago. """

    cutoff = now() - timedelta(days=1)

    return self.filter(date_created__lte=cutoff)
def older_than_one_week(self):
    """ Return items created more than 7 days ago. """

    cutoff = now() - timedelta(days=7)

    return self.filter(date_created__lte=cutoff)
def mark_read(self):
    """ Mark a read as read and update cached descriptors.

    Sets the flag and the read date, persists the object, then calls
    the ``is_read_changed()`` hook so cached counters stay in sync.
    """

    self.is_read = True
    self.date_read = now()

    self.save()

    self.is_read_changed()
def running_old(self):
    """ An import running for too long is probably crashed.

    But we didn't notice it, or celery crashed. Whatever.
    """

    # 21600 seconds == 6 hours.
    stale_threshold = now() - timedelta(seconds=21600)

    return (self.status == IMPORT_STATUS.RUNNING
            and self.date_started < stale_threshold)
def error(self, message, commit=True, last_fetch=False):
    """ Take note of an error. If the maximum number of errors is reached,
        close the feed and return ``True``; else just return ``False``.

        :param last_fetch: as a commodity, set this to ``True`` if you
            want this method to update the :attr:`last_fetch` attribute
            with the value of ``now()`` (UTC). Default: ``False``.

        :param commit: as in any other Django DB-related method, set
            this to ``False`` if you don't want this method to call
            ``self.save()``. Default: ``True``.
    """

    LOGGER.error(u'Error on feed %s: %s.', self, message)

    # Timestamp the message so operators can tell when it happened.
    error_message = u'{0} @@{1}'.format(message, now().isoformat())

    # Put the errors more recent first.
    self.errors.insert(0, error_message)

    if last_fetch:
        self.date_last_fetch = now()

    retval = False

    if len(self.errors) >= config.FEED_FETCH_MAX_ERRORS:
        if self.is_active:
            # commit=False: the single save() below persists everything.
            self.close(u'Too many errors on the feed. Last was: %s'
                       % self.errors[0], commit=False)

            # LOGGER.critical(u'Too many errors on feed %s, closed.', self)

        # Keep only the most recent errors.
        self.errors = self.errors[:config.FEED_FETCH_MAX_ERRORS]

        retval = True

    if commit:
        self.save()

    # NOTE: the counter is bumped even when commit=False.
    statsd.incr('feeds.refresh.global.errors')

    return retval
def mark_read(self):
    """ Mark a read as read and update cached descriptors.

    Sets the flag and the read date, persists the object, then calls
    the ``is_read_changed()`` hook so cached counters stay in sync.
    """

    self.is_read = True
    self.date_read = now()

    self.save()

    self.is_read_changed()
def created_previous_hour(self):
    """ Return items created between 61 and 120 minutes inclusive. """

    one_hour = timedelta(seconds=3600)
    upper_bound = now() - one_hour
    lower_bound = upper_bound - one_hour

    return self.filter(date_created__lte=upper_bound,
                       date_created__gte=lower_bound)
def update_last_fetch(self):
    """ Allow to customize the last fetch datetime.

    This method exists to be overriden by “under development” classes,
    to allow not updating the last_fetch attribute, and continue
    fetching data forever on development machines.
    """

    self.date_last_fetch = now()
def created_previous_month(self):
    """ Return items created between 32 and 62 days inclusive. """

    one_month = timedelta(days=31)
    upper_bound = now() - one_month
    lower_bound = upper_bound - one_month

    return self.filter(date_created__lte=upper_bound,
                       date_created__gte=lower_bound)
def older_than_delta(self, custom_timedelta):
    """ Return items created more than :param:`delta` ago.

    :param delta: a python :class:`~datetime.timedelta` object.
    """

    # NOTE(review): this method uses a strict ``__lt`` lookup whereas
    # the sibling ``older_than_*`` methods use ``__lte`` — confirm the
    # asymmetry is intentional.
    cutoff = now() - custom_timedelta

    return self.filter(date_created__lt=cutoff)
def created_previous_week(self):
    """ Return items created between 8 and 14 days inclusive. """

    one_week = timedelta(days=7)
    upper_bound = now() - one_week
    lower_bound = upper_bound - one_week

    return self.filter(date_created__lte=upper_bound,
                       date_created__gte=lower_bound)
def created_previous_day(self):
    """ Return items created between 25 and 48 hours inclusive. """

    one_day = timedelta(days=1)
    upper_bound = now() - one_day
    lower_bound = upper_bound - one_day

    return self.filter(date_created__lte=upper_bound,
                       date_created__gte=lower_bound)
def created_previous_hour(self):
    """ Return items created between 61 and 120 minutes inclusive. """

    one_hour = timedelta(seconds=3600)
    upper_bound = now() - one_hour
    lower_bound = upper_bound - one_hour

    return self.filter(date_created__lte=upper_bound,
                       date_created__gte=lower_bound)
def created_previous_day(self):
    """ Return items created between 25 and 48 hours inclusive. """

    one_day = timedelta(days=1)
    upper_bound = now() - one_day
    lower_bound = upper_bound - one_day

    return self.filter(date_created__lte=upper_bound,
                       date_created__gte=lower_bound)
def created_previous_month(self):
    """ Return items created between 32 and 62 days inclusive. """

    one_month = timedelta(days=31)
    upper_bound = now() - one_month
    lower_bound = upper_bound - one_month

    return self.filter(date_created__lte=upper_bound,
                       date_created__gte=lower_bound)
def older_than_delta(self, custom_timedelta):
    """ Return items created more than :param:`delta` ago.

    :param delta: a python :class:`~datetime.timedelta` object.
    """

    # NOTE(review): this method uses a strict ``__lt`` lookup whereas
    # the sibling ``older_than_*`` methods use ``__lte`` — confirm the
    # asymmetry is intentional.
    cutoff = now() - custom_timedelta

    return self.filter(date_created__lt=cutoff)
def created_previous_week(self):
    """ Return items created between 8 and 14 days inclusive. """

    one_week = timedelta(days=7)
    upper_bound = now() - one_week
    lower_bound = upper_bound - one_week

    return self.filter(date_created__lte=upper_bound,
                       date_created__gte=lower_bound)
def format_quota(quota):
    """ Render a human-readable suffix describing the API quota state. """

    remaining = quota['remaining']

    if remaining is None:
        return u' (no quota information)'

    if remaining:
        return u'; quota: %s call(s) remaining' % remaining

    return u'; quota exhausted, reset in %s' % (
        naturaldelta(now() - quota['reset'].replace(tzinfo=utc)))
def create_tweet_from_id(tweet_id, feeds=None, origin=None):
    """ From a Tweet ID, create a 1flow tweet via the REST API.

    https://dev.twitter.com/rest/reference/get/statuses/show/%3Aid

    .. todo:: use
        http://celery.readthedocs.org/en/latest/reference/celery.contrib.batches.html # NOQA
        to bulk get statuses and not exhaust the API Quota.
    """

    # HEADS UP: everything below this ``raise`` is currently dead code,
    # kept around pending the announced review / redesign.
    raise NotImplementedError('Needs a full review / redesign for tweets.')

    if feeds is None:
        feeds = []

    elif not hasattr(feeds, '__iter__'):
        # Allow passing a single feed instead of an iterable.
        feeds = [feeds]

    # TODO: find tweet publication date while fetching content…
    # TODO: set Title during fetch…

    try:
        new_tweet, created = Tweet.create_tweet(
            url=tweet_id.replace(' ', '%20'),
            title=_(u'Imported item from {0}').format(clean_url(tweet_id)),
            feeds=feeds, origin=ORIGINS.WEBIMPORT)

    except:
        # NOTE: duplication handling is already
        #       taken care of in Tweet.create_tweet().
        LOGGER.exception(u'Tweet creation from URL %s failed.', tweet_id)
        return None, False

    # ``created is None`` is the sentinel for a mutualized (shared) tweet.
    mutualized = created is None

    if created or mutualized:
        for feed in feeds:
            feed.recent_items_count += 1
            feed.all_items_count += 1

    ze_now = now()

    for feed in feeds:
        feed.latest_item_date_published = ze_now

        # Even if the tweet wasn't created, we need to create reads.
        # In the case of a mutualized tweet, it will be fetched only
        # once, but all subscribers of all feeds must be connected to
        # it to be able to read it.
        for subscription in feed.subscriptions.all():
            subscription.create_read(new_tweet, verbose=created)

    # Don't forget the parenthesis else we return ``False`` everytime.
    return new_tweet, created or (None if mutualized else False)
def refresh_must_abort(self, force=False, commit=True):
    """ Return ``True`` if one or more abort conditions is met.

    Checks the feed activity/internal flags, the global configuration
    switch, the feed cache lock and the ``date_last_fetch`` freshness.

    :param force: bypass the lock and the freshness check (logging
        warnings when doing so).
    :param commit: unused in this method; kept for signature
        compatibility with callers.
    """

    if not self.is_active:
        LOGGER.info(u'%s %s: is currently inactive, refresh aborted.',
                    self._meta.verbose_name, self.id)
        return True

    if self.is_internal:
        # BUGFIX: typo “beiing” → “being” in the log message.
        LOGGER.info(u'%s %s: being internal, no need to refresh.',
                    self._meta.verbose_name, self.id)
        return True

    if config.FEED_FETCH_DISABLED:
        # we do not raise .retry() because the global refresh
        # task will call us again anyway at next global check.
        LOGGER.info(u'%s %s: refresh disabled by configuration.',
                    self._meta.verbose_name, self.id)
        return True

    # Optional per-subclass abort hook.
    try:
        if self.refresh_must_abort_internal():
            return True

    except AttributeError:
        pass

    # ———————————————————————————————————————————————— Try to acquire lock

    if not self.refresh_lock.acquire():
        if force:
            LOGGER.warning(u'%s %s: forcing refresh unlocking.',
                           self._meta.verbose_name, self.id)
            self.refresh_lock.release()
            self.refresh_lock.acquire()

        else:
            LOGGER.info(u'%s %s: refresh already locked, aborting.',
                        self._meta.verbose_name, self.id)
            return True

    # Abort when the last fetch is more recent than the fetch interval.
    if self.date_last_fetch is not None and self.date_last_fetch >= (
            now() - timedelta(seconds=self.fetch_interval)):

        if force:
            LOGGER.warning(
                u'%s %s: forcing refresh despite recently '
                u'fetched.', self._meta.verbose_name, self.id)
        else:
            LOGGER.info(u'%s %s: last refresh too recent, aborting.',
                        self._meta.verbose_name, self.id)
            return True

    return False
def format_quota(quota):
    """ Render a human-readable suffix describing the API quota state. """

    remaining = quota['remaining']

    if remaining is None:
        return u' (no quota information)'

    if remaining:
        return u'; quota: %s call(s) remaining' % remaining

    return u'; quota exhausted, reset in %s' % (
        naturaldelta(now() - quota['reset'].replace(tzinfo=utc))
    )
def create_tweet_from_id(tweet_id, feeds=None, origin=None):
    """ From a Tweet ID, create a 1flow tweet via the REST API.

    https://dev.twitter.com/rest/reference/get/statuses/show/%3Aid

    .. todo:: use
        http://celery.readthedocs.org/en/latest/reference/celery.contrib.batches.html # NOQA
        to bulk get statuses and not exhaust the API Quota.
    """

    # HEADS UP: everything below this ``raise`` is currently dead code,
    # kept around pending the announced review / redesign.
    raise NotImplementedError('Needs a full review / redesign for tweets.')

    if feeds is None:
        feeds = []

    elif not hasattr(feeds, '__iter__'):
        # Allow passing a single feed instead of an iterable.
        feeds = [feeds]

    # TODO: find tweet publication date while fetching content…
    # TODO: set Title during fetch…

    try:
        new_tweet, created = Tweet.create_tweet(
            url=tweet_id.replace(' ', '%20'),
            title=_(u'Imported item from {0}').format(clean_url(tweet_id)),
            feeds=feeds, origin=ORIGINS.WEBIMPORT)

    except:
        # NOTE: duplication handling is already
        #       taken care of in Tweet.create_tweet().
        LOGGER.exception(u'Tweet creation from URL %s failed.', tweet_id)
        return None, False

    # ``created is None`` is the sentinel for a mutualized (shared) tweet.
    mutualized = created is None

    if created or mutualized:
        for feed in feeds:
            feed.recent_items_count += 1
            feed.all_items_count += 1

    ze_now = now()

    for feed in feeds:
        feed.latest_item_date_published = ze_now

        # Even if the tweet wasn't created, we need to create reads.
        # In the case of a mutualized tweet, it will be fetched only
        # once, but all subscribers of all feeds must be connected to
        # it to be able to read it.
        for subscription in feed.subscriptions.all():
            subscription.create_read(new_tweet, verbose=created)

    # Don't forget the parenthesis else we return ``False`` everytime.
    return new_tweet, created or (None if mutualized else False)
def toggle(request, klass, oid, key): """ Toggle any object property, given its a boolean on the DB side. """ # # TODO: push notifications on error to the user. # try: obj = get_object_or_404(globals()[klass], id=oid) except: LOGGER.exception(u'Oops in toggle! Model “%s” not imported?', klass) return HttpResponseTemporaryServerError() if obj.user != request.user: return HttpResponseForbidden(u'Not owner') try: new_value = not getattr(obj, key) setattr(obj, key, new_value) except: msg = (u'Unable to toggle %s of %s', key, obj) LOGGER.exception(*msg) return HttpResponseTemporaryServerError(msg[0] % msg[1:]) else: if key.startswith('is_'): date_attr = 'date_' + key[3:] if hasattr(obj, date_attr): # LOGGER.info(u'%s %s: set %s to NOW.', # obj._meta.verbose_name, obj.id, date_attr) setattr(obj, date_attr, now() if new_value else None) try: getattr(obj, key + '_changed')() except AttributeError: pass except: LOGGER.exception( u'Unhandled exception while running ' u', %s.%s_changed() on %s.', obj.__class__.__name__, key, obj) obj.save() if request.is_ajax(): return HttpResponse(u'DONE.') else: return HttpResponseRedirect( request.META.get('HTTP_REFERER', reverse('home')))
def toggle(request, klass, oid, key): """ Toggle any object property, given its a boolean on the DB side. """ # # TODO: push notifications on error to the user. # try: obj = get_object_or_404(globals()[klass], id=oid) except: LOGGER.exception(u'Oops in toggle! Model “%s” not imported?', klass) return HttpResponseTemporaryServerError() if obj.user != request.user: return HttpResponseForbidden(u'Not owner') try: new_value = not getattr(obj, key) setattr(obj, key, new_value) except: msg = (u'Unable to toggle %s of %s', key, obj) LOGGER.exception(*msg) return HttpResponseTemporaryServerError(msg[0] % msg[1:]) else: if key.startswith('is_'): date_attr = 'date_' + key[3:] if hasattr(obj, date_attr): # LOGGER.info(u'%s %s: set %s to NOW.', # obj._meta.verbose_name, obj.id, date_attr) setattr(obj, date_attr, now() if new_value else None) try: getattr(obj, key + '_changed')() except AttributeError: pass except: LOGGER.exception(u'Unhandled exception while running ' u', %s.%s_changed() on %s.', obj.__class__.__name__, key, obj) obj.save() if request.is_ajax(): return HttpResponse(u'DONE.') else: return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('home')))
def reopen(self, message=None, verbose=True, commit=True):
    """ Reopen the feed, clearing errors, date closed, etc.

    :param message: optional custom log message; a default one is
        built when ``None``.
    :param verbose: log the reopening when ``True`` (default).
    :param commit: call ``self.save()`` when ``True`` (default).
    """

    self.errors = []
    self.is_active = True

    # NOTE(review): ``date_closed`` is set to now() even though the feed
    # is being *re-opened* — one would expect ``None`` here (other code
    # checks ``date_closed is None``). Confirm this is intentional.
    self.date_closed = now()
    self.closed_reason = u'Reopen on %s' % now().isoformat()

    if commit:
        self.save()

    statsd.gauge('feeds.counts.open', 1, delta=True)

    if verbose:
        if message is None:
            LOGGER.info(u'%s %s: %sre-opened.', self._meta.verbose_name,
                        self.id, u'' if commit else u'temporarily ')

        else:
            LOGGER.info(u'%s %s: %s', self._meta.verbose_name,
                        self.id, message)
def close(self, reason=None, commit=True):
    """ Close the feed, recording an optional reason.

    :param reason: free-text explanation; a default placeholder is
        stored when ``None``.
    :param commit: call ``self.save()`` when ``True`` (default).
    """

    self.is_active = False
    self.date_closed = now()

    if reason:
        self.closed_reason = reason
    else:
        self.closed_reason = _(u'NO REASON GIVEN')

    if commit:
        self.save()

    statsd.gauge('feeds.counts.open', -1, delta=True)

    LOGGER.warning(u'%s %s: closed with reason “%s”.',
                   self._meta.verbose_name,
                   self.id, self.closed_reason)
def mark_unusable(self, message, args=(), exc=None, commit=True):
    """ Mark the account unusable, recording date & message.

    If ``exc`` is given, it is appended to the message and the whole
    thing is logged via ``LOGGER.exception()``.
    """

    if exc is not None:
        # Interpolate lazily-passed arguments before appending
        # the exception text.
        if args:
            message = message % args

        message = u'{0} ({1})'.format(message, unicode(exc))

        LOGGER.exception(u'%s unusable: %s', self, message)

    self.date_last_conn = now()
    self.conn_error = message
    self.is_usable = False

    if commit:
        self.save()
def mark_unusable(self, message, args=(), exc=None, commit=True):
    """ Mark the account unusable, recording date & message.

    If ``exc`` is given, it is appended to the message and the whole
    thing is logged via ``LOGGER.exception()``.
    """

    if exc is not None:
        # Interpolate lazily-passed arguments before appending
        # the exception text.
        if args:
            message = message % args

        message = u'{0} ({1})'.format(message, unicode(exc))

        LOGGER.exception(u'%s unusable: %s', self, message)

    self.date_last_conn = now()
    self.conn_error = message
    self.is_usable = False

    if commit:
        self.save()
def run(self):
    """ Run the import, recording status transitions.

    On failure: a first failure schedules an automatic retry via the
    ``userimport_run_task`` celery task; a failure while already
    retrying marks the import FAILED and notifies the user.
    """

    #
    # NOTE: we don't care if the import was already running, finished,
    #       whatever. This class is able to recover and re-run itself
    #       over and over without doing bad thing in the database.
    #

    is_retrying = self.status == IMPORT_STATUS.RETRY

    self.status = IMPORT_STATUS.RUNNING
    self.date_started = now()
    self.save()

    try:
        return self.run_internal()

    # BUGFIX: was a bare ``except:``; narrowed to Exception so
    # KeyboardInterrupt/SystemExit still propagate.
    except Exception:
        # BUGFIX: the format string had a ``%s`` placeholder but no
        # argument was passed; supply the import ID.
        LOGGER.exception(u'User import %s failed', self.id)

        if is_retrying:
            message_user(self.user,
                         _(u'Your import #{0} failed to run after a '
                           u'retry. Please review it before relaunching '
                           u'it manually again.').format(self.id),
                         constants.ERROR)

            self.status = IMPORT_STATUS.FAILED

        else:
            countdown = randrange(1800, 3600)
            delta_cd = naturaldelta(timedelta(seconds=countdown))

            # BUGFIX: “If will be” → “It will be” in the user message.
            message_user(self.user,
                         _(u'Your import #{0} failed to run. It will '
                           u'be automatically retried in {1}').format(
                             self.id, delta_cd),
                         constants.WARNING)

            # HEADS UP: this task name is registered later
            # by the register_task_method() call.
            globals()['userimport_run_task'].apply_async(
                (self.id, ), countdown=countdown)

            self.status = IMPORT_STATUS.RETRY

    # Only reached on the failure path; success returns above.
    self.save()
def run_internal(self):
    """ Import dirty work.

    Tries each specialized importer first; if none handles the whole
    input (“all in one”), falls back to treating ``self.urls`` as a
    list of URLs to validate and import one by one. Results and final
    status are stored on the instance.
    """

    # Scratch attributes shared with the validate/import helpers.
    self._import_validator_ = URLValidator()
    self._import_to_create_ = set()
    self._import_created_ = {'feeds': [], 'articles': []}
    self._import_failed_ = []

    all_in_one = False

    for importer in self.importers:
        if importer():
            all_in_one = True
            break

    if not all_in_one:
        urls = self.urls.splitlines()

        for url in urls:
            # Validation populates ``self._import_to_create_``.
            self.validate_url(url)

    for url in self._import_to_create_:
        self.import_from_one_url(url)

    self.results = {
        'created': self._import_created_,
        'failed': self._import_failed_,
    }

    # FINISHED wins if anything was created, even with partial failures.
    if self._import_created_['articles'] or self._import_created_['feeds']:
        self.status = IMPORT_STATUS.FINISHED

    elif self._import_failed_:
        self.status = IMPORT_STATUS.FAILED

    self.date_finished = now()
    self.save()
def repair_missing_authors_migration_201411(cls):
    """ One-shot repair: re-run post-processing on author-less articles. """

    # from oneflow.core.tasks.migration import vacuum_analyze

    articles = Article.objects.filter(
        authors=None,
        date_created__gt=datetime(2014, 10, 31))

    count = articles.count()
    done = 0

    LOGGER.info(u'Starting repairing %s missing authors @%s', count, now())

    with benchmark(u'Fix missing authors on rel-DB fetched content…'):
        for article in articles:
            article.postprocess_original_data(force=True)

            # if done % 25000 == 0:
            #     vacuum_analyze()

            done += 1
def create_poke(cls, sender, recipients, message=None, attachments=None):
    """ Create a poke in all required feeds, with all required links. """

    if message is None and not bool(attachments):
        raise RuntimeError(u'Poke message and attachments cannot be '
                           u'both empty. You have to send something, '
                           u'we are not on fesse-bouc!')

    # BaseItem must have a name.
    poke_name = uuid.uuid4().hex

    poke = cls(
        name=poke_name,
        slug=poke_name,
        user=sender,
        message=message,
        is_restricted=True,
        date_published=now(),
        origin=ORIGINS.INTERNAL,
    )

    # This HAS to succeed, thus no try/except.
    poke.save()

    attachments_to_add = [] if attachments is None else attachments
    poke.attachments.add(*attachments_to_add)

    poke.set_sender(sender)

    LOGGER.info(u'Created poke %s from %s.', poke.id, sender)

    poke.send_to_recipients(set(recipients))

    return poke, True
def create_poke(cls, sender, recipients, message=None, attachments=None):
    """ Create a poke in all required feeds, with all required links. """

    if message is None and not bool(attachments):
        raise RuntimeError(u'Poke message and attachments cannot be '
                           u'both empty. You have to send something, '
                           u'we are not on fesse-bouc!')

    # BaseItem must have a name.
    poke_name = uuid.uuid4().hex

    poke = cls(
        name=poke_name,
        slug=poke_name,
        user=sender,
        message=message,
        is_restricted=True,
        date_published=now(),
        origin=ORIGINS.INTERNAL,
    )

    # This HAS to succeed, thus no try/except.
    poke.save()

    attachments_to_add = [] if attachments is None else attachments
    poke.attachments.add(*attachments_to_add)

    poke.set_sender(sender)

    LOGGER.info(u'Created poke %s from %s.', poke.id, sender)

    poke.send_to_recipients(set(recipients))

    return poke, True
def global_feeds_checker():
    """ Check all RSS feeds and their dependants. Close them if needed.

    No parameter.
    """

    def pretty_print_feed(feed):
        # Human-readable multi-line description of one feed,
        # used in the reminder e-mail sent to managers.

        return (u'- %s,\n'
                u' - admin url: http://%s%s\n'
                u' - public url: %s\n'
                u' - %s\n'
                u' - reason: %s\n'
                u' - last error: %s') % (
            feed,

            settings.SITE_DOMAIN,

            reverse('admin:%s_%s_change' % (
                feed._meta.app_label,
                feed._meta.module_name),
                args=[feed.id]),

            # Only RSS/Atom feeds have an URL…
            feed.url if hasattr(feed, 'url') else '(NO URL)',

            (u'closed on %s' % feed.date_closed)
            if feed.date_closed else u'(no closing date)',

            feed.closed_reason
            or u'none (or manually closed from the admin interface)',

            feed.errors[0] if len(feed.errors) else u'(no error recorded)')

    def pretty_print_feed_list(feed_list):
        # Join the individual feed descriptions with blank lines.

        return '\n\n'.join(
            pretty_print_feed(feed) for feed in feed_list
        )

    dtnow = now()
    limit_days = config.FEED_CLOSED_WARN_LIMIT
    closed_limit = dtnow - timedelta(days=limit_days)

    closed_tested = 0
    reopened_list = []

    # ———————————————————————————————— See if old closed feeds can be reopened.

    old_closed_feeds = BaseFeed.objects.filter(is_active=False).filter(
        date_closed__lt=closed_limit)

    for feed in old_closed_feeds:
        # check all closed feeds monthly, on their closing date anniversary.
        if feed.date_closed.day == dtnow.day:
            if feed.check_old_closed():
                reopened_list.append(feed)
            closed_tested += 1

    # ——————————————————————————————————————————— Report recently closed feeds.

    recently_closed_feeds = BaseFeed.objects.filter(is_active=False).filter(
        Q(date_closed=None) | Q(date_closed__gte=closed_limit))

    if not recently_closed_feeds.exists():
        LOGGER.info('No feed was closed in the last %s days, %s already '
                    u'closed checked for eventual back-to-life, of which '
                    u'%s were reopened.', limit_days,
                    closed_tested, len(reopened_list))
        return

    count = recently_closed_feeds.count()

    mail_managers(_(u'Reminder: {0} feed(s) closed in last '
                  u'{1} day(s), {2} automatically reopened').format(
                      count, limit_days, len(reopened_list)),
                  FEED_CHECK_TEMPLATE_TXT.format(
                      feed_list=pretty_print_feed_list(recently_closed_feeds),
                      closed_tested=closed_tested,
                      reopened_count=len(reopened_list),
                      reopened_list=pretty_print_feed_list(reopened_list)),
                  )

    start_time = pytime.time()

    # Close the feeds, but after sending the mail,
    # So that initial reason is displayed at least
    # once to a real human.
    for feed in recently_closed_feeds:
        if feed.date_closed is None:
            feed.close('Automatic close by periodic checker task')

    LOGGER.info('Closed %s feeds in %s.', count,
                naturaldelta(pytime.time() - start_time))
def go(limit=None, all_hint=None):
    """ Do the dirty things, fast.

    Raw-SQL cleanup: reset empty-string ``url_error`` / ``content_error``
    columns of ``core_article`` to NULL, ``limit`` rows per round.

    :param limit: batch size per UPDATE round; defaults to 10000.
    :param all_hint: estimated total row count, used to decide when to
        start issuing COUNT queries; defaults to 7000000.
    """

    TO_CLEAN = ('url', 'content', )

    # Batched UPDATE: NULL-out empty error fields, {1} rows at a time.
    URL_CLEANING_QUERY = """
    UPDATE core_article SET {0}_error = NULL
    WHERE core_article.baseitem_ptr_id IN (
        SELECT baseitem_ptr_id FROM core_article
        WHERE {0}_error = '' LIMIT {1}
    );
    """

    COUNT_QUERY = """
    SELECT COUNT(*) FROM core_article WHERE {0}_error = '';
    """

    def one_line(a_string):
        # Collapse the multi-line SQL to one space-normalized line.
        return re.sub(u' +', u' ', u' '.join(a_string.splitlines()))

    if limit is None:
        limit = 10000

    if all_hint is None:
        all_hint = 7000000

    LOGGER.info(u'Starting to fix the world @ %s', now())

    with benchmark(u'Fix everything'):
        for to_clean in TO_CLEAN:
            done = 0

            with benchmark(u'Fixing %s' % to_clean):
                while True:
                    do_whatever_SQL(
                        one_line(URL_CLEANING_QUERY).format(
                            to_clean, limit
                        ),
                        [],
                        u'Fixing %s, round %s' % (to_clean, done)
                    )

                    done += 1

                    # if done % 10 == 0:
                    #     vacuum_analyze('at %s' % (done * 50000))

                    # Once we have plausibly covered the whole table,
                    # start counting rows to know when to stop.
                    if done > (all_hint / limit):
                        count = do_whatever_SQL(
                            one_line(COUNT_QUERY).format(to_clean),
                            [],
                            u'Counting things',
                            commit=False
                        )

                        if count == 0:
                            break

                    # Be gentle with the database between rounds.
                    time.sleep(20)
def check_reads(self, items=None, extended_check=False, force=False,
                commit=True):
    """ Also available as a task for background execution.

    Ensure a Read exists for every (good) item of the subscribed feed.
    Items older than the configured unread window are connected as
    already-read (auto-read); newer ones as unread.

    :param items: optional queryset restricting the check; defaults to
        ``self.feed.good_items``.
    :param extended_check: also re-activate reads of existing items.
    :param force: not used in this body — presumably consumed by the
        task wrapper; confirm.
    :param commit: not used in this body — presumably consumed by the
        task wrapper; confirm.
    """

    # Items published before this midnight-aligned date are auto-read.
    in_the_past = combine(
        today() - timedelta(days=config.SUBSCRIPTIONS_ITEMS_UNREAD_DAYS),
        time(0, 0, 0))

    my_now = now()
    counters = CheckReadsCounter()

    def create_read_for_item(item, params):
        # Create (or recheck) one read, updating counters in place.

        _, created = self.create_read(item, verbose=False, **params)

        if created:
            counters.missing += 1

            if params.get('is_read', False):
                counters.reads += 1
            else:
                counters.unreads += 1

        elif created is False:
            # ``False`` means the read already existed.
            counters.rechecked += 1

            if extended_check:
                try:
                    item.activate_reads()
                except:
                    LOGGER.exception(
                        u'Problem while activating reads '
                        u'of item #%s in Subscription '
                        u'#%s.check_reads(), continuing '
                        u'check.', item.id, self.id)

        else:
            # ``None`` means the read creation failed.
            counters.failed += 1

    # ——————————————————————————————————————————————— First, check articles
    # We can order them by date and connect reads in the same order.

    if items is None:
        on_items = self.feed.good_items.article().order_by(
            'Article___date_published')
    else:
        on_items = items.article().order_by('Article___date_published')

    for item in on_items.filter(Article___date_published__lt=in_the_past):
        # We reconnect the user to the whole feed history, but marking
        # old articles auto read, else there could be too much to read.
        create_read_for_item(
            item, {
                'is_read': True,
                'is_auto_read': True,
                'date_read': my_now,
                'date_auto_read': my_now,
            })

    for item in on_items.filter(
            Q(Article___date_published__gte=in_the_past)
            | Q(Article___date_published=None)):
        # default parameters, reads will be unread.
        create_read_for_item(item, {})

    # ——————————————————————————————————————————————————— Then, other items
    # Do the same, but based on the date_created

    if items is None:
        on_items = self.feed.good_items.not_instance_of(Article)
    else:
        on_items = items.not_instance_of(Article)

    for item in on_items.filter(date_updated__lt=in_the_past):
        # We reconnect the user to the whole feed history, but marking
        # old items auto read, else there could be too much to read.
        create_read_for_item(
            item, {
                'is_read': True,
                'is_auto_read': True,
                'date_read': my_now,
                'date_auto_read': my_now,
            })

    for item in on_items.filter(date_updated__gte=in_the_past):
        # default parameters, reads will be unread.
        create_read_for_item(item, {})

    # NOTE(review): this extra pass re-visits *every* non-article item
    # after the two filtered loops above — it looks redundant (items get
    # re-checked and counted again via ``counters.rechecked``). Confirm
    # it is intentional before removing.
    for item in on_items:
        create_read_for_item(item, {})

    # —————————————————————————————————————————————————— Update descriptors

    if counters.missing or counters.rechecked:
        #
        # TODO: don't recompute everything, just
        #       add or subscribe the changed counts.
        #
        self.compute_cached_descriptors(all=True, unread=True)

        for folder in self.folders.all():
            folder.compute_cached_descriptors(all=True, unread=True)

    LOGGER.info(
        u'Checked subscription #%s. '
        u'%s/%s non-existing/re-checked, '
        u'%s/%s read/unread and %s not created.',
        self.id,
        counters.missing, counters.rechecked,
        counters.reads, counters.unreads,
        counters.failed)

    return counters
def refresh_all_mongo_feeds(limit=None, force=False):
    u""" Refresh all MongoEngine feeds (RSS).

    .. note:: this task should vanish when MongoDB → PostgreSQL
        migration is done.

    :param limit: maximum number of feeds to examine (falsy = all).
    :param force: bypass the global lock and per-feed freshness checks.
    """

    if config.FEED_FETCH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Feed refresh disabled in configuration.')
        return

    # Be sure two refresh operations don't overlap, but don't hold the
    # lock too long if something goes wrong. In production conditions
    # as of 20130812, refreshing all feeds takes only a moment:
    # [2013-08-12 09:07:02,028: INFO/MainProcess] Task
    #   oneflow.core.tasks.refresh_all_mongo_feeds succeeded in 1.99886608124s.
    my_lock = RedisExpiringLock(
        'refresh_all_mongo_feeds',
        expire_time=config.FEED_GLOBAL_REFRESH_INTERVAL * 180 - 1
    )

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing all feed refresh…'))

        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'refresh_all_mongo_feeds() is already '
                           u'locked, aborting.')
            return

    feeds = MongoFeed.objects.filter(closed__ne=True, is_internal__ne=True)

    if limit:
        feeds = feeds.limit(limit)

    # No need for caching and cluttering CPU/memory for a one-shot thing.
    feeds.no_cache()

    with benchmark('refresh_all_mongo_feeds()'):
        try:
            count = 0
            mynow = now()

            for feed in feeds:
                if feed.refresh_lock.is_locked():
                    LOGGER.info(u'Feed %s already locked, skipped.', feed)
                    continue

                interval = timedelta(seconds=feed.fetch_interval)

                if feed.last_fetch is None:
                    # Never-refreshed feeds are fetched right away.
                    mongo_feed_refresh_task.delay(feed.id)

                    LOGGER.info(u'Launched immediate refresh of feed %s '
                                u'which has never been refreshed.', feed)

                elif force or feed.last_fetch + interval < mynow:
                    how_late = feed.last_fetch + interval - mynow
                    how_late = how_late.days * 86400 + how_late.seconds

                    # NOTE(review): countdown is hard-coded to 0, so the
                    # “Scheduled randomized” branch of the log message
                    # below is dead — confirm whether randomization was
                    # meant to be re-enabled.
                    countdown = 0
                    mongo_feed_refresh_task.delay(feed.id, force)

                    LOGGER.info(u'%s refresh of feed %s %s (%s late).',
                                u'Scheduled randomized'
                                if countdown else u'Launched',
                                feed,
                                u' in {0}'.format(naturaldelta(countdown))
                                if countdown else u'in the background',
                                naturaldelta(how_late))
                    count += 1

        finally:
            # HEADS UP: see core.tasks.refresh_all_feeds() for note.
            # my_lock.release()
            pass

    LOGGER.info(u'Launched %s refreshes out of %s feed(s) checked.',
                count, feeds.count())
def global_duplicates_checker(limit=None, force=False):
    """ Check that duplicate articles have no more Reads anywhere.

    Fix it if not, and update all counters accordingly. Duplicates
    whose replacement is finished and which are older than the
    configured purge delay are deleted from the database.

    :param limit: integer, the maximum number of duplicates to check.
        Default: none (taken from configuration).
    :param force: boolean, default ``False``, allows to by bypass and
        reacquire the lock.
    """

    if config.CHECK_DUPLICATES_DISABLED:
        LOGGER.warning(u'Duplicates check disabled in configuration.')
        return

    # This task runs one a day. Acquire the lock for just a
    # little more time to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_duplicates', expire_time=3600 * 25)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing duplicates check…')
        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_subscriptions_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_DUPLICATES_LIMIT

    start_time = pytime.time()
    duplicates = BaseItem.objects.duplicate()

    total_dupes_count = duplicates.count()
    total_reads_count = 0
    processed_dupes = 0
    done_dupes_count = 0
    purged_dupes_count = 0

    # Clamp the purge delay into a sane [1 week, 1 year] window.
    purge_after_weeks_count = max(1, config.CHECK_DUPLICATES_PURGE_AFTER_WEEKS)
    purge_after_weeks_count = min(52, purge_after_weeks_count)
    purge_before_date = now() - timedelta(days=purge_after_weeks_count * 7)

    LOGGER.info(
        u'Done counting (took %s of pure SQL joy), starting procedure.',
        naturaldelta(pytime.time() - start_time))

    with benchmark(u"Check {0}/{1} duplicates".format(limit or u'all',
                                                      total_dupes_count)):
        try:
            for duplicate in duplicates.iterator():
                reads = duplicate.reads.all()

                processed_dupes += 1

                if reads.exists():
                    done_dupes_count += 1
                    reads_count = reads.count()
                    total_reads_count += reads_count

                    LOGGER.info(
                        u'Duplicate %s #%s still has %s reads, fixing…',
                        duplicate._meta.model.__name__,
                        duplicate.id, reads_count)

                    duplicate.duplicate_of.register_duplicate(
                        duplicate,
                        force=duplicate.duplicate_status ==
                        DUPLICATE_STATUS.FINISHED)

                if duplicate.duplicate_status == DUPLICATE_STATUS.FINISHED:
                    #
                    # TODO: check we didn't get some race-conditions new
                    #       dependancies between the moment the duplicate
                    #       was marked duplicate and now.

                    if duplicate.date_created < purge_before_date:
                        try:
                            with transaction.atomic():
                                duplicate.delete()

                        # HEADS UP: not a bare “except:”, which would
                        # also swallow KeyboardInterrupt / SystemExit.
                        except Exception:
                            LOGGER.exception(
                                u'Exception while deleting '
                                u'duplicate %s #%s',
                                duplicate._meta.model.__name__,
                                duplicate.id)

                        else:
                            # Count / announce the purge only when the
                            # deletion really succeeded.
                            purged_dupes_count += 1
                            LOGGER.info(u'Purged duplicate %s #%s from database.',
                                        duplicate._meta.model.__name__,
                                        duplicate.id)

                elif duplicate.duplicate_status in (
                        DUPLICATE_STATUS.NOT_REPLACED,
                        DUPLICATE_STATUS.FAILED):
                    # Something went wrong, perhaps the
                    # task was purged before beiing run.
                    duplicate.duplicate_of.register_duplicate(duplicate)
                    done_dupes_count += 1

                elif duplicate.duplicate_status is None:
                    # Something went very wrong. If the article is a known
                    # duplicate, its status field should have been set to
                    # at least NOT_REPLACED.
                    duplicate.duplicate_of.register_duplicate(duplicate)
                    done_dupes_count += 1

                    LOGGER.error(
                        u'Corrected duplicate %s #%s found with no '
                        u'status.', duplicate._meta.model.__name__,
                        duplicate.id)

                if limit and processed_dupes >= limit:
                    break

        finally:
            my_lock.release()

    LOGGER.info(
        u'global_duplicates_checker(): %s/%s duplicates processed '
        u'(%.2f%%; limit: %s), %s corrected (%.2f%%), '
        u'%s purged (%.2f%%); %s reads altered.',

        processed_dupes, total_dupes_count,
        # `or 1` guards the division when there is no duplicate at all.
        processed_dupes * 100.0 / (total_dupes_count or 1),

        limit or u'none',

        done_dupes_count,
        (done_dupes_count * 100.0 / processed_dupes)
        if processed_dupes else 0.0,

        purged_dupes_count,
        (purged_dupes_count * 100.0 / processed_dupes)
        if processed_dupes else 0.0,

        total_reads_count)
def global_feeds_checker():
    """ Check all RSS feeds and their dependants. Close them if needed.

    Scheduled task. Reopens old closed feeds on their monthly closing
    anniversary when they come back to life, then reports recently
    closed feeds to the managers by mail, and finally stamps a closing
    date/reason on feeds that had none.

    No parameter.
    """

    def pretty_print_feed(feed):
        # One human-readable report entry for the managers' mail.
        return (u'- %s,\n'
                u' - admin url: http://%s%s\n'
                u' - public url: %s\n'
                u' - %s\n'
                u' - reason: %s\n'
                u' - last error: %s') % (
            feed,

            settings.SITE_DOMAIN,

            reverse('admin:%s_%s_change' % (feed._meta.app_label,
                    feed._meta.module_name),
                    args=[feed.id]),

            # Only RSS/Atom feeds have an URL…
            feed.url if hasattr(feed, 'url') else '(NO URL)',

            (u'closed on %s' % feed.date_closed)
            if feed.date_closed else u'(no closing date)',

            feed.closed_reason
            or u'none (or manually closed from the admin interface)',

            feed.errors[0] if len(feed.errors) else u'(no error recorded)')

    def pretty_print_feed_list(feed_list):
        # Join individual reports with a blank line between them.
        return '\n\n'.join(pretty_print_feed(feed) for feed in feed_list)

    dtnow = now()
    limit_days = config.FEED_CLOSED_WARN_LIMIT
    closed_limit = dtnow - timedelta(days=limit_days)
    closed_tested = 0
    reopened_list = []

    # ———————————————————————————————— See if old closed feeds can be reopened.

    old_closed_feeds = BaseFeed.objects.filter(is_active=False).filter(
        date_closed__lt=closed_limit)

    for feed in old_closed_feeds:
        # check all closed feeds monthly, on their closing date anniversary.
        if feed.date_closed.day == dtnow.day:
            if feed.check_old_closed():
                reopened_list.append(feed)
            closed_tested += 1

    # ——————————————————————————————————————————— Report recently closed feeds.

    recently_closed_feeds = BaseFeed.objects.filter(is_active=False).filter(
        Q(date_closed=None) | Q(date_closed__gte=closed_limit))

    if not recently_closed_feeds.exists():
        LOGGER.info('No feed was closed in the last %s days, %s already '
                    u'closed checked for eventual back-to-life, of which '
                    u'%s were reopened.', limit_days,
                    closed_tested, len(reopened_list))
        return

    count = recently_closed_feeds.count()

    mail_managers(_(u'Reminder: {0} feed(s) closed in last '
                  u'{1} day(s), {2} automatically reopened').format(
                      count, limit_days, len(reopened_list)),
                  FEED_CHECK_TEMPLATE_TXT.format(
                      feed_list=pretty_print_feed_list(recently_closed_feeds),
                      closed_tested=closed_tested,
                      reopened_count=len(reopened_list),
                      reopened_list=pretty_print_feed_list(reopened_list)),
                  )

    start_time = pytime.time()

    # Close the feeds, but after sending the mail,
    # So that initial reason is displayed at least
    # once to a real human.
    for feed in recently_closed_feeds:
        if feed.date_closed is None:
            feed.close('Automatic close by periodic checker task')

    # NOTE(review): `count` includes feeds that already had a closing
    # date, so this log line can over-report the number actually closed
    # by the loop above — confirm if that is intended.
    LOGGER.info('Closed %s feeds in %s.', count,
                naturaldelta(pytime.time() - start_time))
def feed_distribution_by_last_fetch(): """ compute and group feeds by last_fetch delta from now. """ start_time = pytime.time() # open_feeds = Feed.objects(Q(closed=False) | Q(closed__exists=False)) open_feeds_count = Feed.objects.filter(closed__ne=True).count() lower_value = None loop_count = 0 fetched_feeds = 0 delta_lengths = ( timedelta(seconds=config.FEED_FETCH_DEFAULT_INTERVAL / 6), timedelta(seconds=config.FEED_FETCH_DEFAULT_INTERVAL / 2), timedelta(seconds=config.FEED_FETCH_DEFAULT_INTERVAL), timedelta(seconds=config.FEED_FETCH_DEFAULT_INTERVAL * 2), timedelta(seconds=config.FEED_FETCH_DEFAULT_INTERVAL * 6), timedelta(seconds=config.FEED_FETCH_DEFAULT_INTERVAL * 12), timedelta(days=1), timedelta(days=2), timedelta(days=3), timedelta(days=4), timedelta(days=5), timedelta(days=6), timedelta(days=7), timedelta(days=10), None ) results = {} for delta in delta_lengths: upper_value = (now() - delta) if delta else None if lower_value is None: kwargs = {'last_fetch__gt': upper_value} elif upper_value is None: kwargs = {'last_fetch__lte': lower_value} else: kwargs = {'last_fetch__lte': lower_value, 'last_fetch__gt': upper_value} feeds = Feed.objects(**kwargs) count = feeds.count() percent = float(count * 100.0 / open_feeds_count) avg_fi = sum(f.fetch_interval for f in feeds) * 1.0 / (count or 1.0) results[loop_count] = [ feeds, count, percent, lower_value, upper_value, avg_fi, ] fetched_feeds += count lower_value = upper_value loop_count += 1 results['meta'] = {'fetched_feeds': fetched_feeds, 'open_feeds_count': open_feeds_count, 'duration': pytime.time() - start_time, 'loop_count': loop_count} return results
def recently_usable(self): """ Return True if the account has been tested/connected recently. """ return self.is_usable and ( now() - self.date_last_conn < timedelta(seconds=self.config_account_refresh_period))
def __consume_items(self, api_path, parameters=None, backfilling=False):
    """ Consume tweets from a stream (public/user).

    This is an internal method, called from :meth:`consume`.

    :param api_path: the TwitterAPI endpoint to request.
    :param parameters: optional dict of request parameters; mutated in
        place across loop iterations (``since_id`` / ``max_id``) to
        page through results.
    :param backfilling: boolean; when ``True``, we walk backwards in
        history and never touch the refresh lock nor the last-fetch
        date.
    """

    def format_quota(quota):
        # Render a TwitterAPI quota dict as a log-friendly suffix.
        # NOTE(review): not referenced in the visible part of this
        # method — presumably used for final quota reporting; confirm.
        if quota['remaining'] is None:
            return u' (no quota information)'

        if quota['remaining']:
            return u'; quota: %s call(s) remaining' % quota['remaining']

        else:
            return u'; quota exhausted, reset in %s' % (
                naturaldelta(now() - quota['reset'].replace(tzinfo=utc))
            )

    def backfill_if_needed(old_latest, max_id):
        """ See if we need to backfill, or not. """

        # If we already have a latest_id, it means we connected
        # before. Thus, we check if a backfill is needed between
        # previous session and now. If latest recorded and current
        # are different, we could eventually have missed something.
        # → BACKFILL.
        #
        # If we don't have a latest_id, it's our first
        # connection ever. Backfilling is for history, and
        # has already been launched by consume(), to not
        # wait for an hypothetical first item in low-trafic
        # streams. → NO ACTION

        if old_latest and old_latest < max_id:
            globals()[
                'twitterfeed_backfill_task'
            ].apply_async(
                args=(self.id, ),
                kwargs={
                    'since_id': old_latest,
                    'max_id': max_id - 1,
                }
            )

    LOGGER.info(u'%s: starting consume() %sloop on %s(%s)',
                self,
                u'for backfilling ' if backfilling else u'',
                api_path,
                u'' if parameters is None
                else ', '.join(u'{0}: {1}'.format(k, v)
                               for k, v in parameters.items()))

    # We create it here to have it in scope to get quota at the end.
    result = None

    if parameters is None:
        parameters = {}

    exit_loop = False

    # Oldest point in time we are allowed to rewind to when
    # backfilling, expressed in weeks via configuration.
    max_rewind_range = config.TWITTER_BACKFILL_ALLOWED_REWIND_RANGE
    max_rewind_range_as_dt_from_now = (
        now() - timedelta(days=max_rewind_range * 7))

    infinite_count = 0
    all_processed = 0
    cur_processed = 0
    old_latest = self.latest_id
    last_item = None

    # Pick a random usable account: the feed's own, else one of the
    # owner's Twitter accounts.
    if self.account.exists():
        twitter_account = self.account.order_by('?').first()

    else:
        twitter_account = self.user.accounts.twitter().order_by('?').first()

    if twitter_account is None:
        self.close(u'No more account to run this Twitter feed!')
        return

    LOGGER.info(u'%s: consuming via account %s.', self, twitter_account)

    if not backfilling:
        self.update_last_fetch()
        self.save()

    with twitter_account as tweetapi:
        while True:
            LOGGER.debug(u'%s: %s (loop #%s)…', self,
                         u'backfilling' if backfilling else u'consuming',
                         infinite_count)

            infinite_count += 1

            try:
                # Silence the (noisy) underlying HTTP machinery only
                # for the duration of the API call itself.
                logging.disable(logging.CRITICAL)

                try:
                    if parameters:
                        result = tweetapi.request(api_path, parameters)

                    else:
                        result = tweetapi.request(api_path)

                finally:
                    logging.disable(logging.NOTSET)

                if result.get_rest_quota()['remaining'] == 0:
                    LOGGER.error(u'%s: quota exhausted, exiting to '
                                 u'postpone processing.', self)
                    break

                for item in result.get_iterator():
                    processed, exit_loop = self.__handle_one_item(
                        item, backfilling=backfilling)

                    if processed:
                        if cur_processed == 0 and not backfilling:
                            # At the first received item while streaming,
                            # we need to check if backfill is needed.
                            backfill_if_needed(old_latest, item['id'])

                        cur_processed += 1

                    if backfilling:
                        # Backfilling doesn't touch the lock.
                        continue

                    if config.FEED_FETCH_TWITTER_DISABLED:
                        LOGGER.warning(u'%s: exiting because '
                                       u'config.FEED_FETCH_TWITTER_DISABLED is '
                                       u'now true.', self)
                        exit_loop = True

                    last_item = item

                    if exit_loop:
                        break

                if backfilling and max_rewind_range:
                    # Stop when we have rewound past the allowed range.
                    if last_item \
                        and twitter_datetime(last_item['created_at']) \
                            < max_rewind_range_as_dt_from_now:
                        self.backfill_completed = max_rewind_range
                        self.save()

                        LOGGER.info(u'%s: backfilled to the maximum '
                                    u'allowed.', self)
                        break

                if cur_processed == 0:
                    if backfilling:
                        # Twitter did not send us any new data while
                        # were backfilling for full history. We won't
                        # get any data further in the past, we just
                        # hit the 800/3200 limit.
                        if parameters.get('since_id', None) is None:
                            LOGGER.info(u'%s: reached end of available '
                                        u'data on the Twitter side.', self)
                            self.backfill_completed = 0
                            self.save()

                    else:
                        # We got out of the loop without getting any new
                        # item. Just bail out, else we will keep polling
                        # Twitter again and again, exhausting our REST
                        # API quota, hitting duplicates in our database.
                        LOGGER.info(u'%s: no new item in stream.', self)

                    break

                else:
                    # We got out of the loop, with max items (200)
                    # reached, or at least more than 0. Try to
                    # {fore,back}fill again to fill the gap. If we
                    # were already at max items, we'll got 0 and
                    # then stop. Else, the process will continue.
                    # Only if we were already at end will it cost
                    # us an API call for nothing.
                    if backfilling:
                        parameters['max_id'] = self.oldest_id - 1

                    else:
                        parameters['since_id'] = self.latest_id

                if not backfilling:
                    self.update_last_fetch()
                    self.save()

                    # TODO: this creates a race condition. we should
                    # just re-aquire the lock with celery current task
                    # ID, but it's not available in the current scope
                    # (we are a method in the task, not the task
                    # itself)…
                    self.refresh_lock.release()

                    if not self.refresh_lock.acquire():
                        LOGGER.critical(u'%s: could not re-acquire '
                                        u'our own lock, abruptly '
                                        u'terminating stream '
                                        u'consumption.', self)
                        exit_loop = True

            except KeyboardInterrupt:
                LOGGER.warning(u'Interrupting stream consumption '
                               u'at user request.')
                break

            except SoftTimeLimitExceeded:
                # This should happen only on streaming APIs.
                LOGGER.info(u'%s: time limit reached, terminating '
                            u'to let things flow.', self)

                if self.can_continue_consuming():
                    if not backfilling:
                        # update last fetch date for global refresh task
                        # not to relaunch us again while we already do it.
                        self.update_last_fetch()
                        self.save()

                    # relaunch immediately if a worker is available,
                    # to not loose any tweet in case of a prolix feed.
                    globals()['twitterfeed_consume_task'].delay(self.id)

                else:
                    LOGGER.warning(u'%s: not active anymore, exiting.', self)

                break

            except TwitterRequestError, e:
                # https://dev.twitter.com/overview/api/response-codes
                if e.status_code in (420, 429):
                    LOGGER.error(u'%s: API rate exceeded (%s) while %s, '
                                 u'exiting loop to throttle down.',
                                 self, e.status_code,
                                 u'backfilling'
                                 if backfilling else u'consuming')
                    statsd.incr('api.twitter.messages.rate_exceeded')

                else:
                    LOGGER.error(u'%s: Twitter error %s while %s %s, '
                                 u'exiting.', self, unicode(e),
                                 u'backfilling'
                                 if backfilling else u'consuming',
                                 api_path)

                exit_loop = True

            except Exception:
                #
                # TODO: handle network errors, set last_fetch,
                #       last TID, and exit for a while if relevant.
                #       Else, just continue and let the stream flow.
                #
                LOGGER.exception(u'%s: exception in loop #%s after '
                                 u'having consumed %s item(s), '
                                 u're-starting…', self,
                                 infinite_count, cur_processed)
                statsd.incr('api.twitter.items.exception')

            # Per-loop bookkeeping, whatever happened above.
            all_processed += cur_processed
            cur_processed = 0

            if exit_loop:
                break
def recently_usable(self): """ Return True if the account has been tested/connected recently. """ return self.is_usable and (now() - self.date_last_conn < timedelta( seconds=self.config_account_refresh_period))
def archive_articles(limit=None):
    """ Archive articles that pollute the production database.

    Moves duplicate and orphaned MongoDB articles out of the production
    collection, in batches.

    :param limit: integer, maximum number of duplicates (and orphans)
        to archive in one run. Default: ``None``, meaning
        ``config.ARTICLE_ARCHIVE_BATCH_SIZE``.

    .. warning:: currently disabled — everything below the ``raise`` is
        unreachable legacy MongoDB code, kept until the PostgreSQL
        rewrite is reviewed.
    """

    raise NotImplementedError('REVIEW for RELDB.')

    # cf. https://docs.djangoproject.com/en/dev/topics/db/multi-db/#selecting-a-database-to-delete-from  # NOQA

    counts = {
        'duplicates': 0,
        'orphaned': 0,
        'bad_articles': 0,
        'archived_dupes': 0,
    }

    if limit is None:
        limit = config.ARTICLE_ARCHIVE_BATCH_SIZE

    with no_dereference(Article) as ArticleOnly:
        if config.ARTICLE_ARCHIVE_OLDER_THAN > 0:
            older_than = now() - timedelta(
                days=config.ARTICLE_ARCHIVE_OLDER_THAN)

            duplicates = ArticleOnly.objects(
                duplicate_of__ne=None,
                date_published__lt=older_than).limit(limit)
            orphaned = ArticleOnly.objects(
                orphaned=True,
                date_published__lt=older_than).limit(limit)

        else:
            duplicates = ArticleOnly.objects(duplicate_of__ne=None
                                             ).limit(limit)
            orphaned = ArticleOnly.objects(orphaned=True).limit(limit)

        # One-shot batch: no need for query-result caching.
        duplicates.no_cache()
        orphaned.no_cache()

        counts['duplicates'] = duplicates.count()
        counts['orphaned'] = orphaned.count()

        if counts['duplicates']:
            current = 0
            LOGGER.info(u'Archiving of %s duplicate article(s) started.',
                        counts['duplicates'])

            with benchmark('Archiving of %s duplicate article(s)'
                           % counts['duplicates']):
                for article in duplicates:
                    archive_article_one_internal(article, counts)
                    current += 1
                    if current % 50 == 0:
                        LOGGER.info(u'Archived %s/%s duplicate articles '
                                    u'so far.', current, counts['duplicates'])

        if counts['orphaned']:
            current = 0
            LOGGER.info(u'Archiving of %s orphaned article(s) started.',
                        counts['orphaned'])

            with benchmark('Archiving of %s orphaned article(s)'
                           % counts['orphaned']):
                for article in orphaned:
                    archive_article_one_internal(article, counts)
                    current += 1
                    if current % 50 == 0:
                        # BUGFIX: the progress total previously displayed
                        # counts['duplicates'] instead of counts['orphaned'].
                        LOGGER.info(u'Archived %s/%s orphaned articles '
                                    u'so far.', current, counts['orphaned'])

        if counts['duplicates'] or counts['orphaned']:
            synchronize_statsd_articles_gauges(full=True)

            LOGGER.info('%s already archived and %s bad articles were found '
                        u'during the operation.', counts['archived_dupes'],
                        counts['bad_articles'])

        else:
            LOGGER.info(u'No article to archive.')
def global_duplicates_checker(limit=None, force=False):
    """ Check that duplicate articles have no more Reads anywhere.

    Fix it if not, and update all counters accordingly. Duplicates
    whose replacement is finished and which are older than the
    configured purge delay are deleted from the database.

    :param limit: integer, the maximum number of duplicates to check.
        Default: none (taken from configuration).
    :param force: boolean, default ``False``, allows to by bypass and
        reacquire the lock.
    """

    if config.CHECK_DUPLICATES_DISABLED:
        LOGGER.warning(u'Duplicates check disabled in configuration.')
        return

    # This task runs one a day. Acquire the lock for just a
    # little more time to avoid over-parallelized runs.
    my_lock = RedisExpiringLock('check_all_duplicates', expire_time=3600 * 25)

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(u'Forcing duplicates check…')
        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'global_subscriptions_checker() is already '
                           u'locked, aborting.')
            return

    if limit is None:
        limit = config.CHECK_DUPLICATES_LIMIT

    start_time = pytime.time()
    duplicates = BaseItem.objects.duplicate()

    total_dupes_count = duplicates.count()
    total_reads_count = 0
    processed_dupes = 0
    done_dupes_count = 0
    purged_dupes_count = 0

    # Clamp the purge delay into a sane [1 week, 1 year] window.
    purge_after_weeks_count = max(1, config.CHECK_DUPLICATES_PURGE_AFTER_WEEKS)
    purge_after_weeks_count = min(52, purge_after_weeks_count)
    purge_before_date = now() - timedelta(days=purge_after_weeks_count * 7)

    LOGGER.info(u'Done counting (took %s of pure SQL joy), '
                u'starting procedure.',
                naturaldelta(pytime.time() - start_time))

    with benchmark(u"Check {0}/{1} duplicates".format(limit or u'all',
                                                      total_dupes_count)):
        try:
            for duplicate in duplicates.iterator():
                reads = duplicate.reads.all()

                processed_dupes += 1

                if reads.exists():
                    done_dupes_count += 1
                    reads_count = reads.count()
                    total_reads_count += reads_count

                    LOGGER.info(u'Duplicate %s #%s still has %s reads, '
                                u'fixing…',
                                duplicate._meta.model.__name__,
                                duplicate.id, reads_count)

                    duplicate.duplicate_of.register_duplicate(
                        duplicate,
                        force=duplicate.duplicate_status ==
                        DUPLICATE_STATUS.FINISHED)

                if duplicate.duplicate_status == DUPLICATE_STATUS.FINISHED:
                    #
                    # TODO: check we didn't get some race-conditions new
                    #       dependancies between the moment the duplicate
                    #       was marked duplicate and now.

                    if duplicate.date_created < purge_before_date:
                        try:
                            with transaction.atomic():
                                duplicate.delete()

                        # HEADS UP: not a bare “except:”, which would
                        # also swallow KeyboardInterrupt / SystemExit.
                        except Exception:
                            LOGGER.exception(u'Exception while deleting '
                                             u'duplicate %s #%s',
                                             duplicate._meta.model.__name__,
                                             duplicate.id)

                        else:
                            # Count / announce the purge only when the
                            # deletion really succeeded.
                            purged_dupes_count += 1
                            LOGGER.info(u'Purged duplicate %s #%s '
                                        u'from database.',
                                        duplicate._meta.model.__name__,
                                        duplicate.id)

                elif duplicate.duplicate_status in (
                        DUPLICATE_STATUS.NOT_REPLACED,
                        DUPLICATE_STATUS.FAILED):
                    # Something went wrong, perhaps the
                    # task was purged before beiing run.
                    duplicate.duplicate_of.register_duplicate(duplicate)
                    done_dupes_count += 1

                elif duplicate.duplicate_status is None:
                    # Something went very wrong. If the article is a known
                    # duplicate, its status field should have been set to
                    # at least NOT_REPLACED.
                    duplicate.duplicate_of.register_duplicate(duplicate)
                    done_dupes_count += 1

                    LOGGER.error(u'Corrected duplicate %s #%s found with no '
                                 u'status.', duplicate._meta.model.__name__,
                                 duplicate.id)

                if limit and processed_dupes >= limit:
                    break

        finally:
            my_lock.release()

    LOGGER.info(u'global_duplicates_checker(): %s/%s duplicates processed '
                u'(%.2f%%; limit: %s), %s corrected (%.2f%%), '
                u'%s purged (%.2f%%); %s reads altered.',

                processed_dupes, total_dupes_count,
                # `or 1` guards the division when there is no duplicate.
                processed_dupes * 100.0 / (total_dupes_count or 1),

                limit or u'none',

                done_dupes_count,
                (done_dupes_count * 100.0 / processed_dupes)
                if processed_dupes else 0.0,

                purged_dupes_count,
                (purged_dupes_count * 100.0 / processed_dupes)
                if processed_dupes else 0.0,

                total_reads_count)
def refresh_all_feeds(limit=None, force=False):
    u""" Refresh all feeds (RSS/Mail/Twitter…).

    Scheduled task: scans all active, non-internal feeds (polymorphic
    query over all feed types) and launches an individual refresh task
    for each one whose fetch interval has elapsed.

    :param limit: integer, maximum number of feeds to examine.
        Default: ``None`` (no limit).
    :param force: boolean, default ``False``; re-acquire the global
        lock and refresh feeds even if their interval has not elapsed.
    """

    if config.FEED_FETCH_DISABLED:
        # Do not raise any .retry(), this is a scheduled task.
        LOGGER.warning(u'Feed refresh disabled in configuration.')
        return

    # As FEED_GLOBAL_REFRESH_INTERVAL is dynamically modifiable,
    # we should re-evaluate it each time we run.
    this_round_expire_time = (
        config.FEED_GLOBAL_REFRESH_INTERVAL * 60
        - config.FEED_GLOBAL_REFRESH_INTERVAL
    )

    # Be sure two refresh operations don't overlap, but don't hold the
    # lock too long if something goes wrong. In production conditions
    # as of 20130812, refreshing all feeds takes only a moment:
    # [2013-08-12 09:07:02,028: INFO/MainProcess] Task
    #     oneflow.core.tasks.refresh_all_feeds succeeded in 1.99886608124s.
    #
    my_lock = RedisExpiringLock(
        REFRESH_ALL_FEEDS_LOCK_NAME,
        expire_time=this_round_expire_time
    )

    if not my_lock.acquire():
        if force:
            my_lock.release()
            my_lock.acquire()
            LOGGER.warning(_(u'Forcing all feed refresh…'))
        else:
            # Avoid running this task over and over again in the queue
            # if the previous instance did not yet terminate. Happens
            # when scheduled task runs too quickly.
            LOGGER.warning(u'refresh_all_feeds() is already locked, aborting.')
            return

    # This should bring us a Polymorphic Query to refresh all feeds types.
    feeds = BaseFeed.objects.filter(is_active=True,
                                    is_internal=False).order_by(
                                        'date_last_fetch')

    if limit:
        feeds = feeds[:limit]

    with benchmark('refresh_all_feeds()'):
        try:
            count = 0
            mynow = now()

            for feed in feeds:
                if feed.refresh_lock.is_locked():
                    # The refresh task lauched before its expiration, and is
                    # still [long] running while we want to launch another.
                    # Avoid, because the new would exit immediately on
                    # date_last_fetch too recent.
                    LOGGER.debug(u'Feed %s already locked, skipped.', feed)
                    continue

                if feed.date_last_fetch is None:
                    basefeed_refresh_task.apply_async(
                        args=(feed.id, ),

                        # in `this_round_expire_time`, we will relaunch it
                        # anyway, so don't clutter the queue with double work.
                        #
                        # BUGFIX: the Celery option is named `expires`;
                        # the previous `expire=` was silently ignored as
                        # an unknown option, so tasks never expired.
                        expires=this_round_expire_time,
                    )

                    LOGGER.info(u'Launched immediate refresh of feed %s which '
                                u'has never been refreshed.', feed)
                    count += 1
                    continue

                if feed.fetch_interval > 86399:
                    interval_days = feed.fetch_interval / 86400
                    interval_seconds = feed.fetch_interval - (
                        interval_days * 86400)

                    interval = timedelta(days=interval_days,
                                         seconds=interval_seconds)

                else:
                    interval = timedelta(seconds=feed.fetch_interval)

                if force or feed.date_last_fetch + interval < mynow:

                    # How late are we? (negative delta folded by hand
                    # into a plain number of seconds.)
                    how_late = feed.date_last_fetch + interval - mynow
                    how_late = how_late.days * 86400 + how_late.seconds

                    late = feed.date_last_fetch + interval < mynow

                    basefeed_refresh_task.apply_async(
                        args=(feed.id, ),
                        kwargs={'force': force},
                        # BUGFIX: `expires`, not `expire` (see above).
                        expires=this_round_expire_time,
                    )

                    LOGGER.info(u'Launched refresh of feed %s (%s %s).',
                                feed, naturaldelta(how_late),
                                u'late' if late else u'earlier')
                    count += 1

        finally:
            # HEADS UP: in case the system is overloaded and feeds refresh()
            #           tasks don't complete fast enough, the current task
            #           will overload it even more. Thus, we intentionaly
            #           don't release the lock to avoid over-re-launched
            #           global tasks to feed the refresh queue with useless
            #           double-triple-Nble individual tasks.
            #
            # my_lock.release()
            pass

    LOGGER.info(u'Launched %s refreshes out of %s feed(s) checked.',
                count, feeds.count())
def guess_and_import_wallabag(self): """ Try to import a JSON export file from wallabag. """ try: wallabag_json = json.loads(self.urls) except: return False try: first_object = wallabag_json[0] except: return False for attr_name in ( "0", "1", "2", "3", "4", "5", "6", "content", "id", "is_fav", "is_read", "title", "url", "user_id", ): if attr_name not in first_object: return False message_user(self.user, _(u'Wallabag JSON export format detected.'), constants.INFO) for wallabag_object in wallabag_json: url = wallabag_object['url'] if self.validate_url(url): article = self.import_from_one_url( url, origin=ORIGINS.WALLABAG ) if article is None: # article was not created, we # already have it in the database. article = Article.objects.get(url=url) # Now comes the wallabag-specific part of the import, # eg. get back user meta-data as much as possible in 1flow. article_needs_save = False article_needs_convert = False title = wallabag_object.get('title', None) if title: article.name = title article_needs_save = True content = wallabag_object['content'] if content: article.content = content article.content_type = CONTENT_TYPES.HTML article_needs_save = True article_needs_convert = True if article_needs_save: article.save() if article_needs_convert: article.convert_to_markdown() read = article.reads.get( subscriptions=self.user.user_subscriptions.imported_items) # About parsing dates: # http://stackoverflow.com/q/127803/654755 # http://stackoverflow.com/a/18150817/654755 read_needs_save = False if wallabag_object.get('is_fav', False): read.is_starred = True read_needs_save = True # This information is not in wallabag. read.date_starred = now() if wallabag_object.get('is_read', False): read.is_read = True read_needs_save = True # This information is not in wallabag. read.date_read = now() if read_needs_save: read.save() return True