Example #1
def read_post_save(instance, **kwargs):
    """ Method meant to be run from a celery task. """

    read = instance

    if kwargs.get('created', False):

        with statsd.pipeline() as spipe:
            spipe.gauge('reads.counts.total', 1, delta=True)

            if read.is_good:
                spipe.gauge('reads.counts.good', 1, delta=True)

            else:
                spipe.gauge('reads.counts.bad', 1, delta=True)

        LOGGER.debug(u'Read created %s: %s',
                     u'GOOD' if read.is_good else u'BAD', read)

        if read.date_created < MIGRATION_DATETIME:
            # HEADS UP: REMOVE THIS WHEN migration is finished
            return

        read.rating = read.item.default_rating

        read.set_subscriptions(commit=False)

    # HEADS UP: this should be done manually in methods like activate().
    #           This will avoid double counting, and users seeing reads
    #           while these reads are not yet "good", and thus not really
    #           available to the user in the interface.
    # read.update_cached_descriptors()
    pass
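These examples all share the same pattern: open a statsd.pipeline() context manager and push gauge(..., delta=True) updates so that several counter shifts are flushed together when the block exits. Below is a minimal, self-contained sketch of that pattern in a Django post_save handler; it is not 1flow code, and the Read import, the StatsClient('localhost', 8125) setup and the handler name are assumptions made for illustration.

from django.db.models.signals import post_save
from django.dispatch import receiver

from statsd import StatsClient

from myapp.models import Read  # hypothetical model

statsd = StatsClient('localhost', 8125)  # assumed statsd host and port


@receiver(post_save, sender=Read)
def read_post_save_sketch(sender, instance, created, **kwargs):
    """ Shift the read counters once, when the read is first created. """

    if not created:
        return

    # The pipeline batches all gauges and sends them together when the
    # `with` block exits; delta=True shifts the gauge instead of setting
    # an absolute value.
    with statsd.pipeline() as spipe:
        spipe.gauge('reads.counts.total', 1, delta=True)
        spipe.gauge('reads.counts.good' if instance.is_good
                    else 'reads.counts.bad', 1, delta=True)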
Example #2
    def update_statsd_errors_count(self, processor, instance, delta_value):
        """ Send statsd delta_value for workflow processor categories. """

        # We don't report error stats for chains, only for processors.
        # For chains this would produce too many false-positive stats,
        # because they can have too many categories.
        if isinstance(processor.item, Processor):

            workflow_categories = processor.item.categories.exclude(
                # Sorry for this too-bare selector.
                # It's the simplest way for now.
                slug__contains=u'-')

            if workflow_categories.exists():
                # Graphite will complain (or ignore)
                # if there are spaces in the name.
                plural_name = slugify(
                    instance._meta.verbose_name_plural.lower())

                with statsd.pipeline() as spipe:
                    for category in workflow_categories.all():
                        spipe.gauge(
                            '{0}.counts.{1}_errors'.format(
                                plural_name,
                                category.slug),
                            delta_value, delta=True)
Example #3
def synchronize_statsd_articles_gauges(full=False):
    """ synchronize all articles-related gauges on our statsd server. """

    with benchmark('synchronize statsd gauges for Article.*'):

        empty = Article.objects.empty()
        # empty_pending       = empty.filter(content_error='', url_error='')
        # empty_content_error = empty.filter(content_error__ne='')
        # empty_url_error     = empty.filter(url_error__ne='')

        parsed = Article.objects.parsed()
        html = parsed.filter(content_type=CONTENT_TYPES.HTML)
        markdown = parsed.filter(content_type=CONTENT_TYPES.MARKDOWN)

        absolutes = Article.objects.absolute()
        duplicates = Article.objects.duplicate()
        orphaned = Article.objects.orphaned().master()
        content_errors = Article.objects.exclude(content_error=None)
        url_errors = Article.objects.exclude(url_error=None)

        with statsd.pipeline() as spipe:
            spipe.gauge('articles.counts.total', Article.objects.all().count())
            spipe.gauge('articles.counts.markdown', markdown.count())
            spipe.gauge('articles.counts.html', html.count())
            spipe.gauge('articles.counts.empty', empty.count())
            spipe.gauge('articles.counts.content_errors',
                        content_errors.count())
            spipe.gauge('articles.counts.url_errors', url_errors.count())

            if full:
                spipe.gauge('articles.counts.orphaned', orphaned.count())
                spipe.gauge('articles.counts.absolutes', absolutes.count())
                spipe.gauge('articles.counts.duplicates', duplicates.count())
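The delta gauges pushed by the signal handlers can drift over time (lost UDP packets, crashed workers), which is presumably why a full synchronization function like the one above exists: it resets every gauge to an absolute value recomputed from the database. A hypothetical way to schedule such a resync with Celery beat is sketched below; the task wrapper, the 'stats' module name and the chosen schedule are assumptions, not 1flow configuration.

from celery import Celery
from celery.schedules import crontab

app = Celery('stats')  # assumed Celery application


@app.task
def synchronize_statsd_articles_gauges_task(full=False):
    # Delegate to the synchronization function shown above.
    synchronize_statsd_articles_gauges(full=full)


app.conf.beat_schedule = {
    # Cheap counts every hour; the expensive ones (full=True) once a day.
    'sync-article-gauges-hourly': {
        'task': 'stats.synchronize_statsd_articles_gauges_task',
        'schedule': crontab(minute=0),
    },
    'sync-article-gauges-daily-full': {
        'task': 'stats.synchronize_statsd_articles_gauges_task',
        'schedule': crontab(hour=4, minute=30),
        'kwargs': {'full': True},
    },
}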
Example #4
def article_post_save(instance, **kwargs):

    article = instance

    if kwargs.get('created', False):

        with statsd.pipeline() as spipe:
            spipe.gauge('articles.counts.total', 1, delta=True)
            spipe.gauge('articles.counts.empty', 1, delta=True)

            if article.is_orphaned:
                spipe.gauge('articles.counts.orphaned', 1, delta=True)

            if article.duplicate_of:
                spipe.gauge('articles.counts.duplicates', 1, delta=True)

            if article.url_error:
                spipe.gauge('articles.counts.url_error', 1, delta=True)

            if article.content_error:
                spipe.gauge('articles.counts.content_error', 1, delta=True)

        # Some articles are created "already orphaned" or duplicates.
        # In the archive database this is more immediate than looking
        # up the database name.
        if not (article.is_orphaned or article.duplicate_of):

            # MIGRATION: remove this "if".
            if article.date_created >= MIGRATION_DATETIME:

                # HEADS UP: this task name will be registered later
                # by the register_task_method() call.
                globals()['article_post_create_task'].apply_async(
                    args=(article.id, ),
                    countdown=config.POST_CREATE_TASKS_DELAY)
Example #5
    def create_read(self, item, verbose=True, **kwargs):
        """ Return a tuple (read, created) with the new (or existing) read.

        ``created`` is a boolean indicating if it was actually created
        or if it existed before.
        """

        # We force item.id to be sure the item is reloaded from the DB.
        # In many cases, the item has been fetched in the background and
        # the current instance is not fresh enough: it would report
        # `is_good` == False whereas in reality it's OK.
        item = BaseItem.objects.get(id=item.id)

        read, created = Read.objects.get_or_create(item_id=item.id,
                                                   user=self.user)

        # If another feed has already created the read, be sure the
        # current one is registered in the read via the subscriptions.
        #
        # NOTE: there is no problem adding the same subscription again;
        #       it will result in only one M2M entry (cf. Django docs).
        read.subscriptions.add(self)

        need_save = False

        if created:
            read.tags.add(*item.tags.all())

            for key, value in kwargs.items():
                setattr(read, key, value)
                need_save = True

            # This will include the current subscription,
            # all its folders & the user global counters.
            read.update_cached_descriptors(update_only=(
                'all',
                'unread',
            ))

        # If the item was already there and fetched (mutualized from
        # another feed, for example), activate the read immediately.
        # If we don't do this here, the only alternative is the daily
        # global_reads_checker() task, which is not acceptable for
        # "just-added" subscriptions, whose reads are created via the
        # current method.
        if item.is_good and not read.is_good:
            read.is_good = True
            need_save = True

            # The post_save() signal updates stats only on creation.
            # It already counted this read as "bad"; invert the situation.
            with statsd.pipeline() as spipe:
                spipe.gauge('reads.counts.good', 1, delta=True)
                spipe.gauge('reads.counts.bad', -1, delta=True)

        if need_save:
            read.save()

        return read, created
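A hypothetical caller of create_read(), only to show how the (read, created) tuple described in the docstring is meant to be used; the subscription and item objects and the is_auto_read keyword are assumed for illustration.

# `subscription` and `item` are assumed to exist already; any extra
# keyword argument (here the hypothetical is_auto_read) is simply set
# as an attribute on the new read before it is saved.
read, created = subscription.create_read(item, is_auto_read=True)

if not created:
    # The read was created earlier by another feed; create_read() has
    # just attached the current subscription and, if the item is good,
    # activated the read on the fly.
    pass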
Example #6
def email_post_save(instance, **kwargs):

    email = instance

    if kwargs.get('created', False):

        with statsd.pipeline() as spipe:
            spipe.gauge('emails.counts.total', 1, delta=True)

        globals()['email_post_create_task'].apply_async(
            args=(email.id, ), countdown=config.POST_CREATE_TASKS_DELAY)
Example #7
def poke_post_save(instance, **kwargs):

    poke = instance

    if kwargs.get('created', False):

        with statsd.pipeline() as spipe:
            spipe.gauge('pokes.counts.total', 1, delta=True)

        globals()['poke_post_create_task'].apply_async(
            args=(poke.id, ), countdown=config.POST_CREATE_TASKS_DELAY)
Example #8
def read_pre_delete(instance, **kwargs):
    """ before deleting a read, update the subscriptions cached descriptors. """

    read = instance

    with statsd.pipeline() as spipe:
        spipe.gauge('reads.counts.total', -1, delta=True)

        if read.is_good:
            spipe.gauge('reads.counts.good', -1, delta=True)

        else:
            spipe.gauge('reads.counts.bad', -1, delta=True)

    if not read.is_good:
        # counters already don't take this read into account.
        return

    read.update_cached_descriptors(operation='-')
Example #9
    def activate(self, force=False):
        """ This method will mark the Read ``.is_good=True``
            and do whatever in consequence. """

        if not force and not self.item.is_good:
            LOGGER.error(u'Cannot activate read %s, whose article '
                         u'is not ready for public use!', self)
            return

        self.is_good = True
        self.save()

        with statsd.pipeline() as spipe:
            spipe.gauge('reads.counts.good', 1, delta=True)
            spipe.gauge('reads.counts.bad', -1, delta=True)

            LOGGER.debug(u'Read switched GOOD %s', self)

        self.update_cached_descriptors()
Example #10
def synchronize_statsd_reads_gauges(full=False):
    """ synchronize all read-related gauges on our statsd server. """

    with benchmark('synchronize statsd gauges for Read.*'):

        count = Read.objects.all().count()
        good = Read.objects.good().count()
        bad = Read.objects.bad().count()

        with statsd.pipeline() as spipe:
            spipe.gauge('reads.counts.total', count)
            spipe.gauge('reads.counts.good', good)
            spipe.gauge('reads.counts.bad', bad)

        # Am I paranoid?!? No, I come from two years of MongoDB.
        # Sorry PostgreSQL, I'm still healing.
        if bad != (count - good):
            LOGGER.warning(
                u'Bad count (%s) is different from total-good (%s)!', bad,
                count - good)
Example #11
def article_pre_delete(instance, **kwargs):

    article = instance

    if isinstance(article, Article):
        with statsd.pipeline() as spipe:
            spipe.gauge('articles.counts.total', -1, delta=True)

            if article.is_orphaned:
                spipe.gauge('articles.counts.orphaned', -1, delta=True)

            if article.duplicate_of_id:
                spipe.gauge('articles.counts.duplicates', -1, delta=True)

            if article.url_error:
                spipe.gauge('articles.counts.url_error', -1, delta=True)

            if article.content_error:
                spipe.gauge('articles.counts.content_error', -1, delta=True)

            if article.content_type == CONTENT_TYPES.HTML:
                spipe.gauge('articles.counts.html', -1, delta=True)

            elif article.content_type in (CONTENT_TYPES.MARKDOWN, ):
                spipe.gauge('articles.counts.markdown', -1, delta=True)

            elif article.content_type in (None, CONTENT_TYPES.NONE, ):
                spipe.gauge('articles.counts.empty', -1, delta=True)

    if instance.processing_errors.exists():
        try:
            instance.processing_errors.clear()

        except:
            LOGGER.exception(u'%s %s: could not clear processing errors',
                             instance._meta.verbose_name, instance.id)
Example #12
    def convert_to_markdown(self, force=False, commit=True):

        if config.ARTICLE_MARKDOWN_DISABLED:
            LOGGER.info(u'Article markdown convert disabled in '
                        u'configuration.')
            return

        if self.content_type == CONTENT_TYPES.MARKDOWN:
            if not force:
                LOGGER.info(u'%s #%s already converted to Markdown.',
                            self._meta.verbose_name, self.id)
                return

            else:
                statsd.gauge('articles.counts.markdown', -1, delta=True)

        elif self.content_type != CONTENT_TYPES.HTML:
            LOGGER.warning(u'%s #%s cannot be converted to Markdown, '
                           u'it is not currently HTML.',
                           self._meta.verbose_name, self.id)
            return

        LOGGER.info(u'Converting %s #%s to markdown…',
                    self._meta.verbose_name, self.id)

        md_converter = html2text.HTML2Text()

        # Set sane defaults. body_width > 0 breaks
        # some links by inserting \n inside them.
        #
        # MD_V1 had [False, False, 78] (=default parameters)
        md_converter.unicode_snob = True
        md_converter.escape_snob = True
        md_converter.body_width = 0

        try:
            # NOTE: everything should stay in Unicode during this call.
            self.content = md_converter.handle(self.content)

        except Exception as e:
            statsd.gauge('articles.counts.content_errors', 1, delta=True)

            self.content_error = str(e)
            self.save()

            LOGGER.exception(u'Markdown convert failed for item #%s.', self.id)
            return e

        self.content_type = CONTENT_TYPES.MARKDOWN

        if self.content_error:
            statsd.gauge('articles.counts.content_errors', -1, delta=True)
            self.content_error = None

        #
        # TODO: word count here
        #
        self.postprocess_markdown_links(commit=False, force=force)

        if commit:
            self.save()

        with statsd.pipeline() as spipe:
            spipe.gauge('articles.counts.html', -1, delta=True)
            spipe.gauge('articles.counts.markdown', 1, delta=True)

        if config.ARTICLE_FETCHING_DEBUG:
            LOGGER.info(u'————————— #%s Markdown %s —————————'
                        u'\n%s\n'
                        u'————————— end #%s Markdown —————————',
                        self.id, self.content.__class__.__name__,
                        self.content, self.id)
Example #13
    def absolutize_url(self, requests_response=None, force=False, commit=True):
        """ Make the current article URL absolute.

        Eg. transform:

        http://feedproxy.google.com/~r/francaistechcrunch/~3/hEIhLwVyEEI/

        into:

        http://techcrunch.com/2013/05/18/hell-no-tumblr-users-wont-go-to-yahoo/ # NOQA
            ?utm_source=feeurner&utm_medium=feed&utm_campaign=Feed%3A+francaistechcrunch+%28TechCrunch+en+Francais%29 # NOQA

        and then remove all these F*G utm_* parameters to get a clean
        final URL for the current article.

        Returns ``True`` if the operation succeeded, ``False`` if the
        absolutization pointed out that the current article is a
        duplicate of another. In this case the caller should stop its
        processing because the current article will be marked for deletion.

        Can also return ``None`` if absolutizing is disabled globally
        in ``constance`` configuration.
        """

        # Another example: http://rss.lefigaro.fr/~r/lefigaro/laune/~3/7jgyrQ-PmBA/story01.htm # NOQA

        if self.absolutize_url_must_abort(force=force, commit=commit):
            return

        if requests_response is None:
            try:
                requests_response = requests.get(self.url)

            except requests.ConnectionError as e:
                statsd.gauge('articles.counts.url_errors', 1, delta=True)
                message = u'Connection error while absolutizing “%s”: %s'
                args = (
                    self.url,
                    str(e),
                )

                self.url_error = message % args
                # Don't waste a version just for that.
                self.save_without_historical_record()

                LOGGER.error(message, *args)
                return

        if not requests_response.ok or requests_response.status_code != 200:

            message = u'HTTP Error %s while absolutizing “%s”: %s'
            args = (requests_response.status_code, requests_response.url,
                    requests_response.reason)

            with statsd.pipeline() as spipe:
                spipe.gauge('articles.counts.url_errors', 1, delta=True)

                if requests_response.status_code in (404, ):
                    self.is_orphaned = True

                    # This is not handled by the post_save()
                    # which acts only at article creation.
                    spipe.gauge('articles.counts.orphaned', 1, delta=True)

            self.url_error = message % args

            # Don't waste a version just for that.
            self.save_without_historical_record()

            LOGGER.error(message, *args)
            return

        #
        # NOTE: we could also get it from r.headers['link'],
        #       which contains '<another_url>'. We would need to strip
        #       out the '<>' and re-absolutize that link, because in the
        #       example it's another redirector. r.links is also a good
        #       candidate, but in the example I used it contains the
        #       shortlink, which must be re-resolved too.
        #
        #       So: as we are already at the final address *now*, no need
        #       to bother re-following another one that would lead us to
        #       the same final place.
        #

        final_url = clean_url(requests_response.url)

        # LOGGER.info(u'\n\nFINAL: %s vs. ORIG: %s\n\n', final_url, self.url)

        if final_url != self.url:

            # Just for displaying purposes, see below.
            old_url = self.url

            if self.url_error:
                statsd.gauge('articles.counts.url_errors', -1, delta=True)

            # Even if we are a duplicate, we made it here and everything
            # went fine. We won't need to look up the absolute URL again.
            statsd.gauge('articles.counts.absolutes', 1, delta=True)
            self.url_absolute = True
            self.url_error = None

            self.url = final_url

            try:
                if self.name.endswith(old_url):
                    self.name = self.name.replace(old_url, final_url)
            except:
                LOGGER.exception(u'Could not replace URL in name of %s #%s',
                                 self._meta.model.__name__, self.id)

            duplicate = False

            with transaction.atomic():
                # Without the atomic() block, saving the current article
                # (being a duplicate) would trigger the IntegrityError,
                # but would also render the current SQL context unusable,
                # unable to register the duplicate, potentially leading
                # to massive inconsistencies in the caller's context.
                try:
                    # Don't waste a version just for that.
                    self.save_without_historical_record()

                except IntegrityError:
                    duplicate = True

            if duplicate:
                params = {'%s___url' % self._meta.model.__name__: final_url}
                original = BaseItem.objects.get(**params)

                # Just to display the right “old” one in logs.
                self.url = old_url

                LOGGER.info(
                    u'%s #%s is a duplicate of #%s, '
                    u'registering as such.', self._meta.model.__name__,
                    self.id, original.id)

                original.register_duplicate(self)
                return False

            # Any other exception will raise. This is intentional.
            else:
                LOGGER.info(
                    u'URL of %s (#%s) successfully absolutized '
                    u'from %s to %s.', self._meta.model.__name__, self.id,
                    old_url, final_url)

        else:
            # Don't do the job twice.
            if self.url_error:
                statsd.gauge('articles.counts.url_errors', -1, delta=True)

            statsd.gauge('articles.counts.absolutes', 1, delta=True)
            self.url_absolute = True
            self.url_error = None

            # Don't waste a version just for that.
            self.save_without_historical_record()

        return True
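Since absolutize_url() distinguishes three outcomes (True on success, False when the article turns out to be a duplicate, None when the operation aborts or is disabled), callers are expected to test the result explicitly. A minimal hypothetical caller sketch, assuming it runs inside some processing method that owns an article instance:

def process_article_url(article):
    # Hypothetical wrapper: stop processing as soon as absolutize_url()
    # reports that this article is a duplicate of another one.
    result = article.absolutize_url()

    if result is False:
        # The article has been registered as a duplicate and will be
        # handled elsewhere; abort any further processing on it.
        return

    if result is None:
        # Absolutization was skipped (disabled in configuration, or the
        # URL could not be fetched); the URL is not final yet.
        return

    # result is True: the URL is now absolute and clean, carry on.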
Example #14
        if feed_status in (400, 401, 402, 403, 404, 500, 502, 503):
            self.error(u'HTTP %s on %s' % (http_logger.log[-1]['status'],
                       http_logger.log[-1]['url']), last_fetch=True)
            return

        try:
            Feed.check_feedparser_error(parsed_feed, self)

        except Exception as e:
            self.close(reason=str(e))
            return

        if feed_status == 304:
            LOGGER.info(u'No new content in feed %s.', self)

            with statsd.pipeline() as spipe:
                spipe.incr('feeds.refresh.fetch.global.unchanged')

        else:
            tags = Tag.get_tags_set(getattr(parsed_feed, 'tags', []),
                                    origin=self)

            if tags != set(self.tags):
                # We consider that the publisher knows the nature of their
                # content better than we do, and we trust the tags they set
                # on the feed. Thus, we don't union() with the new tags,
                # but simply replace the current ones with the new ones.
                LOGGER.info(u'Updating tags of feed %s from %s to %s.',
                            self, self.tags, tags)
                self.tags = list(tags)
Example #15
def email_pre_delete(instance, **kwargs):

    with statsd.pipeline() as spipe:
        spipe.gauge('emails.counts.total', -1, delta=True)
Example #16
    def refresh(self, force=False, commit=True):
        """ Look for new content in a 1flow feed. """

        # HEADS UP: refresh_must_abort() has already acquire()'d our lock.
        if self.refresh_must_abort(force=force):
            self.refresh_lock.release()
            return

        preventive_slow_down = False

        try:
            data = self.refresh_feed_internal(force=force, commit=commit)

        except:
            LOGGER.exception(
                u'Could not refresh feed %s, operating '
                u'preventive slowdown.', self)
            preventive_slow_down = True

        else:
            if data is None:
                # An error occurred and has already been stored. The feed
                # may even already have been closed if there were too
                # many errors. In case it's still open, slow things down.
                preventive_slow_down = True

            elif data is True:
                # The feed handles its internals on its own behalf.
                # E.g. a Twitter feed will tweak self.last_fetch however
                # it needs to prevent quota overflows. Just let it go.
                return

        if preventive_slow_down:
            # Don't let the queue be overflowed by refresh_all_feeds()
            # checking this feed over and over again. Let the lock
            # expire slowly until fetch_interval.
            #
            # self.refresh_lock.release()

            # Artificially slow down things to let the remote site
            # eventually recover while not bothering us too much.
            if not force:
                self.throttle_fetch_interval(0, 0, 1)

            self.update_last_fetch()

            if commit:
                self.save()

            return

        new_items, duplicates, mutualized = data

        if new_items == duplicates == mutualized == 0:

            with statsd.pipeline() as spipe:
                spipe.incr('feeds.refresh.fetch.global.unchanged')

        else:
            with statsd.pipeline() as spipe:
                spipe.incr('feeds.refresh.fetch.global.updated')

        if not force:
            # Forcing the refresh is most often done by admins and
            # developers. It should not trigger the adaptive throttling
            # computations, because it generates a lot of false-positive
            # duplicates.
            self.throttle_fetch_interval(new_items, mutualized, duplicates)

        with statsd.pipeline() as spipe:
            spipe.incr('feeds.refresh.global.fetched', new_items)
            spipe.incr('feeds.refresh.global.duplicates', duplicates)
            spipe.incr('feeds.refresh.global.mutualized', mutualized)

        # Everything went fine, be sure to reset the "error counter".
        self.errors = []

        self.update_last_fetch()

        if commit:
            self.save()

        with statsd.pipeline() as spipe:
            spipe.incr('feeds.refresh.fetch.global.done')

        # As the last_fetch is now up-to-date, we can release the fetch lock.
        # If any other refresh job comes, it will check last_fetch and will
        # terminate if called too early.
        self.refresh_lock.release()
Example #17
def poke_pre_delete(instance, **kwargs):

    with statsd.pipeline() as spipe:
        spipe.gauge('pokes.counts.total', -1, delta=True)