Example #1
def import_web_url(request, url):
    """ Import an URL from the web (can be anything). """

    form = forms.WebPagesImportForm({
        'urls': url,
        'status': IMPORT_STATUS.MANUAL
    })

    article = None

    if form.is_valid():
        user_import = form.save(request.user)

        if user_import.status == IMPORT_STATUS.FINISHED:

            if 'articles' in user_import.results['created']:
                article_url = user_import.results['created']['articles'][0]

                try:
                    article = Article.objects.get(url=article_url)

                except Exception:
                    # Just in case we hit
                    # http://dev.1flow.net/1flow/1flow/group/51970/
                    # But it should have been wrapped earlier, thus we
                    # do not do it in first intention.
                    article = Article.objects.get(url=clean_url(article_url))

                if article.content_type in CONTENT_TYPES_FINAL:
                    return HttpResponsePermanentRedirect(
                        redirect_to_read(request.user, article))

            else:
                feed_url = user_import.results['created']['feeds'][0]

                subscription = Subscription.objects.get(
                    feed=BaseFeed.objects.get(url=feed_url),
                    user=request.user)

                return HttpResponsePermanentRedirect(
                    reverse('source_selector') + u"#" + subscription.id)

        else:
            messages.warning(
                request,
                _(u'Could not import url “<code>{0}</code>”. Check your '
                  u'latest history entry to know why.').format(url),
                extra_tags='sticky safe')

            return HttpResponsePermanentRedirect(reverse('historyentry_list'))

    return render(
        request, 'import-web-url.html', {
            'article': article,
            'url': url,
            # Guard against a None article (e.g. an invalid form).
            'poll_url': (reverse('article_conversion_status',
                                 args=(article.id, ))
                         if article else None)
        })
Example #2
def create_tweet_from_id(tweet_id, feeds=None, origin=None):
    """ From a Tweet ID, create a 1flow tweet via the REST API.


    https://dev.twitter.com/rest/reference/get/statuses/show/%3Aid

    .. todo:: use http://celery.readthedocs.org/en/latest/reference/celery.contrib.batches.html  # NOQA
        to bulk get statuses and not exhaust the API Quota.
    """

    raise NotImplementedError('Needs a full review / redesign for tweets.')

    if feeds is None:
        feeds = []

    elif not hasattr(feeds, '__iter__'):
        feeds = [feeds]

    # TODO: find tweet publication date while fetching content…
    # TODO: set Title during fetch…

    try:
        new_tweet, created = Tweet.create_tweet(
            url=tweet_id.replace(' ', '%20'),
            title=_(u'Imported item from {0}').format(clean_url(tweet_id)),
            feeds=feeds,
            origin=ORIGINS.WEBIMPORT)

    except Exception:
        # NOTE: duplication handling is already
        # taken care of in Tweet.create_tweet().
        LOGGER.exception(u'Tweet creation from URL %s failed.', tweet_id)
        return None, False

    mutualized = created is None

    if created or mutualized:
        for feed in feeds:
            feed.recent_items_count += 1
            feed.all_items_count += 1

    ze_now = now()

    for feed in feeds:
        feed.latest_item_date_published = ze_now

        # Even if the tweet wasn't created, we need to create reads.
        # In the case of a mutualized tweet, it will be fetched only
        # once, but all subscribers of all feeds must be connected to
        # it to be able to read it.
        for subscription in feed.subscriptions.all():
            subscription.create_read(new_tweet, verbose=created)

    # Don't forget the parentheses, else we return ``False`` every time.
    return new_tweet, created or (None if mutualized else False)
Example #3
def import_web_url(request, url):
    """ Import an URL from the web (can be anything). """

    form = forms.WebPagesImportForm({'urls': url,
                                     'status': IMPORT_STATUS.MANUAL})

    article = None

    if form.is_valid():
        user_import = form.save(request.user)

        if user_import.status == IMPORT_STATUS.FINISHED:

            if 'articles' in user_import.results['created']:
                article_url = user_import.results['created']['articles'][0]

                try:
                    article = Article.objects.get(url=article_url)

                except Exception:
                    # Just in case we hit
                    # http://dev.1flow.net/1flow/1flow/group/51970/
                    # But it should have been wrapped earlier, thus we
                    # do not do it in first intention.
                    article = Article.objects.get(url=clean_url(article_url))

                if article.content_type in CONTENT_TYPES_FINAL:
                    return HttpResponsePermanentRedirect(
                        redirect_to_read(request.user, article)
                    )

            else:
                feed_url = user_import.results['created']['feeds'][0]

                subscription = Subscription.objects.get(
                    feed=BaseFeed.objects.get(url=feed_url),
                    user=request.user
                )

                return HttpResponsePermanentRedirect(
                    reverse('source_selector') + u"#" + subscription.id)

        else:
            messages.warning(
                request,
                _(u'Could not import url “<code>{0}</code>”. Check your '
                  u'latest history entry to know why.').format(url),
                extra_tags='sticky safe')

            return HttpResponsePermanentRedirect(reverse('historyentry_list'))

    return render(request, 'import-web-url.html',
                  {'article': article, 'url': url,
                   # Guard against a None article (e.g. an invalid form).
                   'poll_url': (reverse('article_conversion_status',
                                        args=(article.id, ))
                                if article else None)})
Example #4
File: tweet.py Project: 1flow/1flow
def create_tweet_from_id(tweet_id, feeds=None, origin=None):
    """ From a Tweet ID, create a 1flow tweet via the REST API.


    https://dev.twitter.com/rest/reference/get/statuses/show/%3Aid

    .. todo:: use http://celery.readthedocs.org/en/latest/reference/celery.contrib.batches.html  # NOQA
        to bulk get statuses and not exhaust the API Quota.
    """

    raise NotImplementedError('Needs a full review / redesign for tweets.')

    if feeds is None:
        feeds = []

    elif not hasattr(feeds, '__iter__'):
        feeds = [feeds]

    # TODO: find tweet publication date while fetching content…
    # TODO: set Title during fetch…

    try:
        new_tweet, created = Tweet.create_tweet(
            url=tweet_id.replace(' ', '%20'),
            title=_(u'Imported item from {0}').format(clean_url(tweet_id)),
            feeds=feeds, origin=ORIGINS.WEBIMPORT)

    except Exception:
        # NOTE: duplication handling is already
        # taken care of in Tweet.create_tweet().
        LOGGER.exception(u'Tweet creation from URL %s failed.', tweet_id)
        return None, False

    mutualized = created is None

    if created or mutualized:
        for feed in feeds:
            feed.recent_items_count += 1
            feed.all_items_count += 1

    ze_now = now()

    for feed in feeds:
        feed.latest_item_date_published = ze_now

        # Even if the tweet wasn't created, we need to create reads.
        # In the case of a mutualized tweet, it will be fetched only
        # once, but all subscribers of all feeds must be connected to
        # it to be able to read it.
        for subscription in feed.subscriptions.all():
            subscription.create_read(new_tweet, verbose=created)

    # Don't forget the parentheses, else we return ``False`` every time.
    return new_tweet, created or (None if mutualized else False)
Example #5
File: url.py Project: 1flow/1flow
    def resolve(cls, url, clean=False):
        """ return the real URL of :param:`url` if it is a dupe.

        Return ``None`` if not registered as duplicate.
        """

        if clean:
            url = clean_url(url)

        try:
            # .values() is a QuerySet method; a fetched instance
            # exposes the field directly.
            return cls.objects.get(url=url).real_url

        except Exception:
            return None
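A minimal usage sketch of the ``resolve()`` classmethod above; the model name and URL below are hypothetical and not taken from the 1flow sources:

# Hypothetical caller; assumes the classmethod lives on a model
# named DuplicateUrl, which is not confirmed by the snippet above.
real_url = DuplicateUrl.resolve(u'http://t.co/shortened', clean=True)

if real_url is None:
    # Not registered as a duplicate: keep using the original URL.
    real_url = u'http://t.co/shortened'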
Example #7
    def test_utm_with_other_things(self):

            for bad_url, good_url in (
                (u'http://www.begeek.fr/visitez-le-tardis-de-doctor-who-sur-google-maps-101125?utm_source=Plus+d‘actu&utm_medium=cpc&utm_campaign=Plus+d‘actu', # NOQA
                 u'http://www.begeek.fr/visitez-le-tardis-de-doctor-who-sur-google-maps-101125'), # NOQA
                (u'http://techcrunch.com/2013/05/18/hell-no-tumblr-users-wont-go-to-yahoo/?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+francaistechcrunch+%28TechCrunch+en+Francais%29', # NOQA
                 u'http://techcrunch.com/2013/05/18/hell-no-tumblr-users-wont-go-to-yahoo/'), # NOQA
                (u'http://www.liberation.fr/politiques/2013/09/24/la-niche-fiscale-pour-les-parents-d-enfants-scolarises-sera-conservee_934193?=rss-450', # NOQA
                 u'http://www.liberation.fr/politiques/2013/09/24/la-niche-fiscale-pour-les-parents-d-enfants-scolarises-sera-conservee_934193'), # NOQA

                # This one must not be changed.
                (u'http://tctechcrunch2011.files.wordpress.com/2013/09/screen-shot-2013-09-24-at-5-57-35-am.png?w=1280&h=948', # NOQA
                 u'http://tctechcrunch2011.files.wordpress.com/2013/09/screen-shot-2013-09-24-at-5-57-35-am.png?w=1280&h=948'), # NOQA
                      ):
                self.assertEquals(clean_url(bad_url), good_url)
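The tests above exercise ``clean_url()``, whose implementation is not shown in these examples. As a rough, hypothetical sketch of the idea (far less complete than the real function), tracking parameters can be stripped with the Python 2 standard library alone:

from urlparse import urlsplit, urlunsplit  # Python 2, as in this code base


def strip_tracking_params(url):
    """ Drop utm_*/xtor/empty-key query parameters (rough sketch only). """

    scheme, netloc, path, query, fragment = urlsplit(url)

    kept = [pair for pair in query.split(u'&')
            if pair
            and not pair.startswith(u'utm_')
            and not pair.startswith(u'xtor')
            and not pair.startswith(u'=')]

    # Also drop '#xtor=…' style tracking fragments.
    if fragment.startswith(u'xtor'):
        fragment = u''

    return urlunsplit((scheme, netloc, path, u'&'.join(kept), fragment))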
Example #8
    def save(self, user):
        """ Record the current user and the lines count. """

        # Just in case.
        self.instance.urls = u'\n'.join(
            clean_url(l.strip()) for l in self.instance.urls.splitlines())

        self.instance.user = user
        self.instance.lines = self.instance.count

        super(WebPagesImportForm, self).save()

        if self.instance.status == models.IMPORT_STATUS.MANUAL:
            self.instance.run()

        return self.instance
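A hypothetical usage sketch of the form above, mirroring how ``import_web_url()`` (Example #1) drives it; the field names and the MANUAL status come from that view, while the URL and the surrounding request context are placeholders:

form = WebPagesImportForm({
    'urls': u'http://example.com/some/article/',
    'status': models.IMPORT_STATUS.MANUAL,
})

if form.is_valid():
    # save() records the user, then run() is triggered for MANUAL imports.
    user_import = form.save(request.user)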
Example #10
def prepare_feed_url(feed_url):
    """ Try to validate an URL as much as possible. """

    feed_url = clean_url(feed_url)

    URLValidator()(feed_url)

    requests_response = requests.get(feed_url)

    if not requests_response.ok or requests_response.status_code != 200:
        raise Exception(u'Requests response is not OK/200, aborting')

    # Switch to the last hop of possibly (multiply-)redirected URLs.
    feed_url = requests_response.url

    # Be sure we get the XML result from them,
    # else FeedBurner gives us a poor HTML page…
    if u'feedburner' in feed_url and not feed_url.endswith(u'?format=xml'):
        feed_url += u'?format=xml'

    return feed_url
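A hypothetical caller sketch: ``URLValidator()`` raises Django's ``ValidationError`` on malformed input, and ``prepare_feed_url()`` itself raises a plain ``Exception`` on non-200 responses, so both need handling; the URL below is a placeholder.

from django.core.exceptions import ValidationError

try:
    feed_url = prepare_feed_url(u'http://example.com/feed')

except ValidationError:
    LOGGER.warning(u'Malformed feed URL, skipping.')

except Exception as e:
    LOGGER.warning(u'Could not prepare feed URL: %s', e)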
Example #12
def create_article_from_url(url, feeds, origin):
    """ Create an article from a web url, in feeds, with an origin. """

    # TODO: find article publication date while fetching content…
    # TODO: set Title during fetch…

    try:
        new_article, created = Article.create_article(
            url=url.replace(' ', '%20'),
            title=_(u'Imported item from {0}').format(clean_url(url)),
            feeds=feeds, origin=origin)

    except Exception:
        # NOTE: duplication handling is already
        # taken care of in Article.create_article().
        LOGGER.exception(u'Article creation from URL %s failed.', url)
        return None, False

    mutualized = created is None

    if created or mutualized:
        for feed in feeds:
            feed.recent_items_count += 1
            feed.all_items_count += 1

    for feed in feeds:
        if new_article.date_published:
            if new_article.date_published > feed.latest_item_date_published:
                feed.latest_item_date_published = new_article.date_published

        # Even if the article wasn't created, we need to create reads.
        # In the case of a mutualized article, it will be fetched only
        # once, but all subscribers of all feeds must be connected to
        # it to be able to read it.
        for subscription in feed.subscriptions.all():
            subscription.create_read(new_article, verbose=created)

    # Don't forget the parentheses, else we return ``False`` every time.
    return new_article, created or (None if mutualized else False)
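A hypothetical caller sketch showing how the ``(article, created)`` return value of the function above can be interpreted; the ``True`` / ``None`` / ``False`` semantics follow the ``Article.create_article()`` docstring shown in Example #16, and ``url``, ``feed`` and ``origin`` are placeholders:

article, created = create_article_from_url(url, feeds=[feed], origin=origin)

if article is None:
    pass            # creation failed, the error is already logged

elif created:
    pass            # brand-new article

elif created is None:
    pass            # mutualized: already known, but new for this feed

else:
    pass            # pure duplicate, nothing to do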
Example #13
def create_feeds_from_url(feed_url, creator=None, recurse=True):
    """ Return a list of one or more tuple(s) ``(feed, created)``,
        from a given URL.

        If the URL given is an RSS/Atom URL, the method will create a feed
        (if not already in the database), and will return it associated
        with the ``created`` boolean, indicating whether it was just created.
        For consistency, the tuple will be returned in a list, so that this
        method *always* returns a list of tuples.

        If the URL is a simple website one, it will be opened and parsed
        to discover any RSS/Atom feeds referenced in the page headers,
        and the method will return a list of tuples.

        .. todo:: parse the content body to find any RSS/Atom feeds inside.
            Will make it easy to parse http://www.bbc.co.uk/news/10628494

        :param creator: a :class:`User` that will be set as the feed(s)
            creator. This will eventually allow giving achievements to
            users, or on the contrary banning them if they pollute the DB.

        :param recurse: In case of a simple web URL, this method will be
            called recursively. Subsequent calls will be non-recursive
            by default. You can consider this argument to be "internal".
    """

    feed_url = prepare_feed_url(feed_url)

    try:
        feed = RssAtomFeed.objects.get(url=feed_url)

    except RssAtomFeed.DoesNotExist:
        # We will create it now.
        pass

    else:
        # Get the right one for the user subscription.
        if feed.duplicate_of_id:
            return [(feed.duplicate_of, False)]

        else:
            return [(feed, False)]

    http_logger = HttpResponseLogProcessor()
    parsed_feed = feedparser.parse(feed_url, handlers=[http_logger])
    feed_status = http_logger.log[-1]['status']

    # Stop on HTTP errors before stopping on feedparser errors,
    # because feedparser is much more lenient in many conditions.
    if feed_status in (400, 401, 402, 403, 404, 500, 502, 503):
        raise FeedFetchException(u'Error {0} when fetching feed {1}'.format(
            feed_status, feed_url))

    try:
        check_feedparser_error(parsed_feed)

    except FeedIsHtmlPageException:
        if recurse:
            new_feeds = []
            urls_to_try = set(parse_feeds_urls(parsed_feed))

            for sub_url in urls_to_try:
                try:
                    new_feeds += create_feeds_from_url(
                        sub_url, creator=creator, recurse=False)

                except FeedIsHtmlPageException:
                    # We don't warn for every URL we find,
                    # most of them are CSS/JS/whatever ones.
                    pass

                except Exception:
                    LOGGER.exception(u'Could not create a feed from '
                                     u'recursed url {0} (from {1})'.format(
                                         sub_url, feed_url))

            if new_feeds:
                # LOGGER.info(u'Returning %s created feeds.', len(new_feeds))
                return new_feeds

            # Just before giving up, try a little more with newspaper.
            # As it is quite slow, do it in the background.
            discover_feeds_urls.delay(feed_url)

            raise

        else:
            raise

    except Exception as e:
        raise Exception(u'Unparsable feed {0}: {1}'.format(feed_url, e))

    else:
        # Wow. FeedParser creates a <anything>.feed . Impressive.
        fp_feed = parsed_feed.feed
        website = WebSite.get_from_url(clean_url(
                                       fp_feed.get('link', feed_url)))

        defaults = {
            'name': fp_feed.get('title', u'Feed from {0}'.format(feed_url)),
            'is_good': True,
            # Try the RSS description, then the Atom subtitle.
            'description_en': fp_feed.get(
                'description',
                fp_feed.get('subtitle', u'')),
            'website': website
        }

        new_feed, created = RssAtomFeed.objects.get_or_create(
            url=feed_url, defaults=defaults
        )

        if created:
            new_feed.user = creator
            new_feed.save()

        return [(new_feed, created)]
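A hypothetical caller sketch for the function above; it always returns a list of ``(feed, created)`` tuples, and may raise ``FeedIsHtmlPageException`` or ``FeedFetchException``. The URL and the request context are placeholders:

try:
    results = create_feeds_from_url(u'http://example.com/',
                                    creator=request.user)

except FeedIsHtmlPageException:
    results = []    # an HTML page with no discoverable feeds

except FeedFetchException:
    results = []    # HTTP-level error while fetching

for feed, created in results:
    if created:
        LOGGER.info(u'Created feed %s.', feed)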
Example #14
        if replace_newlines:
            for repl_src in re.findall(ur'[[][^]]+[]][(]', content):

                # In link text, we replace newlines with spaces.
                repl_dst = repl_src.replace(u'\n', u' ')
                content  = content.replace(repl_src, repl_dst)

        for repl_src in re.findall(ur'[]][(][^)]+[)]', content):

            if replace_newlines:
                # In link URLs, we just cut out newlines.
                repl_dst = repl_src.replace(u'\n', u'')
            else:
                repl_dst = repl_src

            repl_dst = clean_url(insert_website(repl_dst))
            content  = content.replace(repl_src, repl_dst)

        if test:
            return content

        else:
            # Everything went OK. Put back the content where it belongs.
            self.content = content

            if replace_newlines:
                self.content_type = CONTENT_TYPES.MARKDOWN

            # Disabled until more love is put inside.
            # self.find_image(commit=False, force=force)
Example #15
    def absolutize_url(self, requests_response=None, force=False, commit=True):
        """ Make the current article URL absolute.

        Eg. transform:

        http://feedproxy.google.com/~r/francaistechcrunch/~3/hEIhLwVyEEI/

        into:

        http://techcrunch.com/2013/05/18/hell-no-tumblr-users-wont-go-to-yahoo/ # NOQA
            ?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+francaistechcrunch+%28TechCrunch+en+Francais%29 # NOQA

        and then remove all these F*G utm_* parameters to get a clean
        final URL for the current article.

        Returns ``True`` if the operation succeeded, ``False`` if the
        absolutization pointed out that the current article is a
        duplicate of another. In this case the caller should stop its
        processing because the current article will be marked for deletion.

        Can also return ``None`` if absolutizing is disabled globally
        in ``constance`` configuration.
        """

        # Another example: http://rss.lefigaro.fr/~r/lefigaro/laune/~3/7jgyrQ-PmBA/story01.htm # NOQA

        if self.absolutize_url_must_abort(force=force, commit=commit):
            return

        if requests_response is None:
            try:
                requests_response = requests.get(self.url)

            except requests.ConnectionError as e:
                statsd.gauge('articles.counts.url_errors', 1, delta=True)
                message = u'Connection error while absolutizing “%s”: %s'
                args = (
                    self.url,
                    str(e),
                )

                self.url_error = message % args
                # Don't waste a version just for that.
                self.save_without_historical_record()

                LOGGER.error(message, *args)
                return

        if not requests_response.ok or requests_response.status_code != 200:

            message = u'HTTP Error %s while absolutizing “%s”: %s'
            args = (requests_response.status_code, requests_response.url,
                    requests_response.reason)

            with statsd.pipeline() as spipe:
                spipe.gauge('articles.counts.url_errors', 1, delta=True)

                if requests_response.status_code in (404, ):
                    self.is_orphaned = True

                    # This is not handled by the post_save()
                    # which acts only at article creation.
                    spipe.gauge('articles.counts.orphaned', 1, delta=True)

            self.url_error = message % args

            # Don't waste a version just for that.
            self.save_without_historical_record()

            LOGGER.error(message, *args)
            return

        #
        # NOTE: we could also get it eventually from r.headers['link'],
        #       which contains '<another_url>'. We need to strip out
        #       the '<>', and re-absolutize this link, because in the
        #       example it's another redirector. Also r.links is a good
        #       candidate but in the example I used, it contains the
        #       shortlink, which must be re-resolved too.
        #
        #       So: as we already are at the final address *now*, there is
        #       no need to bother re-following another one, which would
        #       lead us to the same final place.
        #

        final_url = clean_url(requests_response.url)

        # LOGGER.info(u'\n\nFINAL: %s vs. ORIG: %s\n\n', final_url, self.url)

        if final_url != self.url:

            # Just for displaying purposes, see below.
            old_url = self.url

            if self.url_error:
                statsd.gauge('articles.counts.url_errors', -1, delta=True)

            # Even if we are a duplicate, we came until here and everything
            # went fine. We won't need to lookup again the absolute URL.
            statsd.gauge('articles.counts.absolutes', 1, delta=True)
            self.url_absolute = True
            self.url_error = None

            self.url = final_url

            try:
                if self.name.endswith(old_url):
                    self.name = self.name.replace(old_url, final_url)
            except Exception:
                LOGGER.exception(u'Could not replace URL in name of %s #%s',
                                 self._meta.model.__name__, self.id)

            duplicate = False

            with transaction.atomic():
                # Without the atomic() block, saving the current article
                # (being a duplicate) will trigger the IntegrityError,
                # but will render the current SQL context unusable, unable
                # to register duplicate, potentially leading to massive
                # inconsistencies in the caller's context.
                try:
                    # Don't waste a version just for that.
                    self.save_without_historical_record()

                except IntegrityError:
                    duplicate = True

            if duplicate:
                params = {'%s___url' % self._meta.model.__name__: final_url}
                original = BaseItem.objects.get(**params)

                # Just to display the right “old” one in logs.
                self.url = old_url

                LOGGER.info(
                    u'%s #%s is a duplicate of #%s, '
                    u'registering as such.', self._meta.model.__name__,
                    self.id, original.id)

                original.register_duplicate(self)
                return False

            # Any other exception will raise. This is intentional.
            else:
                LOGGER.info(
                    u'URL of %s (#%s) successfully absolutized '
                    u'from %s to %s.', self._meta.model.__name__, self.id,
                    old_url, final_url)

        else:
            # Don't do the job twice.
            if self.url_error:
                statsd.gauge('articles.counts.url_errors', -1, delta=True)

            statsd.gauge('articles.counts.absolutes', 1, delta=True)
            self.url_absolute = True
            self.url_error = None

            # Don't waste a version just for that.
            self.save_without_historical_record()

        return True
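A hypothetical caller sketch, following the return contract documented in the docstring above (``False`` means the article turned out to be a duplicate and should not be processed further):

def process_article(article):
    """ Hypothetical caller, not part of the 1flow sources. """

    result = article.absolutize_url()

    if result is False:
        # The article was registered as a duplicate of another one
        # and will be marked for deletion; stop processing it here.
        return

    # result is True (absolutized) or None (absolutizing disabled
    # in the constance configuration); continue processing.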
Example #16
    def create_article(cls, title, url, feeds, **kwargs):
        """ Returns ``True`` if article created, ``False`` if a pure duplicate
            (already exists in the same feed), ``None`` if exists but not in
            the same feed. If more than one feed given, only returns ``True``
            or ``False`` (mutualized state is not checked). """

        tags = kwargs.pop('tags', [])

        if url is None:
            # We have to build a reliable orphaned URL, because orphaned
            # articles are often duplicates. RSS feeds serve us many times
            # the same article, without any URL, and we keep recording it
            # as new (but orphaned) content… Seen 20141111 on Chuck Norris
            # facts, where the content is in the title, and there is no URL.
            # We have 860k+ items, out of 1k real facts… Doomed.
            url = ARTICLE_ORPHANED_BASE + generate_orphaned_hash(title, feeds)

            defaults = {
                'name': title,
                'is_orphaned': True,

                # Skip absolutization, it's useless.
                'url_absolute': True
            }

            defaults.update(kwargs)

            article, created = cls.objects.get_or_create(url=url,
                                                         defaults=defaults)

            # HEADS UP: no statsd here, it's handled by post_save().

        else:
            url = clean_url(url)

            defaults = {'name': title}
            defaults.update(kwargs)

            article, created = cls.objects.get_or_create(url=url,
                                                         defaults=defaults)

        if created:
            created_retval = True

            LOGGER.info(u'Created %sarticle %s %s.', u'orphaned '
                        if article.is_orphaned else u'', article.id,
                        u'in feed(s) {0}'.format(_format_feeds(feeds))
                        if feeds else u'without any feed')

        else:
            created_retval = False

            if article.duplicate_of_id:
                LOGGER.info(u'Swapping duplicate %s %s for master %s on '
                            u'the fly.', article._meta.verbose_name,
                            article.id, article.duplicate_of_id)

                article = article.duplicate_of

            if len(feeds) == 1 and feeds[0] not in article.feeds.all():
                # This article is already there, but has not yet been
                # fetched for this feed. It's mutualized, and as such
                # it is considered as partly new. At least, it's not
                # as bad as being a true duplicate.
                created_retval = None

                LOGGER.info(u'Mutualized article %s in feed(s) %s.',
                            article.id, _format_feeds(feeds))

                article.create_reads(feeds=feeds)

            else:
                # No statsd, because we didn't create any record in database.
                LOGGER.info(u'Duplicate article %s in feed(s) %s.',
                            article.id, _format_feeds(feeds))

            # Special case where a mutualized article arrives from RSS
            # (with date/author) while it was already here from Twitter
            # (no date/author). Post-processing of original data will
            # handle the authors, but at least we update the date now for
            # users to have sorted articles until original data is
            # post-processed (this can take time, given the server load).
            if article.date_published is None:
                date_published = kwargs.get('date_published', None)

                if date_published is not None:
                    article.date_published = date_published
                    article.save()

        # Tags & feeds are ManyToMany, they
        # need the article to be saved before.

        if tags:
            try:
                with transaction.atomic():
                    article.tags.add(*tags)

            except IntegrityError:
                LOGGER.exception(u'Could not add tags %s to article %s',
                                 tags, article.id)

        if feeds:
            try:
                with transaction.atomic():
                    article.feeds.add(*feeds)

            except Exception:
                LOGGER.exception(u'Could not add feeds to article %s',
                                 article.id)

        # Get a chance to catch the duplicate if workers were fast.
        # At the cost of another DB read, this will save some work
        # in repair scripts, and avoid some writes when creating reads.
        article = cls.objects.get(id=article.id)

        if article.duplicate_of_id:
            if settings.DEBUG:
                LOGGER.debug(u'Caught on-the-fly duplicate %s, returning '
                             u'master %s instead.', article.id,
                             article.duplicate_of_id)

            return article.duplicate_of, False

        return article, created_retval
Example #17
    def test_utm_star(self):

        good_url = u'http://test.com/mytest/'

        for bad_url in (
            u'http://test.com/mytest/?=',
            u'http://test.com/mytest/?#',
            u'http://test.com/mytest/#?=',
            u'http://test.com/mytest/?=rss',
            u'http://test.com/mytest/?=rss-450',
            u'http://test.com/mytest/?=rss-450&',
            u'http://test.com/mytest/?=rss-450&=rss',

            u'http://test.com/mytest/?utm_X',
            u'http://test.com/mytest/?utm_X&',
            u'http://test.com/mytest/?utm_X=',
            u'http://test.com/mytest/?utm_X=&',
            u'http://test.com/mytest/?utm_X=toto',
            u'http://test.com/mytest/?utm_X=toto&',

            u'http://test.com/mytest/?utm_source=toto&utm_Y',
            u'http://test.com/mytest/?utm_source=toto&utm_Y&',
            u'http://test.com/mytest/?utm_source=toto&utm_Y=',
            u'http://test.com/mytest/?utm_source=toto&utm_Y=&',
            u'http://test.com/mytest/?utm_source=toto&utm_Y=titi',
            u'http://test.com/mytest/?utm_source=toto&utm_Y=titi&',

            u'http://test.com/mytest/#xtor',
            u'http://test.com/mytest/#xtor=',
            u'http://test.com/mytest/#xtor=tata',
            u'http://test.com/mytest/#xtor&',
            u'http://test.com/mytest/#xtor=&',
            u'http://test.com/mytest/#xtor=tata&',

            u'http://test.com/mytest/?utm_X#xtor',
            u'http://test.com/mytest/?utm_X#xtor=',
            u'http://test.com/mytest/?utm_X#xtor=tata',

            u'http://test.com/mytest/?utm_campaign&#xtor',
            u'http://test.com/mytest/?utm_campaign&#xtor=',
            u'http://test.com/mytest/?utm_campaign&#xtor=tata',

            u'http://test.com/mytest/?utm_X=&#xtor',
            u'http://test.com/mytest/?utm_X=&#xtor=',
            u'http://test.com/mytest/?utm_X=&#xtor=tata',

            u'http://test.com/mytest/?utm_X=toto#xtor',
            u'http://test.com/mytest/?utm_X=toto#xtor=',
            u'http://test.com/mytest/?utm_X=toto#xtor=tata',

            u'http://test.com/mytest/?utm_X=toto&#xtor',
            u'http://test.com/mytest/?utm_X=toto&#xtor=',
            u'http://test.com/mytest/?utm_X=toto&#xtor=tata',

            u'http://test.com/mytest/?utm_X=toto&utm_Y#xtor',
            u'http://test.com/mytest/?utm_X=toto&utm_Y#xtor=',
            u'http://test.com/mytest/?utm_X=toto&utm_Y#xtor=tata',

            u'http://test.com/mytest/?utm_X=toto&utm_Y=#xtor',
            u'http://test.com/mytest/?utm_X=toto&utm_Y=#xtor=',
            u'http://test.com/mytest/?utm_X=toto&utm_Y=#xtor=tata',

            u'http://test.com/mytest/?utm_X=toto&utm_Y=titi#xtor',
            u'http://test.com/mytest/?utm_X=toto&utm_Y=titi#xtor=',
            u'http://test.com/mytest/?utm_X=toto&utm_Y=titi#xtor=tata',

            u'http://test.com/mytest/?utm_X=toto&utm_Y=titi&#xtor',
            u'http://test.com/mytest/?utm_X=toto&utm_Y=titi&#xtor=',
            u'http://test.com/mytest/?utm_X=toto&utm_Y=titi&#xtor=tata',

            u'http://test.com/mytest/?xtor',
            u'http://test.com/mytest/?xtor=',
            u'http://test.com/mytest/?xtor=tata',
            u'http://test.com/mytest/?xtor=tata&',

            u'http://test.com/mytest/?utm_X&xtor',
            u'http://test.com/mytest/?utm_X&xtor=',
            u'http://test.com/mytest/?utm_X&xtor=tata',
            u'http://test.com/mytest/?utm_X&xtor=tata&',

            u'http://test.com/mytest/?utm_X=&xtor',
            u'http://test.com/mytest/?utm_X=&xtor=',
            u'http://test.com/mytest/?utm_X=&xtor=tata',
            u'http://test.com/mytest/?utm_X=&xtor=tata&',

            u'http://test.com/mytest/?utm_X=toto&xtor',
            u'http://test.com/mytest/?utm_X=toto&xtor=',
            u'http://test.com/mytest/?utm_X=toto&xtor=tata',
            u'http://test.com/mytest/?utm_X=toto&xtor=tata&',

            u'http://test.com/mytest/?utm_X=toto&utm_Y&xtor',
            u'http://test.com/mytest/?utm_X=toto&utm_Y&xtor=',
            u'http://test.com/mytest/?utm_X=toto&utm_Y&xtor=tata',
            u'http://test.com/mytest/?utm_X=toto&utm_Y&xtor=tata&',

            u'http://test.com/mytest/?utm_X=toto&utm_Y=&xtor',
            u'http://test.com/mytest/?utm_X=toto&utm_Y=&xtor=',
            u'http://test.com/mytest/?utm_X=toto&utm_Y=&xtor=tata',
            u'http://test.com/mytest/?utm_X=toto&utm_Y=&xtor=tata&',

            u'http://test.com/mytest/?utm_X=toto&utm_Y=titi&xtor',
            u'http://test.com/mytest/?utm_X=toto&utm_Y=titi&xtor=',
            u'http://test.com/mytest/?utm_X=toto&utm_Y=titi&xtor=tata',
            u'http://test.com/mytest/?utm_X=toto&utm_Y=titi&xtor=tata&',
        ):

            self.assertEqual(clean_url(bad_url), good_url)
Example #18
def process(self, instance, parameters=None, commit=True, **kwargs):
    """ See source code. """

    instance_name = instance._meta.verbose_name
    instance_id = instance.id

    urls = URL_MATCH_REGEX.findall(instance.content)

    if not urls:
        LOGGER.info(u'url-crawler: nothing to crawl in %s %s.',
                    instance_name, instance_id)
        return

    # Start with EasyList
    adblock_rules_list = requests_get(
        # WARNING: do not .split() with no parameters, else
        # adblock will block everything due to empty rules.
        'https://easylist-downloads.adblockplus.org/easylist.txt').split('\n')

    # Append our own specific exclusions, if any.
    adblock_rules_list.extend(
        parameters.get('integration', {}).get(
            'fetch_content_urls', {}).get('adblock_rules', []))

    if re2 is None:
        # Things will be dog slow…
        adblock_rules = AdblockRules(
            adblock_rules_list,
            max_mem=config.PROCESSING_ADBLOCK_MAXIMUM_MEMORY)

    else:
        # Things will go faster
        adblock_rules = AdblockRules(
            adblock_rules_list, use_re2=True,
            max_mem=config.PROCESSING_ADBLOCK_MAXIMUM_MEMORY)

    if isinstance(instance, models.Email):
        origin = models.ORIGINS.EMAIL

        # NOTE: there will be at least one here, else
        # accepts() would have rejected the email.
        feeds = instance.feeds.exclude(
            MailFeed___match_action=MAIL_MATCH_ACTIONS.STORE)

    else:
        origin = models.ORIGINS.CRAWLING
        feeds = instance.feeds.all()

    dupes = 0
    blocked = 0

    # LOGGER.debug('URLS: %s %s', len(urls), urls)

    for url in urls:
        if url.startswith('('):
            url = url[1:]

            if url.endswith(')'):
                # Skip Markdown's enclosing parentheses
                # that we explicitly matched manually.
                url = url[:-1]

            # In case we've got garbage at the end of the RE match.
            splitted = url.split(')')

            if len(splitted) == 1:
                pass

            elif len(splitted) == 2 and len(splitted[1]) < 4:
                # Highly probable that we got some garbage at the end.
                url = splitted[0]

            else:
                LOGGER.error(u'url-crawler: probable nasty unhandled '
                             u'URL “%s” too-greedily matched by RE.',
                             url)

        if adblock_rules.should_block(url):
            LOGGER.info(u'url-crawler: URL %s skipped by adblock rules.',
                        url)
            blocked += 1
            continue

        LOGGER.info('url-crawler: importing from %s.', url)

        try:
            item, created = create_item_from_url(
                url=clean_url(url), feeds=feeds, origin=origin,
            )

        except Exception:
            LOGGER.exception(u'Could not create item from URL “%s”', url)

        else:
            if created:
                LOGGER.info(u'url-crawler: successfully imported %s from '
                            u'%s %s.', item, instance_name, instance_id)

            else:
                dupes += 1
                LOGGER.warning(u'url-crawler: %s already in database.', item)

            # Link the newly created item to the item it was found in.
            item.sources.add(instance)

    LOGGER.info(u'url-crawler: crawled %s items (%s new) from %s %s.',
                len(urls) - blocked, len(urls) - blocked - dupes,
                instance_name, instance_id)
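A minimal, self-contained sketch of the ``AdblockRules`` / ``should_block()`` calls used above, assuming they come from the third-party ``adblockparser`` package (the import is not shown in the snippet, and the rule below is illustrative only):

from adblockparser import AdblockRules

rules = AdblockRules([u'||ads.example.com^'])

if rules.should_block(u'http://ads.example.com/banner.png'):
    pass  # matched a filter rule, skip this URL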
Example #19
def process(self, instance, verbose=True, commit=True, **kwargs):
    """ See source code. """

    # from https://github.com/erikriver/opengraph
    # site_name       => YouTube
    # description     => Eric Clapton and Paul McCartney perform George Harrison's "While My Guitar Gently Weeps" at the...  # NOQA
    # title           => While My Guitar Gently Weeps
    # url             => http://www.youtube.com/watch?v=q3ixBmDzylQ
    # image           => http://i2.ytimg.com/vi/q3ixBmDzylQ/default.jpg
    # video:type      => application/x-shockwave-flash
    # video:height    => 224
    # video           => http://www.youtube.com/v/q3ixBmDzylQ?version=3&autohide=1  # NOQA
    # video:width     => 398
    # type            => video

    instance_name = instance._meta.verbose_name
    instance_id = instance.id

    try:
        og_article = opengraph.OpenGraph(html=instance.content)

    except Exception:
        # Not worth a round trip to sentry in most cases.
        # A warning will suffice. Developers can still debug
        # the article manually if wanted.
        LOGGER.warning(u'opengraph: parsing %s %s failed, aborting.',
                       instance_name, instance_id)
        return

    if not og_article.is_valid():
        LOGGER.warning(
            u'opengraph: invalid OpenGraph data in %s %s, '
            u'aborting.', instance_name, instance_id)
        return

    needs_commit = False

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Title

    name_needs_extraction = get_processor_by_slug(
        '1fs-article-title-extract-accept-conditions').accepts(instance,
                                                               verbose=verbose,
                                                               commit=commit,
                                                               **kwargs)

    if data_ok(og_article.title) and name_needs_extraction:
        if isinstance(og_article.title, list):
            # Cf. http://blog.dbth.fr/2015/03/la-liberte-de-fermer-ta-gueule-ou-du-sexisme-dans-la-musique/  # NOQA
            instance.name = og_article.title[0]

        else:
            instance.name = og_article.title

        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s name to “%s”.', instance_name,
                        instance_id, instance.name)

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Date published
    # http://ogp.me/#type_article
    #
    # article:published_time - datetime - When the article was first published.
    # article:modified_time - datetime - When the article was last changed.
    # article:expiration_time - datetime - When the article is out of date after.  # NOQA
    # article:author - profile array - Writers of the article.
    # article:section - string - A high-level section name. E.g. Technology
    # article:tag - string array - Tag words associated with this article.
    #
    # http://ogp.me/#type_profile (for author)

    og_pub_time = og_article.get('article__published_time', None)

    if instance.date_published is None and data_ok(og_pub_time):

        parsed_datetime = datetime_extended_parser(og_pub_time)

        if parsed_datetime is None:
            LOGGER.warning(
                u'OpenGraph article:published_time “%s” is '
                u'unparseable.', og_pub_time)

        else:
            date_published = datetime(*parsed_datetime[:6])

            instance.date_published = date_published
            needs_commit = True
            LOGGER.info(u'opengraph: set %s %s published date.', instance_name,
                        instance_id)

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Description

    og_description = og_article.get('description', None)

    if data_ok(og_description) and not data_ok(instance.excerpt):
        instance.excerpt = og_description
        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s excerpt.', instance_name,
                        instance_id)

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––—–––––––––– Authors

    #
    # TODO
    #

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––——— Language

    og_language = og_article.get('language', None)

    if data_ok(og_language) and instance.language_id is None:
        instance.language = models.Language.get_by_code(og_language)
        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s language to %s.', instance_name,
                        instance_id, instance.language)

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––——— Tags

    og_tags = og_article.get('article__tag', None)

    if data_ok(og_tags):

        if not isinstance(og_tags, list):
            og_tags = [og_tags]

        if og_tags and not instance.tags.exists():
            instance.tags.add(
                *models.SimpleTag.get_tags_set(og_tags, origin=instance))

            if verbose:
                LOGGER.info(u'opengraph: set %s %s tag(s) to %s.',
                            instance_name, instance_id, u', '.join(og_tags))

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Front image

    og_image = og_article.get('image', None)

    if data_ok(og_image) and not data_ok(instance.image_url):

        if isinstance(og_image, list):
            instance.image_url = clean_url(og_image[0])

        else:
            instance.image_url = clean_url(og_image)

        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s image_url to %s.',
                        instance_name, instance_id, instance.image_url)

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––——————— Beer

    if needs_commit and commit:
        # As we changed only fields that were previously
        # unset, no need to waste a version.
        instance.save_without_historical_record()
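A minimal sketch of the ``erikriver/opengraph`` calls used above (``OpenGraph(html=…)``, ``is_valid()`` and ``get()``); the HTML snippet is illustrative only and may not carry enough Open Graph data for ``is_valid()`` to return ``True``:

import opengraph

og = opengraph.OpenGraph(html=u'<html><head>'
                              u'<meta property="og:title" content="Hello" />'
                              u'<meta property="og:type" content="article" />'
                              u'</head></html>')

if og.is_valid():
    title = og.get('title', None)
    og_type = og.get('type', None)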
Example #20
def create_feeds_from_url(feed_url, creator=None, recurse=True):
    """ Return a list of one or more tuple(s) ``(feed, created)``,
        from a given URL.

        If the URL given is an RSS/Atom URL, the method will create a feed
        (if not already in the database), and will return it associated
        with the ``created`` boolean, indicating whether it was just created.
        For consistency, the tuple will be returned in a list, so that this
        method *always* returns a list of tuples.

        If the URL is a simple website one, it will be opened and parsed
        to discover any RSS/Atom feeds referenced in the page headers,
        and the method will return a list of tuples.

        .. todo:: parse the content body to find any RSS/Atom feeds inside.
            Will make it easy to parse http://www.bbc.co.uk/news/10628494

        :param creator: a :class:`User` that will be set as the feed(s)
            creator. This will eventually allow giving achievements to
            users, or on the contrary banning them if they pollute the DB.

        :param recurse: In case of a simple web URL, this method will be
            called recursively. Subsequent calls will be non-recursive
            by default. You can consider this argument to be "internal".
    """

    feed_url = prepare_feed_url(feed_url)

    try:
        feed = RssAtomFeed.objects.get(url=feed_url)

    except RssAtomFeed.DoesNotExist:
        # We will create it now.
        pass

    else:
        # Get the right one for the user subscription.
        if feed.duplicate_of_id:
            return [(feed.duplicate_of, False)]

        else:
            return [(feed, False)]

    http_logger = HttpResponseLogProcessor()
    parsed_feed = feedparser.parse(feed_url, handlers=[http_logger])
    feed_status = http_logger.log[-1]['status']

    # Stop on HTTP errors before stopping on feedparser errors,
    # because feedparser is much more lenient in many conditions.
    if feed_status in (400, 401, 402, 403, 404, 500, 502, 503):
        raise FeedFetchException(u'Error {0} when fetching feed {1}'.format(
            feed_status, feed_url))

    try:
        check_feedparser_error(parsed_feed)

    except FeedIsHtmlPageException:
        if recurse:
            new_feeds = []
            urls_to_try = set(parse_feeds_urls(parsed_feed))

            for sub_url in urls_to_try:
                try:
                    new_feeds += create_feeds_from_url(sub_url,
                                                       creator=creator,
                                                       recurse=False)

                except FeedIsHtmlPageException:
                    # We don't warn for every URL we find,
                    # most of them are CSS/JS/whatever ones.
                    pass

                except Exception:
                    LOGGER.exception(u'Could not create a feed from '
                                     u'recursed url {0} (from {1})'.format(
                                         sub_url, feed_url))

            if new_feeds:
                # LOGGER.info(u'Returning %s created feeds.', len(new_feeds))
                return new_feeds

            # Just before giving up, try a little more with newspaper.
            # As it is quite slow, do it in the background.
            discover_feeds_urls.delay(feed_url)

            raise

        else:
            raise

    except Exception as e:
        raise Exception(u'Unparsable feed {0}: {1}'.format(feed_url, e))

    else:
        # Wow. FeedParser creates a <anything>.feed . Impressive.
        fp_feed = parsed_feed.feed
        website = WebSite.get_from_url(clean_url(fp_feed.get('link',
                                                             feed_url)))

        defaults = {
            'name': fp_feed.get('title', u'Feed from {0}'.format(feed_url)),
            'is_good': True,
            # Try the RSS description, then the Atom subtitle.
            'description_en': fp_feed.get('description',
                                          fp_feed.get('subtitle', u'')),
            'website': website,
        }

        new_feed, created = RssAtomFeed.objects.get_or_create(
            url=feed_url, defaults=defaults)

        if created:
            new_feed.user = creator
            new_feed.save()

        return [(new_feed, created)]
Example #21
        if replace_newlines:
            for repl_src in re.findall(ur'[[][^]]+[]][(]', content):

                # In link text, we replace newlines with spaces.
                repl_dst = repl_src.replace(u'\n', u' ')
                content = content.replace(repl_src, repl_dst)

        for repl_src in re.findall(ur'[]][(][^)]+[)]', content):

            if replace_newlines:
                # In link URLs, we just cut out newlines.
                repl_dst = repl_src.replace(u'\n', u'')
            else:
                repl_dst = repl_src

            repl_dst = clean_url(insert_website(repl_dst))
            content = content.replace(repl_src, repl_dst)

        if test:
            return content

        else:
            # Everything went OK. Put back the content where it belongs.
            self.content = content

            if replace_newlines:
                self.content_type = CONTENT_TYPES.MARKDOWN

            # Disabled until more love is put inside.
            # self.find_image(commit=False, force=force)
Example #22
File: url.py Project: 1flow/1flow
    def absolutize_url(self, requests_response=None, force=False, commit=True):
        """ Make the current article URL absolute.

        Eg. transform:

        http://feedproxy.google.com/~r/francaistechcrunch/~3/hEIhLwVyEEI/

        into:

        http://techcrunch.com/2013/05/18/hell-no-tumblr-users-wont-go-to-yahoo/ # NOQA
            ?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+francaistechcrunch+%28TechCrunch+en+Francais%29 # NOQA

        and then remove all these F*G utm_* parameters to get a clean
        final URL for the current article.

        Returns ``True`` if the operation succeeded, ``False`` if the
        absolutization pointed out that the current article is a
        duplicate of another. In this case the caller should stop its
        processing because the current article will be marked for deletion.

        Can also return ``None`` if absolutizing is disabled globally
        in ``constance`` configuration.
        """

        # Another example: http://rss.lefigaro.fr/~r/lefigaro/laune/~3/7jgyrQ-PmBA/story01.htm # NOQA

        if self.absolutize_url_must_abort(force=force, commit=commit):
            return

        if requests_response is None:
            try:
                requests_response = requests.get(self.url)

            except requests.ConnectionError as e:
                statsd.gauge('articles.counts.url_errors', 1, delta=True)
                message = u'Connection error while absolutizing “%s”: %s'
                args = (self.url, str(e), )

                self.url_error = message % args
                # Don't waste a version just for that.
                self.save_without_historical_record()

                LOGGER.error(message, *args)
                return

        if not requests_response.ok or requests_response.status_code != 200:

            message = u'HTTP Error %s while absolutizing “%s”: %s'
            args = (
                requests_response.status_code,
                requests_response.url,
                requests_response.reason
            )

            with statsd.pipeline() as spipe:
                spipe.gauge('articles.counts.url_errors', 1, delta=True)

                if requests_response.status_code in (404, ):
                    self.is_orphaned = True

                    # This is not handled by the post_save()
                    # which acts only at article creation.
                    spipe.gauge('articles.counts.orphaned', 1, delta=True)

            self.url_error = message % args

            # Don't waste a version just for that.
            self.save_without_historical_record()

            LOGGER.error(message, *args)
            return

        #
        # NOTE: we could also get it eventually from r.headers['link'],
        #       which contains '<another_url>'. We need to strip out
        #       the '<>', and re-absolutize this link, because in the
        #       example it's another redirector. Also r.links is a good
        #       candidate but in the example I used, it contains the
        #       shortlink, which must be re-resolved too.
        #
        #       So: as we already are at the final address *now*, there is
        #       no need to bother re-following another one, which would
        #       lead us to the same final place.
        #

        final_url = clean_url(requests_response.url)

        # LOGGER.info(u'\n\nFINAL: %s vs. ORIG: %s\n\n', final_url, self.url)

        if final_url != self.url:

            # Just for displaying purposes, see below.
            old_url = self.url

            if self.url_error:
                statsd.gauge('articles.counts.url_errors', -1, delta=True)

            # Even if we are a duplicate, we came until here and everything
            # went fine. We won't need to lookup again the absolute URL.
            statsd.gauge('articles.counts.absolutes', 1, delta=True)
            self.url_absolute = True
            self.url_error = None

            self.url = final_url

            try:
                if self.name.endswith(old_url):
                    self.name = self.name.replace(old_url, final_url)
            except Exception:
                LOGGER.exception(u'Could not replace URL in name of %s #%s',
                                 self._meta.model.__name__, self.id)

            duplicate = False

            with transaction.atomic():
                # Without the atomic() block, saving the current article
                # (being a duplicate) will trigger the IntegrityError,
                # but will render the current SQL context unusable, unable
                # to register duplicate, potentially leading to massive
                # inconsistencies in the caller's context.
                try:
                    # Don't waste a version just for that.
                    self.save_without_historical_record()

                except IntegrityError:
                    duplicate = True

            if duplicate:
                params = {
                    '%s___url' % self._meta.model.__name__: final_url
                }
                original = BaseItem.objects.get(**params)

                # Just to display the right “old” one in logs.
                self.url = old_url

                LOGGER.info(u'%s #%s is a duplicate of #%s, '
                            u'registering as such.',
                            self._meta.model.__name__, self.id, original.id)

                original.register_duplicate(self)
                return False

            # Any other exception will raise. This is intentional.
            else:
                LOGGER.info(u'URL of %s (#%s) successfully absolutized '
                            u'from %s to %s.', self._meta.model.__name__,
                            self.id, old_url, final_url)

        else:
            # Don't do the job twice.
            if self.url_error:
                statsd.gauge('articles.counts.url_errors', -1, delta=True)

            statsd.gauge('articles.counts.absolutes', 1, delta=True)
            self.url_absolute = True
            self.url_error = None

            # Don't waste a version just for that.
            self.save_without_historical_record()

        return True
Example #23
def process(self, instance, verbose=True, commit=True, **kwargs):
    """ See source code. """

    # from https://github.com/erikriver/opengraph
    # site_name       => YouTube
    # description     => Eric Clapton and Paul McCartney perform George Harrison's "While My Guitar Gently Weeps" at the...  # NOQA
    # title           => While My Guitar Gently Weeps
    # url             => http://www.youtube.com/watch?v=q3ixBmDzylQ
    # image           => http://i2.ytimg.com/vi/q3ixBmDzylQ/default.jpg
    # video:type      => application/x-shockwave-flash
    # video:height    => 224
    # video           => http://www.youtube.com/v/q3ixBmDzylQ?version=3&autohide=1  # NOQA
    # video:width     => 398
    # type            => video

    instance_name = instance._meta.verbose_name
    instance_id = instance.id

    try:
        og_article = opengraph.OpenGraph(html=instance.content)

    except:
        # Not worth a round trip to sentry in most cases.
        # A warning will suffice. Developers can still debug
        # the article manually if wanted.
        LOGGER.warning(u'opengraph: parsing %s %s failed, aborting.',
                       instance_name, instance_id)
        return

    if not og_article.is_valid():
        LOGGER.warning(u'opengraph: invalid OpenGraph data in %s %s, '
                       u'aborting.', instance_name, instance_id)
        return

    needs_commit = False

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Title

    name_needs_extraction = get_processor_by_slug(
        '1fs-article-title-extract-accept-conditions').accepts(
            instance, verbose=verbose, commit=commit, **kwargs)

    if data_ok(og_article.title) and name_needs_extraction:
        if isinstance(og_article.title, list):
            # Cf. http://blog.dbth.fr/2015/03/la-liberte-de-fermer-ta-gueule-ou-du-sexisme-dans-la-musique/  # NOQA
            instance.name = og_article.title[0]

        else:
            instance.name = og_article.title

        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s name to “%s”.',
                        instance_name, instance_id, instance.name)

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Date published
    # http://ogp.me/#type_article
    #
    # article:published_time - datetime - When the article was first published.
    # article:modified_time - datetime - When the article was last changed.
    # article:expiration_time - datetime - When the article is out of date after.  # NOQA
    # article:author - profile array - Writers of the article.
    # article:section - string - A high-level section name. E.g. Technology
    # article:tag - string array - Tag words associated with this article.
    #
    # http://ogp.me/#type_profile (for author)

    og_pub_time = og_article.get('article__published_time', None)

    if instance.date_published is None and data_ok(og_pub_time):

        parsed_datetime = datetime_extended_parser(og_pub_time)

        if parsed_datetime is None:
            LOGGER.warning(u'OpenGraph article:published_time “%s” is '
                           u'unparseable.', og_pub_time)

        else:
            date_published = datetime(*parsed_datetime[:6])

            instance.date_published = date_published
            needs_commit = True
            LOGGER.info(u'opengraph: set %s %s published date.',
                        instance_name, instance_id)

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Description

    og_description = og_article.get('description', None)

    if data_ok(og_description) and not data_ok(instance.excerpt):
        instance.excerpt = og_description
        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s excerpt.',
                        instance_name, instance_id)

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––—–––––––––– Authors

    #
    # TODO
    #

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––——— Language

    og_language = og_article.get('language', None)

    if data_ok(og_language) and instance.language_id is None:
        instance.language = models.Language.get_by_code(og_language)
        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s language to %s.',
                        instance_name, instance_id, instance.language)

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––——— Tags

    og_tags = og_article.get('article__tag', None)

    if data_ok(og_tags):

        if not isinstance(og_tags, list):
            og_tags = [og_tags]

        if og_tags and not instance.tags.exists():
            instance.tags.add(*models.SimpleTag.get_tags_set(og_tags,
                              origin=instance))

            if verbose:
                LOGGER.info(u'opengraph: set %s %s tag(s) to %s.',
                            instance_name, instance_id, u', '.join(og_tags))

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Front image

    og_image = og_article.get('image', None)

    if data_ok(og_image) and not data_ok(instance.image_url):

        if isinstance(og_image, list):
            instance.image_url = clean_url(og_image[0])

        else:
            instance.image_url = clean_url(og_image)

        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s image_url to %s.',
                        instance_name, instance_id, instance.image_url)

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––——————— Beer

    if needs_commit and commit:
        # As we changed only fields that were previously
        # unset, no need to waste a version.
        instance.save_without_historical_record()
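
For reference, here is a minimal sketch of what the opengraph parser used
above gives back, restricted to the calls this processor already relies on
(OpenGraph(html=...), is_valid() and dict-style get()); the HTML snippet and
the printed values are illustrative only:

import opengraph

HTML = u"""
<html><head>
  <meta property="og:title" content="While My Guitar Gently Weeps" />
  <meta property="og:type" content="video" />
  <meta property="og:url" content="http://www.youtube.com/watch?v=q3ixBmDzylQ" />
  <meta property="og:image" content="http://i2.ytimg.com/vi/q3ixBmDzylQ/default.jpg" />
  <meta property="og:description" content="Eric Clapton and Paul McCartney perform..." />
</head><body></body></html>
"""

og = opengraph.OpenGraph(html=HTML)

if og.is_valid():
    # The parser behaves like a dict keyed without the 'og:' prefix,
    # which is why process() can call .get() with a default of None.
    print(og.get('title'))        # 'While My Guitar Gently Weeps'
    print(og.get('description'))  # 'Eric Clapton and Paul McCartney perform...'
    print(og.get('language'))     # None: the property is absent from this page
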
def process(self, instance, parameters=None, commit=True, **kwargs):
    """ See source code. """

    instance_name = instance._meta.verbose_name
    instance_id = instance.id

    urls = URL_MATCH_REGEX.findall(instance.content)

    if not urls:
        LOGGER.info(u'url-crawler: nothing to crawl in %s %s.', instance_name,
                    instance_id)
        return

    # Start with EasyList
    adblock_rules_list = requests_get(
        # WARNING: do not .split() with no parameters, else
        # adblock will block everything due to empty rules.
        'https://easylist-downloads.adblockplus.org/easylist.txt').split('\n')

    # Append our own specific exclusions, if any
    adblock_rules_list.extend(
        parameters.get('integration', {}).get('fetch_content_urls',
                                              {}).get('adblock_rules', []))

    if re2 is None:
        # Things will be dog-slow…
        adblock_rules = AdblockRules(
            adblock_rules_list,
            max_mem=config.PROCESSING_ADBLOCK_MAXIMUM_MEMORY)

    else:
        # Things will go faster
        adblock_rules = AdblockRules(
            adblock_rules_list,
            use_re2=True,
            max_mem=config.PROCESSING_ADBLOCK_MAXIMUM_MEMORY)

    if isinstance(instance, models.Email):
        origin = models.ORIGINS.EMAIL

        # NOTE: there will be at least one here, else
        # accepts() would have rejected the email.
        feeds = instance.feeds.exclude(
            MailFeed___match_action=MAIL_MATCH_ACTIONS.STORE)

    else:
        origin = models.ORIGINS.CRAWLING
        feeds = instance.feeds.all()

    dupes = 0
    blocked = 0

    # LOGGER.debug('URLS: %s %s', len(urls), urls)

    for url in urls:
        if url.startswith('('):
            url = url[1:]

            if url.endswith(')'):
                # Skip Markdown's enclosing parenthesis
                # that we explicitly matched manually.
                url = url[:-1]

            # In case we've got garbage at the end of the RE match.
            splitted = url.split(')')

            if len(splitted) == 1:
                # Clean URL, nothing to strip.
                pass

            elif len(splitted) == 2 and len(splitted[1]) < 4:
                # Highly probable that we got some garbage at the end.
                url = splitted[0]

            else:
                LOGGER.error(
                    u'url-crawler: probable nasty unhandled '
                    u'URL “%s” too-greedily matched by RE.', url)

        if adblock_rules.should_block(url):
            LOGGER.info(u'url-crawler: URL %s skipped, matched by adblock '
                        u'rules.', url)
            blocked += 1
            continue

        LOGGER.info('url-crawler: importing from %s.', url)

        try:
            item, created = create_item_from_url(
                url=clean_url(url),
                feeds=feeds,
                origin=origin,
            )

        except:
            LOGGER.exception(u'Could not create item from URL “%s”', url)

        else:
            if created:
                LOGGER.info(
                    u'url-crawler: successfully imported %s from '
                    u'%s %s.', item, instance_name, instance_id)

            else:
                dupes += 1
                LOGGER.warning(u'url-crawler: %s already in database.', item)

            # Link the newly created item to the item it was found in.
            item.sources.add(instance)

    LOGGER.info(u'url-crawler: crawled %s items (%s new) from %s %s.',
                len(urls) - blocked,
                len(urls) - blocked - dupes, instance_name, instance_id)
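
As a standalone illustration of the adblock filtering step, here is a minimal
sketch using the adblockparser package that provides the AdblockRules class
used above, and plain requests instead of the project's requests_get helper;
the candidate URLs and the memory cap are illustrative stand-ins:

import requests
from adblockparser import AdblockRules

easylist = requests.get(
    'https://easylist-downloads.adblockplus.org/easylist.txt').text

# Split on newlines only, as the WARNING in the processor above insists.
# The memory cap stands in for config.PROCESSING_ADBLOCK_MAXIMUM_MEMORY.
rules = AdblockRules(easylist.split('\n'), max_mem=512 * 1024 * 1024)

candidates = [
    'http://example.com/2015/03/some-article/',    # illustrative
    'http://ads.example.com/banner?size=300x250',  # illustrative ad-like URL
]

for url in candidates:
    if rules.should_block(url):
        continue

    # This is where the processor would call create_item_from_url()
    # with clean_url(url), the feeds and the origin.
    print(url)
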