Exemple #1
0
    def repair_missing_authors_migration_201411(cls):
        """ Re-run original-data post-processing on articles without authors.

        Selects the articles created after 2014-10-31 that have no author,
        and calls ``postprocess_original_data(force=True)`` on each of them,
        inside a ``benchmark()`` context.
        """

        # from oneflow.core.tasks.migration import vacuum_analyze

        # Only articles with no author at all, created after October 31st 2014.
        articles = Article.objects.filter(
            authors=None,
            date_created__gt=datetime(2014, 10, 31))

        count = articles.count()

        # Number of articles processed so far.
        # NOTE(review): `done` is never read in the visible code, besides
        # the disabled periodic vacuum below — confirm it is still needed.
        done = 0

        LOGGER.info(u'Starting repairing %s missing authors @%s', count, now())

        with benchmark(u'Fix missing authors on rel-DB fetched content…'):

            for article in articles:
                article.postprocess_original_data(force=True)

                # Disabled: periodic database vacuum every 25000 articles.
                # if done % 25000 == 0:
                #     vacuum_analyze()

                done += 1
Exemple #2
0
def long_in_the_past():
    """ Build a reference date that predates the existence of 1flow.

    :returns: the naive datetime for January 1st, 2007 at midnight.
    """

    year, month, day = 2007, 1, 1

    return datetime(year, month, day)
Exemple #3
0
def process(self, instance, verbose=True, commit=True, **kwargs):
    """ Fill still-unset fields of `instance` from its OpenGraph metadata.

    The OpenGraph data is parsed from ``instance.content``. When parsing
    fails or the data is invalid, a warning is logged and nothing else
    happens. Otherwise the name (only if the title accept-conditions
    processor agrees), publication date, excerpt, language, tags and
    front image URL are set, each one only if it is currently empty.

    :param instance: the item to enrich.
    :param verbose: log an info message for every field that gets set.
    :param commit: save the instance (without creating an historical
        record) when at least one plain field was changed. Tags are
        added to ``instance.tags`` immediately, whatever this value.
    :param kwargs: forwarded to the title accept-conditions processor.
    :returns: ``None`` in every case.
    """

    # Sample output, from https://github.com/erikriver/opengraph
    # site_name       => YouTube
    # description     => Eric Clapton and Paul McCartney perform George Harrison's "While My Guitar Gently Weeps" at the...  # NOQA
    # title           => While My Guitar Gently Weeps
    # url             => http://www.youtube.com/watch?v=q3ixBmDzylQ
    # image           => http://i2.ytimg.com/vi/q3ixBmDzylQ/default.jpg
    # video:type      => application/x-shockwave-flash
    # video:height    => 224
    # video           => http://www.youtube.com/v/q3ixBmDzylQ?version=3&autohide=1  # NOQA
    # video:width     => 398
    # type            => video

    instance_name = instance._meta.verbose_name
    instance_id = instance.id

    try:
        og_article = opengraph.OpenGraph(html=instance.content)

    except Exception:
        # `Exception` and not a bare `except:`, to let SystemExit and
        # KeyboardInterrupt go through.
        #
        # Not worth a round trip to sentry in most cases.
        # A warning will suffice. Developers can still debug
        # the article manually if wanted.
        LOGGER.warning(u'opengraph: parsing %s %s failed, aborting.',
                       instance_name, instance_id)
        return

    if not og_article.is_valid():
        LOGGER.warning(u'opengraph: invalid OpenGraph data in %s %s, '
                       u'aborting.', instance_name, instance_id)
        return

    # Becomes True as soon as a plain field of the instance is modified.
    needs_commit = False

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Title

    # The decision to (re-)extract the title is delegated
    # to a dedicated accept-conditions processor.
    name_needs_extraction = get_processor_by_slug(
        '1fs-article-title-extract-accept-conditions').accepts(
            instance, verbose=verbose, commit=commit, **kwargs)

    if data_ok(og_article.title) and name_needs_extraction:
        if isinstance(og_article.title, list):
            # Multiple titles can show up; keep the first one.
            # Cf. http://blog.dbth.fr/2015/03/la-liberte-de-fermer-ta-gueule-ou-du-sexisme-dans-la-musique/  # NOQA
            instance.name = og_article.title[0]

        else:
            instance.name = og_article.title

        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s name to “%s”.',
                        instance_name, instance_id, instance.name)

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Date published
    # http://ogp.me/#type_article
    #
    # article:published_time - datetime - When the article was first published.
    # article:modified_time - datetime - When the article was last changed.
    # article:expiration_time - datetime - When the article is out of date after.  # NOQA
    # article:author - profile array - Writers of the article.
    # article:section - string - A high-level section name. E.g. Technology
    # article:tag - string array - Tag words associated with this article.
    #
    # http://ogp.me/#type_profile (for author)

    og_pub_time = og_article.get('article__published_time', None)

    if instance.date_published is None and data_ok(og_pub_time):

        parsed_datetime = datetime_extended_parser(og_pub_time)

        if parsed_datetime is None:
            LOGGER.warning(u'OpenGraph article:published_time “%s” is '
                           u'unparseable.', og_pub_time)

        else:
            # Only the six first members (year → second) are used.
            # NOTE(review): the resulting datetime carries no timezone
            # information — confirm this is what the model expects.
            instance.date_published = datetime(*parsed_datetime[:6])
            needs_commit = True

            # Honor `verbose` here too, like every other info message.
            if verbose:
                LOGGER.info(u'opengraph: set %s %s published date.',
                            instance_name, instance_id)

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Description

    og_description = og_article.get('description', None)

    if data_ok(og_description) and not data_ok(instance.excerpt):
        instance.excerpt = og_description
        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s excerpt.',
                        instance_name, instance_id)

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––—–––––––––– Authors

    #
    # TODO
    #

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––——— Language

    og_language = og_article.get('language', None)

    if data_ok(og_language) and instance.language_id is None:
        instance.language = models.Language.get_by_code(og_language)
        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s language to %s.',
                        instance_name, instance_id, instance.language)

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––——— Tags

    og_tags = og_article.get('article__tag', None)

    if data_ok(og_tags):

        # A single tag comes as a bare value; normalize to a list.
        if not isinstance(og_tags, list):
            og_tags = [og_tags]

        if og_tags and not instance.tags.exists():
            # The relation is updated right away: `needs_commit`
            # is left untouched and `commit` is not consulted.
            instance.tags.add(*models.SimpleTag.get_tags_set(og_tags,
                              origin=instance))

            if verbose:
                LOGGER.info(u'opengraph: set %s %s tag(s) to %s.',
                            instance_name, instance_id, u', '.join(og_tags))

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Front image

    og_image = og_article.get('image', None)

    if data_ok(og_image) and not data_ok(instance.image_url):

        if isinstance(og_image, list):
            # Multiple images: keep the first one.
            instance.image_url = clean_url(og_image[0])

        else:
            instance.image_url = clean_url(og_image)

        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s image_url to %s.',
                        instance_name, instance_id, instance.image_url)

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––——————— Beer

    if needs_commit and commit:
        # As we changed only fields that were previously
        # unset, no need to waste a version.
        instance.save_without_historical_record()
Exemple #4
0
    DjangoUser as User,
    READ_STATUS_DATA,
    WATCH_ATTRIBUTES_FIELDS_NAMES,
)

from folder import Folder

# Avoid import loop.
# from subscription import Subscription, generic_check_subscriptions_method

from tag import AbstractTaggedModel, SimpleTag as Tag
from item import BaseItem, Poke

# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)

# NOTE(review): presumably the cut-off date of the November 2014
# migration — confirm against the code that reads this constant.
MIGRATION_DATETIME = datetime(2014, 11, 1)

# Names exported by a star-import of this module.
__all__ = [
    'Read',
    'ReadManager',
    'ReadQuerySet',
    'BOOKMARK_TYPES',
]

BOOKMARK_TYPES = NamedTupleChoices(
    'BOOKMARK_TYPES',
    ('UNDEFINED', u'U', _(u'Undefined')),
    ('AFTERNOON', u'A', _(u'This afternoon')),
    ('WEEK_END', u'W', _(u'This week-end')),

    # The second char will be used for user defined bookmark types.
Exemple #5
0
from django.conf import settings
from django.core.mail import mail_admins

from ..models.nonrel import (RATINGS, Article, Feed, Subscription, Read, User
                             as MongoUser)

from ..gr_import import GoogleReaderImport

from oneflow.base.utils.dateutils import (now, ftstamp, datetime, naturaldelta,
                                          naturaltime)

from common import User

# We don't fetch articles too far in the past, even if google has them.
GR_OLDEST_DATE = datetime(2008, 1, 1)

LOGGER = logging.getLogger(__name__)


def get_user_from_dbs(user_id):
    """ Return the Django user `user_id` and its MongoDB counterpart.

    The MongoDB user document is upserted on the Django user ID before
    being fetched.

    :param user_id: the ID of the Django user.
    :returns: a ``(django_user, mongo_user)`` tuple.
    """

    django_user = User.objects.get(id=user_id)
    django_id = django_user.id

    # Upsert on the Django ID before fetching the document.
    MongoUser.objects(django_user=django_id).update_one(
        upsert=True, set__django_user=django_id)

    mongo_user = MongoUser.objects.get(django_user=django_id)

    return django_user, mongo_user


def import_google_reader_trigger(user_id, refresh=False):
    """ This function allow to trigger the celery task from anywhere.
        just pass it a user ID. It's called from the views, and we created
Exemple #6
0
def process(self, instance, parameters=None, verbose=True,
            force=False, commit=True, **kwargs):
    """ Extract title, content, date, language… of `instance` via `ftr`.

    Runs ``ftr.process()`` on the instance content, then copies every
    attribute that was extracted on purpose (ie. not listed in
    ``extractor.failures``) to the instance. Unless `force` is set,
    only still-unset fields are filled, and the content is replaced
    only if it is currently of HTML type.

    While ``ftr.process()`` runs, logging is globally disabled up to
    the WARNING level; it is re-enabled in every case afterwards.

    :param instance: the item to process.
    :param parameters: optional dict; ``parameters['metadata']['siteconfig']``
        can hold a custom siteconfig text overriding the default lookup.
    :param verbose: log an info message for every field that gets set.
    :param force: overwrite already-set fields, and keep an historical
        version when saving.
    :param commit: save the instance if at least one field was changed.
    :returns: ``None`` in every case, including early aborts (unusable
        custom siteconfig, ``ftr.SiteConfigException``).
    """

    # Get an eventual siteconfig override from parameters. If `None`,
    # the `process()` wrapper will fetch it from repositories as usual.
    # `parameters` itself defaults to `None`: don't crash on it.
    metadata = (parameters or {}).get('metadata', None) or {}
    siteconfig_string = metadata.get('siteconfig', None)

    if siteconfig_string is None:
        siteconfig = None

    else:
        try:
            siteconfig = ftr.SiteConfig(site_config_text=siteconfig_string)

        except Exception:
            LOGGER.exception(
                u'ftr-extractor: unusable custom siteconfig, aborting.')
            # TODO: mail admins…
            return

        if verbose:
            LOGGER.info(u'ftr-extractor: custom siteconfig override loaded.')

    # FTR logs a lot, and it's useless if not in verbose / debug mode,
    # because siteconfigs are debugged at the lower-level, not from the
    # 1flow processor.
    logging.disable(logging.WARNING)

    try:
        try:
            # Note: in case of multiple-pages article, this should
            # bring us ALL the content, concatenated in one page.
            extractor = ftr.process(
                # HEADS UP: Email items have no `.url` field.
                url=getattr(instance, 'url', None),
                content=instance.content,
                config=siteconfig
            )

        except ftr.SiteConfigException:
            # No configuration for the website or syntax
            # error in siteconfig. Bail out, another
            # processor will take care of this article.
            return
    finally:
        # Re-enable logging, whatever happened above.
        logging.disable(logging.NOTSET)

    instance_name = instance._meta.verbose_name
    instance_id = instance.id
    needs_save = False

    # General processing note:
    # We use extracted attributes only if they were the result of an
    # intended extraction. If they come from automatic extraction after
    # a failure, we discard them.

    # ——————————————————————————————————————————————————————————————————— Title

    if (extractor.title is not None and 'title' not in extractor.failures) and (
            force or instance.name is None):
        instance.name = extractor.title
        needs_save = True

        if verbose:
            LOGGER.info(u'ftr-extractor: Set %s %s title to “%s”.',
                        instance_name, instance_id, instance.name)

    # —————————————————————————————————————————————————————————— Body / content

    # Compare by value: identity (`is`) on a constant is not reliable.
    if (extractor.body is not None and 'body' not in extractor.failures) and (
            force or instance.content_type == models.CONTENT_TYPES.HTML):
        instance.content = extractor.body
        instance.content_type = (
            # `prune` option already produces
            # cleaned HTML and is usually sufficient.
            models.CONTENT_TYPES.CLEANED_HTML
            if extractor.config.prune
            else models.CONTENT_TYPES.HTML
        )

        needs_save = True

        if verbose:
            LOGGER.info(u'ftr-extractor: Set %s %s content to %s of '
                        u'genuine %s.', instance_name, instance_id,
                        naturalsize(len(instance.content)),
                        models.CONTENT_TYPES.symbolic(instance.content_type))

    # ———————————————————————————————————————————————————————— Multi-pages URLs

    # HEADS UP: getattr(…) because Email items have no `.pages_urls` field.
    if (bool(extractor.next_page_link)
        and 'next_page_link' not in extractor.failures) and (
            force or getattr(instance, 'pages_urls', None) in (None, [])):
        instance.pages_urls = extractor.next_page_link
        needs_save = True

        if verbose:
            LOGGER.info(u'ftr-extractor: Recorded %s multi-pages URLs '
                        u'for %s %s.', len(extractor.next_page_link),
                        instance_name, instance_id)

    # —————————————————————————————————————————————————————————— Date published

    # The date has its own entry in `failures`; testing 'title'
    # here was a copy-paste leftover.
    if (extractor.date is not None and 'date' not in extractor.failures) and (
            force or instance.date_published is None):
        try:
            # The six first members (year → second) must be unpacked:
            # passing the whole sequence as first argument always fails.
            the_datetime = datetime(
                *datetime_extended_parser(extractor.date)[:6])

        except Exception:
            # Still best-effort (an unparseable date must not abort the
            # other extractions), but don't stay silent about it.
            LOGGER.warning(u'ftr-extractor: unparseable date “%s” '
                           u'in %s %s.', extractor.date,
                           instance_name, instance_id)

        else:
            if is_naive(the_datetime):
                the_datetime = make_aware(the_datetime, utc)

            instance.date_published = the_datetime
            needs_save = True

            if verbose:
                LOGGER.info(u'ftr-extractor: Set %s %s date to %s.',
                            instance_name, instance_id, instance.date_published)

    # ———————————————————————————————————————————————————————————————— Language

    if (extractor.language is not None
        and 'language' not in extractor.failures) and (
            force or instance.language_id is None):
        instance.language = models.Language.get_by_code(extractor.language)
        needs_save = True

        if verbose:
            LOGGER.info(u'ftr-extractor: Set %s %s language to %s.',
                        instance_name, instance_id, instance.language)

    # ——————————————————————————————————————————————————————————————— Author(s)

    if (bool(extractor.author) and 'author' not in extractor.failures) and (
            force or not instance.authors.exists()):

        # NOTE(review): `needs_save` is not raised here; presumably the
        # helper links the authors to the instance by itself — confirm.
        authors = models.Author.get_authors_from_name_emails_and_article(
            authors=[{'name': name} for name in extractor.author],
            origin_article=instance
        )

        if verbose:
            LOGGER.info(u'ftr-extractor: Set %s %s author(s) to %s.',
                        instance_name, instance_id,
                        u', '.join(unicode(a) for a in authors))

    if needs_save and commit:
        # If the processing was forced, we consider keeping the previous
        # version is a safe practice, to be able to eventually go back.
        # Else, just don't clutter the system with another version.

        if force:
            instance.save()

        else:
            instance.save_without_historical_record()
Exemple #7
0
def process(self, instance, verbose=True, commit=True, **kwargs):
    """ Fill still-unset fields of `instance` from its OpenGraph metadata.

    The OpenGraph data is parsed from ``instance.content``. When parsing
    fails or the data is invalid, a warning is logged and nothing else
    happens. Otherwise the name (only if the title accept-conditions
    processor agrees), publication date, excerpt, language, tags and
    front image URL are set, each one only if it is currently empty.

    :param instance: the item to enrich.
    :param verbose: log an info message for every field that gets set.
    :param commit: save the instance (without creating an historical
        record) when at least one plain field was changed. Tags are
        added to ``instance.tags`` immediately, whatever this value.
    :param kwargs: forwarded to the title accept-conditions processor.
    :returns: ``None`` in every case.
    """

    # Sample output, from https://github.com/erikriver/opengraph
    # site_name       => YouTube
    # description     => Eric Clapton and Paul McCartney perform George Harrison's "While My Guitar Gently Weeps" at the...  # NOQA
    # title           => While My Guitar Gently Weeps
    # url             => http://www.youtube.com/watch?v=q3ixBmDzylQ
    # image           => http://i2.ytimg.com/vi/q3ixBmDzylQ/default.jpg
    # video:type      => application/x-shockwave-flash
    # video:height    => 224
    # video           => http://www.youtube.com/v/q3ixBmDzylQ?version=3&autohide=1  # NOQA
    # video:width     => 398
    # type            => video

    instance_name = instance._meta.verbose_name
    instance_id = instance.id

    try:
        og_article = opengraph.OpenGraph(html=instance.content)

    except Exception:
        # `Exception` and not a bare `except:`, to let SystemExit and
        # KeyboardInterrupt go through.
        #
        # Not worth a round trip to sentry in most cases.
        # A warning will suffice. Developers can still debug
        # the article manually if wanted.
        LOGGER.warning(u'opengraph: parsing %s %s failed, aborting.',
                       instance_name, instance_id)
        return

    if not og_article.is_valid():
        LOGGER.warning(
            u'opengraph: invalid OpenGraph data in %s %s, '
            u'aborting.', instance_name, instance_id)
        return

    # Becomes True as soon as a plain field of the instance is modified.
    needs_commit = False

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Title

    # The decision to (re-)extract the title is delegated
    # to a dedicated accept-conditions processor.
    name_needs_extraction = get_processor_by_slug(
        '1fs-article-title-extract-accept-conditions').accepts(instance,
                                                               verbose=verbose,
                                                               commit=commit,
                                                               **kwargs)

    if data_ok(og_article.title) and name_needs_extraction:
        if isinstance(og_article.title, list):
            # Multiple titles can show up; keep the first one.
            # Cf. http://blog.dbth.fr/2015/03/la-liberte-de-fermer-ta-gueule-ou-du-sexisme-dans-la-musique/  # NOQA
            instance.name = og_article.title[0]

        else:
            instance.name = og_article.title

        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s name to “%s”.', instance_name,
                        instance_id, instance.name)

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Date published
    # http://ogp.me/#type_article
    #
    # article:published_time - datetime - When the article was first published.
    # article:modified_time - datetime - When the article was last changed.
    # article:expiration_time - datetime - When the article is out of date after.  # NOQA
    # article:author - profile array - Writers of the article.
    # article:section - string - A high-level section name. E.g. Technology
    # article:tag - string array - Tag words associated with this article.
    #
    # http://ogp.me/#type_profile (for author)

    og_pub_time = og_article.get('article__published_time', None)

    if instance.date_published is None and data_ok(og_pub_time):

        parsed_datetime = datetime_extended_parser(og_pub_time)

        if parsed_datetime is None:
            LOGGER.warning(
                u'OpenGraph article:published_time “%s” is '
                u'unparseable.', og_pub_time)

        else:
            # Only the six first members (year → second) are used.
            # NOTE(review): the resulting datetime carries no timezone
            # information — confirm this is what the model expects.
            instance.date_published = datetime(*parsed_datetime[:6])
            needs_commit = True

            # Honor `verbose` here too, like every other info message.
            if verbose:
                LOGGER.info(u'opengraph: set %s %s published date.',
                            instance_name, instance_id)

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Description

    og_description = og_article.get('description', None)

    if data_ok(og_description) and not data_ok(instance.excerpt):
        instance.excerpt = og_description
        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s excerpt.', instance_name,
                        instance_id)

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––—–––––––––– Authors

    #
    # TODO
    #

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––——— Language

    og_language = og_article.get('language', None)

    if data_ok(og_language) and instance.language_id is None:
        instance.language = models.Language.get_by_code(og_language)
        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s language to %s.', instance_name,
                        instance_id, instance.language)

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––——— Tags

    og_tags = og_article.get('article__tag', None)

    if data_ok(og_tags):

        # A single tag comes as a bare value; normalize to a list.
        if not isinstance(og_tags, list):
            og_tags = [og_tags]

        if og_tags and not instance.tags.exists():
            # The relation is updated right away: `needs_commit`
            # is left untouched and `commit` is not consulted.
            instance.tags.add(
                *models.SimpleTag.get_tags_set(og_tags, origin=instance))

            if verbose:
                LOGGER.info(u'opengraph: set %s %s tag(s) to %s.',
                            instance_name, instance_id, u', '.join(og_tags))

    # ––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––– Front image

    og_image = og_article.get('image', None)

    if data_ok(og_image) and not data_ok(instance.image_url):

        if isinstance(og_image, list):
            # Multiple images: keep the first one.
            instance.image_url = clean_url(og_image[0])

        else:
            instance.image_url = clean_url(og_image)

        needs_commit = True

        if verbose:
            LOGGER.info(u'opengraph: set %s %s image_url to %s.',
                        instance_name, instance_id, instance.image_url)

    # –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––——————— Beer

    if needs_commit and commit:
        # As we changed only fields that were previously
        # unset, no need to waste a version.
        instance.save_without_historical_record()
Exemple #8
0
from oneflow.base.utils.dateutils import now, datetime

from ..common import ORIGINS, CONTENT_TYPES
from ..author import Author

from base import (
    BaseItemQuerySet,
    BaseItemManager,
    BaseItem,
    baseitem_create_reads_task,
)


# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)

# NOTE(review): presumably the cut-off date of the November 2014
# migration — confirm against the code that reads this constant.
MIGRATION_DATETIME = datetime(2014, 11, 1)


# Names exported by a star-import of this module.
__all__ = [
    'Tweet',
    'create_tweet_from_id',
    'mark_tweet_deleted',

    # Tasks will be added below by register_task_method().
]


def create_tweet_from_id(tweet_id, feeds=None, origin=None):
    """ From a Tweet ID, create a 1flow tweet via the REST API.

Exemple #9
0
def process(self, instance, parameters=None,
            verbose=True, commit=True, **kwargs):
    """ Enrich `instance` with the schema.org microdata found in its HTML.

    The HTML is the instance content when it is of HTML type, else the
    content of its earliest HTML version in the history. Microdata items
    are walked to collect name, publication date, excerpt, tags, image
    URL, authors, language and word count. Tags and authors are attached
    to the instance right away; other attributes are set only where the
    instance currently holds ``None``. A ``VideoObject`` item switches
    the instance content type to VIDEO.

    :param instance: the item to process.
    :param parameters: unused here (only read by the accepts() code).
    :param verbose: log an info message for everything that gets set.
    :param commit: save the instance if at least one field was changed.
    :returns: ``None`` in every case, including when the microdata
        cannot be extracted (a warning is logged).
    """

    CONTENT_TYPES = models.CONTENT_TYPES

    instance_name = instance._meta.verbose_name
    instance_id = instance.id

    # Only used in accepts() code.
    # repair = parameters.get('repair', False)

    if instance.content_type == CONTENT_TYPES.HTML:
        html_to_work_on = instance.content

    else:
        # The existence of this has already been tested in accepts().
        # We cannot run process() if the instance is not HTML or not
        # repairing it with a known HTML history version.
        html_to_work_on = instance.history.filter(
            content_type=CONTENT_TYPES.HTML).earliest('history_date').content

    try:
        # The microdata parser expects an utf-8 encoded string… too bad.
        items = microdata.get_items(html_to_work_on.encode('utf-8'))

    except Exception:
        LOGGER.warning(u'schema.org-extractor: could not extract microdata '
                       u'from %s %s', instance_name, instance_id)
        return

    need_save = False

    # ————————————————————————————————————————————————————————————————— Extract

    attributes = OrderedDict()

    for item in items:
        schema_properties = item.props

        # LOGGER.info(u'item %s', item.json())

        # Common attributes to all types we handle in 1flow.

        name = schema_properties.get('name', None)

        # Do not overwrite with a less specific value if
        # name was already set via 'Article::headline'.
        # (the test was inverted: the generic name was
        # recorded only when a name was already there.)
        if name is not None and attributes.get('name', None) is None:
            attributes['name'] = get_property(name)

        date_published = schema_properties.get('datePublished', None)

        if date_published is not None:
            attributes['date_published'] = get_property(date_published)

        excerpt = schema_properties.get('description', None)

        if excerpt is not None:
            attributes['excerpt'] = get_property(excerpt)

        tags = schema_properties.get('keywords', None)

        if tags is not None:
            attributes['tags'] = extract_tags(tags)

        image_url = schema_properties.get('thumbnailUrl', None)

        if image_url is not None:
            attributes['image_url'] = get_property(image_url)

        authors = schema_properties.get('author', None)

        # Author can be a link to the author page, which
        # will give us a Person or Organization schema.
        if authors is not None:
            found_authors = get_microdata_authors(authors, instance)
            if found_authors:
                attributes['authors'] = found_authors

        genre = schema_properties.get('genre', None)

        # Genres are recorded as additional tags.
        if genre is not None:
            if 'tags' not in attributes:
                attributes['tags'] = []
            for one_genre in genre:
                attributes['tags'].extend(extract_tags(one_genre))

        if item.type == 'http://schema.org/VideoObject':

            if instance.content_type != CONTENT_TYPES.VIDEO:
                instance.content_type = CONTENT_TYPES.VIDEO
                need_save = True

                if verbose:
                    LOGGER.info(u'schema.org-extractor: Set %s %s content '
                                u'type to VIDEO.', instance_name, instance_id)

        elif item.type in (
            'http://schema.org/Article',
            'http://schema.org/NewsArticle',
            'http://schema.org/TechArticle',

            'http://schema.org/BlogPosting',
            'http://schema.org/WebPage',
            'http://schema.org/CreativeWork',
        ):

            # HeadLine overwrites name, it's more specific. As for the
            # common attributes above, an absent property is skipped and
            # must not wipe what a previous item already gave us.
            for attribute_name, property_name in (
                ('name', 'headline'),
                ('language', 'inLanguage'),
                ('word_count', 'wordCount'),
            ):
                property_value = schema_properties.get(property_name, None)

                if property_value is not None:
                    attributes[attribute_name] = get_property(property_value)

            creators = schema_properties.get('creator', None)

            # Author can be a link to the creator page, which
            # will give us a Person or Organization schema.
            if creators is not None:
                creators = get_microdata_authors(creators, instance)

                if creators:
                    if 'authors' in attributes:
                        attributes['authors'].extend(creators)

                    else:
                        attributes['authors'] = creators

            # TODO:
            # citation
            # comment
            # articleBody → content
            # articleSection → Tags
            #
            # News:
            # dateline → ?
            #
            # Tech:
            # dependencies
            # proficiencyLevel
            #
            # WebPage:
            # specialy → ?
            # significantLink → crawl ?
            # reviewedBy → ?
            # lastReviewed → ?
            # relatedLink → ?
            # primaryImageOfPage

    # —————————————————————————————————————————————————————— Transform & assign
    # turn attributes into their python / 1flow native-internals formats.

    if attributes.get('date_published', None) is not None:
        try:
            attributes['date_published'] = datetime(*datetime_extended_parser(
                attributes['date_published'])[:6])

        except Exception:
            LOGGER.exception(u'schema.org-extractor: unparseable date “%s”',
                             attributes['date_published'])

            # Be sure we don't try to use it below.
            attributes['date_published'] = None

    if attributes.get('language', None) is not None:
        try:
            attributes['language'] = models.Language.get_by_code(
                attributes['language'])

        except Exception:
            LOGGER.exception(u'schema.org-extractor: unable to get '
                             u'language “%s”', attributes['language'])

            # Be sure we don't try to use it below.
            attributes['language'] = None

    if attributes.get('word_count', None) is not None:
        try:
            attributes['word_count'] = int(attributes['word_count'])

        except (TypeError, ValueError):
            # A bogus word count must not abort the whole processing.
            LOGGER.warning(u'schema.org-extractor: invalid word '
                           u'count “%s”', attributes['word_count'])

            # Be sure we don't try to use it below.
            attributes['word_count'] = None

    if attributes.get('tags', None) is not None:
        # We pop() tags to avoid trying to setattr() it below.
        tags = models.SimpleTag.get_tags_set(attributes.pop('tags'),
                                             origin=instance)
        instance.tags.add(*tags)

        if verbose:
            LOGGER.info(u'schema.org-extractor: added tags %s to %s %s.',
                        u', '.join(tag.name for tag in tags),
                        instance_name, instance_id)

    if attributes.get('authors', None) is not None:
        # We pop() authors to avoid trying to setattr() it below.
        authors = attributes.pop('authors')

        # LOGGER.info(authors)

        # This will implicitely add() the author to the instance.
        authors = models.Author.get_authors_from_name_emails_and_article(
            authors, origin_article=instance)

        # LOGGER.info(authors)

        if verbose:
            LOGGER.info(u'schema.org-extractor: added author(s) %s to %s %s.',
                        u', '.join(unicode(a) for a in authors),
                        instance_name, instance_id)

    # if verbose:
    #     LOGGER.debug(u'schema.org-extractor: %s', attributes)

    # Remaining attributes map directly to instance
    # fields; fill only the ones that are still unset.
    for attribute, value in attributes.items():
        if value is None:
            continue

        if getattr(instance, attribute) is None:
            setattr(instance, attribute, value)

            need_save = True

            if verbose:
                LOGGER.info(u'schema.org-extractor: Set %s %s to %s %s.',
                            attribute, value, instance_name, instance_id)

    if need_save and commit:
        instance.save()
Exemple #10
0
def long_in_the_past():
    """ Build a reference date that predates the existence of 1flow.

    :returns: the naive datetime for January 1st, 2007 at midnight.
    """

    year, month, day = 2007, 1, 1

    return datetime(year, month, day)
Exemple #11
0
def process(self,
            instance,
            parameters=None,
            verbose=True,
            force=False,
            commit=True,
            **kwargs):
    """ Extract title, content, date, language… of `instance` via `ftr`.

    Runs ``ftr.process()`` on the instance content, then copies every
    attribute that was extracted on purpose (ie. not listed in
    ``extractor.failures``) to the instance. Unless `force` is set,
    only still-unset fields are filled, and the content is replaced
    only if it is currently of HTML type.

    While ``ftr.process()`` runs, logging is globally disabled up to
    the WARNING level; it is re-enabled in every case afterwards.

    :param instance: the item to process.
    :param parameters: optional dict; ``parameters['metadata']['siteconfig']``
        can hold a custom siteconfig text overriding the default lookup.
    :param verbose: log an info message for every field that gets set.
    :param force: overwrite already-set fields, and keep an historical
        version when saving.
    :param commit: save the instance if at least one field was changed.
    :returns: ``None`` in every case, including early aborts (unusable
        custom siteconfig, ``ftr.SiteConfigException``).
    """

    # Get an eventual siteconfig override from parameters. If `None`,
    # the `process()` wrapper will fetch it from repositories as usual.
    # `parameters` itself defaults to `None`: don't crash on it.
    metadata = (parameters or {}).get('metadata', None) or {}
    siteconfig_string = metadata.get('siteconfig', None)

    if siteconfig_string is None:
        siteconfig = None

    else:
        try:
            siteconfig = ftr.SiteConfig(site_config_text=siteconfig_string)

        except Exception:
            LOGGER.exception(
                u'ftr-extractor: unusable custom siteconfig, aborting.')
            # TODO: mail admins…
            return

        if verbose:
            LOGGER.info(u'ftr-extractor: custom siteconfig override loaded.')

    # FTR logs a lot, and it's useless if not in verbose / debug mode,
    # because siteconfigs are debugged at the lower-level, not from the
    # 1flow processor.
    logging.disable(logging.WARNING)

    try:
        try:
            # Note: in case of multiple-pages article, this should
            # bring us ALL the content, concatenated in one page.
            extractor = ftr.process(
                # HEADS UP: Email items have no `.url` field.
                url=getattr(instance, 'url', None),
                content=instance.content,
                config=siteconfig)

        except ftr.SiteConfigException:
            # No configuration for the website or syntax
            # error in siteconfig. Bail out, another
            # processor will take care of this article.
            return
    finally:
        # Re-enable logging, whatever happened above.
        logging.disable(logging.NOTSET)

    instance_name = instance._meta.verbose_name
    instance_id = instance.id
    needs_save = False

    # General processing note:
    # We use extracted attributes only if they were the result of an
    # intended extraction. If they come from automatic extraction after
    # a failure, we discard them.

    # ——————————————————————————————————————————————————————————————————— Title

    if (extractor.title is not None and 'title'
            not in extractor.failures) and (force or instance.name is None):
        instance.name = extractor.title
        needs_save = True

        if verbose:
            LOGGER.info(u'ftr-extractor: Set %s %s title to “%s”.',
                        instance_name, instance_id, instance.name)

    # —————————————————————————————————————————————————————————— Body / content

    # Compare by value: identity (`is`) on a constant is not reliable.
    if (extractor.body is not None and 'body' not in extractor.failures) and (
            force or instance.content_type == models.CONTENT_TYPES.HTML):
        instance.content = extractor.body
        instance.content_type = (
            # `prune` option already produces
            # cleaned HTML and is usually sufficient.
            models.CONTENT_TYPES.CLEANED_HTML
            if extractor.config.prune else models.CONTENT_TYPES.HTML)

        needs_save = True

        if verbose:
            LOGGER.info(
                u'ftr-extractor: Set %s %s content to %s of '
                u'genuine %s.', instance_name, instance_id,
                naturalsize(len(instance.content)),
                models.CONTENT_TYPES.symbolic(instance.content_type))

    # ———————————————————————————————————————————————————————— Multi-pages URLs

    # HEADS UP: getattr(…) because Email items have no `.pages_urls` field.
    if (bool(extractor.next_page_link)
            and 'next_page_link' not in extractor.failures) and (
                force or getattr(instance, 'pages_urls', None) in (None, [])):
        instance.pages_urls = extractor.next_page_link
        needs_save = True

        if verbose:
            LOGGER.info(
                u'ftr-extractor: Recorded %s multi-pages URLs '
                u'for %s %s.', len(extractor.next_page_link), instance_name,
                instance_id)

    # —————————————————————————————————————————————————————————— Date published

    # The date has its own entry in `failures`; testing 'title'
    # here was a copy-paste leftover.
    if (extractor.date is not None and 'date' not in extractor.failures) and (
            force or instance.date_published is None):
        try:
            # The six first members (year → second) must be unpacked:
            # passing the whole sequence as first argument always fails.
            the_datetime = datetime(
                *datetime_extended_parser(extractor.date)[:6])

        except Exception:
            # Still best-effort (an unparseable date must not abort the
            # other extractions), but don't stay silent about it.
            LOGGER.warning(
                u'ftr-extractor: unparseable date “%s” '
                u'in %s %s.', extractor.date, instance_name, instance_id)

        else:
            if is_naive(the_datetime):
                the_datetime = make_aware(the_datetime, utc)

            instance.date_published = the_datetime
            needs_save = True

            if verbose:
                LOGGER.info(u'ftr-extractor: Set %s %s date to %s.',
                            instance_name, instance_id,
                            instance.date_published)

    # ———————————————————————————————————————————————————————————————— Language

    if (extractor.language is not None and 'language'
            not in extractor.failures) and (force
                                            or instance.language_id is None):
        instance.language = models.Language.get_by_code(extractor.language)
        needs_save = True

        if verbose:
            LOGGER.info(u'ftr-extractor: Set %s %s language to %s.',
                        instance_name, instance_id, instance.language)

    # ——————————————————————————————————————————————————————————————— Author(s)

    if (bool(extractor.author) and 'author' not in extractor.failures) and (
            force or not instance.authors.exists()):

        # NOTE(review): `needs_save` is not raised here; presumably the
        # helper links the authors to the instance by itself — confirm.
        authors = models.Author.get_authors_from_name_emails_and_article(
            authors=[{
                'name': name
            } for name in extractor.author],
            origin_article=instance)

        if verbose:
            LOGGER.info(u'ftr-extractor: Set %s %s author(s) to %s.',
                        instance_name, instance_id,
                        u', '.join(unicode(a) for a in authors))

    if needs_save and commit:
        # If the processing was forced, we consider keeping the previous
        # version is a safe practice, to be able to eventually go back.
        # Else, just don't clutter the system with another version.

        if force:
            instance.save()

        else:
            instance.save_without_historical_record()