Beispiel #1
0
class Event(Cluster):
    """
    A cluster whose members are Articles.

    Events carry an `active` flag; only active events are considered
    as candidates when new articles are clustered (see `cluster`).
    """
    __tablename__ = 'event'
    __members__ = dict(class_name='Article',
                       secondary=events_articles,
                       backref_name='events')
    __entities__ = dict(secondary=events_entities, backref_name='events')
    active = db.Column(db.Boolean, default=True)

    @property
    def articles(self):
        """
        Alias for `members`, named after what an event actually contains.
        """
        return self.members

    @articles.setter
    def articles(self, value):
        self.members = value

    def summarize(self):
        """
        Build, store and return a summary for this event.

        A single-member event is summarized from that member's title and
        text; otherwise a multi-document summary is built from the text
        of all members. The resulting sentences are joined with spaces.
        """
        members = self.members
        if len(members) == 1:
            only = members[0]
            sentences = summarize(only.title, only.text)
        else:
            sentences = multisummarize([m.text for m in members])
        self.summary = ' '.join(sentences)
        return self.summary

    @staticmethod
    def cluster(articles, threshold=0.7, debug=False):
        """
        Assign each article to an existing active event,
        or create a new event for it.

        Args:
            | articles (list)       -- the Articles to cluster
            | threshold (float)     -- the similarity threshold for qualifying a cluster
            | debug (bool)          -- log at DEBUG level instead of ERROR

        Returns:
            | list -- the event selected (or created) for each article,
                      in the same order as `articles`
        """
        log = logger('EVENT_CLUSTERING')
        log.setLevel('DEBUG' if debug else 'ERROR')

        updated_clusters = []
        active_clusters = Event.query.filter_by(active=True).all()
        now = datetime.utcnow()

        for article in articles:
            # Candidates are the active events sharing
            # at least one entity (by slug) with the article.
            article_slugs = [entity.slug for entity in article.entities]
            candidate_clusters = [
                clus for clus in active_clusters
                if set(entity.slug
                       for entity in clus.entities).intersection(article_slugs)
            ]

            # Here `cluster` is the module-level clustering function,
            # not this static method.
            selected_cluster = cluster(article,
                                       candidate_clusters,
                                       threshold=threshold,
                                       logger=log)

            # Nothing qualified: start a fresh event around this article.
            if not selected_cluster:
                log.debug(
                    'No qualifying clusters found, creating a new cluster.')
                selected_cluster = Event([article])
                db.session.add(selected_cluster)

            updated_clusters.append(selected_cluster)

        for clus in active_clusters:
            # Events untouched for more than 3 days are retired;
            # all other previously-active events are refreshed.
            expired = (now - clus.updated_at).days > 3
            if expired:
                clus.active = False
            else:
                clus.update()

        db.session.commit()
        return updated_clusters
Beispiel #2
0
from argos.datastore import db
from argos.core.models import Entity
from argos.core.models.cluster import Cluster
from argos.core.brain.cluster import cluster
from argos.core.brain.summarize import multisummarize

from argos.util.logger import logger

# Association table linking stories to their member events
# (used as the `secondary` table for Story's members).
# The (story_id, event_id) pair forms a composite primary key.
stories_events = db.Table('stories_events',
        db.Column('story_id', db.Integer, db.ForeignKey('story.id'), primary_key=True),
        db.Column('event_id', db.Integer, db.ForeignKey('event.id'), primary_key=True)
)

# Association table linking stories to entities
# (used as the `secondary` table for Story's entities).
# Note that, unlike `stories_events`, no primary key is declared here.
stories_entities = db.Table('stories_entities',
        db.Column('entity_slug', db.String, db.ForeignKey('entity.slug')),
        db.Column('story_id', db.Integer, db.ForeignKey('story.id'))
)

class Story(Cluster):
    __tablename__   = 'story'
    __members__     = {'class_name': 'Event', 'secondary': stories_events, 'backref_name': 'stories'}
    __entities__    = {'secondary': stories_entities, 'backref_name': 'stories'}

    @property
    def events(self):
        """
        The events grouped under this story.

        This is simply an alias for `members`.
        """
        return self.members

    @events.setter
Beispiel #3
0
class User(Model, UserMixin):
    """
    An application user.

    Columns:

        * id -> Integer (Primary Key)
        * email -> String (Unique)
        * image -> String (Unique)
        * name -> String (Unique)
        * password -> String
        * active -> Bool
        * confirmed_at -> DateTime
        * created_at -> DateTime
        * updated_at -> DateTime

    Relationships:

        * auths -> [Auth] (dynamic)
        * roles -> [Role]
        * watching -> [Story]
        * bookmarked -> [Event]
    """
    id = db.Column(db.Integer(), primary_key=True)
    email = db.Column(db.String(255), unique=True)
    image = db.Column(db.String(255), unique=True)
    name = db.Column(db.String(255), unique=True)
    password = db.Column(db.String(255))
    active = db.Column(db.Boolean())
    confirmed_at = db.Column(db.DateTime())
    auths = db.relationship('Auth', backref='user', lazy='dynamic')
    roles = db.relationship(
        'Role',
        secondary=roles_users,
        backref=db.backref('users', lazy='dynamic'))
    watching = db.relationship(
        'Story',
        secondary=users_stories,
        backref=db.backref('watchers', lazy='joined'))
    bookmarked = db.relationship(
        'Event',
        secondary=users_events,
        backref=db.backref('bookmarkers', lazy='joined'))
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow)

    def __init__(self, auth=None, **kwargs):
        """
        Create a user, setting every keyword argument as an attribute.

        The `auth` argument is accepted but not used here.
        """
        for name, value in kwargs.items():
            setattr(self, name, value)

    def add_provider(self, provider, provider_id, access_token, access_token_secret=None, update=True):
        """
        Attach a provider authentication to this user and commit.

        If no matching authentication exists, one is created. If one exists
        and already belongs to this user, its token is refreshed when
        `update` is true.

        Raises an AuthExistsForUserException if the authentication
        exists but is associated with another user.

        Args:
            | provider (str)            -- the provider name, e.g. 'twitter'
            | provider_id (str)         -- the id assigned by the provider
            | access_token (str)        -- the access token
            | access_token_secret (str) -- the access token secret
            | update (bool)             -- whether or not to update the existing
                                        provider authentication, if found (default: True)

        Returns:
            | Auth -- the new or existing authentication
        """
        auth = Auth.for_provider(provider, provider_id)

        if not auth:
            # First time we see this provider account: create it.
            auth = Auth(provider, provider_id, access_token, access_token_secret)
            auth.user = self
            db.session.add(auth)
        elif auth.user is not self:
            raise AuthExistsForUserException('Found an existing authorization for {0} associated with another user.'.format(provider))
        elif update:
            auth.update_token(access_token, access_token_secret)

        db.session.commit()
        return auth

    def merge(self, user):
        """
        Fold another user into this one, then delete the other user.

        *This* user is the canonical one: the other user's authentications
        are moved over only for providers this user does not already have.
        Authentications for providers present on both users are left
        untouched on the other user (this user's are preferred).

        UI tip: prompt the user to pick which account is their primary one!
        """
        # Snapshot of the providers this user already has.
        own_providers = {auth.provider for auth in self.auths}
        for other_auth in user.auths:
            if other_auth.provider in own_providers:
                # Conflict (e.g. two twitter accounts authenticated on
                # two different users here): keep this user's.
                continue
            other_auth.user = self
        db.session.delete(user)
        db.session.commit()

    @staticmethod
    def for_provider(provider, provider_id):
        """
        Look up the User owning a provider authentication.

        Args:
            | provider (str)        -- the provider name, e.g. 'twitter'
            | provider_id (str)     -- the user id assigned by the provider

        Returns:
            | the matching authentication's user, or None
              when no such authentication exists
        """
        auth = Auth.for_provider(provider, provider_id)
        return auth.user if auth else None
Beispiel #4
0
class Auth(Model):
    """
    Represents a third-party authentication.

    The primary key is derived from the provider name and the
    provider-assigned id (see `gen_id`), so an Auth can be fetched
    by primary key from those two values alone.

    The access token secret is stored AES-encrypted (CFB mode) using the
    `AES_KEY` and `AES_IV` values of the current app's config; it is
    transparently encrypted/decrypted through `access_token_secret`.
    """
    id = db.Column(db.BigInteger(), primary_key=True)
    provider = db.Column(db.String(255))
    provider_id = db.Column(db.String(255))
    access_token = db.Column(db.String(255))
    _access_token_secret = db.Column('access_token_secret', db.LargeBinary(255))
    user_id = db.Column(db.Integer, db.ForeignKey('user.id'))

    def __init__(self, provider, provider_id,  access_token, access_token_secret=None):
        """
        Args:
            | provider (str)            -- the provider name, e.g. 'twitter'
            | provider_id (str)         -- the id assigned by the provider
            | access_token (str)        -- the access token
            | access_token_secret (str) -- the access token secret (optional)
        """
        self.provider_id = provider_id
        self.provider = provider
        self.access_token = access_token
        self.access_token_secret = access_token_secret

        # Generate a unique id for this auth based on the provider and the provider id.
        self.id = Auth.gen_id(provider, provider_id)

    def update_token(self, access_token, access_token_secret=None):
        """
        Updates token for an authentication.

        Enforcing that access tokens and their
        secrets must be updated in tandem.
        May need to revisit this decision later.

        Args:
            | access_token (str)        -- the access token
            | access_token_secret (str) -- the access token secret

        Raises:
            | Exception -- if a new secret is required (see below)
                           but was not specified
        """

        # If the auth has a token and no secret, just update the token.
        if self.access_token and self.access_token_secret is None:
            self.access_token = access_token

        # Otherwise a new secret must be present.
        elif access_token_secret is None:
            raise Exception('This authentication requires a token secret, which was not specified.')

        else:
            self.access_token = access_token
            self.access_token_secret = access_token_secret

    @property
    def access_token_secret(self):
        """
        The decrypted access token secret, or None if no secret is stored.
        """
        if self._access_token_secret is None:
            return None
        dec = AES.new(current_app.config['AES_KEY'], AES.MODE_CFB, current_app.config['AES_IV'])
        return dec.decrypt(self._access_token_secret).decode('utf-8')

    @access_token_secret.setter
    def access_token_secret(self, value):
        # Assigning None is a no-op: an already-stored secret is kept.
        if value is not None:
            enc = AES.new(current_app.config['AES_KEY'], AES.MODE_CFB, current_app.config['AES_IV'])
            self._access_token_secret = enc.encrypt(value)

    @staticmethod
    def for_provider(provider, provider_id):
        """
        Find an Auth instance by provider.

        Args:
            | provider (str)        -- the provider name, e.g. 'twitter'
            | provider_id (str)     -- the user id assigned by the provider

        Returns:
            | the result of looking up the generated id by primary key
        """
        auth_id = Auth.gen_id(provider, provider_id)
        return Auth.query.get(auth_id)

    @staticmethod
    def gen_id(provider, provider_id):
        """
        Generates a unique, *stable* id for an Auth.

        The builtin `hash()` must not be used for this: string hashes are
        salted per interpreter process, so the same provider/provider id
        would map to a different primary key after every restart and
        existing rows could no longer be found.

        Instead the id is taken from a SHA-256 digest of the provider and
        provider id, joined with a separator so that e.g. ('ab', 'c') and
        ('a', 'bc') do not collide. The value is reduced to 63 bits so it
        is non-negative and fits a signed 64-bit BigInteger column.

        NOTE: ids produced by the previous `hash()`-based implementation
        do not match the ids produced here.

        Args:
            | provider (str)        -- the provider name, e.g. 'twitter'
            | provider_id (str)     -- the user id assigned by the provider

        Returns:
            | int -- an integer in the range [0, 2**63)
        """
        import hashlib

        key = '{0}\x1f{1}'.format(provider, provider_id).encode('utf-8')
        digest = hashlib.sha256(key).digest()
        return int.from_bytes(digest[:8], 'big') >> 1
Beispiel #5
0
class Concept(Model):
    """
    A concept,
    which could be a place, person,
    organization, topic, etc.

    You should *not* set the `slug` or `uri`;
    they are set automatically according to the `name`.
    In the spirit of Python's developer maturity,
    you're trusted not to modify them.
    """
    name = db.Column(db.UnicodeText)
    slug = db.Column(db.String(255), primary_key=True)
    uri = db.Column(db.String)
    summary = db.Column(db.UnicodeText)
    image = db.Column(db.String)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    aliases = db.relationship('Alias', backref='concept', lazy='joined')
    commonness = db.Column(db.Float, default=0.0)

    # Mapping concepts to concepts,
    # and tracking mentions of other concepts in this concept's summary.
    mentions = db.relationship('Alias',
                               secondary=concepts_mentions,
                               backref=db.backref('concepts'))
    concept_associations = db.relationship(
        ConceptConceptAssociation,
        foreign_keys=[ConceptConceptAssociation.from_concept_slug],
        backref=db.backref('from_concept'),
        cascade='all, delete-orphan')

    # Default data sources, shared by all instances.
    # This list must never be mutated in place; per-instance additions
    # are stored on the instance (see `profile`).
    _sources = ['Wikipedia', 'DBpedia']

    def __init__(self, name):
        """
        Initialize a concept by a name, which can be
        an alias (it does not have to be the canonical name).
        This specified name will be saved as an Alias.

        A canonical name will be looked for; if one is found
        it will be used as the slug for this Concept.
        """
        self.aliases.append(Alias(name))

        # Try to get a canonical URI
        # and derive the slug from that.
        self.uri = knowledge.uri_for_name(name)
        if self.uri:
            self.slug = self.uri.split('/')[-1]
            k = knowledge.knowledge_for(uri=self.uri, fallback=True)
            self.commonness = knowledge.commonness_for_uri(self.uri)

            self.summary = k['summary']
            self.name = k['name']

            # Download the image.
            if k['image'] is not None:
                ext = splitext(k['image'])[-1].lower()
                # NOTE(review): the builtin `hash()` of a string is salted
                # per process, so this file name is not reproducible across
                # runs -- confirm that is acceptable for the storage backend.
                self.image = storage.save_from_url(
                    k['image'], '{0}{1}'.format(hash(self.slug), ext))

        # If no URI was found,
        # generate our own slug.
        # Note: A problem here is that it assumes that
        # this particular name is the canonical one,
        # and that we don't collect any information for it.
        else:
            self.slug = slugify(name)
            # Commonness is set to default of 0.0,
            # which makes sense because if there's no URI for it
            # it probably is not common at all.

    @property
    def names(self):
        """
        The names of all of this concept's aliases.
        """
        return [alias.name for alias in self.aliases]

    @property
    def sources(self):
        """
        Returns the data sources
        used for this concept.

        A new list is returned each time, so callers cannot
        accidentally modify the defaults shared between concepts.
        """
        return list(self._sources)

    @property
    def concepts(self):
        """
        Returns the concepts this
        concept points *to*,
        with their importance scores
        for this concept.

        If this concept has a summary but no associations yet,
        the summary is first processed via `conceptize`.
        """
        if self.summary and not len(self.concept_associations):
            self.conceptize()

        def with_score(assoc):
            assoc.concept.score = assoc.score
            return assoc.concept

        return list(map(with_score, self.concept_associations))

    @property
    def from_concepts(self):
        """
        Returns the concepts that
        point to this concept,
        with their importance scores
        for this concept.
        """
        def with_score(assoc):
            assoc.from_concept.score = assoc.score
            return assoc.from_concept

        return list(map(with_score, self.from_concept_associations))

    @property
    def stories(self):
        """
        Return the stories associated with this concept,
        adding an additional "relatedness" value
        which is the concept's importance score for
        a particular story.
        """
        def with_score(assoc):
            assoc.story.relatedness = assoc.score
            return assoc.story

        return list(map(with_score, self.story_associations))

    @property
    def events(self):
        """
        Same as the `stories` property
        but for events.
        """
        def with_score(assoc):
            assoc.event.relatedness = assoc.score
            return assoc.event

        return list(map(with_score, self.event_associations))

    @property
    def articles(self):
        """
        Same as the `stories` property
        but for articles.
        """
        def with_score(assoc):
            assoc.article.relatedness = assoc.score
            return assoc.article

        return list(map(with_score, self.article_associations))

    @property
    def related_concepts(self):
        """
        The concepts this concept points to
        plus the concepts pointing to it.
        """
        # NOTE(review): `to_concepts` is not defined in this class
        # (the outgoing concepts are exposed as `concepts`); presumably it
        # is provided elsewhere, e.g. by a backref -- confirm.
        return self.to_concepts + self.from_concepts

    @property
    def profile(self):
        """
        Returns a data profile specifically
        for this concept's type.

        The profile is fetched on first access and cached on the instance;
        any sources it reports are added to this concept's sources.
        """
        if not hasattr(self, '_profile') or not self._profile:
            self._profile = knowledge.profiles.get_profile(self.uri)

            # Build a *new* list rather than using `+=`: an in-place
            # extend would modify the class-level default list and leak
            # this concept's sources into every other concept.
            # Sources already present are skipped so that re-fetching
            # an empty profile does not pile up duplicates.
            sources = list(self._sources)
            for source in self._profile.get('sources', []):
                if source not in sources:
                    sources.append(source)
            self._sources = sources
        return self._profile

    def conceptize(self):
        """
        Process the concept summary for concepts,
        and add the appropriate mentions.

        Every concept name found in the summary is resolved to an existing
        Concept (by slug) or a newly created and committed one, and the
        corresponding alias is recorded as a mention. Finally this
        concept's associations are replaced with one scored association
        per distinct concept found.
        """
        concepts = []
        for c_name in gx.concepts(self.summary):
            # Search for the concept.
            uri = knowledge.uri_for_name(c_name)

            if uri:
                slug = uri.split('/')[-1]
            else:
                slug = slugify(c_name)
            c = Concept.query.get(slug)

            # If a concept is found...
            if c:
                # Add this name as a new alias, if necessary.
                alias = Alias.query.filter_by(name=c_name, concept=c).first()
                if not alias:
                    alias = Alias(c_name)
                    c.aliases.append(alias)
                self.mentions.append(alias)

            # If one doesn't exist, create a new one.
            else:
                c = Concept(c_name)
                self.mentions.append(c.aliases[0])
                db.session.add(c)
                db.session.commit()

            concepts.append(c)

        # Score the concepts' importance:
        # (number of occurrences - commonness) / total occurrences found.
        total_found = len(concepts)
        counter = Counter(concepts)

        # Iterating the counter visits each distinct concept once;
        # with no concepts found the loop body (and the division) never runs.
        assocs = []
        for concept in counter:
            score = (counter[concept] - concept.commonness) / total_found
            assocs.append(ConceptConceptAssociation(concept, score))

        self.concept_associations = assocs
Beispiel #6
0
from datetime import datetime
from Crypto.Cipher import AES

from argos.datastore import db, Model

from flask import current_app
from flask.ext.security import Security, UserMixin, RoleMixin

# Association table connecting users and roles
# (one row per user/role pair; no primary key is declared).
roles_users = db.Table('roles_users',
        db.Column('user_id', db.Integer(), db.ForeignKey('user.id')),
        db.Column('role_id', db.Integer(), db.ForeignKey('role.id')))

# Association table for users watching stories
# (one row per user/story pair; no primary key is declared).
users_stories = db.Table('users_stories',
        db.Column('user_id', db.Integer(), db.ForeignKey('user.id')),
        db.Column('story_id', db.Integer(), db.ForeignKey('story.id')))

# Association table for users bookmarking events
# (one row per user/event pair; no primary key is declared).
users_events = db.Table('users_events',
        db.Column('user_id', db.Integer(), db.ForeignKey('user.id')),
        db.Column('event_id', db.Integer(), db.ForeignKey('event.id')))

# Table for users 

class AuthExistsForUserException(Exception):
    """
    Signals that a provider authentication already exists
    and is associated with another user.
    """

class Role(Model, RoleMixin):
    """
    A user's Role
Beispiel #7
0
from argos.core import knowledge
from argos.util import storage

import galaxy as gx
from slugify import slugify
from datetime import datetime
from os.path import splitext
from sqlalchemy import event
from sqlalchemy.ext.declarative import declared_attr

from collections import Counter

# Association table between aliases and the concepts mentioning them.
# Both foreign keys cascade on delete and on update, so rows here follow
# changes to (or removal of) the referenced alias / concept.
concepts_mentions = db.Table(
    'concepts_mentions',
    db.Column(
        'alias_id', db.Integer,
        db.ForeignKey('alias.id', ondelete='CASCADE', onupdate='CASCADE')),
    db.Column(
        'concept_slug', db.String,
        db.ForeignKey('concept.slug', ondelete='CASCADE', onupdate='CASCADE')))


class BaseConceptAssociation(Model):
    """
    Models which will be related to concepts must
    subclass this model and specify a backref name
    through a class property called `__backref__`
    and a foreign key property for the related model.

    Example::
Beispiel #8
0
 def concept_slug(cls):
     """
     Build the `concept_slug` column: a string foreign key to
     `concept.slug` (cascading on delete and update) which is
     part of the primary key.
     """
     concept_fk = db.ForeignKey('concept.slug',
                                ondelete='CASCADE',
                                onupdate='CASCADE')
     return db.Column(db.String, concept_fk, primary_key=True)
Beispiel #9
0
class Cluster(Clusterable):
    """
    A cluster.

    A Cluster is capable of clustering Clusterables.

    Note: A Cluster itself is a Clusterable; i.e. clusters
    can cluster clusters :)

    Subclasses must define `__members__` (see `members`)
    and implement `cluster`.
    """
    __abstract__ = True
    title = db.Column(db.Unicode, default='')
    summary = db.Column(db.UnicodeText)
    image = db.Column(db.String())

    def __str__(self):
        # The column default is only applied on insert, so the title may
        # still be None here; `__str__` must always return a string.
        return self.title or ''

    def __repr__(self):
        # See `__str__`: never return None.
        return self.title or ''

    @declared_attr
    def members(cls):
        """
        Build the members attribute from the
        subclass's `__members__` class attribute.

        The relationship is loaded dynamically (`lazy='dynamic'`), i.e.
        the attribute is a query-like object rather than a plain list.

        Example::

            __members__ = {'class_name': 'Article', 'secondary': events_articles, 'backref_name': 'events'}
        """
        args = cls.__members__

        return db.relationship(args['class_name'],
                               secondary=args['secondary'],
                               backref=db.backref(args['backref_name']),
                               lazy='dynamic')

    @staticmethod
    def cluster(cls, clusterables):
        """
        The particular clustering method for this Cluster class.
        Must be implemented on subclasses, otherwise raises NotImplementedError.
        """
        raise NotImplementedError

    def __init__(self, members):
        """
        Initialize a cluster with some members.
        """
        self.members = members
        self.update()

    def summarize(self):
        """
        Generate, store and return a summary for this cluster.

        A single-member cluster is summarized from that member's title and
        text; otherwise a multi-document summary is built from the text
        of all members.
        """
        # `members` is a dynamic relationship, which can be iterated but
        # does not support `len()`; materialize it once.
        members = list(self.members)
        if len(members) == 1:
            member = members[0]
            self.summary = ' '.join(summarize(member.title, member.text))
        else:
            self.summary = ' '.join(
                multisummarize([m.text for m in members]))
        return self.summary

    def conceptize(self):
        """
        Update concepts (and mentions) for this cluster and score them.

        The cluster's mentions become the distinct mentions of its members.
        Each concept's score is the sum of its scores across the members,
        normalized by the total over all concepts and then divided by
        (commonness + 1).
        """
        self.mentions = list(
            set(
                chain.from_iterable(
                    [member.mentions for member in self.members])))

        # Get all concept associations for this cluster's members.
        assocs = chain.from_iterable(
            [member.concept_associations for member in self.members])

        # Group associations by their concept.
        # Since `groupby` only looks at adjacent elements,
        # we have to first sort the associations by their concepts' slugs.
        key_func = lambda assoc: assoc.concept.slug
        grouped_assocs = [
            list(g) for k, g in groupby(sorted(assocs, key=key_func), key_func)
        ]

        # Calculate the raw scores of each concept.
        raw_scores = {}
        for assoc_group in grouped_assocs:
            # Each group points to the same concept, so just grab the first.
            concept = assoc_group[0].concept
            raw_scores[concept] = sum(assoc.score for assoc in assoc_group)
        total = sum(raw_scores.values())

        # Calculate the final scores and create the associations.
        assocs = []
        for concept, raw_score in raw_scores.items():
            score = (raw_score / total) / (concept.commonness + 1
                                           )  # +1 to avoid division by zero
            # The association model is looked up on the concrete subclass.
            assoc = self.__class__.__concepts__['association_model'](
                concept, score)
            assocs.append(assoc)
        self.concept_associations = assocs

    def update(self):
        """
        Refresh the cluster's derived attributes:
        its timestamps, summary and concepts.

        `updated_at` is set on every call. `created_at` is only set if it
        has no value yet, so that updating an existing cluster does not
        erase its creation time.
        """
        now = datetime.utcnow()
        self.updated_at = now
        if self.created_at is None:
            self.created_at = now

        self.summarize()
        self.conceptize()

    def add(self, member):
        """
        Add a member to the cluster.
        """
        self.members.append(member)

    def timespan(self, start, end=None):
        """
        Get cluster members within a certain (date)timespan.

        Both bounds are exclusive.

        Args:
            | start (datetime)
            | end (datetime)    -- default is now (UTC)
        """
        if end is None:
            end = datetime.utcnow()
        return [
            member for member in self.members
            if start < member.created_at < end
        ]
Beispiel #10
0
class Clusterable(Model):
    """
    Abstract base for every model that can be put into a cluster.

    Subclasses are expected to provide the `__concepts__` and
    `__mentions__` class attributes described on the
    `concept_associations` and `mentions` attributes below.
    """
    __abstract__ = True
    id = db.Column(db.Integer, primary_key=True)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow)

    @declared_attr
    def concept_associations(cls):
        """
        Relationship to this model's concept-association objects,
        configured by the subclass's `__concepts__` class attribute.

        An association object is used (rather than a plain many-to-many)
        so that each link can carry the importance score of a concept
        for a given clusterable. Associations are ordered by descending
        score. The concepts themselves are exposed through `concepts`.

        The association model should inherit from BaseConceptAssociation.

        Example::

            __concepts__ = {'association_model': ArticleConceptAssociation,
                            'backref_name': 'article'}
        """
        config = cls.__concepts__
        association_model = config['association_model']

        return db.relationship(association_model,
                               backref=db.backref(config['backref_name']),
                               cascade='all, delete, delete-orphan',
                               order_by=association_model.score.desc())

    @property
    def concepts(self):
        """
        The concepts associated with this model, each annotated with
        a `score` attribute holding its importance for this model.

        Concepts without a name are left out (some are extracted but
        don't map to known entities, so are not given a name).

        This property is read-only: to add a concept, add an instance of
        this model's concept-association model, which is what carries
        the importance score.
        """
        scored = []
        for assoc in self.concept_associations:
            concept = assoc.concept
            if concept.name is None:
                continue
            concept.score = assoc.score
            scored.append(concept)
        return scored

    @property
    def concept_slugs(self):
        """
        The slugs of the concepts returned by `concepts`.
        """
        return [concept.slug for concept in self.concepts]

    @declared_attr
    def mentions(cls):
        """
        Relationship to the Aliases mentioned by this model,
        configured by the subclass's `__mentions__` class attribute.

        Example::

            __mentions__ = {'secondary': articles_mentions, 'backref_name': 'articles'}
        """
        config = cls.__mentions__
        backref = db.backref(config['backref_name'])

        return db.relationship('Alias',
                               secondary=config['secondary'],
                               backref=backref)
Beispiel #11
0
class Cluster(Clusterable):
    """
    A cluster.

    A Cluster is capable of clustering Clusterables.

    Note: A Cluster itself is a Clusterable; i.e. clusters
    can cluster clusters :)

    Subclasses must define `__members__` (see `members`)
    and implement `cluster`.
    """
    __abstract__ = True
    title = db.Column(db.Unicode)
    summary = db.Column(db.UnicodeText)
    image = db.Column(db.String())

    @declared_attr
    def members(cls):
        """
        Build the members attribute from the
        subclass's `__members__` class attribute.

        Example::

            __members__ = {'class_name': 'Article', 'secondary': events_articles, 'backref_name': 'events'}
        """
        args = cls.__members__

        return db.relationship(args['class_name'],
                               secondary=args['secondary'],
                               backref=db.backref(args['backref_name']))

    @staticmethod
    def cluster(cls, clusterables):
        """
        The particular clustering method for this Cluster class.
        Must be implemented on subclasses, otherwise raises NotImplementedError.
        """
        raise NotImplementedError

    def __init__(self, members):
        """
        Initialize a cluster with some members
        and compute its derived attributes (see `update`).
        """
        self.members = members
        self.update()

    def summarize(self):
        """
        Generate, store and return a summary for this cluster.

        A single-member cluster is summarized from that member's title and
        text; otherwise a multi-document summary is built from the text
        of all members.
        """
        if len(self.members) == 1:
            member = self.members[0]
            self.summary = ' '.join(summarize(member.title, member.text))
        else:
            self.summary = ' '.join(
                multisummarize([m.text for m in self.members]))
        return self.summary

    def titleize(self):
        """
        Generate a title for this cluster.
        Also selects a representative image.

        Looks for the cluster member that is most similar to the others,
        and then uses the title of that member. The image is taken from
        the most similar member that has one. On ties, the later member wins.

        If no member qualifies (e.g. the cluster is empty), the current
        title and image are left unchanged.
        """
        max_member = (None, 0)
        max_member_w_image = (None, 0)
        for member in self.members:
            avg_sim = self.similarity(member)
            if avg_sim >= max_member[1]:
                max_member = (member, avg_sim)
            if avg_sim >= max_member_w_image[1] and member.image is not None:
                max_member_w_image = (member, avg_sim)

        # Guard against there being no representative member,
        # which would otherwise fail on `None.title`.
        if max_member[0] is not None:
            self.title = max_member[0].title

        if max_member_w_image[0] is not None:
            self.image = max_member_w_image[0].image

    def entitize(self):
        """
        Update entities for this cluster:
        the distinct entities of all of its members.
        """
        self.entities = list(
            set(
                chain.from_iterable(
                    [member.entities for member in self.members])))

    def update(self):
        """
        Refresh the cluster's derived attributes:
        its title, image, summary, entities and timestamps.

        `updated_at` is set on every call. `created_at` is only set if it
        has no value yet, so that updating an existing cluster does not
        erase its creation time.
        """
        self.titleize()
        self.summarize()
        self.entitize()

        now = datetime.utcnow()
        self.updated_at = now
        if self.created_at is None:
            self.created_at = now

    def add(self, member):
        """
        Add a member to the cluster.
        """
        self.members.append(member)

    def similarity(self, obj):
        """
        Calculate the similarity of an object with this cluster,
        or the similarity between another cluster and this cluster.
        If it is an object, that object must have a `similarity` method implemented.

        Returns the average of the object's similarity to each member,
        or 0.0 for a cluster without members.
        """
        sims = [obj.similarity(member) for member in self.members]

        # Nothing to compare against; avoid dividing by zero.
        if not sims:
            return 0.0

        # Calculate average similarity.
        return sum(sims) / len(sims)

    def timespan(self, start, end=None):
        """
        Get cluster members within a certain (date)timespan.

        Both bounds are exclusive.

        Args:
            | start (datetime)
            | end (datetime)    -- default is now (UTC)
        """
        if end is None:
            end = datetime.utcnow()
        return [
            member for member in self.members
            if start < member.created_at < end
        ]
Beispiel #12
0
class Article(Clusterable):
    """
    An article.

    On construction, the keyword arguments are copied onto the instance,
    concepts are extracted from the text (when there is text), and the
    title is checked against `ignore_patterns`.
    """
    __tablename__ = 'article'
    __concepts__ = {
        'association_model': ArticleConceptAssociation,
        'backref_name': 'article'
    }
    __mentions__ = {'secondary': articles_mentions, 'backref_name': 'articles'}
    title = db.Column(db.Unicode)
    text = db.Column(db.UnicodeText)
    html = db.Column(db.UnicodeText)
    ext_url = db.Column(db.Unicode)
    image = db.Column(db.String)
    ignore = db.Column(db.Boolean, default=False)
    score = db.Column(db.Float, default=0.0)
    source_id = db.Column(db.Integer, db.ForeignKey('source.id'))
    feed_id = db.Column(db.Integer, db.ForeignKey('feed.id'))
    node_id = db.Column(db.Integer, unique=True, index=True)
    authors = db.relationship('Author',
                              secondary=articles_authors,
                              backref=db.backref('articles', lazy='dynamic'))

    # There are some articles which are just noise, and we want to ignore them using regexes for their titles.
    ignore_patterns = [
        # NYT country profiles
        re.compile(r'[A-Z].+\sprofile( - Overview)?')
    ]

    def __str__(self):
        return self.title

    def __repr__(self):
        return self.title

    def __init__(self, **kwargs):
        """
        Create an article, setting each keyword argument as an attribute.
        """
        for key, value in kwargs.items():
            setattr(self, key, value)

        if self.text is not None:
            self.conceptize()

        # The column default is only applied on insert, so make sure
        # a fresh instance already carries a numeric score.
        if self.score is None:
            self.score = 0.0

        self.check_ignored()

    def check_ignored(self):
        """
        Set (and return) `ignore` depending on whether the title
        matches any of the `ignore_patterns`.
        """
        # A missing title cannot match any pattern; treat it as empty
        # rather than letting `pattern.match(None)` raise a TypeError.
        title = self.title or ''
        self.ignore = any(
            pattern.match(title) for pattern in self.ignore_patterns)
        return self.ignore

    def conceptize(self):
        """
        Process the article text for concepts,
        and add the appropriate mentions.

        Every concept name found in the text is resolved to a Concept
        (created and committed if it doesn't exist yet), the alias used is
        recorded as a mention, and each distinct concept is scored by its
        share of all concept occurrences found in the text.
        """
        concepts = []
        for c_name in gx.concepts(self.text):
            # Search for the concept.
            uri = knowledge.uri_for_name(c_name)

            if uri:
                slug = uri.split('/')[-1]
            else:
                slug = slugify(c_name)
            c = Concept.query.get(slug)

            if c:
                # The concept exists:
                # add this name as a new alias, if necessary.
                alias = Alias.query.filter_by(name=c_name, concept=c).first()
                if not alias:
                    alias = Alias(c_name)
                    c.aliases.append(alias)
                # Avoid duplicate aliases.
                if alias not in self.mentions:
                    self.mentions.append(alias)
            else:
                # If one doesn't exist, create a new one.
                c = Concept(c_name)
                self.mentions.append(c.aliases[0])
                db.session.add(c)
                db.session.commit()

            concepts.append(c)

        # Score the concepts' importance.
        total_found = len(concepts)
        counter = Counter(concepts)

        # Iterating the counter visits each distinct concept once;
        # when nothing was found there is nothing to divide.
        self.concept_associations = [
            ArticleConceptAssociation(concept, count / total_found)
            for concept, count in counter.items()
        ]

    @property
    def published(self):
        """
        The creation time as seconds elapsed since `epoch`.
        """
        # If not timezone is set, assume UTC.
        # super annoying and it's probably not a good guess but it's
        # all we got for now.
        # In production, we will be setting article publish times as utc when
        # we fetch them, so it should be less of a problem there.
        #
        # Bind the local unconditionally: previously it was only assigned in
        # the naive branch, so timezone-aware values raised UnboundLocalError.
        created_at = self.created_at
        if created_at.tzinfo is None:
            created_at = created_at.replace(tzinfo=pytz.UTC)
        delta = created_at - epoch
        return delta.total_seconds()
Beispiel #13
0
class Article(Clusterable):
    """
    An article, represented for clustering purposes by
    its text, its entities and its creation time.
    """
    __tablename__ = 'article'
    __entities__ = {'secondary': articles_entities, 'backref_name': 'articles'}
    vectors = db.Column(db.PickleType)
    title = db.Column(db.Unicode)
    text = db.Column(db.UnicodeText)
    html = db.Column(db.UnicodeText)
    ext_url = db.Column(db.Unicode)
    image = db.Column(db.String())
    source_id = db.Column(db.Integer, db.ForeignKey('source.id'))
    authors = db.relationship('Author',
                              secondary=articles_authors,
                              backref=db.backref('articles', lazy='dynamic'))

    def __init__(self, **kwargs):
        """
        Set every keyword argument as an attribute, then extract
        entities and vectors if the article has text.
        """
        for name, value in kwargs.items():
            setattr(self, name, value)

        if self.text is None:
            return

        self.entitize()
        self.vectorize()

    def vectorize(self):
        """
        Return the vectors representing this article,
        computing and storing them on first use.

        The representation is a two-item list:
            [bag of words vector, entities vector]
        """
        if self.vectors is not None:
            return self.vectors

        text_vector = vectorize(self.text)
        entity_vector = vectorize(' '.join(entities(self.text)))
        self.vectors = [text_vector, entity_vector]
        return self.vectors

    def entitize(self):
        """
        Extract entities from the article text and
        attach them to this article.
        """
        found = []
        for name in entities(self.text):
            # TO DO: Need to find a way of getting canonical name.

            # Look the entity up by its slug,
            # creating (and committing) it when missing.
            entity = Entity.query.get(slugify(name))
            if not entity:
                entity = Entity(name)
                db.session.add(entity)
                db.session.commit()
            found.append(entity)
        self.entities = found

    def similarity(self, article):
        """
        Score how similar this article is to another article.

        The score is a weighted average of the text similarity, the
        entity similarity and the closeness of the creation times.
        """
        own_vectors = self.vectorize()
        other_vectors = article.vectorize()

        # Weights of the linear combination:
        # [text vector, entity vector, publication date]
        coefs = [2, 1, 2]
        sim = 0
        for idx, own_vector in enumerate(own_vectors):
            distance = jaccard(other_vectors[idx], own_vector)

            # The jaccard distance of two empty vectors is NaN (and scipy
            # warns about it). Treat that as distance 1, i.e. the two
            # have nothing in common.
            closeness = 0 if isnan(distance) else 1 - distance
            sim += (coefs[idx] * closeness)

        # Also take publication dates into account.
        ideal_time = 259200  # 3 days, in seconds
        own_time, other_time = self.created_at, article.created_at

        # Absolute gap between the two times, in seconds.
        if own_time > other_time:
            gap = own_time - other_time
        else:
            gap = other_time - own_time
        gap_seconds = gap.total_seconds()

        # The time score is 1 within the ideal time, and decays towards 0
        # the further apart the two articles are.
        if gap_seconds < ideal_time:
            time_score = 1
        else:
            time_score = ideal_time / gap_seconds
        sim += (coefs[2] * time_score)

        # Normalize back to [0, 1].
        return sim / sum(coefs)
Beispiel #14
0
from argos.core.models.cluster import Clusterable
from argos.core.brain import vectorize, entities

from scipy.spatial.distance import jaccard

from math import isnan
from slugify import slugify

# Ignore the invalid numpy warning,
# which comes up when jaccard uses
# empty vectors.
import numpy
numpy.seterr(invalid='ignore')

# Association table pairing author ids with article ids.
# Note that the underlying table is named 'authors'.
articles_authors = db.Table(
    'authors', db.Column('author_id', db.Integer, db.ForeignKey('author.id')),
    db.Column('article_id', db.Integer, db.ForeignKey('article.id')))

# Association table pairing entity slugs with article ids;
# passed as the `secondary` of Article's `__entities__` below.
articles_entities = db.Table(
    'articles_entities',
    db.Column('entity_slug', db.String, db.ForeignKey('entity.slug')),
    db.Column('article_id', db.Integer, db.ForeignKey('article.id')))


class Article(Clusterable):
    """
    An article.
    """
    __tablename__ = 'article'
    # Entity association settings; presumably consumed by Clusterable
    # to build the `entities` relationship -- TODO confirm.
    __entities__ = {'secondary': articles_entities, 'backref_name': 'articles'}
    # Pickled column for the article's vectors.
    vectors = db.Column(db.PickleType)
Beispiel #15
0
class Event(Cluster):
    """
    An event: a cluster whose members are Articles.
    """
    __tablename__ = 'event'
    # Settings describing the members/concepts/mentions associations;
    # presumably consumed by Cluster to build relationships -- TODO confirm.
    __members__ = {
        'class_name': 'Article',
        'secondary': events_articles,
        'backref_name': 'events'
    }
    __concepts__ = {
        'association_model': EventConceptAssociation,
        'backref_name': 'event'
    }
    __mentions__ = {'secondary': events_mentions, 'backref_name': 'events'}
    # Whether the event is active (see `all_active`).
    active = db.Column(db.Boolean, default=True)
    # Sum of the members' scores (see `update_score`).
    raw_score = db.Column(db.Float, default=0.0)
    # Cached result of `calculate_score`.
    _score = db.Column(db.Float, default=0.0)

    @classmethod
    def all_active(cls):
        """
        Fetch every event whose `active` flag is set.
        """
        active_query = cls.query.filter_by(active=True)
        return active_query.all()

    @property
    def articles(self):
        """
        The event's members, loaded as a list.
        """
        member_query = self.members
        return member_query.all()

    @property
    def num_articles(self):
        """
        The number of members, as reported by `members.count()`.
        """
        member_query = self.members
        return member_query.count()

    @articles.setter
    def articles(self, value):
        # Articles are just another name for the event's members.
        new_members = value
        self.members = new_members

    @property
    def images(self):
        """
        Collect the images of all members that have one.
        """
        found = []
        for member in self.members:
            if member.image is None:
                continue
            found.append(member.image)
        return found

    @property
    def summary_sentences(self):
        """
        Breaks up a summary back into its
        original sentences (as a list).

        Returns:
            | list -- one dict per sentence with the keys `sentence`,
            |         `source` (name of the source of the first member whose
            |         title + text contains the sentence) and `url` (that
            |         member's `ext_url`). `source` and `url` are None when
            |         no member contains the sentence.
        """
        # No summary (yet) means no sentences.
        if not self.summary:
            return []

        # Iterate the members and build their searchable text only once,
        # instead of once per sentence.
        candidates = [(' '.join([a.title or '', a.text or '']), a)
                      for a in self.members]

        data = []
        for sent in sent_tokenize(self.summary):
            article = next((a for text, a in candidates if sent in text),
                           None)
            # An article is not guaranteed to have a source attached.
            source = article.source if article is not None else None
            data.append({
                'sentence': sent,
                'source': source.name if source is not None else None,
                'url': article.ext_url if article is not None else None
            })
        return data

    @property
    def top_concepts(self):
        """
        The first ten of this event's concepts.
        """
        limit = 10
        return self.concepts[:limit]

    @property
    def score(self):
        """
        The event's score.

        A fresh value is calculated on every access
        and stored on the event as `_score`.
        """
        fresh_score = self.calculate_score()
        self._score = fresh_score
        return self._score

    def calculate_score(self):
        """
        Compute the event's score from its `raw_score`
        (the sum of its articles' scores) and its age.

        The raw score is calculated and stored first if it is missing or zero.
        The time component is based on `updated_at`.

        Currently this uses the Reddit 'hot' formula,
        see: http://amix.dk/blog/post/19588
        """
        # Fill in the raw score if it doesn't yet exist.
        if not self.raw_score:
            self.raw_score = sum([member.score for member in self.members])
        raw = self.raw_score

        # Seconds between the Unix epoch and the last update.
        elapsed = self.updated_at - datetime(1970, 1, 1)
        epoch_seconds = elapsed.days * 86400 + elapsed.seconds + (
            float(elapsed.microseconds) / 1000000)

        # Magnitude of the raw score, on a log10 scale.
        order = log(max(abs(raw), 1), 10)

        # Direction of the raw score.
        if raw > 0:
            sign = 1
        elif raw < 0:
            sign = -1
        else:
            sign = 0

        seconds = epoch_seconds - 1134028003
        return round(order + sign * seconds / 45000, 7)

    def update_score(self):
        """
        Recompute the raw score from the members' scores,
        then cache a freshly calculated score in `_score`.
        """
        total = 0
        for member in self.members:
            total += member.score
        self.raw_score = total

        self._score = self.calculate_score()

    @property
    def member_concept_slugs(self):
        """
        All concept slugs of all of this event's articles,
        joined by spaces into a single string.

        The result is cached on the instance as `_mem_slugs`.
        """
        if getattr(self, '_mem_slugs', None) is None:
            self._mem_slugs = ' '.join(
                ' '.join(article.concept_slugs) for article in self.articles)
        return self._mem_slugs

    @property
    def text(self):
        """
        The titles and texts of all of this event's articles,
        joined by spaces into a single string.

        The result is cached on the instance as `_text`.
        """
        if getattr(self, '_text', None) is None:
            pieces = []
            for article in self.articles:
                pieces.append(' '.join([article.title, article.text]))
            self._text = ' '.join(pieces)
        return self._text

    def summarize(self):
        """
        Build, store and return the summary for this cluster.

        A single-member cluster is summarized from that member's title
        and text; otherwise the texts of all members are summarized together.
        """
        if self.members.count() == 1:
            only_member = self.members[0]
            sentences = summarizer.summarize(only_member.title,
                                             only_member.text)
        else:
            texts = [member.text for member in self.members]
            sentences = summarizer.multisummarize(texts)

        self.summary = ' '.join(sentences)
        return self.summary