class Event(Cluster):
    """
    An event: a cluster whose members are Articles.
    """
    __tablename__ = 'event'
    __members__ = {
        'class_name': 'Article',
        'secondary': events_articles,
        'backref_name': 'events',
    }
    __entities__ = {'secondary': events_entities, 'backref_name': 'events'}

    # Only events flagged active are loaded as clustering candidates.
    active = db.Column(db.Boolean, default=True)

    @property
    def articles(self):
        """
        Alias for `members`.
        """
        return self.members

    @articles.setter
    def articles(self, value):
        self.members = value

    def summarize(self):
        """
        Build, store and return a summary of this event's members.

        A single member is summarized from its title and text;
        several members are summarized together from their texts.
        """
        if len(self.members) == 1:
            only = self.members[0]
            sentences = summarize(only.title, only.text)
        else:
            sentences = multisummarize([m.text for m in self.members])
        self.summary = ' '.join(sentences)
        return self.summary

    @staticmethod
    def cluster(articles, threshold=0.7, debug=False):
        """
        Assign each article to an active event, creating a new event
        for any article for which no event is selected.

        Args:
            | articles (list)       -- the Articles to cluster
            | threshold (float)     -- the similarity threshold for qualifying a cluster
            | debug (bool)          -- log at DEBUG level instead of ERROR

        Returns:
            | list -- the event selected (or created) for each article, in input order
        """
        log = logger('EVENT_CLUSTERING')
        log.setLevel('DEBUG' if debug else 'ERROR')

        updated_clusters = []
        active_clusters = Event.query.filter_by(active=True).all()
        now = datetime.utcnow()

        for article in articles:
            # Candidates are the active events which share at least
            # one entity slug with this article.
            article_slugs = [entity.slug for entity in article.entities]
            candidates = [
                event for event in active_clusters
                if set([entity.slug for entity in event.entities]).intersection(article_slugs)
            ]

            chosen = cluster(article, candidates, threshold=threshold, logger=log)

            # Nothing qualified, so the article starts its own event.
            if not chosen:
                log.debug('No qualifying clusters found, creating a new cluster.')
                chosen = Event([article])
                db.session.add(chosen)

            updated_clusters.append(chosen)

        # Events last updated more than 3 days ago are deactivated;
        # all the others are refreshed.
        for event in active_clusters:
            expired = (now - event.updated_at).days > 3
            if expired:
                event.active = False
            else:
                event.update()

        db.session.commit()
        return updated_clusters
from argos.datastore import db from argos.core.models import Entity from argos.core.models.cluster import Cluster from argos.core.brain.cluster import cluster from argos.core.brain.summarize import multisummarize from argos.util.logger import logger stories_events = db.Table('stories_events', db.Column('story_id', db.Integer, db.ForeignKey('story.id'), primary_key=True), db.Column('event_id', db.Integer, db.ForeignKey('event.id'), primary_key=True) ) stories_entities = db.Table('stories_entities', db.Column('entity_slug', db.String, db.ForeignKey('entity.slug')), db.Column('story_id', db.Integer, db.ForeignKey('story.id')) ) class Story(Cluster): __tablename__ = 'story' __members__ = {'class_name': 'Event', 'secondary': stories_events, 'backref_name': 'stories'} __entities__ = {'secondary': stories_entities, 'backref_name': 'stories'} @property def events(self): """ Convenience :) """ return self.members @events.setter
class User(Model, UserMixin):
    """
    A user.

    Attributes:

        * id -> Integer (Primary Key)
        * email -> String (Unique)
        * image -> String (Unique)
        * name -> String (Unique)
        * password -> String
        * active -> Bool
        * confirmed_at -> DateTime
        * auths -> [Auth]
        * roles -> [Role]
        * watching -> [Story]
        * bookmarked -> [Event]
        * created_at -> DateTime
        * updated_at -> DateTime
    """
    id = db.Column(db.Integer(), primary_key=True)
    email = db.Column(db.String(255), unique=True)
    image = db.Column(db.String(255), unique=True)
    name = db.Column(db.String(255), unique=True)
    password = db.Column(db.String(255))
    active = db.Column(db.Boolean())
    confirmed_at = db.Column(db.DateTime())
    auths = db.relationship('Auth', backref='user', lazy='dynamic')
    roles = db.relationship('Role', secondary=roles_users,
                            backref=db.backref('users', lazy='dynamic'))
    watching = db.relationship('Story', secondary=users_stories,
                               backref=db.backref('watchers', lazy='joined'))
    bookmarked = db.relationship('Event', secondary=users_events,
                                 backref=db.backref('bookmarkers', lazy='joined'))
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow)

    def __init__(self, auth=None, **kwargs):
        """
        Set every keyword argument as an attribute on the new user.

        NOTE(review): `auth` is accepted but never used here.
        """
        for attr, value in kwargs.items():
            setattr(self, attr, value)

    def add_provider(self, provider, provider_id, access_token, access_token_secret=None, update=True):
        """
        Attach a provider authentication to this user and commit.

        If the authentication already exists for this user, its token is
        refreshed when `update` is true. Raises an
        AuthExistsForUserException if it exists and belongs to another user.

        Args:
            | provider (str)            -- the provider name, e.g. 'twitter'
            | provider_id (str)         -- the id assigned by the provider
            | access_token (str)        -- the access token
            | access_token_secret (str) -- the access token secret
            | update (bool)             -- whether or not to update the existing provider authentication,
                                           if found (default: True)

        Returns:
            | Auth -- the existing or newly created authentication
        """
        auth = Auth.for_provider(provider, provider_id)

        if not auth:
            # First time this provider account is seen: create it.
            auth = Auth(provider, provider_id, access_token, access_token_secret)
            auth.user = self
            db.session.add(auth)
        elif auth.user is not self:
            raise AuthExistsForUserException('Found an existing authorization for {0} associated with another user.'.format(provider))
        elif update:
            auth.update_token(access_token, access_token_secret)

        db.session.commit()
        return auth

    def merge(self, user):
        """
        Fold another user into this one, then delete the other user.

        *This* user is the canonical one: an authentication of the other
        user is only taken over when this user had no authentication for
        that provider before the merge started.

        UI tip: prompt the user to pick which account is their primary one!

        Args:
            | user (User) -- the user to merge into this one
        """
        # Snapshot taken once, up front; it is not extended while
        # authentications are being moved over.
        own_providers = [own.provider for own in self.auths]

        for other_auth in user.auths:
            # Conflicting authentications (e.g. two twitter accounts
            # authenticated on different user accounts) stay behind.
            if other_auth.provider in own_providers:
                continue
            other_auth.user = self

        db.session.delete(user)
        db.session.commit()

    @staticmethod
    def for_provider(provider, provider_id):
        """
        Find the User who owns a provider authentication.

        Args:
            | provider (str)        -- the provider name, e.g. 'twitter'
            | provider_id (str)     -- the user id assigned by the provider

        Returns:
            | User or None -- None when no such authentication exists
        """
        auth = Auth.for_provider(provider, provider_id)
        return auth.user if auth else None
class Auth(Model):
    """
    Represents a third-party authentication.

    The primary key is derived from the provider name and the
    provider-assigned id (see `gen_id`), so an Auth can be looked up
    directly from those two values.
    """
    id = db.Column(db.BigInteger(), primary_key=True)
    provider = db.Column(db.String(255))
    provider_id = db.Column(db.String(255))
    access_token = db.Column(db.String(255))
    # Stored encrypted; read and written through `access_token_secret`.
    _access_token_secret = db.Column('access_token_secret', db.LargeBinary(255))
    user_id = db.Column(db.Integer, db.ForeignKey('user.id'))

    def __init__(self, provider, provider_id, access_token, access_token_secret=None):
        """
        Args:
            | provider (str)            -- the provider name, e.g. 'twitter'
            | provider_id (str)         -- the id assigned by the provider
            | access_token (str)        -- the access token
            | access_token_secret (str) -- the access token secret (optional)
        """
        self.provider_id = provider_id
        self.provider = provider
        self.access_token = access_token
        self.access_token_secret = access_token_secret

        # Generate a unique id for this auth based on the provider and the provider id.
        self.id = Auth.gen_id(provider, provider_id)

    def update_token(self, access_token, access_token_secret=None):
        """
        Updates token for an authentication.

        Enforcing that access tokens and their secrets must be updated in tandem.
        May need to revisit this decision later.

        Args:
            | access_token (str)        -- the access token
            | access_token_secret (str) -- the access token secret

        Raises:
            | Exception -- if this auth stores a secret and no new secret was given
        """
        # If the auth has a token and no secret, just update the token.
        if self.access_token and self.access_token_secret is None:
            self.access_token = access_token

        # Otherwise, the auth has a token and a secret,
        # and a new secret must be present.
        elif access_token_secret is None:
            raise Exception('This authentication requires a token secret, which was not specified.')

        else:
            self.access_token = access_token
            self.access_token_secret = access_token_secret

    @property
    def access_token_secret(self):
        """
        The decrypted token secret, or None if no secret is stored.
        """
        if self._access_token_secret is None:
            return None
        # NOTE(review): the same key and IV from the app config are used
        # for every secret; a per-value IV would be safer -- confirm.
        dec = AES.new(current_app.config['AES_KEY'], AES.MODE_CFB, current_app.config['AES_IV'])
        return dec.decrypt(self._access_token_secret).decode('utf-8')

    @access_token_secret.setter
    def access_token_secret(self, value):
        # Assigning None leaves any stored secret untouched.
        if value is not None:
            enc = AES.new(current_app.config['AES_KEY'], AES.MODE_CFB, current_app.config['AES_IV'])
            self._access_token_secret = enc.encrypt(value)

    @staticmethod
    def for_provider(provider, provider_id):
        """
        Find an Auth instance by provider.

        Args:
            | provider (str)        -- the provider name, e.g. 'twitter'
            | provider_id (str)     -- the user id assigned by the provider

        Returns:
            | Auth or None -- None when no matching row exists
        """
        auth_id = Auth.gen_id(provider, provider_id)
        return Auth.query.get(auth_id)

    @staticmethod
    def gen_id(provider, provider_id):
        """
        Generates a unique, reproducible id for an Auth.

        The builtin `hash()` is salted per process for strings, so it
        cannot be used for a persisted key: the same provider pair would
        map to a different id after every restart. A SHA-256 digest is
        used instead.

        NOTE: rows whose id was generated with the builtin `hash()`
        need their ids regenerated to be found by `for_provider`.

        Args:
            | provider (str)        -- the provider name, e.g. 'twitter'
            | provider_id (str)     -- the id assigned by the provider

        Returns:
            | int -- a non-negative integer below 2**63
        """
        import hashlib

        # The separator keeps ('ab', 'c') and ('a', 'bc') distinct.
        key = '{0}:{1}'.format(provider, provider_id).encode('utf-8')
        digest = hashlib.sha256(key).digest()

        # Keep 63 bits so the value fits a signed 64-bit BIGINT column.
        return int.from_bytes(digest[:8], 'big') >> 1
class Concept(Model):
    """
    An concept,
    which could be a place, person,
    organization, topic, etc.

    You should *not* set the `slug` or `uri`;
    they are set automatically according to the `name`.
    In the spirit of Python's developer maturity,
    you're trusted not to modify them.
    """
    name = db.Column(db.UnicodeText)
    slug = db.Column(db.String(255), primary_key=True)
    uri = db.Column(db.String)
    summary = db.Column(db.UnicodeText)
    image = db.Column(db.String)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    aliases = db.relationship('Alias', backref='concept', lazy='joined')
    commonness = db.Column(db.Float, default=0.0)

    # Mapping concepts to concepts,
    # and tracking mentions of other concepts in this concept's summary.
    mentions = db.relationship('Alias', secondary=concepts_mentions, backref=db.backref('concepts'))
    concept_associations = db.relationship(
        ConceptConceptAssociation,
        foreign_keys=[ConceptConceptAssociation.from_concept_slug],
        backref=db.backref('from_concept'),
        cascade='all, delete-orphan')

    # Class-wide defaults. Never mutate this list in place: it is shared
    # by every instance until an instance gets its own copy in `profile`.
    _sources = ['Wikipedia', 'DBpedia']

    def __init__(self, name):
        """
        Initialize a concept by a name, which can be
        an alias (it does not have to be the canonical name).
        This specified name will be saved as an Alias.

        A canonical name will be looked for; if one is found
        it will be used as the slug for this Concept.

        Args:
            | name (str) -- a name (or alias) for the concept
        """
        self.aliases.append(Alias(name))

        # Try to get a canonical URI
        # and derive the slug from that.
        self.uri = knowledge.uri_for_name(name)
        if self.uri:
            self.slug = self.uri.split('/')[-1]
            k = knowledge.knowledge_for(uri=self.uri, fallback=True)
            self.commonness = knowledge.commonness_for_uri(self.uri)

            self.summary = k['summary']
            self.name = k['name']

            # Download the image.
            if k['image'] is not None:
                ext = splitext(k['image'])[-1].lower()
                self.image = storage.save_from_url(
                    k['image'], '{0}{1}'.format(hash(self.slug), ext))

        # If no URI was found,
        # generate our own slug.
        # Note: A problem here is that it assumes that
        # this particular name is the canonical one,
        # and that we don't collect any information for it.
        else:
            self.slug = slugify(name)
            # Commonness is set to default of 0.0,
            # which makes sense because if there's no URI for it
            # it probably is not common at all.

    @property
    def names(self):
        """
        The names of all of this concept's aliases.
        """
        return [alias.name for alias in self.aliases]

    @property
    def sources(self):
        """
        Returns the data sources used for this concept.
        """
        return self._sources

    @property
    def concepts(self):
        """
        Returns the concepts this concept points *to*,
        with their importance scores for this concept.

        If there is a summary but no associations yet, the summary
        is processed first (see `conceptize`).
        """
        if self.summary and not len(self.concept_associations):
            self.conceptize()

        def with_score(assoc):
            assoc.concept.score = assoc.score
            return assoc.concept
        return list(map(with_score, self.concept_associations))

    @property
    def from_concepts(self):
        """
        Returns the concepts that points to this concept,
        with their importance scores for this concept.
        """
        def with_score(assoc):
            assoc.from_concept.score = assoc.score
            return assoc.from_concept
        return list(map(with_score, self.from_concept_associations))

    @property
    def stories(self):
        """
        Return the stories associated with this concept,
        adding an additional "relatedness" value
        which is the concept's importance score for
        a particular story.
        """
        def with_score(assoc):
            assoc.story.relatedness = assoc.score
            return assoc.story
        return list(map(with_score, self.story_associations))

    @property
    def events(self):
        """
        Same as the `stories` property
        but for events.
        """
        def with_score(assoc):
            assoc.event.relatedness = assoc.score
            return assoc.event
        return list(map(with_score, self.event_associations))

    @property
    def articles(self):
        """
        Same as the `stories` property
        but for articles.
        """
        def with_score(assoc):
            assoc.article.relatedness = assoc.score
            return assoc.article
        return list(map(with_score, self.article_associations))

    @property
    def related_concepts(self):
        """
        The concepts this concept points to plus those pointing to it.
        """
        # NOTE(review): `to_concepts` is not defined in this class as
        # visible here; presumably `concepts` was meant -- confirm.
        return self.to_concepts + self.from_concepts

    @property
    def profile(self):
        """
        Returns a data profile specifically
        for this concept's type.

        The profile is fetched once per instance; its 'sources'
        (if any) are added to this instance's sources.
        """
        if not getattr(self, '_profile', None):
            self._profile = knowledge.profiles.get_profile(self.uri)
            # Build a new list rather than using `+=`, which would
            # extend the class-level list shared by all concepts.
            self._sources = list(self._sources) + list(self._profile.get('sources', []))
        return self._profile

    def conceptize(self):
        """
        Process the concept summary for concepts,
        and add the appropriate mentions.

        Replaces `concept_associations` with one association per
        distinct concept found, scored by how often it was found
        (less its commonness) relative to all concepts found.
        """
        concepts = []
        for c_name in gx.concepts(self.summary):
            # Search for the concept.
            uri = knowledge.uri_for_name(c_name)

            if uri:
                slug = uri.split('/')[-1]
            else:
                slug = slugify(c_name)
            c = Concept.query.get(slug)

            # If an concept is found...
            if c:
                # Add this name as a new alias, if necessary.
                alias = Alias.query.filter_by(name=c_name, concept=c).first()
                if not alias:
                    alias = Alias(c_name)
                    c.aliases.append(alias)
                self.mentions.append(alias)

            # If one doesn't exist, create a new one.
            else:
                c = Concept(c_name)
                self.mentions.append(c.aliases[0])
                db.session.add(c)
                db.session.commit()

            concepts.append(c)

        # Score the concepts' importance.
        # (No division by zero: with nothing found there is nothing to score.)
        total_found = len(concepts)
        counter = Counter(concepts)
        uniq_concepts = set(concepts)

        assocs = []
        for concept in uniq_concepts:
            score = (counter[concept] - concept.commonness) / total_found
            assoc = ConceptConceptAssociation(concept, score)
            assocs.append(assoc)

        self.concept_associations = assocs
from datetime import datetime from Crypto.Cipher import AES from argos.datastore import db, Model from flask import current_app from flask.ext.security import Security, UserMixin, RoleMixin # Table connecting users and roles roles_users = db.Table('roles_users', db.Column('user_id', db.Integer(), db.ForeignKey('user.id')), db.Column('role_id', db.Integer(), db.ForeignKey('role.id'))) # Table for users watching stories. users_stories = db.Table('users_stories', db.Column('user_id', db.Integer(), db.ForeignKey('user.id')), db.Column('story_id', db.Integer(), db.ForeignKey('story.id'))) # Table for users bookmarking events. users_events = db.Table('users_events', db.Column('user_id', db.Integer(), db.ForeignKey('user.id')), db.Column('event_id', db.Integer(), db.ForeignKey('event.id'))) # Table for users class AuthExistsForUserException(Exception): pass class Role(Model, RoleMixin): """ A user's Role
from argos.core import knowledge from argos.util import storage import galaxy as gx from slugify import slugify from datetime import datetime from os.path import splitext from sqlalchemy import event from sqlalchemy.ext.declarative import declared_attr from collections import Counter concepts_mentions = db.Table( 'concepts_mentions', db.Column( 'alias_id', db.Integer, db.ForeignKey('alias.id', ondelete='CASCADE', onupdate='CASCADE')), db.Column( 'concept_slug', db.String, db.ForeignKey('concept.slug', ondelete='CASCADE', onupdate='CASCADE'))) class BaseConceptAssociation(Model): """ Models which will be related to concepts must subclass this model and specify a backref name through a class property called `__backref__` and a foreign key property for the related model. Example::
def concept_slug(cls):
    """
    Build the `concept_slug` column: a string primary-key column which
    references `concept.slug`, cascading on both delete and update.
    """
    concept_fk = db.ForeignKey('concept.slug', ondelete='CASCADE', onupdate='CASCADE')
    return db.Column(db.String, concept_fk, primary_key=True)
class Cluster(Clusterable):
    """
    A cluster.

    A Cluster is capable of clustering Clusterables.

    Note: A Cluster itself is a Clusterable; i.e. clusters
    can cluster clusters :)
    """
    __abstract__ = True
    title = db.Column(db.Unicode, default='')
    summary = db.Column(db.UnicodeText)
    image = db.Column(db.String())

    def __str__(self):
        # The column default is only applied on insert, so the title
        # can still be None here; __str__ must return a string.
        return self.title or ''

    def __repr__(self):
        return self.title or ''

    @declared_attr
    def members(cls):
        """
        Build the members attribute from the
        subclass's `__members__` class attribute.

        The relationship is `lazy='dynamic'`, i.e. `members` is a
        query-like object rather than a plain list.

        Example::

            __members__ = {'class_name': 'Article', 'secondary': events_articles, 'backref_name': 'events'}
        """
        args = cls.__members__

        return db.relationship(args['class_name'],
                               secondary=args['secondary'],
                               backref=db.backref(args['backref_name']),
                               lazy='dynamic')

    @staticmethod
    def cluster(cls, clusterables):
        """
        The particular clustering method for this Cluster class.
        Must be implemented on subclasses, otherwise raises NotImplementedError.

        NOTE(review): declared as a staticmethod although it takes `cls`;
        left unchanged to keep the call signature.
        """
        raise NotImplementedError

    def __init__(self, members):
        """
        Initialize a cluster with some members.

        Args:
            | members (list) -- the initial members of the cluster
        """
        self.members = members
        self.update()

    def summarize(self):
        """
        Generate a summary for this cluster.

        Returns:
            | str -- the summary, which is also stored on `summary`
        """
        # `members` is a dynamic relationship, which does not support
        # `len()`; materialize it once instead.
        members = list(self.members)
        if len(members) == 1:
            member = members[0]
            self.summary = ' '.join(summarize(member.title, member.text))
        else:
            self.summary = ' '.join(multisummarize([m.text for m in members]))
        return self.summary

    def conceptize(self):
        """
        Update concepts (and mentions) for this cluster and score them.

        Mentions become the distinct mentions of all members. Each
        concept's score is the sum of its members' scores for it, as a
        share of the total, divided by (commonness + 1).
        """
        members = list(self.members)

        self.mentions = list(set(chain.from_iterable(
            [member.mentions for member in members])))

        # Get all concept associations for this cluster's members.
        assocs = chain.from_iterable(
            [member.concept_associations for member in members])

        # Group associations by their concept.
        # Since `groupby` only looks at adjacent elements,
        # we have to first sort the associations by their concepts' slugs.
        key_func = lambda assoc: assoc.concept.slug
        grouped_assocs = [list(g) for k, g in groupby(sorted(assocs, key=key_func), key_func)]

        # Calculate the raw scores of each concept.
        raw_scores = {}
        for assoc_group in grouped_assocs:
            # Each group points to the same concept, so just grab the first.
            concept = assoc_group[0].concept
            raw_scores[concept] = sum(assoc.score for assoc in assoc_group)
        total = sum(raw_scores.values())

        # Calculate the final scores and create the associations.
        association_model = self.__class__.__concepts__['association_model']
        assocs = []
        for concept, raw_score in raw_scores.items():
            # A zero total would otherwise raise ZeroDivisionError.
            share = raw_score / total if total else 0.0
            # Commonness is None until its column default is applied.
            # +1 to avoid division by zero.
            score = share / ((concept.commonness or 0.0) + 1)
            assocs.append(association_model(concept, score))
        self.concept_associations = assocs

    def update(self):
        """
        Update the cluster's timestamps, summary and concepts.

        Nothing is saved here; committing is up to the caller.
        """
        now = datetime.utcnow()
        self.updated_at = now
        # Only stamp the creation time once; refreshing a cluster
        # must not make it look newly created.
        if self.created_at is None:
            self.created_at = now

        self.summarize()
        self.conceptize()

    def add(self, member):
        """
        Add an member to the cluster.
        """
        self.members.append(member)

    def timespan(self, start, end=None):
        """
        Get cluster members within a certain (date)timespan.

        Both bounds are exclusive.

        Args:
            | start (datetime)
            | end (datetime)    -- default is now (UTC)

        Returns:
            | list -- the members created strictly between `start` and `end`
        """
        if end is None:
            end = datetime.utcnow()
        return [member for member in self.members if start < member.created_at < end]
class Clusterable(Model):
    """
    Abstract base for any model that can be put into a cluster.
    """
    __abstract__ = True
    id = db.Column(db.Integer, primary_key=True)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow)

    @declared_attr
    def concept_associations(cls):
        """
        Create the concept-association relationship described by the
        subclass's `__concepts__` class attribute.

        An association object is used (rather than a plain many-to-many)
        so that each link can carry the importance score of the concept
        for this clusterable. Associations are ordered by descending score.

        The concepts themselves are reached through the `concepts` property.

        The association model should inherit from BaseConceptAssociation.

        Example::

            __concepts__ = {'association_model': ArticleConceptAssociation, 'backref_name': 'article'}
        """
        spec = cls.__concepts__
        model = spec['association_model']

        return db.relationship(model,
                               backref=db.backref(spec['backref_name']),
                               cascade='all, delete, delete-orphan',
                               order_by=model.score.desc())

    @property
    def concepts(self):
        """
        The concepts associated with this model, each tagged with a
        `score` attribute holding its importance for this model.

        This property is read-only: to add a concept, add an instance
        of this model's concept-association model, which is what
        carries the score.
        """
        scored = []
        for assoc in self.concept_associations:
            # Skip concepts without a name (some are extracted but don't
            # map to known entities, so are not given a name).
            if assoc.concept.name is None:
                continue
            assoc.concept.score = assoc.score
            scored.append(assoc.concept)
        return scored

    @property
    def concept_slugs(self):
        """
        The slugs of this model's (named) concepts.
        """
        return [concept.slug for concept in self.concepts]

    @declared_attr
    def mentions(cls):
        """
        Create the mentions relationship (to Alias) described by the
        subclass's `__mentions__` class attribute.

        Example::

            __mentions__ = {'secondary': articles_mentions, 'backref_name': 'articles'}
        """
        spec = cls.__mentions__
        reverse = db.backref(spec['backref_name'])
        return db.relationship('Alias', secondary=spec['secondary'], backref=reverse)
class Cluster(Clusterable):
    """
    A cluster.

    A Cluster is capable of clustering Clusterables.

    Note: A Cluster itself is a Clusterable; i.e. clusters
    can cluster clusters :)
    """
    __abstract__ = True
    title = db.Column(db.Unicode)
    summary = db.Column(db.UnicodeText)
    image = db.Column(db.String())

    @declared_attr
    def members(cls):
        """
        Build the members attribute from the
        subclass's `__members__` class attribute.

        Example::

            __members__ = {'class_name': 'Article', 'secondary': events_articles, 'backref_name': 'events'}
        """
        args = cls.__members__

        return db.relationship(args['class_name'],
                               secondary=args['secondary'],
                               backref=db.backref(args['backref_name']))

    @staticmethod
    def cluster(cls, clusterables):
        """
        The particular clustering method for this Cluster class.
        Must be implemented on subclasses, otherwise raises NotImplementedError.

        NOTE(review): declared as a staticmethod although it takes `cls`;
        left unchanged to keep the call signature.
        """
        raise NotImplementedError

    def __init__(self, members):
        """
        Initialize a cluster with some members.

        Args:
            | members (list) -- the initial members of the cluster
        """
        self.members = members
        self.update()

    def summarize(self):
        """
        Generate a summary for this cluster.

        Returns:
            | str -- the summary, which is also stored on `summary`
        """
        if len(self.members) == 1:
            member = self.members[0]
            self.summary = ' '.join(summarize(member.title, member.text))
        else:
            self.summary = ' '.join(multisummarize([m.text for m in self.members]))
        return self.summary

    def titleize(self):
        """
        Generate a title for this cluster.
        Also selects a representative image.

        Looks for the cluster member that is most similar to the others,
        and then uses the title of that member. The image comes from the
        most similar member which has one. On ties the later member wins.

        A cluster without members keeps its current title and image.
        """
        max_member = (None, 0)
        max_member_w_image = (None, 0)
        for member in self.members:
            avg_sim = self.similarity(member)
            if avg_sim >= max_member[1]:
                max_member = (member, avg_sim)
            if avg_sim >= max_member_w_image[1] and member.image is not None:
                max_member_w_image = (member, avg_sim)

        # No member was selected (e.g. an empty cluster):
        # there is nothing to take a title from.
        if max_member[0] is None:
            return

        self.title = max_member[0].title

        if max_member_w_image[0] is not None:
            self.image = max_member_w_image[0].image

    def entitize(self):
        """
        Update entities for this cluster: the distinct entities
        of all of its members.
        """
        self.entities = list(set(chain.from_iterable(
            [member.entities for member in self.members])))

    def update(self):
        """
        Update the cluster's title, summary, entities and timestamps.

        Nothing is saved here; committing is up to the caller.
        """
        self.titleize()
        self.summarize()
        self.entitize()

        now = datetime.utcnow()
        self.updated_at = now
        # Only stamp the creation time once; refreshing a cluster
        # must not make it look newly created.
        if self.created_at is None:
            self.created_at = now

    def add(self, member):
        """
        Add an member to the cluster.
        """
        self.members.append(member)

    def similarity(self, obj):
        """
        Calculate the similarity of an object with this cluster,
        or the similarity between another cluster and this cluster.
        If it is an object, that object must have a `similarity` method implemented.

        Returns:
            | float -- the average similarity of `obj` to the members,
                       or 0.0 if the cluster has no members
        """
        sims = [obj.similarity(member) for member in self.members]

        # An empty cluster has nothing to be similar to.
        if not sims:
            return 0.0

        # Calculate average similarity.
        return sum(sims) / len(sims)

    def timespan(self, start, end=None):
        """
        Get cluster members within a certain (date)timespan.

        Both bounds are exclusive.

        Args:
            | start (datetime)
            | end (datetime)    -- default is now (UTC)

        Returns:
            | list -- the members created strictly between `start` and `end`
        """
        if end is None:
            end = datetime.utcnow()
        return [member for member in self.members if start < member.created_at < end]
class Article(Clusterable):
    """
    An article.
    """
    __tablename__ = 'article'
    __concepts__ = {'association_model': ArticleConceptAssociation, 'backref_name': 'article'}
    __mentions__ = {'secondary': articles_mentions, 'backref_name': 'articles'}
    title = db.Column(db.Unicode)
    text = db.Column(db.UnicodeText)
    html = db.Column(db.UnicodeText)
    ext_url = db.Column(db.Unicode)
    image = db.Column(db.String)
    ignore = db.Column(db.Boolean, default=False)
    score = db.Column(db.Float, default=0.0)
    source_id = db.Column(db.Integer, db.ForeignKey('source.id'))
    feed_id = db.Column(db.Integer, db.ForeignKey('feed.id'))
    node_id = db.Column(db.Integer, unique=True, index=True)
    authors = db.relationship('Author',
                              secondary=articles_authors,
                              backref=db.backref('articles', lazy='dynamic'))

    # There are some articles which are just noise, and we want to ignore them using regexes for their titles.
    ignore_patterns = [
        # NYT country profiles
        re.compile(r'[A-Z].+\sprofile( - Overview)?')
    ]

    def __str__(self):
        # The title may not be set; __str__ must return a string.
        return self.title or ''

    def __repr__(self):
        return self.title or ''

    def __init__(self, **kwargs):
        """
        Set every keyword argument as an attribute, then extract
        concepts from the text (if any), default the score to 0.0
        and flag the article as ignored if its title says so.
        """
        for key in kwargs:
            setattr(self, key, kwargs[key])

        if self.text is not None:
            self.conceptize()

        if self.score is None:
            self.score = 0.0

        self.check_ignored()

    def check_ignored(self):
        """
        Set `ignore` according to whether the title matches
        any of the `ignore_patterns`.

        Returns:
            | bool -- the new value of `ignore`
        """
        # An article without a title matches nothing
        # (instead of raising a TypeError in `match`).
        title = self.title or ''
        self.ignore = any(pattern.match(title) for pattern in self.ignore_patterns)
        return self.ignore

    def conceptize(self):
        """
        Process the article text for concepts,
        and add the appropriate mentions.

        Replaces `concept_associations` with one association per
        distinct concept found, scored by its share of all the
        concepts found.
        """
        concepts = []
        for c_name in gx.concepts(self.text):
            # Search for the concept.
            uri = knowledge.uri_for_name(c_name)

            if uri:
                slug = uri.split('/')[-1]
            else:
                slug = slugify(c_name)
            c = Concept.query.get(slug)

            # If an concept is found...
            if c:
                # Add this name as a new alias, if necessary.
                alias = Alias.query.filter_by(name=c_name, concept=c).first()
                if not alias:
                    alias = Alias(c_name)
                    c.aliases.append(alias)
                # Avoid duplicate aliases.
                if alias not in self.mentions:
                    self.mentions.append(alias)

            # If one doesn't exist, create a new one.
            else:
                c = Concept(c_name)
                self.mentions.append(c.aliases[0])
                db.session.add(c)
                db.session.commit()

            concepts.append(c)

        # Score the concepts' importance.
        # (No division by zero: with nothing found there is nothing to score.)
        total_found = len(concepts)
        counter = Counter(concepts)
        uniq_concepts = set(concepts)

        assocs = []
        for concept in uniq_concepts:
            score = counter[concept] / total_found
            assoc = ArticleConceptAssociation(concept, score)
            assocs.append(assoc)

        self.concept_associations = assocs

    @property
    def published(self):
        """Convert datetime to seconds"""
        # If not timezone is set, assume UTC.
        # super annoying and it's probably not a good guess but it's
        # all we got for now.
        # In production, we will be setting article publish times as utc when
        # we fetch them, so it should be less of a problem there.
        #
        # Start from the stored value so that a datetime which already
        # has a timezone is used as-is (instead of leaving the local
        # name unbound).
        created_at = self.created_at
        if created_at.tzinfo is None:
            created_at = created_at.replace(tzinfo=pytz.UTC)
        delta = created_at - epoch
        return delta.total_seconds()
class Article(Clusterable):
    """
    An article.
    """
    __tablename__ = 'article'
    __entities__ = {'secondary': articles_entities, 'backref_name': 'articles'}
    # Cached result of `vectorize`.
    vectors = db.Column(db.PickleType)
    title = db.Column(db.Unicode)
    text = db.Column(db.UnicodeText)
    html = db.Column(db.UnicodeText)
    ext_url = db.Column(db.Unicode)
    image = db.Column(db.String())
    source_id = db.Column(db.Integer, db.ForeignKey('source.id'))
    authors = db.relationship('Author',
                              secondary=articles_authors,
                              backref=db.backref('articles', lazy='dynamic'))

    def __init__(self, **kwargs):
        """
        Set every keyword argument as an attribute; if there is text,
        extract its entities and compute its vectors.
        """
        for attr, value in kwargs.items():
            setattr(self, attr, value)

        if self.text is None:
            return
        self.entitize()
        self.vectorize()

    def vectorize(self):
        """
        Return the vectors representing this article, computing and
        storing them on first use.

        The result is a two-item list:

            [bag of words vector, entities vector]
        """
        if self.vectors is not None:
            return self.vectors

        words = vectorize(self.text)
        names = vectorize(' '.join(entities(self.text)))
        self.vectors = [words, names]
        return self.vectors

    def entitize(self):
        """
        Extract the entities named in the article text and set them
        as this article's entities, creating (and committing) any
        Entity that does not exist yet.
        """
        found = []
        for e_name in entities(self.text):
            # TO DO: Need to find a way of getting canonical name.
            entity = Entity.query.get(slugify(e_name))

            if not entity:
                entity = Entity(e_name)
                db.session.add(entity)
                db.session.commit()

            found.append(entity)
        self.entities = found

    def similarity(self, article):
        """
        Calculate the similarity between this article and another.

        The score is a weighted average of the text-vector similarity,
        the entity-vector similarity and a publication-time score,
        with weights 2, 1 and 2 respectively.

        Args:
            | article (Article) -- the article to compare against

        Returns:
            | float -- the weighted similarity
        """
        # Weights: [text vector, entity vector, publication date]
        coefs = [2, 1, 2]

        own_vectors = self.vectorize()
        other_vectors = article.vectorize()

        sim = 0
        for idx, own_vec in enumerate(own_vectors):
            dist = jaccard(other_vectors[idx], own_vec)

            # Two empty vectors give a jaccard distance of NaN
            # (and a warning). Treat that as a distance of 1,
            # i.e. the two have nothing in common.
            if isnan(dist):
                dist = 1

            sim += (coefs[idx] * (1 - dist))

        # Also take publication dates into account.
        ideal_time = 259200  # 3 days, in seconds

        # Absolute gap between the two creation times.
        time_diff = abs(self.created_at - article.created_at).total_seconds()

        # The time score is 1 inside the ideal window and approaches 0
        # the further the gap grows beyond it.
        if time_diff < ideal_time:
            time_score = 1
        else:
            time_score = ideal_time / time_diff
        sim += (coefs[2] * time_score)

        # Normalize by the total weight.
        return sim / sum(coefs)
from argos.core.models.cluster import Clusterable from argos.core.brain import vectorize, entities from scipy.spatial.distance import jaccard from math import isnan from slugify import slugify # Ignore the invalid numpy warning, # which comes up when jaccard uses # empty vectors. import numpy numpy.seterr(invalid='ignore') articles_authors = db.Table( 'authors', db.Column('author_id', db.Integer, db.ForeignKey('author.id')), db.Column('article_id', db.Integer, db.ForeignKey('article.id'))) articles_entities = db.Table( 'articles_entities', db.Column('entity_slug', db.String, db.ForeignKey('entity.slug')), db.Column('article_id', db.Integer, db.ForeignKey('article.id'))) class Article(Clusterable): """ An article. """ __tablename__ = 'article' __entities__ = {'secondary': articles_entities, 'backref_name': 'articles'} vectors = db.Column(db.PickleType)
class Event(Cluster):
    """
    An event: a cluster whose members are Articles.
    """
    __tablename__ = 'event'
    __members__ = {
        'class_name': 'Article',
        'secondary': events_articles,
        'backref_name': 'events',
    }
    __concepts__ = {
        'association_model': EventConceptAssociation,
        'backref_name': 'event',
    }
    __mentions__ = {'secondary': events_mentions, 'backref_name': 'events'}

    active = db.Column(db.Boolean, default=True)
    # Sum of the members' scores.
    raw_score = db.Column(db.Float, default=0.0)
    # Last value computed by `calculate_score`.
    _score = db.Column(db.Float, default=0.0)

    @classmethod
    def all_active(cls):
        """
        Fetch every event whose `active` flag is set.
        """
        active_only = cls.query.filter_by(active=True)
        return active_only.all()

    @property
    def articles(self):
        """
        The members of this event, loaded as a list.
        """
        return self.members.all()

    @articles.setter
    def articles(self, value):
        self.members = value

    @property
    def num_articles(self):
        """
        The number of members, obtained with a count query.
        """
        return self.members.count()

    @property
    def images(self):
        """
        The images of all members which have one.
        """
        found = []
        for member in self.members:
            if member.image is not None:
                found.append(member.image)
        return found

    @property
    def summary_sentences(self):
        """
        Split the summary back into sentences and trace each one to
        the first member whose title and text contain it.

        Returns a list of dicts with the keys 'sentence', 'source' and
        'url'; the latter two are None when no member contains the
        sentence.
        """
        def origin_of(sentence):
            # First member whose "title text" contains the sentence.
            for a in self.members:
                if sentence in ' '.join([a.title, a.text]):
                    return a
            return None

        data = []
        for sent in sent_tokenize(self.summary):
            entry = {'sentence': sent}
            article = origin_of(sent)
            if article is None:
                entry['source'] = None
                entry['url'] = None
            else:
                entry['source'] = article.source.name
                entry['url'] = article.ext_url
            data.append(entry)
        return data

    @property
    def top_concepts(self):
        """
        The first ten of this event's concepts.
        """
        return self.concepts[:10]

    @property
    def score(self):
        """
        The event's score, recalculated on every access
        and stored on the event as a side effect.
        """
        fresh = self.calculate_score()
        self._score = fresh
        return self._score

    def calculate_score(self):
        """
        Calculate a score for the event from its `raw_score`
        (the sum of its members' scores), adjusted by how
        recently the event was updated.

        Currently this uses the Reddit 'hot' formula,
        see: http://amix.dk/blog/post/19588
        """
        # Fill in the raw score if there isn't one yet.
        if not self.raw_score:
            self.raw_score = sum([member.score for member in self.members])
        score = self.raw_score

        # Seconds between the Unix epoch and the last update.
        epoch = datetime(1970, 1, 1)
        td = self.updated_at - epoch
        epoch_seconds = td.days * 86400 + td.seconds + (float(td.microseconds) / 1000000)

        order = log(max(abs(score), 1), 10)

        if score > 0:
            sign = 1
        elif score < 0:
            sign = -1
        else:
            sign = 0

        seconds = epoch_seconds - 1134028003
        return round(order + sign * seconds / 45000, 7)

    def update_score(self):
        """
        Recompute the raw score from the members,
        then cache a fresh score.
        """
        self.raw_score = sum([member.score for member in self.members])
        self._score = self.calculate_score()

    @property
    def member_concept_slugs(self):
        """
        The concept slugs of all of this event's articles,
        joined by spaces. Computed once per instance.
        """
        if getattr(self, '_mem_slugs', None) is None:
            per_article = [' '.join(a.concept_slugs) for a in self.articles]
            self._mem_slugs = ' '.join(per_article)
        return self._mem_slugs

    @property
    def text(self):
        """
        The titles and texts of all of this event's articles,
        joined by spaces. Computed once per instance.
        """
        if getattr(self, '_text', None) is None:
            per_article = [' '.join([a.title, a.text]) for a in self.articles]
            self._text = ' '.join(per_article)
        return self._text

    def summarize(self):
        """
        Build, store and return a summary of this event's members.

        A single member is summarized from its title and text;
        several members are summarized together from their texts.
        """
        if self.members.count() == 1:
            only = self.members[0]
            summary_sentences = summarizer.summarize(only.title, only.text)
        else:
            summary_sentences = summarizer.multisummarize([m.text for m in self.members])
        self.summary = ' '.join(summary_sentences)
        return self.summary