def members(cls):
    """
    Build the members relationship from the subclass's `__members__`
    class attribute.

    `__members__` is read as a mapping with these keys:

        * 'class_name'   -- first argument handed to `db.relationship`
        * 'secondary'    -- handed on as the `secondary` argument
        * 'backref_name' -- name handed to `db.backref`

    Example::

        __members__ = {'class_name': 'Article',
                       'secondary': events_articles,
                       'backref_name': 'events'}
    """
    spec = cls.__members__
    target = spec['class_name']
    link_table = spec['secondary']
    reverse = db.backref(spec['backref_name'])
    return db.relationship(target, secondary=link_table, backref=reverse)
class ConceptConceptAssociation(BaseConceptAssociation):
    """
    Association object between two concepts.

    Both slug columns are foreign keys onto `concept.slug` and both are
    flagged as primary key columns. The `concept` relationship is resolved
    through `concept_slug` and installs a `from_concept_associations`
    backref on the `Concept` side.
    """
    from_concept_slug = db.Column(
        db.String,
        db.ForeignKey('concept.slug',
                      onupdate='CASCADE',
                      ondelete='CASCADE'),
        primary_key=True,
    )
    concept_slug = db.Column(
        db.String,
        db.ForeignKey('concept.slug',
                      onupdate='CASCADE',
                      ondelete='CASCADE'),
        primary_key=True,
    )

    # Two foreign keys point at the same table, so the one backing this
    # relationship is named explicitly.
    concept = db.relationship(
        'Concept',
        foreign_keys=[concept_slug],
        backref=db.backref('from_concept_associations'),
    )
def mentions(cls):
    """
    Build the mentions relationship from the subclass's `__mentions__`
    class attribute.

    The relationship always targets 'Alias'; `__mentions__` supplies the
    'secondary' table and the 'backref_name'.

    Example::

        __mentions__ = {'secondary': articles_mentions,
                        'backref_name': 'articles'}
    """
    spec = cls.__mentions__
    link_table = spec['secondary']
    reverse = db.backref(spec['backref_name'])
    return db.relationship('Alias', secondary=link_table, backref=reverse)
def entities(cls):
    """
    Build the entities relationship from the subclass's `__entities__`
    class attribute.

    The relationship always targets 'Entity'; `__entities__` supplies the
    'secondary' table and the 'backref_name'.

    Example::

        __entities__ = {'secondary': articles_entities,
                        'backref_name': 'articles'}
    """
    spec = cls.__entities__
    link_table = spec['secondary']
    reverse = db.backref(spec['backref_name'])
    return db.relationship('Entity', secondary=link_table, backref=reverse)
def concept_associations(cls):
    """
    Build the concept associations relationship from the subclass's
    `__concepts__` class attribute.

    An association object is used (rather than a plain secondary table)
    so that an extra property can be tracked: the importance score of a
    particular concept to a given clusterable. The clusterable's concepts
    are accessed directly through the `concepts` property.

    The association model should inherit from BaseConceptAssociation; its
    `score` attribute is used to order the relationship, highest first.

    Example::

        __concepts__ = {'association_model': ArticleConceptAssociation,
                        'backref_name': 'article'}
    """
    spec = cls.__concepts__
    model = spec['association_model']
    reverse = db.backref(spec['backref_name'])
    return db.relationship(
        model,
        backref=reverse,
        cascade='all, delete, delete-orphan',
        order_by=model.score.desc(),
    )
class User(Model, UserMixin):
    """
    A user.

    Attributes:

        * id            -> Integer (Primary Key)
        * email         -> String (Unique)
        * image         -> String (Unique)
        * name          -> String (Unique)
        * password      -> String
        * active        -> Bool
        * confirmed_at  -> DateTime
        * auths         -> [Auth]
        * roles         -> [Role]
        * watching      -> [Story]
        * bookmarked    -> [Event]
        * created_at    -> DateTime
        * updated_at    -> DateTime
    """
    id = db.Column(db.Integer(), primary_key=True)
    email = db.Column(db.String(255), unique=True)
    image = db.Column(db.String(255), unique=True)
    name = db.Column(db.String(255), unique=True)
    password = db.Column(db.String(255))
    active = db.Column(db.Boolean())
    confirmed_at = db.Column(db.DateTime())
    auths = db.relationship('Auth', backref='user', lazy='dynamic')
    roles = db.relationship(
        'Role',
        secondary=roles_users,
        backref=db.backref('users', lazy='dynamic'),
    )
    watching = db.relationship(
        'Story',
        secondary=users_stories,
        backref=db.backref('watchers', lazy='joined'),
    )
    bookmarked = db.relationship(
        'Event',
        secondary=users_events,
        backref=db.backref('bookmarkers', lazy='joined'),
    )
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow)

    def __init__(self, auth=None, **kwargs):
        """
        Set every keyword argument as an attribute on the new user.

        The `auth` parameter is accepted but is not used by this
        initializer.
        """
        for attr, value in kwargs.items():
            setattr(self, attr, value)

    def add_provider(self, provider, provider_id, access_token,
                     access_token_secret=None, update=True):
        """
        Attach a provider authentication to this user and commit.

        Raises an AuthExistsForUserException if the authentication
        already exists and belongs to a different user.

        Args:
            | provider (str)            -- the provider name, e.g. 'twitter'
            | provider_id (str)         -- the id assigned by the provider
            | access_token (str)        -- the access token
            | access_token_secret (str) -- the access token secret
            | update (bool)             -- whether or not to update the
                                           existing provider authentication,
                                           if found (default: True)

        Returns:
            | the Auth that was found or created
        """
        auth = Auth.for_provider(provider, provider_id)

        if not auth:
            # Nothing on record yet: create it and tie it to this user.
            auth = Auth(provider, provider_id,
                        access_token, access_token_secret)
            auth.user = self
            db.session.add(auth)
        else:
            if auth.user is not self:
                raise AuthExistsForUserException('Found an existing authorization for {0} associated with another user.'.format(provider))
            if update:
                auth.update_token(access_token, access_token_secret)

        db.session.commit()
        return auth

    def merge(self, user):
        """
        Merge another user into this one, then delete the other user.

        *This* user is the canonical one, i.e. its attributes are
        preferred over the other user's.

        UI tip: prompt the user to pick which account is their primary one!
        """
        own_providers = [auth.provider for auth in self.auths]

        for other_auth in user.auths:
            # If the merged user has an authentication for a provider this
            # user already has, the one on this user wins and the other is
            # left untouched. Not expected to happen often, but possible,
            # e.g. two twitter accounts authenticated on two user accounts.
            if other_auth.provider in own_providers:
                continue
            other_auth.user = self

        db.session.delete(user)
        db.session.commit()

    @staticmethod
    def for_provider(provider, provider_id):
        """
        Find a User instance by provider.

        Args:
            | provider (str)    -- the provider name, e.g. 'twitter'
            | provider_id (str) -- the user id assigned by the provider

        Returns:
            | the user of the matching Auth, or None if no Auth matched
        """
        auth = Auth.for_provider(provider, provider_id)
        return auth.user if auth else None
class Concept(Model):
    """
    A concept, which could be a place, person, organization, topic, etc.

    You should *not* set the `slug` or `uri`; they are set automatically
    according to the `name`. In the spirit of Python's developer maturity,
    you're trusted not to modify them.
    """
    name = db.Column(db.UnicodeText)
    slug = db.Column(db.String(255), primary_key=True)
    uri = db.Column(db.String)
    summary = db.Column(db.UnicodeText)
    image = db.Column(db.String)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    aliases = db.relationship('Alias', backref='concept', lazy='joined')
    commonness = db.Column(db.Float, default=0.0)

    # Mapping concepts to concepts,
    # and tracking mentions of other concepts in this concept's summary.
    mentions = db.relationship('Alias',
                               secondary=concepts_mentions,
                               backref=db.backref('concepts'))
    concept_associations = db.relationship(
        ConceptConceptAssociation,
        foreign_keys=[ConceptConceptAssociation.from_concept_slug],
        backref=db.backref('from_concept'),
        cascade='all, delete-orphan')

    # Class-level default. It must never be mutated in place, otherwise
    # every instance would see the change (see `profile`).
    _sources = ['Wikipedia', 'DBpedia']

    def __init__(self, name):
        """
        Initialize a concept by a name, which can be an alias (it does not
        have to be the canonical name).

        The specified name is saved as an Alias. A canonical URI is looked
        up for it; if one is found, the slug is derived from that URI and
        the name, summary, commonness and image are filled in from the
        knowledge lookup. Otherwise the slug is generated from the name.

        Args:
            | name (str) -- a name (or alias) for the concept
        """
        self.aliases.append(Alias(name))

        # Try to get a canonical URI
        # and derive the slug from that.
        self.uri = knowledge.uri_for_name(name)
        if self.uri:
            self.slug = self.uri.split('/')[-1]
            k = knowledge.knowledge_for(uri=self.uri, fallback=True)
            self.commonness = knowledge.commonness_for_uri(self.uri)

            self.summary = k['summary']
            self.name = k['name']

            # Download the image.
            # NOTE(review): the filename is built from hash() of the slug;
            # string hashes may differ between interpreter runs -- confirm
            # that a stable filename is not required.
            if k['image'] is not None:
                ext = splitext(k['image'])[-1].lower()
                self.image = storage.save_from_url(
                    k['image'], '{0}{1}'.format(hash(self.slug), ext))

        # If no URI was found,
        # generate our own slug.
        # Note: A problem here is that it assumes that
        # this particular name is the canonical one,
        # and that we don't collect any information for it.
        else:
            self.slug = slugify(name)
            # Commonness is left at its default of 0.0,
            # which makes sense because if there's no URI for it
            # it probably is not common at all.

    @property
    def names(self):
        """
        Returns the names of all of this concept's aliases.
        """
        return [alias.name for alias in self.aliases]

    @property
    def sources(self):
        """
        Returns the data sources used for this concept.
        """
        return self._sources

    @property
    def concepts(self):
        """
        Returns the concepts this concept points *to*,
        with their importance scores for this concept.

        If there is a summary but no associations yet, the summary is
        processed first (see `conceptize`).
        """
        if self.summary and not self.concept_associations:
            self.conceptize()

        def with_score(assoc):
            assoc.concept.score = assoc.score
            return assoc.concept
        return [with_score(assoc) for assoc in self.concept_associations]

    @property
    def from_concepts(self):
        """
        Returns the concepts that point to this concept,
        with their importance scores for this concept.
        """
        def with_score(assoc):
            assoc.from_concept.score = assoc.score
            return assoc.from_concept
        return [with_score(assoc) for assoc in self.from_concept_associations]

    @property
    def stories(self):
        """
        Return the stories associated with this concept,
        adding an additional "relatedness" value
        which is the concept's importance score for
        a particular story.
        """
        # `story_associations` is not defined in this class; presumably a
        # backref installed by the story association model.
        def with_score(assoc):
            assoc.story.relatedness = assoc.score
            return assoc.story
        return [with_score(assoc) for assoc in self.story_associations]

    @property
    def events(self):
        """
        Same as the `stories` property
        but for events.
        """
        def with_score(assoc):
            assoc.event.relatedness = assoc.score
            return assoc.event
        return [with_score(assoc) for assoc in self.event_associations]

    @property
    def articles(self):
        """
        Same as the `stories` property
        but for articles.
        """
        def with_score(assoc):
            assoc.article.relatedness = assoc.score
            return assoc.article
        return [with_score(assoc) for assoc in self.article_associations]

    @property
    def related_concepts(self):
        """
        Returns the concepts this concept points to, followed by the
        concepts that point to this concept.
        """
        # The "points to" side is exposed as `concepts`; there is no
        # `to_concepts` attribute on this class.
        return self.concepts + self.from_concepts

    @property
    def profile(self):
        """
        Returns a data profile specifically for this concept's type.

        The profile is fetched once and cached on the instance; any
        'sources' it lists are added to this concept's sources.
        """
        if not getattr(self, '_profile', None):
            self._profile = knowledge.profiles.get_profile(self.uri)
            # Bind a new list on the instance instead of using `+=`, which
            # would extend the class-level `_sources` list shared by every
            # Concept.
            extra_sources = list(self._profile.get('sources', []))
            self._sources = self._sources + extra_sources
        return self._profile

    def conceptize(self):
        """
        Process the concept summary for concepts,
        and add the appropriate mentions.

        Each found concept is scored as
        `(occurrences - commonness) / total occurrences found`, and the
        resulting associations replace `concept_associations`.
        """
        concepts = []
        for c_name in gx.concepts(self.summary):
            # Search for the concept.
            uri = knowledge.uri_for_name(c_name)
            if uri:
                slug = uri.split('/')[-1]
            else:
                slug = slugify(c_name)
            c = Concept.query.get(slug)

            if c:
                # Add this name as a new alias, if necessary.
                alias = Alias.query.filter_by(name=c_name, concept=c).first()
                if not alias:
                    alias = Alias(c_name)
                    c.aliases.append(alias)
                # Avoid duplicate aliases.
                if alias not in self.mentions:
                    self.mentions.append(alias)
            else:
                # If one doesn't exist, create a new one.
                c = Concept(c_name)
                self.mentions.append(c.aliases[0])
                db.session.add(c)
                db.session.commit()

            concepts.append(c)

        # Score the concepts' importance.
        total_found = len(concepts)
        counter = Counter(concepts)
        self.concept_associations = [
            ConceptConceptAssociation(
                concept, (count - concept.commonness) / total_found)
            for concept, count in counter.items()
        ]
class Article(Clusterable):
    """
    An article.
    """
    __tablename__ = 'article'
    __concepts__ = {
        'association_model': ArticleConceptAssociation,
        'backref_name': 'article',
    }
    __mentions__ = {'secondary': articles_mentions,
                    'backref_name': 'articles'}
    title = db.Column(db.Unicode)
    text = db.Column(db.UnicodeText)
    html = db.Column(db.UnicodeText)
    ext_url = db.Column(db.Unicode)
    image = db.Column(db.String)
    ignore = db.Column(db.Boolean, default=False)
    score = db.Column(db.Float, default=0.0)
    source_id = db.Column(db.Integer, db.ForeignKey('source.id'))
    feed_id = db.Column(db.Integer, db.ForeignKey('feed.id'))
    node_id = db.Column(db.Integer, unique=True, index=True)
    authors = db.relationship(
        'Author',
        secondary=articles_authors,
        backref=db.backref('articles', lazy='dynamic'))

    # There are some articles which are just noise,
    # and we want to ignore them using regexes for their titles.
    ignore_patterns = [
        # NYT country profiles
        re.compile(r'[A-Z].+\sprofile( - Overview)?')
    ]

    def __str__(self):
        return self.title

    def __repr__(self):
        return self.title

    def __init__(self, **kwargs):
        """
        Set every keyword argument as an attribute, then process the text
        for concepts (if there is any text), default the score to 0.0 and
        evaluate the ignore patterns against the title.
        """
        for key in kwargs:
            setattr(self, key, kwargs[key])

        if self.text is not None:
            self.conceptize()

        if self.score is None:
            self.score = 0.0

        self.check_ignored()

    def check_ignored(self):
        """
        Set and return `ignore`: True if the title matches any of the
        `ignore_patterns`, False otherwise.

        An article without a title is treated as having an empty title
        rather than raising a TypeError from the regex match.
        """
        title = self.title or ''
        self.ignore = any(
            pattern.match(title) for pattern in self.ignore_patterns)
        return self.ignore

    def conceptize(self):
        """
        Process the article text for concepts,
        and add the appropriate mentions.

        Each found concept is scored as its share of all concept
        occurrences found in the text, and the resulting associations
        replace `concept_associations`.
        """
        concepts = []
        for c_name in gx.concepts(self.text):
            # Search for the concept.
            uri = knowledge.uri_for_name(c_name)
            if uri:
                slug = uri.split('/')[-1]
            else:
                slug = slugify(c_name)
            c = Concept.query.get(slug)

            if c:
                # Add this name as a new alias, if necessary.
                alias = Alias.query.filter_by(name=c_name, concept=c).first()
                if not alias:
                    alias = Alias(c_name)
                    c.aliases.append(alias)
                # Avoid duplicate aliases.
                if alias not in self.mentions:
                    self.mentions.append(alias)
            else:
                # If one doesn't exist, create a new one.
                c = Concept(c_name)
                self.mentions.append(c.aliases[0])
                db.session.add(c)
                db.session.commit()

            concepts.append(c)

        # Score the concepts' importance.
        total_found = len(concepts)
        counter = Counter(concepts)
        self.concept_associations = [
            ArticleConceptAssociation(concept, count / total_found)
            for concept, count in counter.items()
        ]

    @property
    def published(self):
        """
        Return `created_at` as the number of seconds since `epoch`.
        """
        # `created_at` is not declared in this class; presumably it comes
        # from Clusterable. `epoch` is a module-level name, presumably a
        # timezone-aware datetime -- TODO confirm.
        created_at = self.created_at

        # If no timezone is set, assume UTC.
        # Super annoying and it's probably not a good guess but it's
        # all we got for now.
        # In production, we will be setting article publish times as utc
        # when we fetch them, so it should be less of a problem there.
        if created_at.tzinfo is None:
            created_at = created_at.replace(tzinfo=pytz.UTC)

        # A timestamp that already carries a timezone is used as-is.
        delta = created_at - epoch
        return delta.total_seconds()
class Article(Clusterable):
    """
    An article.
    """
    __tablename__ = 'article'
    __entities__ = {'secondary': articles_entities,
                    'backref_name': 'articles'}
    vectors = db.Column(db.PickleType)
    title = db.Column(db.Unicode)
    text = db.Column(db.UnicodeText)
    html = db.Column(db.UnicodeText)
    ext_url = db.Column(db.Unicode)
    image = db.Column(db.String())
    source_id = db.Column(db.Integer, db.ForeignKey('source.id'))
    authors = db.relationship(
        'Author',
        secondary=articles_authors,
        backref=db.backref('articles', lazy='dynamic'),
    )

    def __init__(self, **kwargs):
        """
        Set every keyword argument as an attribute; if the article has
        text, extract its entities and build its vectors.
        """
        for attr, value in kwargs.items():
            setattr(self, attr, value)

        if self.text is not None:
            self.entitize()
            self.vectorize()

    def vectorize(self):
        """
        Returns the list of vectors representing this article,
        building and storing it on first use.

        Articles are represented by:
            [bag of words vector, entities vector]
        """
        if self.vectors is not None:
            return self.vectors

        text_vector = vectorize(self.text)
        entity_vector = vectorize(' '.join(entities(self.text)))
        self.vectors = [text_vector, entity_vector]
        return self.vectors

    def entitize(self):
        """
        Process the article text for entities and
        store them on `entities`.
        """
        found = []
        for e_name in entities(self.text):
            # TO DO: Need to find a way of getting canonical name.

            # Look the entity up by its slug;
            # create and commit it if it is missing.
            entity = Entity.query.get(slugify(e_name))
            if not entity:
                entity = Entity(e_name)
                db.session.add(entity)
                db.session.commit()
            found.append(entity)
        self.entities = found

    def similarity(self, article):
        """
        Calculate the similarity between this article
        and another article.

        The result is a weighted combination of the text vector
        similarity, the entity vector similarity and a publication
        time score, divided by the sum of the weights.
        """
        own_vectors = self.vectorize()
        other_vectors = article.vectorize()

        # Weights of the linear combination:
        # [text vector, entity vector, publication date]
        coefs = [2, 1, 2]

        sim = 0
        for i, own_vector in enumerate(own_vectors):
            dist = jaccard(other_vectors[i], own_vector)

            # Two empty vectors give a jaccard distance of NaN.
            # Treat that as 1, i.e. completely different
            # (put more clearly: they have nothing in common).
            # FYI if jaccard runs on empty vectors, it will throw a warning.
            if isnan(dist):
                dist = 1

            sim += coefs[i] * (1 - dist)

        # Also take publication dates into account.
        ideal_time = 259200  # 3 days, in seconds

        # Absolute gap between the two creation times, in seconds.
        time_diff = abs(self.created_at - article.created_at).total_seconds()

        # Score is normalized [0, 1], where 1 is within the ideal time,
        # and approaches 0 the longer the difference is from the ideal time.
        if time_diff < ideal_time:
            time_score = 1
        else:
            time_score = ideal_time / time_diff
        sim += coefs[2] * time_score

        # Normalize back to [0, 1].
        return sim / sum(coefs)