def cluster(articles, threshold=0.7, debug=False):
    """
    Clusters a set of articles into existing events (or creates new ones).

    Args:
        | articles (list) -- the Articles to cluster
        | threshold (float) -- the similarity threshold for qualifying a cluster
        | debug (bool) -- will log clustering info if True

    Returns:
        | list -- the Events that were created or updated.
    """
    log = logger('EVENT_CLUSTERING')
    if debug:
        log.setLevel('DEBUG')
    else:
        log.setLevel('ERROR')

    updated_clusters = []
    # Only currently-active events are considered as clustering targets.
    active_clusters = Event.query.filter_by(active=True).all()
    # NOTE(review): naive (non-tz-aware) timestamp; assumes Event.updated_at
    # is also naive UTC -- confirm against the model definition.
    now = datetime.utcnow()

    for article in articles:
        # Select candidate clusters,
        # i.e. active clusters which share at least one entity with this article.
        a_ents = [entity.slug for entity in article.entities]
        candidate_clusters = []
        for c in active_clusters:
            c_ents = [entity.slug for entity in c.entities]
            if set(c_ents).intersection(a_ents):
                candidate_clusters.append(c)

        # NOTE(review): as written in this chunk, `cluster` resolves to this
        # very function, whose signature has no `logger` kwarg -- this call
        # would raise TypeError. Presumably a generic clustering helper
        # (e.g. a shared `cluster(obj, clusters, ...)`) was intended to be in
        # scope here; confirm the module's imports.
        selected_cluster = cluster(article, candidate_clusters, threshold=threshold, logger=log)

        # If no selected cluster was found, then create a new one.
        if not selected_cluster:
            log.debug('No qualifying clusters found, creating a new cluster.')
            selected_cluster = Event([article])
            db.session.add(selected_cluster)
        updated_clusters.append(selected_cluster)

    for clus in active_clusters:
        # Mark expired clusters inactive.
        if (now - clus.updated_at).days > 3:
            clus.active = False
        else:
            clus.update()

    db.session.commit()
    return updated_clusters
def cluster(articles, threshold=0.7, debug=False):
    """
    Clusters a set of articles into existing events (or creates new ones).

    Args:
        | articles (list) -- the Articles to cluster
        | threshold (float) -- the similarity threshold for qualifying a cluster
        | debug (bool) -- will log clustering info if True

    Returns:
        | list -- the Events that were created or updated.
    """
    log = logger('EVENT_CLUSTERING')
    log.setLevel('DEBUG' if debug else 'ERROR')

    updated_clusters = []
    active_clusters = Event.query.filter_by(active=True).all()
    now = datetime.utcnow()

    for article in articles:
        # Candidate clusters are the active clusters which share
        # at least one entity with this article.
        article_slugs = set(entity.slug for entity in article.entities)
        candidates = [
            candidate for candidate in active_clusters
            if article_slugs.intersection(
                entity.slug for entity in candidate.entities)
        ]

        selected = cluster(article, candidates, threshold=threshold, logger=log)

        # No qualifying cluster? Start a fresh one for this article.
        if not selected:
            log.debug('No qualifying clusters found, creating a new cluster.')
            selected = Event([article])
            db.session.add(selected)
        updated_clusters.append(selected)

    # Expire clusters that have gone stale; refresh the rest.
    for active in active_clusters:
        if (now - active.updated_at).days > 3:
            active.active = False
        else:
            active.update()

    db.session.commit()
    return updated_clusters
def cluster(events, threshold=0.7, debug=False):
    """
    Clusters a set of events into existing stories (or creates new ones).

    Args:
        | events (list) -- the Events to cluster
        | threshold (float) -- the similarity threshold for qualifying a cluster
        | debug (bool) -- will log clustering info if True

    Returns:
        | clusters (list) -- the list of updated clusters
    """
    log = logger('STORY_CLUSTERING')
    if debug:
        log.setLevel('DEBUG')
    else:
        log.setLevel('ERROR')

    updated_clusters = []

    for event in events:
        # Find stories which have some matching entities with this event.
        # NOTE(review): this filters Story.query on Entity.name without a
        # visible join between Story and Entity -- confirm the ORM resolves
        # the relationship; as written it may select unrelated stories.
        candidate_clusters = Story.query.filter(Entity.name.in_([entity.name for entity in event.entities])).all()

        # Cluster this event.
        # NOTE(review): within this chunk, `cluster` is this very function,
        # which accepts no `logger` kwarg -- presumably a generic clustering
        # helper was intended to be in scope. TODO confirm imports.
        selected_cluster = cluster(event, candidate_clusters, threshold=threshold, logger=log)

        # If no selected cluster was found, then create a new one.
        if not selected_cluster:
            log.debug('No qualifying clusters found, creating a new cluster.')
            selected_cluster = Story([event])
            db.session.add(selected_cluster)
        updated_clusters.append(selected_cluster)

    db.session.commit()
    return updated_clusters
import os
import cProfile, pstats

from argos.datastore import db
from argos.core.models import Event, Article
from argos.util.logger import logger
from argos.util.progress import progress_bar

# Logging.
# NOTE: rebinds the imported `logger` factory to the module-level Logger
# instance it produces; from here on `logger` is a Logger, not the factory.
logger = logger(__name__)

def evaluate_clustering():
    """
    Evaluate the clustering algorithm.

    Walks 'manage/evaluate/organized_articles' collecting .txt files, whose
    directory layout encodes the expected (ground-truth) clusters.

    NOTE(review): this chunk appears truncated -- the function ends at the
    'Create articles' comment with no following code; the remainder is
    outside this view.
    """
    logger.info('Constructing expected clusters and articles...')
    expected_clusters = {}
    articles = []
    all_files = []

    # Collect all appropriate files.
    for dir, subdir, files in os.walk('manage/evaluate/organized_articles'):
        for file in files:
            filepath = os.path.join(dir, file)
            name, ext = os.path.splitext(filepath)
            if ext == '.txt':
                all_files.append((dir, name, filepath))

    # Create articles for appropriate files.
    Example::

        # Print articles from a feed.
        site = 'http://www.polygon.com/'
        feed_url = find_feed(site)
        source = Source(feed_url)
        print(articles(source))
"""

from argos.datastore import db
from argos.core.models import Source, Feed
from argos.core.membrane import feedfinder
from argos.util.logger import logger

# NOTE: rebinds the imported `logger` factory to the Logger it returns.
logger = logger(__name__)

def find_feed(url):
    """
    Find the RSS feed url for a site.
    Returns the first eligible feed.

    Args:
        | url (str) -- the url of the site to search.

    Returns:
        | str -- the discovered feed url.
    """
    # Delegates discovery entirely to the feedfinder module.
    return feedfinder.feed(url)
from argos.datastore import db, join_table
from argos.core.models import Concept, Event
from argos.core.models.concept import BaseConceptAssociation
from argos.core.models.cluster import Cluster
from argos.core.brain.summarizer import multisummarize

import itertools
from nltk.tokenize import sent_tokenize

from argos.util.logger import logger
from argos.conf import APP

# Module-level logger; verbosity follows the app-wide DEBUG flag.
logr = logger('STORY_CLUSTERING')
if APP['DEBUG']:
    logr.setLevel('DEBUG')
else:
    logr.setLevel('ERROR')

# Association (many-to-many) tables linking stories to their member
# events and to mentioned aliases.
stories_events = join_table('stories_events', 'story', 'event')
stories_mentions = join_table('stories_mentions', 'story', 'alias')

class StoryConceptAssociation(BaseConceptAssociation):
    # Scoped backref name so Concept can distinguish story associations
    # from other cluster types' associations.
    __backref__ = 'story_associations'
    # Composite-key half pointing at the owning story; cascades keep
    # associations in sync when a story is deleted or re-keyed.
    story_id = db.Column(db.Integer, db.ForeignKey('story.id', ondelete='CASCADE', onupdate='CASCADE'), primary_key=True)

class Story(Cluster):
    """A cluster of Events.

    Configuration consumed by the Cluster base class; members are Events,
    concepts attach via StoryConceptAssociation, mentions via the
    stories_mentions join table.

    NOTE(review): the class likely continues beyond this chunk -- only its
    declarative configuration is visible here.
    """
    __tablename__ = 'story'
    __members__ = {'class_name': 'Event', 'secondary': stories_events, 'backref_name': 'stories'}
    __concepts__ = {'association_model': StoryConceptAssociation, 'backref_name': 'story'}
    __mentions__ = {'secondary': stories_mentions, 'backref_name': 'stories'}
from argos.datastore import db, join_table
from argos.core.models import Concept, Event
from argos.core.models.concept import BaseConceptAssociation
from argos.core.models.cluster import Cluster
from argos.core.brain.summarizer import multisummarize

import itertools
from nltk.tokenize import sent_tokenize

from argos.util.logger import logger
from argos.conf import APP

# Module-level logger; verbosity follows the app-wide DEBUG flag.
logr = logger('STORY_CLUSTERING')
if APP['DEBUG']:
    logr.setLevel('DEBUG')
else:
    logr.setLevel('ERROR')

# Association (many-to-many) tables linking stories to their member
# events and to mentioned aliases.
stories_events = join_table('stories_events', 'story', 'event')
stories_mentions = join_table('stories_mentions', 'story', 'alias')

class StoryConceptAssociation(BaseConceptAssociation):
    # Scoped backref name so Concept can distinguish story associations
    # from other cluster types' associations.
    __backref__ = 'story_associations'
    # Composite-key half pointing at the owning story; cascades keep
    # associations in sync when a story is deleted or re-keyed.
    # NOTE(review): this chunk appears truncated here -- the Story model
    # that references this association is outside this view.
    story_id = db.Column(db.Integer, db.ForeignKey('story.id', ondelete='CASCADE', onupdate='CASCADE'), primary_key=True)