Example #1
0
    def cluster(articles, threshold=0.7, debug=False):
        """
        Assigns each article to an existing active event,
        or starts a new event for it when none qualifies.

        Args:
            | articles (list)       -- the Articles to cluster
            | threshold (float)     -- the similarity threshold for qualifying a cluster
            | debug (bool)          -- logs at DEBUG level if True, at ERROR level otherwise

        Returns:
            | clusters (list)       -- the cluster selected (or created) for each article,
            |                          one entry per article, in input order
        """
        log = logger('EVENT_CLUSTERING')
        log.setLevel('DEBUG' if debug else 'ERROR')

        updated_clusters = []
        active_clusters = Event.query.filter_by(active=True).all()
        now = datetime.utcnow()

        for article in articles:
            # Only active clusters sharing at least one entity slug
            # with this article are considered as candidates.
            article_slugs = [entity.slug for entity in article.entities]
            candidate_clusters = [
                existing for existing in active_clusters
                if {entity.slug for entity in existing.entities}.intersection(article_slugs)
            ]

            # The module-level `cluster` function picks among the candidates;
            # a falsy result is treated as "no qualifying cluster".
            chosen = cluster(article,
                             candidate_clusters,
                             threshold=threshold,
                             logger=log)

            if not chosen:
                log.debug(
                    'No qualifying clusters found, creating a new cluster.')
                chosen = Event([article])
                db.session.add(chosen)

            updated_clusters.append(chosen)

        # Only the clusters loaded at the start are visited here; events
        # created above are not part of `active_clusters`.
        for existing in active_clusters:
            # `timedelta.days` counts whole days, so a cluster expires once
            # its `updated_at` is at least 4 full days old.
            age_in_days = (now - existing.updated_at).days
            if age_in_days > 3:
                existing.active = False
            else:
                existing.update()

        db.session.commit()
        return updated_clusters
Example #2
0
    def cluster(articles, threshold=0.7, debug=False):
        """
        Places every article into an active event,
        creating a fresh event whenever no existing one qualifies.

        Args:
            | articles (list)       -- the Articles to cluster
            | threshold (float)     -- the similarity threshold for qualifying a cluster
            | debug (bool)          -- logs at DEBUG level if True, at ERROR level otherwise

        Returns:
            | clusters (list)       -- the cluster selected (or created) for each article,
            |                          one entry per article, in input order
        """
        log = logger('EVENT_CLUSTERING')
        log.setLevel('DEBUG' if debug else 'ERROR')

        updated_clusters = []
        active_clusters = Event.query.filter_by(active=True).all()
        now = datetime.utcnow()

        for article in articles:
            # A candidate is any active cluster that has at least one
            # entity slug in common with the article.
            article_slugs = {entity.slug for entity in article.entities}
            candidate_clusters = []
            for existing in active_clusters:
                cluster_slugs = [entity.slug for entity in existing.entities]
                if not article_slugs.isdisjoint(cluster_slugs):
                    candidate_clusters.append(existing)

            # Delegates to the module-level `cluster` function; a falsy
            # result means nothing qualified.
            match = cluster(article, candidate_clusters, threshold=threshold, logger=log)

            if not match:
                log.debug('No qualifying clusters found, creating a new cluster.')
                match = Event([article])
                db.session.add(match)

            updated_clusters.append(match)

        # Events created above are not in `active_clusters`, so this pass
        # only touches the clusters that were loaded at the start.
        for existing in active_clusters:
            # `.days` is a whole-day count: 3 or fewer keeps the cluster
            # alive, anything above marks it inactive.
            if (now - existing.updated_at).days <= 3:
                existing.update()
            else:
                existing.active = False

        db.session.commit()
        return updated_clusters
Example #3
0
    def cluster(events, threshold=0.7, debug=False):
        """
        Assigns each event to an existing story,
        or starts a new story for it when none qualifies.

        Args:
            | events (list)         -- the Events to cluster
            | threshold (float)     -- the similarity threshold for qualifying a cluster
            | debug (bool)          -- will log clustering info if True

        Returns:
            | clusters (list)       -- the story selected (or created) for each event,
            |                          one entry per event, in input order
        """
        log = logger('STORY_CLUSTERING')
        log.setLevel('DEBUG' if debug else 'ERROR')

        updated_clusters = []

        for event in events:
            # Candidate stories are looked up by the names of this event's entities.
            # NOTE(review): the filter references `Entity` without a visible join
            # to `Story` -- confirm the query narrows stories as intended.
            entity_names = [entity.name for entity in event.entities]
            candidate_clusters = Story.query.filter(Entity.name.in_(entity_names)).all()

            # Delegates to the module-level `cluster` function; a falsy
            # result means nothing qualified.
            chosen = cluster(event, candidate_clusters, threshold=threshold, logger=log)

            if not chosen:
                log.debug('No qualifying clusters found, creating a new cluster.')
                chosen = Story([event])
                db.session.add(chosen)

            updated_clusters.append(chosen)

        db.session.commit()
        return updated_clusters
Example #4
0
import os
import cProfile, pstats

from argos.datastore import db
from argos.core.models import Event, Article
from argos.util.logger import logger
from argos.util.progress import progress_bar

# Logging.
# NOTE: this rebinds the imported `logger` factory name to the object it
# returns, so the factory is no longer reachable as `logger` below this line.
logger = logger(__name__)


def evaluate_clustering():
    """
    Evaluate the clustering algorithm.

    Starts by walking `manage/evaluate/organized_articles` and collecting
    every `.txt` file found beneath it.
    """

    logger.info('Constructing expected clusters and articles...')
    expected_clusters = {}
    articles = []
    all_files = []

    # Collect all appropriate files.
    # The path is relative, so it resolves against the current working directory.
    # NOTE(review): the loop variable `dir` shadows the builtin of the same name.
    for dir, subdir, files in os.walk('manage/evaluate/organized_articles'):
        for file in files:
            filepath = os.path.join(dir, file)
            # `name` is the full path minus its extension (it still includes
            # the directory part), since splitext is applied to `filepath`.
            name, ext = os.path.splitext(filepath)
            if ext == '.txt':
                # Each entry is (containing directory, path without extension, full path).
                all_files.append((dir, name, filepath))

    # Create articles for appropriate files.
Example #5
0
Example::

    # Print articles from a feed.
    site = 'http://www.polygon.com/'
    feed_url = find_feed(site)
    source = Source(feed_url)
    print(articles(source))
"""

from argos.datastore import db
from argos.core.models import Source, Feed
from argos.core.membrane import feedfinder

from argos.util.logger import logger
# NOTE: rebinds the imported `logger` factory name to the object it returns.
logger = logger(__name__)


def find_feed(url):
    """
    Look up the RSS feed url for a site.
    Returns the first eligible feed.

    The search itself is delegated to `feedfinder.feed`;
    its result is handed back unchanged.

    Args:
        | url (str)    -- the url of the site to search.

    Returns:
        | str -- the discovered feed url.
    """
    feed_url = feedfinder.feed(url)
    return feed_url
Example #6
0
from argos.datastore import db, join_table
from argos.core.models import Concept, Event
from argos.core.models.concept import BaseConceptAssociation
from argos.core.models.cluster import Cluster
from argos.core.brain.summarizer import multisummarize

import itertools
from nltk.tokenize import sent_tokenize

from argos.util.logger import logger
from argos.conf import APP

# Logger for story clustering; its level follows the app's DEBUG setting.
logr = logger('STORY_CLUSTERING')
logr.setLevel('DEBUG' if APP['DEBUG'] else 'ERROR')

# Join tables, used below as the `secondary` tables for Story's
# members (events) and mentions (aliases).
stories_events = join_table('stories_events', 'story', 'event')
stories_mentions = join_table('stories_mentions', 'story', 'alias')

class StoryConceptAssociation(BaseConceptAssociation):
    """
    Concept association keyed on a story.

    `story_id` is a primary-key column referencing `story.id`, with
    deletes and updates cascading from the referenced row.
    """
    __backref__ = 'story_associations'
    story_id = db.Column(
        db.Integer,
        db.ForeignKey('story.id', ondelete='CASCADE', onupdate='CASCADE'),
        primary_key=True
    )

class Story(Cluster):
    """
    A Cluster stored in the `story` table, configured with Event as its member class.

    The dunder settings below are presumably consumed by `Cluster`
    to build the relationships -- confirm against `Cluster`.
    """
    __tablename__ = 'story'

    # Members are Events, linked through the `stories_events` join table.
    __members__ = {
        'class_name': 'Event',
        'secondary': stories_events,
        'backref_name': 'stories',
    }

    # Concepts are attached via the StoryConceptAssociation model.
    __concepts__ = {
        'association_model': StoryConceptAssociation,
        'backref_name': 'story',
    }

    # Mentions are linked through the `stories_mentions` join table.
    __mentions__ = {
        'secondary': stories_mentions,
        'backref_name': 'stories',
    }
Example #7
0
from argos.datastore import db, join_table
from argos.core.models import Concept, Event
from argos.core.models.concept import BaseConceptAssociation
from argos.core.models.cluster import Cluster
from argos.core.brain.summarizer import multisummarize

import itertools
from nltk.tokenize import sent_tokenize

from argos.util.logger import logger
from argos.conf import APP

# Story-clustering logger: DEBUG when the app runs in debug mode, ERROR otherwise.
logr = logger('STORY_CLUSTERING')
logr.setLevel('DEBUG' if APP['DEBUG'] else 'ERROR')

# Join tables pairing `story` with `event` and with `alias`.
stories_events = join_table('stories_events', 'story', 'event')
stories_mentions = join_table('stories_mentions', 'story', 'alias')


class StoryConceptAssociation(BaseConceptAssociation):
    """
    Concept association keyed on a story.

    `story_id` is a primary-key column referencing `story.id`, with
    deletes and updates cascading from the referenced row.
    """
    __backref__ = 'story_associations'
    story_id = db.Column(
        db.Integer,
        db.ForeignKey(
            'story.id',
            ondelete='CASCADE',
            onupdate='CASCADE',
        ),
        primary_key=True,
    )