Example #1
    def test_story_clustering_with_matching_entities(self):
        # This creates a story with duplicate member events.
        story = fac.story()

        # This event is a duplicate of the story's events.
        event = fac.event()

        Story.cluster([event])
        self.assertEqual(len(story.members), 3)
Example #2
    def test_story_clustering_without_matching_entities(self):
        story = fac.story()

        # Create an event with completely different entities
        # from the story.
        article = fac.article(title='The Illiad', text='The Illiad has Argos in it.')
        event = Event([article])

        Story.cluster([event])
        self.assertEqual(len(story.members), 2)
        self.assertEqual(Story.query.count(), 2)
Example #3
def process_stories(clusters):
    """
    Takes clusters of node uuids and
    builds, modifies, and deletes stories out of them.

    `clusters` comes in as a list of lists, where sublists' members are article node ids.

    e.g::

        [[1,2,3,4,5],[6,7,8,9]]
    """
    story_map = {}
    existing = {}

    # TODO: loading every story here may be too expensive; the existing
    # story node id composition should probably be preserved separately.
    for s in Story.query.all():
        story_map[s.id] = s
        existing[s.id] = [e.id for e in s.events]

    # Figure out which stories to update, delete, and create.
    to_update, to_create, to_delete, unchanged = triage(existing, clusters)

    for e_ids in to_create:
        events = Event.query.filter(Event.id.in_(e_ids)).order_by(Event.created_at.desc()).all()
        story = Story(events)

        story.created_at = events[0].created_at
        story.updated_at = events[-1].created_at

        # TODO: need a better way of coming up with a story title.
        # Perhaps the easiest approach is for stories to not have titles
        # at all and just be groupings of events.
        # For now, just use the latest event's title and image.
        story.title = events[0].title
        story.image = events[0].image

        db.session.add(story)

    for s_id, e_ids in to_update.items():
        s = story_map[s_id]
        events = Event.query.filter(Event.id.in_(e_ids)).order_by(Event.created_at.desc()).all()
        s.members = events

        s.title = events[0].title
        s.image = events[0].image

        s.update()

    for s_id in to_delete:
        db.session.delete(story_map[s_id])

    db.session.commit()

    # Delete any stories that no longer have events.
    # http://stackoverflow.com/a/7954618/1097920
    Story.query.filter(~Story.members.any()).delete(synchronize_session='fetch')
    db.session.commit()
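
The `triage` helper called above isn't shown in this example. As an illustration of the contract the calling code implies (a dict mapping story ids to their new event ids, plus lists of clusters to create, story ids to delete, and story ids left unchanged), here is a hypothetical sketch; it is not the project's actual implementation:

def triage(existing, clusters):
    """
    Hypothetical sketch only. Compares existing story compositions
    against the new clusters and decides which stories to update,
    create, or delete.

    `existing` maps story ids to lists of their current event ids;
    `clusters` is the new list of event id lists.
    """
    to_update, to_create, unchanged = {}, [], []
    remaining = {s_id: set(e_ids) for s_id, e_ids in existing.items()}

    for cluster in clusters:
        cluster_set = set(cluster)

        # Match the cluster to the existing story it overlaps most.
        best_id, best_overlap = None, 0
        for s_id, e_ids in remaining.items():
            overlap = len(cluster_set & e_ids)
            if overlap > best_overlap:
                best_id, best_overlap = s_id, overlap

        if best_id is None:
            # No overlap with any existing story: a brand new story.
            to_create.append(cluster)
        else:
            previous = remaining.pop(best_id)
            if cluster_set == previous:
                # Same composition as before: nothing to do.
                unchanged.append(best_id)
            else:
                # Composition changed: the story needs updating.
                to_update[best_id] = cluster

    # Stories that no cluster claimed should be deleted.
    to_delete = list(remaining.keys())
    return to_update, to_create, to_delete, unchanged

Any reasonable matching rule would satisfy `process_stories`; overlap counting is just one simple choice.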
Example #4
    def _create_dated_story(self):
        datetime_A = datetime.utcnow() - timedelta(days=1)
        datetime_B = datetime.utcnow() - timedelta(days=5)

        article_a = fac.article(title='The Illiad', text='The Illiad has Argos in it.')
        event_a = Event([article_a])
        event_a.created_at = datetime_A

        article_b = fac.article(title='The Illiad', text='The Illiad has Argos in it.')
        event_b = Event([article_b])
        event_b.created_at = datetime_B

        article_c = fac.article(title='The Illiad', text='The Illiad has Argos in it.')
        event_c = Event([article_c])
        event_c.created_at = datetime_A

        story = Story([event_a, event_b, event_c])

        self.db.session.add(story)
        self.db.session.commit()

        return story, datetime_A, datetime_B
Example #5
def seed(debug=False):
    this_dir = os.path.dirname(__file__)
    seeds = open(os.path.join(this_dir, 'seed.json'), 'r')
    sources = open(os.path.join(this_dir, 'seed_sources.json'), 'r')

    sample_images = [
        'https://upload.wikimedia.org/wikipedia/commons/d/d5/Michael_Rogers_-_Herbiers_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/6/6e/Brandenburger_Tor_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/ad/ChicagoAntiGaddafiHopeless.jpg/576px-ChicagoAntiGaddafiHopeless.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e6/Evo_morales_2_year_bolivia_Joel_Alvarez.jpg/640px-Evo_morales_2_year_bolivia_Joel_Alvarez.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/9/9f/2010_wet_season_cloud_over_colombia.jpg/640px-2010_wet_season_cloud_over_colombia.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/2/27/Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg/640px-Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a9/2010-10-23-Demo-Stuttgart21-Befuerworter.png/640px-2010-10-23-Demo-Stuttgart21-Befuerworter.png',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/fc/51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg/640px-51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Tough_return%2C_365.35.jpg/640px-Tough_return%2C_365.35.jpg'
    ]

    print('Resetting the database...')
    db.drop_all()
    db.create_all()

    # Create sources
    print('Creating sources...')
    for url in json.load(sources):
        s = Source(ext_url=url, name='The Times') # fake name
        db.session.add(s)
    db.session.commit()
    num_sources = Source.query.count()
    print('Created {0} sources.'.format(num_sources))

    # Create articles
    entries = json.load(seeds)
    print('Seeding {0} articles...'.format(len(entries)))
    articles = []
    for entry in entries:
        if debug:
            print(json.dumps(entry, sort_keys=True, indent=4))

        source = Source.query.filter_by(ext_url=entry['source']).first()

        a = Article(
                ext_url=entry['url'],
                source=source,
                html=entry['html'],
                text=entry['text'],
                tags=entry['tags'],
                title=entry['title'],
                created_at=parse(entry['published']),
                updated_at=parse(entry['updated']),
                image=random.choice(sample_images)  # fake image
        )
        articles.append(a)
        db.session.add(a)

        progress_bar(len(articles) / len(entries) * 100)

    db.session.commit()

    num_articles = Article.query.count()
    num_entities = Entity.query.count()
    print('Seeded {0} articles.'.format(num_articles))
    print('Found {0} entities.'.format(num_entities))

    print('Clustering articles into events...')
    Event.cluster(articles, threshold=0.02, debug=True)
    num_events = Event.query.count()
    print('Created {0} event clusters.'.format(num_events))

    print('Clustering events into stories...')
    events = Event.query.all()
    Story.cluster(events, threshold=0.02, debug=True)
    num_stories = Story.query.count()
    print('Created {0} story clusters.'.format(num_stories))

    print('\n\n==============================================')
    print('From {0} sources, seeded {1} articles, found {2} entities, created {3} events and {4} stories.'.format(num_sources, num_articles, num_entities, num_events, num_stories))
    print('==============================================\n\n')
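
The structure of seed.json isn't shown here, but the seeding loop reads a fixed set of keys from each entry. A hypothetical entry, with placeholder values rather than real seed data, would need at least this shape (the 'source' value must match an ext_url from seed_sources.json):

# Hypothetical shape of a single seed.json entry, inferred from the keys
# the seeding loop reads above; all values are placeholders.
example_entry = {
    'source': 'http://example.com/feed',       # must match a Source.ext_url
    'url': 'http://example.com/some-article',
    'title': 'Example article title',
    'text': 'Plain-text body of the article.',
    'html': '<p>HTML body of the article.</p>',
    'tags': ['world', 'politics'],
    'published': '2014-01-01T12:00:00Z',        # parsed with parse()
    'updated': '2014-01-02T12:00:00Z',
}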