def test_summary_sentences(self):
    """The event summary should decompose back into the summarizer's
    sentences, each tagged with the article's source name and URL."""
    from argos.core.brain import summarizer

    title = 'Syria Misses New Deadline as It Works to Purge Arms'
    text = 'Syria missed a revised deadline on Sunday for completing the export or destruction of chemicals in its weapons arsenal, but the government of the war-ravaged country may be only days away from finishing the job, according to international experts overseeing the process. The Syrian government had agreed to complete the export or destruction of about 1,200 tons of chemical agents by April 27 after missing a February deadline, but by Sunday, it had shipped out or destroyed 92.5 percent of the arsenal, said Sigrid Kaag, the coordinator of the joint mission by the United Nations and the watchdog agency the Organization for the Prohibition of Chemical Weapons.'
    sentences = summarizer.summarize(title, text)

    source = Source()
    source.name = 'Super Cool Times'

    article = Article(title=title, text=text, score=100)
    article.source = source
    article.ext_url = 'http://foo.com'
    self.event = Event([article])

    expected = []
    for sentence in sentences:
        expected.append({
            'sentence': sentence,
            'source': 'Super Cool Times',
            'url': 'http://foo.com'
        })
    self.assertEqual(self.event.summary_sentences, expected)
def test_event_no_matching_cluster_creates_new_cluster(self):
    """Clustering an article with no candidate events creates a new one."""
    unmatched = Article(
        title='Superstars',
        text='superstars are awesome, Clinton',
        created_at=datetime.utcnow()
    )
    Event.cluster([unmatched])
    self.assertEqual(Event.query.count(), 1)
def test_event_clusters_similar(self):
    """A sufficiently similar article is absorbed by the existing cluster."""
    self.prepare_event()
    self.cluster.members = self.prepare_articles(type='duplicate')
    Event.cluster([self.article])
    self.assertEqual(len(self.cluster.members), 3)
def test_event_does_not_cluster_not_similar(self):
    """A dissimilar article must not join the existing cluster."""
    self.prepare_event()
    outsider = Article(
        title='Superstars',
        text='superstars are awesome, Clinton',
        created_at=datetime.utcnow()
    )
    Event.cluster([outsider])
    self.assertEqual(len(self.cluster.members), 2)
def test_event_entitize(self):
    """An event aggregates the entities of all its member articles."""
    extra = Article(title='Robots', text='dinosaurs are cool, Reagan')
    self.cluster = Event([extra, self.prepare_articles()[0]])
    names = {entity.name for entity in self.cluster.entities}
    self.assertEqual(names, {'Clinton', 'Reagan'})
def test_event_does_not_cluster_not_similar(self):
    # Sharing an entity ('Clinton') is not enough: a dissimilar article
    # should leave the existing cluster's membership untouched.
    self.prepare_event()
    article = Article(title='Superstars',
                      text='superstars are awesome, Clinton',
                      created_at=datetime.utcnow())
    Event.cluster([article])
    self.assertEqual(len(self.cluster.members), 2)
def test_event_no_matching_cluster_creates_new_cluster(self):
    # With no pre-existing events, clustering must produce exactly one.
    article = Article(title='Superstars',
                      text='superstars are awesome, Clinton',
                      created_at=datetime.utcnow())
    Event.cluster([article])
    self.assertEqual(Event.query.count(), 1)
def test_event_does_not_cluster_if_no_shared_entities(self):
    """Articles with disjoint entities are never merged into a cluster."""
    self.prepare_event()
    self.cluster.members = [
        Article(title='Robots',
                text='dinosaurs are cool, Reagan',
                created_at=datetime.utcnow())
    ]
    Event.cluster([self.article])
    self.assertEqual(len(self.cluster.members), 1)
def test_event_does_not_cluster_if_no_shared_entities(self):
    # Replace the cluster's members with a Reagan-only article; the
    # Clinton article must not be clustered with it.
    self.prepare_event()
    lone_member = Article(
        title='Robots',
        text='dinosaurs are cool, Reagan',
        created_at=datetime.utcnow()
    )
    self.cluster.members = [lone_member]
    Event.cluster([self.article])
    self.assertEqual(len(self.cluster.members), 1)
def test_event_titleize(self):
    """The event adopts the title of its most representative member."""
    outlier = Article(title='Robots', text='dinosaurs are cool, Reagan')
    members = [outlier] + self.prepare_articles(type='duplicate')
    self.cluster = Event(members)
    self.assertEqual(self.cluster.title, 'Dinosaurs')
def test_summary_sentences(self):
    # The summary of a single-article event should split back into the
    # summarizer's sentences, each annotated with source and URL.
    from argos.core.brain import summarizer

    title = 'Syria Misses New Deadline as It Works to Purge Arms'
    text = 'Syria missed a revised deadline on Sunday for completing the export or destruction of chemicals in its weapons arsenal, but the government of the war-ravaged country may be only days away from finishing the job, according to international experts overseeing the process. The Syrian government had agreed to complete the export or destruction of about 1,200 tons of chemical agents by April 27 after missing a February deadline, but by Sunday, it had shipped out or destroyed 92.5 percent of the arsenal, said Sigrid Kaag, the coordinator of the joint mission by the United Nations and the watchdog agency the Organization for the Prohibition of Chemical Weapons.'
    expected_sents = summarizer.summarize(title, text)

    source = Source()
    source.name = 'Super Cool Times'

    article = Article(title=title, text=text, score=100)
    article.source = source
    article.ext_url = 'http://foo.com'
    self.event = Event([article])

    expected = [
        {'sentence': sent,
         'source': 'Super Cool Times',
         'url': 'http://foo.com'}
        for sent in expected_sents
    ]
    self.assertEqual(self.event.summary_sentences, expected)
def test_conceptize(self):
    """Concepts and mentions are collected across all member articles."""
    members = [
        Article(title='Robots', text='dinosaurs are cool, Reagan'),
        self.prepare_articles()[0]
    ]
    self.event = Event(members)

    self.assertEqual({con.slug for con in self.event.concepts},
                     {'Clinton', 'Reagan'})
    self.assertEqual({ali.name for ali in self.event.mentions},
                     {'Clinton', 'Reagan'})

    # Scores are weighed down by each concept's commonness, so they land
    # near 0.005 rather than an even 0.5 split.
    for concept in self.event.concepts:
        self.assertAlmostEqual(concept.score, 0.005, places=3)
def test_event_entitize(self):
    # Entities from every member should surface on the event.
    reagan_article = Article(
        title='Robots',
        text='dinosaurs are cool, Reagan'
    )
    members = [reagan_article, self.prepare_articles()[0]]
    self.cluster = Event(members)
    self.assertEqual({ent.name for ent in self.cluster.entities},
                     {'Clinton', 'Reagan'})
def test_event_similarity_with_cluster_different(self):
    """Similarity between different clusters is strictly between 0 and 1."""
    self.prepare_event()
    other = Event(self.prepare_articles(type='different'))
    score = self.cluster.similarity(other)
    self.assertNotEqual(score, 1.0)
    self.assertNotEqual(score, 0.0)
def test_event_timespan(self):
    """timespan() keeps only members created at or after the cutoff."""
    text = 'the worldly philosophers today cautious optimism is based to a large extent on technological breakthroughs'
    members = [
        Article(title=title, text=text,
                created_at=datetime(2014, 1, day, 1, 1, 1, 111111))
        for title, day in (('A', 20), ('B', 22), ('C', 24))
    ]
    self.cluster = Event(members)
    results = self.cluster.timespan(datetime(2014, 1, 21, 1, 1, 1, 111111))
    self.assertEqual(len(results), 2)
    self.assertEqual({r.title for r in results}, {'B', 'C'})
def test_timespan(self):
    # Only articles B (Jan 22) and C (Jan 24) fall after the Jan 21 cutoff.
    text = 'the worldly philosophers today cautious optimism is based to a large extent on technological breakthroughs'
    members = []
    for title, day in zip('ABC', (20, 22, 24)):
        members.append(Article(
            title=title,
            text=text,
            created_at=datetime(2014, 1, day, 1, 1, 1, 111111)
        ))
    self.event = Event(members)
    results = self.event.timespan(datetime(2014, 1, 21, 1, 1, 1, 111111))
    self.assertEqual(len(results), 2)
    self.assertEqual({r.title for r in results}, {'B', 'C'})
def test_event_deletion_removes_from_articles_events(self):
    """Deleting events clears the back-references on their articles."""
    articles = self.prepare_articles()
    for article in articles:
        self.db.session.add(article)

    # Build and persist an event over the articles.
    self.event = Event(articles)
    self.db.session.add(self.event)
    self.db.session.commit()

    # Each article now points back at the event...
    for article in articles:
        self.assertEqual(article.events, [self.event])

    # ...until all events are destroyed...
    Event.query.delete()
    self.db.session.commit()

    # ...after which the back-references are gone.
    for article in articles:
        self.assertEqual(article.events, [])
def test_story_clustering_without_matching_entities(self):
    """An event sharing no entities with a story yields a new story."""
    story = fac.story()

    # Build an event whose entities are disjoint from the story's.
    article = fac.article(title='The Illiad', text='The Illiad has Argos in it.')
    Story.cluster([Event([article])])

    self.assertEqual(len(story.members), 2)
    self.assertEqual(Story.query.count(), 2)
def test_event_similarity_with_cluster_duplicates(self):
    self.prepare_event()
    twin = Event(self.prepare_articles())
    score = self.cluster.similarity(twin)
    # Identical clusters don't score 1.0: similarity averages the
    # pairwise article similarities between clusters rather than
    # measuring the overlap of the two clusters directly.
    self.assertAlmostEqual(score, 0.83999999999999)
def _create_dated_story(self):
    """Build a story whose three events carry two distinct creation
    dates; returns (story, newer_datetime, older_datetime)."""
    datetime_A = datetime.utcnow() - timedelta(days=1)
    datetime_B = datetime.utcnow() - timedelta(days=5)

    events = []
    for stamp in (datetime_A, datetime_B, datetime_A):
        article = fac.article(title='The Illiad',
                              text='The Illiad has Argos in it.')
        event = Event([article])
        event.created_at = stamp
        events.append(event)

    story = Story(events)
    self.db.session.add(story)
    self.db.session.commit()
    return story, datetime_A, datetime_B
def test_conceptize(self):
    # Two members with disjoint entities: both should show up as
    # concepts and as mentions on the event.
    reagan_article = Article(
        title='Robots',
        text='dinosaurs are cool, Reagan'
    )
    self.event = Event([reagan_article, self.prepare_articles()[0]])

    concepts = {con.slug for con in self.event.concepts}
    mentions = {ali.name for ali in self.event.mentions}
    self.assertEqual(concepts, {'Clinton', 'Reagan'})
    self.assertEqual(mentions, {'Clinton', 'Reagan'})

    # Commonness weighs each concept's score down from the naive 0.5.
    for concept in self.event.concepts:
        self.assertAlmostEqual(concept.score, 0.005, places=3)
def prepare_event(self):
    """Create and persist an event built from the standard fixtures."""
    self.event = Event(self.prepare_articles())
    self.db.session.add(self.event)
    self.db.session.commit()
def seed(debug=False):
    """Reset the database and seed it with sample data.

    Loads sources and articles from the bundled JSON fixtures, assigns
    each article a random placeholder image, then clusters the articles
    into events and the events into stories.

    Args:
        debug: if True, pretty-print each seed entry as it is processed.
    """
    this_dir = os.path.dirname(__file__)

    sample_images = [
        'https://upload.wikimedia.org/wikipedia/commons/d/d5/Michael_Rogers_-_Herbiers_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/6/6e/Brandenburger_Tor_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/ad/ChicagoAntiGaddafiHopeless.jpg/576px-ChicagoAntiGaddafiHopeless.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e6/Evo_morales_2_year_bolivia_Joel_Alvarez.jpg/640px-Evo_morales_2_year_bolivia_Joel_Alvarez.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/9/9f/2010_wet_season_cloud_over_colombia.jpg/640px-2010_wet_season_cloud_over_colombia.jpg',
        # BUG FIX: a missing trailing comma after this entry previously
        # concatenated it with the next URL into one bogus image URL.
        'https://upload.wikimedia.org/wikipedia/commons/thumb/2/27/Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg/640px-Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a9/2010-10-23-Demo-Stuttgart21-Befuerworter.png/640px-2010-10-23-Demo-Stuttgart21-Befuerworter.png',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/fc/51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg/640px-51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Tough_return%2C_365.35.jpg/640px-Tough_return%2C_365.35.jpg'
    ]

    print('Resetting the database...')
    db.drop_all()
    db.create_all()

    # Create sources (all share a fake name).
    # Files are opened with `with` so the handles are always closed.
    print('Creating sources...')
    with open(os.path.join(this_dir, 'seed_sources.json'), 'r') as sources:
        for url in json.load(sources):
            db.session.add(Source(ext_url=url, name='The Times'))
    db.session.commit()
    num_sources = Source.query.count()
    print('Created {0} sources.'.format(num_sources))

    # Create articles from the seed fixture.
    with open(os.path.join(this_dir, 'seed.json'), 'r') as seeds:
        entries = json.load(seeds)
    print('Seeding {0} articles...'.format(len(entries)))
    articles = []
    for entry in entries:
        if debug:
            print(json.dumps(entry, sort_keys=True, indent=4))

        source = Source.query.filter_by(ext_url=entry['source']).first()
        a = Article(
            ext_url=entry['url'],
            source=source,
            html=entry['html'],
            text=entry['text'],
            tags=entry['tags'],
            title=entry['title'],
            created_at=parse(entry['published']),
            updated_at=parse(entry['updated']),
            image=random.choice(sample_images)  # fake image
        )
        articles.append(a)
        db.session.add(a)
        progress_bar(len(articles) / len(entries) * 100)
    db.session.commit()

    num_articles = Article.query.count()
    num_entities = Entity.query.count()
    print('Seeded {0} articles.'.format(num_articles))
    print('Found {0} entities.'.format(num_entities))

    print('Clustering articles into events...')
    Event.cluster(articles, threshold=0.02, debug=True)
    num_events = Event.query.count()
    print('Created {0} event clusters.'.format(num_events))

    print('Clustering events into stories...')
    events = Event.query.all()
    Story.cluster(events, threshold=0.02, debug=True)
    num_stories = Story.query.count()
    print('Created {0} story clusters.'.format(num_stories))

    print('\n\n==============================================')
    print('From {0} sources, seeded {1} articles, found {2} entities, created {3} events and {4} stories.'.format(num_sources, num_articles, num_entities, num_events, num_stories))
    print('==============================================\n\n')
def test_event_summarize(self):
    """A freshly built event exposes a non-empty summary."""
    self.cluster = Event(self.prepare_articles())
    self.assertTrue(self.cluster.summary)
def prepare_event(self):
    """Create and persist a cluster built from the standard fixtures."""
    self.cluster = Event(self.prepare_articles())
    self.db.session.add(self.cluster)
    self.db.session.commit()
def test_event_similarity_with_object_duplicates(self):
    """An article identical to the cluster's members scores 1.0."""
    duplicates = self.prepare_articles(type='duplicate')
    event = Event(duplicates)
    self.assertEqual(event.similarity(self.article), 1.0)
def test_event_summarize_single_article(self):
    """Summarization also works for a one-article event."""
    solo = self.prepare_articles()[0]
    self.cluster = Event([solo])
    self.assertTrue(self.cluster.summary)
def test_summarize_single_article(self):
    # A single-member event still produces a summary.
    first_article = self.prepare_articles()[0]
    self.event = Event([first_article])
    self.assertTrue(self.event.summary)
def test_summarize(self):
    # Building an event should populate its summary.
    self.event = Event(self.prepare_articles())
    self.assertTrue(self.event.summary)
def test_conceptize_no_duplicates(self):
    """A shared entity appears once in concepts and once in mentions."""
    self.event = Event(self.prepare_articles())
    self.assertEqual([con.slug for con in self.event.concepts], ['Clinton'])
    self.assertEqual([ali.name for ali in self.event.mentions], ['Clinton'])
class EventTest(RequiresDatabase):
    """
    Note this tests the abstract Cluster class's methods as well.
    A Cluster instance can't be instantiated since it is abstract,
    so we use the Event as a testing proxy.
    """
    patch_knowledge = True

    def setUp(self):
        self.article = self.prepare_articles()[0]

    def prepare_articles(self, type='standard', score=100):
        """Build, persist, and return a pair of fixture articles."""
        a = {'title': 'Dinosaurs', 'text': 'dinosaurs are cool, Clinton', 'score': score}
        b = {'title': 'Robots', 'text': 'robots are nice, Clinton', 'score': score}
        c = {'title': 'Robots', 'text': 'papa was a rodeo, Clinton', 'score': score}

        if type == 'standard':
            articles = [Article(**a), Article(**b)]
        elif type == 'duplicate':
            articles = [Article(**a), Article(**a)]
        elif type == 'different':
            articles = [Article(**a), Article(**c)]

        # Persist so concepts exist when overlaps are calculated
        # during clustering.
        for article in articles:
            self.db.session.add(article)
        self.db.session.commit()

        return articles

    def prepare_event(self):
        """Create and persist an event over the standard articles."""
        self.event = Event(self.prepare_articles())
        self.db.session.add(self.event)
        self.db.session.commit()

    def test_event_deletion_removes_from_articles_events(self):
        articles = self.prepare_articles()
        for article in articles:
            self.db.session.add(article)

        # Make an event.
        self.event = Event(articles)
        self.db.session.add(self.event)
        self.db.session.commit()

        # The articles should reference their events.
        for article in articles:
            self.assertEqual(article.events, [self.event])

        # Destroy events.
        Event.query.delete()
        self.db.session.commit()

        # The articles should no longer reference the events.
        for article in articles:
            self.assertEqual(article.events, [])

    def test_conceptize(self):
        members = [
            Article(title='Robots', text='dinosaurs are cool, Reagan'),
            self.prepare_articles()[0]
        ]
        self.event = Event(members)

        self.assertEqual({con.slug for con in self.event.concepts},
                         {'Clinton', 'Reagan'})
        self.assertEqual({ali.name for ali in self.event.mentions},
                         {'Clinton', 'Reagan'})

        # Commonness weighs each concept's score down from 0.5.
        for concept in self.event.concepts:
            self.assertAlmostEqual(concept.score, 0.005, places=3)

    def test_conceptize_no_duplicates(self):
        self.event = Event(self.prepare_articles())
        self.assertEqual([con.slug for con in self.event.concepts], ['Clinton'])
        self.assertEqual([ali.name for ali in self.event.mentions], ['Clinton'])

    def test_summarize(self):
        self.event = Event(self.prepare_articles())
        self.assertTrue(self.event.summary)

    def test_summarize_single_article(self):
        self.event = Event([self.prepare_articles()[0]])
        self.assertTrue(self.event.summary)

    def test_summary_sentences(self):
        # The summary should break back up into its original sentences,
        # each tagged with source name and external URL.
        from argos.core.brain import summarizer
        title = 'Syria Misses New Deadline as It Works to Purge Arms'
        text = 'Syria missed a revised deadline on Sunday for completing the export or destruction of chemicals in its weapons arsenal, but the government of the war-ravaged country may be only days away from finishing the job, according to international experts overseeing the process. The Syrian government had agreed to complete the export or destruction of about 1,200 tons of chemical agents by April 27 after missing a February deadline, but by Sunday, it had shipped out or destroyed 92.5 percent of the arsenal, said Sigrid Kaag, the coordinator of the joint mission by the United Nations and the watchdog agency the Organization for the Prohibition of Chemical Weapons.'
        expected_sents = summarizer.summarize(title, text)

        source = Source()
        source.name = 'Super Cool Times'

        article = Article(title=title, text=text, score=100)
        article.source = source
        article.ext_url = 'http://foo.com'
        self.event = Event([article])

        expected = [{
            'sentence': sent,
            'source': 'Super Cool Times',
            'url': 'http://foo.com'
        } for sent in expected_sents]
        self.assertEqual(self.event.summary_sentences, expected)

    def test_timespan(self):
        text = 'the worldly philosophers today cautious optimism is based to a large extent on technological breakthroughs'
        members = [
            Article(title=title, text=text,
                    created_at=datetime(2014, 1, day, 1, 1, 1, 111111))
            for title, day in (('A', 20), ('B', 22), ('C', 24))
        ]
        self.event = Event(members)
        results = self.event.timespan(datetime(2014, 1, 21, 1, 1, 1, 111111))
        self.assertEqual(len(results), 2)
        self.assertEqual({r.title for r in results}, {'B', 'C'})

    def test_score_prefer_newer_events(self):
        event_a = Event(self.prepare_articles())
        event_b = Event(self.prepare_articles())
        self.assertGreater(event_b.score, event_a.score)

    def test_score_prefer_events_with_higher_article_scores(self):
        event_a = Event(self.prepare_articles())
        event_b = Event(self.prepare_articles(score=200))
        self.assertGreater(event_b.score, event_a.score)
class EventTest(RequiresDatabase):
    """
    Note this tests the abstract Cluster class's methods as well.
    A Cluster instance can't be instantiated since it is abstract,
    so we use the Event as a testing proxy.
    """
    patch_knowledge = True

    def setUp(self):
        self.article = self.prepare_articles()[0]

    def prepare_articles(self, type='standard', score=100):
        """Build, persist, and return a pair of fixture articles of the
        requested flavor ('standard', 'duplicate', or 'different')."""
        a = {
            'title': 'Dinosaurs',
            'text': 'dinosaurs are cool, Clinton',
            'score': score
        }
        b = {
            'title': 'Robots',
            'text': 'robots are nice, Clinton',
            'score': score
        }
        c = {
            'title': 'Robots',
            'text': 'papa was a rodeo, Clinton',
            'score': score
        }

        if type == 'standard':
            articles = [Article(**a), Article(**b)]
        elif type == 'duplicate':
            articles = [Article(**a), Article(**a)]
        elif type == 'different':
            articles = [Article(**a), Article(**c)]

        # Persisting the articles lets their concepts be stored, so
        # overlaps are calculated properly when clustering.
        for article in articles:
            self.db.session.add(article)
        self.db.session.commit()

        return articles

    def prepare_event(self):
        self.event = Event(self.prepare_articles())
        self.db.session.add(self.event)
        self.db.session.commit()

    def test_event_deletion_removes_from_articles_events(self):
        articles = self.prepare_articles()
        for article in articles:
            self.db.session.add(article)

        # Make an event over the articles.
        self.event = Event(articles)
        self.db.session.add(self.event)
        self.db.session.commit()

        # Each article should reference the event...
        for article in articles:
            self.assertEqual(article.events, [self.event])

        # ...until the events are destroyed...
        Event.query.delete()
        self.db.session.commit()

        # ...after which the references are gone.
        for article in articles:
            self.assertEqual(article.events, [])

    def test_conceptize(self):
        reagan_article = Article(
            title='Robots',
            text='dinosaurs are cool, Reagan'
        )
        self.event = Event([reagan_article, self.prepare_articles()[0]])

        concepts = {con.slug for con in self.event.concepts}
        mentions = {ali.name for ali in self.event.mentions}
        self.assertEqual(concepts, {'Clinton', 'Reagan'})
        self.assertEqual(mentions, {'Clinton', 'Reagan'})

        # Each concept's score won't be 0.5, since they are weighed
        # down by the commonness.
        for concept in self.event.concepts:
            self.assertAlmostEqual(concept.score, 0.005, places=3)

    def test_conceptize_no_duplicates(self):
        self.event = Event(self.prepare_articles())
        concepts = [con.slug for con in self.event.concepts]
        mentions = [ali.name for ali in self.event.mentions]
        self.assertEqual(concepts, ['Clinton'])
        self.assertEqual(mentions, ['Clinton'])

    def test_summarize(self):
        self.event = Event(self.prepare_articles())
        self.assertTrue(self.event.summary)

    def test_summarize_single_article(self):
        solo = self.prepare_articles()[0]
        self.event = Event([solo])
        self.assertTrue(self.event.summary)

    def test_summary_sentences(self):
        # Check that the summary decomposes into its original sentences.
        from argos.core.brain import summarizer
        title = 'Syria Misses New Deadline as It Works to Purge Arms'
        text = 'Syria missed a revised deadline on Sunday for completing the export or destruction of chemicals in its weapons arsenal, but the government of the war-ravaged country may be only days away from finishing the job, according to international experts overseeing the process. The Syrian government had agreed to complete the export or destruction of about 1,200 tons of chemical agents by April 27 after missing a February deadline, but by Sunday, it had shipped out or destroyed 92.5 percent of the arsenal, said Sigrid Kaag, the coordinator of the joint mission by the United Nations and the watchdog agency the Organization for the Prohibition of Chemical Weapons.'
        sentences = summarizer.summarize(title, text)

        source = Source()
        source.name = 'Super Cool Times'

        article = Article(title=title, text=text, score=100)
        article.source = source
        article.ext_url = 'http://foo.com'
        self.event = Event([article])

        expected = []
        for sent in sentences:
            expected.append({
                'sentence': sent,
                'source': 'Super Cool Times',
                'url': 'http://foo.com'
            })
        self.assertEqual(self.event.summary_sentences, expected)

    def test_timespan(self):
        text = 'the worldly philosophers today cautious optimism is based to a large extent on technological breakthroughs'
        members = []
        for title, day in zip('ABC', (20, 22, 24)):
            members.append(Article(
                title=title,
                text=text,
                created_at=datetime(2014, 1, day, 1, 1, 1, 111111)
            ))
        self.event = Event(members)
        results = self.event.timespan(datetime(2014, 1, 21, 1, 1, 1, 111111))
        self.assertEqual(len(results), 2)
        self.assertEqual({r.title for r in results}, {'B', 'C'})

    def test_score_prefer_newer_events(self):
        event_a = Event(self.prepare_articles())
        event_b = Event(self.prepare_articles())
        self.assertGreater(event_b.score, event_a.score)

    def test_score_prefer_events_with_higher_article_scores(self):
        event_a = Event(self.prepare_articles())
        event_b = Event(self.prepare_articles(score=200))
        self.assertGreater(event_b.score, event_a.score)
def test_event_titleize(self):
    # The title should come from the dominant (duplicated) articles,
    # not from the lone 'Robots' outlier.
    outlier = Article(title='Robots', text='dinosaurs are cool, Reagan')
    self.cluster = Event([outlier] + self.prepare_articles(type='duplicate'))
    self.assertEqual(self.cluster.title, 'Dinosaurs')
class EventTest(RequiresApp):
    """
    Note this tests the abstract Cluster class's methods as well.
    A Cluster instance can't be instantiated since it is abstract,
    so we use the Event as a testing proxy.
    """

    def setUp(self):
        self.article = self.prepare_articles()[0]

    def prepare_articles(self, type='standard'):
        """Build, persist, and return a pair of fixture articles."""
        a = {'title': 'Dinosaurs', 'text': 'dinosaurs are cool, Clinton'}
        b = {'title': 'Robots', 'text': 'robots are nice, Clinton'}
        c = {'title': 'Robots', 'text': 'papa was a rodeo, Clinton'}

        if type == 'standard':
            articles = [Article(**a), Article(**b)]
        elif type == 'duplicate':
            articles = [Article(**a), Article(**a)]
        elif type == 'different':
            articles = [Article(**a), Article(**c)]

        # Persist so entities exist when overlaps are calculated
        # during clustering.
        for article in articles:
            self.db.session.add(article)
        self.db.session.commit()

        return articles

    def prepare_event(self):
        self.cluster = Event(self.prepare_articles())
        self.db.session.add(self.cluster)
        self.db.session.commit()

    def test_event_similarity_with_object_different(self):
        self.prepare_event()
        score = self.cluster.similarity(self.article)
        self.assertNotEqual(score, 1.0)
        self.assertNotEqual(score, 0.0)

    def test_event_similarity_with_object_duplicates(self):
        event = Event(self.prepare_articles(type='duplicate'))
        self.assertEqual(event.similarity(self.article), 1.0)

    def test_event_similarity_with_cluster_duplicates(self):
        self.prepare_event()
        twin = Event(self.prepare_articles())
        score = self.cluster.similarity(twin)
        # Identical clusters don't reach 1.0: we average the pairwise
        # article similarities between the clusters rather than
        # measuring the overlap of the two clusters directly.
        self.assertAlmostEqual(score, 0.83999999999999)

    def test_event_similarity_with_cluster_different(self):
        self.prepare_event()
        other = Event(self.prepare_articles(type='different'))
        score = self.cluster.similarity(other)
        self.assertNotEqual(score, 1.0)
        self.assertNotEqual(score, 0.0)

    def test_event_expired_made_inactive(self):
        self.prepare_event()
        self.cluster.updated_at = datetime.utcnow() - timedelta(days=4)
        Event.cluster([self.article])
        self.assertFalse(self.cluster.active)

    def test_event_clusters_similar(self):
        self.prepare_event()
        self.cluster.members = self.prepare_articles(type='duplicate')
        Event.cluster([self.article])
        self.assertEqual(len(self.cluster.members), 3)

    def test_event_does_not_cluster_if_no_shared_entities(self):
        self.prepare_event()
        self.cluster.members = [
            Article(title='Robots',
                    text='dinosaurs are cool, Reagan',
                    created_at=datetime.utcnow())
        ]
        Event.cluster([self.article])
        self.assertEqual(len(self.cluster.members), 1)

    def test_event_does_not_cluster_not_similar(self):
        self.prepare_event()
        outsider = Article(title='Superstars',
                           text='superstars are awesome, Clinton',
                           created_at=datetime.utcnow())
        Event.cluster([outsider])
        self.assertEqual(len(self.cluster.members), 2)

    def test_event_no_matching_cluster_creates_new_cluster(self):
        unmatched = Article(title='Superstars',
                            text='superstars are awesome, Clinton',
                            created_at=datetime.utcnow())
        Event.cluster([unmatched])
        self.assertEqual(Event.query.count(), 1)

    def test_event_entitize(self):
        members = [
            Article(title='Robots', text='dinosaurs are cool, Reagan'),
            self.prepare_articles()[0]
        ]
        self.cluster = Event(members)
        self.assertEqual({ent.name for ent in self.cluster.entities},
                         {'Clinton', 'Reagan'})

    def test_event_entitize_no_duplicates(self):
        self.cluster = Event(self.prepare_articles())
        self.assertEqual([ent.name for ent in self.cluster.entities],
                         ['Clinton'])

    def test_event_titleize(self):
        outlier = Article(title='Robots', text='dinosaurs are cool, Reagan')
        members = [outlier] + self.prepare_articles(type='duplicate')
        self.cluster = Event(members)
        self.assertEqual(self.cluster.title, 'Dinosaurs')

    def test_event_summarize(self):
        self.cluster = Event(self.prepare_articles())
        self.assertTrue(self.cluster.summary)

    def test_event_summarize_single_article(self):
        self.cluster = Event([self.prepare_articles()[0]])
        self.assertTrue(self.cluster.summary)

    def test_event_timespan(self):
        text = 'the worldly philosophers today cautious optimism is based to a large extent on technological breakthroughs'
        members = [
            Article(title=title, text=text,
                    created_at=datetime(2014, 1, day, 1, 1, 1, 111111))
            for title, day in (('A', 20), ('B', 22), ('C', 24))
        ]
        self.cluster = Event(members)
        results = self.cluster.timespan(datetime(2014, 1, 21, 1, 1, 1, 111111))
        self.assertEqual(len(results), 2)
        self.assertEqual({r.title for r in results}, {'B', 'C'})
def process_events(h, clusters):
    """
    Takes clusters of node uuids and builds, modifies, and deletes events out of them.

    Args:
        h: the hierarchy object (provides `to_iid` and `prune`), used to
           map article node ids and to clean up expired events' articles.
        clusters: clusters of article node uuids to reconcile against the
           currently-active events.
    """
    now = datetime.utcnow()

    # Get existing event clusters.
    event_map = {}
    existing = {}
    for e in Event.all_active():
        # Map event ids to their event, for lookup later.
        event_map[e.id] = e
        # Map event ids to a list of their member node ids.
        existing[e.id] = [a.node_id for a in e.articles]

    # Figure out which events to update, delete, and create.
    to_update, to_create, to_delete, unchanged = triage(existing, clusters)

    # Build brand-new events for clusters with no existing counterpart.
    for a_ids in to_create:
        # NOTE(review): `id.item()` implies the node ids are numpy
        # scalars — confirm against the caller.
        articles = Article.query.filter(Article.node_id.in_([id.item() for id in a_ids])).order_by(Article.created_at.desc()).all()
        e = Event(articles)
        # Articles are ordered newest-first, so [0] is the newest and
        # [-1] the oldest. NOTE(review): created_at from the newest and
        # updated_at from the oldest looks inverted — confirm intended.
        e.created_at = articles[0].created_at
        e.updated_at = articles[-1].updated_at
        rep_article = representative_article(h, a_ids, articles)
        e.title = rep_article.title
        e.image = rep_article.image
        db.session.add(e)

    # Re-point changed events at their new member sets.
    for e_id, a_ids in to_update.items():
        e = event_map[e_id]
        articles = Article.query.filter(Article.node_id.in_([id.item() for id in a_ids])).all()
        e.members = articles
        rep_article = representative_article(h, a_ids, articles)
        e.title = rep_article.title
        e.image = rep_article.image
        e.update()

    # Freeze expiring events and clean up their articles from the hierarchy.
    for e_id in unchanged:
        e = event_map[e_id]
        # Events untouched for more than 3 days are deactivated and their
        # articles pruned out of the hierarchy.
        if (now - e.updated_at).days > 3:
            e.active = False
            nodes = [h.to_iid(a.node_id) for a in e.articles]
            h.prune(nodes)

    # Do this LAST so any of this event's associated articles
    # have a chance to be moved to their new clusters (if any).
    for e_id in to_delete:
        db.session.delete(event_map[e_id])
        # does this need to prune the articles as well?
        # i think the assumption is that a deleted event's articles have all migrated elsewhere.

    db.session.commit()
def test_score_prefer_events_with_higher_article_scores(self):
    """Events whose articles score higher should themselves score higher."""
    low_scoring = Event(self.prepare_articles())
    high_scoring = Event(self.prepare_articles(score=200))
    self.assertGreater(high_scoring.score, low_scoring.score)
class EventTest(RequiresApp):
    """
    Exercises Event, which doubles as a concrete stand-in for the abstract
    Cluster class: Cluster cannot be instantiated directly, so its shared
    methods are tested through Event.
    """

    def setUp(self):
        self.article = self.prepare_articles()[0]

    def prepare_articles(self, type='standard'):
        """Build and persist a pair of articles of the requested flavor.

        Persisting is required: entities only exist after commit, and
        entity overlap is what the clustering similarity relies on.
        """
        dino = {'title': 'Dinosaurs', 'text': 'dinosaurs are cool, Clinton'}
        robo = {'title': 'Robots', 'text': 'robots are nice, Clinton'}
        rodeo = {'title': 'Robots', 'text': 'papa was a rodeo, Clinton'}
        if type == 'standard':
            specs = [dino, robo]
        elif type == 'duplicate':
            specs = [dino, dino]
        elif type == 'different':
            specs = [dino, rodeo]
        articles = [Article(**spec) for spec in specs]
        for art in articles:
            self.db.session.add(art)
        self.db.session.commit()
        return articles

    def prepare_event(self):
        """Persist a fresh two-article event as self.cluster."""
        self.cluster = Event(self.prepare_articles())
        self.db.session.add(self.cluster)
        self.db.session.commit()

    def test_event_similarity_with_object_different(self):
        self.prepare_event()
        sim = self.cluster.similarity(self.article)
        # Related but not identical: strictly between 0 and 1.
        self.assertNotEqual(sim, 1.0)
        self.assertNotEqual(sim, 0.0)

    def test_event_similarity_with_object_duplicates(self):
        duplicates = self.prepare_articles(type='duplicate')
        event = Event(duplicates)
        sim = event.similarity(self.article)
        self.assertEqual(sim, 1.0)

    def test_event_similarity_with_cluster_duplicates(self):
        self.prepare_event()
        twin = Event(self.prepare_articles())
        sim = self.cluster.similarity(twin)
        # Cluster-to-cluster similarity averages pairwise article
        # similarities rather than measuring member overlap, so two
        # identical clusters do not currently score a full 1.0.
        self.assertAlmostEqual(sim, 0.83999999999999)

    def test_event_similarity_with_cluster_different(self):
        self.prepare_event()
        other = Event(self.prepare_articles(type='different'))
        sim = self.cluster.similarity(other)
        self.assertNotEqual(sim, 1.0)
        self.assertNotEqual(sim, 0.0)

    def test_event_expired_made_inactive(self):
        # An event last updated four days ago is marked inactive by the
        # next clustering pass.
        self.prepare_event()
        self.cluster.updated_at = datetime.utcnow() - timedelta(days=4)
        Event.cluster([self.article])
        self.assertFalse(self.cluster.active)

    def test_event_clusters_similar(self):
        self.prepare_event()
        self.cluster.members = self.prepare_articles(type='duplicate')
        Event.cluster([self.article])
        # The new article joins the two existing duplicate members.
        self.assertEqual(len(self.cluster.members), 3)

    def test_event_does_not_cluster_if_no_shared_entities(self):
        self.prepare_event()
        self.cluster.members = [
            Article(title='Robots',
                    text='dinosaurs are cool, Reagan',
                    created_at=datetime.utcnow())
        ]
        Event.cluster([self.article])
        # Similar wording but no entity overlap — membership is unchanged.
        self.assertEqual(len(self.cluster.members), 1)

    def test_event_does_not_cluster_not_similar(self):
        self.prepare_event()
        outlier = Article(title='Superstars',
                          text='superstars are awesome, Clinton',
                          created_at=datetime.utcnow())
        Event.cluster([outlier])
        # The dissimilar article must not be absorbed into the event.
        self.assertEqual(len(self.cluster.members), 2)

    def test_event_no_matching_cluster_creates_new_cluster(self):
        orphan = Article(title='Superstars',
                         text='superstars are awesome, Clinton',
                         created_at=datetime.utcnow())
        Event.cluster([orphan])
        # With nothing to join, clustering creates a brand-new event.
        self.assertEqual(Event.query.count(), 1)

    def test_event_entitize(self):
        members = [
            Article(title='Robots', text='dinosaurs are cool, Reagan'),
            self.prepare_articles()[0]
        ]
        self.cluster = Event(members)
        found = {entity.name for entity in self.cluster.entities}
        self.assertEqual(found, {'Clinton', 'Reagan'})

    def test_event_entitize_no_duplicates(self):
        self.cluster = Event(self.prepare_articles())
        # Both members mention Clinton; the entity is recorded only once.
        names = [entity.name for entity in self.cluster.entities]
        self.assertEqual(names, ['Clinton'])

    def test_event_titleize(self):
        members = [
            Article(title='Robots', text='dinosaurs are cool, Reagan')
        ] + self.prepare_articles(type='duplicate')
        self.cluster = Event(members)
        # The majority title ('Dinosaurs' appears twice) wins.
        self.assertEqual(self.cluster.title, 'Dinosaurs')

    def test_event_summarize(self):
        self.cluster = Event(self.prepare_articles())
        self.assertTrue(self.cluster.summary)

    def test_event_summarize_single_article(self):
        self.cluster = Event([self.prepare_articles()[0]])
        self.assertTrue(self.cluster.summary)

    def test_event_timespan(self):
        text = 'the worldly philosophers today cautious optimism is based to a large extent on technological breakthroughs'
        members = [
            Article(title='A', text=text,
                    created_at=datetime(2014, 1, 20, 1, 1, 1, 111111)),
            Article(title='B', text=text,
                    created_at=datetime(2014, 1, 22, 1, 1, 1, 111111)),
            Article(title='C', text=text,
                    created_at=datetime(2014, 1, 24, 1, 1, 1, 111111)),
        ]
        self.cluster = Event(members)
        # Only B and C fall after the requested start date.
        window = self.cluster.timespan(datetime(2014, 1, 21, 1, 1, 1, 111111))
        self.assertEqual(len(window), 2)
        self.assertEqual({a.title for a in window}, {'B', 'C'})
def test_score_prefer_newer_events(self):
    """Of two otherwise-equivalent events, the more recently created scores higher."""
    older = Event(self.prepare_articles())
    newer = Event(self.prepare_articles())
    self.assertGreater(newer.score, older.score)
def test_event_expired_made_inactive(self):
    """An event last updated four days ago goes inactive after a clustering run."""
    self.prepare_event()
    four_days_ago = datetime.utcnow() - timedelta(days=4)
    self.cluster.updated_at = four_days_ago
    Event.cluster([self.article])
    self.assertFalse(self.cluster.active)
def test_event_entitize_no_duplicates(self):
    """An entity shared by multiple members appears only once on the event."""
    self.cluster = Event(self.prepare_articles())
    names = [entity.name for entity in self.cluster.entities]
    self.assertEqual(names, ['Clinton'])