def test_cluster_creates_events(self):
    """Clustering a fresh batch of articles should produce at least one event."""
    cluster.conf['min_articles'] = 1

    # No events exist yet, so clustering must create some.
    self.assertEqual(Event.query.count(), 0)

    cluster.cluster(self.prepare_articles())

    self.assertTrue(Event.query.count() > 0)
def test_cluster_creates_stories(self):
    """Clustering a varied batch of articles should roll events up into stories."""
    cluster.conf['min_articles'] = 1
    cluster.conf['min_events'] = 1

    # No stories exist yet, so clustering must create some.
    self.assertEqual(Story.query.count(), 0)

    cluster.cluster(self.prepare_articles(type='variety'))

    self.assertTrue(Story.query.count() > 0)
def cluster_articles():
    """
    Clusters articles which have not yet been incorporated
    into the clustering hierarchy (i.e. those with no node id).

    If another process already holds the clustering lock, this
    run is skipped — the pending articles will be picked up by
    a later run.
    """
    pending = Article.query.filter(Article.node_id == None).all()
    if pending:
        try:
            cluster.cluster(pending)
        except cluster.LockException:
            # Deliberate best-effort: a concurrent clustering job owns
            # the lock, so silently skip this batch. (Fix: the caught
            # exception was previously bound to an unused variable.)
            pass
def test_cluster_minimum_articles(self):
    """Only clusters meeting the article minimum become events."""
    # Require a minimum of two articles to create an event.
    cluster.conf['min_articles'] = 2

    # Two pairs of distinct articles, trimmed to three members:
    # one would-be cluster has 2 articles, the other only 1.
    articles = (self.prepare_articles(type='different')
                + self.prepare_articles(type='different'))[:3]

    cluster.cluster(articles)

    # Only the two-article cluster qualifies.
    self.assertEqual(Event.query.count(), 1)
    self.assertEqual(Event.query.first().members.all(),
                     [articles[0], articles[2]])
def test_cluster_minimum_articles(self):
    """An event is created only when a cluster reaches ``min_articles``."""
    # Demand two articles per event.
    cluster.conf['min_articles'] = 2

    # Build two distinct pairs, then slice so one cluster would have
    # two articles and the other just one.
    batch = self.prepare_articles(type='different')
    batch += self.prepare_articles(type='different')
    articles = batch[:3]

    cluster.cluster(articles)

    # The single-article cluster is rejected; one event remains.
    self.assertEqual(Event.query.count(), 1)
    self.assertEqual(
        Event.query.first().members.all(),
        [articles[0], articles[2]]
    )
def test_cluster_updates_stories(self):
    """New events should be folded into the existing story, not create new ones."""
    cluster.conf['min_articles'] = 1
    cluster.conf['min_events'] = 1
    cluster.conf['story_threshold'] = 80.0

    raw_articles = self.prepare_articles(type='variety')
    first_batch = raw_articles[:4]
    second_batch = raw_articles[4:]

    cluster.cluster(first_batch)
    self.assertEqual(Story.query.count(), 1)
    for story in Story.query.all():
        self.assertEqual(story.members.count(), 1)

    cluster.cluster(second_batch)

    # Still a single story — its membership grew instead.
    self.assertEqual(Story.query.count(), 1)
    for story in Story.query.all():
        self.assertEqual(story.members.count(), 4)
def test_cluster_minimum_articles_deletes(self):
    """Raising ``min_articles`` retroactively prunes events below the minimum."""
    cluster.conf['min_articles'] = 2
    articles = (self.prepare_articles(type='different')
                + self.prepare_articles(type='different'))
    cluster.cluster(articles)

    # Both clusters meet the minimum, so two events exist.
    self.assertEqual(Event.query.count(), 2)

    # A single article duplicating one existing event.
    new_articles = self.prepare_articles(type='different')[:1]

    # Raise the minimum, then cluster the duplicate: one cluster grows
    # to three members while the other stays at two.
    cluster.conf['min_articles'] = 3
    cluster.cluster(new_articles)

    # The two-member event falls below the new minimum and is deleted.
    self.assertEqual(Event.query.count(), 1)
    self.assertEqual(Event.query.first().members.all(),
                     [articles[0], articles[2], new_articles[0]])
def cluster(articles, threshold=0.7, debug=False):
    """
    Clusters a set of articles into existing events (or creates new ones).

    Args:
        | articles (list) -- the Articles to cluster
        | threshold (float) -- the similarity threshold for qualifying a cluster
        | debug (bool) -- log clustering info at DEBUG level if True

    Returns:
        | list -- the Event clusters that were updated or newly created
    """
    log = logger('EVENT_CLUSTERING')
    if debug:
        log.setLevel('DEBUG')
    else:
        log.setLevel('ERROR')

    updated_clusters = []
    # Only currently-active events can accept new members.
    active_clusters = Event.query.filter_by(active=True).all()
    now = datetime.utcnow()

    for article in articles:
        # Select candidate clusters,
        # i.e. active clusters which share at least one entity with this article.
        a_ents = [entity.slug for entity in article.entities]
        candidate_clusters = []
        for c in active_clusters:
            c_ents = [entity.slug for entity in c.entities]
            if set(c_ents).intersection(a_ents):
                candidate_clusters.append(c)

        # NOTE(review): this call shares this function's own name but uses a
        # different signature (single article, candidate list, `logger` kwarg).
        # Presumably a generic clustering helper imported elsewhere in the
        # module is intended here — verify it is actually the name in scope,
        # otherwise this recurses with invalid arguments.
        selected_cluster = cluster(article, candidate_clusters, threshold=threshold, logger=log)

        # If no selected cluster was found, then create a new one.
        if not selected_cluster:
            log.debug(
                'No qualifying clusters found, creating a new cluster.')
            selected_cluster = Event([article])
            db.session.add(selected_cluster)
        updated_clusters.append(selected_cluster)

    for clus in active_clusters:
        # Mark expired clusters inactive; anything not updated
        # in over 3 days is considered expired.
        if (now - clus.updated_at).days > 3:
            clus.active = False
        else:
            clus.update()

    db.session.commit()
    return updated_clusters
def test_cluster_minimum_articles_deletes(self):
    """Events that no longer satisfy a raised ``min_articles`` are removed."""
    cluster.conf['min_articles'] = 2
    batch = self.prepare_articles(type='different')
    batch += self.prepare_articles(type='different')
    articles = batch
    cluster.cluster(articles)

    # Two qualifying events to start with.
    self.assertEqual(Event.query.count(), 2)

    # One more article that duplicates an existing event.
    new_articles = self.prepare_articles(type='different')
    new_articles = new_articles[:1]

    # Bump the minimum and cluster the duplicate — afterwards one
    # cluster holds three articles, the other only two.
    cluster.conf['min_articles'] = 3
    cluster.cluster(new_articles)

    # The under-sized event should have been deleted.
    self.assertEqual(Event.query.count(), 1)
    expected = [articles[0], articles[2], new_articles[0]]
    self.assertEqual(Event.query.first().members.all(), expected)
def test_cluster_updates_events(self):
    """Duplicate articles should join the matching event, not create new ones."""
    cluster.conf['min_articles'] = 1

    articles = self.prepare_articles(type='different')
    cluster.cluster(articles)

    self.assertEqual(Event.query.count(), 2)
    for event in Event.query.all():
        self.assertEqual(event.members.count(), 1)

    # Two articles identical to the first one.
    cluster.cluster(self.prepare_articles(type='duplicate'))

    # No new events were created.
    self.assertEqual(Event.query.count(), 2)

    # The event matching the duplicates now has 3 articles;
    # the other is untouched.
    for event in Event.query.all():
        if articles[1] in event.members:
            self.assertEqual(event.members.count(), 1)
        else:
            self.assertEqual(event.members.count(), 3)
def run(self):
    """
    Rebuilds the clustering hierarchy from scratch: backs up any
    existing on-disk hierarchy, deletes all events, detaches every
    article from its node, then re-clusters all articles in batches.
    """
    path = APP['CLUSTERING']['hierarchy_path']
    path = os.path.expanduser(path)
    if os.path.exists(path):
        print('Backing up existing hierarchy...')
        shutil.move(path, path + '.bk')

    # Delete existing events. (The returned row count is not needed.)
    Event.query.delete()

    # Reset node associations.
    print('Resetting article-node associations...')
    articles = Article.query.filter(Article.node_id != None).all()
    for article in articles:
        article.node_id = None
    db.session.commit()

    print('Reconstructing the hierarchy...')
    start_time = time()
    total = Article.query.count()
    p = Progress()

    # Cluster articles in batches of `batch_size`, for memory's sake.
    batch_size = 100
    clustered = 0  # running total across batches
    articles, remaining = get_unclustered_articles(batch_size)
    p.print_progress((total - remaining) / (total - 1))
    while articles:
        cluster.cluster(articles, snip=False)
        clustered += len(articles)
        articles, remaining = get_unclustered_articles(batch_size)
        p.print_progress((total - remaining) / (total - 1))

    elapsed_time = time() - start_time
    # Report the running total; previously this printed len() of the
    # final (empty) batch, so it always claimed 0 articles.
    print('Clustered {0} articles in {1}.'.format(clustered, elapsed_time))
    print('Reconstruction done!')
def run(self):
    """
    Rebuilds the clustering hierarchy from scratch: backs up any
    existing on-disk hierarchy, deletes all events, detaches every
    article from its node, then re-clusters all articles in batches.
    """
    path = APP['CLUSTERING']['hierarchy_path']
    path = os.path.expanduser(path)
    if os.path.exists(path):
        print('Backing up existing hierarchy...')
        shutil.move(path, path + '.bk')

    # Delete existing events. (Row count not needed.)
    Event.query.delete()

    # Reset node associations.
    print('Resetting article-node associations...')
    articles = Article.query.filter(Article.node_id != None).all()
    for article in articles:
        article.node_id = None
    db.session.commit()

    print('Reconstructing the hierarchy...')
    start_time = time()
    total = Article.query.count()
    p = Progress()

    # Cluster articles in batches of `batch_size`, for memory's sake.
    batch_size = 100
    clustered = 0  # running total across batches
    articles, remaining = get_unclustered_articles(batch_size)
    p.print_progress((total - remaining) / (total - 1))
    while articles:
        cluster.cluster(articles, snip=False)
        clustered += len(articles)
        articles, remaining = get_unclustered_articles(batch_size)
        p.print_progress((total - remaining) / (total - 1))

    elapsed_time = time() - start_time
    # Report the running total; previously this printed len() of the
    # final (empty) batch, so it always claimed 0 articles.
    print('Clustered {0} articles in {1}.'.format(clustered, elapsed_time))
    print('Reconstruction done!')
def test_cluster_freezes_events(self):
    """Events that go stale without new articles are marked inactive."""
    cluster.conf['min_articles'] = 1

    articles = self.prepare_articles(type='different')
    cluster.cluster(articles)
    self.assertEqual(Event.query.count(), 2)

    for event in Event.query.all():
        self.assertTrue(event.active)
        if articles[1] in event.members:
            # Backdate the event we expect to receive no new
            # articles, so that it looks expired.
            event.updated_at = event.updated_at - timedelta(days=4)

    # Two articles identical to the first one.
    cluster.cluster(self.prepare_articles(type='duplicate'))

    # The refreshed event stays active; the stale one is frozen.
    for event in Event.query.all():
        if articles[1] in event.members:
            self.assertFalse(event.active)
        else:
            self.assertTrue(event.active)
def cluster(articles, threshold=0.7, debug=False):
    """
    Clusters a set of articles into existing events (or creates new ones).

    Args:
        | articles (list) -- the Articles to cluster
        | threshold (float) -- the similarity threshold for qualifying a cluster
        | debug (bool) -- log clustering info at DEBUG level if True

    Returns:
        | list -- the Event clusters updated or created for these articles
    """
    log = logger('EVENT_CLUSTERING')
    if debug:
        log.setLevel('DEBUG')
    else:
        log.setLevel('ERROR')

    updated_clusters = []
    # Only active events are eligible to gain members.
    active_clusters = Event.query.filter_by(active=True).all()
    now = datetime.utcnow()

    for article in articles:
        # Select candidate clusters,
        # i.e. active clusters which share at least one entity with this article.
        a_ents = [entity.slug for entity in article.entities]
        candidate_clusters = []
        for c in active_clusters:
            c_ents = [entity.slug for entity in c.entities]
            if set(c_ents).intersection(a_ents):
                candidate_clusters.append(c)

        # NOTE(review): this call shadows this function's own name with a
        # different signature (`logger` kwarg, single article) — presumably a
        # generic `cluster` helper defined elsewhere is meant; confirm which
        # name is in scope here, since a self-call would raise a TypeError.
        selected_cluster = cluster(article, candidate_clusters, threshold=threshold, logger=log)

        # If no selected cluster was found, then create a new one.
        if not selected_cluster:
            log.debug('No qualifying clusters found, creating a new cluster.')
            selected_cluster = Event([article])
            db.session.add(selected_cluster)
        updated_clusters.append(selected_cluster)

    for clus in active_clusters:
        # Mark expired clusters inactive: no update in over 3 days.
        if (now - clus.updated_at).days > 3:
            clus.active = False
        else:
            clus.update()

    db.session.commit()
    return updated_clusters
def cluster(events, threshold=0.7, debug=False):
    """
    Clusters a set of events into existing stories (or creates new ones).

    Args:
        | events (list) -- the Events to cluster
        | threshold (float) -- the similarity threshold for qualifying a cluster
        | debug (bool) -- will log clustering info if True

    Returns:
        | clusters (list) -- the list of updated clusters
    """
    log = logger('STORY_CLUSTERING')
    if debug:
        log.setLevel('DEBUG')
    else:
        log.setLevel('ERROR')

    updated_clusters = []

    for event in events:
        # Find stories which have some matching entities with this event.
        # NOTE(review): this filters Story by Entity.name with no explicit
        # join between the two models — confirm the relationship makes the
        # join implicit, otherwise this may select wrong candidates.
        candidate_clusters = Story.query.filter(Entity.name.in_([entity.name for entity in event.entities])).all()

        # Cluster this event.
        # NOTE(review): shadows this function's own name with a different
        # signature (`logger` kwarg, single event) — presumably a generic
        # clustering helper imported elsewhere; verify which name is in scope.
        selected_cluster = cluster(event, candidate_clusters, threshold=threshold, logger=log)

        # If no selected cluster was found, then create a new one.
        if not selected_cluster:
            log.debug('No qualifying clusters found, creating a new cluster.')
            selected_cluster = Story([event])
            db.session.add(selected_cluster)
        updated_clusters.append(selected_cluster)

    db.session.commit()
    return updated_clusters
def seed(debug=False):
    """
    Resets the database and seeds it with sample sources, articles,
    events, a test user, and a test OAuth client.

    Args:
        | debug (bool) -- if True, pretty-prints each seed entry as
                          it is processed.
    """
    # Patch out saving images to S3.
    patcher = patch('argos.util.storage.save_from_url', autospec=True, return_value='https://i.imgur.com/Zf9mXlj.jpg')
    patcher.start()

    sample_images = [
        'https://upload.wikimedia.org/wikipedia/commons/d/d5/Michael_Rogers_-_Herbiers_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/6/6e/Brandenburger_Tor_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/ad/ChicagoAntiGaddafiHopeless.jpg/576px-ChicagoAntiGaddafiHopeless.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e6/Evo_morales_2_year_bolivia_Joel_Alvarez.jpg/640px-Evo_morales_2_year_bolivia_Joel_Alvarez.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/9/9f/2010_wet_season_cloud_over_colombia.jpg/640px-2010_wet_season_cloud_over_colombia.jpg',
        # BUG FIX: a missing comma here used to concatenate this URL with the
        # next one into a single invalid entry.
        'https://upload.wikimedia.org/wikipedia/commons/thumb/2/27/Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg/640px-Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a9/2010-10-23-Demo-Stuttgart21-Befuerworter.png/640px-2010-10-23-Demo-Stuttgart21-Befuerworter.png',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/fc/51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg/640px-51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Tough_return%2C_365.35.jpg/640px-Tough_return%2C_365.35.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/8/83/Saint_Isaac%27s_Cathedral_in_SPB.jpeg/800px-Saint_Isaac%27s_Cathedral_in_SPB.jpeg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a5/Ponta_de_S%C3%A3o_Louren%C3%A7o_north_north_east.jpg/800px-Ponta_de_S%C3%A3o_Louren%C3%A7o_north_north_east.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e9/TU_Bibl_01_DSC1099w.jpg/644px-TU_Bibl_01_DSC1099w.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/fe/%D0%9C%D0%B0%D0%BA%D0%B5%D0%B4%D0%BE%D0%BD%D0%B8%D1%83%D0%BC_-_%D0%9A%D1%80%D1%83%D1%88%D0%B5%D0%B2%D0%BE.jpg/800px-%D0%9C%D0%B0%D0%BA%D0%B5%D0%B4%D0%BE%D0%BD%D0%B8%D1%83%D0%BC_-_%D0%9A%D1%80%D1%83%D1%88%D0%B5%D0%B2%D0%BE.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/6/64/Puente_Mong%2C_Ciudad_Ho_Chi_Minh%2C_Vietnam%2C_2013-08-14%2C_DD_01.JPG/800px-Puente_Mong%2C_Ciudad_Ho_Chi_Minh%2C_Vietnam%2C_2013-08-14%2C_DD_01.JPG',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/c/c3/Autignac%2C_H%C3%A9rault_01.jpg/800px-Autignac%2C_H%C3%A9rault_01.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/6/6a/Caesio_teres_in_Fiji_by_Nick_Hobgood.jpg/800px-Caesio_teres_in_Fiji_by_Nick_Hobgood.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Ash_in_Yogyakarta_during_the_2014_eruption_of_Kelud_01.jpg/800px-Ash_in_Yogyakarta_during_the_2014_eruption_of_Kelud_01.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/b/bd/12-07-12-wikimania-wdc-by-RalfR-010.jpg/800px-12-07-12-wikimania-wdc-by-RalfR-010.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/7/71/Mortagne-sur-Gironde_Civellier_Mayflowers_2013.jpg/800px-Mortagne-sur-Gironde_Civellier_Mayflowers_2013.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/5/5a/British_Museum_Great_Court%2C_London%2C_UK_-_Diliff.jpg/611px-British_Museum_Great_Court%2C_London%2C_UK_-_Diliff.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/2/26/Mercedes-Benz_Museum_201312_08_blue_hour.jpg/800px-Mercedes-Benz_Museum_201312_08_blue_hour.jpg'
    ]

    print('Resetting the database...')
    db.reflect()
    db.drop_all()
    db.create_all()

    # Create sources
    print('Creating sources...')
    create_sources()
    num_sources = Source.query.count()
    print('Created {0} sources.'.format(num_sources))

    # Create articles from the JSON seed file.
    # (Fix: the file handle was previously never closed.)
    with open('manage/core/data/seed.json', 'r') as seeds:
        entries = json.load(seeds)
    print('Seeding {0} articles...'.format(len(entries)))
    articles = []
    for entry in entries:
        if debug:
            print(json.dumps(entry, sort_keys=True, indent=4))
        feed = Feed.query.filter_by(ext_url=entry['source']).first()
        a = Article(
            ext_url=entry['url'],
            feed=feed,
            source=feed.source,
            html=entry['html'],
            text=entry['text'],
            tags=entry['tags'],
            title=entry['title'],
            created_at=parse(entry['published']),
            updated_at=parse(entry['updated']),
            image=random.choice(sample_images),  # fake image
            score=random.random() * 100          # fake score
        )
        articles.append(a)
        db.session.add(a)
        progress_bar(len(articles) / len(entries) * 100)

    print('Creating additional articles...')

    # Collect all appropriate files.
    all_files = []
    for dir, subdir, files in os.walk('manage/core/data/organized_articles'):
        for file in files:
            filepath = os.path.join(dir, file)
            name, ext = os.path.splitext(filepath)
            if ext == '.txt':
                all_files.append((dir, name, filepath))

    # Create articles for appropriate files.
    for idx, (dir, name, filepath) in enumerate(all_files, start=1):
        # (Fix: per-file handles were previously never closed.)
        with open(filepath, 'r') as f:
            text = f.read()
        article = Article(
            text=text,
            title=name.split('/')[-1],
            ext_url='http://fauxurl/',
            source=Source.query.get(1),          # fake source
            image=random.choice(sample_images),  # fake image
            score=random.random() * 100          # fake score
        )
        db.session.add(article)
        articles.append(article)
        # Fix: progress was computed from the cumulative article count
        # (including JSON-seeded ones), so it exceeded 100%.
        progress_bar(idx / len(all_files) * 100)

    db.session.commit()

    num_articles = Article.query.count()
    num_concepts = Concept.query.count()
    print('Seeded {0} articles.'.format(num_articles))
    print('Found {0} concepts.'.format(num_concepts))

    print('Clustering articles into events...')
    cluster.cluster(articles)
    num_events = Event.query.count()
    print('Created {0} event clusters.'.format(num_events))

    print('Clustering events into stories...')
    # TO DO
    num_stories = Story.query.count()
    print('Created {0} story clusters.'.format(num_stories))

    patcher.stop()

    print('\n\n==============================================')
    print('From {0} sources, seeded {1} articles, found {2} concepts, created {3} events and {4} stories.'.format(num_sources, num_articles, num_concepts, num_events, num_stories))
    print('==============================================\n\n')

    client = current_app.test_client()
    ctx = current_app.test_request_context()
    ctx.push()
    register_user(email='[email protected]', password='******')
    ctx.pop()
    print('\n\n==============================================')
    # NOTE(review): this message claims the password is "password" but the
    # register_user call above passes a different value — confirm which is
    # intended and align them.
    print('Created a test user, email is [email protected], password is password')
    print('==============================================\n\n')

    client = Client(
        #client_id=gen_salt(40),
        #client_secret=gen_salt(50),
        client_id='test',
        client_secret='test',
        _redirect_uris='http://localhost:5000/authorized',
        _default_scopes='userinfo',
        _allowed_grant_types='authorization_code refresh_token password',
        user_id=None,
        is_confidential=True  # make a confidential client.
    )
    db.session.add(client)
    db.session.commit()
    print('\n\n==============================================')
    print('Created a test client:\nclient id: {0}\nclient secret: {1}'.format(client.client_id, client.client_secret))
    print('==============================================\n\n')