def evaluate_clustering():
    """
    Evaluate the clustering algorithm.
    """
    logger.info('Constructing expected clusters and articles...')
    expected_clusters = {}
    articles = []
    all_files = []

    # Collect all appropriate files.
    for dir, subdir, files in os.walk('manage/evaluate/organized_articles'):
        for file in files:
            filepath = os.path.join(dir, file)
            name, ext = os.path.splitext(filepath)
            if ext == '.txt':
                all_files.append((dir, name, filepath))

    # Create articles for appropriate files.
    for dir, name, filepath in all_files:
        category = dir.split('/')[-1]
        with open(filepath, 'r') as f:
            article = Article(
                text=f.read(),
                title=name.split('/')[-1]
            )
        expected_clusters.setdefault(category, []).append(article)
        articles.append(article)
        progress_bar(len(articles) / len(all_files) * 100)
    print('\n')

    logger.info('Will cluster {0} articles.'.format(len(articles)))
    logger.info('Expecting {0} clusters.'.format(len(expected_clusters.keys())))

    logger.info('Clustering...')
    p = cProfile.Profile()
    clusters = p.runcall(Event.cluster, articles, threshold=0.04, debug=True)

    logger.info('Created {0} clusters.'.format(len(clusters)))
    logger.info('Cluster composition is as follows...')
    for c in clusters:
        logger.info([m.title for m in c.members])

    logger.info('Profiling statistics from the clustering...')
    ps = pstats.Stats(p)
    ps.sort_stats('time').print_stats(10)
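# evaluate_clustering() above logs the composition of each produced cluster but never
# scores it against `expected_clusters`. Below is a minimal sketch of one way to do that;
# it is not part of the project. It assumes only what the function above already shows:
# `expected_clusters` maps a category name to a list of Articles, and each produced
# cluster has a `members` list of Articles with a `title` attribute. The helper name
# `cluster_purity` is hypothetical.
def cluster_purity(expected_clusters, clusters):
    """Return the fraction of articles that land in a cluster dominated by their own category."""
    # Map each article title to its expected category.
    title_to_category = {
        article.title: category
        for category, members in expected_clusters.items()
        for article in members
    }

    total = 0
    correctly_grouped = 0
    for c in clusters:
        if not c.members:
            continue
        # Count how many members of this cluster belong to each expected category.
        counts = {}
        for m in c.members:
            category = title_to_category.get(m.title)
            counts[category] = counts.get(category, 0) + 1
        # Credit the members of the dominant category; the rest are "impurities".
        correctly_grouped += max(counts.values())
        total += len(c.members)
    return correctly_grouped / total if total else 0.0

# Hypothetical usage: purity = cluster_purity(expected_clusters, clusters)
# A purity of 1.0 would mean every produced cluster contains articles from a single category.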
def generate(keywords, num=5000):
    this_dir = path.dirname(__file__)
    articles = load_articles()

    # Filter down to articles containing all of the specified keywords.
    results = []
    articles = articles[:num]
    for idx, article in enumerate(articles):
        article_words = tokenize(article['text'])
        if set(article_words).issuperset(set(keywords)):
            results.append(article)
        progress_bar(idx / len(articles) * 100)

    # Store articles into separate text files.
    for article in results:
        #print(json.dumps(article, sort_keys=True, indent=4))
        article_path = 'unorganized_articles/{0}.txt'.format(article['title'])
        with open(path.join(this_dir, article_path), 'wb') as f:
            f.write(article['text'].encode('utf-8'))
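# Usage sketch for generate(): it keeps only articles whose tokenized text contains
# *every* keyword (the issuperset check above), then writes each match to
# unorganized_articles/<title>.txt next to this module. The keywords below are
# purely illustrative; any list of strings works.
if __name__ == '__main__':
    generate(['climate', 'emissions'], num=1000)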
def seed(debug=False):
    this_dir = os.path.dirname(__file__)
    seeds = open(os.path.join(this_dir, 'seed.json'), 'r')
    sources = open(os.path.join(this_dir, 'seed_sources.json'), 'r')

    sample_images = [
        'https://upload.wikimedia.org/wikipedia/commons/d/d5/Michael_Rogers_-_Herbiers_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/6/6e/Brandenburger_Tor_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/ad/ChicagoAntiGaddafiHopeless.jpg/576px-ChicagoAntiGaddafiHopeless.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e6/Evo_morales_2_year_bolivia_Joel_Alvarez.jpg/640px-Evo_morales_2_year_bolivia_Joel_Alvarez.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/9/9f/2010_wet_season_cloud_over_colombia.jpg/640px-2010_wet_season_cloud_over_colombia.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/2/27/Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg/640px-Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a9/2010-10-23-Demo-Stuttgart21-Befuerworter.png/640px-2010-10-23-Demo-Stuttgart21-Befuerworter.png',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/fc/51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg/640px-51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Tough_return%2C_365.35.jpg/640px-Tough_return%2C_365.35.jpg'
    ]

    print('Resetting the database...')
    db.drop_all()
    db.create_all()

    # Create sources
    print('Creating sources...')
    for url in json.load(sources):
        s = Source(ext_url=url, name='The Times')  # fake name
        db.session.add(s)
    db.session.commit()
    num_sources = Source.query.count()
    print('Created {0} sources.'.format(num_sources))

    # Create articles
    entries = json.load(seeds)
    print('Seeding {0} articles...'.format(len(entries)))
    articles = []
    for entry in entries:
        if debug:
            print(json.dumps(entry, sort_keys=True, indent=4))

        source = Source.query.filter_by(ext_url=entry['source']).first()
        a = Article(
            ext_url=entry['url'],
            source=source,
            html=entry['html'],
            text=entry['text'],
            tags=entry['tags'],
            title=entry['title'],
            created_at=parse(entry['published']),
            updated_at=parse(entry['updated']),
            image=random.choice(sample_images)  # fake image
        )
        articles.append(a)
        db.session.add(a)
        progress_bar(len(articles) / len(entries) * 100)
    db.session.commit()

    num_articles = Article.query.count()
    num_entities = Entity.query.count()
    print('Seeded {0} articles.'.format(num_articles))
    print('Found {0} entities.'.format(num_entities))

    print('Clustering articles into events...')
    Event.cluster(articles, threshold=0.02, debug=True)
    num_events = Event.query.count()
    print('Created {0} event clusters.'.format(num_events))

    print('Clustering events into stories...')
    events = Event.query.all()
    Story.cluster(events, threshold=0.02, debug=True)
    num_stories = Story.query.count()
    print('Created {0} story clusters.'.format(num_stories))

    print('\n\n==============================================')
    print('From {0} sources, seeded {1} articles, found {2} entities, created {3} events and {4} stories.'.format(num_sources, num_articles, num_entities, num_events, num_stories))
    print('==============================================\n\n')
def seed(debug=False):
    # Patch out saving images to S3.
    patcher = patch('argos.util.storage.save_from_url', autospec=True, return_value='https://i.imgur.com/Zf9mXlj.jpg')
    patcher.start()

    seeds = open('manage/core/data/seed.json', 'r')

    sample_images = [
        'https://upload.wikimedia.org/wikipedia/commons/d/d5/Michael_Rogers_-_Herbiers_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/6/6e/Brandenburger_Tor_2004.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/ad/ChicagoAntiGaddafiHopeless.jpg/576px-ChicagoAntiGaddafiHopeless.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e6/Evo_morales_2_year_bolivia_Joel_Alvarez.jpg/640px-Evo_morales_2_year_bolivia_Joel_Alvarez.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/9/9f/2010_wet_season_cloud_over_colombia.jpg/640px-2010_wet_season_cloud_over_colombia.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/2/27/Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg/640px-Barack_Obama_at_Las_Vegas_Presidential_Forum.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a9/2010-10-23-Demo-Stuttgart21-Befuerworter.png/640px-2010-10-23-Demo-Stuttgart21-Befuerworter.png',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/fc/51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg/640px-51%C2%BA_Congresso_da_UNE_%28Conune%29.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f7/Tough_return%2C_365.35.jpg/640px-Tough_return%2C_365.35.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/8/83/Saint_Isaac%27s_Cathedral_in_SPB.jpeg/800px-Saint_Isaac%27s_Cathedral_in_SPB.jpeg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a5/Ponta_de_S%C3%A3o_Louren%C3%A7o_north_north_east.jpg/800px-Ponta_de_S%C3%A3o_Louren%C3%A7o_north_north_east.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e9/TU_Bibl_01_DSC1099w.jpg/644px-TU_Bibl_01_DSC1099w.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/f/fe/%D0%9C%D0%B0%D0%BA%D0%B5%D0%B4%D0%BE%D0%BD%D0%B8%D1%83%D0%BC_-_%D0%9A%D1%80%D1%83%D1%88%D0%B5%D0%B2%D0%BE.jpg/800px-%D0%9C%D0%B0%D0%BA%D0%B5%D0%B4%D0%BE%D0%BD%D0%B8%D1%83%D0%BC_-_%D0%9A%D1%80%D1%83%D1%88%D0%B5%D0%B2%D0%BE.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/6/64/Puente_Mong%2C_Ciudad_Ho_Chi_Minh%2C_Vietnam%2C_2013-08-14%2C_DD_01.JPG/800px-Puente_Mong%2C_Ciudad_Ho_Chi_Minh%2C_Vietnam%2C_2013-08-14%2C_DD_01.JPG',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/c/c3/Autignac%2C_H%C3%A9rault_01.jpg/800px-Autignac%2C_H%C3%A9rault_01.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/6/6a/Caesio_teres_in_Fiji_by_Nick_Hobgood.jpg/800px-Caesio_teres_in_Fiji_by_Nick_Hobgood.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Ash_in_Yogyakarta_during_the_2014_eruption_of_Kelud_01.jpg/800px-Ash_in_Yogyakarta_during_the_2014_eruption_of_Kelud_01.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/b/bd/12-07-12-wikimania-wdc-by-RalfR-010.jpg/800px-12-07-12-wikimania-wdc-by-RalfR-010.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/7/71/Mortagne-sur-Gironde_Civellier_Mayflowers_2013.jpg/800px-Mortagne-sur-Gironde_Civellier_Mayflowers_2013.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/5/5a/British_Museum_Great_Court%2C_London%2C_UK_-_Diliff.jpg/611px-British_Museum_Great_Court%2C_London%2C_UK_-_Diliff.jpg',
        'https://upload.wikimedia.org/wikipedia/commons/thumb/2/26/Mercedes-Benz_Museum_201312_08_blue_hour.jpg/800px-Mercedes-Benz_Museum_201312_08_blue_hour.jpg'
    ]

    print('Resetting the database...')
    db.reflect()
    db.drop_all()
    db.create_all()

    # Create sources
    print('Creating sources...')
    create_sources()
    num_sources = Source.query.count()
    print('Created {0} sources.'.format(num_sources))

    # Create articles
    entries = json.load(seeds)
    print('Seeding {0} articles...'.format(len(entries)))
    articles = []
    for entry in entries:
        if debug:
            print(json.dumps(entry, sort_keys=True, indent=4))

        feed = Feed.query.filter_by(ext_url=entry['source']).first()
        a = Article(
            ext_url=entry['url'],
            feed=feed,
            source=feed.source,
            html=entry['html'],
            text=entry['text'],
            tags=entry['tags'],
            title=entry['title'],
            created_at=parse(entry['published']),
            updated_at=parse(entry['updated']),
            image=random.choice(sample_images),  # fake image
            score=random.random() * 100          # fake score
        )
        articles.append(a)
        db.session.add(a)
        progress_bar(len(articles) / len(entries) * 100)

    print('Creating additional articles...')

    # Collect all appropriate files.
    all_files = []
    for dir, subdir, files in os.walk('manage/core/data/organized_articles'):
        for file in files:
            filepath = os.path.join(dir, file)
            name, ext = os.path.splitext(filepath)
            if ext == '.txt':
                all_files.append((dir, name, filepath))

    # Create articles for appropriate files.
    for dir, name, filepath in all_files:
        category = dir.split('/')[-1]
        with open(filepath, 'r') as f:
            article = Article(
                text=f.read(),
                title=name.split('/')[-1],
                ext_url='http://fauxurl/',
                source=Source.query.get(1),          # fake source
                image=random.choice(sample_images),  # fake image
                score=random.random() * 100          # fake score
            )
        db.session.add(article)
        articles.append(article)
        progress_bar(len(articles) / len(all_files) * 100)

    db.session.commit()

    num_articles = Article.query.count()
    num_concepts = Concept.query.count()
    print('Seeded {0} articles.'.format(num_articles))
    print('Found {0} concepts.'.format(num_concepts))

    print('Clustering articles into events...')
    cluster.cluster(articles)
    num_events = Event.query.count()
    print('Created {0} event clusters.'.format(num_events))

    print('Clustering events into stories...')
    # TO DO
    num_stories = Story.query.count()
    print('Created {0} story clusters.'.format(num_stories))

    patcher.stop()

    print('\n\n==============================================')
    print('From {0} sources, seeded {1} articles, found {2} concepts, created {3} events and {4} stories.'.format(num_sources, num_articles, num_concepts, num_events, num_stories))
    print('==============================================\n\n')

    client = current_app.test_client()
    ctx = current_app.test_request_context()
    ctx.push()
    register_user(email='[email protected]', password='******')
    ctx.pop()

    print('\n\n==============================================')
    print('Created a test user, email is [email protected], password is password')
    print('==============================================\n\n')

    client = Client(
        #client_id=gen_salt(40),
        #client_secret=gen_salt(50),
        client_id='test',
        client_secret='test',
        _redirect_uris='http://localhost:5000/authorized',
        _default_scopes='userinfo',
        _allowed_grant_types='authorization_code refresh_token password',
        user_id=None,
        is_confidential=True  # make a confidential client.
    )
    db.session.add(client)
    db.session.commit()

    print('\n\n==============================================')
    print('Created a test client:\nclient id: {0}\nclient secret: {1}'.format(client.client_id, client.client_secret))
    print('==============================================\n\n')
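# `progress_bar` is called throughout these commands with a percentage (0-100) but is
# not defined in this excerpt. The sketch below is only an assumption about what such a
# helper might look like: it redraws a single line on stdout using a carriage return,
# which would be consistent with download() writing '\n' once the bar is finished.
# The name `progress_bar_sketch` is hypothetical; the project's real helper may differ.
import sys

def progress_bar_sketch(percent, width=40):
    # Clamp the percentage and convert it into a number of filled slots.
    percent = max(0, min(100, percent))
    filled = int(width * percent / 100)
    bar = '#' * filled + '-' * (width - filled)
    # Redraw the bar in place on the current stdout line.
    sys.stdout.write('\r[{0}] {1:.1f}%'.format(bar, percent))
    sys.stdout.flush()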
def download(url, save_path, filename=None, progress=False):
    """
    Downloads a file from the specified URL.
    Will resume an existing download if the target
    server supports it (responds with the "Accept-Ranges" header).

    Args:
        | url (str)       -- url of the file to download
        | save_path (str) -- path to the directory to save the file
        | filename (str)  -- name to save the file as (defaults to the last segment of the url)
        | progress (bool) -- output progress bar to stdout
    """
    # Strip a trailing slash (or backslash), if there is one.
    save_path = save_path.rstrip('/\\')

    if filename is None:
        filename = url.split('/').pop()
    file = '{0}/{1}'.format(save_path, filename)

    existing_size = 0

    # If the file already exists,
    # and there is not a newer file on the server...
    if os.path.exists(file) and not _expired(url, file):
        # Append to the existing file.
        outfile = open(file, 'ab')

        # Figure out how many bytes we've got.
        existing_size = outfile.tell()

        # Set up a request for only the remaining bytes.
        headers = {'Range': 'bytes={0}-'.format(existing_size)}
        req = request.Request(url, headers=headers)

    # Otherwise, create a new/overwrite the existing file.
    else:
        # Create/overwrite the file.
        outfile = open(file, 'wb')
        outfile.seek(0)

        # Vanilla request.
        req = request.Request(url)

    try:
        # Get the response.
        resp = request.urlopen(req)

        # Get the total size of the content.
        total_size = float(resp.headers['Content-Length'].strip())

        # Check if the file has already been fully downloaded.
        if total_size == existing_size:
            logger.info('File already downloaded.')
            return

        # Check that the server accepts ranges.
        # If it does not, the server will ignore the Range header,
        # and we have to start all over again.
        if existing_size > 0 and not resp.headers.get('Accept-Ranges', None):
            logger.info('Server does not allow resuming of downloads.')
            logger.info('Starting from the beginning! :D')
            outfile = open(file, 'wb')
            outfile.seek(0)
            # Reset the byte count so the progress calculation starts over.
            existing_size = 0

        if progress:
            progress_bar((existing_size / total_size) * 100)

        # Pull out the chunks!
        for chunk in iter(lambda: resp.read(CHUNK), b''):
            # Write the chunk to the file.
            outfile.write(chunk)

            # Update the existing size.
            existing_size += len(chunk)
            percent_complete = (existing_size / total_size) * 100

            # Show progress.
            if progress:
                progress_bar(percent_complete)

        if progress:
            sys.stdout.write('\n')

        # Return the download's filepath.
        return file

    except request.HTTPError as e:
        logger.error('HTTP Error: {0} {1}'.format(e.code, url))
    except request.URLError as e:
        logger.error('URL Error: {0} {1}'.format(e.reason, url))
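# Usage sketch for download(); the URL and target directory below are illustrative only.
# Note that CHUNK (the read size) and _expired() (the staleness check against the server)
# are module-level pieces of this file that are not shown in the excerpt above.
if __name__ == '__main__':
    filepath = download('https://example.com/dumps/pages.xml.gz', '/tmp/downloads', progress=True)
    if filepath:
        print('Saved to {0}'.format(filepath))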
def run(self, dumppath, use_patch):
    if use_patch:
        print('Patching out saving images to S3...')
        patcher = patch('argos.util.storage.save_from_url', autospec=True, return_value='https://i.imgur.com/Zf9mXlj.jpg')
        patcher.start()
    else:
        patcher = None

    print('Loading sources...')
    sources_map = {}
    with open(os.path.join(dumppath, 'sources.json'), 'r') as f:
        sources = json.load(f)
        for i, s in enumerate(sources):
            source = Source.query.filter(Source.name == s['name']).first()
            if not source:
                source = Source(name=s['name'])
                db.session.add(source)
            id = s['_id']['$oid']
            sources_map[id] = source
            progress_bar(i / (len(sources) - 1) * 100)
    db.session.commit()

    print('\nLoading feeds...')
    feeds_map = {}
    with open(os.path.join(dumppath, 'feeds.json'), 'r') as f:
        feeds = json.load(f)
        for i, f in enumerate(feeds):
            feed = Feed.query.filter(Feed.ext_url == f['ext_url']).first()
            if not feed:
                feed = Feed(ext_url=f['ext_url'])
                db.session.add(feed)
            feed.source = sources_map[f['source']['$oid']]
            id = f['_id']['$oid']
            feeds_map[id] = feed
            progress_bar(i / (len(feeds) - 1) * 100)
    db.session.commit()

    print('\nLoading articles...')
    with open(os.path.join(dumppath, 'articles.json'), 'r') as f:
        articles = json.load(f)
        for i, a in enumerate(articles):
            authors = []
            for author in a['authors']:
                authors.append(Author.find_or_create(name=author))

            existing = Article.query.filter(Article.ext_url == a['ext_url']).first()
            if not existing:
                feed = feeds_map[a['feed']['$oid']]
                article = Article(
                    ext_url=a['ext_url'],
                    source=feed.source,
                    feed=feed,
                    html=None,  # not saved by argos.corpora
                    text=fix_text_segment(a['text']),
                    authors=authors,
                    tags=[],
                    title=fix_text_segment(a['title']),
                    created_at=datetime.fromtimestamp(a['created_at']['$date'] / 1000),
                    updated_at=datetime.fromtimestamp(a['updated_at']['$date'] / 1000),
                    image=a['image'],
                    score=evaluator.score(a['ext_url'])
                )
                db.session.add(article)
            progress_bar(i / (len(articles) - 1) * 100)
    # Commit the loaded articles.
    db.session.commit()

    print('Loaded {0} sources, {1} feeds, and {2} articles.'.format(len(sources), len(feeds), len(articles)))
    print('Done!')

    if patcher is not None:
        patcher.stop()
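# run() reads a MongoDB-style JSON export (note the '$oid' and '$date' wrappers).
# Judging only from the keys accessed above, a single entry in articles.json would look
# roughly like the sketch below; every value here is illustrative, not taken from a real dump.
EXAMPLE_ARTICLE_ENTRY = {
    'ext_url':    'http://example.com/some-article',
    'feed':       {'$oid': '530b6f3a8ead0e1d3b000002'},   # key into feeds_map
    'authors':    ['Jane Doe'],
    'title':      'Example headline',
    'text':       'Full article text...',
    'image':      'http://example.com/image.jpg',
    'created_at': {'$date': 1393027200000},               # milliseconds since the epoch
    'updated_at': {'$date': 1393027200000},
}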