def load_opensources():
    # rebuild the 'opensources' collection from the bundled source list
    mongo_driver.kill('opensources')
    opensources = json.load(open('./opensources/sources/sources.json'))
    list(map(transform_open_format, opensources.items()))
    # sanity check: the collection should contain no duplicate sources
    assert mongo_driver.check_for_dups('opensources')
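# mongo_driver itself isn't shown in this section. A minimal sketch of what
# the helpers used above might look like on top of pymongo; the database name
# and the check_for_dups semantics are assumptions, not the project's code:
import pymongo

client = pymongo.MongoClient()
db = client['news']  # hypothetical database name


def kill(collection):
    # drop the collection so each load starts from a clean slate
    db[collection].drop()


def insert(collection, doc):
    db[collection].insert_one(doc)


def get_all(collection):
    # stream every document in the collection
    yield from db[collection].find()


def check_for_dups(collection):
    # truthy when every 'url' is unique, matching the assert above
    return len(db[collection].distinct('url')) == db[collection].count_documents({})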
""" This cleans all the scraped articles """ import json from helpers import LemmaTokenizer import mongo_driver def lemma_wrapper(dict_): dict_['article'] = LemmaTokenizer(dict_['text']) dict_.pop('text') return dict_ def flags_articles_gen(): for i, _ in enumerate(mongo_driver.get_all('articles')): yield _ if __name__ == '__main__': mongo_driver.kill('articles_cleaned') mongo_driver.drop_articles() cleaner_gen = (lemma_wrapper(doc) for doc in flags_articles_gen()) for i, cleaned_article in enumerate(cleaner_gen): mongo_driver.insert('articles_cleaned', cleaned_article) if not i % 100: print(i) json.dump(mongo_driver.db['articles_cleaned'].count(), open('n_articles.json', 'w'))
                if item:
                    yield item

        return list(replacer())

    # fold the 'Truthiness' flag into the Category string, if present
    if 'Truthiness' in data_ and data_['Truthiness'] is not None:
        data_['Category'] += ', ' + data_['Truthiness']
        data_.pop('Truthiness')
    data_['Category'] = string_clean(data_['Category'])
    data_['url'] = url
    return data_


if __name__ == '__main__':
    mongo_driver.kill('all_sources')

    os_data = get_clean_urls('opensources')
    mb_data = get_clean_urls('media_bias')

    os_urls = set(os_data.keys())
    mb_urls = set(mb_data.keys())
    shared_urls = os_urls & mb_urls

    stats = {
        'individual': [len(os_urls), len(mb_urls)],
        'combined': len(os_urls) + len(mb_urls),  # sum counts shared sources twice
        'not shared': len(os_urls ^ mb_urls),
        'shared': len(shared_urls),
        'total': len(os_urls | mb_urls),  # unique sources across both datasets
    }
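# A quick toy check of the set algebra behind stats: the symmetric difference
# ('not shared') plus the intersection ('shared') partitions the union ('total').
os_urls = {'a.com', 'b.com', 'c.com'}
mb_urls = {'b.com', 'c.com', 'd.com'}

assert len(os_urls & mb_urls) == 2   # shared
assert len(os_urls ^ mb_urls) == 2   # not shared
assert len(os_urls | mb_urls) == 4   # total unique sources
assert len(os_urls & mb_urls) + len(os_urls ^ mb_urls) == len(os_urls | mb_urls)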
""" This cleans all the scraped articles """ from helpers import LemmaTokenizer import mongo_driver import json def lemma_wrapper(dict_): dict_['article'] = LemmaTokenizer(dict_['text']) dict_.pop('text') return dict_ def flags_articles_gen(): for i, _ in enumerate(mongo_driver.get_all('articles')): yield _ if __name__ == '__main__': mongo_driver.kill('articles_cleaned') mongo_driver.drop_articles() cleaner_gen = (lemma_wrapper(doc) for doc in flags_articles_gen()) for i, cleaned_article in enumerate(cleaner_gen): mongo_driver.insert('articles_cleaned', cleaned_article) if not i % 100: print(i) json.dump(mongo_driver.db['articles_cleaned'].count(), open('n_articles.json', 'w'))
import mongo_driver

# articles_by_flag and unique_flags group the scraped articles by their
# source flag; they are assumed to be importable from the pipeline.


def article_feeder():
    flag_corpus = articles_by_flag(unique_flags())

    def feed():
        # chain the per-flag article groups into one stream
        for flag in flag_corpus:
            yield from flag

    yield from feed()


def main():
    for article in article_feeder():
        mongo_driver.insert(
            'articles_by_flag',
            {
                'article': article['title'] + ' ' + article['text'].replace('\n', ' '),
                # {'article': article['keywords'],
                'flag': article['flag'],  # assumes each article carries its flag label
            })


if __name__ == '__main__':
    mongo_driver.kill('articles_by_flag')
    main()
    # mongo_driver.print_n('articles_by_flag', 10)
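# unique_flags and articles_by_flag are defined elsewhere in the pipeline.
# One plausible sketch on top of mongo_driver; the 'flag' field name is an
# assumption:
def unique_flags():
    # distinct flag labels present in the scraped 'articles' collection
    return mongo_driver.db['articles'].distinct('flag')


def articles_by_flag(flags):
    # one cursor of matching articles per flag label
    return (mongo_driver.db['articles'].find({'flag': flag}) for flag in flags)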