def transform_open_format(x):
    '''
    Original format:
    (u'NutritionalAnarchy.com',
     {u'2nd type': u'',
      u'3rd type': u'',
      u'Source Notes (things to know?)': u'',
      u'type': u'unreliable'})
    '''
    urls = mongo_driver.get_url('opensources')
    if x[0] in urls:  # already stored; nothing to do
        return

    template = {
        'Category': 'conspiracy',
        'Reference': 'http://mediabiasfactcheck.com/zero-hedge/',
        'Truthiness': 'MIXED',
        'url': 'http://www.zerohedge.com/'
    }
    out_dict = dict.fromkeys(template)  # template keys, all values reset to None
    out_dict['url'] = x[0]
    # collapse every non-empty '*type' field into a single comma-separated string
    out_dict['Category'] = ', '.join(
        set(x[1][key] for key in x[1] if 'type' in key and x[1][key]))
    out_dict['Reference'] = 'http://www.opensources.co'
    mongo_driver.insert('opensources', out_dict)
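# For the docstring's sample tuple, the transform above inserts roughly the
# following document (assuming the url is not already in 'opensources'):
#
#   {'Category': 'unreliable',            # the only non-empty 'type' field
#    'Reference': 'http://www.opensources.co',
#    'Truthiness': None,                  # never filled for opensources rows
#    'url': u'NutritionalAnarchy.com'}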
def get_articles(article):
    # nested helper: `self` is expected to come from the enclosing scraper method
    article_data = {}
    article.url = article.url.strip()
    try:
        article.download()
        article.parse()
    except Exception as e:
        print(e)
        return
    if article.title:
        # try:
        #     article.nlp()
        # except:
        #     article_data['keywords'] = article.keywords
        article_data['title'] = article.title
        article_data['text'] = article.text
        article_data['flags'] = self.categories
        article_data['source'] = self.url
        article_data['url'] = article.url
        print(self.categories, '\t', article_data['source'], article_data['title'])
        mongo_driver.insert('articles', article_data)
def transform_open_format(x):
    '''
    Original format:
    (u'NutritionalAnarchy.com',
     {u'2nd type': u'',
      u'3rd type': u'',
      u'Source Notes (things to know?)': u'',
      u'type': u'unreliable'})
    '''
    urls = mongo_driver.get_url('opensources')
    if x[0] in urls:
        return

    template = {
        'Category': 'conspiracy',
        'Reference': 'http://mediabiasfactcheck.com/conspiracy-times/',
        'Truthiness': 'MIXED',
        'url': 'http://www.conspiracy-times.com/'
    }
    out_dict = dict.fromkeys(template)
    out_dict['url'] = x[0]
    out_dict['Category'] = ', '.join(
        set(x[1][key] for key in x[1] if 'type' in key and x[1][key]))
    out_dict['Reference'] = 'http://www.opensources.co'
    mongo_driver.insert('opensources', out_dict)
def export_results(self):
    logger.debug("Exporting results")
    self.results.update({'Reference': self.page, 'Category': accumulator.cat})
    logger.debug(self.results)
    logger.debug("Saving results to mongo")
    mongo_driver.insert('media_bias', self.results)
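# mongo_driver, used throughout these snippets, is the project's thin pymongo
# wrapper and is not shown here. A minimal sketch of the calls the snippets
# rely on, assuming one shared database (the 'news' name is illustrative):

from pymongo import MongoClient

client = MongoClient()
db = client['news']  # hypothetical database name


def insert(collection, doc):
    """Insert a single document into the named collection."""
    db[collection].insert_one(doc)


def get_all(collection):
    """Return a cursor over every document in the collection."""
    return db[collection].find()


def kill(collection):
    """Drop the collection entirely."""
    db[collection].drop()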
def merge(url):
    # pull the corrected record for this url from each source collection
    os_ = addDict(correct(url, 'os'))
    mb_ = addDict(correct(url, 'mb'))
    # drop the fields that would collide on merge
    for key in ('_id', 'url'):
        os_.pop(key)
        mb_.pop(key)
    merged_ = mb_ + os_  # addDict defines '+' as a dict merge
    merged_['url'] = url
    mongo_driver.insert('all_sources', merged_)
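# addDict and correct are project helpers that are not shown. A minimal sketch
# of addDict, assuming '+' is meant as a plain key-value merge in which the
# right-hand operand wins (so in `mb_ + os_` the opensources fields would take
# precedence on collisions):

class addDict(dict):
    def __add__(self, other):
        merged = addDict(self)
        merged.update(other)  # right operand overwrites on shared keys
        return merged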
def export_results(self):
    self.results.update({'Reference': self.page, 'Category': accumulator.cat})
    print(self.results)
    mongo_driver.insert('media_bias', self.results)
def merge(url):
    logger.debug("Merging sources for url %s" % url)
    os_ = addDict(correct(url, 'os'))
    mb_ = addDict(correct(url, 'mb'))
    for key in ('_id', 'url'):
        os_.pop(key)
        mb_.pop(key)
    merged_ = mb_ + os_
    merged_['url'] = url
    mongo_driver.insert('all_sources', merged_)
def build(self):
    self.newspaper_obj = newspaper.build(self.url,
                                         config=newspaper_config,
                                         request_timeout=3,
                                         number_threads=2)
    self.categories = self.source['Category']
    self.build_metadata()
    logger.info(f"found {self.newspaper_obj.size()} articles for {self.url}")
    assert self.newspaper_obj.size() == len(self.newspaper_obj.articles)
    self.get_articles_controller()
    mongo_driver.insert('source_logs', self.meta)
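# newspaper_config is defined elsewhere; a plausible setup using newspaper3k's
# documented Config object (the specific values are illustrative only):

from newspaper import Config

newspaper_config = Config()
newspaper_config.memoize_articles = False  # rescan sources from scratch each run
newspaper_config.fetch_images = False      # the pipeline only stores text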
def __init__(self, source, n_articles=45):
    self._data = source
    self.url = self.test_https(source['url'].split('/')[0])
    self.categories = source['Category']
    self.n_articles = n_articles
    self.get_links()
    self.build_meta()
    print(self.url, self.categories)
    self.get_articles_controller()
    if self.source_obj.size() > 0:
        mongo_driver.insert('source_logs', self.meta)
def main():
    for i, article in enumerate(article_feeder()):
        mongo_driver.insert(
            'articles_by_flag',
            {
                'article': article['title'] + ' ' + article['text'].replace('\n', ' '),
                # {'article': article['keywords'],
                'flag': curr_flag.val
            })
def build(self, source):
    self._data = source
    self.categories = source['Category']
    self.url = self.test_https(source['url'].split('/')[0])
    if self.url is False:  # test_https could not reach the domain
        return
    self.get_links()
    self.build_meta()
    print(self.url)
    self.get_articles_controller()
    if self.source_obj.size() > 0:
        mongo_driver.insert('source_logs', self.meta)
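# test_https is another unshown project helper; judging from its use above it
# probes the bare domain and returns a usable base url, or False when the site
# is unreachable. A hypothetical sketch (the project calls it as a method on
# the scraper class):

import requests


def test_https(domain):
    # prefer https, fall back to http, give up with False
    for scheme in ('https://', 'http://'):
        try:
            requests.head(scheme + domain, timeout=3)
            return scheme + domain
        except requests.RequestException:
            continue
    return False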
def get_articles(article):
    # nested helper: `self` is expected to come from the enclosing scraper method
    article.download()
    article.parse()
    article_data = {}
    article.url = article.url.strip()
    # keep only substantial English-language articles
    if len(article.text.split()) > 200 and detect(article.text) == 'en':
        article_data['text'] = article.text
        article_data['title'] = article.title
        article_data['flags'] = self.categories
        article_data['source'] = self.url
        article_data['url'] = article.url
        logger.info(
            f"{self.categories} {article_data['source']} {article_data['title']}")
        mongo_driver.insert('articles', article_data)
    else:
        logger.info(f"skipped article {article.title}: too short or not English")
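# detect() is presumably langdetect's language guesser, so the filter above
# keeps an article only when it looks English:

from langdetect import detect

print(detect('This is a perfectly ordinary English sentence.'))  # -> 'en'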
""" This cleans all the scraped articles """ import json from helpers import LemmaTokenizer import mongo_driver def lemma_wrapper(dict_): dict_['article'] = LemmaTokenizer(dict_['text']) dict_.pop('text') return dict_ def flags_articles_gen(): for i, _ in enumerate(mongo_driver.get_all('articles')): yield _ if __name__ == '__main__': mongo_driver.kill('articles_cleaned') mongo_driver.drop_articles() cleaner_gen = (lemma_wrapper(doc) for doc in flags_articles_gen()) for i, cleaned_article in enumerate(cleaner_gen): mongo_driver.insert('articles_cleaned', cleaned_article) if not i % 100: print(i) json.dump(mongo_driver.db['articles_cleaned'].count(), open('n_articles.json', 'w'))
from helpers import LemmaTokenizer
import mongo_driver


def lemma_wrapper(dict_):
    dict_['article'] = LemmaTokenizer(dict_['article'])
    return dict_


def flags_articles_gen():
    for i, _ in enumerate(mongo_driver.get_all('articles_by_flag')):
        yield _


if __name__ == '__main__':
    mongo_driver.kill('articles_cleaned')
    mongo_driver.drop_articles()
    # lemmatize every flagged article and insert it into 'articles_cleaned'
    for doc in flags_articles_gen():
        mongo_driver.insert('articles_cleaned', lemma_wrapper(doc))
""" This cleans all the scraped articles """ from helpers import LemmaTokenizer import mongo_driver import json def lemma_wrapper(dict_): dict_['article'] = LemmaTokenizer(dict_['text']) dict_.pop('text') return dict_ def flags_articles_gen(): for i, _ in enumerate(mongo_driver.get_all('articles')): yield _ if __name__ == '__main__': mongo_driver.kill('articles_cleaned') mongo_driver.drop_articles() cleaner_gen = (lemma_wrapper(doc) for doc in flags_articles_gen()) for i, cleaned_article in enumerate(cleaner_gen): mongo_driver.insert('articles_cleaned', cleaned_article) if not i % 100: print(i) json.dump(mongo_driver.db['articles_cleaned'].count(), open('n_articles.json', 'w'))
if __name__ == '__main__':
    mongo_driver.kill('all_sources')
    os_data = get_clean_urls('opensources')
    mb_data = get_clean_urls('media_bias')
    os_urls = set(os_data.keys())
    mb_urls = set(mb_data.keys())
    shared_urls = os_urls & mb_urls
    stats = {
        'individual': [len(os_urls), len(mb_urls)],
        'combined': len(os_urls) + len(mb_urls),  # counts shared urls twice
        'not shared': len(os_urls ^ mb_urls),
        'shared': len(shared_urls),
        'total': len(os_urls | mb_urls),
        'opensource only': len(os_urls - mb_urls),
        'mediabias only': len(mb_urls - os_urls)
    }
    print(stats)
    # urls unique to one list can be inserted directly...
    for url in os_urls - mb_urls:
        mongo_driver.insert('all_sources', correct(url, 'os'))
    for url in mb_urls - os_urls:
        mongo_driver.insert('all_sources', correct(url, 'mb'))
    # ...while urls on both lists are merged first
    list(map(merge, shared_urls))

    x = sorted(mongo_driver.db['all_sources'].find().distinct('Category'))
    pprint(x)
    print(len(x))
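# The stats keys are plain set algebra; on two toy url sets:

os_urls = {'a.com', 'b.com'}
mb_urls = {'b.com', 'c.com'}
print(os_urls & mb_urls)  # {'b.com'}                    -> 'shared'
print(os_urls ^ mb_urls)  # {'a.com', 'c.com'}           -> 'not shared'
print(os_urls | mb_urls)  # {'a.com', 'b.com', 'c.com'}  -> 'total'
print(os_urls - mb_urls)  # {'a.com'}                    -> 'opensource only'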