Esempio n. 1
0
def load_opensources():
    """Rebuild the 'opensources' collection from the local sources.json file.

    Calls ``mongo_driver.kill('opensources')``, parses
    ``./opensources/sources/sources.json``, passes every ``(key, value)``
    item of the parsed object to ``transform_open_format`` and finally
    asserts that ``mongo_driver.check_for_dups('opensources')`` is truthy.

    Raises:
        AssertionError: if ``check_for_dups`` returns a falsy value.
    """
    mongo_driver.kill('opensources')
    # Context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open('./opensources/sources/sources.json') as sources_file:
        opensources = json.load(sources_file)
    # transform_open_format's return values are discarded; it is presumably
    # called for its side effects (verify against its definition).
    for source_item in opensources.items():
        transform_open_format(source_item)
    assert mongo_driver.check_for_dups('opensources')
""" This cleans all the scraped articles  """

import json

from helpers import LemmaTokenizer
import mongo_driver


def lemma_wrapper(dict_):
    """Replace the 'text' entry of ``dict_`` with an 'article' entry.

    The new 'article' value is ``LemmaTokenizer`` applied to the old 'text'
    value. The mapping is modified in place and also returned.
    """
    raw_text = dict_['text']
    dict_['article'] = LemmaTokenizer(raw_text)
    # Drop the raw text only after the tokenized value has been stored.
    del dict_['text']
    return dict_


def flags_articles_gen():
    """Yield every item produced by ``mongo_driver.get_all('articles')``."""
    # Plain delegation: no index is needed, so enumerate() is not used.
    yield from mongo_driver.get_all('articles')


if __name__ == '__main__':
    # Reset state before re-cleaning (exact semantics of kill/drop_articles
    # live in mongo_driver — presumably they empty the collections).
    mongo_driver.kill('articles_cleaned')
    mongo_driver.drop_articles()

    # Lazily clean each article so documents are processed one at a time.
    cleaner_gen = (lemma_wrapper(doc) for doc in flags_articles_gen())
    for i, cleaned_article in enumerate(cleaner_gen):
        mongo_driver.insert('articles_cleaned', cleaned_article)
        # Progress output: print the index on every 100th document.
        if not i % 100:
            print(i)

    # Query the count before opening the output file, then write it inside a
    # context manager so the file is flushed and closed reliably.
    n_articles = mongo_driver.db['articles_cleaned'].count()
    with open('n_articles.json', 'w') as count_file:
        json.dump(n_articles, count_file)
Esempio n. 3
0
                if item:
                    yield item

        return list(replacer())

    if 'Truthiness' in data_ and data_['Truthiness'] is not None:
        data_['Category'] += ', ' + data_['Truthiness']
        data_.pop('Truthiness')
    data_['Category'] = string_clean(data_['Category'])

    data_['url'] = url
    return data_


if __name__ == '__main__':
    mongo_driver.kill('all_sources')

    os_data = get_clean_urls('opensources')
    mb_data = get_clean_urls('media_bias')

    os_urls = set(os_data.keys())
    mb_urls = set(mb_data.keys())

    shared_urls = os_urls & mb_urls

    stats = {
        'individual': [len(os_urls), len(mb_urls)],
        'total': [len(os_urls) + len(mb_urls)],
        'not shared': len(os_urls ^ mb_urls),
        'shared': len(shared_urls),
        'total': len(os_urls | mb_urls),
def load_opensources():
    """Rebuild the 'opensources' collection from the local sources.json file.

    Calls ``mongo_driver.kill('opensources')``, parses
    ``./opensources/sources/sources.json``, passes every ``(key, value)``
    item of the parsed object to ``transform_open_format`` and finally
    asserts that ``mongo_driver.check_for_dups('opensources')`` is truthy.

    Raises:
        AssertionError: if ``check_for_dups`` returns a falsy value.
    """
    mongo_driver.kill('opensources')
    # Context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open('./opensources/sources/sources.json') as sources_file:
        opensources = json.load(sources_file)
    # transform_open_format's return values are discarded; it is presumably
    # called for its side effects (verify against its definition).
    for source_item in opensources.items():
        transform_open_format(source_item)
    assert mongo_driver.check_for_dups('opensources')
                if item:
                    yield item

        return list(replacer())

    if 'Truthiness' in data_ and data_['Truthiness'] is not None:
        data_['Category'] += ', ' + data_['Truthiness']
        data_.pop('Truthiness')
    data_['Category'] = string_clean(data_['Category'])

    data_['url'] = url
    return data_


if __name__ == '__main__':
    mongo_driver.kill('all_sources')

    os_data = get_clean_urls('opensources')
    mb_data = get_clean_urls('media_bias')

    os_urls = set(os_data.keys())
    mb_urls = set(mb_data.keys())

    shared_urls = os_urls & mb_urls

    stats = {
        'individual': [len(os_urls), len(mb_urls)],
        'total': [len(os_urls) + len(mb_urls)],
        'not shared': len(os_urls ^ mb_urls),
        'shared': len(shared_urls),
        'total': len(os_urls | mb_urls),
Esempio n. 6
0
""" This cleans all the scraped articles  """

from helpers import LemmaTokenizer
import mongo_driver
import json


def lemma_wrapper(dict_):
    """Replace the 'text' entry of ``dict_`` with an 'article' entry.

    The new 'article' value is ``LemmaTokenizer`` applied to the old 'text'
    value. The mapping is modified in place and also returned.
    """
    raw_text = dict_['text']
    dict_['article'] = LemmaTokenizer(raw_text)
    # Drop the raw text only after the tokenized value has been stored.
    del dict_['text']
    return dict_


def flags_articles_gen():
    """Yield every item produced by ``mongo_driver.get_all('articles')``."""
    # Plain delegation: no index is needed, so enumerate() is not used.
    yield from mongo_driver.get_all('articles')


if __name__ == '__main__':
    # Reset state before re-cleaning (exact semantics of kill/drop_articles
    # live in mongo_driver — presumably they empty the collections).
    mongo_driver.kill('articles_cleaned')
    mongo_driver.drop_articles()

    # Lazily clean each article so documents are processed one at a time.
    cleaner_gen = (lemma_wrapper(doc) for doc in flags_articles_gen())
    for i, cleaned_article in enumerate(cleaner_gen):
        mongo_driver.insert('articles_cleaned', cleaned_article)
        # Progress output: print the index on every 100th document.
        if not i % 100:
            print(i)

    # Query the count before opening the output file, then write it inside a
    # context manager so the file is flushed and closed reliably.
    n_articles = mongo_driver.db['articles_cleaned'].count()
    with open('n_articles.json', 'w') as count_file:
        json.dump(n_articles, count_file)
def article_feeder():
    """Yield articles one by one across all flag groups.

    Iterates over the result of ``articles_by_flag(unique_flags())`` and
    yields each element of every group in order.
    """
    for flag_articles in articles_by_flag(unique_flags()):
        yield from flag_articles


def main():
    """Insert one 'articles_by_flag' document per article from article_feeder().

    Each inserted document has an 'article' field (the article's title, a
    space, then its text with newlines replaced by spaces) and a 'flag'
    field taken from ``curr_flag.val``.
    """
    for article in article_feeder():
        mongo_driver.insert(
            'articles_by_flag',
            {
                'article':
                article['title'] + ' ' + article['text'].replace('\n', ' '),
                # NOTE(review): curr_flag is not assigned anywhere in this
                # function; presumably a module-level name set elsewhere —
                # confirm, otherwise this raises NameError.
                'flag':
                curr_flag.val,
            })


if __name__ == '__main__':
    # Script entry point: reset 'articles_by_flag' via mongo_driver.kill
    # (presumably empties the collection — confirm in mongo_driver), then
    # repopulate it with main().
    mongo_driver.kill('articles_by_flag')
    main()

    # Debugging aid, left disabled:
    # mongo_driver.print_n('articles_by_flag', 10)