Ejemplo n.º 1
0
def get_clean_urls(table_name):
    """Map a normalised URL to each record of ``table_name`` that has a 'url'.

    Every record returned by ``mongo_driver.get_all(table_name)`` that
    contains a ``'url'`` key contributes one entry. The key is the record's
    URL lower-cased, with every occurrence of ``'http://'``, ``'https://'``,
    ``'www.'`` and the space character removed, and with a single trailing
    ``'/'`` dropped. The value is the record itself. When several records
    normalise to the same key, the last one wins.
    """
    records = list(mongo_driver.get_all(table_name))

    def normalise(url):
        # Remove the fragments, in this order, wherever they occur in the
        # lower-cased URL (not only at its start).
        cleaned = url.lower()
        for fragment in ('http://', 'https://', 'www.', ' '):
            cleaned = cleaned.replace(fragment, '')
        # At most one trailing slash is stripped.
        return cleaned[:-1] if cleaned.endswith('/') else cleaned

    return {
        normalise(record['url']): record
        for record in records
        if 'url' in record
    }
def get_clean_urls(table_name):
    """Return a dict of cleaned URL -> record for the given table.

    Records come from ``mongo_driver.get_all(table_name)``; those without a
    ``'url'`` key are skipped. A URL is cleaned by lower-casing it, removing
    every ``'http://'``, ``'https://'``, ``'www.'`` and space it contains,
    and cutting off one trailing ``'/'`` if present. A later record
    overwrites an earlier one that cleans to the same URL.
    """
    cleaned_urls = {}
    for entry in list(mongo_driver.get_all(table_name)):
        if 'url' not in entry:
            continue

        key = entry['url'].lower()
        for token in ('http://', 'https://', 'www.', ' '):
            key = key.replace(token, '')
        if key.endswith('/'):
            # Drop exactly one trailing slash.
            key = key[:-1]

        cleaned_urls[key] = entry
    return cleaned_urls
def flags_articles_gen():
    """Yield, one by one, every record of ``mongo_driver.get_all('articles')``."""
    for record in mongo_driver.get_all('articles'):
        yield record
Ejemplo n.º 4
0
    def corpus_gen():
        """Yield the 'articles_cleaned' records whose 'article' value is truthy."""
        for record in mongo_driver.get_all('articles_cleaned'):
            # Skip records with an empty / falsy 'article' field.
            if not record['article']:
                continue
            yield record
Ejemplo n.º 5
0
    def corpus_gen():
        """Generate records from 'articles_cleaned' that have a truthy 'article'.

        Records are read from ``mongo_driver.get_all('articles_cleaned')`` and
        passed through unchanged; those whose ``'article'`` value is falsy are
        left out.
        """
        cleaned_articles = mongo_driver.get_all('articles_cleaned')
        for doc in cleaned_articles:
            has_text = doc['article']
            if has_text:
                yield doc
Ejemplo n.º 6
0
def threadpool():
    """Run ``go`` over the module-level ``batch`` on a pool made by ``Pool(30)``.

    The results of ``pool.imap_unordered(go, batch)`` are drained one at a
    time and discarded; each attempt waits at most 10 seconds:

    * a timeout prints ``'timeout!'`` and the wait is retried;
    * an ``AttributeError`` surfacing from ``next()`` is printed and draining
      continues;
    * an ``EOFError`` surfacing from ``next()`` is ignored silently and
      draining continues;
    * ``StopIteration`` means every result was consumed: ``'batch finished.'``
      is printed and the function returns.

    Any other exception propagates to the caller. The pool is closed on
    every exit path, so an unexpected exception no longer leaves the pool
    open (the original only closed it after a fully drained batch).

    ``Pool``, ``go`` and ``batch`` are expected to exist at module level;
    they are not defined in this block.
    """
    pool = Pool(30)
    try:
        results = pool.imap_unordered(go, batch)
        while True:
            try:
                results.next(timeout=10)
            except multiprocessing.context.TimeoutError:
                # No result became available within 10 seconds; keep waiting.
                print('timeout!')
            except AttributeError as e:
                print(e)
            except EOFError:
                # Swallowed without output, as before; move on to the next
                # result.
                pass
            except StopIteration:
                print('batch finished.')
                break
    finally:
        # close() only stops the pool from accepting new work; it does not
        # interrupt tasks that are still running.
        pool.close()


if __name__ == '__main__':
    # iter() is a no-op for an iterator and guarantees that successive
    # islice() calls continue where the previous one stopped instead of
    # restarting from the beginning of a re-iterable container.
    news_sources = iter(mongo_driver.get_all('all_sources'))
    while True:
        # islice() is lazy and never raises StopIteration itself, so the
        # original `except StopIteration` could not fire and the loop kept
        # spinning on empty batches once the sources ran out. Materialise
        # the next (up to) 90 sources and stop explicitly when none remain.
        # `batch` is the module-level name that threadpool() reads.
        batch = list(itertools.islice(news_sources, 90))
        if not batch:
            print('finished.')
            exit()
        threadpool()
Ejemplo n.º 7
0
def flags_articles_gen():
    """Generate the records of the 'articles' table, unchanged and in order.

    The records are obtained from ``mongo_driver.get_all('articles')`` when
    the generator is first advanced.
    """
    articles = mongo_driver.get_all('articles')
    for article in articles:
        yield article
Ejemplo n.º 8
0
    def corpus_gen():
        """Yield each 'articles_cleaned' record that has a truthy 'article'.

        Records are read from ``mongo_driver.get_all('articles_cleaned')``.
        """
        for item in mongo_driver.get_all('articles_cleaned'):
            # NOTE: an additional condition excluding records whose 'flag' is
            # 'satire' was commented out in the original and is not applied.
            if not item['article']:
                continue
            yield item