Ejemplo n.º 1
0
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Toxine project
#
# Copyright (C) 2019-present by Sergei Ternovykh
# License: BSD, see LICENSE for details
"""
Example: Tokenize Wikipedia and make its articles look like the output of
speech recognition software. Save the result as CoNLL-U.
"""
from corpuscula import Conllu
from corpuscula.wikipedia_utils import download_wikipedia
from toxine.wikipedia_utils import TokenizedWikipedia

# Step 1: fetch the Wikipedia dump. overwrite=False is passed through as-is;
# presumably it keeps an already downloaded dump -- confirm in corpuscula.
download_wikipedia(overwrite=False)

# Step 2: obtain the tokenized articles and store them in CoNLL-U format.
# adjust_for_speech=True presumably performs the "speech recognition output"
# conversion mentioned in the module docstring -- confirm in Conllu.save.
tokenized_articles = TokenizedWikipedia().articles()
Conllu.save(
    tokenized_articles,
    'wiki_speech.conllu',
    fix=True,
    adjust_for_speech=True,
    log_file=None,
)
Ejemplo n.º 2
0
# Presumably the maximum number of words allowed in one text chunk (inferred
# from the name; the constant is not used in this part of the script).
MAX_CHUNK_WORDS = 200

# Seed the `random` module only when SEED (defined outside this excerpt) is
# truthy. Note that a SEED of 0 or None therefore skips seeding and leaves
# the generator with its default initialisation.
if SEED:
    random.seed(SEED)
'''===========================================================================
Headers collection
==========================================================================='''
# Cache file for the article sample. Format: the first line holds the total
# number of articles, every following line holds one sampled article index.
ARTICLE_NOS_FN = os.path.join(utils.PAGES_DIR, 'article_nos')
if os.path.isfile(ARTICLE_NOS_FN):
    # Reuse the previously drawn sample so repeated runs work on the same
    # set of articles.
    with open(ARTICLE_NOS_FN, 'rt') as f:
        # Skip blank lines: the writer below emits just "<total>\n" when the
        # sample is empty, and a hand-edited file may end with a newline.
        # Either case used to crash on int('').
        article_nos = [int(line) for line in f if line.strip()]
    if not article_nos:
        raise ValueError('{} is empty: expected the total number of '
                         'articles on the first line'
                         .format(ARTICLE_NOS_FN))
    num_articles_total, article_nos = article_nos[0], article_nos[1:]

else:
    wikipedia_utils.download_wikipedia(lang='RU',
                                       root_dir=None,
                                       overwrite=False)
    # Count the titles by exhausting the iterator. Unlike the
    # `for i, _ in enumerate(...): pass` idiom, this also works when there
    # are no titles at all (the loop variable would be left unbound).
    num_articles_total = sum(
        1 for _ in wikipedia_utils.Wikipedia().titles()
    )
    # Kept for code further down that may still refer to the last index.
    # Value noted in the original source for reference:
    # max_article_no = 3804415  # articles - templates: 3783701
    max_article_no = num_articles_total - 1
    article_nos = list(range(num_articles_total))
    # shuffle + slice (rather than random.sample) is kept on purpose: it
    # yields the same sample as before for a given SEED.
    random.shuffle(article_nos)
    article_nos = article_nos[:int(utils.TEXTS_FOR_SOURCE *
                                   1.1)]  # spare for templates
    with open(ARTICLE_NOS_FN, 'wt') as f:
        print(num_articles_total, file=f)
        f.write('\n'.join(str(x) for x in article_nos))
'''===========================================================================
Texts collection
==========================================================================='''