#!/usr/bin/python
# -*- coding: utf-8 -*-
# Toxine project
#
# Copyright (C) 2019-present by Sergei Ternovykh
# License: BSD, see LICENSE for details
"""
Example: Tokenize Wikipedia and make its articles look like the output of
some speech recognition software. Save the result as CoNLL-U.
"""
from corpuscula import Conllu
from corpuscula.wikipedia_utils import download_wikipedia
from toxine.wikipedia_utils import TokenizedWikipedia

download_wikipedia(overwrite=False)
Conllu.save(TokenizedWikipedia().articles(), 'wiki_speech.conllu',
            fix=True, adjust_for_speech=True, log_file=None)
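# Optional sanity check (not part of the original example): assuming
# Corpuscula's Conllu class also exposes a load() counterpart to save(),
# the written corpus can be read back, e.g. to count the saved sentences.
print(sum(1 for _ in Conllu.load('wiki_speech.conllu')))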
# Assumed context for this fragment: the imports below plus a local config
# module `utils` providing SEED, PAGES_DIR and TEXTS_FOR_SOURCE.
import os
import random

from corpuscula import wikipedia_utils

MAX_CHUNK_WORDS = 200

if SEED:
    random.seed(SEED)

'''===========================================================================
Headers collection
==========================================================================='''
ARTICLE_NOS_FN = os.path.join(utils.PAGES_DIR, 'article_nos')

if os.path.isfile(ARTICLE_NOS_FN):
    # Reuse a previously saved selection of article numbers.
    with open(ARTICLE_NOS_FN, 'rt') as f:
        article_nos = [int(x) for x in f.read().split('\n')]
    num_articles_total, article_nos = article_nos[0], article_nos[1:]
else:
    # Download the Russian Wikipedia dump and count its articles.
    wikipedia_utils.download_wikipedia(lang='RU', root_dir=None,
                                       overwrite=False)
    for max_article_no, _ in enumerate(wikipedia_utils.Wikipedia().titles()):
        pass
    #max_article_no = 3804415  # articles - templates: 3783701
    num_articles_total = max_article_no + 1
    # Pick a random sample of article numbers, slightly larger than needed,
    # to leave a reserve for templates that will be filtered out later.
    article_nos = list(range(num_articles_total))
    random.shuffle(article_nos)
    article_nos = article_nos[:int(utils.TEXTS_FOR_SOURCE * 1.1)]
                                                        # spare for templates
    # Persist the selection: the first line holds the total article count,
    # the remaining lines hold the sampled article numbers.
    with open(ARTICLE_NOS_FN, 'wt') as f:
        print(num_articles_total, file=f)
        f.write('\n'.join(str(x) for x in article_nos))

'''===========================================================================
Texts collection
==========================================================================='''
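# A possible shape for the "Texts collection" step (a sketch, not the original
# continuation of this script): keep only the sampled articles and split each
# article text into chunks of at most MAX_CHUNK_WORDS words. The call
# wikipedia_utils.Wikipedia().articles(), the assumption that its enumeration
# order matches the article numbers sampled above, and the structure of each
# yielded `article` are assumptions, not verified against corpuscula's API.

def chunk_words(text, max_words=MAX_CHUNK_WORDS):
    """Split *text* into chunks of at most *max_words* whitespace-separated
    words."""
    words = text.split()
    return [' '.join(words[i:i + max_words])
                for i in range(0, len(words), max_words)]

selected_nos = set(article_nos)
for article_no, article in enumerate(wikipedia_utils.Wikipedia().articles()):
    if article_no not in selected_nos:
        continue
    # `article` is assumed here to be (or to stringify to) plain article text.
    for chunk in chunk_words(str(article)):
        pass  # process or store the chunk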