def test_corpus_builtin_corpora():
    """Check the built-in corpora listing and that every listed corpus loads.

    Exactly two built-in corpora are expected, and each of them must load
    into a ``Corpus`` with a length greater than zero.
    """
    corpus_names = Corpus.builtin_corpora()
    assert len(corpus_names) == 2

    for corpus_name in corpus_names:
        # Loading a corpus by its listed name must not yield an empty corpus.
        loaded = Corpus.from_builtin_corpus(corpus_name)
        assert len(loaded) > 0
def test_corpus_builtin_corpora():
    """Check the built-in corpora listing against the load-kwargs registry.

    The names returned by ``Corpus.builtin_corpora()`` must be the same set
    of names (order-insensitive) as the keys of
    ``Corpus._BUILTIN_CORPORA_LOAD_KWARGS``, and each listed corpus must
    load into a ``Corpus`` with a length greater than zero.
    """
    corpus_names = Corpus.builtin_corpora()

    # Compare sorted copies so the order of the listing does not matter.
    listed_names = sorted(corpus_names)
    registered_names = sorted(Corpus._BUILTIN_CORPORA_LOAD_KWARGS.keys())
    assert listed_names == registered_names

    for corpus_name in corpus_names:
        # Loading a corpus by its listed name must not yield an empty corpus.
        loaded = Corpus.from_builtin_corpus(corpus_name)
        assert len(loaded) > 0
def load_corpus_bg_en(sample_n):
    """Load the built-in ``'en-NewsArticles'`` corpus and return a sample of it.

    :param sample_n: value passed through to ``Corpus.sample()``
                     (presumably the number of documents to draw -- confirm
                     against the ``Corpus.sample`` API)
    :return: whatever ``Corpus.sample(sample_n)`` returns for that corpus
    """
    # Function-scope import: this statement only runs when the function is called.
    from tmtoolkit.corpus import Corpus

    return Corpus.from_builtin_corpus('en-NewsArticles').sample(sample_n)
# Benchmark script fragment: times TMPreproc steps on a 1000-document sample
# of the built-in 'en-NewsArticles' corpus.
# NOTE(review): `logging`, `random` and `cpu_count` are used below but their
# imports are not visible in this fragment -- presumably imported earlier.
from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc

from examples._benchmarktools import add_timing, print_timings

# Emit INFO-level records, and make sure the 'tmtoolkit' logger is at INFO and
# propagates its records to the root handler configured by basicConfig().
logging.basicConfig(level=logging.INFO)
tmtoolkit_log = logging.getLogger('tmtoolkit')
tmtoolkit_log.setLevel(logging.INFO)
tmtoolkit_log.propagate = True

# Seed the stdlib RNG before sampling; presumably this makes the sample below
# reproducible -- TODO confirm that Corpus.sample() draws from `random`.
random.seed(20200320)

#%%

# Load the built-in corpus and keep a sample taken with n=1000.
corpus = Corpus.from_builtin_corpus('en-NewsArticles').sample(1000)
print('%d documents' % len(corpus))

#%%

# Each add_timing(label) call follows the step named by its label; the order of
# these statements must not change (presumably each call records a timestamp
# that print_timings later reports -- see examples._benchmarktools).
add_timing('start')

# One worker process per CPU reported by cpu_count().
preproc = TMPreproc(corpus, language='en', n_max_processes=cpu_count())
add_timing('load and tokenize')

preproc.expand_compound_tokens()
add_timing('expand_compound_tokens')

preproc.pos_tag()
add_timing('pos_tag')
# Benchmark script fragment: times TMPreproc steps on the complete built-in
# 'en-NewsArticles' corpus.
import logging
# NOTE(review): cpu_count is imported but not used in the visible part of this
# script (n_max_processes is fixed at 4 below).
from multiprocessing import cpu_count

from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc

from examples._benchmarktools import add_timing, print_timings

# Emit INFO-level records, and make sure the 'tmtoolkit' logger is at INFO and
# propagates its records to the root handler configured by basicConfig().
logging.basicConfig(level=logging.INFO)
tmtoolkit_log = logging.getLogger('tmtoolkit')
tmtoolkit_log.setLevel(logging.INFO)
tmtoolkit_log.propagate = True

#%%

# Load the whole built-in corpus (no sampling) and report its size.
corpus = Corpus.from_builtin_corpus('en-NewsArticles')
print('%d documents' % len(corpus))

#%%

# Each add_timing(label) call follows the step named by its label; the order of
# these statements must not change (presumably each call records a timestamp
# that print_timings later reports -- see examples._benchmarktools).
add_timing('start')

# Fixed at 4 worker processes rather than derived from the machine's CPU count.
preproc = TMPreproc(corpus, language='en', n_max_processes=4)
add_timing('load and tokenize')

preproc.pos_tag()
add_timing('pos_tag')

preproc.lemmatize()
add_timing('lemmatize')