def test_keep_everything():
    """With meta and tags both retained, every one of the 56 items comes back."""
    with open(DUMMY_CORPUS) as corpus:
        # Materialize inside the `with` block: extract_fields reads lazily,
        # so the file must still be open while the generator is consumed.
        extracted = list(
            extract_fields(corpus, drop_meta=False, drop_tags=False, **DUMMY_SPECS)
        )
    assert len(extracted) == 56
def test_extract_tags_tokens():
    """Requesting fields 1 and 2 with no meta kept yields 38 (tag, token) pairs."""
    with open(DUMMY_CORPUS) as corpus:
        # Consume the lazy extractor while the file is still open.
        tags_tokens = list(
            extract_fields(corpus, keep_meta={}, return_fields={1, 2}, **DUMMY_SPECS)
        )
    assert len(tags_tokens) == 38
    # Generator expression instead of a throwaway list inside all()
    # (flake8-comprehensions C419): same result, short-circuits, no allocation.
    assert all(len(tt) == 2 for tt in tags_tokens)
def test_extract_multiple_fields_and_meta():
    """Fields 0 and 2 plus meta and tags give 56 items, 12 of them plain strings."""
    with open(DUMMY_CORPUS) as corpus:
        # Consume the lazy extractor while the file is still open.
        tokens = list(
            extract_fields(corpus, return_fields=[0, 2], drop_meta=False,
                           drop_tags=False, **DUMMY_SPECS)
        )
    assert len(tokens) == 56
    # Generator expression instead of a throwaway list inside sum()
    # (flake8-comprehensions C419): same count, no intermediate list.
    # NOTE(review): presumably multi-field rows come back as tuples and only
    # the meta/tag lines as bare strings — confirm against extract_fields.
    assert sum(isinstance(token, str) for token in tokens) == 12
def test_drop_meta_false():
    """Keeping meta (tags still dropped by default) yields 50 items."""
    with open(DUMMY_CORPUS) as corpus:
        # List up the lazy extractor before the file closes.
        with_meta = list(extract_fields(corpus, drop_meta=False, **DUMMY_SPECS))
    assert len(with_meta) == 50
def test_extract_tokens():
    """With no meta kept, extraction yields exactly the 38 plain tokens."""
    with open(DUMMY_CORPUS) as corpus:
        # List up the lazy extractor before the file closes.
        plain_tokens = list(extract_fields(corpus, keep_meta={}, **DUMMY_SPECS))
    assert len(plain_tokens) == 38
from os.path import dirname, join from itertools import chain from collections import Counter from corpustools import extract_fields, ngrams from corpustools.language_model import LanguageModel top = join(dirname(__file__), "data") DUMMY_CORPUS = join(top, "dummy_corpus.txt") DUMMY_SPECS = {"tag_field": 2, "delimiter": "\t", "num_fields": 3} with open(DUMMY_CORPUS) as corpus: tokens = list(extract_fields(corpus, **DUMMY_SPECS)) dummy_counts = Counter( chain(ngrams(tokens, 1, join_char="#"), ngrams(tokens, 2, join_char="#"), ngrams(tokens, 3, join_char="#"))) dummy_counts[""] = len([t for t in tokens if not t.startswith("<")]) def test_frequencies_and_probabilities_of_trigram_model(): lm = LanguageModel(3) lm.train(tokens) for result in lm.all_target_probabilities(return_n_gram=True, sizes=range(1, 4)): n_gram, frequency, probability = result *preceding, target = n_gram target_freq = dummy_counts["#".join(n_gram)] preceding_freq = dummy_counts["#".join(preceding)] target_prob = target_freq / preceding_freq