# Example 1
def test_keep_everything():
    """With meta lines and tags both retained, every record is emitted."""
    with open(DUMMY_CORPUS) as corpus:
        extracted = list(extract_fields(corpus,
                                        drop_meta=False,
                                        drop_tags=False,
                                        **DUMMY_SPECS))
    assert len(extracted) == 56
# Example 2
def test_extract_tags_tokens():
    """Requesting fields 1 and 2 yields 38 two-element (tag, token) items."""
    with open(DUMMY_CORPUS) as corpus:
        pairs = list(extract_fields(corpus,
                                    keep_meta={},
                                    return_fields={1, 2},
                                    **DUMMY_SPECS))
        assert len(pairs) == 38
        assert all(len(pair) == 2 for pair in pairs)
# Example 3
def test_extract_multiple_fields_and_meta():
    """Fields 0 and 2 plus meta/tags give 56 items, 12 of them plain strings."""
    with open(DUMMY_CORPUS) as corpus:
        extracted = list(extract_fields(corpus,
                                        return_fields=[0, 2],
                                        drop_meta=False,
                                        drop_tags=False,
                                        **DUMMY_SPECS))
    assert len(extracted) == 56
    # Multi-field records are tuples; only meta/tag lines come back as str.
    # (assumption from the count below — TODO confirm against extract_fields)
    assert sum(isinstance(item, str) for item in extracted) == 12
# Example 4
def test_drop_meta_false():
    """Keeping meta lines (tags dropped by default) yields 50 items."""
    with open(DUMMY_CORPUS) as corpus:
        extracted = list(extract_fields(corpus, drop_meta=False, **DUMMY_SPECS))
    assert len(extracted) == 50
# Example 5
def test_extract_tokens():
    """With an empty keep_meta set, only the 38 plain tokens remain."""
    with open(DUMMY_CORPUS) as corpus:
        extracted = list(extract_fields(corpus, keep_meta={}, **DUMMY_SPECS))
    assert len(extracted) == 38
from os.path import dirname, join
from itertools import chain
from collections import Counter

from corpustools import extract_fields, ngrams
from corpustools.language_model import LanguageModel

# Shared fixtures: path to the dummy corpus, its parsing spec, the token
# stream extracted from it, and pre-computed 1-/2-/3-gram counts.
top = join(dirname(__file__), "data")

DUMMY_CORPUS = join(top, "dummy_corpus.txt")
DUMMY_SPECS = {"tag_field": 2, "delimiter": "\t", "num_fields": 3}

with open(DUMMY_CORPUS) as corpus:
    tokens = list(extract_fields(corpus, **DUMMY_SPECS))

dummy_counts = Counter(
    chain.from_iterable(ngrams(tokens, size, join_char="#")
                        for size in (1, 2, 3)))
# The empty-string key counts non-meta tokens (those not starting with "<");
# it is looked up as the "preceding" frequency for unigrams below, where
# "#".join of an empty prefix is "".
dummy_counts[""] = sum(1 for t in tokens if not t.startswith("<"))


def test_frequencies_and_probabilities_of_trigram_model():
    lm = LanguageModel(3)
    lm.train(tokens)
    for result in lm.all_target_probabilities(return_n_gram=True,
                                              sizes=range(1, 4)):
        n_gram, frequency, probability = result
        *preceding, target = n_gram
        target_freq = dummy_counts["#".join(n_gram)]
        preceding_freq = dummy_counts["#".join(preceding)]
        target_prob = target_freq / preceding_freq