Example #1
def start():
    n = None
    print("Welcome. Let's create a language model together.\n"
          "What size n-grams do you desire?")
    while n is None:
        try:
            n = int(input())
            if n < 1 or n > 6:
                print('Please enter an integer between 1 and 6.')
                n = None
        except ValueError:  # non-integer input
            print('Please enter an integer between 1 and 6.')
    mdl = lm.LanguageModel(n)
    print("\nGreat choice! You must now train the model.\n"
          "Please tell us the path to a text that you'd like to use for training.")
    filename = input()
    tokens = cp.open_file(filename)
    while tokens is None:
        print('That file could not be opened. Please try again.')
        filename = input()
        tokens = cp.open_file(filename)
    mdl.train(tokens)
    print(
        '\nYour model has been created. Here\'s a list of commands you can use to explore further.\n'
    )
    help()
    return mdl
Example #2
def train(mdl):
    print('\nPlease input path to the text you want to train from.')
    inpt = input()
    text = cp.open_file(inpt)
    while text is None:
        print('\nTry another text file? Or quit?')
        inpt = input()
        if inpt == 'quit':
            return
        text = cp.open_file(inpt)

    mdl.train(text)
    print('\nThe model has been updated.')
    return mdl
Example #3
def test_models_have_correct_lambda_size():
    lm = LanguageModel(4)
    data = open_file('kn_test.txt')
    lm.train(data)
    for i in range(lm.n - 2):
        model = lm.models[i]
        assert len(model.lambdas) == len(model.hist_words_dct)
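The invariant this test checks (one interpolation weight per history) is a property of Kneser-Ney smoothing, where the back-off weight lambda(h) depends only on the history h. A minimal sketch of how such a table could be filled, assuming hist_words_dct maps each history to a dict of follower counts (the names mirror the test, not the library's actual internals):

def compute_lambdas(hist_words_dct, discount=0.75):
    # lambda(h) = D * |{w : c(h, w) > 0}| / c(h) -- one entry per history,
    # so len(lambdas) == len(hist_words_dct) by construction.
    lambdas = {}
    for hist, followers in hist_words_dct.items():
        lambdas[hist] = discount * len(followers) / sum(followers.values())
    return lambdas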
Example #4
def test_kn_produces_expected_values():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.kn_evaluate(['text', 'shall', 'train']) == -2.0770634192748685
    assert lm.kn_evaluate(['this', 'text', 'dog']) == -3.1656313103493887
    assert lm.kn_evaluate(['the', 'brown', 'cat']) == -2.4724841297894433
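For context, these expected values come from interpolated Kneser-Ney smoothing. A self-contained sketch of the bigram case (illustrative only; the actual kn_evaluate chains such estimates across all of the model orders held in lm.models):

import math
from collections import Counter

def kn_bigram_logprob(bigrams, history, word, discount=0.75):
    # bigrams: list of observed (history, word) pairs from training.
    pair_counts = Counter(bigrams)                  # c(h, w)
    hist_counts = Counter(h for h, _ in bigrams)    # c(h)
    followers = Counter(h for h, _ in pair_counts)  # |{w : c(h, w) > 0}|
    contexts = Counter(w for _, w in pair_counts)   # |{h : c(h, w) > 0}|
    # Discounted count, back-off weight, and continuation probability.
    p_high = max(pair_counts[(history, word)] - discount, 0) / hist_counts[history]
    lam = discount * followers[history] / hist_counts[history]
    p_cont = contexts[word] / len(pair_counts)
    return math.log(p_high + lam * p_cont)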
Example #5
def test_models_have_correct_n():
    lm = LanguageModel(4)
    data = open_file('kn_test.txt')
    lm.train(data)
    for i in range(lm.n - 2):
        model = lm.models[i]
        assert model.n == i + 2
Example #6
def test_perplexity_produces_expected_values():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    perp = round(lm.perplexity(2, math.log(0.5)), 5)
    correct = round(math.sqrt(2), 5)
    assert perp == correct
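The expected value follows from the standard definition of perplexity as the exponentiated negative per-token log probability (natural logs, as in the test). A minimal sketch:

import math

def perplexity(num_tokens, log_prob):
    # exp(-log_prob / num_tokens), i.e. the inverse geometric mean
    # probability per token.
    return math.exp(-log_prob / num_tokens)

# exp(-log(0.5) / 2) = exp(log(2) / 2) = sqrt(2), matching the assertion above.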
Example #7
def test_models_have_correct_beginning_grams():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert sorted(lm.models[0].beginning_grams) \
     == sorted(['this', 'shall', 'PAD'])
    assert sorted(lm.models[1].beginning_grams) \
     == sorted(['PAD this', 'this text', 'PAD PAD', 'shall train'])
Example #8
def test_laplace_produces_expected_values():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.laplace_evaluate(['this', 'shall', 'train', 'PAD']) \
     == -2.890371757896165
    assert lm.laplace_evaluate(['dog', 'text', '.', 'PAD']) \
     == (math.log(1 / 9) + math.log(1 / 2))
Example #9
def plot(mdl):
    print('\nPlease input path to the text you want to plot.')
    inpt = input()
    text = cp.open_file(inpt)
    # Keep asking until the file opens and is long enough; the minimum-length
    # check applies to retries as well.
    while text is None or len(text) + 1 < mdl.n:
        if text is not None:
            print('\nThis text is too small. It must have at least %s tokens.'
                  % (mdl.n - 1))
        print('\nTry another text file? Or quit?')
        inpt = input()
        if inpt == 'quit':
            return
        text = cp.open_file(inpt)

    mdl.plot_perplexity(inpt, text)
Example #10
def kn(mdl):
    print('\nPlease input path to the text you want to evaluate.')
    inpt = input()
    text = cp.open_file(inpt)
    # Keep asking until the file opens and is long enough; the minimum-length
    # check applies to retries as well.
    while text is None or len(text) + 1 < mdl.n:
        if text is not None:
            print('\nThis text is too small. It must have at least %s tokens.'
                  % (mdl.n - 1))
        print('\nTry another text file? Or quit?')
        inpt = input()
        if inpt == 'quit':
            return
        text = cp.open_file(inpt)

    prob = mdl.kn_evaluate(text)
    perp = mdl.perplexity(len(text) + 1, prob)
    print('The model predicts a log probability of:\n', prob)
    print('The perplexity of this prediction is:\n', perp)
Example #11
def test_train_creates_expected_hist_words_dict():
    lm = LanguageModel(2)
    data = open_file('kn_test.txt')
    lm.train(data)
    model = lm.models[-1]
    assert sorted(list(model.hist_words_dct.keys())) \
     == sorted(['PAD', 'this', 'text', 'shall', 'train', '.'])
    assert list(model.hist_words_dct['this'].keys()) == ['text']
    assert list(model.hist_words_dct['text'].keys()) == ['.']
    assert list(model.hist_words_dct['shall'].keys()) == ['train']
    assert list(model.hist_words_dct['train'].keys()) == ['text']
    assert list(model.hist_words_dct['PAD'].keys()) == ['this']
    assert sorted(list(model.hist_words_dct['.'].keys())) \
     == sorted(['PAD', 'shall'])
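The structure under test is a nested count table mapping each (n-1)-token history to the words that follow it. A hypothetical sketch of how such a table can be built, assuming PAD padding on both ends (which the expected keys above suggest):

from collections import defaultdict

def build_hist_words(tokens, n=2):
    # history -> {following word -> count}
    hist_words = defaultdict(lambda: defaultdict(int))
    padded = ['PAD'] * (n - 1) + tokens + ['PAD'] * (n - 1)
    for i in range(len(padded) - n + 1):
        hist = ' '.join(padded[i:i + n - 1])
        hist_words[hist][padded[i + n - 1]] += 1
    return hist_words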
Example #12
def test_subsequent_training():
    lm = LanguageModel(2)
    data = open_file('kn_test.txt')
    lm.train(data)
    model = lm.models[-1]
    wh1_len = len(model.word_hists_dct)
    hw1_len = len(model.hist_words_dct)
    data = tokenize('This sample.')
    lm.train(data)
    model = lm.models[-1]
    wh2_len = len(model.word_hists_dct)
    hw2_len = len(model.hist_words_dct)
    assert wh2_len - wh1_len == 1
    assert hw2_len - hw1_len == 1
    assert sorted(list(model.word_hists_dct['.'].keys())) \
     == sorted(['text', 'sample'])
    assert sorted(list(model.hist_words_dct['this'].keys())) \
     == sorted(['text', 'sample'])
Example #13
def test_discount():
    lm = LanguageModel(2)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.discount == 0.75
Example #14
def test_open_handles_empty_files():
    assert open_file('empty.txt') is None
Example #15
def test_open_handles_non_utf8_files():
    assert open_file('pdf.pdf') is None
Example #16
def test_p_next_sums_to_one():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert sum(lm.p_next(['this', 'text']).values()) == pytest.approx(1)
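Summing to one is the defining property of a conditional next-word distribution. A toy illustration with plain maximum-likelihood estimates (the real p_next presumably draws on the smoothed model instead):

def p_next_ml(hist_words_dct, history):
    # Normalise raw follower counts into probabilities; the returned
    # values sum to 1 by construction.
    followers = hist_words_dct[history]
    total = sum(followers.values())
    return {w: c / total for w, c in followers.items()}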
Example #17
def test_kn_produces_expected_values_n4():
    lm = LanguageModel(4)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.kn_evaluate(['shall', 'train', 'text', '.']) \
     == -0.7742507185722116
Example #18
def test_models_have_correct_vocab_size():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.models[0].ngram_vocab_size == 7
    assert lm.models[1].ngram_vocab_size == 9
Example #19
def test_lm_has_correct_number_tokens_and_unigram_types():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.num_tokens == 7
    assert len(lm.unigrams) == 5
Example #20
def test_laplace_produces_expected_values2():
    lm = LanguageModel(1)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.laplace_evaluate(['text']) == math.log(3 / 12)
    assert lm.laplace_evaluate(['dog']) == math.log(1 / 12)
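These unigram values are consistent with add-one (Laplace) smoothing over the 7 training tokens and 5 unigram types asserted earlier: P(w) = (c(w) + 1) / (N + V) = (c(w) + 1) / 12, so 'text' (seen twice) gives log(3/12) and the unseen 'dog' gives log(1/12). A minimal sketch:

import math
from collections import Counter

def laplace_unigram_logprob(tokens, word):
    # Add-one smoothing: (c(w) + 1) / (N + V).
    counts = Counter(tokens)
    return math.log((counts[word] + 1) / (len(tokens) + len(counts)))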
Example #21
import pytest
from hypothesis import given
from hypothesis.strategies import lists, text
from lm import Model, LanguageModel
from corpus import tokenize, detokenize, punc_merge, open_file
import math

shakespeare = open_file('train_shakespeare.txt')
empty = open_file('empty.txt')
mind = open_file('amind.txt')


def test_tokenize_returns_list_of_strings():
    tokens = tokenize('this string')
    assert type(tokens) == list
    for item in tokens:
        assert type(item) == str


@pytest.mark.parametrize('string, tokens', [
    ('This is a string.', ['this', 'is', 'a', 'string', '.']),
    ('That isn\'t what\'s on-the-go!',
     ['that', 'is', "n't", 'what', "'s", 'on-the-go', '!']),
    ("Where is thy leather apron and thy rule? What dost thou with thy best apparel on? You, sir, what trade are you?",
     [
         'where', 'is', 'thy', 'leather', 'apron', 'and', 'thy', 'rule', '?',
         'what', 'dost', 'thou', 'with', 'thy', 'best', 'apparel', 'on', '?',
         'you', ',', 'sir', ',', 'what', 'trade', 'are', 'you', '?'
     ]),
])
def test_tokenize_produces_correct_tokens(string, tokens):
    assert tokenize(string) == tokens