def start():
    """Interactively build and train a language model, then print help.

    Prompts the user for an n-gram size between 1 and 6 (re-prompting on
    non-integer or out-of-range input) and for a training-file path
    (re-prompting until `cp.open_file` can read it).

    Returns:
        The trained `lm.LanguageModel` instance.
    """
    n = None
    print(
        '''Welcome. Let\'s create a language model together.\nWhat size n-grams do you desire?'''
        )
    while n is None:
        try:
            n = int(input())
            if n < 1 or n > 6:
                print('Please enter an integer between 1 and 6.')
                n = None
        except ValueError:
            # Non-integer input: re-prompt instead of crashing.
            # (Was a bare `except:`, which would also hide KeyboardInterrupt.)
            print('Please enter an integer between 1 and 6.')
    mdl = lm.LanguageModel(n)
    print(
        '''\nGreat choice!! You must train the model.\nPlease tell us the path to a text that you\'d like to use for training.'''
        )
    filename = input()
    tokens = cp.open_file(filename)
    # open_file returns None for unreadable/empty files; keep asking.
    while tokens is None:
        print('Try again.')
        filename = input()
        tokens = cp.open_file(filename)
    mdl.train(tokens)
    print(
        '\nYour model has been created. Here\'s a list of commands you can use to explore further.\n'
        )
    # NOTE(review): presumably calls a sibling `help()` command printer
    # defined elsewhere in this file, not the builtin -- confirm.
    help()
    return mdl
def train(mdl):
    """Train `mdl` on a user-supplied text file.

    Prompts for a path until `cp.open_file` can read it. Typing 'quit'
    at a retry prompt aborts and returns None.

    Args:
        mdl: a trained or untrained language model with a `train` method.

    Returns:
        The updated model, or None if the user quit.
    """
    print('\nPlease input path to the text you want to train from.')
    inpt = input()
    text = cp.open_file(inpt)
    # open_file returns None for unreadable/empty files; keep asking.
    while text is None:
        print('\nTry another text file? Or quit?')
        inpt = input()
        if inpt == 'quit':
            return
        text = cp.open_file(inpt)
    mdl.train(text)
    print('\nThe model has been updated.')
    return mdl
def test_models_have_correct_lambda_size():
    """Each checked sub-model keeps one lambda per history entry."""
    language_model = LanguageModel(4)
    language_model.train(open_file('kn_test.txt'))
    for sub_model in language_model.models[:language_model.n - 2]:
        assert len(sub_model.lambdas) == len(sub_model.hist_words_dct)
def test_kn_produces_expected_values():
    """Trigram Kneser-Ney log-probabilities match precomputed values."""
    language_model = LanguageModel(3)
    language_model.train(open_file('kn_test.txt'))
    expected = {
        ('text', 'shall', 'train'): -2.0770634192748685,
        ('this', 'text', 'dog'): -3.1656313103493887,
        ('the', 'brown', 'cat'): -2.4724841297894433,
    }
    for trigram, log_prob in expected.items():
        assert language_model.kn_evaluate(list(trigram)) == log_prob
def test_models_have_correct_n():
    """Sub-model orders run 2, 3, ... in storage order."""
    language_model = LanguageModel(4)
    language_model.train(open_file('kn_test.txt'))
    for index in range(language_model.n - 2):
        assert language_model.models[index].n == index + 2
def test_perplexity_produces_expected_values():
    """Perplexity of log(0.5) over 2 tokens is sqrt(2)."""
    language_model = LanguageModel(3)
    language_model.train(open_file('kn_test.txt'))
    observed = language_model.perplexity(2, math.log(0.5))
    assert round(observed, 5) == round(math.sqrt(2), 5)
def test_models_have_correct_beginning_grams():
    """Training records the sequence-initial grams for each order."""
    language_model = LanguageModel(3)
    language_model.train(open_file('kn_test.txt'))
    unigram_starts = language_model.models[0].beginning_grams
    bigram_starts = language_model.models[1].beginning_grams
    assert sorted(unigram_starts) == sorted(['this', 'shall', 'PAD'])
    assert sorted(bigram_starts) == sorted(
        ['PAD this', 'this text', 'PAD PAD', 'shall train'])
def test_laplace_produces_expected_values():
    """Laplace-smoothed log-probabilities match hand-computed values."""
    language_model = LanguageModel(3)
    language_model.train(open_file('kn_test.txt'))
    seen = language_model.laplace_evaluate(['this', 'shall', 'train', 'PAD'])
    unseen = language_model.laplace_evaluate(['dog', 'text', '.', 'PAD'])
    assert seen == -2.890371757896165
    assert unseen == math.log(1 / 9) + math.log(1 / 2)
def plot(mdl):
    """Plot perplexity of `mdl` over a user-supplied text file.

    Re-prompts until a readable file with at least `mdl.n - 1` tokens is
    given. Typing 'quit' at a retry prompt aborts.

    Args:
        mdl: a trained language model with a `plot_perplexity` method.
    """
    print('\nPlease input path to the text you want to plot.')
    inpt = input()
    text = cp.open_file(inpt)
    if text is not None and len(text) + 1 < mdl.n:
        print('\nThis text is too small. It must have at least %s tokens.'
              % (mdl.n - 1))
        text = None  # force the retry loop below
    while text is None:
        print('\nTry another text file? Or quit?')
        inpt = input()
        if inpt == 'quit':
            return
        text = cp.open_file(inpt)
        # Bug fix: the minimum-size check previously applied only to the
        # first file, so a too-small file entered here slipped through.
        if text is not None and len(text) + 1 < mdl.n:
            print('\nThis text is too small. It must have at least %s tokens.'
                  % (mdl.n - 1))
            text = None
    mdl.plot_perplexity(inpt, text)
def kn(mdl):
    """Evaluate a user-supplied text under `mdl` with Kneser-Ney smoothing.

    Re-prompts until a readable file with at least `mdl.n - 1` tokens is
    given (typing 'quit' aborts), then prints the log probability and the
    corresponding perplexity.

    Args:
        mdl: a trained language model with `kn_evaluate` and `perplexity`.
    """
    print('\nPlease input path to the text you want to evaluate.')
    inpt = input()
    text = cp.open_file(inpt)
    if text is not None and len(text) + 1 < mdl.n:
        print('\nThis text is too small. It must have at least %s tokens.'
              % (mdl.n - 1))
        text = None  # force the retry loop below
    while text is None:
        print('\nTry another text file? Or quit?')
        inpt = input()
        if inpt == 'quit':
            return
        text = cp.open_file(inpt)
        # Bug fix: the minimum-size check previously applied only to the
        # first file, so a too-small file entered here slipped through.
        if text is not None and len(text) + 1 < mdl.n:
            print('\nThis text is too small. It must have at least %s tokens.'
                  % (mdl.n - 1))
            text = None
    prob = mdl.kn_evaluate(text)
    perp = mdl.perplexity(len(text) + 1, prob)
    print('The model predicts a log probability of:\n', str(prob))
    print('The perplexity of this prediction is:\n', str(perp))
def test_train_creates_expected_hist_words_dict():
    """hist_words_dct maps each history token to its observed successors."""
    language_model = LanguageModel(2)
    language_model.train(open_file('kn_test.txt'))
    histories = language_model.models[-1].hist_words_dct
    assert sorted(histories.keys()) == sorted(
        ['PAD', 'this', 'text', 'shall', 'train', '.'])
    # Histories followed by exactly one distinct word in the corpus.
    sole_successor = {
        'this': 'text',
        'text': '.',
        'shall': 'train',
        'train': 'text',
        'PAD': 'this',
    }
    for history, successor in sole_successor.items():
        assert list(histories[history].keys()) == [successor]
    assert sorted(histories['.'].keys()) == sorted(['PAD', 'shall'])
def test_subsequent_training():
    """A second training pass merges new counts into the existing model."""
    language_model = LanguageModel(2)
    language_model.train(open_file('kn_test.txt'))
    bigram_model = language_model.models[-1]
    hists_before = len(bigram_model.word_hists_dct)
    words_before = len(bigram_model.hist_words_dct)
    language_model.train(tokenize('This sample.'))
    bigram_model = language_model.models[-1]
    # Exactly one new word ('sample') appears on each side of the mapping.
    assert len(bigram_model.word_hists_dct) - hists_before == 1
    assert len(bigram_model.hist_words_dct) - words_before == 1
    assert sorted(bigram_model.word_hists_dct['.'].keys()) \
        == sorted(['text', 'sample'])
    assert sorted(bigram_model.hist_words_dct['this'].keys()) \
        == sorted(['text', 'sample'])
def test_discount():
    """A trained model uses the conventional 0.75 Kneser-Ney discount."""
    language_model = LanguageModel(2)
    language_model.train(open_file('kn_test.txt'))
    assert language_model.discount == 0.75
def test_open_handles_empty_files():
    """A file with no content yields None rather than a token list."""
    result = open_file('empty.txt')
    assert result is None
def test_open_handles_non_utf8_files():
    """A file that cannot be decoded as UTF-8 yields None."""
    result = open_file('pdf.pdf')
    assert result is None
def test_p_next_sums_to_one():
    """The next-word distribution after a history sums to 1.

    Uses math.isclose rather than exact `==` so the test is robust to
    floating-point rounding in the summation of the probabilities.
    """
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert math.isclose(sum(lm.p_next(['this', 'text']).values()), 1.0)
def test_kn_produces_expected_values_n4():
    """4-gram Kneser-Ney evaluation matches its precomputed value."""
    language_model = LanguageModel(4)
    language_model.train(open_file('kn_test.txt'))
    score = language_model.kn_evaluate(['shall', 'train', 'text', '.'])
    assert score == -0.7742507185722116
def test_models_have_correct_vocab_size():
    """Each sub-model counts the distinct n-grams of its own order."""
    language_model = LanguageModel(3)
    language_model.train(open_file('kn_test.txt'))
    expected_sizes = {0: 7, 1: 9}
    for index, size in expected_sizes.items():
        assert language_model.models[index].ngram_vocab_size == size
def test_lm_has_correct_number_tokens_and_unigram_types():
    """Training records the total token count and distinct unigram types."""
    language_model = LanguageModel(3)
    language_model.train(open_file('kn_test.txt'))
    assert language_model.num_tokens == 7
    assert len(language_model.unigrams) == 5
def test_laplace_produces_expected_values2():
    """Unigram Laplace estimates match hand-computed count ratios."""
    language_model = LanguageModel(1)
    language_model.train(open_file('kn_test.txt'))
    assert language_model.laplace_evaluate(['text']) == math.log(3 / 12)
    assert language_model.laplace_evaluate(['dog']) == math.log(1 / 12)
import pytest
from hypothesis import given
from hypothesis.strategies import lists, text
from lm import Model, LanguageModel
from corpus import tokenize, detokenize, punc_merge, open_file
import math

# Shared corpora loaded once at import time for tests in this module.
# NOTE(review): `empty` will be None if open_file returns None for empty
# files (as test_open_handles_empty_files asserts) -- confirm these
# module-level fixtures are used by tests beyond this chunk.
shakespeare = open_file('train_shakespeare.txt')
empty = open_file('empty.txt')
mind = open_file('amind.txt')


def test_tokenize_returns_list_of_strings():
    # tokenize must produce a flat list whose elements are all strings.
    tokens = tokenize('this string')
    assert type(tokens) == list
    for item in tokens:
        assert type(item) == str


# Each case pairs an input string with its expected token sequence:
# lowercasing, punctuation splitting, clitic splitting ("isn't" ->
# "is"/"n't"), and hyphenated words kept whole.
@pytest.mark.parametrize('string, tokens', [
    ('This is a string.', ['this', 'is', 'a', 'string', '.']),
    ('That isn\'t what\'s on-the-go!',
     ['that', 'is', "n't", 'what', "'s", 'on-the-go', '!']),
    ("Where is thy leather apron and thy rule? What dost thou with thy best apparel on? You, sir, what trade are you?",
     [
         'where', 'is', 'thy', 'leather', 'apron', 'and', 'thy', 'rule', '?',
         'what', 'dost', 'thou', 'with', 'thy', 'best', 'apparel', 'on', '?',
         'you', ',', 'sir', ',', 'what', 'trade', 'are', 'you', '?'
     ]),
])
def test_tokenize_produces_correct_tokens(string, tokens):