def test_models_have_correct_lambda_size():
    lm = LanguageModel(4)
    data = open_file('kn_test.txt')
    lm.train(data)
    for i in range(lm.n - 2):
        model = lm.models[i]
        assert len(model.lambdas) == len(model.hist_words_dct)

def test_kn_produces_expected_values():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.kn_evaluate(['text', 'shall', 'train']) == -2.0770634192748685
    assert lm.kn_evaluate(['this', 'text', 'dog']) == -3.1656313103493887
    assert lm.kn_evaluate(['the', 'brown', 'cat']) == -2.4724841297894433

def test_models_have_correct_n():
    lm = LanguageModel(4)
    data = open_file('kn_test.txt')
    lm.train(data)
    for i in range(lm.n - 2):
        model = lm.models[i]
        assert model.n == i + 2

def test_perplexity_produces_expected_values():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    perp = round(lm.perplexity(2, math.log(0.5)), 5)
    correct = round(math.sqrt(2), 5)
    assert perp == correct

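# The expected value above follows from the standard definition: for N tokens
# with total log-probability L, perplexity is exp(-L / N), so
# perplexity(2, log(0.5)) = exp(log(2) / 2) = sqrt(2). A minimal sketch of the
# computation being tested (an assumption about the real implementation):
def _perplexity_sketch(num_tokens, total_log_prob):
    # exp of the negative average per-token log-probability
    return math.exp(-total_log_prob / num_tokens)
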
def test_models_have_correct_beginning_grams():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert sorted(lm.models[0].beginning_grams) \
        == sorted(['this', 'shall', 'PAD'])
    assert sorted(lm.models[1].beginning_grams) \
        == sorted(['PAD this', 'this text', 'PAD PAD', 'shall train'])

def test_laplace_produces_expected_values():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.laplace_evaluate(['this', 'shall', 'train', 'PAD']) \
        == -2.890371757896165
    assert lm.laplace_evaluate(['dog', 'text', '.', 'PAD']) \
        == (math.log(1 / 9) + math.log(1 / 2))

def test_train_creates_expected_hist_words_dict():
    lm = LanguageModel(2)
    data = open_file('kn_test.txt')
    lm.train(data)
    model = lm.models[-1]
    assert sorted(model.hist_words_dct.keys()) \
        == sorted(['PAD', 'this', 'text', 'shall', 'train', '.'])
    assert list(model.hist_words_dct['this'].keys()) == ['text']
    assert list(model.hist_words_dct['text'].keys()) == ['.']
    assert list(model.hist_words_dct['shall'].keys()) == ['train']
    assert list(model.hist_words_dct['train'].keys()) == ['text']
    assert list(model.hist_words_dct['PAD'].keys()) == ['this']
    assert sorted(model.hist_words_dct['.'].keys()) \
        == sorted(['PAD', 'shall'])

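# The assertions above pin down the shape of hist_words_dct: it maps each
# space-joined (n-1)-token history to a dict of follower words and their
# counts. A minimal sketch of how train() presumably fills it (the helper
# name and signature are assumptions):
from collections import defaultdict

def _build_hist_words_sketch(tokens, n):
    hist_words = defaultdict(lambda: defaultdict(int))
    for i in range(len(tokens) - n + 1):
        # split each n-gram window into its history and its follower word
        *hist, word = tokens[i:i + n]
        hist_words[' '.join(hist)][word] += 1
    return hist_words
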
def test_subsequent_training():
    lm = LanguageModel(2)
    data = open_file('kn_test.txt')
    lm.train(data)
    model = lm.models[-1]
    wh1_len = len(model.word_hists_dct)
    hw1_len = len(model.hist_words_dct)
    data = tokenize('This sample.')
    lm.train(data)
    model = lm.models[-1]
    wh2_len = len(model.word_hists_dct)
    hw2_len = len(model.hist_words_dct)
    assert wh2_len - wh1_len == 1
    assert hw2_len - hw1_len == 1
    assert sorted(model.word_hists_dct['.'].keys()) \
        == sorted(['text', 'sample'])
    assert sorted(model.hist_words_dct['this'].keys()) \
        == sorted(['text', 'sample'])

def main():
    p = get_argparser()
    args = p.parse_args()
    lm = LanguageModel()
    lm.configure_logger(level=logging.DEBUG if args.DEBUG else logging.INFO,
                        write_file=True)
    if args.train and args.data_path:
        lm.train(args.data_path,
                 output_path=args.train,
                 learning_rate=args.learning_rate,
                 hidden_size=args.hidden_size,
                 batch_size=args.batch_size,
                 max_epoch=args.max_epoch)
    elif args.test and args.data_path:
        lm.predict(args.test, args.data_path)
    else:
        # Well, this is silly.
        p.print_help()
        exit(2)

def main(args):
    """
    Main function of the program; operates based on the arguments provided.

    Train
        - Ask for the n-gram order
        - Ask for the training file path
        - Train the language model
        - Save the trained model
    Generate
        - Load the saved model from the pickle file
        - Ask whether to use beam search (y/n)
        - Ask for the beam size
        - Print one generated sentence in the terminal
        - Ask for the number of sentences to generate to file
        - Save that many sentences in a file (Default: new_shakespeare.txt)
    Perplexity
        - Load the pickle file
        - Ask for the test set file path
        - Print the perplexity value
    Common
        - Load the pickle file
        - Ask for the number of most common n-grams
        - Print the most common n-grams with their occurrence counts
    """
    if args['train']:
        if not args['--n']:
            ngram = input("Please enter n for n-gram (Default: 3)-\n")
            if not ngram:
                ngram = 3
        else:
            ngram = args['--n']
        lm = LanguageModel(int(ngram))
        if not args['--path']:
            path = input("Please enter path of the file-\n")
        else:
            path = args['--path']
        lm.train(readFile(path))
        print("N-gram training completed")
        print("Saving the model")
        f = open('trained_model_ngram.pkl', 'wb')
        pickle.dump(lm, f)
        f.close()
        print("Model saved")
    if args['generate']:
        lm = loadPickle()
        if click.confirm('Do you want to generate with Beam search?', default=True):
            lm.beam_flag = True
            beam_size = input("Enter beam size (Default: 20)-\n")
            if beam_size:  # keep the default width when nothing is entered
                lm.beam_width = int(beam_size)
        else:
            lm.beam_flag = False
        print("Generating one sentence in terminal...")
        print(detokenize(lm.generate()))
        if not args['--lines']:
            noOfText = input("Enter number of generated text you want to save (Default: 10)-\n")
            if not noOfText:
                noOfText = 10
        else:
            noOfText = args['--lines']
        generated = []
        for g in range(int(noOfText)):
            generated.append(detokenize(lm.generate()))
        with open('new_shakespeare.txt', 'w') as f:
            for g in generated:
                f.write("%s\n" % g)
        print("Sentence file generated in current folder")
    if args['perplexity']:
        lm = loadPickle()
        if not args['--path']:
            path = input("Please enter path of the test file-\n")
        else:
            path = args['--path']
        print("Perplexity for {}-gram is {}".format(lm.ngram, lm.perplexity(readFile(path))))
    if args['common']:
        lm = loadPickle()
        if args['--number']:
            number = args['--number']
        else:
            number = 5
        lm.count_common_ngram(int(number))

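# main() expects a docopt-style dict of command-line arguments. A hypothetical
# invocation of the training path (the file name and flag values here are
# assumptions for illustration, not part of the program):
if __name__ == '__main__':
    main({'train': True, 'generate': False, 'perplexity': False,
          'common': False, '--n': '3', '--path': 'shakespeare.txt',
          '--lines': None, '--number': None})
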
class LanguageModelTests(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        print("\nLanguageModelTests starts")
        print("==========")

    @classmethod
    def tearDownClass(cls):
        print("==========")
        print("LanguageModelTests has ended")

    def setUp(self):
        self.lm = LanguageModel(3)
        self.token_sequences = [['the', 'cat', 'runs'], ['the', 'dog', 'runs']]
        self.lm.train(self.token_sequences)

    def test_get_ngrams(self):
        print("id: " + self.id())
        self.lm.n = 4
        input_tokens = ['the', 'cat', 'in', 'the', 'hat']
        result_ngrams = [
            (None, None, None, 'the'),
            (None, None, 'the', 'cat'),
            (None, 'the', 'cat', 'in'),
            ('the', 'cat', 'in', 'the'),
            ('cat', 'in', 'the', 'hat'),
            ('in', 'the', 'hat', None),
            ('the', 'hat', None, None),
            ('hat', None, None, None)
        ]
        self.assertEqual(self.lm.get_ngrams(input_tokens), result_ngrams)

    def test_train_vocabulary_and_counts(self):
        print("id: " + self.id())
        self.assertEqual(self.lm.vocabulary, {None, 'the', 'cat', 'runs', 'dog'})
        result_counts = {
            (None, None): {'the': 2},
            (None, 'the'): {'cat': 1, 'dog': 1},
            ('the', 'cat'): {'runs': 1},
            ('cat', 'runs'): {None: 1},
            ('runs', None): {None: 2},
            ('the', 'dog'): {'runs': 1},
            ('dog', 'runs'): {None: 1}
        }
        self.assertEqual(self.lm.counts, result_counts)

    def test_normalize(self):
        print("id: " + self.id())
        input_words = {'cat': 1, 'dog': 1}
        result_probabilities = {'cat': 0.5, 'dog': 0.5}
        self.assertEqual(self.lm.normalize(input_words), result_probabilities)

    def test_normalize_sum_probabilities(self):
        print("id: " + self.id())
        input_words = {'cat': 1, 'dog': 1}
        probabilities = self.lm.normalize(input_words)
        prob_sum = sum(probabilities.values())
        self.assertEqual(prob_sum, 1)

    def test_predict_next(self):
        print("id: " + self.id())
        input_tokens = [None, "zero", None, 'the', 'dog']
        result_probabilities = {'runs': 1}
        self.assertEqual(self.lm.p_next(input_tokens), result_probabilities)

    def test_sample(self):
        print("id: " + self.id())
        input_probability_distribution = {'heads': 0.5, 'tails': 0.5}
        predicted_word = self.lm.sample(input_probability_distribution)[0]
        self.assertIn(predicted_word, input_probability_distribution)

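# test_get_ngrams above fixes the padding behaviour: the token list is padded
# with n - 1 Nones on each side before taking a sliding window of size n. A
# minimal sketch that reproduces the expected tuples (an assumption about the
# real get_ngrams, not its actual implementation):
def _get_ngrams_sketch(tokens, n):
    padded = [None] * (n - 1) + list(tokens) + [None] * (n - 1)
    return [tuple(padded[i:i + n]) for i in range(len(padded) - n + 1)]
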
def test_p_next_sums_to_one():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert sum(lm.p_next(['this', 'text']).values()) == 1

def test_kn_produces_expected_values_n4():
    lm = LanguageModel(4)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.kn_evaluate(['shall', 'train', 'text', '.']) \
        == -0.7742507185722116

def test_models_have_correct_vocab_size():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.models[0].ngram_vocab_size == 7
    assert lm.models[1].ngram_vocab_size == 9

def test_laplace_produces_expected_values2():
    lm = LanguageModel(1)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.laplace_evaluate(['text']) == math.log(3 / 12)
    assert lm.laplace_evaluate(['dog']) == math.log(1 / 12)

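# These unigram values are plain add-one (Laplace) smoothing. Using the corpus
# statistics asserted elsewhere in this suite (num_tokens N = 7, unigram
# vocabulary V = 5): P('text') = (2 + 1) / (7 + 5) = 3/12, consistent with a
# count of 2 for 'text', and the unseen 'dog' gets (0 + 1) / (7 + 5) = 1/12.
# A minimal sketch (an assumption about the real laplace_evaluate):
def _laplace_sketch(count, num_tokens, vocab_size):
    # add-one smoothed log-probability of a single word
    return math.log((count + 1) / (num_tokens + vocab_size))
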
def test_discount():
    lm = LanguageModel(2)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.discount == 0.75

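# 0.75 is the conventional fixed discount for Kneser-Ney smoothing. A sketch
# of where it enters the interpolated probability (the lambda weighting and
# the continuation distribution p_cont are assumptions about this
# implementation, not its confirmed form):
def _kn_prob_sketch(c_hw, c_h, num_followers, p_cont, d=0.75):
    # subtract the discount from the observed count and hand the freed
    # probability mass to the lower-order continuation distribution
    lam = d * num_followers / c_h
    return max(c_hw - d, 0) / c_h + lam * p_cont
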
def test_lm_has_correct_number_tokens_and_unigram_types():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.num_tokens == 7
    assert len(lm.unigrams) == 5