def main(argv):
    """Trains an nltk language model.

    Loads in files of normalized text, partitions them into a train
    partition (3/4 of data) and a test partition (last 1/4 of data).
    Uses Laplace smoothing for unseen ngrams.
    """
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")

    normalized_data = load_normalized_data(FLAGS.language, FLAGS.data_source,
                                           FLAGS.pass_valid, FLAGS.experiment)
    train_partition, test_partition = partition_data(normalized_data)

    train_ngrams, vocab = padded_everygram_pipeline(2, train_partition)
    test_ngrams, _ = padded_everygram_pipeline(2, test_partition)

    language_model = Laplace(2)
    language_model.fit(train_ngrams, vocab)

    avg_perp, count = compute_avg_perplexity(test_ngrams, language_model)

    print("\n----------------------------\n"
          "Language Model Parameters:\n"
          f"\tLanguage={FLAGS.language}\n"
          f"\tData Sources={FLAGS.data_source}\n"
          f"\tPass Valid={FLAGS.pass_valid}\n"
          f"\tExperiment={FLAGS.experiment}\n"
          "----------------------------\n")
    print(f"Average perplexity across {count} ngrams:\t{avg_perp}")
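
# Note: compute_avg_perplexity is not shown in the snippet above. A minimal,
# hypothetical sketch of such a helper (the name and the (avg_perp, count)
# return shape come from the call site; the body is an assumption):
def compute_avg_perplexity(test_ngrams, language_model):
    """Score each test ngram individually and average the perplexities."""
    perplexities = []
    for sentence_ngrams in test_ngrams:
        for ngram in sentence_ngrams:
            perplexities.append(language_model.perplexity([ngram]))
    count = len(perplexities)
    avg_perp = sum(perplexities) / count if count else float("inf")
    return avg_perp, count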
def test_d2_1_gp():
    global food_corpus, natr_corpus

    food_corpus_tk = train.tokenize_corpus(food_corpus)
    natr_corpus_tk = train.tokenize_corpus(natr_corpus)

    food_train, food_vocab = padded_everygram_pipeline(
        3, food_corpus_tk[:int(0.8 * len(food_corpus_tk))])
    natr_train, natr_vocab = padded_everygram_pipeline(
        3, natr_corpus_tk[:int(0.8 * len(natr_corpus_tk))])

    food_test = sum([['<s>'] + x + ['</s>']
                     for x in food_corpus_tk[int(0.8 * len(food_corpus_tk)):]], [])
    natr_test = sum([['<s>'] + x + ['</s>']
                     for x in natr_corpus_tk[int(0.8 * len(natr_corpus_tk)):]], [])

    food_lm = Laplace(3)
    natr_lm = Laplace(3)
    food_lm.fit(food_train, food_vocab)
    natr_lm.fit(natr_train, natr_vocab)

    eq_(int(evaluate.get_perplexity(food_lm, food_test[:2500])), 7318)
    eq_(int(evaluate.get_perplexity(food_lm, natr_test[:2500])), 7309)
    eq_(int(evaluate.get_perplexity(natr_lm, natr_test[:2500])), 5222)
    eq_(int(evaluate.get_perplexity(natr_lm, food_test[:2500])), 5354)
def test_d2_1_gp(self):
    nltk.download('punkt')

    food_corpus_tk = lab3.tokenize_corpus(self.food_corpus)
    natr_corpus_tk = lab3.tokenize_corpus(self.natr_corpus)

    food_train, food_vocab = padded_everygram_pipeline(
        3, food_corpus_tk[:int(0.8 * len(food_corpus_tk))])
    natr_train, natr_vocab = padded_everygram_pipeline(
        3, natr_corpus_tk[:int(0.8 * len(natr_corpus_tk))])

    food_test = sum(
        [['<s>'] + x + ['</s>']
         for x in food_corpus_tk[int(0.8 * len(food_corpus_tk)):]], [])
    natr_test = sum(
        [['<s>'] + x + ['</s>']
         for x in natr_corpus_tk[int(0.8 * len(natr_corpus_tk)):]], [])

    food_lm = Laplace(3)
    natr_lm = Laplace(3)
    food_lm.fit(food_train, food_vocab)
    natr_lm.fit(natr_train, natr_vocab)

    self.assertEqual(int(lab3.get_perplexity(food_lm, food_test[:2500])), 7318)
    self.assertEqual(int(lab3.get_perplexity(food_lm, natr_test[:2500])), 7309)
    self.assertEqual(int(lab3.get_perplexity(natr_lm, natr_test[:2500])), 5222)
    self.assertEqual(int(lab3.get_perplexity(natr_lm, food_test[:2500])), 5354)
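
# Note: both tests above rely on a get_perplexity helper (in the evaluate /
# lab3 module) that is not shown here. A minimal, hypothetical sketch, assuming
# it turns the flat <s>/</s>-padded token list into ngrams of the model's order
# before calling NLTK's built-in perplexity; the actual helper behind the
# asserted values may differ:
from nltk.util import ngrams


def get_perplexity(lm, tokens):
    # Build ngrams of the model's order from the flat token list and score them.
    return lm.perplexity(ngrams(tokens, lm.order))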
class LaplaceBigramTests(unittest.TestCase):
    """Unit tests for the Laplace class."""

    score_tests = [
        # Basic sanity check:
        # count(d | c) = 1
        # *count(d | c) = 2
        # Count(w | c for w in vocab) = 1
        # *Count(w | c for w in vocab) = 9
        ("d", ["c"], 2.0 / 9),
        # Total unigrams: 14
        # Vocab size: 8
        # Denominator: 14 + 8 = 22
        # count("a") = 2
        # *count("a") = 3
        ("a", None, 3.0 / 22),
        # In vocabulary but unseen:
        # count("z") = 0
        # *count("z") = 1
        ("z", None, 1.0 / 22),
        # Out of vocabulary should use the "UNK" score:
        # count("<UNK>") = 3
        # *count("<UNK>") = 4
        ("y", None, 4.0 / 22),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(2)
        self.model = Laplace(2, vocabulary=vocab)
        self.model.fit(training_text)

    def test_gamma(self):
        # Make sure gamma is set to 1.
        self.assertEqual(1, self.model.gamma)

    def test_entropy_perplexity(self):
        text = [
            ("<s>", "a"),
            ("a", "c"),
            ("c", "<UNK>"),
            ("<UNK>", "d"),
            ("d", "c"),
            ("c", "</s>"),
        ]
        # Unlike MLE, this should be able to handle completely novel ngrams.
        # Ngram     = score,  log score
        # <s>, a    = 0.2,    -2.3219
        # a, c      = 0.1,    -3.3219
        # c, UNK    = 0.(1),  -3.1699
        # UNK, d    = 0.(09), -3.4594
        # d, c      = 0.1,    -3.3219
        # c, </s>   = 0.(1),  -3.1699
        # Total log scores: -18.7651
        # Negative average log score (entropy): 3.1275
        H = 3.1275
        perplexity = 8.7393
        self.assertAlmostEqual(H, self.model.entropy(text), places=4)
        self.assertAlmostEqual(perplexity, self.model.perplexity(text), places=4)
def nltk_ngram_perplexity(train, test):
    # Unigram
    train_sentences = [line.strip() for line in open(train, 'r')]
    tokenized_text = [list(nltk.tokenize.word_tokenize(sent))
                      for sent in train_sentences]
    single_line = [list(itertools.chain.from_iterable(tokenized_text))]

    n = 1
    # train_data = [ngrams(sent, 1) for sent in tokenized_text]
    train_data = [ngrams(sent, 1) for sent in single_line]
    model = Laplace(n)

    words = [word for sent in tokenized_text for word in sent]
    padded_vocab = Vocabulary(words)
    model.fit(train_data, padded_vocab)

    test_sentences = [line.strip() for line in open(test, 'r')]
    tokenized_text = [list(nltk.tokenize.word_tokenize(sent))
                      for sent in test_sentences]
    single_line = [list(itertools.chain.from_iterable(tokenized_text))]

    # test_data = [ngrams(sent, 1) for sent in tokenized_text]
    test_data = [ngrams(sent, 1) for sent in single_line]
    for i, test_d in enumerate(test_data):
        print(f'unigram: {model.perplexity(test_d)}')
        # print(model.entropy(test_d))

    # Bigram
    train_sentences = [line.strip() for line in open(train, 'r')]
    tokenized_text = [list(nltk.tokenize.word_tokenize(sent))
                      for sent in train_sentences]
    single_line = [list(itertools.chain.from_iterable(tokenized_text))]

    n = 2
    # train_data = [ngrams_pad(sent, n) for sent in tokenized_text]
    train_data = [ngrams_pad(sent, n) for sent in single_line]
    model = Laplace(n)

    words = [word for sent in tokenized_text for word in sent]
    words.extend(["<s>", "</s>"])
    padded_vocab = Vocabulary(words)
    model.fit(train_data, padded_vocab)

    test_sentences = [line.strip() for line in open(test, 'r')]
    tokenized_text = [list(nltk.tokenize.word_tokenize(sent))
                      for sent in test_sentences]
    single_line = [list(itertools.chain.from_iterable(tokenized_text))]

    # test_data = [ngrams_pad(sent, n) for sent in tokenized_text]
    test_data = [ngrams_pad(sent, n) for sent in single_line]
    for i, test_d in enumerate(test_data):
        print(f'bigram: {model.perplexity(test_d)}')
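
# Note: ngrams_pad is not defined in the snippet above. A minimal, hypothetical
# sketch, assuming it pads a token sequence with "<s>"/"</s>" (which is why the
# vocabulary above is extended with those symbols) and emits order-n ngrams:
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import ngrams


def ngrams_pad(tokens, n):
    # Pad both ends of the sequence and return its order-n ngrams.
    return list(ngrams(pad_both_ends(tokens, n=n), n))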
def treinando_modelo_Laplace(lista_de_textos):
    # Join all sentences into a single string.
    todas_as_questoes = " ".join(lista_de_textos)

    # Split the text into words on whitespace.
    todas_as_palavras = WhitespaceTokenizer().tokenize(todas_as_questoes)

    # Add the padding (fake) characters to each word and build the vocabulary
    # (in this case, the letters of each word).
    treino_bigram, vocab = padded_everygram_pipeline(2, todas_as_palavras)

    # Create the bigram model with Laplace smoothing.
    modelo = Laplace(2)

    # Train the model.
    modelo.fit(treino_bigram, vocab)

    return modelo
def compute_pp(self, n, tokenized_train, tokenized_test):
    train_data, train_vocab = padded_everygram_pipeline(n, tokenized_train)
    test_data, _ = padded_everygram_pipeline(n, tokenized_test)

    model = Laplace(n)
    model.fit(train_data, train_vocab)

    # Average the per-sentence perplexity over the test data.
    s = 0
    for i, test in enumerate(test_data):
        p = model.perplexity(test)
        s += p
    perplexity = s / (i + 1)
    return perplexity
def fit_mle_model(text, text_dict):
    # text_dict maps index -> token text; the tokenizer does not provide this
    # mapping by default, so the dictionary has to be inverted beforehand.
    model = Laplace(2)
    tokenized_text = [[text_dict[index] for index in sentence]
                      for sentence in text]
    train_data = [list(nltk.bigrams(t)) for t in tokenized_text]

    # Drop every bigram that contains an 'UNK' token.
    train_data_without_unk = []
    for bigrams in train_data:
        filtered_text = []
        for bigram in bigrams:
            if bigram[0] != 'UNK' and bigram[1] != 'UNK':
                filtered_text.append(bigram)
        train_data_without_unk.append(filtered_text)

    words = [word for sentence in tokenized_text for word in sentence]
    vocab = Vocabulary(words)
    model.fit(train_data_without_unk, vocab)
    return model
def make_all_model(self):
    texts = []
    for e in self.db:
        for l in e.lexes:
            texts.append(l['text'])

    tokenized_texts = [normalize_text(x).split() for x in texts]

    n = 3
    train_data, padded_texts = padded_everygram_pipeline(n, tokenized_texts)

    model = Laplace(n)
    model.fit(train_data, padded_texts)

    return model
def __init__(self, n=3):
    tokens = []
    for book in shakespeare.fileids():
        elt = shakespeare.xml(book)
        for node in elt.iter():
            lines = node.findall("LINE")
            for line in lines:
                # Character-level tokens for each line, wrapped in <L> ... </L>.
                line_tokens = list(str(line.text))
                line_tokens.insert(0, "<L>")
                line_tokens.append("</L>")
                tokens.append(line_tokens)

    t = (everygrams(x, max_len=n) for x in tokens)
    v = flatten(tokens)

    lm = Laplace(order=n)  # add-one smoothing
    lm.fit(t, v)

    self._n = n
    self._lm = lm
    self._tokenize_pattern = re.compile(r'(<L>)|(</L>)')
def make_model(self, e):
    target_triples = set(e.triples)

    texts = []
    for e_ in self.db:
        if target_triples.intersection(e_.triples):
            for l in e_.lexes:
                texts.append(l['text'])

    tokenized_texts = [normalize_text(x).split() for x in texts]

    n = 3
    train_data, padded_texts = padded_everygram_pipeline(n, tokenized_texts)

    model = Laplace(n)
    model.fit(train_data, padded_texts)

    return model
def likelihoods_gen(ngrams_dir=NGRAMS_DIR, n_gram=N_GRAM):
    with open(ngrams_dir, 'rb') as pickle_in:
        ngrams = pickle.load(pickle_in, encoding='utf8')

    tokenized_train_corpus = Tokenizer.job_data_tokenizer(TRAIN_CORPUS_DIR)
    train_data, padded_sents = padded_everygram_pipeline(
        N_GRAM, tokenized_train_corpus)

    # N-gram model with Laplace smoothing (gamma is always 1).
    lm = Laplace(n_gram)
    lm.fit(train_data, padded_sents)

    likelihoods = defaultdict(list)

    # Likelihood estimation for the loaded ngrams.
    for k in ngrams:
        for ng in ngrams[k]:
            # The ngram order is determined by the number of splits in its sentence.
            tokens = ng.split(' ')
            # Score a word given some optional context; unseen words are
            # assigned probability 0.
            x, y = tokens[-1], tuple(tokens[:-1])
            score = lm.unmasked_score(x, context=y)  # P('x' | 'y')
            # Map each context y to a list of possible next words and their scores.
            if score != 0:
                likelihoods[y].append((score, x))

    with open('bin/likelihoods.pkl', 'wb') as output:
        pickle.dump(likelihoods, output)

    def evaluate():
        with open(TEST_CORPUS_DIR, 'rb') as pickle_in:
            test_corpus = pickle.load(pickle_in, encoding='utf8')

        # Evaluate the entropy of the test corpus with respect to the model:
        # the average negative log2 probability per ngram.
        file = open('bin/model_evaluation.txt', 'w')
        file.write('Model Evaluation Score (Entropy): {}'.format(
            lm.entropy(test_corpus)))
        file.close()

    evaluate()
def vary_ngram(train_corpus, test_corpus, n_gram_orders):
    '''
    Uses nltk.lm.Laplace for training.
    Returns a dictionary of perplexity values for LMs of different n-gram orders.

    :param train_corpus: list of list of str, corpus to train the language model on.
    :param test_corpus: list of list of str, corpus to test the language model on.
    :param n_gram_orders: list of ints, the desired n-gram orders.
    :returns: a dictionary of perplexities at different orders, key=order, value=perplexity.
    :rtype: dict

    Hint: Follow the same LM training procedure as in the notebook at the end
    of Exercise 1.
    '''
    test = sum([['<s>'] + x + ['</s>'] for x in test_corpus], [])
    ret = {}

    for order in n_gram_orders:
        train, vocab = padded_everygram_pipeline(order, train_corpus)
        lm = Laplace(order)
        lm.fit(train, vocab)
        ret[order] = lm.perplexity(test)

    return ret
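
# A hypothetical usage sketch for vary_ngram; the toy corpora and orders below
# are made up for illustration only.
train_corpus = [['the', 'cat', 'sat'], ['the', 'dog', 'ran'], ['a', 'cat', 'ran']]
test_corpus = [['the', 'cat', 'ran']]

perplexities = vary_ngram(train_corpus, test_corpus, [1, 2, 3])
for order, pp in sorted(perplexities.items()):
    print(f'order {order}: perplexity {pp:.2f}')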
class NGramSentences:
    def __init__(self, n=3, filename='cache/book.txt'):
        with open(filename) as file:
            text = file.read()

        # Sentence-split, word-tokenize, and lowercase the book text.
        tokens = [
            list(map(str.lower, word_tokenize(sent)))
            for sent in sent_tokenize(text)
        ]

        train, vocab = padded_everygram_pipeline(n, tokens)
        self.model = Laplace(n)
        self.model.fit(train, vocab)

    def generate(self, prev_word='<s>', max_words=25):
        # Generate up to max_words tokens, dropping leading "<s>" padding and
        # stopping at the first "</s>".
        return detokenize(
            list(
                itertools.takewhile(
                    lambda word: word != '</s>',
                    itertools.dropwhile(
                        lambda word: word == '<s>',
                        (word for word in self.model.generate(
                            max_words, text_seed=[prev_word]))))))
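
# Hypothetical usage; assumes 'cache/book.txt' exists and that word_tokenize,
# sent_tokenize, and detokenize are imported elsewhere in the module.
generator = NGramSentences(n=3, filename='cache/book.txt')
print(generator.generate(prev_word='the', max_words=15))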
    entropy = -1 * mean
    perplexity = pow(2.0, entropy)
    return perplexity


def avg_sent_perplexity(corpus, lm):
    perplexities = []
    for sent in corpus:
        ngrams = [ngram for ngram in sent]
        perplexities.append(lm.perplexity(ngrams))
    return sum(perplexities) / len(perplexities)


if __name__ == '__main__':
    args = parse_args()

    lm = Laplace(args.n)  # smoothing

    if args.train is not None:
        train_corpus = load_corpus(args.train)
    else:
        train_corpus = brown.sents()

    train, vocab = padded_everygram_pipeline(args.n, train_corpus)
    lm.fit(train, vocab)

    for test_file in args.corpora:
        test_corpus = load_corpus(test_file)
        test, vocab = padded_everygram_pipeline(args.n, test_corpus)
        perplexity = avg_sent_perplexity(test, lm)
        print('{}: {}'.format(test_file, perplexity))
class Viterbi():
    def __init__(self, num_of_tags, num_of_vocab, tags, bigram_tags,
                 sents_tags: List[Tuple[List, List]]):
        self.num_of_tags = num_of_tags
        self.num_of_vocab = num_of_vocab
        self.tags = list(tags)
        self.bigram_tags = bigram_tags
        self.sents_tags = sents_tags
        # Laplace-smoothed bigram model over the tag sequence (transition probabilities).
        self.laplace = Laplace(2)
        self.laplace.fit(text=[self.bigram_tags], vocabulary_text=self.tags)
        self.cache = {}

    def _get_prob_wi_ti(self, word, tag):
        # Add-one smoothed emission probability P(word | tag).
        key = word + tag
        if key in self.cache:
            return self.cache[key]

        tag_cnt = 0
        word_tag_cnt = 0
        for s, t in self.sents_tags:
            if len(s) != len(t):
                raise Exception(
                    f'sentence and tag are not aligned.\n sentence:{s}\n tag:{t}'
                )
            tag_cnt += t.count(tag)
            word_tag_cnt += len(
                [i for i in range(len(s)) if s[i] == word and t[i] == tag])

        self.cache[key] = (word_tag_cnt + 1) / (tag_cnt + self.num_of_vocab)
        return self.cache[key]

    def _get_prob_ti_1_and_ti(self, ti_1: str, ti: str):
        # Transition probability P(ti | ti-1) from the smoothed tag bigram model.
        return self.laplace.score(ti, [ti_1])

    def viterbi(self, sentence):
        """
        sentence = ['من', 'به', 'مدرسه', 'رفتم']
        """
        # Initialize the Viterbi and backtrace tables.
        viterbi = [[0 for _ in range(len(sentence) + 1)]
                   for _ in range(self.num_of_tags)]
        backtrace = [[0 for _ in range(len(sentence) + 1)]
                     for _ in range(self.num_of_tags)]

        for index in range(len(self.tags)):
            p_trans = self._get_prob_ti_1_and_ti('<s>', self.tags[index])
            p_emis = self._get_prob_wi_ti(sentence[0], self.tags[index])
            viterbi[index][0] = p_trans * p_emis
            backtrace[index][0] = 0

        for w_index in range(1, len(sentence)):
            cur_word = sentence[w_index]
            for t_index in range(len(self.tags)):
                cur_tag = self.tags[t_index]
                p_emis = self._get_prob_wi_ti(cur_word, cur_tag)
                tmp = [
                    viterbi[i][w_index - 1] *
                    self._get_prob_ti_1_and_ti(self.tags[i], cur_tag) * p_emis
                    for i in range(len(self.tags))
                ]
                viterbi[t_index][w_index] = max(tmp)
                backtrace[t_index][w_index] = np.argmax([
                    viterbi[i][w_index - 1] *
                    self._get_prob_ti_1_and_ti(self.tags[i], cur_tag)
                    for i in range(len(self.tags))
                ])

        viterbi[-1][-1] = max([
            viterbi[i][len(sentence) - 1] *
            self._get_prob_ti_1_and_ti(self.tags[i], '</s>')
            for i in range(len(self.tags))
        ])
        backtrace[-1][-1] = np.argmax([
            viterbi[i][len(sentence) - 1] *
            self._get_prob_ti_1_and_ti(self.tags[i], '</s>')
            for i in range(len(self.tags))
        ])

        result = backtrace[-1][1:]  # the last row contains the indices of the tags
        return (['<s>'] +
                [self.tags[result[i]] for i in range(0, len(result))] +
                ['</s>'])
def laplace_bigram_model(bigram_training_data, vocabulary):
    model = Laplace(2, vocabulary=vocabulary)
    model.fit(bigram_training_data)
    return model
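
# A hypothetical usage sketch: pad the sentences, extract bigrams, and build an
# nltk.lm.Vocabulary before calling laplace_bigram_model. The toy sentences and
# unk_cutoff below are illustrative, not from the original source.
from nltk.lm import Vocabulary
from nltk.lm.preprocessing import flatten, pad_both_ends
from nltk.util import bigrams

sentences = [['the', 'cat', 'sat'], ['the', 'dog', 'barked']]
padded = [list(pad_both_ends(sent, n=2)) for sent in sentences]
bigram_training_data = [list(bigrams(sent)) for sent in padded]
vocabulary = Vocabulary(flatten(padded), unk_cutoff=1)

model = laplace_bigram_model(bigram_training_data, vocabulary)
print(model.score('cat', ['the']))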
from nltk import trigrams
from nltk.lm import Laplace, Vocabulary
from nltk.lm.preprocessing import flatten, pad_both_ends
from nltk.util import everygrams

# `text` and `y` are lists of tokenized sentences defined earlier in the script.
# Note: despite the name, bigramsList actually holds trigrams.
bigramsList = list(map(lambda x: list(trigrams(x)), y))
bigramsList = list(flatten(bigramsList))
# list(everygrams(bigramsList, max_len=2))

vocab = list(flatten(pad_both_ends(sent, n=2) for sent in text))
vocab = list(Vocabulary(vocab, unk_cutoff=1))

'''
from nltk.lm.preprocessing import padded_everygram_pipeline
train, vocab = padded_everygram_pipeline(2, text)
'''

lm = Laplace(3)
lm.fit([bigramsList], vocabulary_text=list(vocab))

lm.generate(4, text_seed=["government", "had"])


def generateSentences(v):
    # Keep sampling one word at a time, conditioned on the last two words,
    # until the end-of-sentence marker is generated.
    sent = v
    v = [lm.generate(1, text_seed=v)]
    sent = sent + v
    while v[0] != '</s>':
        l = len(sent)
        v = [lm.generate(1, text_seed=[sent[l - 2], sent[l - 1]])]
        sent = sent + v
    return sent
def ngram_model(train_data, vocab, n):
    model = Laplace(n)
    model.fit(train_data, vocab)
    return model
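
# A hypothetical usage sketch, assuming train_data and vocab come from NLTK's
# padded_everygram_pipeline; the toy corpus below is made up for illustration.
from nltk.lm.preprocessing import padded_everygram_pipeline

corpus = [['hello', 'world'], ['hello', 'there'], ['good', 'morning', 'world']]
n = 2
train_data, vocab = padded_everygram_pipeline(n, corpus)
model = ngram_model(train_data, vocab, n)

# Probability of "world" following "hello" under the Laplace-smoothed bigram model.
print(model.score('world', ['hello']))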