def calculate_linear_perplexity(linear_model, lamda1, lambda2, lambda3, eval_path) -> float:
    """Compute perplexity of a linearly-interpolated trigram model on eval data.

    Args:
        linear_model: model exposing get_vocab() and get_prob(ngram, l1, l2, l3).
        lamda1, lambda2, lambda3: interpolation weights forwarded to get_prob.
            NOTE(review): 'lamda1' keeps the original (typo'd) public name for
            caller compatibility.
        eval_path: path to the evaluation corpus, one sentence per line.

    Returns:
        Perplexity rounded to 4 decimal places (a float, so the annotation is
        float, not int).
    """
    ngram_size = 3
    exponent = 0.0
    corpus_size = 0
    ngram_counter = 0
    # Replace out-of-vocabulary tokens first, mirroring training preprocessing.
    path_to_unked_data = write_new_training_data(
        eval_path, linear_model.get_vocab(), EVAL_UNKED_DATA_)
    # `with` guarantees the handle is closed even if get_prob/log2 raises.
    with open(path_to_unked_data, "r") as eval_stream:
        for sentence in eval_stream:
            sentence_tokens = pad_sentence(sentence, ngram_size - 1).split()
            # Don't count the start padding -- but do count the stop token.
            corpus_size += len(sentence.split()) + 1
            for i in range(len(sentence_tokens)):
                # Only full-width ngrams; skip tail positions that would
                # produce a short slice.
                if i + ngram_size - 1 < len(sentence_tokens):
                    ngram_key = tuple(sentence_tokens[i:i + ngram_size])
                    exponent += log2(
                        linear_model.get_prob(ngram_key, lamda1, lambda2, lambda3))
                    ngram_counter += 1
    # Perplexity is 2 to the power of the negative average log2 probability.
    perplexity = 2 ** -(exponent / corpus_size)
    return round(perplexity, 4)
def extract_vocab(self, training_data_path):
    """Build self.unigram_counter from the training corpus.

    Reads the corpus one sentence per line, pads each sentence, and
    delegates the actual counting to self.consume_sentence.

    Args:
        training_data_path: path to the training corpus file.
    """
    self.unigram_counter = {}  # token -> count; dict for O(1) updates
    # `with` closes the file even if consume_sentence raises mid-corpus.
    with open(training_data_path, "r") as training_data_file:
        for sentence in training_data_file:
            # Pad with start/stop markers, then update the model.
            self.consume_sentence(pad_sentence(sentence))
def calculate_ngram_perplexity(eval_path: str, vocab: dict, probs: dict, ngram_size: int, smoothed=False, report_mode=False) -> float:
    """Compute perplexity of an n-gram model over an evaluation corpus.

    Args:
        eval_path: path to the evaluation corpus, one sentence per line.
        vocab: known-vocabulary mapping used to UNK the eval data.
        probs: mapping from ngram tuple -> probability.
        ngram_size: the n of the n-gram model.
        smoothed: if True, unseen ngrams fall back to the probability stored
            under get_unk_tuple(ngram_size); if False they contribute -inf.
        report_mode: if True, print the perplexity before returning.

    Returns:
        Perplexity rounded to 4 decimal places (float -- the annotation was
        corrected from int, since round(x, 4) on a float returns a float).
    """
    exponent = 0.0
    corpus_size = 0
    ngram_counter = 0
    # Replace out-of-vocabulary tokens first, mirroring training preprocessing.
    path_to_unked_data = write_new_training_data(eval_path, vocab, EVAL_UNKED_DATA_)
    # `with` guarantees the stream is closed even if a lookup raises.
    with open(path_to_unked_data, "r") as eval_stream:
        for sentence in eval_stream:
            sentence_tokens = pad_sentence(sentence, ngram_size - 1).split()
            # Don't count the start padding -- but do count the stop token.
            corpus_size += len(sentence.split()) + 1
            # NOTE(review): unlike calculate_linear_perplexity, this loop does
            # not guard against short tail slices; positions near the end of
            # the sentence yield tuples shorter than ngram_size, which take
            # the unseen-ngram branch. Preserved as-is -- confirm intent.
            for i in range(len(sentence_tokens)):
                # Slice to get ngram-sized tuples.
                ngram_key = tuple(sentence_tokens[i:i + ngram_size])
                if probs.get(ngram_key):
                    exponent += log2(probs[ngram_key])
                elif not smoothed:
                    # Unsmoothed: an unseen ngram makes perplexity infinite.
                    exponent += -inf
                else:
                    # Universal smoothing -- words we knew but didn't see
                    # together share the ngram-sized UNK tuple's probability.
                    exponent += log2(probs[get_unk_tuple(ngram_size)])
                ngram_counter += 1
    # Perplexity is 2 to the power of the negative average log2 probability.
    perplexity = 2 ** -(exponent / corpus_size)
    if report_mode:
        print("Perplexity Score: {}".format(perplexity))
    return round(perplexity, 4)
def extract_ngrams(self, training_data_path, ngram_size):
    """Count ngram and ngram-prefix occurrences over the training corpus.

    Updates three instance structures:
      - self.ngram_key_occurrence: (n-1)-token prefix tuple -> sighting count.
      - self.ngram_sighting: full n-token tuple -> sighting count.
      - self.total_ngrams: running total of ngrams observed.

    Args:
        training_data_path: path to the training corpus, one sentence per line.
        ngram_size: the n of the n-gram model.
    """
    # `with` closes the file even if counting raises partway through.
    with open(training_data_path, "r") as training_data:
        for sentence in training_data:
            # Pad our sentences with starts and stops.
            sentence_tokens = pad_sentence(sentence, ngram_size - 1).split()
            for i in range(len(sentence_tokens)):
                # Only positions where a full ngram fits: don't overstep into
                # late-sentence words that would run past [STOP].
                if i < len(sentence_tokens) - (ngram_size - 1):
                    ngram_key = tuple(sentence_tokens[i:i + ngram_size - 1])
                    ngram_tuple = tuple(sentence_tokens[i:i + ngram_size])
                    # Increase how many sightings of the prefix we have.
                    self.ngram_key_occurrence[ngram_key] = \
                        self.ngram_key_occurrence.get(ngram_key, 0) + 1
                    # How many times we have seen this exact ngram.
                    self.ngram_sighting[ngram_tuple] = \
                        self.ngram_sighting.get(ngram_tuple, 0) + 1
                    # Count up the total number of ngrams we see.
                    self.total_ngrams += 1
def calculate_perplexity(eval_path: str, probs: dict, report_mode=False) -> float:
    """Compute unigram perplexity of a probability table over an eval corpus.

    Args:
        eval_path: path to the evaluation corpus, one sentence per line.
        probs: token -> probability mapping; unknown tokens fall back to
            probs[UNK_].
        report_mode: if True, print the perplexity before returning.

    Returns:
        Perplexity rounded to 4 decimal places (float -- the annotation was
        corrected from int, since round(x, 4) on a float returns a float).
    """
    exponent = 0.0
    corpus_size = 0
    unigram_counter = 0
    # `with` guarantees the stream is closed even if a lookup raises.
    with open(eval_path, "r") as eval_stream:
        for sentence in eval_stream:
            for token in pad_sentence(sentence).split():
                # Unknown tokens use the UNK_ probability mass.
                exponent += log2(probs.get(token, probs[UNK_]))
                corpus_size += 1
                unigram_counter += 1
    # Perplexity is 2 to the power of the negative average log2 probability.
    perplexity = 2 ** -(exponent / corpus_size)
    if report_mode:
        print("Perplexity Score: {}".format(perplexity))
    return round(perplexity, 4)
def test_pad_sentence_adds_starts_if_asked(self):
    """pad_sentence prepends exactly one START_ marker per requested pad."""
    for pad_count in (1, 2, 3):
        expected = " ".join([START_] * pad_count) + " sentence " + STOP_
        self.assertEqual(expected, pad_sentence("sentence", pad_count))
def test_pad_sentence_adds_whitespace_and_special_stop(self):
    """With no pad count, pad_sentence only appends a space and STOP_."""
    expected = "sentence " + STOP_
    self.assertEqual(expected, pad_sentence("sentence"))