def test_add_one_word(self):
    """Assert that adding a single word to a newly constructed Vocabulary works as expected"""
    v = Vocabulary()
    i = v.get_int("hello")
    self.assertEqual(i, 0)
    self.assertEqual(v.size(), 1)
    self.assertEqual(v.words(), ["hello"])
    self.assertEqual(v.get_int("hello"), i)
    self.assertEqual(v.get_word(0), "hello")
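These assertions pin down the Vocabulary interface: get_int assigns the next unused id to an unseen word and returns the stored id on later calls, get_word inverts that mapping, and words/size report the contents. A minimal sketch consistent with the tests, not the assignment's reference implementation:

class Vocabulary:

    def __init__(self):
        self.word2int = {}   # word -> int
        self.int2word = []   # int  -> word

    def get_int(self, word):
        # Assign the next unused id on first sight; return the stored id thereafter
        if word not in self.word2int:
            self.word2int[word] = len(self.int2word)
            self.int2word.append(word)
        return self.word2int[word]

    def get_word(self, i):
        return self.int2word[i]

    def words(self):
        return list(self.int2word)

    def size(self):
        return len(self.int2word)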
import math

import nltk


class ParallelCorpus:

    # Define a constructor
    def __init__(self):
        # List of English sentences. Each sentence will be represented as a list of ints.
        self.e = list()
        # List of foreign sentences. Each sentence will be represented as a list of ints.
        self.f = list()
        # Initially empty vocabularies
        self.e_vocab = Vocabulary()
        self.f_vocab = Vocabulary()

    # Returns the number of sentence pairs that have been added to this parallel corpus
    def size(self):
        return len(self.e)

    # Returns the list of integers corresponding to the English sentence at the specified sentence index
    def get_e(self, sentence_index):
        return self.e[sentence_index]

    # Returns the list of integers corresponding to the foreign sentence at the specified sentence index
    def get_f(self, sentence_index):
        return self.f[sentence_index]

    # Given a string representing an English sentence
    # and a string representing a foreign sentence,
    # tokenize each string using nltk.word_tokenize,
    # and use the appropriate vocabulary to convert each token to an int.
    #
    # Append the list of integers (corresponding to the English sentence) to self.e
    # Append the list of integers (corresponding to the foreign sentence) to self.f
    def add(self, e, f):
        # Tokenize the sentences
        english_words = nltk.word_tokenize(e)
        foreign_words = nltk.word_tokenize(f)
        # Lists that will hold one int per token
        english_list = list()
        foreign_list = list()
        # Convert each token to its int using the corresponding vocabulary
        for english_word in english_words:
            english_list.append(self.e_vocab.get_int(english_word))
        for foreign_word in foreign_words:
            foreign_list.append(self.f_vocab.get_int(foreign_word))
        # Append the lists of ints to self.e and self.f
        self.e.append(english_list)
        self.f.append(foreign_list)

    # Construct a conditional distribution with the given name.
    #
    # Use the formula given in the supplementary instructions
    def create_uniform_distribution(self, name):
        # Uniform initial value: 1 / |foreign vocabulary|
        init_val = 1 / self.f_vocab.size()
        return Conditional(name, self.e_vocab, self.f_vocab, init_val)

    # Given a sentence index, a scaling factor epsilon, and a conditional distribution,
    # calculate the conditional probability
    # of the English sentence (at that sentence index)
    # given the foreign sentence (at that sentence index)
    #
    # Use the formula given in the supplementary instructions
    def conditional_probability(self, sentence_index, epsilon, conditional):
        # Get the int lists for the sentences at sentence_index
        english = self.get_e(sentence_index)
        foreign = self.get_f(sentence_index)
        # Scaling factor epsilon / lf^le, matching the formula used by the
        # other implementations in this section (the original divided by
        # len(english) * len(foreign), which disagrees with them)
        factor = epsilon / (len(foreign) ** len(english))
        # Sum t(e_j | f_i) over every English/foreign word pair
        sum_of_sums = 0
        for j in range(len(english)):
            for i in range(len(foreign)):
                sum_of_sums += conditional.get(english[j], foreign[i])
        return factor * sum_of_sums

    # Given a conditional distribution and a scaling factor epsilon,
    # calculate the perplexity of this parallel corpus.
    #
    # Use the formula given in the supplementary instructions
    def perplexity(self, epsilon, conditional):
        # Sum the log2 conditional probability of every sentence pair, then negate
        sum_log_prob = 0
        for s in range(self.size()):
            prob = self.conditional_probability(s, epsilon, conditional)
            sum_log_prob += math.log2(prob)
        return -1 * sum_log_prob
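create_uniform_distribution and conditional_probability assume a Conditional class whose constructor takes a name, the two vocabularies, and an initial value, and whose get(e_int, f_int) returns t(e | f). A minimal dense-table sketch under those assumptions; the class name and constructor signature come from the code above, while the internals (and the set method) are hypothetical:

class Conditional:

    def __init__(self, name, e_vocab, f_vocab, initial_value):
        self.name = name
        self.e_vocab = e_vocab
        self.f_vocab = f_vocab
        # t[e][f] holds t(e | f); every entry starts at the uniform initial value
        self.t = [[initial_value] * f_vocab.size() for _ in range(e_vocab.size())]

    def get(self, e_int, f_int):
        # Return t(e | f) for the given word ints
        return self.t[e_int][f_int]

    def set(self, e_int, f_int, value):
        # Hypothetical mutator; an EM training step would need something like it
        self.t[e_int][f_int] = value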
def test_empty(self):
    """Assert that a newly constructed Vocabulary has size zero"""
    v = Vocabulary()
    self.assertEqual(v.size(), 0)
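Both test methods are written against unittest.TestCase assertions; a sketch of the harness they presumably live in (the class name TestVocabulary is hypothetical):

import unittest

class TestVocabulary(unittest.TestCase):

    def test_empty(self):
        """Assert that a newly constructed Vocabulary has size zero"""
        v = Vocabulary()
        self.assertEqual(v.size(), 0)

    # test_add_one_word from above would be a method here as well

if __name__ == "__main__":
    unittest.main()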
import math

import nltk


class ParallelCorpus:

    # Define a constructor
    def __init__(self):
        # List of English sentences. Each sentence will be represented as a list of ints.
        self.e = list()
        # List of foreign sentences. Each sentence will be represented as a list of ints.
        self.f = list()
        # Initially empty vocabularies
        self.e_vocab = Vocabulary()
        self.f_vocab = Vocabulary()

    # Returns the number of sentence pairs that have been added to this parallel corpus
    def size(self):
        return len(self.e)

    # Returns the list of integers corresponding to the English sentence at the specified sentence index
    def get_e(self, sentence_index):
        return self.e[sentence_index]

    # Returns the list of integers corresponding to the foreign sentence at the specified sentence index
    def get_f(self, sentence_index):
        return self.f[sentence_index]

    # Given a string representing an English sentence
    # and a string representing a foreign sentence,
    # tokenize each string using nltk.word_tokenize,
    # and use the appropriate vocabulary to convert each token to an int.
    #
    # Append the list of integers (corresponding to the English sentence) to self.e
    # Append the list of integers (corresponding to the foreign sentence) to self.f
    def add(self, e, f):
        # Build the list of ints for the English sentence
        wordlist = []
        words = nltk.word_tokenize(e)
        for word in words:
            wordlist.append(self.e_vocab.get_int(word))
        self.e.append(wordlist)
        # Build the list of ints for the foreign sentence
        wordlist = []
        words = nltk.word_tokenize(f)
        for word in words:
            wordlist.append(self.f_vocab.get_int(word))
        self.f.append(wordlist)

    # Construct a conditional distribution with the given name.
    #
    # Use the formula given in the supplementary instructions
    def create_uniform_distribution(self, name):
        # Uniform initial probability: 1 / |foreign vocabulary|
        initial_prob = 1.0 / self.f_vocab.size()
        return Conditional(name, self.e_vocab, self.f_vocab, initial_prob)

    # Given a sentence index, a scaling factor epsilon, and a conditional distribution,
    # calculate the conditional probability
    # of the English sentence (at that sentence index)
    # given the foreign sentence (at that sentence index)
    #
    # Use the formula given in the supplementary instructions
    def conditional_probability(self, sentence_index, epsilon, conditional):
        sent_e = self.e[sentence_index]
        sent_f = self.f[sentence_index]
        le = len(sent_e)
        lf = len(sent_f)
        tsum = 0
        for j in range(le):
            for i in range(lf):
                # Look up t(e_j | f_i) using the word ints; the original
                # passed the loop indices j and i, which is a bug
                tsum = tsum + conditional.get(sent_e[j], sent_f[i])
        # Scaling factor epsilon / lf^le
        lftole = math.pow(lf, le)
        p = epsilon / lftole * tsum
        return p

    # Given a conditional distribution and a scaling factor epsilon,
    # calculate the perplexity of this parallel corpus.
    #
    # Use the formula given in the supplementary instructions
    def perplexity(self, epsilon, conditional):
        # Negated sum of log2 conditional probabilities over all sentence pairs
        s = self.size()
        pp = 0
        for i in range(s):
            p = self.conditional_probability(i, epsilon, conditional)
            pp = pp + math.log(p, 2)
        pp = -pp
        return pp
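A short usage sketch of the ParallelCorpus API above. The sentence pairs and epsilon = 1.0 are arbitrary illustrations, and nltk.word_tokenize needs the "punkt" tokenizer data downloaded once:

# nltk.download("punkt")  # one-time download of the tokenizer data

corpus = ParallelCorpus()
corpus.add("the house is small", "das Haus ist klein")
corpus.add("the book is good", "das Buch ist gut")

t = corpus.create_uniform_distribution("t")      # uniform t(e | f)
p = corpus.conditional_probability(0, 1.0, t)    # P(e_0 | f_0) with epsilon = 1.0
pp = corpus.perplexity(1.0, t)                   # perplexity of the whole corpus
print(p, pp)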
import math

import nltk


class ParallelCorpus:

    # Define a constructor
    def __init__(self):
        # List of English sentences. Each sentence will be represented as a list of ints.
        self.e = list()
        # List of foreign sentences. Each sentence will be represented as a list of ints.
        self.f = list()
        # Initially empty vocabularies
        self.e_vocab = Vocabulary()
        self.f_vocab = Vocabulary()

    # Returns the number of sentence pairs that have been added to this parallel corpus
    def size(self):
        return len(self.e)

    # Returns the list of integers corresponding to the English sentence at the specified sentence index
    def get_e(self, sentence_index):
        return self.e[sentence_index]

    # Returns the list of integers corresponding to the foreign sentence at the specified sentence index
    def get_f(self, sentence_index):
        return self.f[sentence_index]

    # Given a string representing an English sentence
    # and a string representing a foreign sentence,
    # tokenize each string using nltk.word_tokenize,
    # and use the appropriate vocabulary to convert each token to an int.
    #
    # Append the list of integers (corresponding to the English sentence) to self.e
    # Append the list of integers (corresponding to the foreign sentence) to self.f
    def add(self, e, f):
        words_e = nltk.word_tokenize(e)
        words_f = nltk.word_tokenize(f)
        sent_e, sent_f = list(), list()
        for single_e in words_e:
            sent_e.append(self.e_vocab.get_int(single_e))
        for single_f in words_f:
            sent_f.append(self.f_vocab.get_int(single_f))
        self.e.append(sent_e)
        self.f.append(sent_f)

    # Construct a conditional distribution with the given name.
    #
    # Use the formula given in the supplementary instructions
    def create_uniform_distribution(self, name):
        # Uniform initial value: 1 / |foreign vocabulary|
        return Conditional(name, self.e_vocab, self.f_vocab, 1 / self.f_vocab.size())

    # Given a sentence index, a scaling factor epsilon, and a conditional distribution,
    # calculate the conditional probability
    # of the English sentence (at that sentence index)
    # given the foreign sentence (at that sentence index)
    #
    # Use the formula given in the supplementary instructions
    def conditional_probability(self, sentence_index, epsilon, conditional):
        sent_e = self.get_e(sentence_index)
        sent_f = self.get_f(sentence_index)
        # Scaling factor epsilon / lf^le
        frac = epsilon / (len(sent_f) ** len(sent_e))
        # Sum t(e_i | f_j) over every English/foreign word pair
        sum_total = 0
        for i in range(len(sent_e)):
            for j in range(len(sent_f)):
                sum_total += conditional.get(sent_e[i], sent_f[j])
        return frac * sum_total

    # Given a conditional distribution and a scaling factor epsilon,
    # calculate the perplexity of this parallel corpus.
    #
    # Use the formula given in the supplementary instructions
    def perplexity(self, epsilon, conditional):
        # Negated sum of log2 conditional probabilities over all sentence pairs
        sum_total = 0
        for s in range(self.size()):
            sum_total += math.log2(
                self.conditional_probability(s, epsilon, conditional))
        return -1 * sum_total
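Read off the code (the supplementary instructions themselves are not reproduced here), all three versions compute the same two quantities for a sentence pair with English length l_e, foreign length l_f, and translation table t:

\[
P(e \mid f) = \frac{\epsilon}{l_f^{\,l_e}} \sum_{j=1}^{l_e} \sum_{i=1}^{l_f} t(e_j \mid f_i),
\qquad
PP = -\sum_{s=1}^{S} \log_2 P\!\left(e^{(s)} \mid f^{(s)}\right)
\]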