def get(self, e_i, f_i):
    """Return the table value for the word pair named by vocabulary ints e_i / f_i."""
    row_word = Vocabulary.get_word(self.e_vocab, e_i)
    col_word = Vocabulary.get_word(self.f_vocab, f_i)
    return self.dict_of_dict[row_word][col_word]
def __str__(self):
    """Render one 'name[e | f] = value' line per word pair in the table."""
    lines = []
    for word_e in Vocabulary.words(self.e_vocab):
        for word_f in Vocabulary.words(self.f_vocab):
            lines.append(
                f"{self.name}[{word_e} | {word_f}] = {self.double_dict[word_e][word_f]}\n")
    return "".join(lines)
def __str__(self):
    """Render one 'name[e | f] = value' line per word pair in the table."""
    pieces = [
        f"{self.name}[{e} | {f}] = {self.dict_of_dict[e][f]}\n"
        for e in Vocabulary.words(self.e_vocab)
        for f in Vocabulary.words(self.f_vocab)
    ]
    return "".join(pieces)
def add(self, e, f):
    """Tokenize the sentence pair (e, f) with nltk and append the int encodings
    to self.e and self.f, growing the vocabularies as needed."""
    encoded_e = [Vocabulary.get_int(self.e_vocab, tok) for tok in nltk.word_tokenize(e)]
    encoded_f = [Vocabulary.get_int(self.f_vocab, tok) for tok in nltk.word_tokenize(f)]
    self.e.append(encoded_e)
    self.f.append(encoded_f)
def __init__(self, name, e_vocab, f_vocab, initial_value):
    """Build a named conditional table over e_vocab x f_vocab with every
    cell preset to initial_value."""
    self.name = name
    self.e_vocab = e_vocab
    self.f_vocab = f_vocab
    # Nested mapping: double_dict[e_word][f_word] -> value.
    self.double_dict = defaultdict(dict)
    for word_e in Vocabulary.words(e_vocab):
        for word_f in Vocabulary.words(f_vocab):
            self.double_dict[word_e][word_f] = initial_value
def __init__(self, name, e_vocab, f_vocab, initial_value):
    """Build a named conditional table over e_vocab x f_vocab with every
    cell preset to initial_value."""
    self.name = name
    self.e_vocab = e_vocab
    self.f_vocab = f_vocab
    # Nested mapping: dict_of_dict[e_word][f_word] -> value.
    self.dict_of_dict = defaultdict(dict)
    for e_word in Vocabulary.words(e_vocab):
        for f_word in Vocabulary.words(f_vocab):
            self.dict_of_dict[e_word][f_word] = initial_value
def __init__(self):
    """Create an empty parallel corpus with fresh, empty vocabularies."""
    # Encoded sentences: one list of vocabulary ints per sentence.
    self.e = list()
    self.f = list()
    # Vocabularies grow as sentence pairs are added.
    self.e_vocab = Vocabulary()
    self.f_vocab = Vocabulary()
def test_add_one_word(self):
    """Assert that adding a single word to a newly constructed Vocabulary works as expected"""
    vocab = Vocabulary()
    index = vocab.get_int("hello")
    self.assertEqual(index, 0)
    self.assertEqual(vocab.size(), 1)
    self.assertEqual(vocab.words(), ["hello"])
    # A repeated lookup must return the same index, not allocate a new one.
    self.assertEqual(vocab.get_int("hello"), index)
    self.assertEqual(vocab.get_word(0), "hello")
def add(self, e, f):
    """Tokenize sentences e and f with nltk, map each token to its
    vocabulary int, and append the encoded sentences to self.e / self.f."""
    sentence_e = [Vocabulary.get_int(self.e_vocab, word)
                  for word in nltk.word_tokenize(e)]
    sentence_f = [Vocabulary.get_int(self.f_vocab, word)
                  for word in nltk.word_tokenize(f)]
    self.e.append(sentence_e)
    self.f.append(sentence_f)
def test_adding_words(self):
    """Assert that words are properly added to the vocabulary"""
    vocab = Vocabulary()
    tokens = "Four score and seven years ago".split()
    # get_int registers each token; indices themselves are checked below.
    indices = [vocab.get_int(tok) for tok in tokens]
    self.assertEqual(vocab.words(), tokens)
    # int -> word must round-trip for every token.
    for tok in tokens:
        idx = vocab.get_int(tok)
        self.assertNotEqual(idx, None)
        self.assertEqual(vocab.get_word(idx), tok)
    # Positional indices must map back to the tokens in insertion order.
    for idx in range(0, len(tokens)):
        self.assertNotEqual(idx, None)
        self.assertEqual(vocab.get_word(idx), tokens[idx])
def test_empty(self):
    """Assert that a newly constructed vocabulary has size zero"""
    self.assertEqual(Vocabulary().size(), 0)
class ParallelCorpus:
    """A sentence-aligned English/foreign corpus stored as lists of vocabulary ints."""

    def __init__(self):
        # Encoded sentences: one list of vocabulary ints per sentence.
        self.e = list()
        self.f = list()
        # Vocabularies grow as sentence pairs are added.
        self.e_vocab = Vocabulary()
        self.f_vocab = Vocabulary()

    def size(self):
        """Return the number of sentence pairs added so far."""
        return len(self.e)

    def get_e(self, sentence_index):
        """Return the int-encoded English sentence at sentence_index."""
        return self.e[sentence_index]

    def get_f(self, sentence_index):
        """Return the int-encoded foreign sentence at sentence_index."""
        return self.f[sentence_index]

    def add(self, e, f):
        """Tokenize the sentence pair with nltk and append the int encodings."""
        self.e.append([Vocabulary.get_int(self.e_vocab, tok)
                       for tok in nltk.word_tokenize(e)])
        self.f.append([Vocabulary.get_int(self.f_vocab, tok)
                       for tok in nltk.word_tokenize(f)])

    def create_uniform_distribution(self, name):
        """Return a Conditional whose every cell starts at 1 / |f_vocab|."""
        return Conditional(name, self.e_vocab, self.f_vocab,
                           1 / self.f_vocab.size())

    def conditional_probability(self, sentence_index, epsilon, conditional):
        """P(e|f) for the pair at sentence_index:
        epsilon / (|e| * |f|) times the sum of t(e_j | f_i) over all word pairs."""
        sent_e = self.get_e(sentence_index)
        sent_f = self.get_f(sentence_index)
        scale = epsilon / (len(sent_e) * len(sent_f))
        total = 0
        for e_int in sent_e:
            for f_int in sent_f:
                total += conditional.get(e_int, f_int)
        return scale * total

    def perplexity(self, epsilon, conditional):
        """Corpus perplexity: negated sum of log2 conditional probabilities."""
        log_sum = 0
        for idx in range(0, self.size()):
            log_sum += math.log2(
                self.conditional_probability(idx, epsilon, conditional))
        return -1 * log_sum
class ParallelCorpus:
    """A sentence-aligned English/foreign corpus stored as lists of vocabulary ints."""

    def __init__(self):
        # List of English sentences, each a list of vocabulary ints.
        self.e = list()
        # List of foreign sentences, each a list of vocabulary ints.
        self.f = list()
        # Vocabularies are populated as sentence pairs are added.
        self.e_vocab = Vocabulary()
        self.f_vocab = Vocabulary()

    def size(self):
        """Return the number of sentence pairs in the corpus."""
        return len(self.e)

    def get_e(self, sentence_index):
        """Return the int-encoded English sentence at sentence_index."""
        return self.e[sentence_index]

    def get_f(self, sentence_index):
        """Return the int-encoded foreign sentence at sentence_index."""
        return self.f[sentence_index]

    def add(self, e, f):
        """Tokenize both sentences with nltk and append their int encodings
        to self.e and self.f, growing the vocabularies as needed."""
        self.e.append([self.e_vocab.get_int(word)
                       for word in nltk.word_tokenize(e)])
        self.f.append([self.f_vocab.get_int(word)
                       for word in nltk.word_tokenize(f)])

    def create_uniform_distribution(self, name):
        """Return a Conditional with every cell initialized to 1 / |f_vocab|."""
        initial_prob = 1.0 / self.f_vocab.size()
        return Conditional(name, self.e_vocab, self.f_vocab, initial_prob)

    def conditional_probability(self, sentence_index, epsilon, conditional):
        """P(e|f) for the pair at sentence_index:
        epsilon / |f|^|e| times the sum of t(e_j | f_i) over all word pairs.

        BUG FIX: the table must be queried with the vocabulary ints of the
        words themselves (self.e[s][j], self.f[s][i]), not with the loop
        positions j and i as the original code did.
        """
        sent_e = self.e[sentence_index]
        sent_f = self.f[sentence_index]
        tsum = 0
        for e_int in sent_e:
            for f_int in sent_f:
                tsum = tsum + conditional.get(e_int, f_int)  # was conditional.get(j, i)
        lftole = math.pow(len(sent_f), len(sent_e))
        return epsilon / lftole * tsum

    def perplexity(self, epsilon, conditional):
        """Corpus perplexity: negated sum of log2 conditional probabilities."""
        pp = 0
        for i in range(self.size()):
            p = self.conditional_probability(i, epsilon, conditional)
            pp = pp + math.log(p, 2)
        return -pp
def set(self, e_i, f_i, value):
    """Store value for the word pair named by vocabulary ints e_i / f_i."""
    row_word = Vocabulary.get_word(self.e_vocab, e_i)
    col_word = Vocabulary.get_word(self.f_vocab, f_i)
    self.double_dict[row_word][col_word] = value
def test_empty_list(self):
    """Assert that a newly constructed vocabulary contains an empty list"""
    self.assertEqual(Vocabulary().words(), [])
def test_empty_word_index(self):
    """Assert that a newly constructed vocabulary does not associate any string with index zero"""
    vocab = Vocabulary()
    self.assertEqual(vocab.get_word(0), None)
def test_negative_indices(self):
    """Assert that a newly constructed vocabulary returns None for negative numbers"""
    v = Vocabulary()
    # range(-1000, -1) stopped at -2, so -1 — the most commonly used
    # negative index — was never exercised; range(-1000, 0) includes it.
    for i in range(-1000, 0):
        self.assertEqual(v.get_word(i), None)
def set(self, e_i, f_i, value):
    """Store value for the word pair named by vocabulary ints e_i / f_i."""
    key_e = Vocabulary.get_word(self.e_vocab, e_i)
    key_f = Vocabulary.get_word(self.f_vocab, f_i)
    self.dict_of_dict[key_e][key_f] = value
def get(self, e_i, f_i):
    """Return the table value for the word pair named by vocabulary ints e_i / f_i."""
    key_e = Vocabulary.get_word(self.e_vocab, e_i)
    key_f = Vocabulary.get_word(self.f_vocab, f_i)
    return self.double_dict[key_e][key_f]
def create_vocab(words):
    """Build and return a Vocabulary containing every word in words,
    registered in first-seen order via get_int."""
    vocab = Vocabulary()
    for token in words:
        vocab.get_int(token)
    return vocab
class ParallelCorpus:
    """A sentence-aligned English/foreign corpus stored as lists of vocabulary ints."""

    def __init__(self):
        # Encoded sentences: one list of vocabulary ints per sentence.
        self.e = list()
        self.f = list()
        # Vocabularies, filled as sentence pairs are added.
        self.e_vocab = Vocabulary()
        self.f_vocab = Vocabulary()

    def size(self):
        """Return the number of sentence pairs added so far."""
        return len(self.e)

    def get_e(self, sentence_index):
        """Return the int-encoded English sentence at sentence_index."""
        return self.e[sentence_index]

    def get_f(self, sentence_index):
        """Return the int-encoded foreign sentence at sentence_index."""
        return self.f[sentence_index]

    def add(self, e, f):
        """Tokenize both sentences with nltk and append their int encodings."""
        self.e.append([Vocabulary.get_int(self.e_vocab, tok)
                       for tok in nltk.word_tokenize(e)])
        self.f.append([Vocabulary.get_int(self.f_vocab, tok)
                       for tok in nltk.word_tokenize(f)])

    def create_uniform_distribution(self, name):
        """Return a Conditional whose every cell starts at 1 / |f_vocab|."""
        return Conditional(name, self.e_vocab, self.f_vocab,
                           1 / self.f_vocab.size())

    def conditional_probability(self, sentence_index, epsilon, conditional):
        """P(e|f) for the pair at sentence_index:
        epsilon / |f|^|e| times the sum of t(e_j | f_i) over all word pairs."""
        sent_e = self.get_e(sentence_index)
        sent_f = self.get_f(sentence_index)
        pair_sum = 0
        for e_int in sent_e:
            for f_int in sent_f:
                pair_sum += conditional.get(e_int, f_int)
        return epsilon / (len(sent_f) ** len(sent_e)) * pair_sum

    def perplexity(self, epsilon, conditional):
        """Corpus perplexity: negated sum of log2 conditional probabilities."""
        total = 0
        for s in range(0, self.size()):
            total += math.log2(
                self.conditional_probability(s, epsilon, conditional))
        return -1 * total