def test_set_uniform_distortion_probabilities_of_non_domain_values(self):
    # arrange
    src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
    trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
    corpus = [
        AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
        AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
    ]
    model5 = IBMModel5(corpus, 0, src_classes, trg_classes)

    # act
    model5.set_uniform_distortion_probabilities(corpus)

    # assert
    # examine dv and max_v values that are not in the training data domain
    self.assertEqual(model5.head_vacancy_table[5][4][0], IBMModel.MIN_PROB)
    self.assertEqual(model5.head_vacancy_table[-4][1][2], IBMModel.MIN_PROB)
    self.assertEqual(model5.head_vacancy_table[4][0][0], IBMModel.MIN_PROB)
    self.assertEqual(model5.non_head_vacancy_table[5][4][0], IBMModel.MIN_PROB)
    self.assertEqual(model5.non_head_vacancy_table[-4][1][2], IBMModel.MIN_PROB)
def test_vocabularies_are_initialized(self):
    parallel_corpora = [
        AlignedSent(['one', 'two', 'three', 'four'], ['un', 'deux', 'trois']),
        AlignedSent(['five', 'one', 'six'], ['quatre', 'cinq', 'six']),
        AlignedSent([], ['sept'])
    ]
    ibm_model = IBMModel(parallel_corpora)
    # 7 distinct source words plus the NULL token
    self.assertEqual(len(ibm_model.src_vocab), 8)
    # 6 distinct target words
    self.assertEqual(len(ibm_model.trg_vocab), 6)
def align(self, sentence_pair):
    """
    Determines the best word alignment for one sentence pair from the
    corpus that the model was trained on.

    The original sentence pair is not modified. Results are undefined
    if ``sentence_pair`` is not in the training set.

    :param sentence_pair: A sentence in the source language and its
        counterpart sentence in the target language
    :type sentence_pair: AlignedSent
    :return: ``AlignedSent`` filled in with the best word alignment
    :rtype: AlignedSent
    """
    if self.translation_table is None:
        raise ValueError("The model has not been trained.")

    alignment = []
    for j, trg_word in enumerate(sentence_pair.words):
        # Initialize trg_word to align with the NULL token
        best_alignment = (self.translation_table[trg_word][None], None)
        for i, src_word in enumerate(sentence_pair.mots):
            align_prob = self.translation_table[trg_word][src_word]
            best_alignment = max(best_alignment, (align_prob, i))

        # If trg_word is not aligned to the NULL token,
        # add it to the viterbi_alignment.
        if best_alignment[1] is not None:
            alignment.append((j, best_alignment[1]))

    return AlignedSent(sentence_pair.words, sentence_pair.mots, alignment)
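# A minimal usage sketch for the align method above, assuming it is defined at
# module level so it can be called as a plain function with an explicit `self`.
# The table values and the SimpleNamespace stand-in are illustrative, not part
# of the original code.
from collections import defaultdict
from types import SimpleNamespace

from nltk.translate import AlignedSent

# Hand-filled probabilities stand in for a trained translation_table.
table = defaultdict(lambda: defaultdict(float))
table['i']["j'"] = 0.80
table['ham']["j'"] = 0.05
table['ham']['jambon'] = 0.90

# A bare object with a translation_table attribute plays the role of self.
model = SimpleNamespace(translation_table=table)
pair = AlignedSent(['i', 'ham'], ["j'", 'jambon'])
print(align(model, pair).alignment)  # expected: 0-0 1-1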
def align(self, align_sent):
    """
    Returns the alignment result for one sentence pair.
    """
    if self.probabilities is None:
        raise ValueError("The model has not been trained.")

    alignment = []
    for j, en_word in enumerate(align_sent.words):
        # Initialize the maximum probability with the NULL token
        max_align_prob = (self.probabilities[en_word][None], None)
        for i, fr_word in enumerate(align_sent.mots):
            # Find the maximum probability
            max_align_prob = max(max_align_prob,
                                 (self.probabilities[en_word][fr_word], i))

        # If the best alignment is not the NULL token,
        # append it to the alignment.
        if max_align_prob[1] is not None:
            alignment.append((j, max_align_prob[1]))

    return AlignedSent(align_sent.words, align_sent.mots, alignment)
def test_prune(self):
    # arrange
    alignment_infos = [
        AlignmentInfo((1, 1), None, None, None),
        AlignmentInfo((1, 2), None, None, None),
        AlignmentInfo((2, 1), None, None, None),
        AlignmentInfo((2, 2), None, None, None),
        AlignmentInfo((0, 0), None, None, None)
    ]
    min_factor = IBMModel5.MIN_SCORE_FACTOR
    best_score = 0.9
    scores = {
        (1, 1): min(min_factor * 1.5, 1) * best_score,  # above threshold
        (1, 2): best_score,
        (2, 1): min_factor * best_score,  # at threshold
        (2, 2): min_factor * best_score * 0.5,  # low score
        (0, 0): min(min_factor * 1.1, 1) * 1.2  # above threshold
    }
    corpus = [AlignedSent(['a'], ['b'])]
    original_prob_function = IBMModel4.model4_prob_t_a_given_s
    # mock static method
    IBMModel4.model4_prob_t_a_given_s = staticmethod(
        lambda a, model: scores[a.alignment])
    model5 = IBMModel5(corpus, 0, None, None)

    # act
    pruned_alignments = model5.prune(alignment_infos)

    # assert
    self.assertEqual(len(pruned_alignments), 3)

    # restore static method
    IBMModel4.model4_prob_t_a_given_s = original_prob_function
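# The manual save-and-restore of the static method above works, but it leaks
# the mock if an assertion fails first. A sketch of the same idea using
# unittest.mock.patch.object from the standard library, which restores the
# original automatically; the single-alignment setup here is illustrative.
from unittest import mock

def test_prune_with_patch(self):
    scores = {(1, 1): 0.9}
    with mock.patch.object(
            IBMModel4, 'model4_prob_t_a_given_s',
            staticmethod(lambda a, model: scores[a.alignment])):
        model5 = IBMModel5([AlignedSent(['a'], ['b'])], 0, None, None)
        pruned = model5.prune([AlignmentInfo((1, 1), None, None, None)])
    # the mocked method is restored here, even if prune raises
    self.assertEqual(len(pruned), 1)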
def align(self, align_sent):
    if self.t is None or self.q is None:
        raise ValueError("The model has not been trained.")

    alignment = []
    l_e = len(align_sent.mots)
    l_f = len(align_sent.words)

    for j, fr_word in enumerate(align_sent.words):
        # Initialize the maximum probability with the NULL token
        max_align_prob = (self.t[None][fr_word] *
                          self.q[0][j + 1][l_e][l_f], None)
        for i, en_word in enumerate(align_sent.mots):
            # Find the maximum probability
            max_align_prob = max(max_align_prob,
                                 (self.t[en_word][fr_word] *
                                  self.q[i + 1][j + 1][l_e][l_f], i))

        # If the best alignment is not the NULL token,
        # append it to the alignment.
        if max_align_prob[1] is not None:
            alignment.append((j, max_align_prob[1]))

    return AlignedSent(align_sent.words, align_sent.mots, alignment)
def test_prob_t_a_given_s(self):
    # arrange
    src_sentence = ['ich', 'esse', 'ja', 'gern', 'räucherschinken']
    trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
    corpus = [AlignedSent(trg_sentence, src_sentence)]
    alignment_info = AlignmentInfo((0, 1, 4, 0, 2, 5, 5),
                                   [None] + src_sentence,
                                   ['UNUSED'] + trg_sentence, None)

    translation_table = defaultdict(lambda: defaultdict(float))
    translation_table['i']['ich'] = 0.98
    translation_table['love']['gern'] = 0.98
    translation_table['to'][None] = 0.98
    translation_table['eat']['esse'] = 0.98
    translation_table['smoked']['räucherschinken'] = 0.98
    translation_table['ham']['räucherschinken'] = 0.98

    model1 = IBMModel1(corpus, 0)
    model1.translation_table = translation_table

    # act
    probability = model1.prob_t_a_given_s(alignment_info)

    # assert
    lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
    expected_probability = lexical_translation
    self.assertEqual(round(probability, 4), round(expected_probability, 4))
def aligned(self):
    if self.probabilities is None:
        raise ValueError("No probabilities calculated")

    aligned = []
    # Derive alignments from the learned t(e|f) table
    for aligned_sent in self.aligned_sents:
        alignment = []
        le = len(aligned_sent.words)
        lf = len(aligned_sent.mots)
        for j, e_w in enumerate(aligned_sent.words):
            # initial candidate: index lf stands in for the NULL token
            f_max = (self.alignments[lf, j, le, lf], None)
            for i, f_w in enumerate(aligned_sent.mots):
                f_max = max(f_max, (self.alignments[i, j, le, lf], i))
            # only output alignments with a non-NULL mapping
            if f_max[1] is not None:
                alignment.append((j, f_max[1]))

        # substitute the alignment of AlignedSent with the derived one
        aligned.append(
            AlignedSent(aligned_sent.words, aligned_sent.mots, alignment))

    return aligned
def replaceByNumber(bitexts):
    """
    Replaces each word in the sentence pairs with a number.

    >>> en_dict, fr_dict, bitexts = replaceByNumber(comtrans.aligned_sents()[:100])
    >>> en_dict['Kommission']
    474
    >>> fr_dict['check']
    331
    >>> bitexts[0]
    AlignedSent([1, 2, 3], [1, 2, 3, 4], Alignment([(0, 0), (1, 1), (1, 2), (2, 3)]))

    Arguments:
    bitexts -- A list of AlignedSent instances containing sentence pairs.

    Returns:
    en_dict -- A dictionary mapping each English word to an integer.
    fr_dict -- A dictionary mapping each foreign word to an integer.
    new_bitexts -- A list of AlignedSent instances in which each word
                   of the sentence pairs is represented by a number.
    """
    new_bitexts = []

    # Zero marks a word that has not been assigned a number yet
    en_dict = defaultdict(lambda: 0)
    fr_dict = defaultdict(lambda: 0)

    # Numbering starts from one
    en_count = 1
    fr_count = 1

    for aligned_sent in bitexts:
        new_words = []
        for word in aligned_sent.words:
            if en_dict[word] == 0:
                en_dict[word] = en_count
                en_count += 1
            # Append the integer to the new sentence
            new_words.append(en_dict[word])

        new_mots = []
        for mots in aligned_sent.mots:
            if fr_dict[mots] == 0:
                fr_dict[mots] = fr_count
                fr_count += 1
            # Append the integer to the new sentence
            new_mots.append(fr_dict[mots])

        # Wrap the numbered sentences in a new AlignedSent,
        # keeping the original alignment.
        new_bitexts.append(
            AlignedSent(new_words, new_mots, aligned_sent.alignment))

    return en_dict, fr_dict, new_bitexts
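# Design note: the defaultdict(lambda: 0) sentinel above works because word IDs
# start at 1, but the `en_dict[word] == 0` test inserts a zero entry as a side
# effect before it is overwritten. A plain dict with setdefault expresses the
# same first-come numbering without a sentinel; this is an alternative sketch,
# not the original code.
def number_words(sentences):
    """Map each distinct word to a 1-based integer ID, in order of first use."""
    ids = {}
    numbered = []
    for sentence in sentences:
        numbered.append([ids.setdefault(word, len(ids) + 1) for word in sentence])
    return ids, numbered

ids, numbered = number_words([['spam', 'ham'], ['ham', 'eggs']])
print(ids)       # {'spam': 1, 'ham': 2, 'eggs': 3}
print(numbered)  # [[1, 2], [2, 3]]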
def test_best_model2_alignment_handles_fertile_words(self):
    # arrange
    sentence_pair = AlignedSent(
        ['i', 'really', ',', 'really', 'love', 'ham'],
        TestIBMModel.__TEST_SRC_SENTENCE)
    # 'bien' produces 2 target words: 'really' and another 'really'
    translation_table = {
        'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03, None: 0},
        'really': {"j'": 0, 'aime': 0, 'bien': 0.9, 'jambon': 0.01, None: 0.09},
        ',': {"j'": 0, 'aime': 0, 'bien': 0.3, 'jambon': 0, None: 0.7},
        'love': {"j'": 0.05, 'aime': 0.9, 'bien': 0.01, 'jambon': 0.01, None: 0.03},
        'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99, None: 0}
    }
    alignment_table = defaultdict(lambda: defaultdict(
        lambda: defaultdict(lambda: defaultdict(lambda: 0.2))))

    ibm_model = IBMModel([])
    ibm_model.translation_table = translation_table
    ibm_model.alignment_table = alignment_table

    # act
    a_info = ibm_model.best_model2_alignment(sentence_pair)

    # assert
    self.assertEqual(a_info.alignment[1:], (1, 3, 0, 3, 2, 4))
    self.assertEqual(a_info.cepts, [[3], [1], [5], [2, 4], [6]])
def learn_aligned_text(self, fr_lines, en_lines):
    """Learn from an IBM alignment model over lists of lines in both languages."""
    bitext = []
    bitext_flip = []
    for pair in zip(fr_lines, en_lines):
        pp0 = preprocess(pair[0])
        pp1 = preprocess(pair[1])
        if len(pp0) == 0 or len(pp1) == 0:
            continue
        if pp0[0] == '' or pp0[1] == '':
            continue
        bitext.append(AlignedSent(pp0, pp1))
        bitext_flip.append(AlignedSent(pp1, pp0))
        self.learn_aligned_sentence(pair[1], pair[0])
    ibm = IBMModel3(bitext, 5)
    self.translation_table = ibm.translation_table
    ibm_flip = IBMModel3(bitext_flip, 5)
    self.fertility_table = ibm_flip.fertility_table
def test_best_model2_alignment_handles_empty_trg_sentence(self):
    # arrange
    sentence_pair = AlignedSent([], TestIBMModel.__TEST_SRC_SENTENCE)
    ibm_model = IBMModel([])

    # act
    a_info = ibm_model.best_model2_alignment(sentence_pair)

    # assert
    self.assertEqual(a_info.alignment[1:], ())
    self.assertEqual(a_info.cepts, [[], [], [], [], []])
def test_best_model2_alignment_handles_empty_src_sentence(self):
    # arrange
    sentence_pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE, [])
    ibm_model = IBMModel([])

    # act
    a_info = ibm_model.best_model2_alignment(sentence_pair)

    # assert
    self.assertEqual(a_info.alignment[1:], (0, 0, 0))
    self.assertEqual(a_info.cepts, [[1, 2, 3]])
def test_sample(self):
    # arrange
    sentence_pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE,
                                TestIBMModel.__TEST_SRC_SENTENCE)
    ibm_model = IBMModel([])
    ibm_model.prob_t_a_given_s = lambda x: 0.001

    # act
    samples, best_alignment = ibm_model.sample(sentence_pair)

    # assert
    self.assertEqual(len(samples), 61)
def read_block(self, stream):
    block = [self._word_tokenizer.tokenize(sent_str)
             for alignedsent_str in self._alignedsent_block_reader(stream)
             for sent_str in self._sent_tokenizer.tokenize(alignedsent_str)]

    if self._aligned:
        # kludge; we shouldn't have tokenized the alignment string
        block[2] = " ".join(block[2])
        block = [AlignedSent(*block)]
    elif self._group_by_sent:
        block = [block[0]]
    else:
        block = block[0]

    return block
def align(self, aligned_sent):
    best_alignment = []
    out_sent = AlignedSent(aligned_sent.words, aligned_sent.mots)

    l = len(aligned_sent.mots)
    m = len(aligned_sent.words)

    for i, t_word in enumerate(aligned_sent.words):
        best_prob = self.t[t_word][None] * self.q[0][i + 1][l][m]
        best_alig = None
        for j, s_word in enumerate(aligned_sent.mots):
            alig_prob = self.t[t_word][s_word] * self.q[j + 1][i + 1][l][m]
            if alig_prob > best_prob:
                best_prob = alig_prob
                best_alig = j

        if best_alig is not None:
            best_alignment.append((i, best_alig))

    out_sent.alignment = Alignment(best_alignment)
    return out_sent
def GetAlignment(derivation, tree1, tree2, cost_threshold=0.3):
    """
    Given a derivation (sequence of rules that puts in correspondence
    tree1 and tree2), it returns a word-to-word alignment.
    """
    src_tokens = tree1.leaves()
    trg_tokens = tree2.leaves()
    alignments_dict = defaultdict(list)
    src_leave_paths = sorted(tree1.treepositions('leaves'))
    trg_leave_paths = sorted(tree2.treepositions('leaves'))
    assert len(src_tokens) == len(src_leave_paths), \
        'Num. source leaves and leaf paths mismatch: {0} vs. {1}' \
        .format(src_tokens, src_leave_paths)
    assert len(trg_tokens) == len(trg_leave_paths), \
        'Num. target leaves and leaf paths mismatch: {0} vs. {1}' \
        .format(trg_tokens, trg_leave_paths)
    # Create an index that maps paths to word indices in the sentences.
    src_path_to_index = {path: index for index, path in enumerate(src_leave_paths)}
    trg_path_to_index = {path: index for index, path in enumerate(trg_leave_paths)}
    # Extract word-to-word alignments, one production at a time.
    for production in derivation:
        # Get the absolute path positions of lhs and rhs leaves.
        state, src_path, trg_path = production.non_terminal
        if state != 'dist_sim':
            continue
        lhs, rhs = production.rhs.rule.lhs, production.rhs.rule.rhs
        src_leave_abs_positions = [
            src_path + src_leaf_path for src_leaf_path in GetLeavePositions(lhs)]
        trg_leave_abs_positions = [
            trg_path + trg_leaf_path for trg_leaf_path in GetLeavePositions(rhs)]
        cost = production.rhs.rule.weight
        num_src_and_trg_leaves = (
            len(src_leave_abs_positions) + len(trg_leave_abs_positions))
        # Skip productions whose per-leaf cost exceeds the threshold.
        if (cost / float(num_src_and_trg_leaves)) > cost_threshold:
            continue
        # Align the index of each source word in this production (rule)
        # to the index of each target word appearing in it.
        for src_leaf_path in src_leave_abs_positions:
            src_leaf_index = src_path_to_index[src_leaf_path]
            for trg_leaf_path in trg_leave_abs_positions:
                trg_leaf_index = trg_path_to_index[trg_leaf_path]
                alignments_dict[src_leaf_index].append(trg_leaf_index)
    # List of alignment tuples, e.g. [(0, 0), (0, 1), (1, 2), (2, 3), ...]
    alignments = [(src_index, trg_index)
                  for (src_index, trg_indices) in alignments_dict.items()
                  for trg_index in trg_indices]
    alignments_str = ' '.join(str(src_index) + '-' + str(trg_index)
                              for (src_index, trg_index) in alignments)
    aligned_sentence = AlignedSent(src_tokens, trg_tokens, alignments_str)
    return aligned_sentence
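# GetAlignment relies on nltk.Tree.treepositions('leaves') yielding one path
# per leaf, in left-to-right order, which is what makes the path-to-index maps
# line up with leaves(). A small sketch of that invariant; the tree itself is
# illustrative.
from nltk import Tree

t = Tree.fromstring('(S (NP (DT the) (NN cat)) (VP (VBZ sleeps)))')
leaves = t.leaves()                        # ['the', 'cat', 'sleeps']
paths = sorted(t.treepositions('leaves'))  # [(0, 0, 0), (0, 1, 0), (1, 0, 0)]
path_to_index = {path: i for i, path in enumerate(paths)}
assert [t[path] for path in paths] == leaves  # paths and leaves line up
print(path_to_index)  # {(0, 0, 0): 0, (0, 1, 0): 1, (1, 0, 0): 2}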
def test_set_uniform_distortion_probabilities_of_max_displacements(self):
    # arrange
    src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
    trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
    corpus = [
        AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
        AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
    ]
    model5 = IBMModel5(corpus, 0, src_classes, trg_classes)

    # act
    model5.set_uniform_distortion_probabilities(corpus)

    # assert
    # number of vacancy difference values =
    #     2 * number of words in longest target sentence
    expected_prob = 1.0 / (2 * 4)

    # examine the boundary values for (dv, max_v, trg_class)
    self.assertEqual(model5.head_vacancy_table[4][4][0], expected_prob)
    self.assertEqual(model5.head_vacancy_table[-3][1][2], expected_prob)
    self.assertEqual(model5.non_head_vacancy_table[4][4][0], expected_prob)
    self.assertEqual(model5.non_head_vacancy_table[-3][1][2], expected_prob)
def alignSents(self, alignSents):
    """
    Returns the alignment result for several sentence pairs.
    """
    if self.probabilities is None or self.alignments is None:
        raise ValueError("The model has not been trained.")

    aligned_sents = []
    for sent in alignSents:
        new_alignSent = AlignedSent(sent.words, sent.mots)
        aligned_sents.append(self.align(new_alignSent))

    return aligned_sents
def test_set_uniform_distortion_probabilities_of_max_displacements(self):
    # arrange
    src_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
    trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
    corpus = [
        AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
        AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']),
    ]
    model4 = IBMModel4(corpus, 0, src_classes, trg_classes)

    # act
    model4.set_uniform_distortion_probabilities(corpus)

    # assert
    # number of displacement values =
    #     2 * (number of words in longest target sentence - 1)
    expected_prob = 1.0 / (2 * (4 - 1))

    # examine the boundary values for (displacement, src_class, trg_class)
    self.assertEqual(model4.head_distortion_table[3][0][0], expected_prob)
    self.assertEqual(model4.head_distortion_table[-3][1][2], expected_prob)
    self.assertEqual(model4.non_head_distortion_table[3][0], expected_prob)
    self.assertEqual(model4.non_head_distortion_table[-3][2], expected_prob)
def align(self, sentence_pair):
    """
    Determines the best word alignment for one sentence pair from the
    corpus that the model was trained on.

    The original sentence pair is not modified. Results are undefined
    if ``sentence_pair`` is not in the training set.

    Note that the algorithm used is not strictly Model 3, because
    fertilities and NULL insertion probabilities are ignored.

    :param sentence_pair: A sentence in the source language and its
        counterpart sentence in the target language
    :type sentence_pair: AlignedSent
    :return: ``AlignedSent`` filled in with the best word alignment
    :rtype: AlignedSent
    """
    if self.translation_table is None or self.distortion_table is None:
        raise ValueError("The model has not been trained.")

    alignment = []
    l = len(sentence_pair.mots)
    m = len(sentence_pair.words)

    for j, trg_word in enumerate(sentence_pair.words):
        # Initialize trg_word to align with the NULL token
        best_prob = (self.translation_table[trg_word][None] *
                     self.distortion_table[j + 1][0][l][m])
        best_prob = max(best_prob, IBMModel.MIN_PROB)
        best_alignment = None
        for i, src_word in enumerate(sentence_pair.mots):
            align_prob = (self.translation_table[trg_word][src_word] *
                          self.distortion_table[j + 1][i + 1][l][m])
            if align_prob >= best_prob:
                best_prob = align_prob
                best_alignment = i

        # If trg_word is not aligned to the NULL token,
        # add it to the viterbi_alignment.
        if best_alignment is not None:
            alignment.append((j, best_alignment))

    return AlignedSent(sentence_pair.words, sentence_pair.mots, alignment)
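# The candidate score above combines t(trg_word | src_word) with the distortion
# probability d(j | i, l, m), indexed as distortion_table[j + 1][i + 1][l][m]
# with 1-based positions and slot 0 reserved for NULL. A worked score under
# assumed, purely illustrative table values:
from collections import defaultdict

t = defaultdict(lambda: defaultdict(float))
d = defaultdict(lambda: defaultdict(
    lambda: defaultdict(lambda: defaultdict(float))))

l, m = 2, 2              # source length, target length
t['ham']['jambon'] = 0.9
d[2][2][l][m] = 0.5      # d(j=2 | i=2, l=2, m=2), 1-based positions

j, i = 1, 1              # 0-based positions of 'ham' and 'jambon'
score = t['ham']['jambon'] * d[j + 1][i + 1][l][m]
print(score)             # 0.45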
def align(self, align_sent):
    this_alignment = []
    en_len = len(align_sent.words)
    ger_len = len(align_sent.mots)

    for j, en_word in enumerate(align_sent.words):
        p = self.t[en_word][None] * self.q[0][j + 1][en_len][ger_len]
        max_prob = (p, None)
        for i, ger_word in enumerate(align_sent.mots):
            this_p = (self.t[en_word][ger_word] *
                      self.q[i + 1][j + 1][en_len][ger_len])
            max_prob = max(max_prob, (this_p, i))

        if max_prob[1] is not None:
            this_alignment.append((j, max_prob[1]))

    return AlignedSent(align_sent.words, align_sent.mots, this_alignment)
def align(self, align_sent):
    if self.t is None or self.q is None:
        raise ValueError("The model has not been trained.")

    alignment = []
    l_e = len(align_sent.words)
    l_g = len(align_sent.mots)

    for j, en_word in enumerate(align_sent.words):
        # Initialize the maximum probability with the NULL token
        max_align_prob = (self.t[en_word][None] *
                          self.q[0][j + 1][l_e][l_g], None)
        for i, g_word in enumerate(align_sent.mots):
            # Find the maximum probability
            max_align_prob = max(max_align_prob,
                                 (self.t[en_word][g_word] *
                                  self.q[i + 1][j + 1][l_e][l_g], i))

        # If the best alignment is not the NULL token,
        # append it to the alignment.
        if max_align_prob[1] is not None:
            alignment.append((j, max_align_prob[1]))

    return AlignedSent(align_sent.words, align_sent.mots, alignment)
def align(self, align_sent):
    alignment = []
    english = align_sent.words
    german = align_sent.mots
    l = len(english)
    m = len([None] + german)

    for j, en_word in enumerate(english):
        # Start below any real probability; sys.maxint no longer exists
        # in Python 3, and the second element must start as None so the
        # NULL check below is meaningful
        max_align_prob = (float('-inf'), None)
        for i, g_word in enumerate(german):
            max_align_prob = max(
                max_align_prob,
                (self.t[en_word][g_word] * self.q[(i, j, l, m)], i))

        if max_align_prob[1] is not None:
            alignment.append((j, max_align_prob[1]))

    return AlignedSent(align_sent.words, align_sent.mots, alignment)
def align(self, align_sent):
    if self.t is None or self.q is None:
        raise ValueError("No parameters trained")

    wordlen = len(align_sent.words)
    motslen = len(align_sent.mots)
    alignment = []

    for i, en_word in enumerate(align_sent.words):
        max_align = (self.t[en_word][None] *
                     self.q[0][i + 1][wordlen][motslen], None)
        for j, ge_word in enumerate(align_sent.mots):
            max_align = max(max_align,
                            (self.t[en_word][ge_word] *
                             self.q[j + 1][i + 1][wordlen][motslen], j))

        if max_align[1] is not None:
            alignment.append((i, max_align[1]))

    return AlignedSent(align_sent.words, align_sent.mots, alignment)
def computeAlignments(self, frst_corpus, scnd_corpus, chosen_ibm):
    aligned_sents = [
        AlignedSent(frst_sent, scnd_sent)
        for frst_sent, scnd_sent in zip(frst_corpus.sents, scnd_corpus.sents)
    ]

    if chosen_ibm == "ibm1":
        ibm = IBMModel1(aligned_sents)
        return ibm.aligned()
    elif chosen_ibm == "ibm2":
        ibm = IBMModel2(aligned_sents)
        return ibm.aligned()
    else:
        ibm1 = IBMModel1(aligned_sents)
        ibm2 = IBMModel2(aligned_sents)
        return ibm1.aligned(), ibm2.aligned()
def test_best_model2_alignment(self):
    # arrange
    sentence_pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE,
                                TestIBMModel.__TEST_SRC_SENTENCE)
    # None and 'bien' have zero fertility
    translation_table = {
        'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03, None: 0},
        'love': {"j'": 0.05, 'aime': 0.9, 'bien': 0.01, 'jambon': 0.01, None: 0.03},
        'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99, None: 0}
    }
    alignment_table = defaultdict(lambda: defaultdict(
        lambda: defaultdict(lambda: defaultdict(lambda: 0.2))))

    ibm_model = IBMModel([])
    ibm_model.translation_table = translation_table
    ibm_model.alignment_table = alignment_table

    # act
    a_info = ibm_model.best_model2_alignment(sentence_pair)

    # assert
    self.assertEqual(a_info.alignment[1:], (1, 2, 4))  # 0th element unused
    self.assertEqual(a_info.cepts, [[], [1], [2], [], [3]])
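# The two assertions above are two views of the same mapping: alignment[j]
# gives the 1-based source position aligned to target position j (with index 0
# unused), while cepts[i] lists the target positions aligned to source
# position i, where i = 0 is the NULL token. A small sketch of that inversion,
# reusing the values from the test:
alignment = (0, 1, 2, 4)        # 0th element unused
num_src_positions = 5           # NULL + 4 source words

cepts = [[] for _ in range(num_src_positions)]
for j, i in enumerate(alignment):
    if j == 0:                  # skip the unused 0th element
        continue
    cepts[i].append(j)

print(cepts)                    # [[], [1], [2], [], [3]]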
def align(self, align_sent):
    alignment = []
    l = len(align_sent.words)
    m = len(align_sent.mots)

    for j, ej in enumerate(align_sent.words):
        # Initialize the maximum probability with the NULL token
        max_align_prob = (self.t[ej][None] * self.q[0][j + 1][l][m], None)
        for i, fi in enumerate(align_sent.mots):
            # Keep the candidate with the highest probability
            prod = (self.t[ej][fi] * self.q[i + 1][j + 1][l][m], i)
            max_align_prob = max(max_align_prob, prod)

        # If the best alignment is not the NULL token,
        # append it to the alignment list
        if max_align_prob[1] is not None:
            alignment.append((j, max_align_prob[1]))

    return AlignedSent(align_sent.words, align_sent.mots, alignment)
def align(self, align_sent):
    if self.t is None or self.q is None:
        raise ValueError("No parameters trained")

    wordlen = len(align_sent.words)
    motslen = len(align_sent.mots)
    alignment = []

    for i, en_word in enumerate(align_sent.words):
        # Start with alignment to the NULL token
        best_prob = self.t[en_word][None] * self.q[0][i + 1][wordlen][motslen]
        best_pt = None
        for j, ge_word in enumerate(align_sent.mots):
            align_prob = (self.t[en_word][ge_word] *
                          self.q[j + 1][i + 1][wordlen][motslen])
            if align_prob >= best_prob:
                best_prob = align_prob
                best_pt = j

        # Only record words that did not stay aligned to NULL
        if best_pt is not None:
            alignment.append((i, best_pt))

    # uses the built-in Alignment class
    return AlignedSent(align_sent.words, align_sent.mots, Alignment(alignment))
def test_prob_t_a_given_s(self):
    # arrange
    src_sentence = ['ich', 'esse', 'ja', 'gern', 'räucherschinken']
    trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
    corpus = [AlignedSent(trg_sentence, src_sentence)]
    alignment_info = AlignmentInfo((0, 1, 4, 0, 2, 5, 5),
                                   [None] + src_sentence,
                                   ['UNUSED'] + trg_sentence, None)

    translation_table = defaultdict(lambda: defaultdict(float))
    translation_table['i']['ich'] = 0.98
    translation_table['love']['gern'] = 0.98
    translation_table['to'][None] = 0.98
    translation_table['eat']['esse'] = 0.98
    translation_table['smoked']['räucherschinken'] = 0.98
    translation_table['ham']['räucherschinken'] = 0.98

    alignment_table = defaultdict(lambda: defaultdict(
        lambda: defaultdict(lambda: defaultdict(float))))
    alignment_table[0][3][5][6] = 0.97  # None -> to
    alignment_table[1][1][5][6] = 0.97  # ich -> i
    alignment_table[2][4][5][6] = 0.97  # esse -> eat
    alignment_table[4][2][5][6] = 0.97  # gern -> love
    alignment_table[5][5][5][6] = 0.96  # räucherschinken -> smoked
    alignment_table[5][6][5][6] = 0.96  # räucherschinken -> ham

    model2 = IBMModel2(corpus, 0)
    model2.translation_table = translation_table
    model2.alignment_table = alignment_table

    # act
    probability = model2.prob_t_a_given_s(alignment_info)

    # assert
    lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
    alignment = 0.97 * 0.97 * 0.97 * 0.97 * 0.96 * 0.96
    expected_probability = lexical_translation * alignment
    self.assertEqual(round(probability, 4), round(expected_probability, 4))
def align(self, align_sent):
    alignment = []
    l = len(align_sent.words)
    m = len(align_sent.mots)

    for j, en_word in enumerate(align_sent.words):
        # Start below any real probability; sys.maxint no longer
        # exists in Python 3, so use -inf instead
        max_align_prob = (float('-inf'), None)
        for i, g_word in enumerate(align_sent.mots):
            # Find the maximum probability
            max_align_prob = max(max_align_prob,
                                 (self.t[en_word][g_word] *
                                  self.q[(i + 1, j + 1, l, m)], i))

        # Append the best alignment, if one was found
        if max_align_prob[1] is not None:
            alignment.append((j, max_align_prob[1]))

    return AlignedSent(align_sent.words, align_sent.mots, alignment)