def __init__(self, aligned_sents, convergent_threshold=1e-2):
    self.aligned_sents = aligned_sents
    self.convergent_threshold = convergent_threshold
    # Dictionary of translation probabilities t(e,f),
    # seeded by the lexical step of IBM Model 1.
    self.probabilities = IBMModel1(aligned_sents).probabilities
    self._train()
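# The constructor above hands off to a _train method that is not shown. A
# minimal sketch of the loop it implies, assuming EM passes repeat until the
# largest change in any t(e, f) entry drops below convergent_threshold;
# _em_step is a hypothetical helper, not part of the original class.
def _train(self):
    max_delta = float('inf')
    while max_delta >= self.convergent_threshold:
        # Snapshot the current table so the change can be measured.
        old = {e: dict(f_probs) for e, f_probs in self.probabilities.items()}
        self._em_step()  # hypothetical: one full expectation + maximization pass
        max_delta = max(
            abs(self.probabilities[e][f] - old[e][f])
            for e in old for f in old[e])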
def computeAlignments(self, frst_corpus, scnd_corpus, chosen_ibm):
    # Pair up the two corpora sentence by sentence.
    aligned_sents = [AlignedSent(f_sent, s_sent)
                     for f_sent, s_sent in zip(frst_corpus.sents,
                                               scnd_corpus.sents)]
    if chosen_ibm == "ibm1":
        ibm = IBMModel1(aligned_sents)
        return ibm.aligned()
    elif chosen_ibm == "ibm2":
        ibm = IBMModel2(aligned_sents)
        return ibm.aligned()
    else:
        ibm1 = IBMModel1(aligned_sents)
        ibm2 = IBMModel2(aligned_sents)
        return ibm1.aligned(), ibm2.aligned()
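# A usage sketch for computeAlignments. It assumes two corpus objects that
# expose parallel, equal-length sentence lists through a `.sents` attribute;
# the Corpus namedtuple below is a stand-in for whatever reader the
# surrounding class actually uses.
from collections import namedtuple

Corpus = namedtuple('Corpus', 'sents')
frst_corpus = Corpus(sents=[['the', 'house'], ['the', 'book']])
scnd_corpus = Corpus(sents=[['das', 'haus'], ['das', 'buch']])
# aligner.computeAlignments(frst_corpus, scnd_corpus, "ibm1")  -> Model 1 alignments
# aligner.computeAlignments(frst_corpus, scnd_corpus, "ibm2")  -> Model 2 alignments
# any other string                                             -> both, as a tuple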
def initParameters(self, align_sents):
    # Compute t probabilities from a few iterations of IBM Model 1.
    num_iters = 15
    ibm1 = IBMModel1(align_sents, num_iters)
    t_ef = ibm1.probabilities

    align = defaultdict(lambda: defaultdict(lambda: defaultdict(
        lambda: defaultdict(lambda: 0.0))))

    # Initialise the alignment probabilities with a uniform distribution.
    for alignSent in align_sents:
        en_set = alignSent.words
        fr_set = [None] + alignSent.mots
        m = len(fr_set) - 1
        l = len(en_set)
        for i in range(0, m + 1):
            for j in range(1, l + 1):
                align[i][j][l][m] = 1.0 / (m + 1)

    return t_ef, align
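# Sanity check of the uniform initialization above: with len(mots) == 4
# (so m = 4), each of the m + 1 = 5 source positions, NULL included, gets
# a(i | j, l, m) = 1/5, and the distribution over i sums to 1 for every j.
m = 4
uniform = [1.0 / (m + 1) for _ in range(m + 1)]  # [0.2, 0.2, 0.2, 0.2, 0.2]
assert abs(sum(uniform) - 1.0) < 1e-12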
def __init__(self, sentence_aligned_corpus, iterations):
    """
    Train on ``sentence_aligned_corpus`` and create a lexical
    translation model and an alignment model.

    Translation direction is from ``AlignedSent.mots`` to
    ``AlignedSent.words``.

    Runs a few iterations of Model 1 training to initialize
    model parameters.

    :param sentence_aligned_corpus: Sentence-aligned parallel corpus
    :type sentence_aligned_corpus: list(AlignedSent)

    :param iterations: Number of iterations to run training algorithm
    :type iterations: int
    """
    super(IBMModel2, self).__init__(sentence_aligned_corpus)

    # Get initial translation probability distribution
    # from a few iterations of Model 1 training.
    ibm1 = IBMModel1(sentence_aligned_corpus, 10)
    self.translation_table = ibm1.translation_table

    # Initialize the distribution of alignment probability,
    # a(i | j,l,m) = 1 / (l+1) for all i, j, l, m
    for aligned_sentence in sentence_aligned_corpus:
        l = len(aligned_sentence.mots)
        m = len(aligned_sentence.words)
        initial_value = 1 / (l + 1)
        if initial_value > IBMModel.MIN_PROB:
            for i in range(0, l + 1):
                for j in range(1, m + 1):
                    self.alignment_table[i][j][l][m] = initial_value
        else:
            warnings.warn("Source sentence is too long (" + str(l) +
                          " words). Results may be less accurate.")

    self.train(sentence_aligned_corpus, iterations)
    self.__align_all(sentence_aligned_corpus)
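# Usage example for the constructor above, in the style of the NLTK 3
# nltk.translate doctests it comes from (toy bitext; the exact probability
# values depend on corpus size and iteration count):
from nltk.translate import AlignedSent, IBMModel2

bitext = [
    AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small']),
    AlignedSent(['das', 'haus', 'ist', 'gross'], ['the', 'house', 'is', 'big']),
    AlignedSent(['das', 'haus'], ['the', 'house']),
]
ibm2 = IBMModel2(bitext, 5)  # 5 EM iterations on top of the Model 1 warm start
print(ibm2.translation_table['haus']['house'])  # t(haus | house), should be high
print(bitext[2].alignment)  # word links filled in by __align_all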
def create_ibm1(aligned_sents):
    ibm = IBMModel1(aligned_sents, 10)
    return ibm
def create_ibm1(aligned_sents):
    num_iters = 10
    ibm1 = IBMModel1(aligned_sents, num_iters)
    return ibm1
def create_ibm1(aligned_sents):
    # From the homework pdf:
    # ibm = IBMModel1(aligned_sents, num_iters)
    ibm = IBMModel1(aligned_sents, 10)
    return ibm
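# The three create_ibm1 variants above are equivalent: ten EM iterations of
# Model 1. A usage sketch against the older nltk.align-era interface these
# snippets assume, where `aligned_sents` is a list of AlignedSent pairs and
# `.probabilities` is keyed as t[target_word][source_word]:
ibm1 = create_ibm1(aligned_sents)
t = ibm1.probabilities
# e.g. t['house']['haus'] -> estimated probability that 'haus' aligns with 'house'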
def train(self, aligned_sents, num_iters):
    src_vocab = set()
    trg_vocab = set()
    for aligned_sent in aligned_sents:
        src_vocab.update(aligned_sent.words)
        trg_vocab.update(aligned_sent.mots)
    # Add the NULL token to both vocabularies.
    src_vocab.add(None)
    trg_vocab.add(None)

    # Seed both translation directions with a few Model 1 iterations.
    t_ef = IBMModel1(aligned_sents, 5).probabilities
    corpus_fe = [aligned_sent.invert() for aligned_sent in aligned_sents]
    t_fe = IBMModel1(corpus_fe, 5).probabilities

    q_ef = defaultdict(lambda: defaultdict(lambda: defaultdict(
        lambda: defaultdict(lambda: 0.0))))
    q_fe = defaultdict(lambda: defaultdict(lambda: defaultdict(
        lambda: defaultdict(lambda: 0.0))))

    # Uniform initialization of both alignment tables.
    for aligned_sent in aligned_sents:
        l = len(aligned_sent.mots)
        m = len(aligned_sent.words)
        initial_value = 1.0 / (l + 1)
        for i in range(0, l + 1):
            for j in range(1, m + 1):
                q_ef[i][j][m][l] = initial_value
        initial_value = 1.0 / (m + 1)
        for i in range(0, m + 1):
            for j in range(1, l + 1):
                q_fe[i][j][l][m] = initial_value

    # Start iterations
    for itr in range(0, num_iters):
        count_t_ef = defaultdict(lambda: defaultdict(lambda: 0.0))
        count_t_f = defaultdict(lambda: 0.0)
        count_q_ef = defaultdict(lambda: defaultdict(lambda: defaultdict(
            lambda: defaultdict(lambda: 0.0))))
        count_q_f = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))
        count_t_fe = defaultdict(lambda: defaultdict(lambda: 0.0))
        count_t_e = defaultdict(lambda: 0.0)
        count_q_fe = defaultdict(lambda: defaultdict(lambda: defaultdict(
            lambda: defaultdict(lambda: 0.0))))
        count_q_e = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))

        # Expectation step
        for aligned_sent in aligned_sents:
            # Forward direction: words given mots.
            total_value = defaultdict(lambda: 0.0)
            src = aligned_sent.words
            trg = [None] + aligned_sent.mots
            l = len(trg) - 1
            m = len(src)
            for j in range(1, m + 1):
                src_word = src[j - 1]
                for i in range(0, l + 1):
                    trg_word = trg[i]
                    total_value[src_word] += (t_ef[src_word][trg_word] *
                                              q_ef[i][j][m][l])
            for j in range(1, m + 1):
                src_word = src[j - 1]
                for i in range(0, l + 1):
                    trg_word = trg[i]
                    delta = (t_ef[src_word][trg_word] * q_ef[i][j][m][l] /
                             total_value[src_word])
                    count_t_ef[src_word][trg_word] += delta
                    count_t_f[trg_word] += delta
                    count_q_ef[i][j][m][l] += delta
                    count_q_f[j][m][l] += delta

            # Backward direction: mots given words.
            total_value = defaultdict(lambda: 0.0)
            src = aligned_sent.mots
            trg = [None] + aligned_sent.words
            l = len(trg) - 1
            m = len(src)
            for j in range(1, m + 1):
                src_word = src[j - 1]
                for i in range(0, l + 1):
                    trg_word = trg[i]
                    total_value[src_word] += (t_fe[src_word][trg_word] *
                                              q_fe[i][j][l][m])
            for j in range(1, m + 1):
                src_word = src[j - 1]
                for i in range(0, l + 1):
                    trg_word = trg[i]
                    delta = (t_fe[src_word][trg_word] * q_fe[i][j][l][m] /
                             total_value[src_word])
                    count_t_fe[src_word][trg_word] += delta
                    count_t_e[trg_word] += delta
                    count_q_fe[i][j][l][m] += delta
                    count_q_e[j][l][m] += delta

        # Average the counts from the two directions (symmetrization).
        for src_word in count_t_ef:
            for trg_word in count_t_ef[src_word]:
                avg = (count_t_ef[src_word][trg_word] +
                       count_t_fe[trg_word][src_word]) / 2.0
                count_t_ef[src_word][trg_word] = avg
                count_t_fe[trg_word][src_word] = avg
        for aligned_sent in aligned_sents:
            m = len(aligned_sent.words)
            l = len(aligned_sent.mots)
            for j in range(1, m + 1):
                for i in range(0, l + 1):
                    avg = (count_q_ef[i][j][m][l] +
                           count_q_fe[j][i][l][m]) / 2.0
                    count_q_ef[i][j][m][l] = avg
                    count_q_fe[j][i][l][m] = avg

        t_ef = defaultdict(lambda: defaultdict(lambda: 0.0))
        q_ef = defaultdict(lambda: defaultdict(lambda: defaultdict(
            lambda: defaultdict(lambda: 0.0))))
        t_fe = defaultdict(lambda: defaultdict(lambda: 0.0))
        q_fe = defaultdict(lambda: defaultdict(lambda: defaultdict(
            lambda: defaultdict(lambda: 0.0))))

        # M-step: renormalize the averaged counts into probabilities.
        for trg_word in trg_vocab:
            for src_word in src_vocab:
                t_ef[src_word][trg_word] = (count_t_ef[src_word][trg_word] /
                                            count_t_f[trg_word])
        for src_word in src_vocab:
            for trg_word in trg_vocab:
                t_fe[trg_word][src_word] = (count_t_fe[trg_word][src_word] /
                                            count_t_e[src_word])

        # Estimate the new alignment probabilities.
        for aligned_sent in aligned_sents:
            l = len(aligned_sent.mots)
            m = len(aligned_sent.words)
            for i in range(0, l + 1):
                for j in range(1, m + 1):
                    q_ef[i][j][m][l] = (count_q_ef[i][j][m][l] /
                                        count_q_f[j][m][l])
            for i in range(0, m + 1):
                for j in range(1, l + 1):
                    q_fe[i][j][l][m] = (count_q_fe[i][j][l][m] /
                                        count_q_e[j][l][m])

    t = t_ef
    q = q_ef
    return t, q
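# The distinctive step in the train method above is symmetrization: after
# every E step, expected counts from the two translation directions are
# averaged before the M step renormalizes them. A standalone sketch of that
# averaging for the lexical counts (assumes defaultdict-style tables, as in
# the method above; the names are local to this example):
def average_lexical_counts(count_ef, count_fe):
    # count_ef[e][f] and count_fe[f][e] describe the same word pair seen
    # from opposite directions; replace both with their mean.
    for e in count_ef:
        for f in count_ef[e]:
            avg = (count_ef[e][f] + count_fe[f][e]) / 2.0
            count_ef[e][f] = avg
            count_fe[f][e] = avg
    return count_ef, count_fe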
def train(self, aligned_sents, num_iters):
    MIN_PROB = 1.0e-12
    t = defaultdict(lambda: defaultdict(lambda: MIN_PROB))
    q = defaultdict(lambda: defaultdict(lambda: defaultdict(
        lambda: defaultdict(lambda: MIN_PROB))))

    # Train model 1: target --> source
    src_vocab = set()
    trg_vocab = set()
    for aligned_sentence in aligned_sents:
        trg_vocab.update(aligned_sentence.words)
        src_vocab.update(aligned_sentence.mots)
    # Add the NULL token
    src_vocab.add(None)

    translation_table1 = defaultdict(lambda: defaultdict(lambda: MIN_PROB))
    alignment_table1 = defaultdict(lambda: defaultdict(lambda: defaultdict(
        lambda: defaultdict(lambda: MIN_PROB))))

    # Initialize the translation probability distribution
    # from a few iterations of Model 1 training.
    ibm1 = IBMModel1(aligned_sents, 6)
    translation_table1 = ibm1.probabilities

    # Initialize alignment probability distribution,
    # a(i | j,l,m) = 1 / (l+1) for all i, j, l, m
    for aligned_sentence in aligned_sents:
        l = len(aligned_sentence.mots)
        m = len(aligned_sentence.words)
        initial_value = 1.0 / (l + 1)
        for i in range(l + 1):
            for j in range(1, m + 1):
                alignment_table1[i][j][l][m] = initial_value

    for i in range(num_iters):
        count_t_given_s1 = defaultdict(lambda: defaultdict(float))
        count_any_t_given_s1 = defaultdict(float)
        # Count of i given j, l, m
        alignment_count1 = defaultdict(lambda: defaultdict(lambda: defaultdict(
            lambda: defaultdict(lambda: 0.0))))
        alignment_count_for_any_i1 = defaultdict(lambda: defaultdict(
            lambda: defaultdict(lambda: 0.0)))

        for aligned_sentence in aligned_sents:
            src_sentence = [None] + aligned_sentence.mots
            trg_sentence = ['UNUSED'] + aligned_sentence.words  # 1-indexed
            l = len(aligned_sentence.mots)
            m = len(aligned_sentence.words)
            total_count = defaultdict(float)

            # E step (a): Compute normalization factors to weigh counts
            for j in range(1, m + 1):
                wt = trg_sentence[j]
                total_count[wt] = 0
                for i in range(l + 1):
                    ws = src_sentence[i]
                    count = (translation_table1[wt][ws] *
                             alignment_table1[i][j][l][m])
                    total_count[wt] += count

            # E step (b): Collect counts
            for j in range(1, m + 1):
                wt = trg_sentence[j]
                for i in range(l + 1):
                    ws = src_sentence[i]
                    count = (translation_table1[wt][ws] *
                             alignment_table1[i][j][l][m])
                    normalized_count = count / total_count[wt]
                    count_t_given_s1[wt][ws] += normalized_count
                    count_any_t_given_s1[ws] += normalized_count
                    alignment_count1[i][j][l][m] += normalized_count
                    alignment_count_for_any_i1[j][l][m] += normalized_count

        # M step: Update probabilities with maximum likelihood estimates
        for ws in src_vocab:
            for wt in trg_vocab:
                estimate = count_t_given_s1[wt][ws] / count_any_t_given_s1[ws]
                translation_table1[wt][ws] = max(estimate, MIN_PROB)

        for aligned_sentence in aligned_sents:
            l = len(aligned_sentence.mots)
            m = len(aligned_sentence.words)
            for i in range(l + 1):
                for j in range(1, m + 1):
                    estimate = (alignment_count1[i][j][l][m] /
                                alignment_count_for_any_i1[j][l][m])
                    alignment_table1[i][j][l][m] = max(estimate, MIN_PROB)

    # Train model 2: source --> target
    src_vocab = set()
    trg_vocab = set()
    for aligned_sentence in aligned_sents:
        trg_vocab.update(aligned_sentence.mots)
        src_vocab.update(aligned_sentence.words)
    # Add the NULL token
    src_vocab.add(None)

    translation_table2 = defaultdict(lambda: defaultdict(lambda: MIN_PROB))
    alignment_table2 = defaultdict(lambda: defaultdict(lambda: defaultdict(
        lambda: defaultdict(lambda: MIN_PROB))))

    # Initialize the translation probability distribution
    # from a few iterations of Model 1 training.
    ibm1 = IBMModel1(aligned_sents, 6)
    translation_table2 = ibm1.probabilities

    # Initialize alignment probability distribution,
    # a(i | j,l,m) = 1 / (l+1) for all i, j, l, m
    for aligned_sentence in aligned_sents:
        l = len(aligned_sentence.words)
        m = len(aligned_sentence.mots)
        initial_value = 1.0 / (l + 1)
        for i in range(l + 1):
            for j in range(1, m + 1):
                alignment_table2[i][j][l][m] = initial_value

    for i in range(num_iters):
        count_t_given_s2 = defaultdict(lambda: defaultdict(float))
        count_any_t_given_s2 = defaultdict(float)
        # Count of i given j, l, m
        alignment_count2 = defaultdict(lambda: defaultdict(lambda: defaultdict(
            lambda: defaultdict(lambda: 0.0))))
        alignment_count_for_any_i2 = defaultdict(lambda: defaultdict(
            lambda: defaultdict(lambda: 0.0)))

        for aligned_sentence in aligned_sents:
            src_sentence = [None] + aligned_sentence.words
            trg_sentence = ['UNUSED'] + aligned_sentence.mots  # 1-indexed
            l = len(aligned_sentence.words)
            m = len(aligned_sentence.mots)
            total_count = defaultdict(float)

            # E step (a): Compute normalization factors to weigh counts
            for j in range(1, m + 1):
                wt = trg_sentence[j]
                total_count[wt] = 0
                for i in range(l + 1):
                    ws = src_sentence[i]
                    count = (translation_table2[wt][ws] *
                             alignment_table2[i][j][l][m])
                    total_count[wt] += count

            # E step (b): Collect counts
            for j in range(1, m + 1):
                wt = trg_sentence[j]
                for i in range(l + 1):
                    ws = src_sentence[i]
                    count = (translation_table2[wt][ws] *
                             alignment_table2[i][j][l][m])
                    normalized_count = count / total_count[wt]
                    count_t_given_s2[wt][ws] += normalized_count
                    count_any_t_given_s2[ws] += normalized_count
                    alignment_count2[i][j][l][m] += normalized_count
                    alignment_count_for_any_i2[j][l][m] += normalized_count

        # M step: Update probabilities with maximum likelihood estimates
        for ws in src_vocab:
            for wt in trg_vocab:
                estimate = count_t_given_s2[wt][ws] / count_any_t_given_s2[ws]
                translation_table2[wt][ws] = max(estimate, MIN_PROB)

        for aligned_sentence in aligned_sents:
            l = len(aligned_sentence.words)
            m = len(aligned_sentence.mots)
            for i in range(l + 1):
                for j in range(1, m + 1):
                    estimate = (alignment_count2[i][j][l][m] /
                                alignment_count_for_any_i2[j][l][m])
                    alignment_table2[i][j][l][m] = max(estimate, MIN_PROB)

    # Average the two models: translation probabilities are re-estimated
    # from the pooled counts, alignment probabilities are interpolated.
    for ws in trg_vocab:
        for wt in src_vocab:
            translation_table1[wt][ws] = (
                (count_t_given_s1[wt][ws] + count_t_given_s2[ws][wt]) /
                (count_any_t_given_s1[ws] + count_any_t_given_s2[wt]))

    for aligned_sentence in aligned_sents:
        l = len(aligned_sentence.mots)
        m = len(aligned_sentence.words)
        for i in range(l + 1):
            for j in range(1, m + 1):
                alignment_table1[i][j][l][m] = (
                    alignment_table1[i][j][l][m] * 0.57 +
                    alignment_table2[j][i][m][l] * 0.43)

    t = translation_table1
    q = alignment_table1
    return (t, q)
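# The combination step above does two different things: translation
# probabilities are re-estimated from the pooled expected counts of both
# directions, while alignment probabilities are linearly interpolated with
# hand-tuned weights (0.57 / 0.43). Minimal sketches of both combinators;
# the function names are illustrative, not taken from the method above.
def pooled_translation_estimate(c_fwd, n_fwd, c_bwd, n_bwd):
    # MLE from pooled expected counts: (c1 + c2) / (n1 + n2).
    return (c_fwd + c_bwd) / (n_fwd + n_bwd)

def interpolated_alignment(p_fwd, p_bwd, weight=0.57):
    # Convex combination; since the weights sum to 1, mixing two
    # probabilities yields a value that is still in [0, 1].
    return weight * p_fwd + (1.0 - weight) * p_bwd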
def create_ibm1(aligned_sents):
    return IBMModel1(aligned_sents, 10)  # changed to 10 iterations
def train(self, aligned_sents, num_iters):
    # Initialization step of EM: seed both translation directions with
    # IBM Model 1 probabilities. (Might have to change this to the
    # IBMModel1 init method instead of this function.)
    ibm1 = IBMModel1(aligned_sents, 10)
    t_eg = ibm1.probabilities
    t_ge = ibm1.probabilities
    align_eg = self.initEM(aligned_sents, 'Forwards')  # forwards for the first set
    align_ge = self.initEM(aligned_sents, 'Backwards')
    align_eg_copy = align_eg.copy()

    # Make vocabulary sets for each language.
    ger_vocab = set()
    en_vocab = set()
    for sent in aligned_sents:
        en_vocab.update(sent.words)
        ger_vocab.update(sent.mots)
    en_vocab.add(None)
    ger_vocab.add(None)

    # Begin EM iterations.
    for n in range(0, num_iters):
        print('Iter: ' + str(n + 1) + ' of ' + str(num_iters))

        count_eg = defaultdict(lambda: defaultdict(lambda: 0.0))  # counts going forward
        count_ge = defaultdict(lambda: defaultdict(lambda: 0.0))  # counts going backward
        total_g = defaultdict(lambda: 0.0)
        total_e = defaultdict(lambda: 0.0)
        # Denominators for the normalization equation.
        total_enorm = defaultdict(lambda: 0.0)
        total_gnorm = defaultdict(lambda: 0.0)
        count_align_eg = defaultdict(lambda: defaultdict(lambda: defaultdict(
            lambda: defaultdict(lambda: 0.0))))
        total_align_eg = defaultdict(lambda: defaultdict(lambda: defaultdict(
            lambda: 0.0)))
        count_align_ge = defaultdict(lambda: defaultdict(lambda: defaultdict(
            lambda: defaultdict(lambda: 0.0))))
        total_align_ge = defaultdict(lambda: defaultdict(lambda: defaultdict(
            lambda: 0.0)))

        for sent in aligned_sents:
            german = [None] + sent.mots
            english = [None] + sent.words
            ger_len = len(german) - 1
            en_len = len(english) - 1

            # This uses the delta equation on page 13 of Collins' notes.
            # First, get the sums for the denominators.
            # Forwards:
            for j in range(1, en_len + 1):
                en_word = english[j]
                for i in range(0, ger_len + 1):  # start at 0 to include NULL
                    ger_word = german[i]
                    add_en = t_eg[en_word][ger_word] * align_eg[i][j][en_len][ger_len]
                    total_enorm[en_word] += add_en
            # Backwards:
            for j in range(1, ger_len + 1):
                ger_word = german[j]
                for i in range(0, en_len + 1):
                    en_word = english[i]
                    add_ger = t_ge[ger_word][en_word] * align_ge[i][j][ger_len][en_len]
                    total_gnorm[ger_word] += add_ger

            # Compute counts using delta.
            # Forwards:
            for j in range(1, en_len + 1):
                en_word = english[j]
                for i in range(0, ger_len + 1):
                    ger_word = german[i]
                    add_en = t_eg[en_word][ger_word] * align_eg[i][j][en_len][ger_len]
                    en_delta = add_en / total_enorm[en_word]
                    # Now that we have delta, add it to all of the counts.
                    count_eg[en_word][ger_word] += en_delta
                    total_g[ger_word] += en_delta
                    count_align_eg[i][j][en_len][ger_len] += en_delta
                    total_align_eg[j][en_len][ger_len] += en_delta
            # Backwards:
            for j in range(1, ger_len + 1):
                ger_word = german[j]
                for i in range(0, en_len + 1):
                    en_word = english[i]
                    add_ger = t_ge[ger_word][en_word] * align_ge[i][j][ger_len][en_len]
                    ger_delta = add_ger / total_gnorm[ger_word]
                    count_ge[ger_word][en_word] += ger_delta
                    total_e[en_word] += ger_delta
                    count_align_ge[i][j][ger_len][en_len] += ger_delta
                    total_align_ge[j][ger_len][en_len] += ger_delta

        # Smoothing the counts for alignments (adapted from the IBM2 pydoc).
        for alignSent in aligned_sents:
            en_set = alignSent.words
            fr_set = [None] + alignSent.mots
            l_f = len(fr_set) - 1
            l_e = len(en_set)

            laplace = 1.0
            for i in range(0, l_f + 1):
                for j in range(1, l_e + 1):
                    value = count_align_eg[i][j][l_e][l_f]
                    if 0 < value < laplace:
                        laplace = value
            laplace *= 0.5
            for i in range(0, l_f + 1):
                for j in range(1, l_e + 1):
                    count_align_eg[i][j][l_e][l_f] += laplace
            initial_value = laplace * l_e
            for j in range(1, l_e + 1):
                total_align_eg[j][l_e][l_f] += initial_value

        for alignSent in aligned_sents:
            en_set = alignSent.mots
            fr_set = [None] + alignSent.words
            l_f = len(fr_set) - 1
            l_e = len(en_set)

            laplace = 1.0
            for i in range(0, l_f + 1):
                for j in range(1, l_e + 1):
                    value = count_align_ge[i][j][l_e][l_f]
                    if 0 < value < laplace:
                        laplace = value
            laplace *= 0.5
            for i in range(0, l_f + 1):
                for j in range(1, l_e + 1):
                    count_align_ge[i][j][l_e][l_f] += laplace
            initial_value = laplace * l_e
            for j in range(1, l_e + 1):
                total_align_ge[j][l_e][l_f] += initial_value

        # Now we can find the next round of translation and distortion
        # probabilities.
        t_eg_new = defaultdict(lambda: defaultdict(lambda: 0.0))
        align_eg_new = defaultdict(lambda: defaultdict(lambda: defaultdict(
            lambda: defaultdict(lambda: 0.0))))
        t_ge_new = defaultdict(lambda: defaultdict(lambda: 0.0))
        align_ge_new = defaultdict(lambda: defaultdict(lambda: defaultdict(
            lambda: defaultdict(lambda: 0.0))))

        # Estimating lexical translation probabilities.
        for en_word in en_vocab:
            for ger_word in ger_vocab:
                try:
                    t_eg_new[en_word][ger_word] = count_eg[en_word][ger_word] / total_g[ger_word]
                    t_ge_new[ger_word][en_word] = count_ge[ger_word][en_word] / total_e[en_word]
                except ZeroDivisionError:
                    print('Zero total count for pair:')
                    print(ger_word)
                    print(en_word)
                    raise

        # Estimating new alignment probabilities.
        for sent in aligned_sents:
            german = [None] + sent.mots
            english = [None] + sent.words
            ger_len = len(german) - 1
            en_len = len(english) - 1
            # Forwards:
            for j in range(1, en_len + 1):
                for i in range(0, ger_len + 1):
                    align_eg_new[i][j][en_len][ger_len] = (
                        count_align_eg[i][j][en_len][ger_len] /
                        total_align_eg[j][en_len][ger_len])
            # Backwards:
            for j in range(1, ger_len + 1):
                for i in range(0, en_len + 1):
                    align_ge_new[i][j][ger_len][en_len] = (
                        count_align_ge[i][j][ger_len][en_len] /
                        total_align_ge[j][ger_len][en_len])

        # Set values so the next iteration uses the new estimates.
        t_eg = t_eg_new.copy()
        t_ge = t_ge_new.copy()
        align_eg = align_eg_new.copy()
        align_ge = align_ge_new.copy()

        # Now we average the distortion and translation probabilities
        # across the two directions.
        for sent in aligned_sents:
            german = [None] + sent.mots
            english = [None] + sent.words
            ger_len = len(german) - 1
            en_len = len(english) - 1
            # Averaging the forward and backward q (distortion) values.
            for j in range(1, en_len + 1):
                for i in range(0, ger_len + 1):
                    en_val = align_eg[i][j][en_len][ger_len]
                    ger_val = align_ge[j][i][ger_len][en_len]
                    align_eg[i][j][en_len][ger_len] = (en_val + ger_val) / 2.0

        # Averaging the values for t (translation).
        for en_word in t_eg:
            for ger_word in t_ge:
                en_count = t_eg[en_word][ger_word]
                ger_count = t_ge[ger_word][en_word]
                t_eg[en_word][ger_word] = (en_count + ger_count) / 2.0

    # Return signature provided by the professor. Using a copy of the
    # initialized q improves the results, which means align_eg as it
    # iterates hurts the accuracy.
    t = t_eg.copy()
    q = align_eg_copy.copy()
    return (t, q)
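# Once train returns (t, q), alignments are read off in the standard IBM
# Model 2 way: link each target position j to the source position i that
# maximizes t(target_j | source_i) * q(i | j, m, l). A sketch assuming the
# same conventions as the methods above: a [None]-prefixed source sentence,
# t keyed as t[target][source], and q keyed as q[i][j][target_len][source_len].
def best_alignment(t, q, src_words, trg_words):
    src = [None] + src_words
    l = len(src_words)   # source length
    m = len(trg_words)   # target length
    links = []
    for j in range(1, m + 1):
        trg_word = trg_words[j - 1]
        best_i = max(range(l + 1),
                     key=lambda i: t[trg_word][src[i]] * q[i][j][m][l])
        if best_i != 0:  # position 0 is the NULL word; leave such words unlinked
            links.append((j - 1, best_i - 1))
    return links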