def __init__(self, sentence_aligned_corpus, iterations):
    """
    Build an IBM Model 3 from ``sentence_aligned_corpus``.

    Training yields a lexical translation model, a distortion model,
    a fertility model, and a model for generating NULL-aligned words.
    Translation direction is from ``AlignedSent.mots`` to
    ``AlignedSent.words``.

    An IBM Model 2 is trained first, for ``iterations`` rounds, and
    its tables are copied onto this instance before ``train`` is
    invoked with the same arguments.

    :param sentence_aligned_corpus: Sentence-aligned parallel corpus
    :type sentence_aligned_corpus: list(AlignedSent)

    :param iterations: Number of iterations to run training algorithm
    :type iterations: int
    """
    super(IBMModel3, self).__init__(sentence_aligned_corpus)

    # Bootstrap from IBM Model 2. Its alignment table is only used
    # for hill climbing and is not part of the output of Model 3
    # training.
    model2 = IBMModel2(sentence_aligned_corpus, iterations)
    self.translation_table, self.alignment_table = (
        model2.translation_table,
        model2.alignment_table,
    )

    self.train(sentence_aligned_corpus, iterations)
def testmain(aligned_sents):
    """
    Train IBM Model 1 and IBM Model 2 and print their average AER.

    For each iteration count ``n`` in ``range(19, 20)`` (i.e. only 19),
    ``n`` is printed, then each model is trained on ``aligned_sents``
    with ``n`` iterations and scored with
    ``compute_avg_aer(aligned_sents, model, 50)``.

    :param aligned_sents: Corpus handed unchanged to the model
        constructors and to ``compute_avg_aer``
    """
    separator = '---------------------------'

    def report(title, model):
        # The literal 50 is forwarded as-is; its meaning is defined by
        # compute_avg_aer, which is not visible here.
        score = compute_avg_aer(aligned_sents, model, 50)
        print(title)
        print(separator)
        print('Average AER: {0:.3f}\n'.format(score))

    for n in range(19, 20):
        print('%s\n' % (n))
        # Model 1 is trained and reported before Model 2 is built.
        report('IBM Model 1', IBMModel1(aligned_sents, n))
        report('IBM Model 2', IBMModel2(aligned_sents, n))
def test_prob_t_a_given_s(self):
    """
    Check ``IBMModel2.prob_t_a_given_s`` against a hand-built model.

    The translation and alignment tables are filled in only for the
    links used by the alignment under test, so the expected value is
    the product of those translation and alignment probabilities,
    compared after rounding to 4 decimal places.
    """
    # arrange
    src_sentence = ['ich', 'esse', 'ja', 'gern', 'räucherschinken']
    trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
    corpus = [AlignedSent(trg_sentence, src_sentence)]
    alignment_info = AlignmentInfo(
        (0, 1, 4, 0, 2, 5, 5),
        [None] + src_sentence,
        ['UNUSED'] + trg_sentence,
        None
    )

    # (target word, source word) pairs linked by the alignment
    linked_words = [
        ('i', 'ich'),
        ('love', 'gern'),
        ('to', None),
        ('eat', 'esse'),
        ('smoked', 'räucherschinken'),
        ('ham', 'räucherschinken'),
    ]
    translation_table = defaultdict(lambda: defaultdict(float))
    for trg_word, src_word in linked_words:
        translation_table[trg_word][src_word] = 0.98

    # (source position, target position, probability); the remaining
    # two keys are fixed at 5 and 6 for every entry
    linked_positions = [
        (0, 3, 0.97),  # None -> to
        (1, 1, 0.97),  # ich -> i
        (2, 4, 0.97),  # esse -> eat
        (4, 2, 0.97),  # gern -> love
        (5, 5, 0.96),  # räucherschinken -> smoked
        (5, 6, 0.96),  # räucherschinken -> ham
    ]
    alignment_table = defaultdict(
        lambda: defaultdict(lambda: defaultdict(
            lambda: defaultdict(float))))
    for src_pos, trg_pos, link_prob in linked_positions:
        alignment_table[src_pos][trg_pos][5][6] = link_prob

    # Zero training iterations: the tables are replaced by hand below
    model2 = IBMModel2(corpus, 0)
    model2.translation_table = translation_table
    model2.alignment_table = alignment_table

    # act
    probability = model2.prob_t_a_given_s(alignment_info)

    # assert
    lexical_translation = 1.0
    for _ in linked_words:
        lexical_translation *= 0.98
    alignment = 1.0
    for _, _, link_prob in linked_positions:
        alignment *= link_prob
    expected_probability = lexical_translation * alignment
    self.assertEqual(round(probability, 4),
                     round(expected_probability, 4))
def create_ibm2(aligned_sents, iterations=10):
    """
    Construct an ``IBMModel2`` from ``aligned_sents``.

    :param aligned_sents: Corpus passed unchanged to ``IBMModel2``
    :param iterations: Second argument passed to ``IBMModel2``.
        Defaults to 10, the value that was previously hard-coded, so
        existing one-argument calls behave as before.
    :type iterations: int
    :return: The constructed ``IBMModel2`` instance
    """
    return IBMModel2(aligned_sents, iterations)
def train(self, parallel_corpus, iterations):
    """
    Learns and sets probability tables

    Seeds ``translation_table`` and ``alignment_table`` from a freshly
    trained IBM Model 2, then runs ``iterations`` rounds of EM over
    the alignments returned by ``self.sample``. Every round replaces
    ``translation_table``, ``distortion_table`` and
    ``fertility_table`` and re-estimates ``p0`` from the collected
    counts.

    Degenerate counts no longer raise ``ZeroDivisionError``:

    - a sentence pair whose sampled alignments have a total
      probability of zero contributes no counts;
    - an estimate whose normalising count is zero is set to 0.0;
    - ``p0`` keeps its previous value when no NULL-generation counts
      were collected.

    :param parallel_corpus: Sentence pairs exposing ``words`` (target
        sentence) and ``mots`` (source sentence, a list)
    :param iterations: Number of EM iterations; also passed to the
        Model 2 trainer
    :type iterations: int
    """
    # Get the translation and alignment probabilities from IBM model 2
    ibm2 = IBMModel2(parallel_corpus, iterations)
    self.translation_table = ibm2.translation_table
    """
    dict(dict(float)): probability(target word | source word).
    Values accessed with ``translation_table[target_word][source_word].``
    """

    self.alignment_table = ibm2.alignment_table

    src_vocab = set()
    trg_vocab = set()
    for aligned_sentence in parallel_corpus:
        trg_vocab.update(aligned_sentence.words)
        src_vocab.update(aligned_sentence.mots)
    # Add the NULL token
    src_vocab.add(None)

    # Initial probability of null insertion
    self.p0 = 0.5
    """
    float: probability that a generated word does not require another
    target word that is aligned to NULL
    """

    self.fertility_table = defaultdict(
        lambda: defaultdict(lambda: self.PROB_SMOOTH))
    """
    dict(dict(float)): probability(fertility | source word).
    Values accessed with ``fertility_table[fertility][source_word].``
    """

    self.distortion_table = defaultdict(
        lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
            lambda: self.PROB_SMOOTH))))
    """
    dict(dict(dict(dict(float)))): probability(j | i,l,m).
    Values accessed with ``distortion_table[j][i][m][l].``
    """

    def ratio(numerator, denominator):
        # Maximum likelihood estimate that tolerates an empty
        # normaliser: nothing was observed, so the estimate is 0.0.
        return numerator / denominator if denominator else 0.0

    for _ in range(iterations):
        max_fertility = 0

        # Reset all counts
        count_t_given_s = defaultdict(lambda: defaultdict(float))
        count_any_t_given_s = defaultdict(float)

        distortion_count = defaultdict(lambda: defaultdict(
            lambda: defaultdict(lambda: defaultdict(float))))
        distortion_count_for_any_j = defaultdict(
            lambda: defaultdict(lambda: defaultdict(float)))

        count_p0 = 0.0
        count_p1 = 0.0

        fertility_count = defaultdict(lambda: defaultdict(float))
        fertility_count_for_any_phi = defaultdict(float)

        for aligned_sentence in parallel_corpus:
            trg_sentence = aligned_sentence.words
            src_sentence = [None] + aligned_sentence.mots
            l = len(src_sentence) - 1
            m = len(trg_sentence)

            # Sample the alignment space
            sampled_alignments = self.sample(trg_sentence, src_sentence)

            # E step (a): Compute normalization factors to weigh counts.
            # Each alignment is scored once and the score is reused in
            # step (b).
            scored_alignments = []
            total_count = 0.0
            for (alignment, fert) in sampled_alignments:
                count = self.probability(alignment, trg_sentence,
                                         src_sentence, fert)
                scored_alignments.append((alignment, count))
                total_count += count

            if not total_count:
                # Nothing to normalise by; this pair gives no evidence
                continue

            # E step (b): Collect counts
            for (alignment, count) in scored_alignments:
                normalized_count = count / total_count

                # fertilities[i]: number of target words linked to
                # source position i under this alignment
                fertilities = [0] * (l + 1)

                for j in range(1, m + 1):
                    t = trg_sentence[j - 1]
                    i = alignment[j]
                    s = src_sentence[i]

                    # Lexical translation
                    count_t_given_s[t][s] += normalized_count
                    count_any_t_given_s[s] += normalized_count

                    # Distortion
                    distortion_count[j][i][m][l] += normalized_count
                    distortion_count_for_any_j[i][m][l] += normalized_count

                    fertilities[i] += 1

                # NULL-aligned words generation; position 0 is NULL
                null_count = fertilities[0]
                count_p1 += null_count * normalized_count
                count_p0 += (m - 2 * null_count) * normalized_count

                # Fertility
                for i in range(0, l + 1):
                    fertility = fertilities[i]
                    s = src_sentence[i]
                    fertility_count[fertility][s] += normalized_count
                    fertility_count_for_any_phi[s] += normalized_count

                    if fertility > max_fertility:
                        max_fertility = fertility

        self.translation_table = defaultdict(
            lambda: defaultdict(lambda: 0.0))
        self.distortion_table = defaultdict(lambda: defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: 0.0))))
        self.fertility_table = defaultdict(
            lambda: defaultdict(lambda: 0.0))

        # M step: Update probabilities with maximum likelihood estimates

        # Lexical translation
        for s in src_vocab:
            any_t_given_s = count_any_t_given_s[s]
            for t in trg_vocab:
                self.translation_table[t][s] = ratio(
                    count_t_given_s[t][s], any_t_given_s)

        # Distortion
        for aligned_sentence in parallel_corpus:
            trg_sentence = aligned_sentence.words
            src_sentence = [None] + aligned_sentence.mots
            l = len(src_sentence) - 1
            m = len(trg_sentence)

            for i in range(0, l + 1):
                any_j = distortion_count_for_any_j[i][m][l]
                for j in range(1, m + 1):
                    self.distortion_table[j][i][m][l] = ratio(
                        distortion_count[j][i][m][l], any_j)

        # Fertility
        for fertility in range(0, max_fertility + 1):
            for s in src_vocab:
                self.fertility_table[fertility][s] = ratio(
                    fertility_count[fertility][s],
                    fertility_count_for_any_phi[s])

        # NULL-aligned words generation
        null_generation_total = count_p1 + count_p0
        if null_generation_total:
            p1 = count_p1 / null_generation_total
            self.p0 = 1 - p1
def train(self, align_sents, num_iter):
    """
    This function is the main process of training model, which
    initialize all the probability distributions and executes
    a specific number of iterations.

    ``self.probabilities`` and ``self.align_table`` are seeded from a
    freshly trained IBM Model 2. Each of the ``num_iter`` rounds then
    samples alignments with ``self.sample``, weighs them with
    ``self.probability`` and re-estimates ``self.probabilities``,
    ``self.distortion``, ``self.fertility`` and
    ``self.null_insertion``.

    Degenerate counts no longer raise ``ZeroDivisionError``: a
    sentence pair whose sampled alignments have zero total probability
    is skipped, an estimate with a zero normalising count is set to
    0.0, and ``self.null_insertion`` is left unchanged when no
    null-insertion counts were collected.

    :param align_sents: Sentence pairs exposing ``words`` and ``mots``
        (``mots`` must be a list)
    :param num_iter: Number of training iterations; also passed to
        the Model 2 trainer
    :type num_iter: int
    """
    # Get the translation and alignment probabilities from IBM model 2
    ibm2 = IBMModel2(align_sents, num_iter)
    self.probabilities, self.align_table = (ibm2.probabilities,
                                            ibm2.alignments)

    fr_vocab = set()
    en_vocab = set()
    for alignSent in align_sents:
        en_vocab.update(alignSent.words)
        fr_vocab.update(alignSent.mots)
    # The NULL token is part of the ``mots``-side vocabulary
    fr_vocab.add(None)

    # Initial probability of null insertion.
    self.null_insertion = 0.5

    self.fertility = defaultdict(
        lambda: defaultdict(lambda: self.PROB_SMOOTH))
    self.distortion = defaultdict(lambda: defaultdict(lambda: defaultdict(
        lambda: defaultdict(lambda: self.PROB_SMOOTH))))

    def estimate(count, total):
        # Relative frequency; 0.0 when nothing was observed for the
        # conditioning event, instead of dividing by zero.
        return count / total if total else 0.0

    for _ in range(num_iter):
        max_fert = 0
        # Set all count* and total* to 0
        count_t = defaultdict(lambda: defaultdict(float))
        total_t = defaultdict(float)

        count_d = defaultdict(lambda: defaultdict(lambda: defaultdict(
            lambda: defaultdict(float))))
        total_d = defaultdict(
            lambda: defaultdict(lambda: defaultdict(float)))

        count_p0 = 0.0
        count_p1 = 0.0

        count_f = defaultdict(lambda: defaultdict(float))
        total_f = defaultdict(float)

        for alignSent in align_sents:
            en_set = alignSent.words
            fr_set = [None] + alignSent.mots
            l_f = len(fr_set) - 1
            l_e = len(en_set)

            # Sample the alignment space
            A = self.sample(en_set, fr_set)

            # Score every sampled alignment once; the scores serve both
            # as the normaliser and as the per-alignment weights.
            weighted = []
            c_total = 0.0
            for (a, fert) in A:
                score = self.probability(a, en_set, fr_set, fert)
                weighted.append((a, score))
                c_total += score

            if not c_total:
                # No probability mass to distribute for this pair
                continue

            # Collect counts
            for (a, score) in weighted:
                c = score / c_total

                # linked[i]: how many positions j have a[j] == i
                linked = [0] * (l_f + 1)

                for j in range(1, l_e + 1):
                    en_word = en_set[j - 1]
                    fr_word = fr_set[a[j]]

                    # Lexical translation
                    count_t[en_word][fr_word] += c
                    total_t[fr_word] += c

                    # Distortion
                    count_d[j][a[j]][l_e][l_f] += c
                    total_d[a[j]][l_e][l_f] += c

                    linked[a[j]] += 1

                # Collect the counts of null insertion; index 0 of
                # fr_set is the NULL token
                null = linked[0]
                count_p1 += null * c
                count_p0 += (l_e - 2 * null) * c

                # Collect the counts of fertility
                for i in range(0, l_f + 1):
                    fertility = linked[i]
                    fr_word = fr_set[i]
                    count_f[fertility][fr_word] += c
                    total_f[fr_word] += c

                    if fertility > max_fert:
                        max_fert = fertility

        self.probabilities = defaultdict(lambda: defaultdict(lambda: 0.0))
        self.distortion = defaultdict(lambda: defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: 0.0))))
        self.fertility = defaultdict(lambda: defaultdict(lambda: 0.0))

        # Estimate translation probability distribution
        for f in fr_vocab:
            total_for_f = total_t[f]
            for e in en_vocab:
                self.probabilities[e][f] = estimate(count_t[e][f],
                                                    total_for_f)

        # Estimate distortion
        for alignSent in align_sents:
            en_set = alignSent.words
            fr_set = [None] + alignSent.mots
            l_f = len(fr_set) - 1
            l_e = len(en_set)

            for i in range(0, l_f + 1):
                total_for_i = total_d[i][l_e][l_f]
                for j in range(1, l_e + 1):
                    self.distortion[j][i][l_e][l_f] = estimate(
                        count_d[j][i][l_e][l_f], total_for_i)

        # Estimate the fertility, n(Fertility | input word)
        for ferti in range(0, max_fert + 1):
            for fr_word in fr_vocab:
                self.fertility[ferti][fr_word] = estimate(
                    count_f[ferti][fr_word], total_f[fr_word])

        # Estimate the probability of null insertion
        p_total = count_p1 + count_p0
        if p_total:
            p1 = count_p1 / p_total
            self.null_insertion = 1 - p1