def hillclimb(self, alignment_info, j_pegged=None):
    """
    Greedily climb from ``alignment_info`` towards a higher-scoring
    alignment, using the Model 4 probability as the score.

    Every round scores all neighbors of the current alignment and
    moves to the highest-scoring neighbor that strictly beats the
    current best. The search ends after a round in which the
    alignment did not change.

    Model 4 scoring is used instead of Model 5 because the latter is
    too expensive to compute. The result may be a local maximum
    rather than the best alignment in the whole alignment space.

    :param alignment_info: Alignment to start the search from
    :type alignment_info: AlignmentInfo

    :param j_pegged: If specified, the search will be constrained to
        alignments where ``j_pegged`` remains unchanged
    :type j_pegged: int

    :return: The best alignment found from hill climbing, with its
        ``score`` attribute set to its Model 4 probability
    :rtype: AlignmentInfo
    """
    score_of = IBMModel4.model4_prob_t_a_given_s

    best = alignment_info
    best_probability = score_of(best, self)

    climbing = True
    while climbing:
        round_start = best
        # Neighbors are generated once per round, from the alignment
        # the round started with.
        for candidate in self.neighboring(round_start, j_pegged):
            candidate_probability = score_of(candidate, self)
            if candidate_probability > best_probability:
                best, best_probability = candidate, candidate_probability

        # Keep going only while a round still produces a change
        climbing = not (best == round_start)

    best.score = best_probability
    return best
def hillclimb(self, alignment_info, j_pegged=None):
    """
    Search the neighborhood of ``alignment_info`` repeatedly for a
    better alignment, judged by its Model 4 probability.

    Model 4 scoring is used instead of Model 5 because the latter is
    too expensive to compute. Hill climbing can stop at a local
    maximum, so the returned alignment is not guaranteed to be the
    best one in the alignment space.

    :param alignment_info: Alignment that seeds the search
    :type alignment_info: AlignmentInfo

    :param j_pegged: If specified, the search will be constrained to
        alignments where ``j_pegged`` remains unchanged
    :type j_pegged: int

    :return: The best alignment found from hill climbing; its
        ``score`` attribute holds the winning Model 4 probability
    :rtype: AlignmentInfo
    """
    # NOTE(review): a ``hillclimb`` with the same logic appears twice
    # in this part of the file -- confirm whether one copy can go.
    current = alignment_info
    current_score = IBMModel4.model4_prob_t_a_given_s(current, self)

    while True:
        previous = current

        for neighbor in self.neighboring(previous, j_pegged):
            neighbor_score = IBMModel4.model4_prob_t_a_given_s(
                neighbor, self)
            if not neighbor_score > current_score:
                continue
            current = neighbor
            current_score = neighbor_score

        if current == previous:
            # No neighbor was better: record the score and stop
            current.score = current_score
            return current
def prune(self, alignment_infos):
    """
    Keep only the alignments whose Model 4 score is strictly above
    ``IBMModel5.MIN_SCORE_FACTOR`` times the best score seen.

    :param alignment_infos: Alignments to filter
    :type alignment_infos: iterable(AlignmentInfo)

    :return: Pruned alignments
    :rtype: set(AlignmentInfo)
    """
    scored = [
        (candidate, IBMModel4.model4_prob_t_a_given_s(candidate, self))
        for candidate in alignment_infos
    ]

    # The best score starts at 0, so it never drops below zero
    top_score = 0
    for _, probability in scored:
        top_score = max(probability, top_score)

    cutoff = IBMModel5.MIN_SCORE_FACTOR * top_score
    return {
        candidate
        for candidate, probability in scored
        if probability > cutoff
    }
def __init__(self, sentence_aligned_corpus, iterations,
             source_word_classes, target_word_classes,
             probability_tables=None):
    """
    Train on ``sentence_aligned_corpus`` and create a lexical
    translation model, vacancy models, a fertility model, and a
    model for generating NULL-aligned words.

    Translation direction is from ``AlignedSent.mots`` to
    ``AlignedSent.words``.

    :param sentence_aligned_corpus: Sentence-aligned parallel corpus
    :type sentence_aligned_corpus: list(AlignedSent)

    :param iterations: Number of iterations to run training algorithm
    :type iterations: int

    :param source_word_classes: Lookup table that maps a source word
        to its word class, the latter represented by an integer id
    :type source_word_classes: dict[str]: int

    :param target_word_classes: Lookup table that maps a target word
        to its word class, the latter represented by an integer id
    :type target_word_classes: dict[str]: int

    :param probability_tables: Optional. Use this to pass in custom
        probability values. If not specified, an ``IBMModel4`` is
        trained with the same corpus, iteration count and word
        classes, its tables are copied over, and
        ``set_uniform_distortion_probabilities`` is then called.
        If specified, all the following entries must be present:
        ``translation_table``, ``alignment_table``,
        ``fertility_table``, ``p1``, ``head_distortion_table``,
        ``non_head_distortion_table``, ``head_vacancy_table``,
        ``non_head_vacancy_table``.
        See ``IBMModel``, ``IBMModel4``, and ``IBMModel5`` for the type
        and purpose of these tables.
    :type probability_tables: dict[str]: object
    """
    super(IBMModel5, self).__init__(sentence_aligned_corpus)
    self.reset_probabilities()
    self.src_classes = source_word_classes
    self.trg_classes = target_word_classes

    # Tables that Model 5 shares with Model 4, in assignment order
    model4_table_names = (
        'translation_table',
        'alignment_table',
        'fertility_table',
        'p1',
        'head_distortion_table',
        'non_head_distortion_table',
    )
    # Tables that exist only in Model 5
    vacancy_table_names = (
        'head_vacancy_table',
        'non_head_vacancy_table',
    )

    if probability_tables is not None:
        # Set user-defined probabilities
        for table_name in model4_table_names + vacancy_table_names:
            setattr(self, table_name, probability_tables[table_name])
    else:
        # Get probabilities from IBM model 4
        ibm4 = IBMModel4(sentence_aligned_corpus, iterations,
                         source_word_classes, target_word_classes)
        for table_name in model4_table_names:
            setattr(self, table_name, getattr(ibm4, table_name))
        self.set_uniform_distortion_probabilities(sentence_aligned_corpus)

    for _ in range(iterations):
        self.train(sentence_aligned_corpus)