def set_uniform_distortion_probabilities(self, sentence_aligned_corpus):
    """
    Initialise every distortion probability to the same value,
    1 / (number of possible displacement values).

    The longest target sentence length ``max_m`` is obtained from
    ``longest_target_sentence_length(sentence_aligned_corpus)``. For
    each displacement ``dj`` in ``1 .. max_m - 1`` and its negation,
    ``self.head_distortion_table[dj]`` is replaced by a two-level
    ``defaultdict`` and ``self.non_head_distortion_table[dj]`` by a
    one-level ``defaultdict``; missing keys in either resolve to the
    uniform probability.

    A warning is issued when the uniform probability falls below
    ``IBMModel.MIN_PROB``.

    :param sentence_aligned_corpus: corpus handed to
        ``longest_target_sentence_length`` to find the longest target
        sentence
    """
    max_m = longest_target_sentence_length(sentence_aligned_corpus)

    # Displacements span -(m-1) .. (m-1): the extreme occurs when a word
    # sits in the last target position m and the previously placed word
    # sits in the first position (or the reverse). Zero is not a valid
    # displacement, which leaves 2 * (m-1) possible values.
    # With at most one target word there is no displacement at all, so
    # fall back to the minimum probability instead of dividing by zero.
    uniform_prob = (
        IBMModel.MIN_PROB if max_m <= 1 else float(1) / (2 * (max_m - 1))
    )

    if uniform_prob < IBMModel.MIN_PROB:
        warnings.warn("A target sentence is too long (" + str(max_m) +
                      " words). Results may be less accurate.")

    def new_head_entry():
        # Two nested levels of lookup, both defaulting to the uniform value.
        return defaultdict(lambda: defaultdict(lambda: uniform_prob))

    def new_non_head_entry():
        # A single level of lookup defaulting to the uniform value.
        return defaultdict(lambda: uniform_prob)

    tables = (
        (self.head_distortion_table, new_head_entry),
        (self.non_head_distortion_table, new_non_head_entry),
    )
    for magnitude in range(1, max_m):
        for table, make_entry in tables:
            for displacement in (magnitude, -magnitude):
                table[displacement] = make_entry()
def set_uniform_distortion_probabilities(self, sentence_aligned_corpus):
    """
    Set vacancy probabilities uniformly to
    1 / cardinality of vacancy difference values

    The longest target sentence length ``max_m`` is obtained from
    ``longest_target_sentence_length(sentence_aligned_corpus)``. For
    every ``max_v`` in ``1 .. max_m`` and every ``dv`` in ``1 .. max_m``,
    the entries ``[dv][max_v]`` and ``[-(dv - 1)][max_v]`` of both
    ``self.head_vacancy_table`` and ``self.non_head_vacancy_table`` are
    replaced by a ``defaultdict`` whose missing keys resolve to
    ``1 / (2 * max_v)``.

    A warning is issued when ``1 / (2 * max_m)`` falls below
    ``IBMModel.MIN_PROB``.

    :param sentence_aligned_corpus: corpus handed to
        ``longest_target_sentence_length`` to find the longest target
        sentence
    """
    max_m = longest_target_sentence_length(sentence_aligned_corpus)

    # The maximum vacancy difference occurs when a word is placed in
    # the last available position m of the target sentence and the
    # previous word position has no vacancies.
    # The minimum is 1-max_v, when a word is placed in the first
    # available position and the previous word is placed beyond the
    # last available position.
    # Thus, the number of possible vacancy difference values is
    # (max_v) - (1-max_v) + 1 = 2 * max_v.
    if max_m > 0 and (1.0 / (2 * max_m)) < IBMModel.MIN_PROB:
        warnings.warn("A target sentence is too long (" + str(max_m) +
                      " words). Results may be less accurate.")

    vacancy_tables = (self.head_vacancy_table, self.non_head_vacancy_table)
    for max_v in range(1, max_m + 1):
        # Depends only on max_v, so compute it once per outer iteration.
        # The float literal guarantees true division even where ``/`` on
        # two ints would truncate to 0.
        initial_prob = 1.0 / (2 * max_v)
        for dv in range(1, max_m + 1):
            for table in vacancy_tables:
                for vacancy_diff in (dv, -(dv - 1)):
                    # Bind the current probability as a default argument.
                    # A bare ``lambda: initial_prob`` would look the
                    # variable up when called, so every table would end
                    # up returning the value from the last iteration.
                    table[vacancy_diff][max_v] = defaultdict(
                        lambda prob=initial_prob: prob)
def set_uniform_distortion_probabilities(self, sentence_aligned_corpus):
    """
    Set vacancy probabilities uniformly to
    1 / cardinality of vacancy difference values

    The longest target sentence length ``max_m`` is obtained from
    ``longest_target_sentence_length(sentence_aligned_corpus)``. For
    every ``max_v`` in ``1 .. max_m`` and every ``dv`` in ``1 .. max_m``,
    the entries ``[dv][max_v]`` and ``[-(dv - 1)][max_v]`` of both
    ``self.head_vacancy_table`` and ``self.non_head_vacancy_table`` are
    replaced by a ``defaultdict`` whose missing keys resolve to
    ``1 / (2 * max_v)``.

    A warning is issued when ``1 / (2 * max_m)`` falls below
    ``IBMModel.MIN_PROB``.

    :param sentence_aligned_corpus: corpus handed to
        ``longest_target_sentence_length`` to find the longest target
        sentence
    """
    max_m = longest_target_sentence_length(sentence_aligned_corpus)

    # The maximum vacancy difference occurs when a word is placed in
    # the last available position m of the target sentence and the
    # previous word position has no vacancies.
    # The minimum is 1-max_v, when a word is placed in the first
    # available position and the previous word is placed beyond the
    # last available position.
    # Thus, the number of possible vacancy difference values is
    # (max_v) - (1-max_v) + 1 = 2 * max_v.
    if max_m > 0 and (1.0 / (2 * max_m)) < IBMModel.MIN_PROB:
        warnings.warn("A target sentence is too long (" + str(max_m) +
                      " words). Results may be less accurate.")

    for max_v in range(1, max_m + 1):
        # Loop-invariant with respect to dv; the float literal keeps the
        # division from truncating to 0 when ``/`` is integer division.
        initial_prob = 1.0 / (2 * max_v)

        def uniform_table(prob=initial_prob):
            # ``prob`` is fixed when this helper is defined, so each
            # defaultdict keeps the probability for its own max_v. A
            # closure over ``initial_prob`` itself would see only the
            # value assigned in the final iteration.
            return defaultdict(lambda: prob)

        for dv in range(1, max_m + 1):
            self.head_vacancy_table[dv][max_v] = uniform_table()
            self.head_vacancy_table[-(dv - 1)][max_v] = uniform_table()
            self.non_head_vacancy_table[dv][max_v] = uniform_table()
            self.non_head_vacancy_table[-(dv - 1)][max_v] = uniform_table()