Beispiel #1
0
    def set_uniform_distortion_probabilities(self, sentence_aligned_corpus):
        """
        Initialise both distortion tables with one uniform probability.

        Every non-zero displacement that can occur within the longest
        target sentence of ``sentence_aligned_corpus`` receives an entry
        in ``self.head_distortion_table`` and
        ``self.non_head_distortion_table``. Each entry falls back to the
        same default value: 1 / (number of displacement values).
        """
        longest = longest_target_sentence_length(sentence_aligned_corpus)

        # Displacements run from -(m-1) to (m-1) with zero excluded,
        # which gives 2 * (m-1) distinct values. When the longest
        # sentence has at most one word that count is zero, so use the
        # minimum probability instead of dividing by zero.
        if longest > 1:
            uniform_prob = float(1) / (2 * (longest - 1))
        else:
            uniform_prob = IBMModel.MIN_PROB

        if uniform_prob < IBMModel.MIN_PROB:
            message = ("A target sentence is too long (" + str(longest) +
                       " words). Results may be less accurate.")
            warnings.warn(message)

        # Register the positive and the negative displacement of each
        # magnitude. Head entries default through two further key
        # levels, non-head entries through one.
        for magnitude in range(1, longest):
            for displacement in (magnitude, -magnitude):
                self.head_distortion_table[displacement] = defaultdict(
                    lambda: defaultdict(lambda: uniform_prob))
                self.non_head_distortion_table[displacement] = defaultdict(
                    lambda: uniform_prob)
Beispiel #2
0
    def set_uniform_distortion_probabilities(self, sentence_aligned_corpus):
        """
        Set vacancy probabilities uniformly to
        1 / cardinality of vacancy difference values

        For each ``max_v`` from 1 up to the length of the longest target
        sentence, entries of ``self.head_vacancy_table`` and
        ``self.non_head_vacancy_table`` are replaced with mappings whose
        default value is ``1 / (2 * max_v)``.

        :param sentence_aligned_corpus: corpus handed to
            ``longest_target_sentence_length`` to determine the longest
            target sentence
        """
        max_m = longest_target_sentence_length(sentence_aligned_corpus)

        # The maximum vacancy difference occurs when a word is placed in
        # the last available position m of the target sentence and the
        # previous word position has no vacancies.
        # The minimum is 1-max_v, when a word is placed in the first
        # available position and the previous word is placed beyond the
        # last available position.
        # Thus, the number of possible vacancy difference values is
        # (max_v) - (1-max_v) + 1 = 2 * max_v.
        if max_m > 0 and (float(1) / (2 * max_m)) < IBMModel.MIN_PROB:
            warnings.warn("A target sentence is too long (" + str(max_m) + " words). Results may be less accurate.")

        for max_v in range(1, max_m + 1):
            # The probability depends only on max_v, so compute it once
            # per outer iteration. float(1) keeps this a true division,
            # matching the check above.
            initial_prob = float(1) / (2 * max_v)

            # Freeze the current value as a default argument. A plain
            # ``lambda: initial_prob`` would look the variable up when
            # called, so every table entry would report the value from
            # the final loop iteration.
            def uniform_prob(prob=initial_prob):
                return prob

            for dv in range(1, max_m + 1):
                self.head_vacancy_table[dv][max_v] = defaultdict(uniform_prob)
                self.head_vacancy_table[-(dv - 1)][max_v] = defaultdict(uniform_prob)
                self.non_head_vacancy_table[dv][max_v] = defaultdict(uniform_prob)
                self.non_head_vacancy_table[-(dv - 1)][max_v] = defaultdict(uniform_prob)
    def set_uniform_distortion_probabilities(self, sentence_aligned_corpus):
        """
        Set vacancy probabilities uniformly to
        1 / cardinality of vacancy difference values

        Entries of ``self.head_vacancy_table`` and
        ``self.non_head_vacancy_table`` are replaced, for every
        ``max_v`` from 1 to the longest target sentence length, with
        mappings that default to ``1 / (2 * max_v)``.

        :param sentence_aligned_corpus: corpus handed to
            ``longest_target_sentence_length`` to determine the longest
            target sentence
        """
        max_m = longest_target_sentence_length(sentence_aligned_corpus)

        # The maximum vacancy difference occurs when a word is placed in
        # the last available position m of the target sentence and the
        # previous word position has no vacancies.
        # The minimum is 1-max_v, when a word is placed in the first
        # available position and the previous word is placed beyond the
        # last available position.
        # Thus, the number of possible vacancy difference values is
        # (max_v) - (1-max_v) + 1 = 2 * max_v.
        if max_m > 0 and (float(1) / (2 * max_m)) < IBMModel.MIN_PROB:
            warnings.warn("A target sentence is too long (" + str(max_m) +
                          " words). Results may be less accurate.")

        for max_v in range(1, max_m + 1):
            # Invariant across dv, so computed once per max_v. float(1)
            # keeps this a true division, matching the check above.
            initial_prob = float(1) / (2 * max_v)
            for dv in range(1, max_m + 1):
                # ``prob=initial_prob`` captures the value now. A bare
                # ``lambda: initial_prob`` reads the variable when it is
                # called, so all entries would share the probability of
                # the last max_v.
                self.head_vacancy_table[dv][max_v] = defaultdict(
                    lambda prob=initial_prob: prob)
                self.head_vacancy_table[-(dv-1)][max_v] = defaultdict(
                    lambda prob=initial_prob: prob)
                self.non_head_vacancy_table[dv][max_v] = defaultdict(
                    lambda prob=initial_prob: prob)
                self.non_head_vacancy_table[-(dv-1)][max_v] = defaultdict(
                    lambda prob=initial_prob: prob)
    def set_uniform_distortion_probabilities(self, sentence_aligned_corpus):
        """
        Fill the head and non-head distortion tables uniformly.

        The default probability of every table entry is
        1 / (number of possible displacement values), where the
        displacement values are derived from the longest target sentence
        in ``sentence_aligned_corpus``.
        """
        sentence_length = longest_target_sentence_length(
            sentence_aligned_corpus)

        # A word in the last position m placed after a word in the first
        # position gives the largest displacement, m-1; the mirror case
        # gives -(m-1). Zero is impossible, leaving 2 * (m-1) values.
        # With at most one word there are no displacement values at
        # all, hence the minimum-probability fallback.
        displacement_count = 2 * (sentence_length - 1)
        probability = (IBMModel.MIN_PROB if sentence_length <= 1
                       else float(1) / displacement_count)

        if probability < IBMModel.MIN_PROB:
            warnings.warn(
                "A target sentence is too long (" + str(sentence_length) +
                " words). Results may be less accurate.")

        def flat_default():
            # Default for the innermost level of either table.
            return probability

        def nested_default():
            # The head table has one more key level than the non-head one.
            return defaultdict(flat_default)

        for distance in range(1, sentence_length):
            for sign in (1, -1):
                key = sign * distance
                self.head_distortion_table[key] = defaultdict(nested_default)
                self.non_head_distortion_table[key] = defaultdict(
                    flat_default)