Exemple #1
0
    def test_set_uniform_distortion_probabilities_of_non_domain_values(self):
        # Lookups that the assertions below treat as lying outside the
        # training data domain must yield IBMModel.MIN_PROB in both the
        # head and the non-head vacancy table.

        # arrange
        source_word_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
        target_word_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
        sentence_pairs = [
            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
            AlignedSent(['spam'] * 4, ['spam'] * 2),
        ]
        model = IBMModel5(sentence_pairs, 0,
                          source_word_classes, target_word_classes)

        # act
        model.set_uniform_distortion_probabilities(sentence_pairs)

        # assert
        # each entry is (table, (dv, max_v, trg_class))
        head_table = model.head_vacancy_table
        non_head_table = model.non_head_vacancy_table
        out_of_domain_lookups = [
            (head_table, (5, 4, 0)),
            (head_table, (-4, 1, 2)),
            (head_table, (4, 0, 0)),
            (non_head_table, (5, 4, 0)),
            (non_head_table, (-4, 1, 2)),
        ]
        for table, (dv, max_v, trg_class) in out_of_domain_lookups:
            self.assertEqual(table[dv][max_v][trg_class], IBMModel.MIN_PROB)
    def test_vocabularies_are_initialized(self):
        # Builds a base model from three sentence pairs (the last one has an
        # empty first word list) and checks the sizes of both vocabularies.
        first_pair = AlignedSent(['one', 'two', 'three', 'four'],
                                 ['un', 'deux', 'trois'])
        second_pair = AlignedSent(['five', 'one', 'six'],
                                  ['quatre', 'cinq', 'six'])
        third_pair = AlignedSent([], ['sept'])

        model = IBMModel([first_pair, second_pair, third_pair])

        source_vocab_size = len(model.src_vocab)
        self.assertEqual(source_vocab_size, 8)
        target_vocab_size = len(model.trg_vocab)
        self.assertEqual(target_vocab_size, 6)
Exemple #3
0
    def align(self, sentence_pair):
        """
        Determines the best word alignment for one sentence pair from
        the corpus that the model was trained on.

        The original sentence pair is not modified. Results are
        undefined if ``sentence_pair`` is not in the training set.

        Every target word is linked to the source word with the highest
        entry in ``self.translation_table``; target words whose best
        candidate is the NULL token (key ``None``) are left unaligned.
        On equal probabilities the later candidate wins, so a real source
        word beats NULL and a higher source index beats a lower one.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent

        :return: ``AlignedSent`` filled in with the best word alignment
        :rtype: AlignedSent

        :raises ValueError: if ``self.translation_table`` is ``None``
        """

        if self.translation_table is None:
            raise ValueError("The model has not been trained.")

        alignment = []

        for j, trg_word in enumerate(sentence_pair.words):
            # Initialize trg_word to align with the NULL token
            best_prob = self.translation_table[trg_word][None]
            best_index = None
            for i, src_word in enumerate(sentence_pair.mots):
                align_prob = self.translation_table[trg_word][src_word]
                # Compare the probabilities only: ordering (prob, None)
                # against (prob, i) tuples raises TypeError on Python 3
                # whenever the probabilities are equal.
                if align_prob >= best_prob:
                    best_prob = align_prob
                    best_index = i

            # If trg_word is not aligned to the NULL token,
            # add it to the viterbi_alignment.
            if best_index is not None:
                alignment.append((j, best_index))

        return AlignedSent(sentence_pair.words, sentence_pair.mots, alignment)
Exemple #4
0
    def align(self, align_sent):
        """
        Returns the alignment result for one sentence pair.

        Each word of ``align_sent.words`` is linked to the word of
        ``align_sent.mots`` with the highest entry in
        ``self.probabilities``. A word whose best candidate is the NULL
        token (key ``None``) is left out of the alignment. On equal
        probabilities the later candidate wins, so a real word beats NULL
        and a higher ``mots`` index beats a lower one.

        Raises ``ValueError`` if ``self.probabilities`` is ``None``.
        """

        if self.probabilities is None:
            raise ValueError("The model does not train.")

        alignment = []

        for j, en_word in enumerate(align_sent.words):

            # Start from the probability of aligning with the NULL token
            best_prob = self.probabilities[en_word][None]
            best_index = None
            for i, fr_word in enumerate(align_sent.mots):
                candidate_prob = self.probabilities[en_word][fr_word]
                # Plain float comparison: ordering (prob, None) against
                # (prob, i) tuples raises TypeError on Python 3 on ties.
                if candidate_prob >= best_prob:
                    best_prob = candidate_prob
                    best_index = i

            # Only record the link when the winner is not the NULL token
            if best_index is not None:
                alignment.append((j, best_index))

        return AlignedSent(align_sent.words, align_sent.mots, alignment)
Exemple #5
0
    def test_prune(self):
        # Replaces IBMModel4.model4_prob_t_a_given_s with a stub that looks
        # scores up by alignment tuple, then expects prune() to keep three of
        # the five candidate alignments.

        # arrange
        alignment_infos = [
            AlignmentInfo((1, 1), None, None, None),
            AlignmentInfo((1, 2), None, None, None),
            AlignmentInfo((2, 1), None, None, None),
            AlignmentInfo((2, 2), None, None, None),
            AlignmentInfo((0, 0), None, None, None)
        ]
        min_factor = IBMModel5.MIN_SCORE_FACTOR
        best_score = 0.9
        scores = {
            (1, 1): min(min_factor * 1.5, 1) * best_score,  # above threshold
            (1, 2): best_score,
            (2, 1): min_factor * best_score,        # at threshold
            (2, 2): min_factor * best_score * 0.5,  # low score
            (0, 0): min(min_factor * 1.1, 1) * 1.2  # above threshold
        }
        corpus = [AlignedSent(['a'], ['b'])]
        original_prob_function = IBMModel4.model4_prob_t_a_given_s
        # mock static method
        IBMModel4.model4_prob_t_a_given_s = staticmethod(
            lambda a, model: scores[a.alignment])
        try:
            model5 = IBMModel5(corpus, 0, None, None)

            # act
            pruned_alignments = model5.prune(alignment_infos)

            # assert
            self.assertEqual(len(pruned_alignments), 3)
        finally:
            # restore static method even when the test fails, so the stub
            # cannot leak into other tests
            IBMModel4.model4_prob_t_a_given_s = original_prob_function
Exemple #6
0
    def align(self, align_sent):
        """
        Returns the alignment result for one sentence pair.

        For every word of ``align_sent.words`` the score
        ``self.t[candidate][word] * self.q[i + 1][j + 1][l_e][l_f]`` is
        computed for each candidate in ``align_sent.mots`` and compared
        with the NULL score (``self.t[None][word]``, position 0). Words
        whose best candidate is NULL are left unaligned. On equal scores
        the later candidate wins.

        Raises ``ValueError`` if ``self.t`` or ``self.q`` is ``None``.
        """
        if self.t is None or self.q is None:
            raise ValueError("The model does not train.")
        alignment = []

        l_e = len(align_sent.mots)
        l_f = len(align_sent.words)

        for j, fr_word in enumerate(align_sent.words):

            # Start from the score of aligning with the NULL token
            best_prob = self.t[None][fr_word] * self.q[0][j + 1][l_e][l_f]
            best_index = None
            for i, en_word in enumerate(align_sent.mots):
                candidate_prob = (self.t[en_word][fr_word] *
                                  self.q[i + 1][j + 1][l_e][l_f])
                # Plain float comparison: ordering (prob, None) against
                # (prob, i) tuples raises TypeError on Python 3 on ties.
                if candidate_prob >= best_prob:
                    best_prob = candidate_prob
                    best_index = i

            # Only record the link when the winner is not the NULL token
            if best_index is not None:
                alignment.append((j, best_index))

        return AlignedSent(align_sent.words, align_sent.mots, alignment)
    def test_prob_t_a_given_s(self):
        # With every aligned word pair given a translation probability of
        # 0.98, the Model 1 probability is expected to equal the product of
        # the six lexical probabilities (compared to four decimal places).

        # arrange
        german = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
        english = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
        training_corpus = [AlignedSent(english, german)]
        a_info = AlignmentInfo((0, 1, 4, 0, 2, 5, 5),
                               [None] + german,
                               ['UNUSED'] + english, None)

        # (target word, source word) pairs that receive probability 0.98
        aligned_word_pairs = [
            ('i', 'ich'),
            ('love', 'gern'),
            ('to', None),
            ('eat', 'esse'),
            ('smoked', 'räucherschinken'),
            ('ham', 'räucherschinken'),
        ]
        lexical_table = defaultdict(lambda: defaultdict(float))
        for trg_word, src_word in aligned_word_pairs:
            lexical_table[trg_word][src_word] = 0.98

        model = IBMModel1(training_corpus, 0)
        model.translation_table = lexical_table

        # act
        actual = model.prob_t_a_given_s(a_info)

        # assert
        expected = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
        self.assertEqual(round(actual, 4), round(expected, 4))
    def aligned(self):
        """
        Aligns every sentence pair stored in ``self.aligned_sents``.

        For each position ``j`` of ``words`` the entries
        ``self.alignments[i, j, le, lf]`` are compared over all positions
        ``i`` of ``mots``; the starting candidate is
        ``self.alignments[lf, j, le, lf]`` with no link.  Positions whose
        best candidate is that starting entry are left unaligned. On equal
        values the later candidate wins.

        NOTE(review): index ``lf`` appears to stand for the NULL token in
        ``self.alignments`` -- confirm against the training code.

        Returns a list of new ``AlignedSent`` objects, one per input pair.
        Raises ``ValueError`` if ``self.probabilities`` is ``None``.
        """
        if self.probabilities is None:
            raise ValueError("No probabilities calculated")

        aligned = []
        for aligned_sent in self.aligned_sents:
            alignment = []
            le = len(aligned_sent.words)
            lf = len(aligned_sent.mots)

            for j in range(le):
                best_score = self.alignments[lf, j, le, lf]
                best_index = None
                for i in range(lf):
                    score = self.alignments[i, j, le, lf]
                    # Plain comparison of the scores: ordering
                    # (score, None) against (score, i) tuples raises
                    # TypeError on Python 3 whenever the scores tie.
                    if score >= best_score:
                        best_score = score
                        best_index = i

                # only output alignment with non-NULL mapping
                if best_index is not None:
                    alignment.append((j, best_index))

            # substitute the alignment of AlignedSent with the yielded one
            aligned.append(
                AlignedSent(aligned_sent.words, aligned_sent.mots, alignment))

        return aligned
Exemple #9
0
def replaceByNumber(bitexts):
    """
    Map every word of the given sentence pairs to a positive integer.

    Words are numbered from one in order of first appearance, separately
    for the ``words`` side and the ``mots`` side.

    >>> en_dict, fr_dict, bitexts = replaceByNumber(comtrans.aligned_sents()[:100])
    >>> en_dict['Kommission']
    474
    >>> fr_dict['check']
    331
    >>> bitexts[0]
    AlignedSent([1, 2, 3], [1, 2, 3, 4], Alignment([(0, 0), (1, 1), (1, 2), (2, 3)]))

    Arguments:
    bitexts   -- A list of AlignedSent instances holding the sentence
                 pairs.

    Returns:
    en_dict         -- Dictionary from each word seen in ``words`` to its
                       integer; unseen keys default to zero.
    fr_dict         -- Dictionary from each word seen in ``mots`` to its
                       integer; unseen keys default to zero.
    new_bitexts     -- A list of AlignedSent instances in which every word
                       is replaced by its integer; each keeps the
                       alignment of the pair it was built from.
    """
    # Zero marks "not numbered yet"; real identifiers start at one.
    en_dict = defaultdict(lambda: 0)
    fr_dict = defaultdict(lambda: 0)

    def encode(tokens, table):
        # Return the identifiers of tokens, numbering new tokens on the fly.
        encoded = []
        for token in tokens:
            if table[token] == 0:
                # The lookup above inserted the token, so the table size is
                # exactly the next unused identifier.
                table[token] = len(table)
            encoded.append(table[token])
        return encoded

    new_bitexts = []
    for pair in bitexts:
        word_ids = encode(pair.words, en_dict)
        mot_ids = encode(pair.mots, fr_dict)
        new_bitexts.append(AlignedSent(word_ids, mot_ids, pair.alignment))

    return en_dict, fr_dict, new_bitexts
    def test_best_model2_alignment_handles_fertile_words(self):
        # A single source word may be the best match for several target
        # words: 'bien' is expected to produce both occurrences of 'really'.

        # arrange
        sentence_pair = AlignedSent(
            ['i', 'really', ',', 'really', 'love', 'ham'],
            TestIBMModel.__TEST_SRC_SENTENCE)
        # Each row holds the probabilities of one target word, listed in the
        # order of the source words in src_columns.
        src_columns = ("j'", 'aime', 'bien', 'jambon', None)
        rows = [
            ('i', (0.9, 0.05, 0.02, 0.03, 0)),
            ('really', (0, 0, 0.9, 0.01, 0.09)),
            (',', (0, 0, 0.3, 0, 0.7)),
            ('love', (0.05, 0.9, 0.01, 0.01, 0.03)),
            ('ham', (0, 0.01, 0, 0.99, 0)),
        ]
        translation_table = dict(
            (trg_word, dict(zip(src_columns, probs)))
            for trg_word, probs in rows)
        # Four index levels, every leaf defaulting to 0.2
        alignment_table = defaultdict(
            lambda: defaultdict(
                lambda: defaultdict(
                    lambda: defaultdict(lambda: 0.2))))

        model = IBMModel([])
        model.translation_table = translation_table
        model.alignment_table = alignment_table

        # act
        result = model.best_model2_alignment(sentence_pair)

        # assert
        expected_alignment = (1, 3, 0, 3, 2, 4)
        expected_cepts = [[3], [1], [5], [2, 4], [6]]
        self.assertEqual(result.alignment[1:], expected_alignment)
        self.assertEqual(result.cepts, expected_cepts)
Exemple #11
0
    def learn_aligned_text(self, fr_lines, en_lines):
        '''Learn translation and fertility tables from parallel lines.

        ``fr_lines`` and ``en_lines`` are walked in lockstep.  Each pair is
        run through ``preprocess``; pairs where either side comes back
        empty, or where either side starts with an empty string, are
        skipped.  Every remaining pair is passed (unprocessed, second line
        first) to ``self.learn_aligned_sentence`` and collected for
        training.

        ``self.translation_table`` is taken from an ``IBMModel3`` trained on
        the collected pairs, and ``self.fertility_table`` from a second
        ``IBMModel3`` trained on the same pairs with the two sides swapped.
        '''
        bitext = []
        bitext_flip = []
        for fr_line, en_line in zip(fr_lines, en_lines):
            pp0 = preprocess(fr_line)
            pp1 = preprocess(en_line)
            if len(pp0) == 0 or len(pp1) == 0:
                continue
            # Check the first element of BOTH sides.  The previous test,
            # ``pp0[1] == ''``, raised IndexError for one-element results
            # and never looked at pp1.
            if pp0[0] == '' or pp1[0] == '':
                continue
            bitext.append(AlignedSent(pp0, pp1))
            bitext_flip.append(AlignedSent(pp1, pp0))
            self.learn_aligned_sentence(en_line, fr_line)
        ibm = IBMModel3(bitext, 5)
        self.translation_table = ibm.translation_table

        ibm_flip = IBMModel3(bitext_flip, 5)
        self.fertility_table = ibm_flip.fertility_table
    def test_best_model2_alignment_handles_empty_trg_sentence(self):
        # An empty target sentence must give an empty alignment and one
        # empty cept per entry of the expected list below.

        # arrange
        empty_target_pair = AlignedSent([], TestIBMModel.__TEST_SRC_SENTENCE)
        model = IBMModel([])

        # act
        result = model.best_model2_alignment(empty_target_pair)

        # assert
        alignment_without_placeholder = result.alignment[1:]
        self.assertEqual(alignment_without_placeholder, ())
        self.assertEqual(result.cepts, [[], [], [], [], []])
    def test_best_model2_alignment_handles_empty_src_sentence(self):
        # With an empty source sentence every target word is expected to be
        # aligned to position 0, which then holds all three target positions.

        # arrange
        empty_source_pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE, [])
        model = IBMModel([])

        # act
        result = model.best_model2_alignment(empty_source_pair)

        # assert
        alignment_without_placeholder = result.alignment[1:]
        self.assertEqual(alignment_without_placeholder, (0, 0, 0))
        self.assertEqual(result.cepts, [[1, 2, 3]])
    def test_sample(self):
        # With a constant alignment probability, sampling the test sentence
        # pair is expected to produce 61 samples.

        # arrange
        pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE,
                           TestIBMModel.__TEST_SRC_SENTENCE)
        model = IBMModel([])
        # stub: every alignment gets the same probability
        model.prob_t_a_given_s = lambda x: 0.001

        # act
        sampled_alignments, _best = model.sample(pair)

        # assert
        sample_count = len(sampled_alignments)
        self.assertEqual(sample_count, 61)
Exemple #15
0
    def read_block(self, stream):
        """
        Read one aligned-sentence block from ``stream`` and tokenize it.

        Every string produced by ``self._alignedsent_block_reader`` is split
        with the sentence tokenizer, and each resulting piece with the word
        tokenizer.  What is returned depends on the reader's flags:

        - ``self._aligned``: a one-element list holding an ``AlignedSent``
          built from the tokenized pieces, with the third piece joined back
          into a single space-separated string;
        - ``self._group_by_sent``: a one-element list holding the first
          tokenized piece;
        - otherwise: the first tokenized piece itself.
        """
        pieces = []
        for alignedsent_str in self._alignedsent_block_reader(stream):
            for sent_str in self._sent_tokenizer.tokenize(alignedsent_str):
                pieces.append(self._word_tokenizer.tokenize(sent_str))

        if self._aligned:
            # kludge; we shouldn't have tokenized the alignment string
            pieces[2] = " ".join(pieces[2])
            return [AlignedSent(*pieces)]
        if self._group_by_sent:
            return [pieces[0]]
        return pieces[0]
Exemple #16
0
    def align(self, aligned_sent):
        """
        Return a copy of ``aligned_sent`` carrying the best word alignment.

        For each word of ``aligned_sent.words`` (position ``i``) the score
        ``self.t[word][candidate] * self.q[j + 1][i + 1][l][m]`` is computed
        for every candidate in ``aligned_sent.mots`` and compared with the
        NULL score (key ``None``, position 0).  A candidate must score
        strictly higher to replace the current best, so NULL wins ties and
        such words stay unaligned.

        The input object is not modified; a new ``AlignedSent`` is built
        and its ``alignment`` set to an ``Alignment`` of ``(i, j)`` pairs.
        """
        best_alignment = []
        out_sent = AlignedSent(aligned_sent.words, aligned_sent.mots)
        l = len(aligned_sent.mots)
        m = len(aligned_sent.words)

        for i, t_word in enumerate(aligned_sent.words):
            # Start from the score of aligning with the NULL token
            best_prob = (self.t[t_word][None] * self.q[0][i + 1][l][m])
            best_alig = None

            for j, s_word in enumerate(aligned_sent.mots):
                alig_prob = (self.t[t_word][s_word] * self.q[j + 1][i + 1][l][m])
                if alig_prob > best_prob:
                    best_prob = alig_prob
                    best_alig = j

            # Identity test is the idiomatic (and safe) way to check for None
            if best_alig is not None:
                best_alignment.append((i, best_alig))

        out_sent.alignment = Alignment(best_alignment)
        return out_sent
Exemple #17
0
def GetAlignment(derivation, tree1, tree2, cost_threshold = 0.3):
  """
  Build a word-to-word alignment from a derivation over two trees.

  Only productions whose state is 'dist_sim' contribute, and only when the
  rule weight divided by the number of source plus target leaves of the
  rule does not exceed ``cost_threshold``. Each source leaf of such a
  production is aligned to each of its target leaves.

  Returns an AlignedSent over the leaves of tree1 and tree2; the alignment
  is passed as a space-separated string of 'src-trg' leaf index pairs.
  """
  source_words = tree1.leaves()
  target_words = tree2.leaves()
  targets_by_source = defaultdict(list)
  source_paths = sorted(tree1.treepositions('leaves'))
  target_paths = sorted(tree2.treepositions('leaves'))
  assert len(source_words) == len(source_paths), \
    'Num. source leaves and leave paths mismatch: {0} vs. {1}'\
    .format(source_words, source_paths)
  assert len(target_words) == len(target_paths), \
    'Num. target leaves and leave paths mismatch: {0} vs. {1}'\
    .format(target_words, target_paths)
  # Leaf path -> position of that leaf in the sentence.
  source_index_of = dict(
    (path, position) for position, path in enumerate(source_paths))
  target_index_of = dict(
    (path, position) for position, path in enumerate(target_paths))
  # Collect the alignments contributed by each production in turn.
  for production in derivation:
    state, source_root, target_root = production.non_terminal
    if state != 'dist_sim':
      continue
    lhs = production.rhs.rule.lhs
    rhs = production.rhs.rule.rhs
    # Leaf paths inside the rule are relative; prefix them with the path of
    # the node the production applies to.
    source_leaves = [source_root + leaf for leaf in GetLeavePositions(lhs)]
    target_leaves = [target_root + leaf for leaf in GetLeavePositions(rhs)]
    cost = production.rhs.rule.weight
    leaf_count = len(source_leaves) + len(target_leaves)
    if (cost / float(leaf_count)) > cost_threshold:
      continue
    for source_leaf in source_leaves:
      source_position = source_index_of[source_leaf]
      for target_leaf in target_leaves:
        target_position = target_index_of[target_leaf]
        targets_by_source[source_position].append(target_position)
  # Flatten to pairs, e.g. [(0, 0), (0, 1), (1, 2), (2, 3), ...]
  pairs = [(source_position, target_position)
           for source_position, positions in targets_by_source.items()
           for target_position in positions]
  alignment_string = ' '.join(
    '{0}-{1}'.format(source_position, target_position)
    for source_position, target_position in pairs)
  return AlignedSent(source_words, target_words, alignment_string)
Exemple #18
0
    def test_set_uniform_distortion_probabilities_of_max_displacements(self):
        # Boundary (dv, max_v, trg_class) lookups in both vacancy tables are
        # expected to carry the uniform probability.

        # arrange
        source_word_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
        target_word_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
        sentence_pairs = [
            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
            AlignedSent(['spam'] * 4, ['spam'] * 2),
        ]
        model = IBMModel5(sentence_pairs, 0,
                          source_word_classes, target_word_classes)

        # act
        model.set_uniform_distortion_probabilities(sentence_pairs)

        # assert
        # number of vacancy difference values =
        #     2 * number of words in longest target sentence
        uniform_prob = 1.0 / (2 * 4)

        boundary_lookups = [(4, 4, 0), (-3, 1, 2)]
        for dv, max_v, trg_class in boundary_lookups:
            self.assertEqual(
                model.head_vacancy_table[dv][max_v][trg_class], uniform_prob)
        for dv, max_v, trg_class in boundary_lookups:
            self.assertEqual(
                model.non_head_vacancy_table[dv][max_v][trg_class],
                uniform_prob)
Exemple #19
0
    def alignSents(self, alignSents):
        """
        Align a sequence of sentence pairs.

        Each input pair is copied into a fresh ``AlignedSent`` (words and
        mots only) and passed to ``self.align``; the results are returned
        as a list in input order.

        Raises ``ValueError`` if ``self.probabilities`` or
        ``self.alignments`` is ``None``.
        """
        if self.probabilities is None or self.alignments is None:
            raise ValueError("The model does not train.")

        return [self.align(AlignedSent(pair.words, pair.mots))
                for pair in alignSents]
Exemple #20
0
    def test_set_uniform_distortion_probabilities_of_max_displacements(self):
        # Boundary displacement lookups in both distortion tables are
        # expected to carry the uniform probability.

        # arrange
        source_word_classes = {'schinken': 0, 'eier': 0, 'spam': 1}
        target_word_classes = {'ham': 0, 'eggs': 1, 'spam': 2}
        sentence_pairs = [
            AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']),
            AlignedSent(['spam'] * 4, ['spam'] * 2),
        ]
        model = IBMModel4(sentence_pairs, 0,
                          source_word_classes, target_word_classes)

        # act
        model.set_uniform_distortion_probabilities(sentence_pairs)

        # assert
        # number of displacement values =
        #     2 *(number of words in longest target sentence - 1)
        uniform_prob = 1.0 / (2 * (4 - 1))

        # head table: (displacement, src_class, trg_class)
        for displacement, src_class, trg_class in [(3, 0, 0), (-3, 1, 2)]:
            self.assertEqual(
                model.head_distortion_table[displacement][src_class][trg_class],
                uniform_prob)
        # non-head table: indexed with two keys only
        for displacement, word_class in [(3, 0), (-3, 2)]:
            self.assertEqual(
                model.non_head_distortion_table[displacement][word_class],
                uniform_prob)
Exemple #21
0
    def align(self, sentence_pair):
        """
        Compute the best word alignment for one sentence pair taken from
        the corpus the model was trained on.

        The sentence pair passed in is left untouched. For sentence pairs
        outside the training set the result is undefined.

        Strictly speaking this is not Model 3 decoding: fertilities and
        NULL insertion probabilities play no part, only the translation
        and distortion tables are consulted.

        :param sentence_pair: A sentence in the source language and its
            counterpart sentence in the target language
        :type sentence_pair: AlignedSent

        :return: ``AlignedSent`` filled in with the best word alignment
        :rtype: AlignedSent

        :raises ValueError: if the translation or the distortion table
            is ``None``
        """

        if self.translation_table is None or self.distortion_table is None:
            raise ValueError("The model has not been trained.")

        src_words = sentence_pair.mots
        trg_words = sentence_pair.words
        l = len(src_words)
        m = len(trg_words)

        links = []
        for j, trg_word in enumerate(trg_words):
            # Baseline: alignment with NULL (source position 0), never
            # allowed to drop below MIN_PROB.
            null_score = (self.translation_table[trg_word][None] *
                          self.distortion_table[j + 1][0][l][m])
            top_score = max(null_score, IBMModel.MIN_PROB)
            top_index = None

            for i, src_word in enumerate(src_words):
                score = (self.translation_table[trg_word][src_word] *
                         self.distortion_table[j + 1][i + 1][l][m])
                # ">=" lets a later source word win a tie
                if score >= top_score:
                    top_score, top_index = score, i

            # Words that stay with NULL do not appear in the alignment
            if top_index is not None:
                links.append((j, top_index))

        return AlignedSent(trg_words, src_words, links)
    def align(self, align_sent):
        """
        Return a new ``AlignedSent`` with the best alignment for the pair.

        For each word of ``align_sent.words`` (position ``j``) the score
        ``self.t[word][candidate] * self.q[i + 1][j + 1][en_len][ger_len]``
        is computed for every candidate in ``align_sent.mots`` and compared
        with the NULL score (key ``None``, position 0).  Words whose best
        candidate is NULL are left unaligned.  On equal scores the later
        candidate wins.
        """
        this_alignment = []
        en_len = len(align_sent.words)
        ger_len = len(align_sent.mots)

        for j, en_word in enumerate(align_sent.words):
            # Start from the score of aligning with the NULL token
            best_p = self.t[en_word][None] * self.q[0][j + 1][en_len][ger_len]
            best_index = None
            for i, ger_word in enumerate(align_sent.mots):
                this_p = self.t[en_word][ger_word] * self.q[i + 1][j + 1][en_len][ger_len]
                # Plain float comparison: ordering (p, None) against (p, i)
                # tuples raises TypeError on Python 3 when the scores tie.
                if this_p >= best_p:
                    best_p = this_p
                    best_index = i

            if best_index is not None:
                this_alignment.append((j, best_index))

        return AlignedSent(align_sent.words, align_sent.mots, this_alignment)
Exemple #23
0
    def align(self, align_sent):
        """
        Return a new ``AlignedSent`` with the best alignment for the pair.

        For each word of ``align_sent.words`` (position ``j``) the score
        ``self.t[word][candidate] * self.q[i + 1][j + 1][l_e][l_g]`` is
        computed for every candidate in ``align_sent.mots`` and compared
        with the NULL score (key ``None``, position 0).  Words whose best
        candidate is NULL are left unaligned.  On equal scores the later
        candidate wins.

        Raises ``ValueError`` if ``self.t`` or ``self.q`` is ``None``.
        """
        if self.t is None or self.q is None:
            raise ValueError("The model does not train.")

        alignment = []

        l_e = len(align_sent.words)
        l_g = len(align_sent.mots)

        for j, en_word in enumerate(align_sent.words):

            # Start from the score of aligning with the NULL token
            best_prob = self.t[en_word][None] * self.q[0][j + 1][l_e][l_g]
            best_index = None
            for i, g_word in enumerate(align_sent.mots):
                candidate_prob = (self.t[en_word][g_word] *
                                  self.q[i + 1][j + 1][l_e][l_g])
                # Plain float comparison: ordering (prob, None) against
                # (prob, i) tuples raises TypeError on Python 3 on ties.
                if candidate_prob >= best_prob:
                    best_prob = candidate_prob
                    best_index = i

            # Only record the link when the winner is not the NULL token
            if best_index is not None:
                alignment.append((j, best_index))

        return AlignedSent(align_sent.words, align_sent.mots, alignment)
Exemple #24
0
    def align(self, align_sent):
        """
        Return a new ``AlignedSent`` with the best alignment for the pair.

        Each word of ``align_sent.words`` (position ``j``) is linked to the
        position ``i`` of ``align_sent.mots`` that maximises
        ``self.t[word][candidate] * self.q[(i, j, l, m)]``, where ``l`` is
        the length of ``words`` and ``m`` the length of ``mots`` plus one.
        On equal scores the later candidate wins.  No NULL candidate is
        scored here; a word is only left unaligned when ``mots`` is empty.
        """
        alignment = []
        english = align_sent.words
        german = align_sent.mots
        l = len(english)
        # Same value as len([None] + german): mots plus one extra slot
        m = len(german) + 1

        for j, en_word in enumerate(english):
            # float('-inf') replaces -sys.maxint, which no longer exists on
            # Python 3.  The index starts as None (it used to start as 0),
            # so an empty mots no longer yields a bogus (j, 0) link.
            best_prob = float('-inf')
            best_index = None
            for i, g_word in enumerate(german):
                candidate_prob = self.t[en_word][g_word] * self.q[(i, j, l, m)]
                if candidate_prob >= best_prob:
                    best_prob = candidate_prob
                    best_index = i

            if best_index is not None:
                alignment.append((j, best_index))

        return AlignedSent(align_sent.words, align_sent.mots, alignment)
Exemple #25
0
    def align(self, align_sent):
        """
        Return a new ``AlignedSent`` with the best alignment for the pair.

        For each word of ``align_sent.words`` (position ``i``) the score
        ``self.t[word][candidate] * self.q[j + 1][i + 1][wordlen][motslen]``
        is computed for every candidate in ``align_sent.mots`` and compared
        with the NULL score (key ``None``, position 0).  Words whose best
        candidate is NULL are left unaligned.  On equal scores the later
        candidate wins.

        Raises ``ValueError`` if ``self.t`` or ``self.q`` is ``None``.
        """
        if self.t is None or self.q is None:
            raise ValueError("No parameters trained")

        wordlen = len(align_sent.words)
        motslen = len(align_sent.mots)
        alignment = []
        for i, en_word in enumerate(align_sent.words):
            # Start from the score of aligning with the NULL token
            best_prob = (self.t[en_word][None] *
                         self.q[0][i + 1][wordlen][motslen])
            best_index = None
            for j, ge_word in enumerate(align_sent.mots):
                candidate_prob = (self.t[en_word][ge_word] *
                                  self.q[j + 1][i + 1][wordlen][motslen])
                # Plain float comparison: ordering (prob, None) against
                # (prob, j) tuples raises TypeError on Python 3 on ties.
                if candidate_prob >= best_prob:
                    best_prob = candidate_prob
                    best_index = j
            if best_index is not None:
                alignment.append((i, best_index))

        return AlignedSent(align_sent.words, align_sent.mots, alignment)
    def computeAlignments(self, frst_corpus, scnd_corpus, chosen_ibm):
        """
        Train the requested IBM model(s) on two corpora and align them.

        Sentence ``i`` of ``frst_corpus.sents`` is paired with sentence
        ``i`` of ``scnd_corpus.sents``.  With ``chosen_ibm`` equal to
        "ibm1" or "ibm2" the result of that model's ``aligned()`` is
        returned; any other value trains both models and returns the
        tuple ``(ibm1 result, ibm2 result)``.
        """
        sentence_pairs = [
            AlignedSent(first_sent, scnd_corpus.sents[position])
            for position, first_sent in enumerate(frst_corpus.sents)
        ]

        if chosen_ibm == "ibm1":
            return IBMModel1(sentence_pairs).aligned()
        if chosen_ibm == "ibm2":
            return IBMModel2(sentence_pairs).aligned()

        # Any other choice: build both models first, then align with each
        model_one = IBMModel1(sentence_pairs)
        model_two = IBMModel2(sentence_pairs)
        return model_one.aligned(), model_two.aligned()
    def test_best_model2_alignment(self):
        # With a uniform alignment table the best alignment is decided by
        # the translation table alone.

        # arrange
        sentence_pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE,
                                    TestIBMModel.__TEST_SRC_SENTENCE)
        # None and 'bien' have zero fertility
        # Each row holds the probabilities of one target word, listed in the
        # order of the source words in src_columns.
        src_columns = ("j'", 'aime', 'bien', 'jambon', None)
        rows = [
            ('i', (0.9, 0.05, 0.02, 0.03, 0)),
            ('love', (0.05, 0.9, 0.01, 0.01, 0.03)),
            ('ham', (0, 0.01, 0, 0.99, 0)),
        ]
        translation_table = dict(
            (trg_word, dict(zip(src_columns, probs)))
            for trg_word, probs in rows)
        # Four index levels, every leaf defaulting to 0.2
        alignment_table = defaultdict(
            lambda: defaultdict(
                lambda: defaultdict(
                    lambda: defaultdict(lambda: 0.2))))

        model = IBMModel([])
        model.translation_table = translation_table
        model.alignment_table = alignment_table

        # act
        result = model.best_model2_alignment(sentence_pair)

        # assert
        expected_alignment = (1, 2, 4)  # 0th element unused
        expected_cepts = [[], [1], [2], [], [3]]
        self.assertEqual(result.alignment[1:], expected_alignment)
        self.assertEqual(result.cepts, expected_cepts)
Exemple #28
0
    def align(self, align_sent):
        """
        Return a new ``AlignedSent`` with the best alignment for the pair.

        For each word of ``align_sent.words`` (position ``j``) the score
        ``self.t[word][candidate] * self.q[i + 1][j + 1][l][m]`` is computed
        for every candidate in ``align_sent.mots`` and compared with the
        NULL score (key ``None``, position 0).  Words whose best candidate
        is NULL are left unaligned.  On equal scores the later candidate
        wins.
        """
        alignment = []

        l = len(align_sent.words)
        m = len(align_sent.mots)

        for j, ej in enumerate(align_sent.words):

            # Start from the score of aligning with the NULL token
            best_prob = self.t[ej][None] * self.q[0][j + 1][l][m]
            best_index = None

            for i, fi in enumerate(align_sent.mots):
                candidate_prob = self.t[ej][fi] * self.q[i + 1][j + 1][l][m]
                # Plain float comparison: ordering (prob, None) against
                # (prob, i) tuples raises TypeError on Python 3 on ties.
                if candidate_prob >= best_prob:
                    best_prob = candidate_prob
                    best_index = i

            # Only record the link when the winner is not the NULL token
            if best_index is not None:
                alignment.append((j, best_index))

        return AlignedSent(align_sent.words, align_sent.mots, alignment)
Exemple #29
0
Fichier : B.py Projet : bdqnghi/NLP
    def align(self, align_sent):
        """
        Return a new ``AlignedSent`` with the best alignment for the pair.

        For each word of ``align_sent.words`` (position ``i``) the score
        ``self.t[word][candidate] * self.q[j + 1][i + 1][wordlen][motslen]``
        is computed for every candidate in ``align_sent.mots`` and compared
        with the NULL score (key ``None``, position 0).  On equal scores the
        later candidate wins.  Words whose best candidate is NULL are left
        out of the alignment.

        The collected ``(i, j)`` pairs are wrapped in an ``Alignment``.

        Raises ``ValueError`` if ``self.t`` or ``self.q`` is ``None``.
        """
        if self.t is None or self.q is None:
            raise ValueError("No parameters trained")
        wordlen = len(align_sent.words)
        motslen = len(align_sent.mots)
        alignment = []

        for i, en_word in enumerate(align_sent.words):
            # Start from the score of aligning with the NULL token
            best_prob = (self.t[en_word][None] *
                         self.q[0][i + 1][wordlen][motslen])
            # Reset for every word.  The index used to survive from the
            # previous word (or be unbound for the first one) whenever
            # NULL won, producing a wrong link or UnboundLocalError.
            best_pt = None
            for j, ge_word in enumerate(align_sent.mots):
                align_prob = (self.t[en_word][ge_word] *
                              self.q[j + 1][i + 1][wordlen][motslen])
                # Compare the probabilities only; tuple comparison against
                # (prob, None) raises TypeError on Python 3 on ties.
                if align_prob >= best_prob:
                    best_prob = align_prob
                    best_pt = j
            if best_pt is not None:
                alignment.append((i, best_pt))

        return AlignedSent(align_sent.words, align_sent.mots,
                           Alignment(alignment))
Exemple #30
0
    def test_prob_t_a_given_s(self):
        # The Model 2 probability is expected to equal the product of the
        # six lexical probabilities and the six alignment probabilities
        # (compared to four decimal places).

        # arrange
        german = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
        english = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
        training_corpus = [AlignedSent(english, german)]
        a_info = AlignmentInfo((0, 1, 4, 0, 2, 5, 5),
                               [None] + german,
                               ['UNUSED'] + english,
                               None)

        # (target word, source word) pairs that receive probability 0.98
        aligned_word_pairs = [
            ('i', 'ich'),
            ('love', 'gern'),
            ('to', None),
            ('eat', 'esse'),
            ('smoked', 'räucherschinken'),
            ('ham', 'räucherschinken'),
        ]
        lexical_table = defaultdict(lambda: defaultdict(float))
        for trg_word, src_word in aligned_word_pairs:
            lexical_table[trg_word][src_word] = 0.98

        # (source position, target position, probability); the last two
        # keys are always 5 and 6
        position_probs = [
            (0, 3, 0.97),  # None -> to
            (1, 1, 0.97),  # ich -> i
            (2, 4, 0.97),  # esse -> eat
            (4, 2, 0.97),  # gern -> love
            (5, 5, 0.96),  # räucherschinken -> smoked
            (5, 6, 0.96),  # räucherschinken -> ham
        ]
        position_table = defaultdict(
            lambda: defaultdict(lambda: defaultdict(
                lambda: defaultdict(float))))
        for src_pos, trg_pos, prob in position_probs:
            position_table[src_pos][trg_pos][5][6] = prob

        model = IBMModel2(training_corpus, 0)
        model.translation_table = lexical_table
        model.alignment_table = position_table

        # act
        actual = model.prob_t_a_given_s(a_info)

        # assert
        lexical_part = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
        alignment_part = 0.97 * 0.97 * 0.97 * 0.97 * 0.96 * 0.96
        expected = lexical_part * alignment_part
        self.assertEqual(round(actual, 4), round(expected, 4))
Exemple #31
0
    def align(self, align_sent):
        """
        Return a new ``AlignedSent`` with the best alignment for the pair.

        Each word of ``align_sent.words`` (position ``j``) is linked to the
        position ``i`` of ``align_sent.mots`` that maximises
        ``self.t[word][candidate] * self.q[(i + 1, j + 1, l, m)]``, where
        ``l`` and ``m`` are the lengths of ``words`` and ``mots``.  On equal
        scores the later candidate wins.  No NULL candidate is scored here;
        a word is only left unaligned when ``mots`` is empty.
        """
        alignment = []

        l = len(align_sent.words)
        m = len(align_sent.mots)

        for j, en_word in enumerate(align_sent.words):

            # float('-inf') replaces -sys.maxint, which no longer exists on
            # Python 3; any real score beats it.
            best_prob = float('-inf')
            best_index = None
            for i, g_word in enumerate(align_sent.mots):
                candidate_prob = (self.t[en_word][g_word] *
                                  self.q[(i + 1, j + 1, l, m)])
                if candidate_prob >= best_prob:
                    best_prob = candidate_prob
                    best_index = i

            # Nothing to record when there was no candidate at all
            if best_index is not None:
                alignment.append((j, best_index))

        return AlignedSent(align_sent.words, align_sent.mots, alignment)