Example #1
    def __init__(self, sentence_aligned_corpus, iterations):
        """
        Train on ``sentence_aligned_corpus`` and create a lexical
        translation model, a distortion model, a fertility model, and a
        model for generating NULL-aligned words.

        Translation direction is from ``AlignedSent.mots`` to
        ``AlignedSent.words``.

        Runs a few iterations of Model 2 training to initialize
        model parameters.

        :param sentence_aligned_corpus: Sentence-aligned parallel corpus
        :type sentence_aligned_corpus: list(AlignedSent)

        :param iterations: Number of iterations to run training algorithm
        :type iterations: int
        """

        super(IBMModel3, self).__init__(sentence_aligned_corpus)

        # Get the translation and alignment probabilities from IBM model 2
        ibm2 = IBMModel2(sentence_aligned_corpus, iterations)
        self.translation_table = ibm2.translation_table

        # Alignment table is only used for hill climbing and is not part
        # of the output of Model 3 training
        self.alignment_table = ibm2.alignment_table

        self.train(sentence_aligned_corpus, iterations)
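
A minimal usage sketch for this constructor (toy corpus invented for illustration; assumes NLTK 3's nltk.translate API):

from nltk.translate import AlignedSent, IBMModel3

# Translation direction is mots -> words, per the docstring above.
corpus = [
    AlignedSent(['the', 'house'], ['das', 'Haus']),
    AlignedSent(['the', 'book'], ['das', 'Buch']),
    AlignedSent(['a', 'book'], ['ein', 'Buch']),
]
ibm3 = IBMModel3(corpus, 5)  # 5 EM iterations
print(ibm3.translation_table['book']['Buch'])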
Example #2
File: A.py Project: bdqnghi/NLP
def testmain(aligned_sents):
    # range(19, 20) exercises a single iteration count, n = 19;
    # widen the range to sweep other values.
    for n in range(19, 20):
        print('%s\n' % n)
        ibm1 = IBMModel1(aligned_sents, n)
        avg_aer = compute_avg_aer(aligned_sents, ibm1, 50)

        print('IBM Model 1')
        print('---------------------------')
        print('Average AER: {0:.3f}\n'.format(avg_aer))

        ibm2 = IBMModel2(aligned_sents, n)
        avg_aer = compute_avg_aer(aligned_sents, ibm2, 50)

        print('IBM Model 2')
        print('---------------------------')
        print('Average AER: {0:.3f}\n'.format(avg_aer))
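
The helper compute_avg_aer is defined elsewhere in the bdqnghi/NLP project. A hypothetical reconstruction, assuming each AlignedSent carries a gold-standard alignment and the model exposes an align() method (both are assumptions, not shown in this file):

from nltk.translate.metrics import alignment_error_rate

def compute_avg_aer(aligned_sents, model, n):
    # Average alignment error rate over the first n sentence pairs.
    total = 0.0
    for sent in aligned_sents[:n]:
        gold = sent.alignment                     # gold-standard alignment
        hypothesis = model.align(sent).alignment  # model's best alignment (assumed API)
        total += alignment_error_rate(gold, hypothesis)
    return total / n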
Example #3
    def test_prob_t_a_given_s(self):
        # arrange
        src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken']
        trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham']
        corpus = [AlignedSent(trg_sentence, src_sentence)]
        # alignment[j] = i maps target position j to source position i;
        # index 0 of each sequence is a placeholder.
        alignment_info = AlignmentInfo((0, 1, 4, 0, 2, 5, 5),
                                       [None] + src_sentence,
                                       ['UNUSED'] + trg_sentence,
                                       None)

        translation_table = defaultdict(lambda: defaultdict(float))
        translation_table['i']['ich'] = 0.98
        translation_table['love']['gern'] = 0.98
        translation_table['to'][None] = 0.98
        translation_table['eat']['esse'] = 0.98
        translation_table['smoked']['räucherschinken'] = 0.98
        translation_table['ham']['räucherschinken'] = 0.98

        alignment_table = defaultdict(
            lambda: defaultdict(lambda: defaultdict(
                lambda: defaultdict(float))))
        alignment_table[0][3][5][6] = 0.97  # None -> to
        alignment_table[1][1][5][6] = 0.97  # ich -> i
        alignment_table[2][4][5][6] = 0.97  # esse -> eat
        alignment_table[4][2][5][6] = 0.97  # gern -> love
        alignment_table[5][5][5][6] = 0.96  # räucherschinken -> smoked
        alignment_table[5][6][5][6] = 0.96  # räucherschinken -> ham

        model2 = IBMModel2(corpus, 0)
        model2.translation_table = translation_table
        model2.alignment_table = alignment_table

        # act
        probability = model2.prob_t_a_given_s(alignment_info)

        # assert
        lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98
        alignment = 0.97 * 0.97 * 0.97 * 0.97 * 0.96 * 0.96
        expected_probability = lexical_translation * alignment
        self.assertEqual(round(probability, 4), round(expected_probability, 4))
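
Model 2 scores a target sentence and alignment as a product over target positions, t(t_j | s_a_j) * a(a_j | j, l, m), which is exactly what the assertion multiplies out. A minimal sketch of that computation, using the same table layout as the test (not NLTK's actual method body):

def prob_t_a_given_s_sketch(alignment_info, translation_table, alignment_table):
    # Lengths exclude the NULL / 'UNUSED' placeholders at index 0.
    l = len(alignment_info.src_sentence) - 1
    m = len(alignment_info.trg_sentence) - 1
    prob = 1.0
    for j in range(1, m + 1):
        i = alignment_info.alignment[j]
        t = alignment_info.trg_sentence[j]
        s = alignment_info.src_sentence[i]
        prob *= translation_table[t][s] * alignment_table[i][j][l][m]
    return prob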
Example #4
File: A.py Project: bdqnghi/NLP
def create_ibm2(aligned_sents):
    return IBMModel2(aligned_sents, 10)
Example #5
    def train(self, parallel_corpus, iterations):
        """
        Learns and sets probability tables
        """

        # Get the translation and alignment probabilities from IBM model 2
        ibm2 = IBMModel2(parallel_corpus, iterations)
        self.translation_table = ibm2.translation_table
        """
        dict(dict(float)): probability(target word | source word). Values
            accessed with ``translation_table[target_word][source_word].``
        """

        self.alignment_table = ibm2.alignment_table

        src_vocab = set()
        trg_vocab = set()
        for aligned_sentence in parallel_corpus:
            trg_vocab.update(aligned_sentence.words)
            src_vocab.update(aligned_sentence.mots)
        # Add the NULL token
        src_vocab.add(None)

        # Initial probability of null insertion
        self.p0 = 0.5
        """
        float: probability that a generated word does not require
            another target word that is aligned to NULL
        """

        self.fertility_table = defaultdict(
            lambda: defaultdict(lambda: self.PROB_SMOOTH))
        """
        dict(dict(float)))): probability(fertility | source word). Values
            accessed with ``fertility_table[fertility][source_word].``
        """

        self.distortion_table = defaultdict(
            lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
                lambda: self.PROB_SMOOTH))))
        """
        dict(dict(dict(dict(float)))): probability(j | i,l,m). Values
            accessed with ``distortion_table[j][i][m][l].``
        """

        for k in range(0, iterations):
            max_fertility = 0
            # Reset all counts
            count_t_given_s = defaultdict(lambda: defaultdict(lambda: 0.0))
            count_any_t_given_s = defaultdict(lambda: 0.0)

            distortion_count = defaultdict(lambda: defaultdict(
                lambda: defaultdict(lambda: defaultdict(lambda: 0.0))))
            distortion_count_for_any_j = defaultdict(
                lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))

            count_p0 = 0.0
            count_p1 = 0.0

            fertility_count = defaultdict(lambda: defaultdict(lambda: 0.0))
            fertility_count_for_any_phi = defaultdict(lambda: 0.0)

            for aligned_sentence in parallel_corpus:
                trg_sentence = aligned_sentence.words
                src_sentence = [None] + aligned_sentence.mots
                l = len(src_sentence) - 1
                m = len(trg_sentence)

                # Sample the alignment space
                sampled_alignments = self.sample(trg_sentence, src_sentence)

                total_count = 0.0

                # E step (a): Compute normalization factors to weigh counts
                for (alignment, fert) in sampled_alignments:
                    count = self.probability(alignment, trg_sentence,
                                             src_sentence, fert)
                    total_count += count

                # E step (b): Collect counts
                for (alignment, fert) in sampled_alignments:
                    count = self.probability(alignment, trg_sentence,
                                             src_sentence, fert)
                    normalized_count = count / total_count
                    null_count = 0

                    for j in range(1, m + 1):
                        t = trg_sentence[j - 1]
                        i = alignment[j]
                        s = src_sentence[i]

                        # Lexical translation
                        count_t_given_s[t][s] += normalized_count
                        count_any_t_given_s[s] += normalized_count

                        # Distortion
                        distortion_count[j][i][m][l] += normalized_count
                        distortion_count_for_any_j[i][m][l] += normalized_count

                        if i == 0:
                            null_count += 1

                    # NULL-aligned words generation
                    count_p1 += null_count * normalized_count
                    count_p0 += (m - 2 * null_count) * normalized_count

                    # Fertility
                    for i in range(0, l + 1):
                        fertility = 0

                        for j in range(1, m + 1):
                            if i == alignment[j]:
                                fertility += 1

                        s = src_sentence[i]
                        fertility_count[fertility][s] += normalized_count
                        fertility_count_for_any_phi[s] += normalized_count

                        if fertility > max_fertility:
                            max_fertility = fertility

            self.translation_table = defaultdict(
                lambda: defaultdict(lambda: 0.0))
            self.distortion_table = defaultdict(lambda: defaultdict(
                lambda: defaultdict(lambda: defaultdict(lambda: 0.0))))
            self.fertility_table = defaultdict(
                lambda: defaultdict(lambda: 0.0))

            # M step: Update probabilities with maximum likelihood estimates
            # Lexical translation
            for s in src_vocab:
                for t in trg_vocab:
                    self.translation_table[t][s] = (count_t_given_s[t][s] /
                                                    count_any_t_given_s[s])

            # Distortion
            for aligned_sentence in parallel_corpus:
                trg_sentence = aligned_sentence.words
                src_sentence = [None] + aligned_sentence.mots
                l = len(src_sentence) - 1
                m = len(trg_sentence)

                for i in range(0, l + 1):
                    for j in range(1, m + 1):
                        self.distortion_table[j][i][m][l] = (
                            distortion_count[j][i][m][l] /
                            distortion_count_for_any_j[i][m][l])

            # Fertility
            for fertility in range(0, max_fertility + 1):
                for s in src_vocab:
                    self.fertility_table[fertility][s] = (
                        fertility_count[fertility][s] /
                        fertility_count_for_any_phi[s])

            # NULL-aligned words generation
            p1 = count_p1 / (count_p1 + count_p0)
            self.p0 = 1 - p1
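
The closing lines implement Model 3's NULL-insertion model: every expected NULL alignment contributes to count_p1, the remaining m - 2 * null_count positions contribute to count_p0, and p1 is re-estimated as their normalized ratio. A toy check of that arithmetic (numbers invented for illustration):

m, null_count, normalized_count = 6, 1, 0.5
count_p1 = null_count * normalized_count            # 0.5
count_p0 = (m - 2 * null_count) * normalized_count  # 2.0
p1 = count_p1 / (count_p1 + count_p0)               # 0.2, so p0 = 0.8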
Example #6
    def train(self, align_sents, num_iter):
        """
        This function is the main process of training model, which
        initialize all the probability distributions and executes 
        a specific number of iterations. 
        """
        # Get the translation and alignment probabilities from IBM model 2
        ibm2 = IBMModel2(align_sents, num_iter)
        self.probabilities, self.align_table = ibm2.probabilities, ibm2.alignments

        fr_vocab = set()
        en_vocab = set()
        for alignSent in align_sents:
            en_vocab.update(alignSent.words)
            fr_vocab.update(alignSent.mots)
        fr_vocab.add(None)

        # Initial probability of null insertion.
        self.null_insertion = 0.5

        self.fertility = defaultdict(
            lambda: defaultdict(lambda: self.PROB_SMOOTH))
        self.distortion = defaultdict(lambda: defaultdict(lambda: defaultdict(
            lambda: defaultdict(lambda: self.PROB_SMOOTH))))

        for k in range(0, num_iter):
            max_fert = 0
            # Set all count* and total* to 0
            count_t = defaultdict(lambda: defaultdict(lambda: 0.0))
            total_t = defaultdict(lambda: 0.0)

            count_d = defaultdict(lambda: defaultdict(lambda: defaultdict(
                lambda: defaultdict(lambda: 0.0))))
            total_d = defaultdict(
                lambda: defaultdict(lambda: defaultdict(lambda: 0.0)))

            count_p0 = 0.0
            count_p1 = 0.0

            count_f = defaultdict(lambda: defaultdict(lambda: 0.0))
            total_f = defaultdict(lambda: 0.0)

            for alignSent in align_sents:

                en_set = alignSent.words
                fr_set = [None] + alignSent.mots
                l_f = len(fr_set) - 1
                l_e = len(en_set)

                # Sample the alignment space
                A = self.sample(en_set, fr_set)

                # Collect counts
                c_total = 0.0

                for (a, fert) in A:
                    c_total += self.probability(a, en_set, fr_set, fert)

                for (a, fert) in A:
                    c = self.probability(a, en_set, fr_set, fert) / c_total
                    null = 0

                    for j in range(1, l_e + 1):
                        en_word = en_set[j - 1]
                        fr_word = fr_set[a[j]]

                        # Lexical translation
                        count_t[en_word][fr_word] += c
                        total_t[fr_word] += c

                        # Distortion
                        count_d[j][a[j]][l_e][l_f] += c
                        total_d[a[j]][l_e][l_f] += c

                        if a[j] == 0:
                            null += 1

                    # Collect the counts of null insertion
                    count_p1 += null * c
                    count_p0 += (l_e - 2 * null) * c

                    # Collect the counts of fertility
                    for i in range(0, l_f + 1):
                        fertility = 0

                        for j in range(1, l_e + 1):
                            if i == a[j]:
                                fertility += 1

                        fr_word = fr_set[i]
                        count_f[fertility][fr_word] += c
                        total_f[fr_word] += c

                        if fertility > max_fert:
                            max_fert = fertility

            self.probabilities = defaultdict(lambda: defaultdict(lambda: 0.0))
            self.distortion = defaultdict(lambda: defaultdict(
                lambda: defaultdict(lambda: defaultdict(lambda: 0.0))))
            self.fertility = defaultdict(lambda: defaultdict(lambda: 0.0))

            # Estimate translation probability distribution
            for f in fr_vocab:
                for e in en_vocab:
                    self.probabilities[e][f] = count_t[e][f] / total_t[f]

            # Estimate distortion
            for alignSent in align_sents:
                en_set = alignSent.words
                fr_set = [None] + alignSent.mots
                l_f = len(fr_set) - 1
                l_e = len(en_set)

                for i in range(0, l_f + 1):
                    for j in range(1, l_e + 1):
                        self.distortion[j][i][l_e][l_f] = count_d[j][i][l_e][
                            l_f] / total_d[i][l_e][l_f]

            # Estimate the fertility, n(Fertility | input word)
            for ferti in range(0, max_fert + 1):
                for fr_word in fr_vocab:
                    self.fertility[ferti][
                        fr_word] = count_f[ferti][fr_word] / total_f[fr_word]

            # Estimate the probability of null insertion
            p1 = count_p1 / (count_p1 + count_p0)
            self.null_insertion = 1 - p1