Example #1
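Expectation-maximization training for an IBM Model 2-style word aligner. Besides the lexical table t(e|f) it learns position-alignment tables a(i | j, lenE, lenF), one per sentence-length pair; each E-step computes the posterior delta(i, j) = t(e_i|f_j) * a(i|j) / sum_j' t(e_i|f_j') * a(i|j'), and each M-step renormalizes the accumulated counts. np is NumPy and tqdm the usual progress-bar library; Alphabet, AlignmentDict and CountDict are project helpers, sketched after the code.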
    def train(self, bitextGen):
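        """Estimate t(e|f) and a(i | j, lenE, lenF) by EM over a parallel corpus.

        bitextGen is expected to be a callable that re-yields
        (frenchSentence, englishSentence) token-list pairs and accepts a
        progress-bar description; self.epochs sets the number of EM passes.
        """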
        self.frenchAlphabet = Alphabet.from_iterable(
            word for frSent, enSent in bitextGen(desc='French Alphabet')
            for word in frSent)
        self.englishAlphabet = Alphabet.from_iterable(
            word for frSent, enSent in bitextGen(desc='English Alphabet')
            for word in enSent)
        self.frenchAlphabet.freeze()
        self.englishAlphabet.freeze()
        vF = len(self.frenchAlphabet)
        vE = len(self.englishAlphabet)
        # Uniform initialization of t(e|f); 1/vE makes each column a proper
        # distribution over English words (the constant cancels in the first
        # E-step anyway)
        tOfEGivenF = np.ones((vE, vF)) / vE
        aOfIJGivenLenELenF = AlignmentDict()
        for ep in tqdm(range(self.epochs), desc='Epoch'):
            countOfEGivenF = np.zeros((vE, vF))
            totalOfF = np.zeros(vF)
            countOfIGivenJ = AlignmentDict()
            totalOfJ = CountDict()
            for frSent, enSent in bitextGen(desc='Training'):
                # Map both sentences to vocabulary indices and fetch the
                # alignment table for this (lenE, lenF) length pair
                lenF = len(frSent)
                frMask = self.frenchAlphabet.map(frSent)

                lenE = len(enSent)
                enMask = self.englishAlphabet.map(enSent)

                aOfIJ = aOfIJGivenLenELenF[lenE, lenF]

                # Total probability of each English word being generated by
                # the French sentence; shape (len(enSent), 1)
                sTotalOfE = np.sum(tOfEGivenF[np.ix_(enMask, frMask)] * aOfIJ,
                                   axis=1,
                                   keepdims=True)

                # E-step: accumulate expected counts

                delta = tOfEGivenF[np.ix_(enMask, frMask)] * aOfIJ / sTotalOfE
                deltaSummedOverE = np.sum(delta, axis=0)

                # np.add.at is unbuffered, so words that occur more than once
                # in a sentence contribute once per occurrence; fancy-indexed
                # += would silently drop the duplicates
                np.add.at(countOfEGivenF, np.ix_(enMask, frMask), delta)
                np.add.at(totalOfF, frMask, deltaSummedOverE)

                countOfIGivenJ[lenE, lenF] += delta
                totalOfJ[lenE, lenF] += deltaSummedOverE

            # M-step: renormalize counts into probabilities
            tOfEGivenF = countOfEGivenF / totalOfF
            for lenE, lenF in aOfIJGivenLenELenF:
                aOfIJGivenLenELenF[lenE, lenF] = (
                    countOfIGivenJ[lenE, lenF] / totalOfJ[lenE, lenF])

        self.tOfEGivenF = tOfEGivenF
        self.aOfIJGivenLenELenF = aOfIJGivenLenELenF
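
Neither example shows the helper types it relies on. The sketch below is inferred purely from how they are used above; the bodies are assumptions, not the original implementations. In particular, AlignmentDict must default to a non-zero table (an all-zero aOfIJ would make delta 0/0 on the first pass), and since Example #1 also reuses it for countOfIGivenJ, that uniform default leaks into the alignment counts as light, unnormalized smoothing.

import numpy as np


class Alphabet:
    # Word <-> integer-id mapping (sketch; the real class is not shown).
    def __init__(self):
        self._index = {}
        self._frozen = False

    @classmethod
    def from_iterable(cls, words):
        alphabet = cls()
        for word in words:
            alphabet._index.setdefault(word, len(alphabet._index))
        return alphabet

    def freeze(self):
        # Fix the vocabulary size; a no-op guard in this sketch.
        self._frozen = True

    def map(self, sentence):
        # Integer index array, usable with np.ix_ fancy indexing.
        return np.array([self._index[word] for word in sentence])

    def __len__(self):
        return len(self._index)


class AlignmentDict(dict):
    # Keyed by (lenE, lenF); missing keys default to a uniform table whose
    # columns sum to 1 over English positions i (assumption).
    def __missing__(self, key):
        lenE, lenF = key
        self[key] = np.full((lenE, lenF), 1.0 / lenE)
        return self[key]


class CountDict(dict):
    # defaultdict-style counter: missing keys read as 0, so += works for
    # scalars and NumPy arrays alike (assumption).
    def __missing__(self, key):
        return 0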
Example #2
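The same EM loop stripped down to IBM Model 1: with no alignment table, every French position is equally likely a priori, so the posterior simplifies to delta(i, j) = t(e_i|f_j) / sum_j' t(e_i|f_j'). A usage sketch follows the code.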
    def train(self, bitextGen):
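        """Estimate the lexical table t(e|f) by EM (IBM Model 1, no alignments).

        bitextGen is expected to be a callable that re-yields
        (frenchSentence, englishSentence) token-list pairs and accepts a
        progress-bar description; self.epochs sets the number of EM passes.
        """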
        self.frenchAlphabet = Alphabet.from_iterable(
            word for frSent, enSent in bitextGen(desc='French Alphabet')
            for word in frSent)
        self.englishAlphabet = Alphabet.from_iterable(
            word for frSent, enSent in bitextGen(desc='English Alphabet')
            for word in enSent)
        self.frenchAlphabet.freeze()
        self.englishAlphabet.freeze()
        vF = len(self.frenchAlphabet)
        vE = len(self.englishAlphabet)
        # Uniform initialization of t(e|f); 1/vE makes each column a proper
        # distribution over English words (the constant cancels in the first
        # E-step anyway)
        tOfEGivenF = np.ones((vE, vF)) / vE
        for ep in tqdm(range(self.epochs), desc='Epoch'):
            countOfEGivenF = np.zeros((vE, vF))
            totalOfF = np.zeros(vF)
            for frSent, enSent in bitextGen(desc='Training'):
                # Map both sentences to vocabulary indices
                frMask = self.frenchAlphabet.map(frSent)

                enMask = self.englishAlphabet.map(enSent)

                # Total probability of each English word being generated by
                # the French sentence; shape (len(enSent), 1)
                sTotalOfE = np.sum(tOfEGivenF[np.ix_(enMask, frMask)],
                                   axis=1,
                                   keepdims=True)

                # E-step: accumulate expected counts

                delta = tOfEGivenF[np.ix_(enMask, frMask)] / sTotalOfE
                # np.add.at is unbuffered, so repeated words contribute once
                # per occurrence; fancy-indexed += would drop the duplicates
                np.add.at(countOfEGivenF, np.ix_(enMask, frMask), delta)
                np.add.at(totalOfF, frMask, np.sum(delta, axis=0))

            # M-step: renormalize counts into probabilities
            tOfEGivenF = countOfEGivenF / totalOfF

        self.tOfEGivenF = tOfEGivenF
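
A hypothetical usage sketch: the class name IBMModel1 and the toy corpus are invented for illustration, and only the bitextGen calling convention is taken from the code above.

bitext = [
    (['la', 'maison'], ['the', 'house']),
    (['la', 'fleur'], ['the', 'flower']),
    (['une', 'maison', 'bleue'], ['a', 'blue', 'house']),
]


def bitextGen(desc=None):
    # train() calls this several times, optionally with a progress-bar
    # description; each call must yield the corpus afresh.
    return iter(bitext)


model = IBMModel1()   # hypothetical class hosting the train() above
model.epochs = 10     # train() reads self.epochs
model.train(bitextGen)
# Each column of tOfEGivenF is a distribution over English words for one
# French word.
print(model.tOfEGivenF.shape)   # (len(englishAlphabet), len(frenchAlphabet))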