def train(self, bitextGen):
    """Fit IBM Model 2 parameters via EM over a parallel corpus.

    Builds frozen vocabularies from the bitext, then alternates E/M steps
    for ``self.epochs`` iterations, estimating:

      * ``tOfEGivenF``          — translation table, shape (vE, vF),
        t(e | f) for every English/French vocabulary pair.
      * ``aOfIJGivenLenELenF``  — alignment tables keyed by sentence-length
        pair (lenE, lenF); each entry is a (lenE, lenF) matrix a(i | j).

    Args:
        bitextGen: callable returning an iterable of (frSent, enSent)
            token-sequence pairs; accepts a ``desc`` progress label.
            NOTE(review): assumed re-iterable (it is called three times).

    Side effects: sets ``self.frenchAlphabet``, ``self.englishAlphabet``,
    ``self.tOfEGivenF`` and ``self.aOfIJGivenLenELenF``.
    """
    self.frenchAlphabet = Alphabet.from_iterable(
        word for frSent, enSent in bitextGen(desc='French Alphabet')
        for word in frSent)
    self.englishAlphabet = Alphabet.from_iterable(
        word for frSent, enSent in bitextGen(desc='English Alphabet')
        for word in enSent)
    self.frenchAlphabet.freeze()
    self.englishAlphabet.freeze()

    vF = len(self.frenchAlphabet)
    vE = len(self.englishAlphabet)
    # Uniform initialization: each French word equally likely to emit any
    # English word; alignment tables are created lazily by AlignmentDict.
    tOfEGivenF = np.ones((vE, vF)) / vF
    aOfIJGivenLenELenF = AlignmentDict()

    for ep in tqdm(range(self.epochs), desc='Epoch'):
        # Expected counts accumulated over the corpus for this epoch.
        countOfEGivenF = np.zeros((vE, vF))
        totalOfF = np.zeros(vF)
        countOfIGivenJ = AlignmentDict()
        totalOfJ = CountDict()

        for frSent, enSent in bitextGen('Training'):
            # Compute normalization stuff
            lenF = len(frSent)
            frMask = self.frenchAlphabet.map(frSent)
            lenE = len(enSent)
            enMask = self.englishAlphabet.map(enSent)
            aOfIJ = aOfIJGivenLenELenF[lenE, lenF]

            # Total probability of each English word being translated from
            # the French ones; shape (lenE, 1). Duplicate indices in the
            # masks are fine here: fancy-index READS repeat rows/columns.
            sTotalOfE = np.sum(tOfEGivenF[np.ix_(enMask, frMask)] * aOfIJ,
                               axis=1, keepdims=True)

            # E step: posterior responsibility of French position j for
            # English position i in this sentence pair.
            delta = tOfEGivenF[np.ix_(enMask, frMask)] * aOfIJ / sTotalOfE
            deltaSummedOverE = np.sum(delta, axis=0)

            # BUGFIX: use unbuffered np.add.at instead of fancy-index "+=".
            # When a word occurs more than once in a sentence, the mask
            # contains duplicate indices and buffered "+=" applies only one
            # of the duplicate contributions, silently dropping counts.
            np.add.at(countOfEGivenF, np.ix_(enMask, frMask), delta)
            np.add.at(totalOfF, frMask, deltaSummedOverE)
            # Per-length-pair alignment counts index by position, not by
            # vocabulary id, so plain "+=" is correct here.
            countOfIGivenJ[lenE, lenF] += delta
            totalOfJ[lenE, lenF] += deltaSummedOverE

        # M step: renormalize expected counts into probabilities.
        tOfEGivenF = countOfEGivenF / totalOfF
        for lenE, lenF in aOfIJGivenLenELenF:
            aOfIJGivenLenELenF[lenE, lenF] = (
                countOfIGivenJ[lenE, lenF] / totalOfJ[lenE, lenF])

    self.tOfEGivenF = tOfEGivenF
    self.aOfIJGivenLenELenF = aOfIJGivenLenELenF
def train(self, bitextGen):
    """Fit IBM Model 1 translation probabilities via EM.

    Builds frozen vocabularies from the bitext, then runs ``self.epochs``
    EM iterations estimating ``tOfEGivenF``, a (vE, vF) table of
    t(e | f) for every English/French vocabulary pair. Model 1 assumes a
    uniform alignment distribution, so only the translation table is fit.

    Args:
        bitextGen: callable returning an iterable of (frSent, enSent)
            token-sequence pairs; accepts a ``desc`` progress label.
            NOTE(review): assumed re-iterable (it is called three times).

    Side effects: sets ``self.frenchAlphabet``, ``self.englishAlphabet``
    and ``self.tOfEGivenF``.
    """
    self.frenchAlphabet = Alphabet.from_iterable(
        word for frSent, enSent in bitextGen(desc='French Alphabet')
        for word in frSent)
    self.englishAlphabet = Alphabet.from_iterable(
        word for frSent, enSent in bitextGen(desc='English Alphabet')
        for word in enSent)
    self.frenchAlphabet.freeze()
    self.englishAlphabet.freeze()

    vF = len(self.frenchAlphabet)
    vE = len(self.englishAlphabet)
    # Uniform initialization: each French word equally likely to emit any
    # English word.
    tOfEGivenF = np.ones((vE, vF)) / vF

    for ep in tqdm(range(self.epochs), desc='Epoch'):
        # Expected counts accumulated over the corpus for this epoch.
        countOfEGivenF = np.zeros((vE, vF))
        totalOfF = np.zeros(vF)

        for frSent, enSent in bitextGen('Training'):
            # Compute normalization stuff
            frMask = self.frenchAlphabet.map(frSent)
            enMask = self.englishAlphabet.map(enSent)

            # Total probability of each English word being translated from
            # the French ones; shape (lenE, 1). Duplicate indices in the
            # masks are fine here: fancy-index READS repeat rows/columns.
            sTotalOfE = np.sum(tOfEGivenF[np.ix_(enMask, frMask)],
                               axis=1, keepdims=True)

            # E step: posterior responsibility of each French token for
            # each English token in this sentence pair.
            delta = tOfEGivenF[np.ix_(enMask, frMask)] / sTotalOfE

            # BUGFIX: use unbuffered np.add.at instead of fancy-index "+=".
            # When a word occurs more than once in a sentence, the mask
            # contains duplicate indices and buffered "+=" applies only one
            # of the duplicate contributions, silently dropping counts.
            np.add.at(countOfEGivenF, np.ix_(enMask, frMask), delta)
            np.add.at(totalOfF, frMask, np.sum(delta, axis=0))

        # M step: renormalize expected counts into probabilities.
        tOfEGivenF = countOfEGivenF / totalOfF

    self.tOfEGivenF = tOfEGivenF