def __init__(self, corpus):
    """Train IBM Model 1 lexical translation probabilities t(target|source) via EM.

    Learns the table self.transProbOfTargetGivenSource from the
    sentence-aligned `corpus` and prints it with printDictionary().

    Args:
        corpus: must provide getTargetLanguageWordCount(), getEntryCount(),
            getEntryAt(i) -> (sourceSentence, targetSentence),
            getAllSourceLanguageWords() and getAllTargetLanguageWords().

    NOTE(review): reads self.epoch, self.transProbOfTargetGivenSource and
    self.normalizationForTarget, which must already exist before this runs
    (presumably class attributes) -- confirm against the class body.
    """
    tgtLangWordCount = corpus.getTargetLanguageWordCount()

    # ------------------------------------------------------------------
    # Initialization: uniform t(target|source) for every co-occurring pair
    # ------------------------------------------------------------------
    for x in range(corpus.getEntryCount()):
        curPair = corpus.getEntryAt(x)
        srcWords = curPair[0].split()
        tgtWords = curPair[1].split()
        for srcWord in srcWords:
            for tgtWord in tgtWords:
                # uniform start: every target word is equally likely
                self.transProbOfTargetGivenSource[(tgtWord, srcWord)] = 1.0 / tgtLangWordCount

    # ------------------------------------------------------------------
    # EM iterations
    # ------------------------------------------------------------------
    for _ in range(self.epoch):
        # BUG FIX: the expected counts must be reset at the start of EVERY
        # EM iteration.  The original zeroed them only once, before the
        # loop, so counts accumulated across iterations and biased the
        # estimates -- the mystery behind the old "I don't know why the
        # following block affects the result" comment.
        self.countOfTargetGivenSource = {}
        self.countOfSourceWordOccurrence = {}

        # E-step: collect expected (fractional) counts from every pair
        for x in range(corpus.getEntryCount()):
            curPair = corpus.getEntryAt(x)
            srcWords = curPair[0].split()
            tgtWords = curPair[1].split()

            # normalization term: sum over this sentence's source words
            for tgtWord in tgtWords:
                self.normalizationForTarget[tgtWord] = 0.0
                for srcWord in srcWords:
                    self.normalizationForTarget[tgtWord] += \
                        self.transProbOfTargetGivenSource[(tgtWord, srcWord)]

            # fractional count delta for each (target, source) pair;
            # computed once instead of twice as in the original
            for tgtWord in tgtWords:
                for srcWord in srcWords:
                    delta = (self.transProbOfTargetGivenSource[(tgtWord, srcWord)]
                             / self.normalizationForTarget[tgtWord])
                    self.countOfTargetGivenSource[(tgtWord, srcWord)] = \
                        self.countOfTargetGivenSource.get((tgtWord, srcWord), 0.0) + delta
                    self.countOfSourceWordOccurrence[srcWord] = \
                        self.countOfSourceWordOccurrence.get(srcWord, 0.0) + delta

        # M-step: t(target|source) = count(target|source) / total(source)
        uniqueSrcWords = corpus.getAllSourceLanguageWords()
        uniqueTgtWords = corpus.getAllTargetLanguageWords()
        for srcWord in uniqueSrcWords:
            total = self.countOfSourceWordOccurrence.get(srcWord, 0.0)
            if total == 0.0:
                # no evidence for this source word this round; keep old t
                continue
            for tgtWord in uniqueTgtWords:
                # dict.get() replaces the Python-2-only has_key() call
                self.transProbOfTargetGivenSource[(tgtWord, srcWord)] = \
                    self.countOfTargetGivenSource.get((tgtWord, srcWord), 0.0) / total

    printDictionary(self.transProbOfTargetGivenSource)
def __init__(self, corpus):
    """Train IBM Model 2 (translation t plus alignment q parameters) via EM.

    Learns self.t[(srcWord, tgtWord)] and
    self.q[(srcIdx, tgtIdx, srcLen, tgtLen)] from the sentence-aligned
    `corpus` and prints the translation table with printDictionary().

    Args:
        corpus: must provide getEntryCount(), getEntryAt(i) ->
            (sourceSentence, targetSentence), getAllSourceLanguageWords()
            and getAllTargetLanguageWords().

    NOTE(review): reads self.epoch, self.t, self.q, the four count
    dictionaries and self.calcInc(), all of which must be defined on the
    class before this runs -- confirm against the class body.
    """
    # ------------------------------------------------------------------
    # Initialization: flat 0.5 starting values for t and q
    # ------------------------------------------------------------------
    for pairIdx in range(corpus.getEntryCount()):
        curPair = corpus.getEntryAt(pairIdx)
        srcWords = curPair[0].split()
        tgtWords = curPair[1].split()
        srcLen = len(srcWords)
        tgtLen = len(tgtWords)
        for srcWordIdx in range(srcLen):
            for tgtWordIdx in range(tgtLen):
                self.t[(srcWords[srcWordIdx], tgtWords[tgtWordIdx])] = 0.5
                self.q[(srcWordIdx, tgtWordIdx, srcLen, tgtLen)] = 0.5

    for epochIdx in range(self.epoch):
        # ------------------------------------------------------------------
        # Reset all expected counts to zero at the start of each iteration
        # ------------------------------------------------------------------
        for pairIdx in range(corpus.getEntryCount()):
            curPair = corpus.getEntryAt(pairIdx)
            srcWords = curPair[0].split()
            tgtWords = curPair[1].split()
            srcLen = len(srcWords)
            tgtLen = len(tgtWords)
            for srcWordIdx in range(srcLen):
                for tgtWordIdx in range(tgtLen):
                    self.srcTgtTransCounts[(srcWords[srcWordIdx], tgtWords[tgtWordIdx])] = 0.0
                    self.srcOccurCounts[srcWords[srcWordIdx]] = 0.0
                    self.srcTgtAlignCounts[(srcWordIdx, tgtWordIdx, srcLen, tgtLen)] = 0.0
                    self.sentenceCoocurCounts[(tgtWordIdx, srcLen, tgtLen)] = 0.0

        # ------------------------------------------------------------------
        # E-step: accumulate expected counts over all sentence pairs
        # ------------------------------------------------------------------
        for pairIdx in range(corpus.getEntryCount()):
            curPair = corpus.getEntryAt(pairIdx)
            srcWords = curPair[0].split()
            tgtWords = curPair[1].split()
            srcLen = len(srcWords)
            tgtLen = len(tgtWords)
            for tgtWordIdx in range(tgtLen):
                for srcWordIdx in range(srcLen):
                    # posterior alignment probability for this word pair
                    inc = self.calcInc(tgtWordIdx, srcWordIdx, tgtWords,
                                       srcWords, tgtLen, srcLen)
                    self.srcTgtTransCounts[(srcWords[srcWordIdx], tgtWords[tgtWordIdx])] += inc
                    self.srcOccurCounts[srcWords[srcWordIdx]] += inc
                    self.srcTgtAlignCounts[(srcWordIdx, tgtWordIdx, srcLen, tgtLen)] += inc
                    self.sentenceCoocurCounts[(tgtWordIdx, srcLen, tgtLen)] += inc

        # ------------------------------------------------------------------
        # M-step: t(src, tgt) = count(src, tgt) / count(src)
        # ------------------------------------------------------------------
        uniqueSrcWords = corpus.getAllSourceLanguageWords()
        uniqueTgtWords = corpus.getAllTargetLanguageWords()
        for srcWord in uniqueSrcWords:
            # dict.get() replaces the Python-2-only has_key() calls.
            # BUG FIX: the original guarded the numerator against missing
            # keys but divided unconditionally by a denominator defaulted
            # to 0.0, risking ZeroDivisionError -- skip those words.
            srcTotal = self.srcOccurCounts.get(srcWord, 0.0)
            if srcTotal == 0.0:
                continue
            for tgtWord in uniqueTgtWords:
                self.t[(srcWord, tgtWord)] = \
                    self.srcTgtTransCounts.get((srcWord, tgtWord), 0.0) / srcTotal

        # BUG FIX: the alignment counts were collected every iteration but
        # never applied, leaving q frozen at its 0.5 initialization.  The
        # standard Model 2 M-step re-estimates
        # q(i | j, l, m) = c(i, j, l, m) / c(j, l, m).
        for alignKey, alignCount in self.srcTgtAlignCounts.items():
            srcWordIdx, tgtWordIdx, srcLen, tgtLen = alignKey
            denom = self.sentenceCoocurCounts.get((tgtWordIdx, srcLen, tgtLen), 0.0)
            if denom != 0.0:
                self.q[alignKey] = alignCount / denom

    printDictionary(self.t)