def Train(self, sourceCorpusFile, targetCorpusFile, iterations):
    """Re-estimate the translation table over a parallel corpus.

    Reads both corpus files with `u.readFromFile`, warns (without stopping)
    when they contain different numbers of lines, initializes the model and
    then performs `iterations` rounds of count computation followed by
    t-value computation, printing timestamps along the way.

    Returns the `__tMap` attribute once all rounds have run.
    """

    def say(*items):
        # One line of output, items separated by single spaces.
        print(" ".join(["%s" % (item,) for item in items]))

    srcCorpus = u.readFromFile(sourceCorpusFile)
    tgtCorpus = u.readFromFile(targetCorpusFile)
    srcTotal, tgtTotal = len(srcCorpus), len(tgtCorpus)
    if srcTotal != tgtTotal:
        print("Source(%s) and target(%s) corpus lengths differ."
              % (srcTotal, tgtTotal))

    say(u.now(), "Initializing")
    self.__Initialize(srcCorpus, tgtCorpus)

    for roundNo in xrange(1, iterations + 1):
        countsStarted = u.now()
        say(countsStarted, "Computing Counts for iteration", roundNo)
        self.__ComputeCounts(srcCorpus, tgtCorpus)

        tValuesStarted = u.now()
        say(tValuesStarted, "Computing t values for iteration", roundNo)
        self.__ComputeTValues(srcCorpus, tgtCorpus)

        # Captured before the summary lines so it marks the end of the work.
        finished = u.now()
        say(u.now(), "Iteration", roundNo, "complete.")
        say(u.now(), "Started count computations at %s." % countsStarted)
        say(u.now(), "Started t value computations at %s." % tValuesStarted)
        say(u.now(), "Finished at %s" % finished)

    return self.__tMap
def Train(self, sourceCorpusFile, targetCorpusFile, iterations):
    """Train the model on a pair of line-aligned corpus files.

    The files are loaded through `u.readFromFile`.  A length mismatch only
    produces a printed warning.  After initialization, each of the
    `iterations` passes computes counts and then t values, and prints when
    each phase started and when the pass finished.

    Returns the `__tMap` attribute after the last pass.
    """

    def emit(*parts):
        # Space-separated single-line output.
        print(" ".join(["%s" % (part,) for part in parts]))

    sourceText = u.readFromFile(sourceCorpusFile)
    targetText = u.readFromFile(targetCorpusFile)
    if not (len(sourceText) == len(targetText)):
        mismatch = (len(sourceText), len(targetText))
        print("Source(%s) and target(%s) corpus lengths differ." % mismatch)

    emit(u.now(), "Initializing")
    self.__Initialize(sourceText, targetText)

    for completed in xrange(iterations):
        passNo = completed + 1

        whenCounts = u.now()
        emit(whenCounts, "Computing Counts for iteration", passNo)
        self.__ComputeCounts(sourceText, targetText)

        whenTValues = u.now()
        emit(whenTValues, "Computing t values for iteration", passNo)
        self.__ComputeTValues(sourceText, targetText)

        whenDone = u.now()
        emit(u.now(), "Iteration", passNo, "complete.")
        emit(u.now(), "Started count computations at %s." % whenCounts)
        emit(u.now(), "Started t value computations at %s." % whenTValues)
        emit(u.now(), "Finished at %s" % whenDone)

    return self.__tMap
def Align(self, sourceFile, targetFile):
    """Pick the best-scoring target position for every source word.

    Both files are read with `u.readFromFile`; only the first
    min(len(source), len(target)) line pairs are processed and a warning is
    printed when the counts differ.  Each sentence is whitespace-split and
    prefixed with `self.NULLTAG`, so target position 0 is the NULL tag.

    A candidate's score is the `_IBM1__tMap` entry for the word pair
    multiplied by `self.GetQValue(j, i, lk, mk)`.  For each source word
    (positions 1..mk) the candidate whose score is strictly greater than
    the best so far wins, so ties keep the earliest target position.  If no
    candidate beats the placeholder's `Score`, the placeholder is appended.

    Returns a list of `a3p1.Alignment` objects, one per source word, built
    as Alignment(score, 1-based sentence number, source word, source
    position, target word, target position).
    """
    srcSentences = u.readFromFile(sourceFile)
    tgtSentences = u.readFromFile(targetFile)
    srcTotal, tgtTotal = len(srcSentences), len(tgtSentences)
    if srcTotal != tgtTotal:
        print("Source(%s) and target(%s) corpus lengths differ."
              % (srcTotal, tgtTotal))

    # Original author's note: SentenceIndex, EnglishIndex, ForeignIndex
    results = []
    for sentenceIdx in xrange(min(srcTotal, tgtTotal)):
        srcTokens = [self.NULLTAG] + srcSentences[sentenceIdx].split()
        tgtTokens = [self.NULLTAG] + tgtSentences[sentenceIdx].split()
        # Sentence lengths without the NULL tag.
        srcLen = len(srcTokens) - 1
        tgtLen = len(tgtTokens) - 1

        for srcPos in xrange(1, srcLen + 1):
            srcWord = srcTokens[srcPos]
            best = a3p1.Alignment(0, 0, "", 0, "", 0)
            for tgtPos, tgtWord in enumerate(tgtTokens):
                translation = self._IBM1__tMap[a3p1.WordMap(srcWord, tgtWord)]
                distortion = self.GetQValue(tgtPos, srcPos, tgtLen, srcLen)
                candidate = translation * distortion
                if best.Score < candidate:
                    best = a3p1.Alignment(candidate, sentenceIdx + 1,
                                          srcWord, srcPos, tgtWord, tgtPos)
            results.append(best)

    return results
def Align(self, sourceFile, targetFile):
    """Align each source word to its highest-scoring target position.

    Sentences come from `u.readFromFile`; a warning is printed when the two
    files differ in length and only the common prefix of line pairs is
    used.  Every sentence is whitespace-split and prefixed with
    `self.NULLTAG` (target position 0).

    The score of a (source position i, target position j) pair is the
    `_IBM1__tMap` entry for the two words times the entry `(j, i)` of the
    table returned by `self.__qMap.GetQValues(lk, mk)`.  Only a strictly
    greater score replaces the current best, so ties keep the lowest j.

    Returns a list of `a3p1.Alignment` objects, one per source word.
    """
    srcSentences = u.readFromFile(sourceFile)
    tgtSentences = u.readFromFile(targetFile)
    srcTotal, tgtTotal = len(srcSentences), len(tgtSentences)
    if srcTotal != tgtTotal:
        print("Source(%s) and target(%s) corpus lengths differ."
              % (srcTotal, tgtTotal))

    # Original author's note: SentenceIndex, EnglishIndex, ForeignIndex
    found = []
    for sentenceIdx in xrange(min(srcTotal, tgtTotal)):
        srcTokens = [self.NULLTAG] + srcSentences[sentenceIdx].split()
        tgtTokens = [self.NULLTAG] + tgtSentences[sentenceIdx].split()
        srcLen = len(srcTokens) - 1
        tgtLen = len(tgtTokens) - 1

        for srcPos in xrange(1, srcLen + 1):
            winner = a3p1.Alignment(0, 0, "", 0, "", 0)
            # NOTE(review): the arguments do not depend on srcPos, but the
            # lookup is kept once per source word because GetQValues may
            # have side effects that are not visible here.
            qTable = self.__qMap.GetQValues(tgtLen, srcLen)
            srcWord = srcTokens[srcPos]
            for tgtPos, tgtWord in enumerate(tgtTokens):
                translation = self._IBM1__tMap[a3p1.WordMap(srcWord, tgtWord)]
                candidate = translation * qTable[(tgtPos, srcPos)]
                if winner.Score < candidate:
                    winner = a3p1.Alignment(candidate, sentenceIdx + 1,
                                            srcWord, srcPos, tgtWord, tgtPos)
            found.append(winner)

    return found
def Align(self, sourceFile, targetFile):
    """Choose, for every source word, the target word with the largest t value.

    Both files are read with `u.readFromFile`.  When their line counts
    differ a warning is printed and only the first min(...) line pairs are
    aligned.  Each sentence is whitespace-split and prefixed with
    `self.NULLTAG`, so target position 0 is the NULL tag; the source NULL
    tag itself (position 0) is never aligned.

    A candidate replaces the current best only when its `__tMap` value is
    strictly greater than the best `Score`, so ties keep the earliest
    target position.

    Returns a list of `Alignment` objects, one per source word, built as
    Alignment(t value, 1-based sentence number, source word, source
    position, target word, target position).
    """
    srcSentences = u.readFromFile(sourceFile)
    tgtSentences = u.readFromFile(targetFile)
    srcTotal, tgtTotal = len(srcSentences), len(tgtSentences)
    if srcTotal != tgtTotal:
        print("Source(%s) and target(%s) corpus lengths differ."
              % (srcTotal, tgtTotal))

    # Original author's note: SentenceIndex, EnglishIndex, ForeignIndex
    chosen = []
    for sentenceIdx in xrange(min(srcTotal, tgtTotal)):
        srcTokens = [self.NULLTAG] + srcSentences[sentenceIdx].split()
        tgtTokens = [self.NULLTAG] + tgtSentences[sentenceIdx].split()

        for srcPos in xrange(1, len(srcTokens)):
            srcWord = srcTokens[srcPos]
            best = Alignment(0, 0, "", 0, "", 0)
            for tgtPos, tgtWord in enumerate(tgtTokens):
                translation = self.__tMap[WordMap(srcWord, tgtWord)]
                if best.Score < translation:
                    best = Alignment(translation, sentenceIdx + 1,
                                     srcWord, srcPos, tgtWord, tgtPos)
            chosen.append(best)

    return chosen
def Train(self, sourceCorpusFile, targetCorpusFile, iterations, tFile):
    """Train t and q values on a pair of line-aligned corpus files.

    The corpora are read with `u.readFromFile`; differing line counts only
    trigger a printed warning.  When `tFile` equals the empty string the t
    values are initialized from the corpus via `_IBM1__Initialize`,
    otherwise they are loaded with `self.LoadTValues(tFile)`.  The q map is
    then initialized, and each of the `iterations` passes computes counts,
    t values and q values in that order, printing when every phase started.

    Returns the `_IBM1__tMap` attribute after the final pass.
    """

    def say(*items):
        # One line of output, items separated by single spaces.
        print(" ".join(["%s" % (item,) for item in items]))

    srcCorpus = u.readFromFile(sourceCorpusFile)
    tgtCorpus = u.readFromFile(targetCorpusFile)
    srcTotal, tgtTotal = len(srcCorpus), len(tgtCorpus)
    if srcTotal != tgtTotal:
        print("Source(%s) and target(%s) corpus lengths differ."
              % (srcTotal, tgtTotal))

    if tFile == "":
        say(u.now(), "Initializing")
        self._IBM1__Initialize(srcCorpus, tgtCorpus)
    else:
        say(u.now(), "Loading Initial T Values")
        self.LoadTValues(tFile)

    say(u.now(), "Initializing Q Values")
    self.__qMap.Initialize(srcCorpus, tgtCorpus)

    for roundNo in xrange(1, iterations + 1):
        countsStarted = u.now()
        say(countsStarted, "Computing Counts for iteration", roundNo)
        self._IBM1__ComputeCounts(srcCorpus, tgtCorpus)

        tValuesStarted = u.now()
        say(tValuesStarted, "Computing t values for iteration", roundNo)
        self._IBM1__ComputeTValues(srcCorpus, tgtCorpus)

        qValuesStarted = u.now()
        say(qValuesStarted, "Computing q values for iteration", roundNo)
        self.__ComputeQValues(srcCorpus, tgtCorpus)

        # Captured before the summary lines so it marks the end of the work.
        finished = u.now()
        say(u.now(), "Iteration", roundNo, "complete.")
        say(u.now(), "Started count computations at %s." % countsStarted)
        say(u.now(), "Started t value computations at %s." % tValuesStarted)
        say(u.now(), "Started q value computations at %s." % qValuesStarted)
        say(u.now(), "Iteration", roundNo, "finished at %s" % finished)

    return self._IBM1__tMap
def Align(self, sourceFile, targetFile):
    """Return the best t-value alignment for every source word.

    Lines are loaded with `u.readFromFile`; a mismatch in line counts is
    reported and the shorter count bounds the work.  Sentences are
    whitespace-split and prefixed with `self.NULLTAG`.  Source positions
    start at 1 (the source NULL tag is skipped) while target positions
    start at 0 (the target NULL tag is a valid choice).

    The first target position holding the strictly largest `__tMap` value
    wins.  One `Alignment` is appended per source word.
    """
    sourceText = u.readFromFile(sourceFile)
    targetText = u.readFromFile(targetFile)
    if not (len(sourceText) == len(targetText)):
        mismatch = (len(sourceText), len(targetText))
        print("Source(%s) and target(%s) corpus lengths differ." % mismatch)
    pairCount = min(len(sourceText), len(targetText))

    # Original author's note: SentenceIndex, EnglishIndex, ForeignIndex
    output = []
    sentenceNo = 0
    while sentenceNo < pairCount:
        foreign = sourceText[sentenceNo].split()
        foreign.insert(0, self.NULLTAG)
        english = targetText[sentenceNo].split()
        english.insert(0, self.NULLTAG)
        sentenceNo += 1  # from here on this is the 1-based sentence number

        for fPos in xrange(1, len(foreign)):
            fWord = foreign[fPos]
            leader = Alignment(0, 0, "", 0, "", 0)
            for ePos in xrange(len(english)):
                eWord = english[ePos]
                weight = self.__tMap[WordMap(fWord, eWord)]
                if leader.Score < weight:
                    leader = Alignment(weight, sentenceNo, fWord, fPos,
                                       eWord, ePos)
            output.append(leader)

    return output
def LoadAlignments(alignmentFile, sourceFile, targetFile):
    """Load word alignments from a file, grouped per sentence.

    Every well-formed line of `alignmentFile` carries three whitespace-
    separated integers: the 1-based sentence number `k`, then `j`, then
    `i`.  For each sentence number seen, one `SentenceAlignment` is created
    from that sentence's source and target words (each list prefixed with
    `SentenceAlignment.NULLTAG`) and `Align(i, j)` is called on it for
    every line naming that sentence.

    Lines that do not have exactly three fields are reported and skipped.
    Previously the report was printed but the line was parsed anyway, so a
    blank or truncated line raised IndexError right after the warning.

    Returns a dict mapping sentence number to its `SentenceAlignment`.
    Raises ValueError for non-integer fields and IndexError when a sentence
    number points past the end of the sentence files.
    """
    alignmentRows = u.readFromFile(alignmentFile)
    sourceLines = u.readFromFile(sourceFile)
    targetLines = u.readFromFile(targetFile)

    alignments = dict()
    for lineIndex, line in enumerate(alignmentRows, 1):
        fields = line.split()
        if len(fields) != 3:
            print("Invalid line %s: %s" % (lineIndex, line))
            continue

        k = int(fields[0])
        j = int(fields[1])
        i = int(fields[2])

        if k not in alignments:
            # Tokenize the sentence pair only the first time it is needed.
            sourceWords = [SentenceAlignment.NULLTAG] + sourceLines[k - 1].split()
            targetWords = [SentenceAlignment.NULLTAG] + targetLines[k - 1].split()
            alignments[k] = SentenceAlignment(sourceWords, targetWords)
        alignments[k].Align(i, j)

    return alignments
def LoadAlignments(alignmentFile, sourceFile, targetFile):
    """Read an alignment listing into one `SentenceAlignment` per sentence.

    A valid line of `alignmentFile` has exactly three whitespace-separated
    integer fields: the 1-based sentence number `k`, then `j`, then `i`.
    The sentence's words are taken from line `k` of `sourceFile` and
    `targetFile`, whitespace-split and prefixed with
    `SentenceAlignment.NULLTAG`; `Align(i, j)` is then called on the
    sentence's `SentenceAlignment`.

    A line with any other number of fields is reported and skipped.
    Before this change the message was printed but parsing continued, which
    raised IndexError on blank or short lines.

    Returns a dict keyed by sentence number.  Non-integer fields raise
    ValueError; a sentence number beyond the sentence files raises
    IndexError.
    """
    alignmentRows = u.readFromFile(alignmentFile)
    sourceLines = u.readFromFile(sourceFile)
    targetLines = u.readFromFile(targetFile)

    alignments = dict()
    for lineIndex, line in enumerate(alignmentRows, 1):
        fields = line.split()
        if len(fields) != 3:
            print("Invalid line %s: %s" % (lineIndex, line))
            continue

        k, j, i = int(fields[0]), int(fields[1]), int(fields[2])

        if k in alignments:
            alignment = alignments[k]
        else:
            # First mention of this sentence: tokenize it once.
            sourceWords = sourceLines[k - 1].split()
            sourceWords.insert(0, SentenceAlignment.NULLTAG)
            targetWords = targetLines[k - 1].split()
            targetWords.insert(0, SentenceAlignment.NULLTAG)
            alignment = SentenceAlignment(sourceWords, targetWords)
            alignments[k] = alignment
        alignment.Align(i, j)

    return alignments
def LoadTValues(self, tFile):
    """Populate `__tMap` from a file of t values.

    Each valid line has three whitespace-separated fields: two words and a
    float.  The words form a `WordMap` key (first word, second word) and
    the float is stored under it, overwriting any existing entry.

    Lines with a different number of fields are reported and skipped.
    Previously the warning was printed but the line was parsed anyway, so
    a blank or short line raised IndexError.

    Returns the `__tMap` attribute.  Raises ValueError when the third field
    is not a valid float.
    """
    for lineNo, line in enumerate(u.readFromFile(tFile), 1):
        fields = line.split()
        if len(fields) != 3:
            print("Invalid line %s: %s" % (lineNo, line))
            continue
        key = WordMap(fields[0], fields[1])
        self.__tMap[key] = float(fields[2])
    return self.__tMap
def LoadQValues(self, qFile):
    """Populate `__qMap` from a file of q values.

    Each valid line has five whitespace-separated fields: four integers
    followed by a float.  The integers are passed, in file order, to
    `AlignmentByPosition` to form the key; the float is stored under it.

    Lines with a different number of fields are reported and skipped.
    Previously the warning was printed but the line was parsed anyway, so
    a blank or short line raised IndexError.

    Returns the `__qMap` attribute.  Raises ValueError when a field cannot
    be converted to the expected numeric type.
    """
    for lineNo, line in enumerate(u.readFromFile(qFile), 1):
        fields = line.split()
        if len(fields) != 5:
            print("Invalid line %s: %s" % (lineNo, line))
            continue
        key = AlignmentByPosition(int(fields[0]), int(fields[1]),
                                  int(fields[2]), int(fields[3]))
        self.__qMap[key] = float(fields[4])
    return self.__qMap
def LoadTValues(self, tFile):
    """Read t values from `tFile` into `__tMap`.

    A valid line consists of exactly three whitespace-separated fields:
    the two words of a `WordMap` key followed by the float to store.
    Existing entries with the same key are overwritten.

    A line with any other field count is reported and skipped.  Before
    this change the report was followed by an attempt to parse the line,
    which raised IndexError on blank or truncated lines.

    Returns the `__tMap` attribute.  Raises ValueError for a non-numeric
    third field.
    """
    rows = u.readFromFile(tFile)
    for lineNo, row in enumerate(rows, 1):
        parts = row.split()
        if len(parts) != 3:
            print("Invalid line %s: %s" % (lineNo, row))
            continue
        firstWord, secondWord, rawValue = parts
        self.__tMap[WordMap(firstWord, secondWord)] = float(rawValue)
    return self.__tMap
def LoadQValues(self, qFile):
    """Read q values from `qFile` into `__qMap`.

    A valid line consists of exactly five whitespace-separated fields:
    four integers, handed in file order to `AlignmentByPosition` to build
    the key, and the float stored under that key.

    A line with any other field count is reported and skipped.  Before
    this change the report was followed by an attempt to parse the line,
    which raised IndexError on blank or truncated lines.

    Returns the `__qMap` attribute.  Raises ValueError when a field is not
    a valid int/float.
    """
    rows = u.readFromFile(qFile)
    for lineNo, row in enumerate(rows, 1):
        parts = row.split()
        if len(parts) != 5:
            print("Invalid line %s: %s" % (lineNo, row))
            continue
        positions = [int(text) for text in parts[:4]]
        key = AlignmentByPosition(positions[0], positions[1],
                                  positions[2], positions[3])
        self.__qMap[key] = float(parts[4])
    return self.__qMap
def Load(self, qFile):
    """Load q values from `qFile`, storing each through `SetQValue`.

    A valid line has five whitespace-separated fields: four integers and a
    float.  The integers are passed, in file order, to
    `AlignmentByPosition`; the resulting object's `j`, `iCondition`,
    `lCondition` and `mCondition` attributes plus the float are then handed
    to `self.SetQValue`.

    Lines with a different number of fields are reported and skipped.
    Previously the warning was printed but the line was parsed anyway, so
    a blank or short line raised IndexError.

    Returns the `__qMap` attribute.  Raises ValueError when a field cannot
    be converted to the expected numeric type.
    """
    for lineNo, line in enumerate(u.readFromFile(qFile), 1):
        fields = line.split()
        if len(fields) != 5:
            print("Invalid line %s: %s" % (lineNo, line))
            continue
        key = AlignmentByPosition(int(fields[0]), int(fields[1]),
                                  int(fields[2]), int(fields[3]))
        val = float(fields[4])
        self.SetQValue(key.j, key.iCondition, key.lCondition,
                       key.mCondition, val)
    return self.__qMap
def LoadQValues2(self, qFile):
    """Read q values from `qFile` into a new dict and return it.

    A valid line has five whitespace-separated fields.  The first four are
    passed, in file order and as the raw strings read from the file, to
    `AlignmentByPosition` to build the key; the fifth is converted to float
    and stored under that key.  No instance state is read or modified.

    Lines with a different number of fields are reported and skipped.
    Previously the warning was printed but the line was parsed anyway, so
    a blank or short line raised IndexError.

    Raises ValueError when the fifth field is not a valid float.
    """
    qMap = dict()
    for lineNo, line in enumerate(u.readFromFile(qFile), 1):
        fields = line.split()
        if len(fields) != 5:
            print("Invalid line %s: %s" % (lineNo, line))
            continue
        # NOTE(review): the key components stay strings here, whereas
        # LoadQValues converts them with int() -- confirm this is intended.
        key = AlignmentByPosition(fields[0], fields[1], fields[2], fields[3])
        qMap[key] = float(fields[4])
    return qMap
def LoadQValues2(self, qFile):
    """Build and return a fresh dict of q values read from `qFile`.

    Every valid line carries exactly five whitespace-separated fields: the
    four `AlignmentByPosition` constructor arguments (passed through as
    the strings found in the file) and the float value to store.  The
    instance itself is not touched.

    A line with any other field count is reported and skipped.  Before
    this change the report was followed by an attempt to parse the line,
    which raised IndexError on blank or truncated lines.

    Raises ValueError for a non-numeric fifth field.
    """
    rows = u.readFromFile(qFile)
    loaded = dict()
    for lineNo, row in enumerate(rows, 1):
        parts = row.split()
        if len(parts) != 5:
            print("Invalid line %s: %s" % (lineNo, row))
            continue
        # NOTE(review): unlike LoadQValues, the position fields are not
        # converted with int() -- confirm string keys are intended.
        first, second, third, fourth, rawValue = parts
        loaded[AlignmentByPosition(first, second, third, fourth)] = float(rawValue)
    return loaded