def Train(self, sourceCorpusFile, targetCorpusFile, iterations):
    """Train on a parallel corpus and return the t-value map.

    Both corpus files are read with ``u.readFromFile``.  If they contain a
    different number of lines a warning is printed and training continues
    regardless.  After ``self.__Initialize`` runs once, each of the
    ``iterations`` rounds calls ``self.__ComputeCounts`` followed by
    ``self.__ComputeTValues`` and prints timestamps for each phase.

    Args:
        sourceCorpusFile: path handed to ``u.readFromFile`` for the source side.
        targetCorpusFile: path handed to ``u.readFromFile`` for the target side.
        iterations: number of training rounds to run.

    Returns:
        ``self.__tMap`` as it stands after the last round.
    """

    def _log(*fields):
        # One space-joined string per call mirrors a space-separated print
        # of the same fields.
        print(" ".join(["%s" % (field,) for field in fields]))

    src = u.readFromFile(sourceCorpusFile)
    tgt = u.readFromFile(targetCorpusFile)

    # A mismatch is only reported; it does not abort training.
    if len(src) != len(tgt):
        print("Source(%s) and target(%s) corpus lengths differ." % (
            len(src), len(tgt)))

    _log(u.now(), "Initializing")
    self.__Initialize(src, tgt)

    for round_no in range(1, iterations + 1):
        counts_started = u.now()
        _log(counts_started, "Computing Counts for iteration", round_no)
        self.__ComputeCounts(src, tgt)

        t_started = u.now()
        _log(t_started, "Computing t values for iteration", round_no)
        self.__ComputeTValues(src, tgt)

        finished = u.now()
        # Each summary line carries its own fresh timestamp.
        _log(u.now(), "Iteration", round_no, "complete.")
        _log(u.now(), "Started count computations at %s." % counts_started)
        _log(u.now(), "Started t value computations at %s." % t_started)
        _log(u.now(), "Finished at %s" % finished)

    return self.__tMap
def Train(self, sourceCorpusFile, targetCorpusFile, iterations, tFile=""):
    """Train t and q values on a parallel corpus and return the t-value map.

    Both corpus files are read with ``u.readFromFile``.  If they contain a
    different number of lines a warning is printed and training continues
    regardless.

    Starting t values come from ``self._IBM1__Initialize`` when no ``tFile``
    is given, otherwise from ``self.LoadTValues(tFile)``.  Each round then
    calls ``self._IBM1__ComputeCounts``, ``self._IBM1__ComputeTValues`` and
    ``self.__ComputeQValues`` in that order, printing timestamps per phase.

    Args:
        sourceCorpusFile: path handed to ``u.readFromFile`` for the source side.
        targetCorpusFile: path handed to ``u.readFromFile`` for the target side.
        iterations: number of training rounds to run.
        tFile: path of previously saved t values to start from.  Optional;
            an empty string or ``None`` means "initialize from the corpus",
            so the three-argument call shape also works here.

    Returns:
        ``self._IBM1__tMap`` as it stands after the last round.
    """

    def _log(*fields):
        # One space-joined string per call mirrors a space-separated print
        # of the same fields.
        print(" ".join(["%s" % (field,) for field in fields]))

    sourceLines = u.readFromFile(sourceCorpusFile)
    targetLines = u.readFromFile(targetCorpusFile)

    # A mismatch is only reported; it does not abort training.
    if len(sourceLines) != len(targetLines):
        print("Source(%s) and target(%s) corpus lengths differ." % (
            len(sourceLines), len(targetLines)))

    # Truthiness test so that None is treated like "" instead of being
    # forwarded to LoadTValues.
    if not tFile:
        _log(u.now(), "Initializing")
        self._IBM1__Initialize(sourceLines, targetLines)
    else:
        _log(u.now(), "Loading Initial T Values")
        self.LoadTValues(tFile)

    for s in range(0, iterations):
        # Diagnostic trace of GetQValue(x, 2, 8, 8) for x in 0..8, printed
        # before the round runs.
        # NOTE(review): the arguments 2, 8, 8 are hard-coded - confirm this
        # trace is still wanted.
        for x in range(0, 9):
            _log("Iteration", s, ":", x, "2 8 8", self.GetQValue(x, 2, 8, 8))

        start1 = u.now()
        _log(start1, "Computing Counts for iteration", s + 1)
        self._IBM1__ComputeCounts(sourceLines, targetLines)

        start2 = u.now()
        _log(start2, "Computing t values for iteration", s + 1)
        self._IBM1__ComputeTValues(sourceLines, targetLines)

        start3 = u.now()
        _log(start3, "Computing q values for iteration", s + 1)
        self.__ComputeQValues(sourceLines, targetLines)

        end = u.now()
        # Each summary line carries its own fresh timestamp.
        _log(u.now(), "Iteration", s + 1, "complete.")
        _log(u.now(), "Started count computations at %s." % start1)
        _log(u.now(), "Started t value computations at %s." % start2)
        _log(u.now(), "Started q value computations at %s." % start3)
        _log(u.now(), "Iteration", s + 1, "finished at %s" % end)

    return self._IBM1__tMap
# Driver: train an IBM1 model for 5 iterations, save its t values, then
# align the configured sentence files and save the alignments.
tFile = "tValues.txt"
# sourceAlignmentFile = "test.es"
# targetAlignmentFile = "test.en"
# aFile = "alignment_test.p1.out"

model = IBM1()
tMap = model.Train(sourceCorpusFile, targetCorpusFile, 5)
model.SaveTValues(tFile)

# Disabled: load t values from tFile.
# print u.now(), "Loading t values"
# tMap = model.LoadTValues(tFile)

print("%s %s" % (u.now(), "Aligning words"))
alignments = model.Align(sourceAlignmentFile, targetAlignmentFile)

print("%s %s" % (u.now(), "Saving alignments"))
SaveAlignments(aFile, alignments)

print("%s %s" % (u.now(), "Found %s alignments" % len(alignments)))
print("%s %s" % (u.now(), "Done"))

# Disabled: dump the t values conditioned on one word.
# for key in tMap.keys():
#     if key.eCondition == "cyprus":
#         print key.f, tMap[key]

print("%s %s" % (u.now(), "Found %s possible alignment pairings" % len(tMap)))
#for mapCount in tMap.Items():
def Train(self, sourceCorpusFile, targetCorpusFile, iterations):
    """Train on a parallel corpus and return the t-value map.

    Both corpus files are read with ``u.readFromFile``.  If they contain a
    different number of lines a warning is printed and training continues
    regardless.  After ``self.__Initialize`` runs once, each of the
    ``iterations`` rounds calls ``self.__ComputeCounts`` followed by
    ``self.__ComputeTValues`` and prints timestamps for each phase.

    Args:
        sourceCorpusFile: path handed to ``u.readFromFile`` for the source side.
        targetCorpusFile: path handed to ``u.readFromFile`` for the target side.
        iterations: number of training rounds to run.

    Returns:
        ``self.__tMap`` as it stands after the last round.
    """

    def _log(*fields):
        # One space-joined string per call mirrors a space-separated print
        # of the same fields.
        print(" ".join(["%s" % (field,) for field in fields]))

    src = u.readFromFile(sourceCorpusFile)
    tgt = u.readFromFile(targetCorpusFile)

    # A mismatch is only reported; it does not abort training.
    if len(src) != len(tgt):
        print("Source(%s) and target(%s) corpus lengths differ." % (len(src), len(tgt)))

    _log(u.now(), "Initializing")
    self.__Initialize(src, tgt)

    for round_no in range(1, iterations + 1):
        counts_started = u.now()
        _log(counts_started, "Computing Counts for iteration", round_no)
        self.__ComputeCounts(src, tgt)

        t_started = u.now()
        _log(t_started, "Computing t values for iteration", round_no)
        self.__ComputeTValues(src, tgt)

        finished = u.now()
        # Each summary line carries its own fresh timestamp.
        _log(u.now(), "Iteration", round_no, "complete.")
        _log(u.now(), "Started count computations at %s." % counts_started)
        _log(u.now(), "Started t value computations at %s." % t_started)
        _log(u.now(), "Finished at %s" % finished)

    return self.__tMap
# Driver: load saved t and q values into an IBM2 model, align the test
# sentence files and save the alignments.
sourceAlignmentFile = "test.es"
targetAlignmentFile = "test.en"
aFile = "alignment_test.p2.out"

model = IBM2()

# Disabled: train, then save t and q values.
# print u.now(), "Training start"
# tMap = model.Train(sourceCorpusFile, targetCorpusFile, 5, tFile)
#
# print u.now(), "Saving t values"
# model.SaveTValues(newTFile)
#
# print u.now(), "Saving q values"
# model.SaveQValues(qFile)

print("%s %s" % (u.now(), "Loading t values"))
tMap = model.LoadTValues(newTFile)

print("%s %s" % (u.now(), "Loading q values"))
#qMap0 = model.QMap()
#qMap1 = model.LoadQValues2(qFile)
qMap = model.LoadQValues(qFile)

print("%s %s" % (u.now(), "Aligning words"))
alignments = model.Align(sourceAlignmentFile, targetAlignmentFile)

print("%s %s %s" % (u.now(), "Saving alignments to", aFile))
a3p1.SaveAlignments(aFile, alignments)
# Driver: train an IBM1 model for 5 iterations, save its t values, then
# align the configured sentence files and save the alignments.
tFile = "tValues.txt"
# sourceAlignmentFile = "test.es"
# targetAlignmentFile = "test.en"
# aFile = "alignment_test.p1.out"

model = IBM1()
tMap = model.Train(sourceCorpusFile, targetCorpusFile,5)
model.SaveTValues(tFile)

# Disabled: load t values from tFile.
# print u.now(), "Loading t values"
# tMap = model.LoadTValues(tFile)

print("%s %s" % (u.now(), "Aligning words"))
alignments = model.Align(sourceAlignmentFile, targetAlignmentFile)

print("%s %s" % (u.now(), "Saving alignments"))
SaveAlignments(aFile, alignments)

print("%s %s" % (u.now(), "Found %s alignments" % len(alignments)))
print("%s %s" % (u.now(), "Done"))

# Disabled: dump the t values conditioned on one word.
# for key in tMap.keys():
#     if key.eCondition == "cyprus":
#         print key.f, tMap[key]

print("%s %s" % (u.now(), "Found %s possible alignment pairings" % len(tMap)))
def Train(self, sourceCorpusFile, targetCorpusFile, iterations, tFile=""):
    """Train t and q values on a parallel corpus and return the t-value map.

    Both corpus files are read with ``u.readFromFile``.  If they contain a
    different number of lines a warning is printed and training continues
    regardless.

    Starting t values come from ``self._IBM1__Initialize`` when no ``tFile``
    is given, otherwise from ``self.LoadTValues(tFile)``.  Each round then
    calls ``self._IBM1__ComputeCounts``, ``self._IBM1__ComputeTValues`` and
    ``self.__ComputeQValues`` in that order, printing timestamps per phase.

    Args:
        sourceCorpusFile: path handed to ``u.readFromFile`` for the source side.
        targetCorpusFile: path handed to ``u.readFromFile`` for the target side.
        iterations: number of training rounds to run.
        tFile: path of previously saved t values to start from.  Optional;
            an empty string or ``None`` means "initialize from the corpus",
            so the three-argument call shape also works here.

    Returns:
        ``self._IBM1__tMap`` as it stands after the last round.
    """

    def _log(*fields):
        # One space-joined string per call mirrors a space-separated print
        # of the same fields.
        print(" ".join(["%s" % (field,) for field in fields]))

    sourceLines = u.readFromFile(sourceCorpusFile)
    targetLines = u.readFromFile(targetCorpusFile)

    # A mismatch is only reported; it does not abort training.
    if len(sourceLines) != len(targetLines):
        print("Source(%s) and target(%s) corpus lengths differ." % (
            len(sourceLines), len(targetLines)))

    # Truthiness test so that None is treated like "" instead of being
    # forwarded to LoadTValues.
    if not tFile:
        _log(u.now(), "Initializing")
        self._IBM1__Initialize(sourceLines, targetLines)
    else:
        _log(u.now(), "Loading Initial T Values")
        self.LoadTValues(tFile)

    for s in range(0, iterations):
        # Diagnostic trace of GetQValue(x, 2, 8, 8) for x in 0..8, printed
        # before the round runs.
        # NOTE(review): the arguments 2, 8, 8 are hard-coded - confirm this
        # trace is still wanted.
        for x in range(0, 9):
            _log("Iteration", s, ":", x, "2 8 8", self.GetQValue(
                x, 2, 8, 8))

        start1 = u.now()
        _log(start1, "Computing Counts for iteration", s + 1)
        self._IBM1__ComputeCounts(sourceLines, targetLines)

        start2 = u.now()
        _log(start2, "Computing t values for iteration", s + 1)
        self._IBM1__ComputeTValues(sourceLines, targetLines)

        start3 = u.now()
        _log(start3, "Computing q values for iteration", s + 1)
        self.__ComputeQValues(sourceLines, targetLines)

        end = u.now()
        # Each summary line carries its own fresh timestamp.
        _log(u.now(), "Iteration", s + 1, "complete.")
        _log(u.now(), "Started count computations at %s." % start1)
        _log(u.now(), "Started t value computations at %s." % start2)
        _log(u.now(), "Started q value computations at %s." % start3)
        _log(u.now(), "Iteration", s + 1, "finished at %s" % end)

    return self._IBM1__tMap
# Driver: load saved t and q values into an IBM2 model, align the test
# sentence files and save the alignments.
sourceAlignmentFile = "test.es"
targetAlignmentFile = "test.en"
aFile = "alignment_test.p2.out"

model = IBM2()

# Disabled: train, then save t and q values.
# print u.now(), "Training start"
# tMap = model.Train(sourceCorpusFile, targetCorpusFile, 5, tFile)
#
# print u.now(), "Saving t values"
# model.SaveTValues(newTFile)
#
# print u.now(), "Saving q values"
# model.SaveQValues(qFile)

print("%s %s" % (u.now(), "Loading t values"))
tMap = model.LoadTValues(newTFile)

print("%s %s" % (u.now(), "Loading q values"))
#qMap0 = model.QMap()
#qMap1 = model.LoadQValues2(qFile)
qMap = model.LoadQValues(qFile)

print("%s %s" % (u.now(), "Aligning words"))
alignments = model.Align(sourceAlignmentFile, targetAlignmentFile)

print("%s %s %s" % (u.now(), "Saving alignments to", aFile))
a3p1.SaveAlignments(aFile, alignments)

# print u.now(), "Found %s alignments" % len(alignments)
print("%s %s" % (u.now(), "Done"))
# Disabled: build the alignments from loaded models and save them.
# esTargetModel = a3p2_opt.IBM2A()
# esTargetModel.LoadTValues(enSourceM2TFile)
# esTargetModel.LoadQValues(enSourceM2QFile)
#
# print u.now(), "Getting first alignments"
# alignments_enTarget = enTargetModel.Align(esSentencesFile, enSentencesFile)
# alignments_esTarget = esTargetModel.Align(enSentencesFile, esSentencesFile)
#
# a3p1.SaveAlignments(enTargetAlignmentsFile, alignments_enTarget)
# a3p1.SaveAlignments(esTargetAlignmentsFile, alignments_esTarget)

###########################################################

# Load the saved alignments for both directions.
print("%s %s" % (u.now(), "Loading..."))
# alignments_enTarget = LoadAlignments(esSourceAlignmentsFile, esSentencesFile, enSentencesFile)
# alignments_esTarget = LoadAlignments(enSourceAlignmentsFile, enSentencesFile, esSentencesFile)
alignments_enTarget = LoadAlignments(
    enTargetAlignmentsFile, esSentencesFile, enSentencesFile)
alignments_esTarget = LoadAlignments(
    esTargetAlignmentsFile, enSentencesFile, esSentencesFile)

# A count mismatch between the two directions is only reported.
if len(alignments_enTarget) != len(alignments_esTarget):
    print("Sentence counts do not match: %s for p(f|e) model, %s for p(e|f) model" % (
        len(alignments_enTarget), len(alignments_esTarget)))

# NOTE(review): placed after the `if` rather than inside it - confirm
# against the original layout.
sentenceCount = len(alignments_esTarget)
fullAlignments = []
# Disabled: build the alignments from loaded models and save them.
#
# print u.now(), "Load Model p(e|f)"
# esTargetModel = a3p2_opt.IBM2A()
# esTargetModel.LoadTValues(enSourceM2TFile)
# esTargetModel.LoadQValues(enSourceM2QFile)
#
# print u.now(), "Getting first alignments"
# alignments_enTarget = enTargetModel.Align(esSentencesFile, enSentencesFile)
# alignments_esTarget = esTargetModel.Align(enSentencesFile, esSentencesFile)
#
# a3p1.SaveAlignments(enTargetAlignmentsFile, alignments_enTarget)
# a3p1.SaveAlignments(esTargetAlignmentsFile, alignments_esTarget)

###########################################################

# Load the saved alignments for both directions.
print("%s %s" % (u.now(), "Loading..."))
# alignments_enTarget = LoadAlignments(esSourceAlignmentsFile, esSentencesFile, enSentencesFile)
# alignments_esTarget = LoadAlignments(enSourceAlignmentsFile, enSentencesFile, esSentencesFile)
alignments_enTarget = LoadAlignments(
    enTargetAlignmentsFile, esSentencesFile, enSentencesFile)
alignments_esTarget = LoadAlignments(
    esTargetAlignmentsFile, enSentencesFile, esSentencesFile)

# A count mismatch between the two directions is only reported.
if len(alignments_enTarget) != len(alignments_esTarget):
    print("Sentence counts do not match: %s for p(f|e) model, %s for p(e|f) model" % (
        len(alignments_enTarget), len(alignments_esTarget)))

# NOTE(review): placed after the `if` rather than inside it - confirm
# against the original layout.
sentenceCount = len(alignments_esTarget)