def ComputeParseTree(self, sentence):
    """Fill the dynamic-programming tables for *sentence* and return its tree.

    The sentence is split on whitespace into ``n`` tokens.  For every span
    width ``1 .. n-1`` and every 1-based start position that keeps the span
    inside the sentence, ``FindMaxPi`` is called once per symbol in
    ``self.N``.  Afterwards the tree is obtained from ``self.__BuildTree``,
    printed together with a timestamp, and returned.

    Progress messages are written to standard output along the way.
    """
    print("%s %s" % (u.now(), "Initializing tables"))
    self.__PiTable.Initialize(sentence, self)
    print("%s %s" % (u.now(), "Tables initialized"))

    words = sentence.split()
    wordCount = len(words)

    for width in xrange(1, wordCount):
        lastBegin = wordCount - width
        for begin in xrange(1, lastBegin + 1):
            finish = begin + width
            # NOTE(review): `top` is tracked but never read after the inner
            # loop; presumably FindMaxPi records its own results in the
            # tables as a side effect -- confirm before removing.
            top = MaxProbability(0, None, None)
            for symbol in self.N:
                candidate = self.FindMaxPi(begin, finish, symbol, words)
                # `<=` means a later symbol wins over an earlier one on ties.
                if top.P <= candidate.P:
                    top = candidate

    print("%s %s" % (u.now(), "Tables computed"))
    tree = self.__BuildTree(words)
    print("%s %s" % (u.now(), tree))
    return tree
def Preprocess(countsFile="", trainingFile="", adjustedTrainingFile=""):
    """Rewrite a training file, tagging rare words with ``_RARE_``.

    Does nothing (returns ``None``) if any of the three file names is the
    empty string.  Otherwise:

    * the rare-word collection is taken from element 4 of
      ``readCounts(countsFile)``;
    * every line of ``trainingFile`` is decoded as JSON, passed through
      ``replaceLeaves`` together with the rare words and the tag, and
      re-encoded as JSON;
    * the resulting lines are joined with newlines and written to
      ``adjustedTrainingFile`` via ``u.saveToFile``.

    A progress message is printed after every 100 processed lines.
    """
    if "" in (countsFile, trainingFile, adjustedTrainingFile):
        return

    RARETAG = '_RARE_'
    print("%s %s" % ("Preprocess started at", u.now()))

    rareWords = readCounts(countsFile)[4]
    rewritten = []
    nextReport = 100
    for done, rawLine in enumerate(u.readFromFile(trainingFile), 1):
        adjusted = replaceLeaves(json.loads(rawLine), rareWords, RARETAG)
        rewritten.append(json.dumps(adjusted))
        if nextReport <= done:
            print("%s %s %s" % (u.now(), "Processed", done))
            nextReport += 100

    u.saveToFile(adjustedTrainingFile, '\n'.join(rewritten))
    print("%s %s" % ("Preprocess ended at", u.now()))
def ParseFile(fileToParse="", adjustedCountsFile="", resultFile=""):
    """Parse every sentence of *fileToParse* and save the trees as JSON lines.

    Does nothing (returns ``None``) if any of the three file names is the
    empty string.  Otherwise the counts are loaded with
    ``a2p1.readCounts(adjustedCountsFile)``; elements 0-3 of the result
    (non-terminal, binary, unary and word counts) are used to build a
    ``VertMarkovGrammar``.  Each line read from ``fileToParse`` is handed to
    ``grammar.ComputeParseTree``; the returned tree is JSON-encoded, and all
    encoded trees are joined with newlines and written to ``resultFile``.

    Progress messages are printed to standard output.
    """
    if fileToParse == "" or adjustedCountsFile == "" or resultFile == "":
        return

    counts = a2p1.readCounts(adjustedCountsFile)
    nonTerminalCounts = counts[0]
    binaryCounts = counts[1]
    unaryCounts = counts[2]
    wordCounts = counts[3]

    print("%s %s" % (u.now(), "Building Grammar"))
    grammar = VertMarkovGrammar(nonTerminalCounts, binaryCounts,
                                unaryCounts, wordCounts)
    print("%s %s" % (u.now(), "Grammar created"))

    lines = u.readFromFile(fileToParse)
    total = len(lines)  # invariant, so computed once outside the loop
    outputLines = []
    for progress, sentence in enumerate(lines, 1):
        # Strip only the line terminator for the log message.  The previous
        # `sentence[:-1]` cut off a real character whenever a line (typically
        # the last one in the file) had no trailing newline.
        print("%s Parsing (%s/%s): %s" % (
            u.now(), progress, total, sentence.rstrip("\r\n")))
        tree = grammar.ComputeParseTree(sentence)
        outputLines.append(json.dumps(tree))

    outputText = '\n'.join(outputLines)
    print("%s %s %s" % (u.now(), "Saving output to", resultFile))
    u.saveToFile(resultFile, outputText)
    print("%s %s" % (u.now(), "Done"))
def ParseFile(fileToParse="", adjustedCountsFile="", resultFile=""):
    """Run the grammar over *fileToParse* and write one JSON tree per line.

    Returns ``None`` immediately when any of the three file names is the
    empty string.  Otherwise:

    1. ``a2p1.readCounts(adjustedCountsFile)`` supplies the counts; its
       elements 0-3 (non-terminal, binary, unary, word) are passed to
       ``VertMarkovGrammar``.
    2. Every line from ``u.readFromFile(fileToParse)`` is parsed with
       ``grammar.ComputeParseTree`` and the tree is JSON-encoded.
    3. The encoded trees are joined with newlines and stored in
       ``resultFile`` through ``u.saveToFile``.

    Timestamped progress messages are printed to standard output.
    """
    if fileToParse == "" or adjustedCountsFile == "" or resultFile == "":
        return

    counts = a2p1.readCounts(adjustedCountsFile)
    nonTerminalCounts = counts[0]
    binaryCounts = counts[1]
    unaryCounts = counts[2]
    wordCounts = counts[3]

    print("%s %s" % (u.now(), "Building Grammar"))
    grammar = VertMarkovGrammar(nonTerminalCounts, binaryCounts,
                                unaryCounts, wordCounts)
    print("%s %s" % (u.now(), "Grammar created"))

    lines = u.readFromFile(fileToParse)
    total = len(lines)  # loop-invariant; no need to recompute per sentence
    outputLines = []
    for progress, sentence in enumerate(lines, 1):
        # Only the line terminator is removed for display.  Slicing with
        # `[:-1]` dropped a genuine character from any line that did not end
        # in a newline (e.g. the final line of the file).
        shown = sentence.rstrip("\r\n")
        print("%s Parsing (%s/%s): %s" % (u.now(), progress, total, shown))
        tree = grammar.ComputeParseTree(sentence)
        outputLines.append(json.dumps(tree))

    outputText = '\n'.join(outputLines)
    print("%s %s %s" % (u.now(), "Saving output to", resultFile))
    u.saveToFile(resultFile, outputText)
    print("%s %s" % (u.now(), "Done"))
# Disabled preprocessing step, kept for reference:
#   a2p1.Preprocess(countsFile, trainingFile, adjustedTrainingFile)
#
# Disabled alternative run configurations
# (targetFile / adjustedCountsFile / resultFile):
#   "parse_dev.dat" / "cfg.vert.adjusted.counts" / "parse_dev.vert.out"
#   "test.dat"      / "cfg.vert.adjusted.counts" / "test.vert.out"
#   "shorttest.dat" / "cfg.vert.adjusted.counts" / "shorttest.vert.out"
#   "shorttest.dat" / "parse_train.counts.out"   / "shorttest.out"
#
# Disabled alternative entry point:
#   a2p2.ParseFile(targetFile, adjustedCountsFile, resultFile)

# Active configuration.
targetFile, adjustedCountsFile, resultFile = (
    "parse_test.dat",
    "cfg.vert.adjusted.counts",
    "parse_test.p3.out",
)

# `start` and `end` bracket the ParseFile call with u.now() readings.
start = u.now()
ParseFile(targetFile, adjustedCountsFile, resultFile)
end = u.now()