if __name__ == "__main__": (out, trees, validation) = sys.argv[1:] print "Output:", out, "Training:", trees, "Validation:", validation grammar = HierGrammar(out, mode='w') binarizeTo = 0 gstats = GrammarStats() vstats = ValidationEvents() for ct,line in enumerate(file(trees)): if ct % 100 == 0: print "read trees", ct tree = markovBinarizeTree(treeToTuple(line.strip()), to=0) # print treeToStr(tree) gstats.record(tree) for ct,line in enumerate(file(validation)): if ct % 100 == 0: print "read validation trees", ct tree = markovBinarizeTree(treeToTuple(line.strip()), to=0) vstats.record(tree) gstats.normalize() gstats.learnLambdas(vstats) gstats.addToGrammar(grammar, 0) grammar.writeback("hierarchy")
# print grammar.hierarchy grammar.makeMapping(topLevel) # print grammar.pennToLevel for level in range(len(ec05)): gstats = GrammarStats() vstats = ValidationEvents() for ct,line in enumerate(file(trees)): if ct % 100 == 0: print "read trees", ct tree = grammar.transform(level + 1, zeroSplit( binarizeTree( treeToTuple(line.strip())))) # print treeToStr(tree) gstats.record(tree) for ct,line in enumerate(file(validation)): if ct % 1000 == 0: print "read validation trees", ct tree = grammar.transform(level + 1, zeroSplit( binarizeTree( treeToTuple(line.strip())))) vstats.record(tree) gstats.normalize() gstats.learnLambdas(vstats)
from treeUtils import treeToDeriv, treeToTuple, untransform # tt = "(ROOT (S (NP (DT The) (@NP (ADJP (RBS most) (JJ troublesome)) (NN report))) (@S (VP (MD may) (VP (VB be) (NP (NP (DT the) (@NP (NNP August) (@NP (NN merchandise) (@NP (NN trade) (NN deficit))))) (ADJP (JJ due) (@ADJP (ADVP (IN out)) (NP (NN tomorrow))))))) (. .))))" # print treeToTuple(tt) # print treeToDeriv(treeToTuple(tt)) tt = """(ROOT (RS (DT The) (ROOTlcDT (NN government) (POS 's) (ROOTlcNP (NN plan) (ROOTlcNP (VP (VBZ is) (VPlcVBZ (ADJP (JJ stupid) (ADJPlcJJ )) (VPlcVP ))) (. .) (ROOTlcS ))))))""" print treeToTuple(tt) print untransform(treeToTuple(tt))