def nfold(trees, n=5, stackwindow=2, qwindow=2, tagsize=2, precision=0.95, threshold=500):
    """Train and test a parser on each of n folds of trees.

    For every fold the data is split with makefold, a parser is built
    with onefold, and the gold-standard and parsed trees of the first
    testing sentence are printed.

    Returns a tuple: (parser accuracy summed over the folds divided by n,
    classifier accuracy summed over the folds divided by n, and the
    classifier, parser, training set and testing set of the last fold).
    """
    print("nfold, tagsize=%s"%(tagsize))
    parsertotal = 0
    classifiertotal = 0
    for fold in range(n):
        training, testing = makefold(fold, n, trees)
        # An empty training set means the split came out the wrong way
        # round, so swap the two halves.
        if training == []:
            training, testing = testing, training
        print("makeclassifier(%s)"%(fold))
        parser = onefold(fold, n, training, testing,
                         stackwindow=stackwindow,
                         qwindow=qwindow,
                         tagsize=tagsize,
                         precision=precision,
                         threshold=threshold)
        print("testing[0].dtree before testparser")
        testing[0].showDTree()
        print("testing[0].parsed")
        print(tb.showDTree(malt.buildtree(testing[0].parsed, testing[0].leaves)))
        parsertotal += parser.parseraccuracy
        c = parser.classifier
        classifiertotal += c.accuracy
    return parsertotal/n, classifiertotal/n, c, parser, training, testing
def text2term(text):
    """Parse text with the module-level parser and tagger and return the
    result of tree2term applied to the dependency tree built from it."""
    analysis = parser.parse(text, tagger=tagger)
    dtree = malt.buildtree(analysis.parsed, analysis.leaves)
    return tree2term(dtree)
def parse(self, sentence, tagger=False, keepscore=True, justParse=True):
    """Parse sentence with this object's classifier and window settings.

    Delegates to the module-level parse function (never preclassified).
    When justParse is true the result additionally gets a dtree attribute
    built from its parsed relations and leaves; otherwise whatever the
    module-level parse returned is passed back untouched.
    """
    options = dict(tagger=tagger,
                   tagsize=self.tagsize,
                   stackwindow=self.stackwindow,
                   qwindow=self.qwindow,
                   keepscore=keepscore,
                   preclassified=False,
                   justParse=justParse)
    result = parse(sentence, self.classifier, **options)
    if not justParse:
        return result
    result.dtree = malt.buildtree(result.parsed, result.leaves)
    return result
def parse(tree, classifier, tagger=False, tagsize=5, singlestep=False, silent=True, stackwindow=2, qwindow=3, keepscore=True, preclassified=False, justParse=False):
    """Run the classifier-driven transition parser over one sentence.

    tree is either a SENTENCE or untagged input that tagger.tag() can
    turn into (form, tag) pairs; in the second case a tagger must be
    supplied and a fresh tb.SENTENCE is created to carry the result.

    classifier supplies the feature set, the window sizes used for the
    state descriptor, and classify().  The stackwindow and qwindow
    arguments are accepted but the descriptor is built from
    classifier.stackwindow / classifier.qwindow.

    Side effects: sets malt.SILENT, truncates every word's tag to
    tagsize characters, and stores the final relations in tree.parsed.

    Returns
      * the SENTENCE, if justParse is true;
      * malt.buildtree(relations, words), if the input was not a
        SENTENCE (there is no gold standard to score against) or
        keepscore is false;
      * otherwise (score, len(goldstandard), len(final stack)), where the
        score is recomputed after usePreclassifiedHDs when preclassified
        is given.
    """
    malt.SILENT = silent
    # NOTE(review): type(...) is compared with a class-name string, so this
    # presumably relies on a project helper shadowing the builtin -- confirm.
    # The answer is recorded *before* tree is rebound below: testing the
    # rebound tree later could never recognise untagged input.
    untagged = not (type(tree) == "SENTENCE")
    if untagged:
        words = [malt.WORD(w[0], w[1]) for w in tagger.tag(tree)]
        tree = tb.SENTENCE(False, False, False, False)
        tree.leaves = words
        s = malt.STATE(text=words)
        goldstandard = False
    elif tree.dtree:
        s, goldstandard = malt.dtree2state(tree)
    else:
        goldstandard = tree.goldstandard
        s = malt.STATE(text=tree.leaves)
    for w in s.words:
        w.tag = w.tag[:tagsize]
    features = sorted(classifier.features.keys())
    agenda = [s]
    s.score = 0.0
    while agenda:
        s = agenda.pop()
        if s.queue == []:
            # What we'd like to do (pace Sardar) is to insist that the
            # stack should have only one item on it, and if not then we go
            # on to the next task on the agenda. That works nicely if we
            # use the head percolation table that has CC as the head
            # whenever possible; but overall the one that has CC as worst
            # choice for the head does better (as with Maytham), and in
            # that case trying to get to a nice terminal state by choosing
            # other options from the agenda gets stuck.
            #
            # So we do something more simple-minded: attach the things
            # that haven't been attached to anything to their neighbours.
            # It's not exactly systematic, but it gets a surprising number
            # of things right.
            for i in range(len(s.stack)-1):
                hd = s.stack[i+1]
                dtr = s.stack[i]
                s.relations[dtr.position] = malt.RELATION(hd.position, dtr.position, 'mod')
        d = s.stateDescriptor(False, qwindow=classifier.qwindow, stackwindow=classifier.stackwindow)
        t = id3.INSTANCE(features, d)
        # The classifier can return several options: it would be nice to
        # weigh them up, using influences from the grammar, and then order
        # the agenda to pay due attention to the confidence of the
        # classifier and the constraints from the grammar, but no useful
        # influences have been found. So the best one is chosen and used.
        # It is left as a loop, even though the list only ever has one
        # item on it, so that it can be revisited later.
        actions = sortTable(classifier.classify(t, printing=False))
        if actions == []:
            print("no action found")
        for action in actions:
            s1 = s.copy()
            s1.score = s.score+10
            if singlestep:
                print('***********************************')
                s1.showState()
                ss("action %s\ns.relations %s\nd %s\n"%(action, s.relations, d))
            # Look the transition up by name rather than eval-ing a string
            # built from classifier output.
            transition = getattr(malt.STATE, action[0])
            try:
                if transition(s1, warnings=True, throwException=True):
                    agenda.append(s1)
                    break
            except Exception:
                # Deliberately best-effort: a transition that cannot be
                # applied must not abort the parse.  If nothing else is
                # waiting on the agenda, force a shift while input remains.
                if agenda == []:
                    if not s1.queue == []:
                        malt.STATE.shift(s1, warnings=True)
                        agenda.append(s1)
                    else:
                        break
        # list.sort(cmp=...) is Python 2 only, like the rest of this file.
        agenda.sort(cmp=compstates)
    tree.parsed = s.relations
    if justParse:
        return tree
    if untagged or not keepscore:
        return malt.buildtree(s.relations, s.words)
    right = scoreState(goldstandard, s, tree)
    if preclassified:
        usePreclassifiedHDs(preclassified, s.relations)
        right = scoreState(goldstandard, s, tree)
    return right, len(goldstandard), len(s.stack)
def showtrees(trees, outfile=sys.stdout):
    """Write a LaTeX document showing, for every tree, the gold-standard
    dependency tree and the parsed one, each followed by an arc diagram.

    Each leaf's colour is reset to "black" and then set to "red" when its
    head differs between tree.goldstandard and tree.parsed, or when it has
    an entry in only one of them.  If outfile is not sys.stdout, latex is
    run on outfile and dvipdf on outfile minus its last four characters.
    """
    preamble = (r" \documentclass[10pt]{article}"
                r" \usepackage[a4paper,landscape]{geometry}"
                r" \usepackage{headerfooter}"
                r" \usepackage{defns}"
                r" \usepackage{lscape}"
                r" \usepackage{ifthen}"
                r" \usepackage{natbib}"
                r" \usepackage{lscape}"
                r" \usepackage{examples}"
                r" \usepackage{multicol}"
                r" \usepackage[usenames,dvipsnames,svgnames,table]{xcolor}"
                r" \usepackage{pstricks, pst-node, pst-tree}"
                r" \usepackage{graphicx}"
                r" \oddsidemargin=0in"
                r" \evensidemargin=0in"
                r" \begin{document}"
                r" \begin{examples} ")
    goldpage = (r" \newpage \item %s"
                r" \noindent DTREE (= Gold Standard)"
                r" \noindent %s ")
    parsedpage = r" \newpage \noindent PARSED \noindent %s "
    postamble = r" \end{examples} \end{document} "

    with safeout(outfile) as out:
        out(preamble)
        for tree in trees:
            for leaf in tree.leaves:
                leaf.colour = "black"
            t = simplifytree(tree.dtree)
            d = depth(t)
            text = " ".join(leaf.form for leaf in tree.leaves).replace("$", r"\$")
            out(goldpage%(text, pstree(t, lsep=min(70, int(350.0/d)), tsep=20)))
            goldstandard = tree.goldstandard
            parsed = tree.parsed
            out(showTreeAsArcs(tree.leaves, goldstandard))
            for leaf in tree.leaves:
                pos = leaf.position
                ingold = pos in goldstandard
                inparsed = pos in parsed
                if ingold and inparsed:
                    # Attached in both analyses: wrong only if heads differ.
                    wrong = not goldstandard[pos].hd == parsed[pos].hd
                else:
                    # Attached in just one of the two analyses.
                    wrong = ingold or inparsed
                if wrong:
                    leaf.colour = "red"
            t = simplifytree(buildtree(tree.parsed, tree.leaves))
            d = depth(t)
            out(parsedpage%(pstree(t, lsep=min(70, int(350/d)), tsep=20)))
            out(plantlinks(tree))
            out(showTreeAsArcs(tree.leaves, parsed))
        out(postamble)

    if outfile == sys.stdout:
        return
    subprocess.Popen(["latex", outfile]).wait()
    subprocess.Popen(["dvipdf", outfile[:-4]]).wait()
    print("dvipdf complete")
def readconll(conllfile="%s/ud-treebanks-v1.1/UD_English/wholething.txt"%(programs)):
    """Read a tab-separated CoNLL file into SENTENCE objects.

    Each non-blank line is one word; a blank line ends a sentence.
    Columns 0, 1, 4, 6 and 7 are read as position, form, tag, head and
    label; position and head are shifted down by one.  Forms are
    normalised before the WORD is built:
      * a form starting with one of ?.!*+-= is cut to that character;
      * a form starting with digits, a dot and more digits becomes "9999";
      * tag NNP becomes NP, and every other form is lower-cased;
      * forms listed in the specials table get the tag given there.

    Sentences of fewer than two words are dropped.  Every kept sentence
    gets dtree, leaves, goldstandard and parsed, the last two being the
    same dict of relations keyed by dependent.

    Returns (sentences, words), words being every WORD read, in order.
    """
    specials = {"-":"DASH",
                "that":"THAT",
                "if":"IF",
                "his":"PX", "my":"PX", "our":"PX", "your":"PX", "their":"PX",
                "sure":"JJ",
                "ago":"NI",
                "of":"OF",
                "has":"VH", "had":"VH", "have":"VH", "having":"VH",
                "be":"VX", "am":"VX", "is":"VX", "are":"VX", "was":"VX",
                "were":"VX", "being":"VX", "been":"VX",
                }
    preps = ['@', 'about', 'across', 'after', 'although', 'as', 'at',
             'because', 'before', 'behind', 'besides', 'between', 'beyond',
             'by', 'due', 'during', 'for', 'in', 'into', 'near', 'on',
             'over', 'out', 'per', 'since', 'than', 'through', 'till',
             'under', 'up', 'vs', 'vs.', 'whereas', 'while', 'with']
    for p in preps:
        specials[p] = "IN"

    POSITION = 0
    FORM = 1
    TAG = 4
    HD = 6
    LABEL = 7
    # Compiled once, not once per input line; the pattern is unchanged.
    decimal = re.compile(r"\d+(\.\d+)")

    sentences = []
    words = []
    # NOTE(review): n is never incremented, so every SENTENCE is created
    # with 0 for both of its last two arguments -- confirm whether a
    # sentence counter was intended.
    n = 0

    def finish(leaves):
        # Turn the collected words into a SENTENCE and store it.
        relations = [RELATION(w.hd, w.position, rel=w.label) for w in leaves]
        preswap(leaves, relations)
        tree = buildtree(relations, leaves)
        finished = SENTENCE(leaves, conllfile, n, n)
        finished.dtree = tree
        finished.leaves = leaves
        bydtr = {r.dtr:r for r in relations}
        finished.goldstandard = bydtr
        finished.parsed = bydtr
        sentences.append(finished)

    sentence = []
    with open(conllfile) as infile:
        for line in infile:
            line = line.strip().replace("``", '"').replace("''", '"')
            if line == "":
                if len(sentence) > 1:
                    finish(sentence)
                sentence = []
                continue
            fields = line.split("\t")
            form = fields[FORM]
            tag = fields[TAG]
            if form[0] in "?.!*+-=":
                form = form[0]
            if decimal.match(form):
                form = "9999"
            if tag == "NNP":
                tag = "NP"
            else:
                form = form.lower()
            if form in specials:
                tag = specials[form]
            word = WORD(form=form,
                        tag=tag,
                        label=fields[LABEL],
                        hd=int(fields[HD])-1,
                        position=int(fields[POSITION])-1)
            sentence.append(word)
            words.append(word)
    # A file that does not end with a blank line leaves one sentence
    # pending; it is built exactly like the others, from its own words.
    if len(sentence) > 1:
        finish(sentence)
    return sentences, words