def compare_hybridtrees(self, gold, system):
    """
    Score a system tree against a gold tree and return the ``'LF'`` score.

    Both trees are passed through ``convert_tree`` and scored with
    ``TreePairResult(0, ..., self.param).scores()``. The ``'LF'`` entry of
    the result is converted to ``float``; NaN is mapped to ``0.0``.

    If scoring raises ``KeyError``, ``IndexError`` or ``ValueError``, both
    trees are printed for inspection and scoring is retried once on freshly
    converted trees. An exception raised by the retry propagates to the
    caller; if the retry unexpectedly succeeds, ``AssertionError`` is raised.

    :type gold: ConstituentTree
    :type system: ConstituentTree
    :return: the ``'LF'`` score, or ``0.0`` if that score is NaN
    :rtype: float
    :raises AssertionError: if scoring failed once but succeeded on retry
    """
    gtree, gsent = convert_tree(gold)
    stree, ssent = convert_tree(system)
    try:
        result = TreePairResult(0, gtree, gsent, stree, ssent,
                                self.param).scores()
        f1 = float(result['LF'])
        if math.isnan(f1):
            return 0.0
        return f1
    except (KeyError, IndexError, ValueError):
        # Dump both trees so the failing pair can be inspected.
        gtree, gsent = convert_tree(gold)
        stree, ssent = convert_tree(system)
        print('gold tree:')
        print(DrawTree(gtree, gsent))
        print(gold)
        print(gold.root)
        print('system tree')
        print(DrawTree(stree, ssent))
        print(system)
        print(system.root)
        # Retry on the freshly converted trees; any exception raised here
        # propagates to the caller.
        TreePairResult(0, gtree, gsent, stree, ssent, self.param).scores()
        # Explicit raise rather than `assert False`: assert statements are
        # removed under -O, which would turn this path into `return None`.
        raise AssertionError(
            'scoring failed for the printed tree pair but succeeded on retry')
def newlabel():
    """Re-draw tree with newly picked label.

    Request arguments read: ``sentno`` (1-indexed), ``tree``, ``nodeid`` and
    the first one present of ``label``, ``function`` or ``morph``. The
    node's current label is split with ``LABELRE`` and only the part that
    corresponds to the supplied argument is replaced.

    Returns the validation error message as a plain string when ``validate``
    raises ``ValueError``; otherwise a ``Markup`` holding an accept link, the
    re-drawn tree and the serialised tree. Raises ``ValueError`` when none of
    the three label arguments is present.
    """
    args = request.args
    sentno = int(args.get('sentno'))  # 1-indexed
    sent = SENTENCES[QUEUE[sentno - 1][0]]
    senttok, _ = worker.postokenize(sent)
    treestr = args.get('tree', '')
    try:
        tree, _sent1 = validate(treestr, senttok)
    except ValueError as err:
        return str(err)
    # FIXME: re-factor; check label AFTER replacing it
    # Locate the node whose label is to be replaced.
    _treeid, nodeid = args.get('nodeid', '').lstrip('t').split('_')
    nodeid = int(nodeid)
    dt = DrawTree(tree, senttok)
    node = dt.nodes[nodeid]
    parts = LABELRE.match(node.label)
    if 'label' in args:
        picked = args.get('label', '')
        relabeled = picked + (parts.group(2) or '') + (parts.group(3) or '')
    elif 'function' in args:
        picked = args.get('function', '')
        if picked:
            relabeled = '%s-%s%s' % (
                parts.group(1), picked, parts.group(3) or '')
        else:
            relabeled = '%s%s' % (parts.group(1), parts.group(3) or '')
    elif 'morph' in args:
        picked = args.get('morph', '')
        if picked:
            relabeled = '%s%s/%s' % (
                parts.group(1), parts.group(2) or '', picked)
        else:
            relabeled = '%s%s' % (parts.group(1), parts.group(2) or '')
    else:
        raise ValueError('expected label or function argument')
    node.label = relabeled
    tree = dt.nodes[0]
    dt = DrawTree(tree, senttok)  # kludge..
    treestr = writediscbrackettree(tree, senttok, pretty=True).rstrip()
    query = urlencode(dict(sentno=sentno, tree=treestr))
    link = '<a href="/annotate/accept?%s">accept this tree</a>' % query
    session['actions'][RELABEL] += 1
    session.modified = True
    drawing = dt.text(unicodelines=True, html=True, funcsep='-',
                      morphsep='/', nodeprops='t0')
    return Markup('%s\n\n%s\t%s' % (link, drawing, treestr))
def main():
    """Manual smoke run: compare one dev-set tree against a tiny system tree.

    Loads sentence ``s40564`` from the German SPMRL dev file, builds a
    minimal system tree from its first token, then for every loaded tree
    prints both drawings, the token forms, the reversed tree, its POS
    sequence, the ``TreePairResult`` scores and the ``TreeComparator`` score
    of the tree against itself.
    """
    dev_path = ('../res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/dev/'
                'dev.German.gold.xml')
    # A wider range (s40675 .. s40699) was tried before; only this sentence
    # is loaded now.
    names = ['s40564']
    corpus = sentence_names_to_hybridtrees(names, path=dev_path, hold=False)
    comparator = TreeComparator()

    # Minimal system tree: one PN node over the first token, plus a
    # sentence-final punctuation mark.
    tree_sys = ConstituentTree()
    tree_sys.add_node('0', ConstituentCategory('PN'))
    tree_sys.add_node('1', corpus[0].token_yield()[0], order=True)
    tree_sys.add_punct("3", '$.', '.')
    tree_sys.add_to_root('0')
    tree_sys.add_child('0', '1')

    param = build_param()
    for index, hybridtree in enumerate(corpus):
        print(index)
        gold_tree, gold_sent = convert_tree(hybridtree)
        sys_tree, sys_sent = convert_tree(tree_sys)
        print(DrawTree(gold_tree, gold_sent))
        forms = [token.form() for token in hybridtree.full_token_yield()]
        print(' '.join(forms))
        print(DrawTree(sys_tree, sys_sent))
        print(gold_tree[::-1])
        print('POS', gold_tree.pos())
        pair = TreePairResult(index, gold_tree, gold_sent, sys_tree, sys_sent,
                              param)
        print(pair.scores())
        print("Comparator: ",
              comparator.compare_hybridtrees(hybridtree, hybridtree))
def reparsesubtree():
    """Re-parse selected subtree.

    Request arguments read: ``sentno`` (1-indexed), ``tree`` and ``nodeid``.
    The tokens under the selected node are joined into a sub-sentence and
    sent to the current user's worker with the node's label as ``root``.

    Returns the validation error message as a plain string when ``validate``
    raises ``ValueError``; otherwise a ``Markup`` ``<pre>`` block listing
    every returned parse tree with a "use this subtree" link.
    """
    sentno = int(request.args.get('sentno'))  # 1-indexed
    sent = SENTENCES[QUEUE[sentno - 1][0]]
    senttok, _ = worker.postokenize(sent)
    username = session['username']
    treestr = request.args.get('tree', '')
    try:
        tree, _sent1 = validate(treestr, senttok)
    except ValueError as err:
        return str(err)
    dt = DrawTree(tree, senttok)
    _treeid, nodeid = request.args.get('nodeid', '').lstrip('t').split('_')
    nodeid = int(nodeid)
    subseq = sorted(dt.nodes[nodeid].leaves())
    subsent = ' '.join(senttok[n] for n in subseq)
    # FIXME only works when root label of tree matches label in grammar.
    # need a single label that works across all stages.
    root = dt.nodes[nodeid].label
    # root = grammar.tolabel[next(iter(grammar.tblabelmapping[root]))]
    resp = WORKERS[username].submit(
        worker.getparses, subsent, (), (), root=root).result()
    _senttok, parsetrees, _messages, _elapsed = resp
    app.logger.info('%d-%d. [parse trees=%d] %s',
                    sentno, nodeid, len(parsetrees), subsent)
    if parsetrees:
        # Debug output of the first subtree; guarded so that an empty result
        # yields a "0 parse trees" listing instead of an IndexError.
        print(parsetrees[0][1])
    subtokens = subsent.split()
    entries = '\n'.join(
        '%(n)d. [%(prob)s] '
        '<a href="#" onClick="picksubtree(%(n)d); ">'
        'use this subtree</a>; '
        '\n\n'
        '%(tree)s' % dict(
            n=n + 1,
            prob=probstr(prob),
            tree=DrawTree(subtree, subtokens).text(
                unicodelines=True, html=True, funcsep='-', morphsep='/',
                nodeprops='t%d' % (n + 1)))
        for n, (prob, subtree, _treestr, _fragments)
        in enumerate(parsetrees))
    nbest = Markup(
        '<pre>%d parse trees\n'
        '<a href="javascript: toggle(\'nbest\'); ">cancel</a>\n'
        '%s</pre>' % (len(parsetrees), entries))
    return nbest
def showderiv():
    """Render derivation for a given parse tree in cache.

    Request arguments read: ``n`` and ``sentno`` (both 1-indexed), plus the
    optional ``require`` / ``block`` constraint strings. Returns a ``Markup``
    ``<pre>`` block with one drawing per fragment of the selected parse tree,
    followed by a drawing of the tree itself.
    """
    username = session['username']
    n = int(request.args.get('n'))  # 1-indexed
    sentno = int(request.args.get('sentno'))  # 1-indexed
    sent = SENTENCES[QUEUE[sentno - 1][0]]
    require, block = parseconstraints(
        request.args.get('require', ''),
        request.args.get('block', ''))
    future = WORKERS[username].submit(worker.getparses, sent, require, block)
    senttok, parsetrees, _messages, _elapsed = future.result()
    _prob, tree, _treestr, fragments = parsetrees[n - 1]

    drawn_fragments = []
    for frag, weight in fragments or ():
        drawing = DrawTree(frag).text(unicodelines=True, html=True)
        drawn_fragments.append('%s\n%s' % (weight, drawing))
    drawn_tree = DrawTree(tree, senttok).text(
        unicodelines=True, html=True, funcsep='-')
    return Markup(
        '<pre>Fragments used in the highest ranked derivation'
        ' of this parse tree:\n%s\n%s</pre>'
        % ('\n\n'.join(drawn_fragments), drawn_tree))
def filterparsetrees():
    """For a parse tree in the cache, return a filtered set of its n-best
    parses matching current constraints.

    ``require`` / ``block`` are passed on to the parser; ``frequire`` /
    ``fblock`` are applied afterwards with ``testconstraints``. Entries
    whose tree string is ``None`` are always kept. Returns a plain message
    when nothing survives the filter, otherwise a ``Markup`` listing.
    """
    username = session['username']
    session['actions'][CONSTRAINTS] += 1
    session.modified = True
    sentno = int(request.args.get('sentno'))  # 1-indexed
    sent = SENTENCES[QUEUE[sentno - 1][0]]
    urlprm = dict(sentno=sentno)
    require = request.args.get('require', '')
    block = request.args.get('block', '')
    # Only non-empty constraints are carried over into the generated links.
    if require:
        urlprm['require'] = require
    if block:
        urlprm['block'] = block
    require, block = parseconstraints(require, block)
    frequire, fblock = parseconstraints(
        request.args.get('frequire', ''),
        request.args.get('fblock', ''))
    future = WORKERS[username].submit(worker.getparses, sent, require, block)
    senttok, parsetrees, _messages, _elapsed = future.result()

    kept = []
    for n, (prob, tree, treestr, frags) in enumerate(parsetrees):
        if treestr is None or testconstraints(treestr, frequire, fblock):
            kept.append((n, prob, tree, treestr, frags))
    if not kept:
        return ('No parse trees after filtering; try pressing Re-parse, '
                'or reload page to clear constraints.\n')

    entries = []
    for n, prob, tree, _treestr, _frags in kept:
        rank = n + 1  # ranks keep their position in the unfiltered list
        entries.append(
            '%(n)d. [%(prob)s] '
            '<a href="/annotate/accept?%(urlprm)s">accept this tree</a>; '
            '<a href="/annotate/edit?%(urlprm)s">edit</a>; '
            '<a href="/annotate/deriv?%(urlprm)s">derivation</a>\n\n'
            '%(tree)s' % dict(
                n=rank,
                prob=probstr(prob),
                urlprm=urlencode(dict(urlprm, n=rank)),
                tree=DrawTree(tree, senttok).text(
                    unicodelines=True, html=True, funcsep='-',
                    morphsep='/', nodeprops='t%d' % rank)))
    nbest = Markup('%d parse trees\n%s' % (len(kept), '\n'.join(entries)))
    return nbest
def browsetrees():
    """Browse through trees in a file.

    Without both ``text`` and ``sent`` request arguments, returns an HTML
    list of the available texts. Otherwise renders one chunk of trees:
    as bare ``<pre>`` blocks when ``ajax`` is present, else through the
    ``browse.html`` template with prev/next links.
    """
    chunk = 20  # number of trees to fetch for one request
    args = request.args
    if 'text' not in args or 'sent' not in args:
        return '<h1>Browse through trees</h1>\n<ol>\n%s</ol>\n' % '\n'.join(
            '<li><a href="browse?text=%d;sent=1;nomorph">%s</a> ' % (n, text)
            for n, text in enumerate(TEXTS))

    textno = int(args['text'])
    sentno = int(args['sent'])
    start = max(1, sentno - sentno % chunk)
    stop = start + chunk
    nofunc = 'nofunc' in args
    nomorph = 'nomorph' in args
    filename = os.path.join(CORPUS_DIR, TEXTS[textno] + '.export')
    trees = CORPORA[filename].itertrees(start, stop)

    is_ajax = 'ajax' in args
    hidden = ' style="display: none; "' if is_ajax else ''
    filter_names = sorted(FILTERS)
    results = []
    for n, (_key, item) in enumerate(trees, start):
        stats = ', '.join(
            '%s: %.3g' % (name, addsentweight(FILTERS[name](item))[1])
            for name in filter_names)
        drawing = DrawTree(item.tree, item.sent).text(
            unicodelines=True, html=True)
        results.append(
            '<pre id="t%s"%s>%s\n%s</pre>' % (n, hidden, stats, drawing))
    if is_ajax:
        return '\n'.join(results)

    if sentno > chunk:
        prevlink = '<a href="browse?text=%d;sent=%d" id=prev>prev</a>' % (
            textno, sentno - chunk + 1)
    else:
        prevlink = '<a id=prev>prev</a>'
    # NOTE(review): the next link is always active; an upper-bound check on
    # sentno may be missing here -- confirm against the corpus size.
    nextlink = '<a href="browse?text=%d;sent=%d" id=next>next</a>' % (
        textno, sentno + chunk + 1)
    return render_template(
        'browse.html', textno=textno, sentno=sentno, text=TEXTS[textno],
        totalsents=1000, trees=results, prevlink=prevlink,
        nextlink=nextlink, chunk=chunk, nofunc=nofunc, nomorph=nomorph,
        mintree=start, maxtree=stop)
def parse(args, stdinput):
    """Parse a given sentence after inducing a grammar from a given corpus.

    The derivation (or the parser's non-tree result) is printed; when the
    sentence also occurs in the corpus, its gold tree is printed too, and
    the recall of the derivation against it if a derivation was found.

    Parameters
    ----------
    args : list(str)
        The list of arguments: corpus file, sentence.
    stdinput : list(str)
        The pruning policy which should be used for the parsing process.
        When falsy, a fresh ``PruningPolicy`` is used instead.
    """
    # read the corpus and the sentence to parse
    corpus = TigerXMLCorpusReader(args[0], encoding='utf8')
    sent = args[1]

    # induce the grammar and collect gold trees for this exact sentence
    trees = []
    for raw in list(corpus.trees().values()):
        trees.append(ImmutableTree.convert(canonicalize(raw)))
    sentences = list(corpus.sents().values())
    grammar = Grammar(trees, sentences)
    goldtrees = [
        tree for words, tree in zip(sentences, trees)
        if ' '.join(words) == sent
    ]

    # choose the initial pruning policy
    if stdinput:
        pp = deserialize(stdinput, FEATURES, grammar)
    else:
        pp = PruningPolicy()

    # build the derivation
    derivationtree = Parser(grammar).parse(sent, pp).get_tree()
    stdout.flush()

    # report the derivation, or whatever the parser returned instead
    parsed = isinstance(derivationtree, Tree)
    if parsed:
        print(derivationtree.pprint())
        drawing = DrawTree(derivationtree, sent.split())
        print("\n derivation tree: \n" + drawing.text())
    else:
        print(derivationtree)

    if not goldtrees:
        return
    # report the gold tree, and recall when both trees are available
    gold = goldtrees[0]
    print("\n gold tree: \n" + DrawTree(gold, sent.split()).text())
    if parsed:
        print("\n recall: %f" % accuracy(derivationtree, gold))
def draw():
    """Wrapper to parse & draw tree(s).

    Rejects input longer than ``LIMIT``. Any exception raised while reading
    the trees or constructing their drawings is reported to the client as a
    plain-text response carrying the exception message.
    """
    source = request.args['tree']
    if len(source) > LIMIT:
        return 'Too much data. Limit: %d bytes' % LIMIT
    try:
        parsed = list(incrementaltreereader(
            source.splitlines(),
            morphology='add' if 'morph' in request.args else None,
            functions='add' if 'func' in request.args else None))
        drawings = [
            DrawTree(tree, sent, abbr='abbr' in request.args)
            for tree, sent, _rest in parsed
        ]
    except Exception as err:  # pylint: disable=broad-except
        return Response(str(err), mimetype='text/plain')
    if drawings:
        return drawtrees(request.args, drawings)
    return Response('No trees!', mimetype='text/plain')
def redraw():
    """Validate and re-draw tree.

    Request arguments read: ``sentno`` (1-indexed), ``tree`` and the
    optional ``oldtree``. When ``oldtree`` is non-empty and differs from
    ``tree``, their edit distance is added to the session's ``EDITDIST``
    counter. Returns the validation error message as a plain string when
    ``validate`` raises ``ValueError``, otherwise a ``Markup`` with an
    accept link and the drawn tree.
    """
    args = request.args
    sentno = int(args.get('sentno'))  # 1-indexed
    sent = SENTENCES[QUEUE[sentno - 1][0]]
    senttok, _ = worker.postokenize(sent)
    treestr = args.get('tree')
    # The link is built from the submitted string, before validation.
    query = urlencode(dict(sentno=sentno, tree=treestr))
    link = '<a href="/annotate/accept?%s">accept this tree</a>' % query
    try:
        tree, _sent1 = validate(treestr, senttok)
    except ValueError as err:
        return str(err)
    previous = args.get('oldtree', '')
    if previous and treestr != previous:
        session['actions'][EDITDIST] += editdistance(treestr, previous)
        session.modified = True
    drawing = DrawTree(tree, senttok).text(
        unicodelines=True, html=True, funcsep='-', morphsep='/',
        nodeprops='t0')
    return Markup('%s\n\n%s' % (link, drawing))
def replacesubtree():
    """Replace the selected subtree by one of its re-parsed alternatives.

    Request arguments read: ``n`` (default ``0``), ``sentno`` (1-indexed),
    ``tree`` and ``nodeid``. The tokens under the selected node are
    re-parsed with the node's label as root; entry ``n - 1`` of the returned
    parse trees is spliced in after renumbering its terminals to the
    original token positions.

    Returns the validation error message as a plain string when ``validate``
    raises ``ValueError``; otherwise a ``Markup`` with an accept link, the
    re-drawn tree and the serialised tree.
    """
    choice = int(request.args.get('n', 0))
    sentno = int(request.args.get('sentno'))  # 1-indexed
    sent = SENTENCES[QUEUE[sentno - 1][0]]
    senttok, _ = worker.postokenize(sent)
    username = session['username']
    treestr = request.args.get('tree', '')
    try:
        tree, _sent1 = validate(treestr, senttok)
    except ValueError as err:
        return str(err)
    error = ''
    dt = DrawTree(tree, senttok)
    _treeid, nodeid = request.args.get('nodeid', '').lstrip('t').split('_')
    nodeid = int(nodeid)
    target = dt.nodes[nodeid]
    subseq = sorted(target.leaves())
    subsent = ' '.join(senttok[idx] for idx in subseq)
    root = target.label
    future = WORKERS[username].submit(
        worker.getparses, subsent, (), (), root=root)
    _senttok, parsetrees, _messages, _elapsed = future.result()
    newsubtree = parsetrees[choice - 1][1]

    # Map the re-parsed terminals (numbered within the sub-sentence) back to
    # their positions in the full sentence.
    preterminals = sorted(
        newsubtree.subtrees(lambda node: isinstance(node[0], int)),
        key=lambda node: node[0])
    for offset, preterminal in enumerate(preterminals):
        preterminal[0] = subseq[offset]
    dt.nodes[nodeid][:] = newsubtree[:]

    tree = canonicalize(dt.nodes[0])
    dt = DrawTree(tree, senttok)  # kludge..
    treestr = writediscbrackettree(tree, senttok, pretty=True).rstrip()
    session['actions'][REPARSE] += 1
    session.modified = True
    query = urlencode(dict(sentno=sentno, tree=treestr))
    link = '<a href="/annotate/accept?%s">accept this tree</a>' % query
    drawing = dt.text(unicodelines=True, html=True, funcsep='-',
                      morphsep='/', nodeprops='t0')
    return Markup('%s\n\n%s%s\t%s' % (link, error, drawing, treestr))
def parse():
    """Parse sentence and return a textual representation of a parse tree.

    Output is either in a HTML fragment or in plain text. To be invoked by an
    AJAX call.

    Request arguments read: ``sent``, ``est``, ``marg``, ``objfun``,
    ``coarse``, ``lang`` and the presence flag ``html``. Results are looked
    up in ``CACHE`` under a key built from the tokens and those settings;
    on a miss the sentence is parsed and the rendered pieces are stored.

    Returns an empty string when ``sent`` is missing or empty, a plain
    message when the token count is outside ``1..LIMIT`` or the language is
    unknown, and otherwise either the rendered ``parsetree.html`` template
    (``html`` present) or a plain-text ``Response``.
    """
    sent = request.args.get('sent', None)
    est = request.args.get('est', 'rfe')
    marg = request.args.get('marg', 'nbest')
    objfun = request.args.get('objfun', 'mpp')
    coarse = request.args.get('coarse', None)
    html = 'html' in request.args
    lang = request.args.get('lang', 'detect')
    if not sent:
        return ''
    frags = nbest = None
    senttok = tokenize(sent)
    # NOTE(review): the same message is used for an empty token list.
    if not senttok or not 1 <= len(senttok) <= LIMIT:
        return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT)
    if lang == 'detect':
        lang = guesslang(senttok)
    elif lang not in PARSERS:
        return 'unknown language %r; languages: %r' % (lang, PARSERS.keys())
    key = (senttok, est, marg, objfun, coarse, lang)
    resp = CACHE.get(key)
    if resp is None:
        link = 'parse?' + url_encode(
            dict(sent=sent, est=est, marg=marg, objfun=objfun,
                 coarse=coarse, html=html))
        # The settings are written onto the shared parser object for this
        # language before parsing.
        PARSERS[lang].stages[-1].estimator = est
        PARSERS[lang].stages[-1].objective = objfun
        PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both')
        PARSERS[lang].stages[-1].sample = marg in ('sample', 'both')
        if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse:
            PARSERS[lang].stages[0].mode = coarse
            PARSERS[lang].stages[1].k = (1e-5
                                         if coarse == 'pcfg-posterior' else 50)
        results = list(PARSERS[lang].parse(senttok))
        if results[-1].noparse:
            parsetrees = []
            result = 'no parse!'
            frags = nbest = ''
        else:
            if SHOWMORPH:
                replacemorph(results[-1].parsetree)
            if SHOWFUNC:
                treebank.handlefunctions('add', results[-1].parsetree,
                                         pos=True)
            tree = str(results[-1].parsetree)
            prob = results[-1].prob
            parsetrees = results[-1].parsetrees or []
            # Keep the ten entries with the largest second element.
            parsetrees = heapq.nlargest(10, parsetrees, key=itemgetter(1))
            parsetrees_ = []
            fragments = results[-1].fragments or ()
            APP.logger.info('[%s] %s', probstr(prob), tree)
            tree = Tree.parse(tree, parse_leaf=int)
            result = Markup(
                DrawTree(tree, senttok).text(unicodelines=True, html=html,
                                             funcsep='-'))
            # Only fragments containing more than one '(' are drawn.
            frags = Markup(
                'Phrasal fragments used in the most probable '
                'derivation of the highest ranked parse tree:\n' + '\n\n'.join(
                    DrawTree(frag).text(unicodelines=True, html=html)
                    for frag in fragments if frag.count('(') > 1))
            # Post-process each n-best tree before drawing it.
            for tree, prob, x in parsetrees:
                tree = PARSERS[lang].postprocess(tree, senttok, -1)[0]
                if SHOWMORPH:
                    replacemorph(tree)
                if SHOWFUNC:
                    treebank.handlefunctions('add', tree, pos=True)
                parsetrees_.append((tree, prob, x))
            nbest = Markup('\n\n'.join(
                '%d. [%s]\n%s' % (n + 1, probstr(prob),
                                  DrawTree(tree, senttok).text(
                                      unicodelines=True, html=html,
                                      funcsep='-'))
                for n, (tree, prob, _) in enumerate(parsetrees_)))
        msg = '\n'.join(stage.msg for stage in results)
        elapsed = [stage.elapsedtime for stage in results]
        elapsed = 'CPU time elapsed: %s => %gs' % (' '.join(
            '%gs' % a for a in elapsed), sum(elapsed))
        # The info listing uses the un-postprocessed n-best entries.
        info = '\n'.join(
            ('length: %d; lang=%s; est=%s; objfun=%s; marg=%s' %
             (len(senttok), lang, est, objfun, marg), msg, elapsed,
             '10 most probable parse trees:', '\n'.join(
                 '%d. [%s] %s' % (n + 1, probstr(prob),
                                  writediscbrackettree(tree, senttok))
                 for n, (tree, prob, _) in enumerate(parsetrees)) + '\n'))
        CACHE.set(key, (sent, result, frags, nbest, info, link), timeout=5000)
    else:
        # Cache hit: reuse the rendered pieces stored earlier.
        (
            sent, result, frags, nbest,  # pylint: disable=unpacking-non-sequence
            info, link) = resp  # pylint: disable=unpacking-non-sequence
    if html:
        return render_template('parsetree.html', sent=sent, result=result,
                               frags=frags, nbest=nbest, info=info, link=link,
                               randid=randid())
    else:
        return Response('\n'.join((nbest, frags, info, result)),
                        mimetype='text/plain')
def parse():
    """Parse sentence and return a textual representation of a parse tree.

    Output is either in a HTML fragment or in plain text. To be invoked by an
    AJAX call.

    Request arguments read: ``sent``, ``objfun``, ``est``, ``marg``,
    ``coarse``, ``lang``, ``require``, ``block`` and the presence flag
    ``html``. ``require`` and ``block`` are JSON strings when given. Results
    are looked up in ``CACHE``; on a miss the sentence is parsed and the
    rendered pieces are stored.

    Returns an empty string when ``sent`` is missing or empty, a plain
    message when the token count is outside ``1..LIMIT`` or the language is
    unknown, and otherwise either the rendered ``parsetree.html`` template
    (``html`` present) or a plain-text ``Response``.
    """
    sent = request.args.get('sent', None)
    objfun = request.args.get('objfun', 'mpp')
    est = request.args.get('est', 'rfe')
    marg = request.args.get('marg', 'nbest')
    coarse = request.args.get('coarse', 'pcfg')
    html = 'html' in request.args
    lang = request.args.get('lang', 'detect')
    require = request.args.get('require', None)
    block = request.args.get('block', None)
    if not sent:
        return ''
    nbest = None
    # Input matching POSTAGS is split into word/tag pairs on the last '/'
    # of each token; otherwise it is tokenized and no tags are passed on.
    if POSTAGS.match(sent):
        senttok, tags = zip(*(a.rsplit('/', 1) for a in sent.split()))
    else:
        senttok, tags = tuple(tokenize(sent)), None
    # NOTE(review): the same message is used for an empty token sequence.
    if not senttok or not 1 <= len(senttok) <= LIMIT:
        return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT)
    if lang == 'detect':
        lang = guesslang(senttok)
    elif lang not in PARSERS:
        return 'unknown language %r; languages: %r' % (lang, PARSERS.keys())
    # Constraints are decoded into sorted tuples so they can be part of the
    # cache key.
    if require:
        require = tuple((label, tuple(indices))
                        for label, indices in sorted(json.loads(require)))
    if block:
        block = tuple((label, tuple(indices))
                      for label, indices in sorted(json.loads(block)))
    key = (senttok, tags, est, marg, objfun, coarse, lang, require, block)
    resp = CACHE.get(key)
    if resp is None:
        urlparams = dict(sent=sent, lang=lang, est=est, marg=marg,
                         objfun=objfun, coarse=coarse, html=html)
        if require:
            urlparams['require'] = json.dumps(require)
        if block:
            urlparams['block'] = json.dumps(block)
        link = '?' + url_encode(urlparams)
        # The settings are written onto the shared parser object for this
        # language before parsing.
        PARSERS[lang].stages[-1].estimator = est
        PARSERS[lang].stages[-1].objective = objfun
        PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both')
        PARSERS[lang].stages[-1].sample = marg in ('sample', 'both')
        if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse:
            PARSERS[lang].stages[0].mode = ('pcfg' if coarse
                                            == 'pcfg-posterior' else coarse)
            if len(PARSERS[lang].stages) > 1:
                PARSERS[lang].stages[1].k = (
                    1e-5 if coarse == 'pcfg-posterior' else 50)
        results = list(PARSERS[lang].parse(senttok, tags=tags,
                                           require=require, block=block))
        if SHOWMORPH:
            replacemorph(results[-1].parsetree)
        if SHOWFUNC:
            treebank.handlefunctions('add', results[-1].parsetree, pos=True)
        tree = str(results[-1].parsetree)
        prob = results[-1].prob
        parsetrees = results[-1].parsetrees or []
        # Keep the ten entries with the largest second element.
        parsetrees = heapq.nlargest(10, parsetrees, key=itemgetter(1))
        parsetrees_ = []
        LOG.info('[%s] %s', probstr(prob), tree)
        tree = Tree.parse(tree, parse_leaf=int)
        result = Markup(
            DrawTree(tree, senttok).text(unicodelines=True, html=html,
                                         funcsep='-'))
        # Post-process each n-best tree before drawing it.
        for tree, prob, x in parsetrees:
            tree = PARSERS[lang].postprocess(tree, senttok, -1)[0]
            if SHOWMORPH:
                replacemorph(tree)
            if SHOWFUNC:
                treebank.handlefunctions('add', tree, pos=True)
            parsetrees_.append((tree, prob, x))
        # NOTE(review): parsetrees[0] is indexed without checking that the
        # list is non-empty -- confirm the parser always yields a tree here.
        if PARSERS[lang].headrules:
            xtree = PARSERS[lang].postprocess(parsetrees[0][0], senttok,
                                              -1)[0]
            dep = treebank.writedependencies(xtree, senttok, 'conll')
            depsvg = Markup(DrawDependencies.fromconll(dep).svg())
        else:
            dep = depsvg = ''
        rid = randid()
        # One entry per n-best tree: a toggle link, a hidden span with the
        # fragments of that tree's derivation, then the drawn tree.
        nbest = Markup('\n\n'.join(
            '%d. [%s] '
            '<a href=\'javascript: toggle("f%s%d"); \'>'
            'derivation</a>\n'
            '<span id=f%s%d style="display: none; margin-left: 3em; ">'
            'Fragments used in the highest ranked derivation'
            ' of this parse tree:\n%s</span>\n%s' % (
                n + 1, probstr(prob), rid, n + 1, rid, n + 1,
                '\n\n'.join(
                    '%s\n%s' % (w, DrawTree(frag).text(unicodelines=True,
                                                       html=html))
                    for frag, w in fragments or ()
                    # if frag.count('(') > 1
                ),
                DrawTree(tree, senttok).text(
                    unicodelines=True, html=html, funcsep='-'))
            for n, (tree, prob, fragments) in enumerate(parsetrees_)))
        deriv = Markup(
            'Fragments used in the highest ranked derivation'
            ' of best parse tree:\n%s' % (
                '\n\n'.join(
                    '%s\n%s' % (w, DrawTree(frag).text(unicodelines=True,
                                                       html=html))
                    for frag, w in parsetrees_[0][2] or ()
                    # if frag.count('(') > 1
                ))) if parsetrees_ else ''
        msg = '\n'.join(stage.msg for stage in results)
        elapsed = [stage.elapsedtime for stage in results]
        elapsed = 'CPU time elapsed: %s => %gs' % (' '.join(
            '%gs' % a for a in elapsed), sum(elapsed))
        # The info listing uses the un-postprocessed n-best entries and
        # joins them without a separator.
        info = '\n'.join((
            'length: %d; lang=%s; est=%s; objfun=%s; marg=%s' %
            (len(senttok), lang, est, objfun, marg), msg, elapsed,
            '10 most probable parse trees:',
            ''.join('%d. [%s] %s' % (n + 1, probstr(prob),
                                     writediscbrackettree(tree, senttok))
                    for n, (tree, prob, _) in enumerate(parsetrees)) + '\n'))
        CACHE.set(key, (sent, result, nbest, deriv, info, link, dep, depsvg),
                  timeout=5000)
    else:
        # Cache hit: reuse the rendered pieces stored earlier.
        (sent, result, nbest, deriv, info, link, dep, depsvg) = resp
    if html:
        return render_template('parsetree.html', sent=sent, result=result,
                               nbest=nbest, deriv=deriv, info=info, link=link,
                               dep=dep, depsvg=depsvg, randid=randid())
    else:
        return Response('\n'.join((nbest, info, result)),
                        mimetype='text/plain')
def reattach():
    """Re-draw tree after re-attaching node under new parent.

    Three operations are selected from the request arguments:

    - ``newparent == 'deletenode'``: remove node ``nodeid`` by replacing it
      with its children;
    - ``nodeid`` starting with ``'newlabel_'``: insert a new node with that
      label between ``newparent`` and its current children;
    - otherwise: move the existing node ``nodeid`` under ``newparent``.

    Returns the validation error message as a plain string when ``validate``
    raises ``ValueError``; otherwise a ``Markup`` with an accept link, any
    error text, the drawn tree and the serialised tree. The ``REATTACH``
    session counter is only incremented when no error was recorded.
    """
    sentno = int(request.args.get('sentno'))  # 1-indexed
    sent = SENTENCES[QUEUE[sentno - 1][0]]
    senttok, _ = worker.postokenize(sent)
    treestr = request.args.get('tree', '')
    try:
        tree, _sent1 = validate(treestr, senttok)
    except ValueError as err:
        return str(err)
    dt = DrawTree(tree, senttok)
    error = ''
    if request.args.get('newparent') == 'deletenode':
        # remove nodeid by replacing it with its children
        _treeid, nodeid = request.args.get('nodeid', '').lstrip('t').split('_')
        nodeid = int(nodeid)
        x = dt.nodes[nodeid]
        # Node 0 and nodes whose first child is an int are refused.
        if nodeid == 0 or isinstance(x[0], int):
            error = 'ERROR: cannot remove ROOT or POS node'
        else:
            children = list(x)
            x[:] = []
            # Find the parent by identity and splice the children in place
            # of the removed node.
            for y in dt.nodes[0].subtrees():
                if any(child is x for child in y):
                    i = y.index(x)
                    y[i:i + 1] = children
                    tree = canonicalize(dt.nodes[0])
                    dt = DrawTree(tree, senttok)  # kludge..
                    break
    elif request.args.get('nodeid', '').startswith('newlabel_'):
        # splice in a new node under parentid
        _treeid, newparent = request.args.get('newparent',
                                              '').lstrip('t').split('_')
        newparent = int(newparent)
        # The new label is everything after the first underscore.
        label = request.args.get('nodeid').split('_', 1)[1]
        y = dt.nodes[newparent]
        if isinstance(y[0], int):
            error = 'ERROR: cannot add node under POS tag'
        else:
            children = list(y)
            y[:] = []
            y[:] = [Tree(label, children)]
            tree = canonicalize(dt.nodes[0])
            dt = DrawTree(tree, senttok)  # kludge..
    else:
        # re-attach existing node at existing new parent
        _treeid, nodeid = request.args.get('nodeid', '').lstrip('t').split('_')
        nodeid = int(nodeid)
        _treeid, newparent = request.args.get('newparent',
                                              '').lstrip('t').split('_')
        newparent = int(newparent)
        # remove node from old parent
        # dt.nodes[nodeid].parent.pop(dt.nodes[nodeid].parent_index)
        x = dt.nodes[nodeid]
        y = dt.nodes[newparent]
        # Refuse the move when the new parent is among the subtrees of the
        # node being moved.
        for node in x.subtrees():
            if node is y:
                error = ('ERROR: cannot re-attach subtree'
                         ' under (descendant of) itself\n')
                break
        else:
            # for/else: runs only when the loop above did not break.
            for node in dt.nodes[0].subtrees():
                if any(child is x for child in node):
                    if len(node) > 1:
                        node.remove(x)
                        dt.nodes[newparent].append(x)
                        tree = canonicalize(dt.nodes[0])
                        dt = DrawTree(tree, senttok)  # kludge..
                    else:
                        error = ('ERROR: re-attaching only child creates'
                                 ' empty node %s; remove manually\n' % node)
                    break
    treestr = writediscbrackettree(tree, senttok, pretty=True).rstrip()
    link = ('<a href="/annotate/accept?%s">accept this tree</a>' %
            urlencode(dict(sentno=sentno, tree=treestr)))
    if error == '':
        session['actions'][REATTACH] += 1
        session.modified = True
    return Markup('%s\n\n%s%s\t%s' % (link, error,
                                      dt.text(unicodelines=True, html=True,
                                              funcsep='-', morphsep='/',
                                              nodeprops='t0'), treestr))
def test_treedraw():
    """Draw some trees. Only tests whether no exception occurs.

    ``trees`` holds one bracketed tree per line and ``sents`` one tokenised
    sentence per line; a trailing backslash continues a line. Trees and
    sentences are paired positionally with ``zip``.
    """
    trees = '''(ROOT (S (ADV 0) (VVFIN 1) (NP (PDAT 2) (NN 3)) (PTKNEG 4) \
    (PP (APPRART 5) (NN 6) (NP (ART 7) (ADJA 8) (NN 9)))) ($. 10))
(S (NP (NN 1) (EX 3)) (VP (VB 0) (JJ 2)))
(S (VP (PDS 0) (ADV 3) (VVINF 4)) (PIS 2) (VMFIN 1))
(top (du (comp 0) (smain (noun 1) (verb 2) (inf (verb 8) (inf \
    (adj 3) (pp (prep 4) (np (det 5) (noun 6))) (part 7) (verb 9) \
    (pp (prep 10) (np (det 11) (noun 12) (pp (prep 13) (mwu \
    (noun 14) (noun 15))))))))) (punct 16))
(top (smain (noun 0) (verb 1) (inf (verb 5) (inf (np (det 2) \
    (adj 3) (noun 4)) (verb 6) (pp (prep 7) (noun 8))))) (punct 9))
(top (smain (noun 0) (verb 1) (noun 2) (inf (adv 3) (verb 4))) \
    (punct 5))
(top (punct 5) (du (smain (noun 0) (verb 1) (ppart (np (det 2) \
    (noun 3)) (verb 4))) (conj (sv1 (conj (noun 6) (vg 7) (np \
    (det 8) (noun 9))) (verb 10) (noun 11) (part 12)) (vg 13) \
    (sv1 (verb 14) (ti (comp 19) (inf (np (conj (det 15) (vg 16) \
    (det 17)) (noun 18)) (verb 20)))))) (punct 21))
(top (punct 10) (punct 16) (punct 18) (smain (np (det 0) (noun 1) \
    (pp (prep 2) (np (det 3) (noun 4)))) (verb 5) (adv 6) (np \
    (noun 7) (noun 8)) (part 9) (np (det 11) (noun 12) (pp \
    (prep 13) (np (det 14) (noun 15)))) (conj (vg 20) (ppres \
    (adj 17) (pp (prep 22) (np (det 23) (adj 24) (noun 25)))) \
    (ppres (adj 19)) (ppres (adj 21)))) (punct 26))
(top (punct 10) (punct 11) (punct 16) (smain (np (det 0) \
    (noun 1)) (verb 2) (np (det 3) (noun 4)) (adv 5) (du (cp \
    (comp 6) (ssub (noun 7) (verb 8) (inf (verb 9)))) (du \
    (smain (noun 12) (verb 13) (adv 14) (part 15)) (noun 17)))) \
    (punct 18) (punct 19))
(top (smain (noun 0) (verb 1) (inf (verb 8) (inf (verb 9) (inf \
    (adv 2) (pp (prep 3) (noun 4)) (pp (prep 5) (np (det 6) \
    (noun 7))) (verb 10))))) (punct 11))
(top (smain (noun 0) (verb 1) (pp (prep 2) (np (det 3) (adj 4) \
    (noun 5) (rel (noun 6) (ssub (noun 7) (verb 10) (ppart \
    (adj 8) (part 9) (verb 11))))))) (punct 12))
(top (smain (np (det 0) (noun 1)) (verb 2) (ap (adv 3) (num 4) \
    (cp (comp 5) (np (det 6) (adj 7) (noun 8) (rel (noun 9) (ssub \
    (noun 10) (verb 11) (pp (prep 12) (np (det 13) (adj 14) \
    (adj 15) (noun 16))))))))) (punct 17))
(top (smain (np (det 0) (noun 1)) (verb 2) (adv 3) (pp (prep 4) \
    (np (det 5) (noun 6)) (part 7))) (punct 8))
(top (punct 7) (conj (smain (noun 0) (verb 1) (np (det 2) \
    (noun 3)) (pp (prep 4) (np (det 5) (noun 6)))) (smain \
    (verb 8) (np (det 9) (num 10) (noun 11)) (part 12)) (vg 13) \
    (smain (verb 14) (noun 15) (pp (prep 16) (np (det 17) \
    (noun 18) (pp (prep 19) (np (det 20) (noun 21))))))) \
    (punct 22))
(top (smain (np (det 0) (noun 1) (rel (noun 2) (ssub (np (num 3) \
    (noun 4)) (adj 5) (verb 6)))) (verb 7) (ppart (verb 8) (pp \
    (prep 9) (noun 10)))) (punct 11))
(top (conj (sv1 (np (det 0) (noun 1)) (verb 2) (ppart (verb 3))) \
    (vg 4) (sv1 (verb 5) (pp (prep 6) (np (det 7) (adj 8) \
    (noun 9))))) (punct 10))
(top (smain (noun 0) (verb 1) (np (det 2) (noun 3)) (inf (adj 4) \
    (verb 5) (cp (comp 6) (ssub (noun 7) (adv 8) (verb 10) (ap \
    (num 9) (cp (comp 11) (np (det 12) (adj 13) (noun 14) (pp \
    (prep 15) (conj (np (det 16) (noun 17)) (vg 18) (np \
    (noun 19))))))))))) (punct 20))
(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) \
    (inf (verb 6) (conj (inf (pp (prep 2) (np (det 3) (noun 4))) \
    (verb 7)) (inf (verb 9)) (vg 10) (inf (verb 11)))))) \
    (punct 12))
(top (smain (verb 2) (noun 3) (adv 4) (ppart (np (det 0) \
    (noun 1)) (verb 5))) (punct 6))
(top (conj (smain (np (det 0) (noun 1)) (verb 2) (adj 3) (pp \
    (prep 4) (np (det 5) (noun 6)))) (vg 7) (smain (np (det 8) \
    (noun 9) (pp (prep 10) (np (det 11) (noun 12)))) (verb 13) \
    (pp (prep 14) (np (det 15) (noun 16))))) (punct 17))
(top (conj (smain (noun 0) (verb 1) (inf (ppart (np (noun 2) \
    (noun 3)) (verb 4)) (verb 5))) (vg 6) (smain (noun 7) \
    (inf (ppart (np (det 8) (noun 9)))))) (punct 10))
(A (B1 (t 6) (t 13)) (B2 (t 3) (t 7) (t 10)) (B3 (t 1) \
    (t 9) (t 11) (t 14) (t 16)) (B4 (t 0) (t 5) (t 8)))
(A (B1 6 13) (B2 3 7 10) (B3 1 \
    9 11 14 16) (B4 0 5 8))
(VP (VB 0) (PRT 2))
(VP (VP 0 3) (NP (PRP 1) (NN 2)))
(ROOT (S (VP_2 (PP (APPR 0) (ART 1) (NN 2) (PP (APPR 3) (ART 4) \
    (ADJA 5) (NN 6))) (ADJD 10) (PP (APPR 11) (NN 12)) (VVPP 13)) \
    (VAFIN 7) (NP (ART 8) (NN 9))) ($. 14))'''
    sents = '''Leider stehen diese Fragen nicht im Vordergrund der \
    augenblicklichen Diskussion .
is Mary happy there
das muss man jetzt machen
Of ze had gewoon met haar vriendinnen rond kunnen slenteren in de \
    buurt van Trafalgar Square .
Het had een prachtige dag kunnen zijn in Londen .
Cathy zag hen wild zwaaien .
Het was een spel geworden , zij en haar vriendinnen kozen iemand \
    uit en probeerden zijn of haar nationaliteit te raden .
Elk jaar in het hoogseizoen trokken daar massa's toeristen \
    voorbij , hun fototoestel in de aanslag , pratend , gillend \
    en lachend in de vreemdste talen .
Haar vader stak zijn duim omhoog alsof hij wilde zeggen : " het \
    komt wel goed , joch " .
Ze hadden languit naast elkaar op de strandstoelen kunnen gaan \
    liggen .
Het hoorde bij de warme zomerdag die ze ginds achter had gelaten .
De oprijlaan was niet meer dan een hobbelige zandstrook die zich \
    voortslingerde tussen de hoge grijze boomstammen .
Haar moeder kleefde bijna tegen het autoraampje aan .
Ze veegde de tranen uit haar ooghoeken , tilde haar twee koffers \
    op en begaf zich in de richting van het landhuis .
Het meisje dat vijf keer juist raadde werd getrakteerd op ijs .
Haar neus werd platgedrukt en leek op een jonge champignon .
Cathy zag de BMW langzaam verdwijnen tot hij niet meer was dan \
    een zilveren schijnsel tussen de bomen en struiken .
Ze had met haar moeder kunnen gaan winkelen , zwemmen of \
    terrassen .
Dat werkwoord had ze zelf uitgevonden .
De middagzon hing klein tussen de takken en de schaduwen van de \
    wolken drentelden over het gras .
Zij zou mams rug ingewreven hebben en mam de hare .
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
Mit einer Messe in der Sixtinischen Kapelle ist das Konklave \
    offiziell zu Ende gegangen .'''
    from discodop.tree import DrawTree
    # One Tree per line of the literal, one token list per sentence line.
    trees = [Tree(a) for a in trees.splitlines()]
    sents = [a.split() for a in sents.splitlines()]
    # Two extra sentences with None entries, which are printed as '...'.
    # NOTE(review): zip() stops at the shorter sequence; with as many trees
    # as sentence lines above, these two appear never to be paired with a
    # tree -- confirm.
    sents.extend([['Wake', None, 'up'], [None, 'your', 'friend', None]])
    for n, (tree, sent) in enumerate(zip(trees, sents)):
        drawtree = DrawTree(tree, sent)
        print('\ntree, sent', n, tree,
              ' '.join('...' if a is None else a for a in sent),
              repr(drawtree), sep='\n')
        # Fall back to plain ASCII output when the unicode drawing cannot be
        # encoded or decoded.
        try:
            print(drawtree.text(unicodelines=True, ansi=True), sep='\n')
        except (UnicodeDecodeError, UnicodeEncodeError):
            print(drawtree.text(unicodelines=False, ansi=False), sep='\n')
def parse():
    """Parse one queued sentence and render the n-best result page.

    Meant to be called through AJAX. Reads ``sentno`` (1-indexed) and the
    optional ``require`` / ``block`` constraints from the query string,
    submits the parse job to the current user's worker, and renders
    ``annotatetree.html`` with the best tree, a decision tree over the
    n-best list, timing information and, when the worker has head rules,
    a dependency view.
    """
    sentno = int(request.args.get('sentno'))  # 1-indexed
    sent = SENTENCES[QUEUE[sentno - 1][0]]
    username = session['username']
    rawrequire = request.args.get('require', '')
    rawblock = request.args.get('block', '')

    # Only non-empty constraints are carried over into the generated links.
    urlprm = dict(sentno=sentno)
    if rawrequire:
        urlprm['require'] = rawrequire
    if rawblock:
        urlprm['block'] = rawblock

    require, block = parseconstraints(rawrequire, rawblock)
    if require or block:
        # Record that the annotator made use of constraints.
        session['actions'][CONSTRAINTS] += 1
        session.modified = True

    # Parsing always runs on the per-user worker; the in-process debug
    # shortcut (calling worker.getparses directly) is permanently disabled.
    future = WORKERS[username].submit(worker.getparses, sent, require, block)
    senttok, parsetrees, messages, elapsed = future.result()

    # Defaults used when there is nothing to show.
    result = nbest = dep = depsvg = maxdepth = ''
    if parsetrees:
        if workerattr('headrules'):
            dep = writedependencies(parsetrees[0][1], senttok, 'conll')
            depsvg = Markup(DrawDependencies.fromconll(dep).svg())
        dectree, maxdepth, _ = decisiontree(parsetrees, senttok, urlprm)
        prob, tree, _treestr, _fragments = parsetrees[0]
        besttree = (
            '%(n)d. [%(prob)s] '
            '<a href="/annotate/accept?%(urlprm)s">accept this tree</a>; '
            '<a href="/annotate/edit?%(urlprm)s">edit</a>; '
            '<a href="/annotate/deriv?%(urlprm)s">derivation</a>\n\n'
            '%(tree)s' % dict(
                n=1,
                prob=probstr(prob),
                urlprm=urlencode(dict(urlprm, n=1)),
                tree=DrawTree(tree, senttok).text(
                    unicodelines=True, html=True, funcsep='-',
                    morphsep='/', nodeprops='t1')))
        nbest = Markup('%s\nbest tree: %s' % (dectree, besttree))
    else:
        result = ('no parse! reload page to clear constraints, '
                  'or continue with next sentence.')

    msg = '\n'.join(messages)
    timing = 'CPU time elapsed: %s => %gs' % (
        ' '.join('%gs' % secs for secs in elapsed), sum(elapsed))
    lengthline = 'length: %d;' % len(senttok)
    # Ranks follow the position in the n-best list, so entries without a
    # tree string leave a gap in the numbering.
    listing = ''.join(
        '%d. [%s] %s' % (
            rank, probstr(prob), writediscbrackettree(treestr, senttok))
        for rank, (prob, _tree, treestr, _deriv)
        in enumerate(parsetrees, 1)
        if treestr is not None)
    info = '\n'.join((
        lengthline,
        msg,
        timing,
        'most probable parse trees:',
        listing + '\n'))

    return render_template(
        'annotatetree.html',
        sent=sent,
        result=result,
        nbest=nbest,
        info=info,
        dep=dep,
        depsvg=depsvg,
        maxdepth=maxdepth,
        msg='%d parse trees' % len(parsetrees))
def decisiontree(parsetrees, sent, urlprm):
    """Create a decision tree to select among n trees.

    The class labels are the n-best trees; the attributes are the labeled
    spans occurring in them. Each span splits the n-best trees into the set
    that contains it and the set that does not, so answering "good
    constituent" / "bad constituent" questions narrows down to one tree.

    :param parsetrees: sequence of 4-tuples whose first item is the tree
        probability and whose second item is the tree; the other two items
        are not used here.
    :param sent: sequence of tokens, indexed by the leaf indices of spans.
    :param urlprm: dict of URL parameters; it is copied and extended with
        ``n`` (1-based tree number) and ``dec`` (depth in the decision
        tree) for the links of each candidate tree.
    :returns: a tuple ``(html, maxdepth, path)`` with the HTML for the
        decision tree followed by the hidden tree drawings, the depth of
        the fitted tree, and the result of ``estimator.decision_path``.
        With zero or one parse trees there is nothing to decide and
        ``('', 0, None)`` is returned.
    """
    if len(parsetrees) <= 1:
        return '', 0, None

    # Map each labeled span to the set of n-best indices that contain it.
    spans = {}
    for n, (_prob, tree, _, _) in enumerate(parsetrees):
        for span in getspans(tree):
            # simplest strategy: store presence of span as binary feature
            # perhaps better: use weight from tree probability
            spans.setdefault(span, set()).add(n)

    # create decision tree with scikit-learn
    features = list(spans)
    # NOTE(review): labels and tokens are interpolated into HTML without
    # escaping; confirm they can never contain markup characters.
    featurenames = [
        '[%s %s]' % (label, ' '.join(sent[n] for n in leaves))
        for label, leaves in features]
    # One row per tree, one boolean column per span. Use the builtin
    # ``bool``: the ``np.bool`` alias was deprecated in NumPy 1.20 and
    # removed in NumPy 1.24, where it raises AttributeError.
    data = np.array(
        [[n in spans[span] for span in features]
         for n in range(len(parsetrees))],
        dtype=bool)
    estimator = DecisionTreeClassifier(random_state=0)
    estimator.fit(
        data, range(len(parsetrees)),
        sample_weight=[prob for prob, _, _, _ in parsetrees])
    path = estimator.decision_path(data)

    def rec(tree, n=0, depth=0):
        """Recursively produce a string representation of a decision tree."""
        # Both child pointers are equal only at a leaf (presumably both are
        # scikit-learn's leaf marker); a leaf stands for one n-best tree.
        if tree.children_left[n] == tree.children_right[n]:
            x = tree.value[n].nonzero()[1][0]
            prob, _tree, _treestr, _fragments = parsetrees[x]
            thistree = (
                '%(n)d. [%(prob)s] '
                '<a href="/annotate/accept?%(urlprm)s">accept this tree</a>; '
                '<a href="/annotate/edit?%(urlprm)s">edit</a>; '
                '<a href="/annotate/deriv?%(urlprm)s">derivation</a>\n\n'
                % dict(
                    n=x + 1,
                    prob=probstr(prob),
                    urlprm=urlencode(dict(urlprm, n=x + 1, dec=depth))))
            return ('<span id="d%d" style="display: none; ">%stree %d:\n'
                    '%s</span>' % (n, depth * '\t', x + 1, thistree))
        left = tree.children_left[n]
        right = tree.children_right[n]
        # exleft / exright: index of the first tree whose decision path
        # passes through the respective child; used as the example drawing.
        return ('<span id=d%(n)d style="display: %(display)s; ">'
                '%(indent)s%(constituent)s '
                '<a href="javascript: showhide(\'d%(right)s\', \'d%(left)s\', '
                '\'dd%(exright)s\', \'%(numtrees)s\'); ">'
                'good constituent</a> '
                '<a href="javascript: showhide(\'d%(left)s\', \'d%(right)s\', '
                '\'dd%(exleft)s\', \'%(numtrees)s\'); ">'
                'bad constituent</a> '
                '%(subtree1)s%(subtree2)s</span>' % dict(
                    n=n,
                    display='block' if n == 0 else 'none',
                    indent=depth * 4 * ' ',
                    constituent=featurenames[tree.feature[n]],
                    left=left,
                    right=right,
                    exleft=path[:, left].nonzero()[0][0],
                    exright=path[:, right].nonzero()[0][0],
                    numtrees=len(parsetrees),
                    subtree1=rec(tree, left, depth + 1),
                    subtree2=rec(tree, right, depth + 1),
                ))

    nodes = rec(estimator.tree_)

    # Emit one hidden drawing per distinct tree referenced by any node.
    leaves = []
    seen = set()
    for n in range(estimator.tree_.node_count):
        x = estimator.tree_.value[n].nonzero()[1][0]
        if x in seen:
            continue
        seen.add(x)
        _prob, xtree, _treestr, _fragments = parsetrees[x]
        thistree = DrawTree(xtree, sent).text(
            unicodelines=True, html=True, funcsep='-', morphsep='/',
            nodeprops='t%d' % (x + 1))
        leaves.append('<span id="dd%d" style="display: none; ">%s</span>'
                      % (x, thistree))
    return nodes + ''.join(leaves), estimator.tree_.max_depth, path