def test_mergediscnodes(self):
    tree = Tree.parse(
        '(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4)'
        '(VVPP 5)) (VAINF 6)) (VMFIN 3))', parse_leaf=int)
    assert str(mergediscnodes(splitdiscnodes(tree))) == (
        '(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4) (VVPP 5)) '
        '(VAINF 6)) (VMFIN 3))')
    assert str(mergediscnodes(splitdiscnodes(tree, markorigin=True))) == (
        '(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4) (VVPP 5)) '
        '(VAINF 6)) (VMFIN 3))')
    tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))', parse_leaf=int)
    assert str(mergediscnodes(splitdiscnodes(tree, markorigin=True))) == (
        '(S (X (A 0) (A 2)) (X (A 1) (A 3)))')
    tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))', parse_leaf=int)
    assert str(splitdiscnodes(tree, markorigin=True)) == (
        '(S (X*0 (A 0)) (X*0 (A 1)) (X*1 (A 2)) (X*1 (A 3)))')
    tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))', parse_leaf=int)
    assert str(mergediscnodes(splitdiscnodes(tree))) == (
        '(S (X (A 0) (A 1) (A 2) (A 3)))')
def test_optimalbinarize():
    """Verify that all optimal parsing complexities are lower than or
    equal to the complexities of right-to-left binarizations."""
    from discodop.treetransforms import optimalbinarize, complexityfanout
    from discodop.treebank import NegraCorpusReader
    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    total = violations = violationshd = 0
    for n, (tree, sent) in enumerate(zip(list(
            corpus.trees().values())[:-2000], corpus.sents().values())):
        t = addbitsets(tree)
        if all(fanout(x) == 1 for x in t.subtrees()):
            continue
        print(n, tree, '\n', ' '.join(sent))
        total += 1
        optbin = optimalbinarize(tree.copy(True), headdriven=False,
                h=None, v=1)
        # undo head-ordering to get a normal right-to-left binarization
        normbin = addbitsets(binarize(canonicalize(Tree.convert(tree))))
        if (max(map(complexityfanout, optbin.subtrees()))
                > max(map(complexityfanout, normbin.subtrees()))):
            print('non-hd\n', tree)
            print(max(map(complexityfanout, optbin.subtrees())), optbin)
            print(max(map(complexityfanout, normbin.subtrees())),
                    normbin, '\n')
            violations += 1

        optbin = optimalbinarize(tree.copy(True), headdriven=True, h=1, v=1)
        normbin = addbitsets(binarize(Tree.convert(tree), horzmarkov=1))
        if (max(map(complexityfanout, optbin.subtrees()))
                > max(map(complexityfanout, normbin.subtrees()))):
            print('hd\n', tree)
            print(max(map(complexityfanout, optbin.subtrees())), optbin)
            print(max(map(complexityfanout, normbin.subtrees())),
                    normbin, '\n')
            violationshd += 1
    print('opt. bin. violations normal: %d / %d; hd: %d / %d' % (
            violations, total, violationshd, total))
    assert violations == violationshd == 0
def brackettree(treestr, sent, brackets, strtermre):
    """Parse a single tree in bracket format, with or without indices
    as terminals; ``sent`` may be None or empty."""
    if strtermre.search(treestr):  # terminals are not all indices
        treestr = FRONTIERNTRE.sub(' ...)', treestr)
        sent = TERMINALSRE.findall(treestr)
        cnt = count()
        tree = Tree.parse(treestr, brackets=brackets,
                parse_leaf=lambda x: next(cnt))
    else:  # disc. trees with integer indices as terminals
        tree = Tree.parse(treestr, parse_leaf=int, brackets=brackets)
        # guard against sent being None, as the docstring allows
        sent = (sent.split() if sent and sent.strip()
                else map(str, range(max(tree.leaves()) + 1)))
    return tree, sent
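# Usage sketch for brackettree() above (not part of the original module).
# The module-level regexes it relies on are assumed to look roughly like
# the following; treat them as illustrative stand-ins, not the module's
# actual definitions:
#
#   FRONTIERNTRE = re.compile(r' \)')          # frontier nonterminal '(NP )'
#   TERMINALSRE = re.compile(r' ([^ ()]+)\)')  # word directly before a ')'
#   strtermre matches any terminal that is not an integer index.
#
# With a tree over words, the words are moved to sent and the leaves are
# replaced by indices:
#
#   >>> tree, sent = brackettree('(S (NP (DT the) (NN dog)) (VP (VB barks)))',
#   ...         sent='', brackets='()', strtermre=re.compile(r' [^ ()0-9]'))
#   >>> print(tree, list(sent))
#   (S (NP (DT 0) (NN 1)) (VP (VB 2))) ['the', 'dog', 'barks']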
def test_fragments():
    from discodop._fragments import getctrees, extractfragments, exactcounts
    treebank = """\
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (JJ 4) (NN 5))))\t\
The cat saw the hungry dog
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (NN 4))))\t\
The cat saw the dog
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (NN 4))))\t\
The mouse saw the cat
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (JJ 4) (NN 5))))\t\
The mouse saw the yellow cat
(S (NP (DT 0) (JJ 1) (NN 2)) (VP (VBP 3) (NP (DT 4) (NN 5))))\t\
The little mouse saw the cat
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (NN 4))))\t\
The cat ate the dog
(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) (NN 4))))\t\
The mouse ate the cat""".splitlines()
    trees = [binarize(Tree(line.split('\t')[0])) for line in treebank]
    sents = [line.split('\t')[1].split() for line in treebank]
    for tree in trees:
        for n, idx in enumerate(tree.treepositions('leaves')):
            tree[idx] = n
    params = getctrees(zip(trees, sents))
    fragments = extractfragments(params['trees1'], 0, 0, params['vocab'],
            disc=True, approx=False)
    counts = exactcounts(list(fragments.values()),
            params['trees1'], params['trees1'])
    assert len(fragments) == 25
    assert sum(counts) == 100
def decorate(self, tree, sent):
    """Return a copy of tree with labels decorated with IDs.

    >>> d = TreeDecorator()
    >>> tree = Tree.parse("(S (NP (DT 0) (N 1)) (VP 2))", parse_leaf=int)
    >>> d.decorate(tree, ['the', 'dog', 'walks'])
    ... # doctest: +NORMALIZE_WHITESPACE
    Tree('S', [Tree('NP@1-0', [Tree('DT@1-1', [0]), Tree('N@1-2', [1])]),
        Tree('VP@1-3', [2])])
    >>> d = TreeDecorator(memoize=True)
    >>> print(d.decorate(Tree.parse("(S (NP (DT 0) (N 1)) (VP 2))",
    ...         parse_leaf=int), ['the', 'dog', 'walks']))
    (S (NP@1-1 (DT@1-2 0) (N@1-3 1)) (VP@1-4 2))
    >>> print(d.decorate(Tree.parse("(S (NP (DT 0) (N 1)) (VP 2))",
    ...         parse_leaf=int), ['the', 'dog', 'barks']))
    (S (NP@1-1 (DT@1-2 0) (N@1-3 1)) (VP@2-4 2))"""
    if self.memoize:
        self.ids = 0
        # wrap tree to get equality wrt sent
        tree = DiscTree(tree.freeze(), sent)
        dectree = ImmutableTree(tree.label, map(self._recdecorate, tree))
    else:
        dectree = Tree.convert(tree.copy(True))
        # skip top node, should not get an ID
        for m, a in enumerate(islice(dectree.subtrees(), 1, None)):
            a.label = '%s@%d-%d' % (a.label, self.n, m)
    self.n += 1
    return dectree
def __init__(self, tree, sent=None, highlight=(), abbr=False):
    self.tree = tree
    self.sent = sent
    if isinstance(tree, basestring):
        self.tree = Tree.parse(tree,
                parse_leaf=None if sent is None else int)
    if sent is None:
        leaves = self.tree.leaves()
        if (leaves and not any(len(a) == 0 for a in self.tree.subtrees())
                and all(isinstance(a, int) for a in leaves)):
            self.sent = [str(a) for a in leaves]
        else:
            # this deals with empty nodes (frontier non-terminals)
            # and multiple/mixed terminals under non-terminals.
            self.tree = self.tree.copy(True)
            self.sent = []
            for a in self.tree.subtrees():
                if len(a) == 0:
                    a.append(len(self.sent))
                    self.sent.append(None)
                elif any(not isinstance(b, Tree) for b in a):
                    for n, b in enumerate(a):
                        if not isinstance(b, Tree):
                            a[n] = len(self.sent)
                            self.sent.append('%s' % b)
    if abbr:
        if self.tree is tree:
            self.tree = self.tree.copy(True)
        for n in self.tree.subtrees(lambda x: len(x.label) > 5):
            n.label = n.label[:4] + u'\u2026'  # unicode '...' ellipsis
    self.highlight = set()
    self.nodes, self.coords, self.edges = self.nodecoords(
            self.tree, self.sent, highlight)
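# Usage sketch (not part of the original module): DrawTree accepts either a
# Tree object or a bracketed string; with integer leaves, the words go in a
# separate token list, as exercised by test_treedraw() below.
#
#   >>> from discodop.tree import DrawTree
#   >>> print(DrawTree('(S (NP (DT 0) (NN 1)) (VP (VB 2)))',
#   ...         ['the', 'dog', 'barks']).text(unicodelines=True))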
def test_binarize(self):
    treestr = '(S (VP (PDS 0) (ADV 3) (VVINF 4)) (VMFIN 1) (PIS 2))'
    origtree = Tree(treestr)
    tree = Tree(treestr)
    tree[1].type = HEAD  # set VMFIN as head
    assert str(binarize(tree, horzmarkov=0)) == (
        '(S (VP (PDS 0) (VP|<> (ADV 3) (VVINF 4))) (S|<> (VMFIN 1)'
        ' (PIS 2)))')
    assert unbinarize(tree) == origtree
    assert str(binarize(tree, horzmarkov=1)) == (
        '(S (VP (PDS 0) (VP|<ADV> (ADV 3) (VVINF 4))) (S|<VMFIN> '
        '(VMFIN 1) (PIS 2)))')
    assert unbinarize(tree) == origtree
    assert str(binarize(tree, horzmarkov=1, leftmostunary=False,
            rightmostunary=True, headoutward=True)) == (
        '(S (VP (PDS 0) (VP|<ADV> (ADV 3) (VP|<VVINF> (VVINF 4)))) '
        '(S|<VMFIN> (S|<VMFIN> (VMFIN 1)) (PIS 2)))')
    assert unbinarize(tree) == origtree
    assert str(binarize(tree, horzmarkov=1, leftmostunary=True,
            rightmostunary=False, headoutward=True)) == (
        '(S (S|<VP> (VP (VP|<PDS> (PDS 0) (VP|<ADV> (ADV 3) '
        '(VVINF 4)))) (S|<VMFIN> (VMFIN 1) (PIS 2))))')
    assert unbinarize(tree) == origtree
    assert str(binarize(tree, factor='left', horzmarkov=2,
            headoutward=True)) == (
        '(S (S|<VMFIN,PIS> (VP (VP|<PDS,ADV> (PDS 0) (ADV 3)) '
        '(VVINF 4)) (VMFIN 1)) (PIS 2))')
    assert unbinarize(tree) == origtree
    tree = Tree('(S (A 0) (B 1) (C 2) (D 3) (E 4) (F 5))')
    assert str(binarize(tree, headoutward=True)) == (
        '(S (A 0) (S|<B,C,D,E,F> (B 1) (S|<C,D,E,F> (C 2) (S|<D,E,F> '
        '(D 3) (S|<E,F> (E 4) (F 5))))))')
def optimalbinarize(tree, sep='|', headdriven=False, h=None, v=1):
    """Recursively binarize a tree, optimizing for complexity.

    v=0 is not implemented. Setting h to a nonzero integer restricts the
    possible binarizations to head-driven binarizations."""
    if h is None:
        tree = Tree.convert(tree)
        for a in list(tree.subtrees(lambda x: len(x) > 1))[::-1]:
            a.sort(key=lambda x: x.leaves())
    return recbinarizetree(addbitsets(tree), sep, headdriven, h or 999, v, ())
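# Usage sketch (not part of the original module), mirroring how
# optimalbinarize() is called in test_optimalbinarize() and
# dobinarization() elsewhere in this code:
#
#   from discodop.tree import Tree
#   tree = Tree.parse(
#           '(S (VP (PDS 0) (ADV 3) (VVINF 4)) (VMFIN 1) (PIS 2))',
#           parse_leaf=int)
#   print(optimalbinarize(tree))  # minimize parsing complexity
#   print(optimalbinarize(tree.copy(True), headdriven=True, h=1, v=1))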
def test(): """ Simple demonstration. """ a = Tree.parse("(f (d (a 0) (c (b 1))) (e 2))", parse_leaf=int) b = Tree.parse("(f (c (d (a 0) (b 1)) (e 2)))", parse_leaf=int) result1 = treedist(a, b, debug=True) assert result1 == 2 print('%s\n%s\ndistance: %d' % (a, b, result1)) result2 = newtreedist(a, b, debug=True) assert result2 == 2 print('%s\n%s\ndistance: %d' % (a, b, result2)) a = Tree.parse("(f (d (x (a 0)) (b 1) (c 2)) (z 3))", parse_leaf=int) b = Tree.parse("(f (c (d (a 0) (x (b 1)) (c 2)) (z 3)))", parse_leaf=int) result1 = treedist(a, b, debug=True) assert result1 == 3 print('%s\n%s\ndistance: %d' % (a, b, result1)) result2 = newtreedist(a, b, debug=True) assert result2 == 3 print('%s\n%s\ndistance: %d' % (a, b, result2))
def dobinarization(trees, sents, binarization, relationalrealizational):
    """Apply binarization."""
    # fixme: this n should correspond to sentence id
    tbfanout, n = treebank.treebankfanout(trees)
    logging.info('treebank fan-out before binarization: %d #%d\n%s\n%s',
            tbfanout, n, trees[n], ' '.join(sents[n]))
    # binarization
    begin = time.clock()
    msg = 'binarization: %s' % binarization.method
    if binarization.fanout_marks_before_bin:
        trees = [treetransforms.addfanoutmarkers(t) for t in trees]
    if binarization.method is None:
        pass
    elif binarization.method == 'default':
        msg += ' %s h=%d v=%d %s' % (
                binarization.factor, binarization.h, binarization.v,
                'tailmarker' if binarization.tailmarker else '')
        for a in trees:
            treetransforms.binarize(a, factor=binarization.factor,
                    tailmarker=binarization.tailmarker,
                    horzmarkov=binarization.h,
                    vertmarkov=binarization.v,
                    leftmostunary=binarization.leftmostunary,
                    rightmostunary=binarization.rightmostunary,
                    reverse=binarization.revmarkov,
                    headidx=-1 if binarization.markhead else None,
                    filterfuncs=(relationalrealizational['ignorefunctions']
                        + (relationalrealizational['adjunctionlabel'], ))
                        if relationalrealizational else (),
                    labelfun=binarization.labelfun)
    elif binarization.method == 'optimal':
        trees = [Tree.convert(treetransforms.optimalbinarize(tree))
                for n, tree in enumerate(trees)]
    elif binarization.method == 'optimalhead':
        msg += ' h=%d v=%d' % (binarization.h, binarization.v)
        trees = [Tree.convert(treetransforms.optimalbinarize(
                tree, headdriven=True, h=binarization.h, v=binarization.v))
                for n, tree in enumerate(trees)]
    trees = [treetransforms.addfanoutmarkers(t) for t in trees]
    logging.info('%s; cpu time elapsed: %gs', msg, time.clock() - begin)
    trees = [treetransforms.canonicalize(a).freeze() for a in trees]
    return trees
def test_grammar(debug=False):
    """Demonstrate grammar extraction."""
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers
    from discodop.disambiguation import getderivations, marginalize
    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(corpus.sents().values())
    trees = [addfanoutmarkers(binarize(a.copy(True), horzmarkov=1))
            for a in list(corpus.trees().values())[:10]]
    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')
    grammar = Grammar(dopreduction(trees[:2], sents[:2])[0],
            start=trees[0].label)
    if debug:
        print(grammar)
    _ = grammar.testgrammar()

    grammarx, _backtransform, _, _ = doubledop(trees, sents,
            debug=False, numproc=1)
    if debug:
        print('\ndouble dop grammar')
    grammar = Grammar(grammarx, start=trees[0].label)
    grammar.getmapping(None, striplabelre=None,
            neverblockre=re.compile('^#[0-9]+|.+}<'),
            splitprune=False, markorigin=False)
    if debug:
        print(grammar)
    result, msg = grammar.testgrammar()
    assert result, 'RFE should sum to 1.\n%s' % msg
    for tree, sent in zip(corpus.trees().values(), sents):
        if debug:
            print('sentence:', ' '.join(a.encode('unicode-escape').decode()
                    for a in sent))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        if debug:
            print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')
        if chart:
            getderivations(chart, 100)
            _parses, _msg = marginalize('mpp', chart)
        elif debug:
            print('no parse\n', chart)
    if debug:
        print()
    tree = Tree.parse('(ROOT (S (F (E (S (C (B (A 0))))))))', parse_leaf=int)
    Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))
def noparse(self, stage, sent, tags, lastsuccessfulparse):
    """Return parse from previous stage or a dummy parse."""
    # use successful parse from earlier stage if available
    if lastsuccessfulparse is not None:
        parsetree = lastsuccessfulparse.copy(True)
    else:
        # Produce a dummy parse for evaluation purposes.
        default = defaultparse([(n, t) for n, t in enumerate(
                tags or (len(sent) * ['NONE']))])
        parsetree = Tree.parse('(%s %s)' % (stage.grammar.start, default),
                parse_leaf=int)
    noparse = True
    prob = 1.0
    return parsetree, prob, noparse
def get_subtree(self, nt):
    """Return a derivation subtree.

    Parameters
    ----------
    nt : str
        The nonterminal to start with.

    Returns
    -------
    Tree
        The tree with the current node as root.
    """
    edge = self.get_witness(nt)[0]
    if edge is None:
        raise ValueError('There is no witness for %s' % nt)
    if not edge.get_successors():
        stdout.flush()
        return Tree(edge.get_nonterminal(), [self.get_label()[0]])
    else:
        s = edge.get_successors()
        return Tree(edge.get_nonterminal(),
                [t.get_subtree(n) for t, n in s])
def postprocess(self, treestr, stage=-1):
    """Take parse tree and apply postprocessing."""
    parsetree = Tree.parse(treestr, parse_leaf=int)
    if self.stages[stage].split:
        mergediscnodes(unbinarize(parsetree, childchar=':',
                expandunary=False))
    saveheads(parsetree, self.binarization.tailmarker)
    unbinarize(parsetree, expandunary=False)
    removefanoutmarkers(parsetree)
    if self.relationalrealizational:
        parsetree = rrbacktransform(parsetree,
                self.relationalrealizational['adjunctionlabel'])
    if self.transformations:
        reversetransform(parsetree, self.transformations)
    return parsetree, False
def postprocess(self, treestr, stage=-1, derivs=None):
    """Take parse tree and apply postprocessing."""
    parsetree = Tree.parse(treestr, parse_leaf=int)
    if self.stages[stage].split:
        mergediscnodes(unbinarize(parsetree, childchar=':'))
    saveheads(parsetree, self.tailmarker)
    unbinarize(parsetree)
    removefanoutmarkers(parsetree)
    if self.relationalrealizational:
        parsetree = rrbacktransform(parsetree,
                self.relationalrealizational['adjunctionlabel'])
    if self.transformations:
        reversetransform(parsetree, self.transformations)
    fragments = derivs.get(treestr) if derivs else None
    return parsetree, fragments, False
def trees(self, query, subset=None, maxresults=10,
        nofunc=False, nomorph=False):
    subset = subset or self.files
    # %s the sentence number
    # %w complete tree in bracket notation
    # %h the matched subtree in bracket notation
    fmt = r'%s:::%w:::%h\n'
    result = []
    jobs = {}
    for filename in subset:
        try:
            x, maxresults2 = self.cache[
                    'trees', query, filename, nofunc, nomorph]
        except KeyError:
            maxresults2 = 0
        if not maxresults or maxresults > maxresults2:
            jobs[self._submit(lambda x: list(self._query(
                    query, x, fmt, maxresults)), filename)] = filename
        else:
            result.extend(x[:maxresults])
    for future in self._as_completed(jobs):
        filename = jobs[future]
        x = []
        for sentno, line in future.result():
            treestr, match = line.split(':::')
            treestr = filterlabels(treestr, nofunc, nomorph)
            treestr = treestr.replace(" )", " -NONE-)")
            cnt = count()
            if match.startswith('('):
                treestr = treestr.replace(match, '%s_HIGH %s' % tuple(
                        match.split(None, 1)), 1)
            else:
                match = ' %s)' % match
                treestr = treestr.replace(match, '_HIGH%s' % match)
            tree = Tree.parse(treestr, parse_leaf=lambda _: next(cnt))
            sent = re.findall(r" +([^ ()]+)(?=[ )])", treestr)
            high = list(tree.subtrees(lambda n: n.label.endswith("_HIGH")))
            if high:
                high = high.pop()
                high.label = high.label.rsplit("_", 1)[0]
                high = list(high.subtrees()) + high.leaves()
            x.append((filename, sentno, tree, sent, high))
        self.cache['trees', query, filename, nofunc, nomorph] = (
                x, maxresults)
        result.extend(x)
    return result
def test_balancedpunctraise(self):
    tree = ParentedTree.parse('(ROOT ($, 3) ($[ 7) ($[ 13) ($, 14) ($, 20)'
            ' (S (NP (ART 0) (ADJA 1) (NN 2) (NP (CARD 4) (NN 5) (PP'
            ' (APPR 6) (CNP (NN 8) (ADV 9) (ISU ($. 10) ($. 11)'
            ' ($. 12))))) (S (PRELS 15) (MPN (NE 16) (NE 17)) (ADJD 18)'
            ' (VVFIN 19))) (VVFIN 21) (ADV 22) (NP (ADJA 23) (NN 24)))'
            ' ($. 25))', parse_leaf=int)
    sent = ("Die zweite Konzertreihe , sechs Abende mit ' Orgel plus "
            ". . . ' , die Hayko Siemens musikalisch leitet , bietet "
            "wieder ungewoehnliche Kombinationen .".split())
    punctraise(tree, sent)
    balancedpunctraise(tree, sent)
    assert max(map(fanout, addbitsets(tree).subtrees())) == 1
    nopunct = Tree.parse('(ROOT (S (NP (ART 0) (ADJA 1) (NN 2) (NP '
            '(CARD 3) (NN 4) (PP (APPR 5) (CNP (NN 6) (ADV 7)))) (S '
            '(PRELS 8) (MPN (NE 9) (NE 10)) (ADJD 11) (VVFIN 12))) '
            '(VVFIN 13) (ADV 14) (NP (ADJA 15) (NN 16))))', parse_leaf=int)
    assert max(map(fanout, addbitsets(nopunct).subtrees())) == 1
cp = ConfigParser()
cp.read(argv[1])
config = corpusparam(**cp["Corpus"], **cp["Grammar"])

from discodop.tree import Tree
from discodop.treebank import READERS
from discodop.treetransforms import addfanoutmarkers, binarize, collapseunary
from discodop.lexgrammar import SupertagCorpus, SupertagGrammar

corpus = READERS[config.inputfmt](config.filename,
        encoding=config.inputenc, punct="move")
trees = [addfanoutmarkers(binarize(
            collapseunary(Tree.convert(t), collapseroot=True,
                collapsepos=True),
            horzmarkov=config.h, vertmarkov=config.v))
        for t in corpus.trees().values()]
sents = list(corpus.sents().values())

corpus = SupertagCorpus(trees, sents)

size = len(corpus.sent_corpus)
portions = config.split.split()
names = "train dev test".split()
assert len(portions) in [3, 4]

if portions[0] == "debug":
    portions = tuple(int(portion) for portion in portions[1:2] + portions[1:])
    limits = tuple((name, slice(0, end))
            for name, end in zip(names, portions))
else:
def reattach():
    """Re-draw tree after re-attaching node under new parent."""
    sentno = int(request.args.get('sentno'))  # 1-indexed
    sent = SENTENCES[QUEUE[sentno - 1][0]]
    senttok, _ = worker.postokenize(sent)
    treestr = request.args.get('tree', '')
    try:
        tree, _sent1 = validate(treestr, senttok)
    except ValueError as err:
        return str(err)
    dt = DrawTree(tree, senttok)
    error = ''
    if request.args.get('newparent') == 'deletenode':
        # remove nodeid by replacing it with its children
        _treeid, nodeid = request.args.get(
                'nodeid', '').lstrip('t').split('_')
        nodeid = int(nodeid)
        x = dt.nodes[nodeid]
        if nodeid == 0 or isinstance(x[0], int):
            error = 'ERROR: cannot remove ROOT or POS node'
        else:
            children = list(x)
            x[:] = []
            for y in dt.nodes[0].subtrees():
                if any(child is x for child in y):
                    i = y.index(x)
                    y[i:i + 1] = children
                    tree = canonicalize(dt.nodes[0])
                    dt = DrawTree(tree, senttok)  # kludge..
                    break
    elif request.args.get('nodeid', '').startswith('newlabel_'):
        # splice in a new node under parentid
        _treeid, newparent = request.args.get(
                'newparent', '').lstrip('t').split('_')
        newparent = int(newparent)
        label = request.args.get('nodeid').split('_', 1)[1]
        y = dt.nodes[newparent]
        if isinstance(y[0], int):
            error = 'ERROR: cannot add node under POS tag'
        else:
            children = list(y)
            y[:] = []
            y[:] = [Tree(label, children)]
            tree = canonicalize(dt.nodes[0])
            dt = DrawTree(tree, senttok)  # kludge..
    else:
        # re-attach existing node at existing new parent
        _treeid, nodeid = request.args.get(
                'nodeid', '').lstrip('t').split('_')
        nodeid = int(nodeid)
        _treeid, newparent = request.args.get(
                'newparent', '').lstrip('t').split('_')
        newparent = int(newparent)
        # remove node from old parent
        # dt.nodes[nodeid].parent.pop(dt.nodes[nodeid].parent_index)
        x = dt.nodes[nodeid]
        y = dt.nodes[newparent]
        for node in x.subtrees():
            if node is y:
                error = ('ERROR: cannot re-attach subtree'
                        ' under (descendant of) itself\n')
                break
        else:
            for node in dt.nodes[0].subtrees():
                if any(child is x for child in node):
                    if len(node) > 1:
                        node.remove(x)
                        dt.nodes[newparent].append(x)
                        tree = canonicalize(dt.nodes[0])
                        dt = DrawTree(tree, senttok)  # kludge..
                    else:
                        error = ('ERROR: re-attaching only child creates'
                                ' empty node %s; remove manually\n' % node)
                    break
    treestr = writediscbrackettree(tree, senttok, pretty=True).rstrip()
    link = ('<a href="/annotate/accept?%s">accept this tree</a>'
            % urlencode(dict(sentno=sentno, tree=treestr)))
    if error == '':
        session['actions'][REATTACH] += 1
        session.modified = True
    return Markup('%s\n\n%s%s\t%s' % (
            link, error,
            dt.text(unicodelines=True, html=True, funcsep='-',
                morphsep='/', nodeprops='t0'),
            treestr))
def test_treedraw():
    """Draw some trees. Only tests whether no exception occurs."""
    trees = '''(ROOT (S (ADV 0) (VVFIN 1) (NP (PDAT 2) (NN 3)) (PTKNEG 4) \
(PP (APPRART 5) (NN 6) (NP (ART 7) (ADJA 8) (NN 9)))) ($. 10))
(S (NP (NN 1) (EX 3)) (VP (VB 0) (JJ 2)))
(S (VP (PDS 0) (ADV 3) (VVINF 4)) (PIS 2) (VMFIN 1))
(top (du (comp 0) (smain (noun 1) (verb 2) (inf (verb 8) (inf \
(adj 3) (pp (prep 4) (np (det 5) (noun 6))) (part 7) (verb 9) \
(pp (prep 10) (np (det 11) (noun 12) (pp (prep 13) (mwu \
(noun 14) (noun 15))))))))) (punct 16))
(top (smain (noun 0) (verb 1) (inf (verb 5) (inf (np (det 2) \
(adj 3) (noun 4)) (verb 6) (pp (prep 7) (noun 8))))) (punct 9))
(top (smain (noun 0) (verb 1) (noun 2) (inf (adv 3) (verb 4))) \
(punct 5))
(top (punct 5) (du (smain (noun 0) (verb 1) (ppart (np (det 2) \
(noun 3)) (verb 4))) (conj (sv1 (conj (noun 6) (vg 7) (np \
(det 8) (noun 9))) (verb 10) (noun 11) (part 12)) (vg 13) \
(sv1 (verb 14) (ti (comp 19) (inf (np (conj (det 15) (vg 16) \
(det 17)) (noun 18)) (verb 20)))))) (punct 21))
(top (punct 10) (punct 16) (punct 18) (smain (np (det 0) (noun 1) \
(pp (prep 2) (np (det 3) (noun 4)))) (verb 5) (adv 6) (np \
(noun 7) (noun 8)) (part 9) (np (det 11) (noun 12) (pp \
(prep 13) (np (det 14) (noun 15)))) (conj (vg 20) (ppres \
(adj 17) (pp (prep 22) (np (det 23) (adj 24) (noun 25)))) \
(ppres (adj 19)) (ppres (adj 21)))) (punct 26))
(top (punct 10) (punct 11) (punct 16) (smain (np (det 0) \
(noun 1)) (verb 2) (np (det 3) (noun 4)) (adv 5) (du (cp \
(comp 6) (ssub (noun 7) (verb 8) (inf (verb 9)))) (du \
(smain (noun 12) (verb 13) (adv 14) (part 15)) (noun 17)))) \
(punct 18) (punct 19))
(top (smain (noun 0) (verb 1) (inf (verb 8) (inf (verb 9) (inf \
(adv 2) (pp (prep 3) (noun 4)) (pp (prep 5) (np (det 6) \
(noun 7))) (verb 10))))) (punct 11))
(top (smain (noun 0) (verb 1) (pp (prep 2) (np (det 3) (adj 4) \
(noun 5) (rel (noun 6) (ssub (noun 7) (verb 10) (ppart \
(adj 8) (part 9) (verb 11))))))) (punct 12))
(top (smain (np (det 0) (noun 1)) (verb 2) (ap (adv 3) (num 4) \
(cp (comp 5) (np (det 6) (adj 7) (noun 8) (rel (noun 9) (ssub \
(noun 10) (verb 11) (pp (prep 12) (np (det 13) (adj 14) \
(adj 15) (noun 16))))))))) (punct 17))
(top (smain (np (det 0) (noun 1)) (verb 2) (adv 3) (pp (prep 4) \
(np (det 5) (noun 6)) (part 7))) (punct 8))
(top (punct 7) (conj (smain (noun 0) (verb 1) (np (det 2) \
(noun 3)) (pp (prep 4) (np (det 5) (noun 6)))) (smain \
(verb 8) (np (det 9) (num 10) (noun 11)) (part 12)) (vg 13) \
(smain (verb 14) (noun 15) (pp (prep 16) (np (det 17) \
(noun 18) (pp (prep 19) (np (det 20) (noun 21))))))) \
(punct 22))
(top (smain (np (det 0) (noun 1) (rel (noun 2) (ssub (np (num 3) \
(noun 4)) (adj 5) (verb 6)))) (verb 7) (ppart (verb 8) (pp \
(prep 9) (noun 10)))) (punct 11))
(top (conj (sv1 (np (det 0) (noun 1)) (verb 2) (ppart (verb 3))) \
(vg 4) (sv1 (verb 5) (pp (prep 6) (np (det 7) (adj 8) \
(noun 9))))) (punct 10))
(top (smain (noun 0) (verb 1) (np (det 2) (noun 3)) (inf (adj 4) \
(verb 5) (cp (comp 6) (ssub (noun 7) (adv 8) (verb 10) (ap \
(num 9) (cp (comp 11) (np (det 12) (adj 13) (noun 14) (pp \
(prep 15) (conj (np (det 16) (noun 17)) (vg 18) (np \
(noun 19))))))))))) (punct 20))
(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) \
(inf (verb 6) (conj (inf (pp (prep 2) (np (det 3) (noun 4))) \
(verb 7)) (inf (verb 9)) (vg 10) (inf (verb 11)))))) \
(punct 12))
(top (smain (verb 2) (noun 3) (adv 4) (ppart (np (det 0) \
(noun 1)) (verb 5))) (punct 6))
(top (conj (smain (np (det 0) (noun 1)) (verb 2) (adj 3) (pp \
(prep 4) (np (det 5) (noun 6)))) (vg 7) (smain (np (det 8) \
(noun 9) (pp (prep 10) (np (det 11) (noun 12)))) (verb 13) \
(pp (prep 14) (np (det 15) (noun 16))))) (punct 17))
(top (conj (smain (noun 0) (verb 1) (inf (ppart (np (noun 2) \
(noun 3)) (verb 4)) (verb 5))) (vg 6) (smain (noun 7) \
(inf (ppart (np (det 8) (noun 9)))))) (punct 10))
(A (B1 (t 6) (t 13)) (B2 (t 3) (t 7) (t 10)) (B3 (t 1) \
(t 9) (t 11) (t 14) (t 16)) (B4 (t 0) (t 5) (t 8)))
(A (B1 6 13) (B2 3 7 10) (B3 1 \
9 11 14 16) (B4 0 5 8))
(VP (VB 0) (PRT 2))
(VP (VP 0 3) (NP (PRP 1) (NN 2)))
(ROOT (S (VP_2 (PP (APPR 0) (ART 1) (NN 2) (PP (APPR 3) (ART 4) \
(ADJA 5) (NN 6))) (ADJD 10) (PP (APPR 11) (NN 12)) (VVPP 13)) \
(VAFIN 7) (NP (ART 8) (NN 9))) ($. 14))'''
    sents = '''Leider stehen diese Fragen nicht im Vordergrund der \
augenblicklichen Diskussion .
is Mary happy there
das muss man jetzt machen
Of ze had gewoon met haar vriendinnen rond kunnen slenteren in de \
buurt van Trafalgar Square .
Het had een prachtige dag kunnen zijn in Londen .
Cathy zag hen wild zwaaien .
Het was een spel geworden , zij en haar vriendinnen kozen iemand \
uit en probeerden zijn of haar nationaliteit te raden .
Elk jaar in het hoogseizoen trokken daar massa's toeristen \
voorbij , hun fototoestel in de aanslag , pratend , gillend \
en lachend in de vreemdste talen .
Haar vader stak zijn duim omhoog alsof hij wilde zeggen : " het \
komt wel goed , joch " .
Ze hadden languit naast elkaar op de strandstoelen kunnen gaan \
liggen .
Het hoorde bij de warme zomerdag die ze ginds achter had gelaten .
De oprijlaan was niet meer dan een hobbelige zandstrook die zich \
voortslingerde tussen de hoge grijze boomstammen .
Haar moeder kleefde bijna tegen het autoraampje aan .
Ze veegde de tranen uit haar ooghoeken , tilde haar twee koffers \
op en begaf zich in de richting van het landhuis .
Het meisje dat vijf keer juist raadde werd getrakteerd op ijs .
Haar neus werd platgedrukt en leek op een jonge champignon .
Cathy zag de BMW langzaam verdwijnen tot hij niet meer was dan \
een zilveren schijnsel tussen de bomen en struiken .
Ze had met haar moeder kunnen gaan winkelen , zwemmen of \
terrassen .
Dat werkwoord had ze zelf uitgevonden .
De middagzon hing klein tussen de takken en de schaduwen van de \
wolken drentelden over het gras .
Zij zou mams rug ingewreven hebben en mam de hare .
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
Mit einer Messe in der Sixtinischen Kapelle ist das Konklave \
offiziell zu Ende gegangen .'''
    from discodop.tree import DrawTree
    trees = [Tree(a) for a in trees.splitlines()]
    sents = [a.split() for a in sents.splitlines()]
    sents.extend([['Wake', None, 'up'], [None, 'your', 'friend', None]])
    for n, (tree, sent) in enumerate(zip(trees, sents)):
        drawtree = DrawTree(tree, sent)
        print('\ntree, sent', n, tree,
                ' '.join('...' if a is None else a for a in sent),
                repr(drawtree), sep='\n')
        try:
            print(drawtree.text(unicodelines=True, ansi=True), sep='\n')
        except (UnicodeDecodeError, UnicodeEncodeError):
            print(drawtree.text(unicodelines=False, ansi=False), sep='\n')
def getgrammars(trees, sents, stages, testmaxwords, resultdir,
        numproc, lexmodel, simplelexsmooth, top):
    """Read off the requested grammars."""
    tbfanout, n = treebank.treebankfanout(trees)
    logging.info('binarized treebank fan-out: %d #%d', tbfanout, n)
    for n, stage in enumerate(stages):
        if stage.split:
            traintrees = [treetransforms.binarize(
                    treetransforms.splitdiscnodes(
                        Tree.convert(a), stage.markorigin),
                    childchar=':', dot=True,
                    ids=grammar.UniqueIDs()).freeze()
                    for a in trees]
            logging.info('splitted discontinuous nodes')
        else:
            traintrees = trees
        if stage.mode.startswith('pcfg'):
            if tbfanout != 1 and not stage.split:
                raise ValueError('Cannot extract PCFG from treebank '
                        'with discontinuities.')
        backtransform = extrarules = None
        if lexmodel and simplelexsmooth:
            extrarules = lexicon.simplesmoothlexicon(lexmodel)
        if stage.dop:
            if stage.dop == 'doubledop':
                (xgrammar, backtransform, altweights, fragments
                        ) = grammar.doubledop(
                        traintrees, sents, binarized=stage.binarized,
                        iterate=stage.iterate, complement=stage.complement,
                        numproc=numproc, extrarules=extrarules)
                # dump fragments
                with codecs.getwriter('utf-8')(gzip.open(
                        '%s/%s.fragments.gz' % (resultdir, stage.name),
                        'w')) as out:
                    out.writelines('%s\t%d\n' % (treebank.writetree(
                            a, b, 0, 'bracket'
                            if stage.mode.startswith('pcfg')
                            else 'discbracket').rstrip(), sum(c.values()))
                            for (a, b), c in fragments.items())
            elif stage.dop == 'reduction':
                xgrammar, altweights = grammar.dopreduction(
                        traintrees, sents, packedgraph=stage.packedgraph,
                        extrarules=extrarules)
            else:
                raise ValueError('unrecognized DOP model: %r' % stage.dop)
            nodes = sum(len(list(a.subtrees())) for a in traintrees)
            if lexmodel and not simplelexsmooth:  # FIXME: altweights?
                xgrammar = lexicon.smoothlexicon(xgrammar, lexmodel)
            msg = grammar.grammarinfo(xgrammar)
            rules, lex = grammar.write_lcfrs_grammar(
                    xgrammar, bitpar=stage.mode.startswith('pcfg'))
            gram = Grammar(rules, lex, start=top,
                    bitpar=stage.mode.startswith('pcfg'),
                    binarized=stage.binarized)
            for name in altweights:
                gram.register(u'%s' % name, altweights[name])
            with gzip.open('%s/%s.rules.gz' % (
                    resultdir, stage.name), 'wb') as rulesfile:
                rulesfile.write(rules)
            with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
                    resultdir, stage.name), 'wb')) as lexiconfile:
                lexiconfile.write(lex)
            logging.info('DOP model based on %d sentences, %d nodes, '
                    '%d nonterminals',
                    len(traintrees), nodes, len(gram.toid))
            logging.info(msg)
            if stage.estimator != 'rfe':
                gram.switch(u'%s' % stage.estimator)
            logging.info(gram.testgrammar()[1])
            if stage.dop == 'doubledop':
                # backtransform keys are line numbers to rules file;
                # to see them together do:
                # $ paste <(zcat dop.rules.gz) <(zcat dop.backtransform.gz)
                with codecs.getwriter('ascii')(gzip.open(
                        '%s/%s.backtransform.gz' % (resultdir, stage.name),
                        'w')) as out:
                    out.writelines('%s\n' % a for a in backtransform)
                if n and stage.prune:
                    msg = gram.getmapping(stages[n - 1].grammar,
                            striplabelre=None if stages[n - 1].dop
                                else re.compile(b'@.+$'),
                            neverblockre=re.compile(b'.+}<'),
                            splitprune=stage.splitprune
                                and stages[n - 1].split,
                            markorigin=stages[n - 1].markorigin)
                else:
                    # recoverfragments() relies on this mapping to identify
                    # binarization nodes
                    msg = gram.getmapping(None,
                            striplabelre=None,
                            neverblockre=re.compile(b'.+}<'),
                            splitprune=False, markorigin=False)
                logging.info(msg)
            elif n and stage.prune:  # dop reduction
                msg = gram.getmapping(stages[n - 1].grammar,
                        striplabelre=None if stages[n - 1].dop
                            and stages[n - 1].dop != 'doubledop'
                            else re.compile(b'@[-0-9]+$'),
                        neverblockre=re.compile(stage.neverblockre)
                            if stage.neverblockre else None,
                        splitprune=stage.splitprune and stages[n - 1].split,
                        markorigin=stages[n - 1].markorigin)
                if stage.mode == 'dop-rerank':
                    gram.getrulemapping(
                            stages[n - 1].grammar,
                            re.compile(br'@[-0-9]+\b'))
                logging.info(msg)
            # write prob models
            np.savez_compressed(  # pylint: disable=no-member
                    '%s/%s.probs.npz' % (resultdir, stage.name),
                    **{name: mod for name, mod
                        in zip(gram.modelnames, gram.models)})
        else:  # not stage.dop
            xgrammar = grammar.treebankgrammar(traintrees, sents,
                    extrarules=extrarules)
            logging.info('induced %s based on %d sentences',
                    ('PCFG' if tbfanout == 1 or stage.split else 'PLCFRS'),
                    len(traintrees))
            if stage.split or os.path.exists('%s/pcdist.txt' % resultdir):
                logging.info(grammar.grammarinfo(xgrammar))
            else:
                logging.info(grammar.grammarinfo(xgrammar,
                        dump='%s/pcdist.txt' % resultdir))
            if lexmodel and not simplelexsmooth:
                xgrammar = lexicon.smoothlexicon(xgrammar, lexmodel)
            rules, lex = grammar.write_lcfrs_grammar(
                    xgrammar, bitpar=stage.mode.startswith('pcfg'))
            gram = Grammar(rules, lex, start=top,
                    bitpar=stage.mode.startswith('pcfg'))
            with gzip.open('%s/%s.rules.gz' % (
                    resultdir, stage.name), 'wb') as rulesfile:
                rulesfile.write(rules)
            with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % (
                    resultdir, stage.name), 'wb')) as lexiconfile:
                lexiconfile.write(lex)
            logging.info(gram.testgrammar()[1])
            if n and stage.prune:
                msg = gram.getmapping(stages[n - 1].grammar,
                        striplabelre=None,
                        neverblockre=re.compile(stage.neverblockre)
                            if stage.neverblockre else None,
                        splitprune=stage.splitprune and stages[n - 1].split,
                        markorigin=stages[n - 1].markorigin)
                logging.info(msg)
        logging.info('wrote grammar to %s/%s.{rules,lex%s}.gz',
                resultdir, stage.name,
                ',backtransform' if stage.dop == 'doubledop' else '')
        outside = None
        if stage.estimates in ('SX', 'SXlrgaps'):
            if stage.estimates == 'SX' and tbfanout != 1 and not stage.split:
                raise ValueError('SX estimate requires PCFG.')
            elif stage.mode != 'plcfrs':
                raise ValueError('estimates require parser w/agenda.')
            begin = time.clock()
            logging.info('computing %s estimates', stage.estimates)
            if stage.estimates == 'SX':
                outside = estimates.getpcfgestimates(
                        gram, testmaxwords, gram.toid[trees[0].label])
            elif stage.estimates == 'SXlrgaps':
                outside = estimates.getestimates(
                        gram, testmaxwords, gram.toid[trees[0].label])
            logging.info('estimates done. cpu time elapsed: %gs',
                    time.clock() - begin)
            np.savez_compressed(  # pylint: disable=no-member
                    '%s/%s.outside.npz' % (resultdir, stage.name),
                    outside=outside)
            logging.info('saved %s estimates', stage.estimates)
        elif stage.estimates:
            raise ValueError('unrecognized value; specify SX or SXlrgaps.')
        stage.update(grammar=gram, backtransform=backtransform,
                outside=outside)
def test_grammar(debug=False):
    """Demonstrate grammar extraction."""
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers, removefanoutmarkers
    from discodop.disambiguation import recoverfragments
    from discodop.kbest import lazykbest
    from math import exp
    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(corpus.sents().values())
    trees = [addfanoutmarkers(binarize(a.copy(True), horzmarkov=1))
            for a in list(corpus.trees().values())[:10]]
    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')
    grammar = Grammar(dopreduction(trees[:2], sents[:2])[0],
            start=trees[0].label)
    if debug:
        print(grammar)
    _ = grammar.testgrammar()

    grammarx, backtransform, _, _ = doubledop(trees, sents,
            debug=debug, numproc=1)
    if debug:
        print('\ndouble dop grammar')
    grammar = Grammar(grammarx, start=trees[0].label)
    grammar.getmapping(grammar, striplabelre=None,
            neverblockre=re.compile(b'^#[0-9]+|.+}<'),
            splitprune=False, markorigin=False)
    if debug:
        print(grammar)
    assert grammar.testgrammar()[0], 'RFE should sum to 1.'
    for tree, sent in zip(corpus.trees().values(), sents):
        if debug:
            print('sentence:', ' '.join(a.encode('unicode-escape').decode()
                    for a in sent))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        if debug:
            print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')
        if chart:
            mpp, parsetrees = {}, {}
            derivations, _ = lazykbest(chart, 1000, b'}<')
            for d, (t, p) in zip(chart.rankededges[chart.root()],
                    derivations):
                r = Tree(recoverfragments(d.key, chart, backtransform))
                r = str(removefanoutmarkers(unbinarize(r)))
                mpp[r] = mpp.get(r, 0.0) + exp(-p)
                parsetrees.setdefault(r, []).append((t, p))
            if debug:
                print(len(mpp), 'parsetrees',
                        sum(map(len, parsetrees.values())), 'derivations')
            for t, tp in sorted(mpp.items(), key=itemgetter(1)):
                if debug:
                    print(tp, t, '\nmatch:', t == str(tree))
                if len(set(parsetrees[t])) != len(parsetrees[t]):
                    print('chart:\n', chart)
                assert len(set(parsetrees[t])) == len(parsetrees[t])
                if debug:
                    for deriv, p in sorted(parsetrees[t], key=itemgetter(1)):
                        print(' <= %6g %s' % (exp(-p), deriv))
        elif debug:
            print('no parse\n', chart)
    if debug:
        print()
    tree = Tree.parse('(ROOT (S (F (E (S (C (B (A 0))))))))', parse_leaf=int)
    Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))
def test_allfragments():
    from discodop.fragments import recurringfragments
    model = """\
(DT the)\t1
(DT The)\t1
(JJ hungry)\t1
(NN cat)\t1
(NN dog)\t1
(NP|<DT.JJ,NN> (JJ hungry) (NN ))\t1
(NP|<DT.JJ,NN> (JJ hungry) (NN dog))\t1
(NP|<DT.JJ,NN> (JJ ) (NN ))\t1
(NP|<DT.JJ,NN> (JJ ) (NN dog))\t1
(NP (DT ) (NN ))\t1
(NP (DT ) (NN cat))\t1
(NP (DT ) (NP|<DT.JJ,NN> ))\t1
(NP (DT ) (NP|<DT.JJ,NN> (JJ hungry) (NN )))\t1
(NP (DT ) (NP|<DT.JJ,NN> (JJ hungry) (NN dog)))\t1
(NP (DT ) (NP|<DT.JJ,NN> (JJ ) (NN )))\t1
(NP (DT ) (NP|<DT.JJ,NN> (JJ ) (NN dog)))\t1
(NP (DT The) (NN ))\t1
(NP (DT The) (NN cat))\t1
(NP (DT the) (NP|<DT.JJ,NN> ))\t1
(NP (DT the) (NP|<DT.JJ,NN> (JJ hungry) (NN )))\t1
(NP (DT the) (NP|<DT.JJ,NN> (JJ hungry) (NN dog)))\t1
(NP (DT the) (NP|<DT.JJ,NN> (JJ ) (NN )))\t1
(NP (DT the) (NP|<DT.JJ,NN> (JJ ) (NN dog)))\t1
(S (NP (DT ) (NN cat)) (VP ))\t1
(S (NP (DT ) (NN cat)) (VP (VBP ) (NP )))\t1
(S (NP (DT ) (NN cat)) (VP (VBP ) (NP (DT ) (NP|<DT.JJ,NN> ))))\t1
(S (NP (DT ) (NN cat)) (VP (VBP saw) (NP )))\t1
(S (NP (DT ) (NN cat)) (VP (VBP saw) (NP (DT ) (NP|<DT.JJ,NN> ))))\t1
(S (NP (DT ) (NN )) (VP ))\t1
(S (NP (DT ) (NN )) (VP (VBP ) (NP )))\t1
(S (NP (DT ) (NN )) (VP (VBP ) (NP (DT ) (NP|<DT.JJ,NN> ))))\t1
(S (NP (DT ) (NN )) (VP (VBP saw) (NP )))\t1
(S (NP (DT ) (NN )) (VP (VBP saw) (NP (DT ) (NP|<DT.JJ,NN> ))))\t1
(S (NP (DT The) (NN cat)) (VP ))\t1
(S (NP (DT The) (NN cat)) (VP (VBP ) (NP )))\t1
(S (NP (DT The) (NN cat)) (VP (VBP ) (NP (DT ) (NP|<DT.JJ,NN> ))))\t1
(S (NP (DT The) (NN cat)) (VP (VBP saw) (NP )))\t1
(S (NP (DT The) (NN cat)) (VP (VBP saw) (NP (DT ) (NP|<DT.JJ,NN> ))))\t1
(S (NP (DT The) (NN )) (VP ))\t1
(S (NP (DT The) (NN )) (VP (VBP ) (NP )))\t1
(S (NP (DT The) (NN )) (VP (VBP ) (NP (DT ) (NP|<DT.JJ,NN> ))))\t1
(S (NP (DT The) (NN )) (VP (VBP saw) (NP )))\t1
(S (NP (DT The) (NN )) (VP (VBP saw) (NP (DT ) (NP|<DT.JJ,NN> ))))\t1
(S (NP ) (VP ))\t1
(S (NP ) (VP (VBP ) (NP )))\t1
(S (NP ) (VP (VBP ) (NP (DT ) (NP|<DT.JJ,NN> ))))\t1
(S (NP ) (VP (VBP saw) (NP )))\t1
(S (NP ) (VP (VBP saw) (NP (DT ) (NP|<DT.JJ,NN> ))))\t1
(VBP saw)\t1
(VP (VBP ) (NP ))\t1
(VP (VBP ) (NP (DT ) (NP|<DT.JJ,NN> )))\t1
(VP (VBP ) (NP (DT ) (NP|<DT.JJ,NN> (JJ ) (NN ))))\t1
(VP (VBP ) (NP (DT the) (NP|<DT.JJ,NN> )))\t1
(VP (VBP ) (NP (DT the) (NP|<DT.JJ,NN> (JJ ) (NN ))))\t1
(VP (VBP saw) (NP ))\t1
(VP (VBP saw) (NP (DT ) (NP|<DT.JJ,NN> )))\t1
(VP (VBP saw) (NP (DT ) (NP|<DT.JJ,NN> (JJ ) (NN ))))\t1
(VP (VBP saw) (NP (DT the) (NP|<DT.JJ,NN> )))\t1
(VP (VBP saw) (NP (DT the) (NP|<DT.JJ,NN> (JJ ) (NN ))))\t1"""
    model = {a.split('\t')[0]: int(a.split('\t')[1])
            for a in model.splitlines()}
    answers = recurringfragments(
            [Tree('(S (NP (DT 0) (NN 1)) (VP (VBP 2) (NP (DT 3) '
                '(NP|<DT.JJ,NN> (JJ 4) (NN 5)))))')],
            [['The', 'cat', 'saw', 'the', 'hungry', 'dog']],
            disc=False, indices=False, maxdepth=3, maxfrontier=999)
    assert model
    assert answers
    assert answers == model
def test(): """ Run some tests. """ from discodop import plcfrs from discodop.containers import Grammar from discodop.treebank import NegraCorpusReader from discodop.treetransforms import binarize, unbinarize, \ addfanoutmarkers, removefanoutmarkers from discodop.disambiguation import recoverfragments from discodop.kbest import lazykbest from discodop.fragments import getfragments logging.basicConfig(level=logging.DEBUG, format='%(message)s') filename = "alpinosample.export" corpus = NegraCorpusReader('.', filename, punct='move') sents = list(corpus.sents().values()) trees = [addfanoutmarkers(binarize(a.copy(True), horzmarkov=1)) for a in list(corpus.parsed_sents().values())[:10]] print('plcfrs') lcfrs = Grammar(treebankgrammar(trees, sents), start=trees[0].label) print(lcfrs) print('dop reduction') grammar = Grammar(dopreduction(trees[:2], sents[:2])[0], start=trees[0].label) print(grammar) grammar.testgrammar() fragments = getfragments(trees, sents, 1) debug = '--debug' in sys.argv grammarx, backtransform, _ = doubledop(trees, fragments, debug=debug) print('\ndouble dop grammar') grammar = Grammar(grammarx, start=trees[0].label) grammar.getmapping(grammar, striplabelre=None, neverblockre=re.compile(b'^#[0-9]+|.+}<'), splitprune=False, markorigin=False) print(grammar) assert grammar.testgrammar(), "DOP1 should sum to 1." for tree, sent in zip(corpus.parsed_sents().values(), sents): print("sentence:", ' '.join(a.encode('unicode-escape').decode() for a in sent)) chart, msg = plcfrs.parse(sent, grammar, exhaustive=True) print('\n', msg, end='') print("\ngold ", tree) print("double dop", end='') if chart: mpp = {} parsetrees = {} derivations, _ = lazykbest(chart, 1000, b'}<') for d, (t, p) in zip(chart.rankededges[chart.root()], derivations): r = Tree(recoverfragments(d.getkey(), chart, grammar, backtransform)) r = str(removefanoutmarkers(unbinarize(r))) mpp[r] = mpp.get(r, 0.0) + exp(-p) parsetrees.setdefault(r, []).append((t, p)) print(len(mpp), 'parsetrees', end='') print(sum(map(len, parsetrees.values())), 'derivations') for t, tp in sorted(mpp.items(), key=itemgetter(1)): print(tp, '\n', t, end='') print("match:", t == str(tree)) assert len(set(parsetrees[t])) == len(parsetrees[t]) if not debug: continue for deriv, p in sorted(parsetrees[t], key=itemgetter(1)): print(' <= %6g %s' % (exp(-p), deriv)) else: print("no parse") print(chart) print() tree = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int) Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))
def parse(): """Parse sentence and return a textual representation of a parse tree. Output is either in a HTML fragment or in plain text. To be invoked by an AJAX call.""" sent = request.args.get('sent', None) objfun = request.args.get('objfun', 'mpp') est = request.args.get('est', 'rfe') marg = request.args.get('marg', 'nbest') coarse = request.args.get('coarse', 'pcfg') html = 'html' in request.args lang = request.args.get('lang', 'detect') require = request.args.get('require', None) block = request.args.get('block', None) if not sent: return '' nbest = None if POSTAGS.match(sent): senttok, tags = zip(*(a.rsplit('/', 1) for a in sent.split())) else: senttok, tags = tuple(tokenize(sent)), None if not senttok or not 1 <= len(senttok) <= LIMIT: return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT) if lang == 'detect': lang = guesslang(senttok) elif lang not in PARSERS: return 'unknown language %r; languages: %r' % (lang, PARSERS.keys()) if require: require = tuple((label, tuple(indices)) for label, indices in sorted(json.loads(require))) if block: block = tuple((label, tuple(indices)) for label, indices in sorted(json.loads(block))) key = (senttok, tags, est, marg, objfun, coarse, lang, require, block) resp = CACHE.get(key) if resp is None: urlparams = dict(sent=sent, lang=lang, est=est, marg=marg, objfun=objfun, coarse=coarse, html=html) if require: urlparams['require'] = json.dumps(require) if block: urlparams['block'] = json.dumps(block) link = '?' + url_encode(urlparams) PARSERS[lang].stages[-1].estimator = est PARSERS[lang].stages[-1].objective = objfun PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both') PARSERS[lang].stages[-1].sample = marg in ('sample', 'both') if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse: PARSERS[lang].stages[0].mode = ('pcfg' if coarse == 'pcfg-posterior' else coarse) if len(PARSERS[lang].stages) > 1: PARSERS[lang].stages[1].k = (1e-5 if coarse == 'pcfg-posterior' else 50) results = list(PARSERS[lang].parse(senttok, tags=tags, require=require, block=block)) if SHOWMORPH: replacemorph(results[-1].parsetree) if SHOWFUNC: treebank.handlefunctions('add', results[-1].parsetree, pos=True) tree = str(results[-1].parsetree) prob = results[-1].prob parsetrees = results[-1].parsetrees or [] parsetrees = heapq.nlargest(10, parsetrees, key=itemgetter(1)) parsetrees_ = [] LOG.info('[%s] %s', probstr(prob), tree) tree = Tree.parse(tree, parse_leaf=int) result = Markup( DrawTree(tree, senttok).text(unicodelines=True, html=html, funcsep='-')) for tree, prob, x in parsetrees: tree = PARSERS[lang].postprocess(tree, senttok, -1)[0] if SHOWMORPH: replacemorph(tree) if SHOWFUNC: treebank.handlefunctions('add', tree, pos=True) parsetrees_.append((tree, prob, x)) if PARSERS[lang].headrules: xtree = PARSERS[lang].postprocess(parsetrees[0][0], senttok, -1)[0] dep = treebank.writedependencies(xtree, senttok, 'conll') depsvg = Markup(DrawDependencies.fromconll(dep).svg()) else: dep = depsvg = '' rid = randid() nbest = Markup('\n\n'.join( '%d. 
[%s] ' '<a href=\'javascript: toggle("f%s%d"); \'>' 'derivation</a>\n' '<span id=f%s%d style="display: none; margin-left: 3em; ">' 'Fragments used in the highest ranked derivation' ' of this parse tree:\n%s</span>\n%s' % ( n + 1, probstr(prob), rid, n + 1, rid, n + 1, '\n\n'.join( '%s\n%s' % (w, DrawTree(frag).text(unicodelines=True, html=html)) for frag, w in fragments or () # if frag.count('(') > 1 ), DrawTree(tree, senttok).text( unicodelines=True, html=html, funcsep='-')) for n, (tree, prob, fragments) in enumerate(parsetrees_))) deriv = Markup( 'Fragments used in the highest ranked derivation' ' of best parse tree:\n%s' % ( '\n\n'.join( '%s\n%s' % (w, DrawTree(frag).text(unicodelines=True, html=html)) for frag, w in parsetrees_[0][2] or () # if frag.count('(') > 1 ))) if parsetrees_ else '' msg = '\n'.join(stage.msg for stage in results) elapsed = [stage.elapsedtime for stage in results] elapsed = 'CPU time elapsed: %s => %gs' % (' '.join( '%gs' % a for a in elapsed), sum(elapsed)) info = '\n'.join(( 'length: %d; lang=%s; est=%s; objfun=%s; marg=%s' % (len(senttok), lang, est, objfun, marg), msg, elapsed, '10 most probable parse trees:', ''.join('%d. [%s] %s' % (n + 1, probstr(prob), writediscbrackettree(tree, senttok)) for n, (tree, prob, _) in enumerate(parsetrees)) + '\n')) CACHE.set(key, (sent, result, nbest, deriv, info, link, dep, depsvg), timeout=5000) else: (sent, result, nbest, deriv, info, link, dep, depsvg) = resp if html: return render_template('parsetree.html', sent=sent, result=result, nbest=nbest, deriv=deriv, info=info, link=link, dep=dep, depsvg=depsvg, randid=randid()) else: return Response('\n'.join((nbest, info, result)), mimetype='text/plain')
def parse(): """Parse sentence and return a textual representation of a parse tree. Output is either in a HTML fragment or in plain text. To be invoked by an AJAX call.""" sent = request.args.get('sent', None) est = request.args.get('est', 'rfe') marg = request.args.get('marg', 'nbest') objfun = request.args.get('objfun', 'mpp') coarse = request.args.get('coarse', None) html = 'html' in request.args lang = request.args.get('lang', 'detect') if not sent: return '' frags = nbest = None senttok = tokenize(sent) if not senttok or not 1 <= len(senttok) <= LIMIT: return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT) if lang == 'detect': lang = guesslang(senttok) elif lang not in PARSERS: return 'unknown language %r; languages: %r' % (lang, PARSERS.keys()) key = (senttok, est, marg, objfun, coarse, lang) resp = CACHE.get(key) if resp is None: link = 'parse?' + url_encode(dict(sent=sent, est=est, marg=marg, objfun=objfun, coarse=coarse, html=html)) PARSERS[lang].stages[-1].estimator = est PARSERS[lang].stages[-1].objective = objfun PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both') PARSERS[lang].stages[-1].sample = marg in ('sample', 'both') if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse: PARSERS[lang].stages[0].mode = coarse PARSERS[lang].stages[1].k = (1e-5 if coarse == 'pcfg-posterior' else 50) results = list(PARSERS[lang].parse(senttok)) if results[-1].noparse: parsetrees = [] result = 'no parse!' frags = nbest = '' else: if SHOWMORPH: for node in results[-1].parsetree.subtrees( lambda n: n and not isinstance(n[0], Tree)): treebank.handlemorphology( 'replace', None, node, node.source) node.label = node.label.replace('[]', '') if SHOWFUNC: treebank.handlefunctions('add', results[-1].parsetree, pos=True) tree = str(results[-1].parsetree) prob = results[-1].prob parsetrees = results[-1].parsetrees or [] parsetrees = heapq.nlargest(10, parsetrees, key=itemgetter(1)) parsetrees_ = [] fragments = results[-1].fragments or () APP.logger.info('[%s] %s', probstr(prob), tree) tree = Tree.parse(tree, parse_leaf=int) result = Markup(DrawTree(tree, senttok).text( unicodelines=True, html=html, funcsep='-')) frags = Markup('Phrasal fragments used in the most probable ' 'derivation of the highest ranked parse tree:\n' + '\n\n'.join( DrawTree(frag).text(unicodelines=True, html=html) for frag in fragments if frag.count('(') > 1)) for tree, prob, x in parsetrees: tree = PARSERS[lang].postprocess(tree, senttok, -1)[0] if SHOWMORPH: for node in tree.subtrees( lambda n: n and not isinstance(n[0], Tree)): treebank.handlemorphology( 'replace', None, node, node.source) if SHOWFUNC: treebank.handlefunctions('add', tree, pos=True) parsetrees_.append((tree, prob, x)) nbest = Markup('\n\n'.join('%d. [%s]\n%s' % (n + 1, probstr(prob), DrawTree(tree, senttok).text( unicodelines=True, html=html, funcsep='-')) for n, (tree, prob, _) in enumerate(parsetrees_))) msg = '\n'.join(stage.msg for stage in results) elapsed = [stage.elapsedtime for stage in results] elapsed = 'CPU time elapsed: %s => %gs' % ( ' '.join('%gs' % a for a in elapsed), sum(elapsed)) info = '\n'.join(('length: %d; lang=%s; est=%s; objfun=%s; marg=%s' % ( len(senttok), lang, est, objfun, marg), msg, elapsed, '10 most probable parse trees:', '\n'.join('%d. 
[%s] %s' % (n + 1, probstr(prob), writediscbrackettree(tree, senttok)) for n, (tree, prob, _) in enumerate(parsetrees)) + '\n')) CACHE.set(key, (sent, result, frags, nbest, info, link), timeout=5000) else: (sent, result, frags, nbest, # pylint: disable=unpacking-non-sequence info, link) = resp # pylint: disable=unpacking-non-sequence if html: return render_template('parsetree.html', sent=sent, result=result, frags=frags, nbest=nbest, info=info, link=link, randid=randid()) else: return Response('\n'.join((nbest, frags, info, result)), mimetype='text/plain')
def evaluate(self, sentences: SupertagParseDataset,
        mini_batch_size: int = 32, num_workers: int = 1,
        embedding_storage_mode: str = "none", out_path=None,
        only_disc: str = "both", accuracy: str = "both",
        pos_accuracy: bool = True,
        return_loss: bool = True) -> Tuple[Result, float]:
    """Predicts supertags, pos tags and parse trees, and reports the
    prediction scores for a set of sentences.

    :param sentences: a ``DataSet`` of sentences. For each sentence a gold
        parse tree is expected as value of the `tree` label, as provided by
        ``SupertagParseDataset``.
    :param only_disc: If set, overrides the setting `DISC_ONLY` in the
        evaluation parameter file ``self.evalparam``, i.e. only evaluates
        discontinuous constituents if True. Pass "both" to report both
        results.
    :param accuracy: either 'none', 'best', 'kbest' or 'both'. Determines
        if the accuracy is computed from the best, or k-best predicted
        tags.
    :param pos_accuracy: if set, reports acc. of predicted pos tags.
    :param return_loss: if set, nll loss wrt. gold tags is reported,
        otherwise the second component in the returned tuple is 0.
    :returns: tuple with evaluation ``Result``, where the main score is the
        f1-score (for all constituents, if only_disc == "both").
    """
    from flair.datasets import DataLoader
    from discodop.tree import ParentedTree, Tree
    from discodop.treetransforms import unbinarize, removefanoutmarkers
    from discodop.eval import Evaluator, readparam
    from timeit import default_timer
    from collections import Counter
    if self.__evalparam__ is None:
        raise Exception("Need to specify evaluator parameter file "
                "before evaluating")
    if only_disc == "both":
        evaluators = {
            "F1-all": Evaluator({**self.evalparam, "DISC_ONLY": False}),
            "F1-disc": Evaluator({**self.evalparam, "DISC_ONLY": True})}
    else:
        mode = self.evalparam["DISC_ONLY"] if only_disc == "param" \
                else (only_disc == "true")
        strmode = "F1-disc" if mode else "F1-all"
        evaluators = {
            strmode: Evaluator({**self.evalparam, "DISC_ONLY": mode})}
    data_loader = DataLoader(sentences, batch_size=mini_batch_size,
            num_workers=num_workers)

    # predict supertags and parse trees
    eval_loss = 0
    start_time = default_timer()
    for batch in data_loader:
        loss = self.predict(batch,
                embedding_storage_mode=embedding_storage_mode,
                supertag_storage_mode=accuracy,
                postag_storage_mode=pos_accuracy,
                label_name='predicted', return_loss=return_loss)
        eval_loss += loss if return_loss else 0
    end_time = default_timer()

    i = 0
    batches = 0
    noparses = 0
    acc_ctr = Counter()
    for batch in data_loader:
        for sentence in batch:
            for token in sentence:
                if accuracy in ("kbest", "both") \
                        and token.get_tag("supertag").value in (
                            l.value for l in token.get_tags_proba_dist(
                                'predicted-supertag')):
                    acc_ctr["kbest"] += 1
                if accuracy in ("best", "both") \
                        and token.get_tag("supertag").value \
                        == token.get_tag('predicted-supertag').value:
                    acc_ctr["best"] += 1
                if pos_accuracy and token.get_tag("pos").value \
                        == token.get_tag("predicted-pos").value:
                    acc_ctr["pos"] += 1
            acc_ctr["all"] += len(sentence)
            sent = [token.text for token in sentence]
            gold = Tree(sentence.get_labels("tree")[0].value)
            gold = ParentedTree.convert(
                    unbinarize(removefanoutmarkers(gold)))
            parse = Tree(sentence.get_labels("predicted")[0].value)
            parse = ParentedTree.convert(
                    unbinarize(removefanoutmarkers(parse)))
            if parse.label == "NOPARSE":
                noparses += 1
            for evaluator in evaluators.values():
                evaluator.add(i, gold.copy(deep=True), list(sent),
                        parse.copy(deep=True), list(sent))
            i += 1
        batches += 1
    scores = {strmode: float_or_zero(evaluator.acc.scores()['lf'])
            for strmode, evaluator in evaluators.items()}
    if accuracy in ("both", "kbest"):
        scores["accuracy-kbest"] = acc_ctr["kbest"] / acc_ctr["all"]
    if accuracy in ("both", "best"):
        scores["accuracy-best"] = acc_ctr["best"] / acc_ctr["all"]
    if pos_accuracy:
        scores["accuracy-pos"] = acc_ctr["pos"] / acc_ctr["all"]
    scores["coverage"] = 1 - (noparses / i)
    scores["time"] = end_time - start_time
    return (
        Result(scores['F1-all'] if 'F1-all' in scores
                else scores['F1-disc'],
            "\t".join(f"{mode}" for mode in scores),
            "\t".join(f"{s}" for s in scores.values()),
            '\n\n'.join(evaluator.summary()
                for evaluator in evaluators.values())),
        eval_loss / batches)
def getfragments(trees, sents, numproc=1, disc=True,
        iterate=False, complement=False, indices=True, cover=True):
    """Get recurring fragments with exact counts in a single treebank.

    :returns: a dictionary whose keys are fragments as strings, and
        indices as values. When ``disc`` is ``True``, keys are of the form
        ``(frag, sent)`` where ``frag`` is a unicode string, and ``sent``
        is a list of words as unicode strings; when ``disc`` is ``False``,
        keys are of the form ``frag`` where ``frag`` is a unicode string.
    :param trees: a sequence of binarized Tree objects.
    :param numproc: number of processes to use; pass 0 to use detected
        # CPUs.
    :param disc: when disc=True, assume trees with discontinuous
        constituents.
    :param iterate, complement: see :func:`_fragments.extractfragments`"""
    if numproc == 0:
        numproc = cpu_count()
    numtrees = len(trees)
    if not numtrees:
        raise ValueError('no trees.')
    mult = 1  # 3 if numproc > 1 else 1
    fragments = {}
    trees = trees[:]
    work = workload(numtrees, mult, numproc)
    PARAMS.update(disc=disc, indices=indices, approx=False, complete=False,
            complement=complement, debug=False, adjacent=False,
            twoterms=False)
    initworkersimple(trees, list(sents), disc)
    if numproc == 1:
        mymap = map
        myapply = APPLY
    else:
        logging.info("work division:\n%s", "\n".join(
                "  %s: %r" % kv for kv in sorted(dict(
                    numchunks=len(work), numproc=numproc).items())))
        # start worker processes
        pool = Pool(processes=numproc, initializer=initworkersimple,
                initargs=(trees, list(sents), disc))
        mymap = pool.map
        myapply = pool.apply
    # collect recurring fragments
    logging.info("extracting recurring fragments")
    for a in mymap(worker, work):
        fragments.update(a)
    # add 'cover' fragments corresponding to single productions
    if cover:
        cover = myapply(coverfragworker, ())
        before = len(fragments)
        fragments.update(cover)
        logging.info("merged %d unseen cover fragments",
                len(fragments) - before)
    fragmentkeys = list(fragments)
    bitsets = [fragments[a] for a in fragmentkeys]
    countchunk = len(bitsets) // numproc + 1
    work = list(range(0, len(bitsets), countchunk))
    work = [(n, len(work), bitsets[a:a + countchunk])
            for n, a in enumerate(work)]
    logging.info("getting exact counts for %d fragments", len(bitsets))
    counts = []
    for a in mymap(exactcountworker, work):
        counts.extend(a)
    if numproc != 1:
        pool.close()
        pool.join()
        del pool
    if iterate:  # optionally collect fragments of fragments
        logging.info("extracting fragments of recurring fragments")
        PARAMS['complement'] = False  # needs to be turned off if it was on
        newfrags = fragments
        trees, sents = None, None
        ids = count()
        for _ in range(10):  # up to 10 iterations
            newtrees = [binarize(
                    introducepreterminals(Tree.parse(tree, parse_leaf=int),
                        ids=ids), childchar="}")
                    for tree, _ in newfrags]
            newsents = [["#%d" % next(ids) if word is None else word
                    for word in sent] for _, sent in newfrags]
            newfrags, newcounts = iteratefragments(
                    fragments, newtrees, newsents, trees, sents, numproc)
            if len(newfrags) == 0:
                break
            if trees is None:
                trees = []
                sents = []
            trees.extend(newtrees)
            sents.extend(newsents)
            fragmentkeys.extend(newfrags)
            counts.extend(newcounts)
            fragments.update(zip(newfrags, newcounts))
    logging.info("found %d fragments", len(fragmentkeys))
    if not disc:
        return {a.decode('utf-8'): b
                for a, b in zip(fragmentkeys, counts)}
    return {(a.decode('utf-8'), b): c
            for (a, b), c in zip(fragmentkeys, counts)}
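# Usage sketch (not part of the original module), following the call
# getfragments(trees, sents, 1) in test() above; trees must already be
# binarized. With disc=True (the default), keys are (fragment, sentence)
# pairs and the values record the occurrences, per the docstring:
#
#   from discodop.tree import Tree
#   from discodop.treetransforms import binarize
#   trees = [binarize(Tree.parse('(S (NP 0) (VP (VB 1) (NP 2)))',
#           parse_leaf=int)) for _ in range(2)]
#   sents = [['Mary', 'sees', 'John'], ['John', 'sees', 'Mary']]
#   for frag, occurrences in getfragments(trees, sents, numproc=1).items():
#       print(frag, occurrences)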
def bitext(): """ Bitext parsing with a synchronous CFG. Translation would require a special decoder (instead of normal kbest derivations where the whole sentence is given). """ print("bitext parsing with a synchronous CFG") trees = [Tree.parse(a, parse_leaf=int) for a in """\ (ROOT (S (NP (NNP (John 0) (John 7))) (VP (VB (misses 1) (manque 5))\ (PP (IN (a` 6)) (NP (NNP (Mary 2) (Mary 4)))))) (SEP (| 3))) (ROOT (S (NP (NNP (Mary 0) (Mary 4))) (VP (VB (likes 1) (aimes 5))\ (NP (DT (la 6)) (NN (pizza 2) (pizza 7))))) (SEP (| 3)))""".split('\n')] sents = [["0"] * len(a.leaves()) for a in trees] for a in trees: treetransforms.binarize(a) compiled_scfg = Grammar(treebankgrammar(trees, sents)) print("sentences:") for tree in trees: print(' '.join(w for _, w in sorted(tree.pos()))) print("treebank:") for tree in trees: print(tree) print(compiled_scfg, "\n") print("correct translations:") assert parse(compiled_scfg, ["0"] * 7, "John likes Mary | John aimes Mary".split()) assert parse(compiled_scfg, ["0"] * 9, "John misses pizza | la pizza manque a` John".split()) print("incorrect translations:") assert not parse(compiled_scfg, ["0"] * 7, "John likes Mary | Mary aimes John".split()) assert not parse(compiled_scfg, ["0"] * 9, "John misses pizza | John manque a` la pizza".split()) # the following SCFG is taken from: # http://cdec-decoder.org/index.php?title=SCFG_translation # the grammar has been binarized and some new non-terminals had to be # introduced because terminals cannot appear in binary rules. lexicon = ("|", "ein", "ich", "Haus", "kleines", "grosses", "sah", "fand", "small", "little", "big", "large", "house", "shell", "a", "I", "saw", "found") another_scfg = Grammar([ ((('DT', '_ein', '_a'), ((0, ), (1, ))), 0.5), ((('JJ', '_kleines', '_small'), ((0, ), (1, ))), 0.1), ((('JJ', '_kleines', '_little'), ((0, ), (1, ))), 0.9), ((('JJ', '_grosses', '_big'), ((0, ), (1, ))), 0.8), ((('JJ', '_grosses', '_large'), ((0, ), (1, ))), 0.2345), ((('NN_house', '_Haus', '_house'), ((0, ), (1, ))), 1), ((('NN_shell', '_Haus', '_shell'), ((0, ), (1, ))), 1), ((('NP', '_ich', '_I'), ((0, ), (1, ), )), 0.6), ((('NP', 'DT', 'NP|<JJ-NN>'), ((0, 1), (0, 1))), 0.5), ((('NP|<JJ-NN>', 'JJ', 'NN_house'), ((0, 1), (0, 1))), 0.1), ((('NP|<JJ-NN>', 'JJ', 'NN_shell'), ((0, 1), (0, 1))), 1.3), ((('ROOT', 'S', '_|'), ((0, 1, 0), )), 1), ((('S', 'NP', 'VP'), ((0, 1), (0, 1))), 0.2), ((('VP', 'V', 'NP'), ((0, 1), (0, 1))), 0.1), ((('V', '_sah', '_saw'), ((0, ), (1, ))), 0.4), ((('V', '_fand', '_found'), ((0, ), (1, ))), 0.4)] + [((('_%s' % word, 'Epsilon'), (word, )), 1) for word in lexicon]) print(another_scfg) sents = [ "ich sah ein kleines Haus | I saw a small house".split(), "ich sah ein kleines Haus | I saw a little house".split(), "ich sah ein kleines Haus | I saw a small shell".split(), "ich sah ein kleines Haus | I saw a little shell".split()] for sent in sents: assert parse(another_scfg, sent), sent
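# My reading of the rule encoding used in the grammars above (hedged, not an authoritative spec): each entry is ((lhs, rhs1, rhs2), yieldfunction) paired with a weight. The yield function has one tuple per component of the LHS yield (two components here: source side and target side), and each 0 or 1 in it says whether the next piece comes from rhs1 or rhs2.
example_rule = (
        (('S', 'NP', 'VP'),  # S -> NP VP, synchronously on both sides
        ((0, 1), (0, 1))),  # each side of S = NP's part, then VP's part
        0.2)  # rule weight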
config.read(argv[1]) data = SupertagParseDataset(f"{config['Corpus']['filename']}.train") from discodop.tree import ParentedTree, Tree from discodop.treetransforms import unbinarize, removefanoutmarkers from discodop.eval import Evaluator, readparam from discodop.lexgrammar import SupertagGrammar grammar = load(open(f"{config['Corpus']['filename']}.grammar", "rb")) i = 0 evaluator = Evaluator(readparam("proper.prm")) for sentence in data: words = tuple(t.text for t in sentence) poss = tuple(t.get_tag("pos").value for t in sentence) tags = tuple(((t.get_tag("supertag").value, 0.0), ) for t in sentence) parses = grammar.parse(poss, tags, posmode=True) try: parse = next(parses) except StopIteration: leaves = (f"({p} {i})" for p, i in zip(poss, range(len(words)))) parse = ParentedTree(f"(NOPARSE {' '.join(leaves)})") gold = ParentedTree(sentence.get_labels("tree")[0].value) gold = ParentedTree.convert( unbinarize(removefanoutmarkers(Tree.convert(gold)))) parse = ParentedTree.convert( unbinarize(removefanoutmarkers(Tree.convert(parse)))) evaluator.add(i, gold.copy(deep=True), list(words), parse.copy(deep=True), list(words)) i += 1 print(evaluator.summary())
def getgrammars(trees, sents, stages, bintype, horzmarkov, vertmarkov, factor, tailmarker, revmarkov, leftmostunary, rightmostunary, pospa, markhead, fanout_marks_before_bin, testmaxwords, resultdir, numproc, lexmodel, simplelexsmooth, top, relationalrealizational): """ Apply binarization and read off the requested grammars. """ # fixme: this n should correspond to sentence id tbfanout, n = treebankfanout(trees) logging.info('treebank fan-out before binarization: %d #%d\n%s\n%s', tbfanout, n, trees[n], ' '.join(sents[n])) # binarization begin = time.clock() if fanout_marks_before_bin: trees = [addfanoutmarkers(t) for t in trees] if bintype == 'binarize': bintype += ' %s h=%d v=%d %s' % (factor, horzmarkov, vertmarkov, 'tailmarker' if tailmarker else '') for a in trees: binarize(a, factor=factor, tailmarker=tailmarker, horzmarkov=horzmarkov, vertmarkov=vertmarkov, leftmostunary=leftmostunary, rightmostunary=rightmostunary, reverse=revmarkov, pospa=pospa, headidx=-1 if markhead else None, filterfuncs=(relationalrealizational['ignorefunctions'] + (relationalrealizational['adjunctionlabel'], )) if relationalrealizational else ()) elif bintype == 'optimal': trees = [Tree.convert(optimalbinarize(tree)) for n, tree in enumerate(trees)] elif bintype == 'optimalhead': trees = [Tree.convert(optimalbinarize(tree, headdriven=True, h=horzmarkov, v=vertmarkov)) for n, tree in enumerate(trees)] trees = [addfanoutmarkers(t) for t in trees] logging.info('binarized %s cpu time elapsed: %gs', bintype, time.clock() - begin) logging.info('binarized treebank fan-out: %d #%d', *treebankfanout(trees)) trees = [canonicalize(a).freeze() for a in trees] for n, stage in enumerate(stages): if stage.split: traintrees = [binarize(splitdiscnodes(Tree.convert(a), stage.markorigin), childchar=':').freeze() for a in trees] logging.info('split discontinuous nodes') else: traintrees = trees if stage.mode.startswith('pcfg'): assert tbfanout == 1 or stage.split backtransform = None if stage.dop: if stage.usedoubledop: # find recurring fragments in treebank, # as well as depth 1 'cover' fragments fragments = getfragments(traintrees, sents, numproc, iterate=stage.iterate, complement=stage.complement) xgrammar, backtransform, altweights = doubledop( traintrees, fragments) else: # DOP reduction xgrammar, altweights = dopreduction( traintrees, sents, packedgraph=stage.packedgraph) nodes = sum(len(list(a.subtrees())) for a in traintrees) if lexmodel and simplelexsmooth: newrules = simplesmoothlexicon(lexmodel) xgrammar.extend(newrules) for weights in altweights.values(): weights.extend(w for _, w in newrules) elif lexmodel: xgrammar = smoothlexicon(xgrammar, lexmodel) msg = grammarinfo(xgrammar) rules, lexicon = write_lcfrs_grammar( xgrammar, bitpar=stage.mode.startswith('pcfg')) grammar = Grammar(rules, lexicon, start=top, bitpar=stage.mode.startswith('pcfg')) for name in altweights: grammar.register(u'%s' % name, altweights[name]) with gzip.open('%s/%s.rules.gz' % ( resultdir, stage.name), 'wb') as rulesfile: rulesfile.write(rules) with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % ( resultdir, stage.name), 'wb')) as lexiconfile: lexiconfile.write(lexicon) logging.info('DOP model based on %d sentences, %d nodes, ' '%d nonterminals', len(traintrees), nodes, len(grammar.toid)) logging.info(msg) if stage.estimator != 'dop1': grammar.switch(u'%s' % stage.estimator) _sumsto1 = grammar.testgrammar() if stage.usedoubledop: # backtransform keys are line numbers to rules file; # to see them together do: # $ paste <(zcat 
dop.rules.gz) <(zcat dop.backtransform.gz) with codecs.getwriter('ascii')(gzip.open( '%s/%s.backtransform.gz' % (resultdir, stage.name), 'w')) as out: out.writelines('%s\n' % a for a in backtransform) if n and stage.prune: msg = grammar.getmapping(stages[n - 1].grammar, striplabelre=None if stages[n - 1].dop else re.compile(b'@.+$'), neverblockre=re.compile(b'.+}<'), splitprune=stage.splitprune and stages[n - 1].split, markorigin=stages[n - 1].markorigin) else: # recoverfragments() relies on this mapping to identify # binarization nodes msg = grammar.getmapping(None, striplabelre=None, neverblockre=re.compile(b'.+}<'), splitprune=False, markorigin=False) logging.info(msg) elif n and stage.prune: # dop reduction msg = grammar.getmapping(stages[n - 1].grammar, striplabelre=None if stages[n - 1].dop and not stages[n - 1].usedoubledop else re.compile(b'@[-0-9]+$'), neverblockre=re.compile(stage.neverblockre) if stage.neverblockre else None, splitprune=stage.splitprune and stages[n - 1].split, markorigin=stages[n - 1].markorigin) if stage.mode == 'dop-rerank': grammar.getrulemapping(stages[n - 1].grammar) logging.info(msg) # write prob models np.savez_compressed('%s/%s.probs.npz' % (resultdir, stage.name), **{name: mod for name, mod in zip(grammar.modelnames, grammar.models)}) else: # not stage.dop xgrammar = treebankgrammar(traintrees, sents) logging.info('induced %s based on %d sentences', ('PCFG' if tbfanout == 1 or stage.split else 'PLCFRS'), len(traintrees)) if stage.split or os.path.exists('%s/pcdist.txt' % resultdir): logging.info(grammarinfo(xgrammar)) else: logging.info(grammarinfo(xgrammar, dump='%s/pcdist.txt' % resultdir)) if lexmodel and simplelexsmooth: newrules = simplesmoothlexicon(lexmodel) xgrammar.extend(newrules) elif lexmodel: xgrammar = smoothlexicon(xgrammar, lexmodel) rules, lexicon = write_lcfrs_grammar( xgrammar, bitpar=stage.mode.startswith('pcfg')) grammar = Grammar(rules, lexicon, start=top, bitpar=stage.mode.startswith('pcfg')) with gzip.open('%s/%s.rules.gz' % ( resultdir, stage.name), 'wb') as rulesfile: rulesfile.write(rules) with codecs.getwriter('utf-8')(gzip.open('%s/%s.lex.gz' % ( resultdir, stage.name), 'wb')) as lexiconfile: lexiconfile.write(lexicon) _sumsto1 = grammar.testgrammar() if n and stage.prune: msg = grammar.getmapping(stages[n - 1].grammar, striplabelre=None, neverblockre=re.compile(stage.neverblockre) if stage.neverblockre else None, splitprune=stage.splitprune and stages[n - 1].split, markorigin=stages[n - 1].markorigin) logging.info(msg) logging.info('wrote grammar to %s/%s.{rules,lex%s}.gz', resultdir, stage.name, ',backtransform' if stage.usedoubledop else '') outside = None if stage.getestimates == 'SX': assert tbfanout == 1 or stage.split, 'SX estimate requires PCFG.' logging.info('computing PCFG estimates') begin = time.clock() outside = getpcfgestimates(grammar, testmaxwords, grammar.toid[trees[0].label]) logging.info('estimates done. cpu time elapsed: %gs', time.clock() - begin) np.savez('pcfgoutside.npz', outside=outside) logging.info('saved PCFG estimates') elif stage.useestimates == 'SX': assert tbfanout == 1 or stage.split, 'SX estimate requires PCFG.' assert stage.mode != 'pcfg', ( 'estimates require agenda-based parser.') outside = np.load('pcfgoutside.npz')['outside'] logging.info('loaded PCFG estimates') if stage.getestimates == 'SXlrgaps': logging.info('computing PLCFRS estimates') begin = time.clock() outside = getestimates(grammar, testmaxwords, grammar.toid[trees[0].label]) logging.info('estimates done. 
cpu time elapsed: %gs', time.clock() - begin) np.savez('outside.npz', outside=outside) logging.info('saved estimates') elif stage.useestimates == 'SXlrgaps': outside = np.load('outside.npz')['outside'] logging.info('loaded PLCFRS estimates') stage.update(grammar=grammar, backtransform=backtransform, outside=outside)
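# Hedged illustration (not part of the original module) of the stage.split branch above: discontinuous nodes are first split into continuous components, optionally marked with their origin index, and then binarized with ':' as child separator, so that a plain PCFG can parse the result.
def _split_example():
    from discodop.tree import Tree
    from discodop.treetransforms import binarize, splitdiscnodes
    # X is discontinuous: it covers leaves 0 and 2, with B in between
    tree = Tree.parse('(S (X (A 0) (A 2)) (B 1))', parse_leaf=int)
    print(binarize(splitdiscnodes(Tree.convert(tree), markorigin=True),
            childchar=':'))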
def getfragments(trees, sents, numproc=1, iterate=False, complement=False): """ Get recurring fragments with exact counts in a single treebank. :returns: a dictionary with fragments (as strings) as keys, and frequencies / indices as values. :param trees: a sequence of binarized Tree objects. """ if numproc == 0: numproc = cpu_count() numtrees = len(trees) assert numtrees mult = 1 # 3 if numproc > 1 else 1 fragments = {} trees = trees[:] work = workload(numtrees, mult, numproc) PARAMS.update(disc=True, indices=True, approx=False, complete=False, quadratic=False, complement=complement) if numproc == 1: initworkersimple(trees, list(sents)) mymap = map myapply = APPLY else: logging.info("work division:\n%s", "\n".join(" %s: %r" % kv for kv in sorted(dict(numchunks=len(work), numproc=numproc).items()))) # start worker processes pool = Pool(processes=numproc, initializer=initworkersimple, initargs=(trees, list(sents))) mymap = pool.map myapply = pool.apply # collect recurring fragments logging.info("extracting recurring fragments") for a in mymap(worker, work): fragments.update(a) # add 'cover' fragments corresponding to single productions cover = myapply(coverfragworker, ()) before = len(fragments) fragments.update(cover) logging.info("merged %d unseen cover fragments", len(fragments) - before) fragmentkeys = list(fragments) bitsets = [fragments[a] for a in fragmentkeys] countchunk = len(bitsets) // numproc + 1 work = list(range(0, len(bitsets), countchunk)) work = [(n, len(work), bitsets[a:a + countchunk]) for n, a in enumerate(work)] logging.info("getting exact counts for %d fragments", len(bitsets)) counts = [] for a in mymap(exactcountworker, work): counts.extend(a) if numproc != 1: pool.close() pool.join() del pool if iterate: # optionally collect fragments of fragments logging.info("extracting fragments of recurring fragments") PARAMS['complement'] = False # needs to be turned off if it was on newfrags = fragments trees, sents = None, None ids = count() for _ in range(10): # up to 10 iterations newtrees = [binarize( introducepreterminals(Tree.parse(tree, parse_leaf=int), ids=ids), childchar="}") for tree, _ in newfrags] newsents = [["#%d" % next(ids) if word is None else word for word in sent] for _, sent in newfrags] newfrags, newcounts = iteratefragments( fragments, newtrees, newsents, trees, sents, numproc) if len(newfrags) == 0: break if trees is None: trees = [] sents = [] trees.extend(newtrees) sents.extend(newsents) fragmentkeys.extend(newfrags) counts.extend(newcounts) fragments.update(zip(newfrags, newcounts)) logging.info("found %d fragments", len(fragmentkeys)) return dict(zip(fragmentkeys, counts))
def main(): """Command line interface for applying tree(bank) transforms.""" import io from getopt import gnu_getopt, GetoptError from discodop import treebanktransforms actions = {'none': None, 'introducepreterminals': introducepreterminals, 'splitdisc': None, 'mergedisc': mergediscnodes, 'transform': None, 'unbinarize': unbinarize, 'binarize': None, 'optimalbinarize': None} flags = ('markorigin markheads leftunary rightunary tailmarker ' 'renumber reverse'.split()) options = ('inputfmt= outputfmt= inputenc= outputenc= slice= ensureroot= ' 'punct= headrules= functions= morphology= lemmas= factor= ' 'markorigin= maxlen= fmt= enc= transforms=').split() try: opts, args = gnu_getopt(sys.argv[1:], 'h:v:', flags + options) if not 1 <= len(args) <= 3: raise GetoptError('expected 1, 2, or 3 positional arguments') except GetoptError as err: print('error: %r\n%s' % (err, USAGE), file=sys.stderr) sys.exit(2) opts, action = dict(opts), args[0] if action not in actions: print('unrecognized action: %r\navailable actions: %s' % ( action, ', '.join(actions)), file=sys.stderr) sys.exit(2) if '--fmt' in opts: opts['--inputfmt'] = opts['--outputfmt'] = opts['--fmt'] if '--enc' in opts: opts['--inputenc'] = opts['--outputenc'] = opts['--enc'] if opts.get('--outputfmt', WRITERS[0]) not in WRITERS: print('unrecognized output format: %r\navailable formats: %s' % ( opts.get('--outputfmt'), ' '.join(WRITERS)), file=sys.stderr) sys.exit(2) infilename = args[1] if len(args) >= 2 and args[1] != '-' else '/dev/stdin' outfilename = args[2] if len(args) == 3 and args[2] != '-' else '/dev/stdout' # open corpus corpus = READERS[opts.get('--inputfmt', 'export')]( infilename, encoding=opts.get('--inputenc', 'utf-8'), headrules=opts.get('--headrules'), markheads='--markheads' in opts, ensureroot=opts.get('--ensureroot'), punct=opts.get('--punct'), functions=opts.get('--functions'), morphology=opts.get('--morphology'), lemmas=opts.get('--lemmas')) start, end = opts.get('--slice', ':').split(':') start, end = (int(start) if start else None), (int(end) if end else None) trees = corpus.itertrees(start, end) if '--maxlen' in opts: maxlen = int(opts['--maxlen']) trees = ((key, (tree, sent)) for key, (tree, sent) in trees if len(sent) <= maxlen) if '--renumber' in opts: trees = (('%8d' % n, treesent) for n, (_, treesent) in enumerate(trees, 1)) # select transformation transform = actions[action] if action in ('binarize', 'optimalbinarize'): h = int(opts.get('-h', 999)) v = int(opts.get('-v', 1)) if action == 'binarize': factor = opts.get('--factor', 'right') transform = lambda t, _: binarize(t, factor, h, v, leftmostunary='--leftunary' in opts, rightmostunary='--rightunary' in opts, tailmarker='$' if '--tailmarker' in opts else '') elif action == 'optimalbinarize': headdriven = '--headrules' in opts transform = lambda t, _: optimalbinarize(t, '|', headdriven, h, v) elif action == 'splitdisc': transform = lambda t, _: splitdiscnodes(t, '--markorigin' in opts) elif action == 'unbinarize': transform = lambda t, _: unbinarize(Tree.convert(t)) elif action == 'transform': tfs = opts['--transforms'].split(',') transform = lambda t, s: (treebanktransforms.reversetransform(t, tfs) if '--reverse' in opts else treebanktransforms.transform(t, s, tfs)) if transform is not None: # NB: transform cannot affect (no. 
of) terminals trees = ((key, (transform(tree, sent), sent)) for key, (tree, sent) in trees) # read, transform, & write trees headrules = None if opts.get('--outputfmt') in ('mst', 'conll'): if not opts.get('--headrules'): raise ValueError('need head rules for dependency conversion') headrules = treebanktransforms.readheadrules(opts.get('--headrules')) cnt = 0 if opts.get('--outputfmt') == 'dact': import alpinocorpus outfile = alpinocorpus.CorpusWriter(outfilename) if (action == 'none' and opts.get('--inputfmt') in ('alpino', 'dact') and set(opts) <= {'--slice', '--inputfmt', '--outputfmt', '--renumber'}): for n, (key, block) in islice(enumerate( corpus.blocks().items(), 1), start, end): outfile.write('%8d' % n if '--renumber' in opts else key, block) cnt += 1 else: for key, (tree, sent) in trees: outfile.write(str(key), writetree(tree, sent, key, 'alpino')) cnt += 1 else: encoding = opts.get('--outputenc', 'utf-8') outfile = io.open(outfilename, 'w', encoding=encoding) # copy trees verbatim when only taking slice or converting encoding if (action == 'none' and opts.get('--inputfmt') == opts.get( '--outputfmt') and set(opts) <= {'--slice', '--inputenc', '--outputenc', '--inputfmt', '--outputfmt'}): for block in islice(corpus.blocks().values(), start, end): outfile.write(block) cnt += 1 else: for key, (tree, sent) in trees: outfile.write(writetree(tree, sent, key, opts.get('--outputfmt', 'export'), headrules)) cnt += 1 print('%sed %d trees with action %r' % ('convert' if action == 'none' else 'transform', cnt, action), file=sys.stderr)
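# Hypothetical invocations of the command line interface above; the entry point name and the exact set of supported output formats depend on how this module is installed, so treat these as illustrations of the option parsing, not documented commands:
#   treetransforms binarize --fmt=export -h 1 -v 2 --factor=right input.export output.export
#   treetransforms splitdisc --markorigin input.export output.export
#   treetransforms none --inputfmt=export --outputfmt=export --slice=:100 input.export output.export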
def test(): """Do some tests.""" trees = '''(ROOT (S (ADV 0) (VVFIN 1) (NP (PDAT 2) (NN 3)) (PTKNEG 4) \ (PP (APPRART 5) (NN 6) (NP (ART 7) (ADJA 8) (NN 9)))) ($. 10)) (S (NP (NN 1) (EX 3)) (VP (VB 0) (JJ 2))) (S (VP (PDS 0) (ADV 3) (VVINF 4)) (PIS 2) (VMFIN 1)) (top (du (comp 0) (smain (noun 1) (verb 2) (inf (verb 8) (inf \ (adj 3) (pp (prep 4) (np (det 5) (noun 6))) (part 7) (verb 9) \ (pp (prep 10) (np (det 11) (noun 12) (pp (prep 13) (mwu \ (noun 14) (noun 15))))))))) (punct 16)) (top (smain (noun 0) (verb 1) (inf (verb 5) (inf (np (det 2) \ (adj 3) (noun 4)) (verb 6) (pp (prep 7) (noun 8))))) (punct 9)) (top (smain (noun 0) (verb 1) (noun 2) (inf (adv 3) (verb 4))) \ (punct 5)) (top (punct 5) (du (smain (noun 0) (verb 1) (ppart (np (det 2) \ (noun 3)) (verb 4))) (conj (sv1 (conj (noun 6) (vg 7) (np \ (det 8) (noun 9))) (verb 10) (noun 11) (part 12)) (vg 13) \ (sv1 (verb 14) (ti (comp 19) (inf (np (conj (det 15) (vg 16) \ (det 17)) (noun 18)) (verb 20)))))) (punct 21)) (top (punct 10) (punct 16) (punct 18) (smain (np (det 0) (noun 1) \ (pp (prep 2) (np (det 3) (noun 4)))) (verb 5) (adv 6) (np \ (noun 7) (noun 8)) (part 9) (np (det 11) (noun 12) (pp \ (prep 13) (np (det 14) (noun 15)))) (conj (vg 20) (ppres \ (adj 17) (pp (prep 22) (np (det 23) (adj 24) (noun 25)))) \ (ppres (adj 19)) (ppres (adj 21)))) (punct 26)) (top (punct 10) (punct 11) (punct 16) (smain (np (det 0) \ (noun 1)) (verb 2) (np (det 3) (noun 4)) (adv 5) (du (cp \ (comp 6) (ssub (noun 7) (verb 8) (inf (verb 9)))) (du \ (smain (noun 12) (verb 13) (adv 14) (part 15)) (noun 17)))) \ (punct 18) (punct 19)) (top (smain (noun 0) (verb 1) (inf (verb 8) (inf (verb 9) (inf \ (adv 2) (pp (prep 3) (noun 4)) (pp (prep 5) (np (det 6) \ (noun 7))) (verb 10))))) (punct 11)) (top (smain (noun 0) (verb 1) (pp (prep 2) (np (det 3) (adj 4) \ (noun 5) (rel (noun 6) (ssub (noun 7) (verb 10) (ppart \ (adj 8) (part 9) (verb 11))))))) (punct 12)) (top (smain (np (det 0) (noun 1)) (verb 2) (ap (adv 3) (num 4) \ (cp (comp 5) (np (det 6) (adj 7) (noun 8) (rel (noun 9) (ssub \ (noun 10) (verb 11) (pp (prep 12) (np (det 13) (adj 14) \ (adj 15) (noun 16))))))))) (punct 17)) (top (smain (np (det 0) (noun 1)) (verb 2) (adv 3) (pp (prep 4) \ (np (det 5) (noun 6)) (part 7))) (punct 8)) (top (punct 7) (conj (smain (noun 0) (verb 1) (np (det 2) \ (noun 3)) (pp (prep 4) (np (det 5) (noun 6)))) (smain \ (verb 8) (np (det 9) (num 10) (noun 11)) (part 12)) (vg 13) \ (smain (verb 14) (noun 15) (pp (prep 16) (np (det 17) \ (noun 18) (pp (prep 19) (np (det 20) (noun 21))))))) \ (punct 22)) (top (smain (np (det 0) (noun 1) (rel (noun 2) (ssub (np (num 3) \ (noun 4)) (adj 5) (verb 6)))) (verb 7) (ppart (verb 8) (pp \ (prep 9) (noun 10)))) (punct 11)) (top (conj (sv1 (np (det 0) (noun 1)) (verb 2) (ppart (verb 3))) \ (vg 4) (sv1 (verb 5) (pp (prep 6) (np (det 7) (adj 8) \ (noun 9))))) (punct 10)) (top (smain (noun 0) (verb 1) (np (det 2) (noun 3)) (inf (adj 4) \ (verb 5) (cp (comp 6) (ssub (noun 7) (adv 8) (verb 10) (ap \ (num 9) (cp (comp 11) (np (det 12) (adj 13) (noun 14) (pp \ (prep 15) (conj (np (det 16) (noun 17)) (vg 18) (np \ (noun 19))))))))))) (punct 20)) (top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) \ (inf (verb 6) (conj (inf (pp (prep 2) (np (det 3) (noun 4))) \ (verb 7)) (inf (verb 9)) (vg 10) (inf (verb 11)))))) \ (punct 12)) (top (smain (verb 2) (noun 3) (adv 4) (ppart (np (det 0) \ (noun 1)) (verb 5))) (punct 6)) (top (conj (smain (np (det 0) (noun 1)) (verb 2) (adj 3) (pp \ (prep 4) (np (det 5) (noun 6)))) (vg 7) (smain (np (det 8) \ (noun 
9) (pp (prep 10) (np (det 11) (noun 12)))) (verb 13) \ (pp (prep 14) (np (det 15) (noun 16))))) (punct 17)) (top (conj (smain (noun 0) (verb 1) (inf (ppart (np (noun 2) \ (noun 3)) (verb 4)) (verb 5))) (vg 6) (smain (noun 7) \ (inf (ppart (np (det 8) (noun 9)))))) (punct 10)) (A (B1 (t 6) (t 13)) (B2 (t 3) (t 7) (t 10)) (B3 (t 1) \ (t 9) (t 11) (t 14) (t 16)) (B4 (t 0) (t 5) (t 8))) (A (B1 6 13) (B2 3 7 10) (B3 1 \ 9 11 14 16) (B4 0 5 8)) (VP (VB 0) (PRT 2)) (VP (VP 0 3) (NP (PRP 1) (NN 2))) (ROOT (S (VP_2 (PP (APPR 0) (ART 1) (NN 2) (PP (APPR 3) (ART 4) \ (ADJA 5) (NN 6))) (ADJD 10) (PP (APPR 11) (NN 12)) (VVPP 13)) \ (VAFIN 7) (NP (ART 8) (NN 9))) ($. 14))''' sents = '''Leider stehen diese Fragen nicht im Vordergrund der \ augenblicklichen Diskussion . is Mary happy there das muss man jetzt machen Of ze had gewoon met haar vriendinnen rond kunnen slenteren in de \ buurt van Trafalgar Square . Het had een prachtige dag kunnen zijn in Londen . Cathy zag hen wild zwaaien . Het was een spel geworden , zij en haar vriendinnen kozen iemand \ uit en probeerden zijn of haar nationaliteit te raden . Elk jaar in het hoogseizoen trokken daar massa's toeristen \ voorbij , hun fototoestel in de aanslag , pratend , gillend \ en lachend in de vreemdste talen . Haar vader stak zijn duim omhoog alsof hij wilde zeggen : " het \ komt wel goed , joch " . Ze hadden languit naast elkaar op de strandstoelen kunnen gaan \ liggen . Het hoorde bij de warme zomerdag die ze ginds achter had gelaten . De oprijlaan was niet meer dan een hobbelige zandstrook die zich \ voortslingerde tussen de hoge grijze boomstammen . Haar moeder kleefde bijna tegen het autoraampje aan . Ze veegde de tranen uit haar ooghoeken , tilde haar twee koffers \ op en begaf zich in de richting van het landhuis . Het meisje dat vijf keer juist raadde werd getrakteerd op ijs . Haar neus werd platgedrukt en leek op een jonge champignon . Cathy zag de BMW langzaam verdwijnen tot hij niet meer was dan \ een zilveren schijnsel tussen de bomen en struiken . Ze had met haar moeder kunnen gaan winkelen , zwemmen of \ terrassen . Dat werkwoord had ze zelf uitgevonden . De middagzon hing klein tussen de takken en de schaduwen van de \ wolken drentelden over het gras . Zij zou mams rug ingewreven hebben en mam de hare . 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 Mit einer Messe in der Sixtinischen Kapelle ist das Konklave \ offiziell zu Ende gegangen .''' trees = [Tree.parse(a, parse_leaf=int) for a in trees.splitlines()] sents = [a.split() for a in sents.splitlines()] sents.extend([['Wake', None, 'up'], [None, 'your', 'friend', None]]) for n, (tree, sent) in enumerate(zip(trees, sents)): drawtree = DrawTree(tree, sent) print('\ntree, sent', n, tree, ' '.join('...' if a is None else a for a in sent), repr(drawtree), sep='\n') try: print(drawtree.text(unicodelines=True, ansi=True), sep='\n') except (UnicodeDecodeError, UnicodeEncodeError): print(drawtree.text(unicodelines=False, ansi=False), sep='\n')
def parse(): """Parse sentence and return a textual representation of a parse tree. Output is either in a HTML fragment or in plain text. To be invoked by an AJAX call.""" sent = request.args.get('sent', None) est = request.args.get('est', 'rfe') marg = request.args.get('marg', 'nbest') objfun = request.args.get('objfun', 'mpp') coarse = request.args.get('coarse', None) html = 'html' in request.args lang = request.args.get('lang', 'detect') if not sent: return '' frags = nbest = None senttok = tokenize(sent) if not senttok or not 1 <= len(senttok) <= LIMIT: return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT) if lang == 'detect': lang = guesslang(senttok) elif lang not in PARSERS: return 'unknown language %r; languages: %r' % (lang, PARSERS.keys()) key = (senttok, est, marg, objfun, coarse, lang) resp = CACHE.get(key) if resp is None: link = 'parse?' + url_encode( dict(sent=sent, est=est, marg=marg, objfun=objfun, coarse=coarse, html=html)) PARSERS[lang].stages[-1].estimator = est PARSERS[lang].stages[-1].objective = objfun PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both') PARSERS[lang].stages[-1].sample = marg in ('sample', 'both') if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse: PARSERS[lang].stages[0].mode = coarse PARSERS[lang].stages[1].k = (1e-5 if coarse == 'pcfg-posterior' else 50) results = list(PARSERS[lang].parse(senttok)) if results[-1].noparse: parsetrees = [] result = 'no parse!' frags = nbest = '' else: if SHOWMORPH: replacemorph(results[-1].parsetree) if SHOWFUNC: treebank.handlefunctions('add', results[-1].parsetree, pos=True) tree = str(results[-1].parsetree) prob = results[-1].prob parsetrees = results[-1].parsetrees or [] parsetrees = heapq.nlargest(10, parsetrees, key=itemgetter(1)) parsetrees_ = [] fragments = results[-1].fragments or () APP.logger.info('[%s] %s', probstr(prob), tree) tree = Tree.parse(tree, parse_leaf=int) result = Markup( DrawTree(tree, senttok).text(unicodelines=True, html=html, funcsep='-')) frags = Markup( 'Phrasal fragments used in the most probable ' 'derivation of the highest ranked parse tree:\n' + '\n\n'.join( DrawTree(frag).text(unicodelines=True, html=html) for frag in fragments if frag.count('(') > 1)) for tree, prob, x in parsetrees: tree = PARSERS[lang].postprocess(tree, senttok, -1)[0] if SHOWMORPH: replacemorph(tree) if SHOWFUNC: treebank.handlefunctions('add', tree, pos=True) parsetrees_.append((tree, prob, x)) nbest = Markup('\n\n'.join( '%d. [%s]\n%s' % (n + 1, probstr(prob), DrawTree(tree, senttok).text( unicodelines=True, html=html, funcsep='-')) for n, (tree, prob, _) in enumerate(parsetrees_))) msg = '\n'.join(stage.msg for stage in results) elapsed = [stage.elapsedtime for stage in results] elapsed = 'CPU time elapsed: %s => %gs' % (' '.join( '%gs' % a for a in elapsed), sum(elapsed)) info = '\n'.join( ('length: %d; lang=%s; est=%s; objfun=%s; marg=%s' % (len(senttok), lang, est, objfun, marg), msg, elapsed, '10 most probable parse trees:', '\n'.join( '%d. 
[%s] %s' % (n + 1, probstr(prob), writediscbrackettree(tree, senttok)) for n, (tree, prob, _) in enumerate(parsetrees)) + '\n')) CACHE.set(key, (sent, result, frags, nbest, info, link), timeout=5000) else: ( sent, result, frags, nbest, # pylint: disable=unpacking-non-sequence info, link) = resp # pylint: disable=unpacking-non-sequence if html: return render_template('parsetree.html', sent=sent, result=result, frags=frags, nbest=nbest, info=info, link=link, randid=randid()) else: return Response('\n'.join((nbest, frags, info, result)), mimetype='text/plain')
def test_grammar(debug=False): """Demonstrate grammar extraction.""" from discodop.grammar import treebankgrammar, dopreduction, doubledop from discodop import plcfrs from discodop.containers import Grammar from discodop.treebank import NegraCorpusReader from discodop.treetransforms import addfanoutmarkers, removefanoutmarkers from discodop.disambiguation import recoverfragments from discodop.kbest import lazykbest from math import exp corpus = NegraCorpusReader('alpinosample.export', punct='move') sents = list(corpus.sents().values()) trees = [ addfanoutmarkers(binarize(a.copy(True), horzmarkov=1)) for a in list(corpus.trees().values())[:10] ] if debug: print('plcfrs\n', Grammar(treebankgrammar(trees, sents))) print('dop reduction') grammar = Grammar(dopreduction(trees[:2], sents[:2])[0], start=trees[0].label) if debug: print(grammar) _ = grammar.testgrammar() grammarx, backtransform, _, _ = doubledop(trees, sents, debug=False, numproc=1) if debug: print('\ndouble dop grammar') grammar = Grammar(grammarx, start=trees[0].label) grammar.getmapping(grammar, striplabelre=None, neverblockre=re.compile('^#[0-9]+|.+}<'), splitprune=False, markorigin=False) if debug: print(grammar) assert grammar.testgrammar()[0], "RFE should sum to 1." for tree, sent in zip(corpus.trees().values(), sents): if debug: print("sentence:", ' '.join(a.encode('unicode-escape').decode() for a in sent)) chart, msg = plcfrs.parse(sent, grammar, exhaustive=True) if debug: print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='') if chart: mpp, parsetrees = {}, {} derivations, _ = lazykbest(chart, 1000, '}<') for d, (t, p) in zip(chart.rankededges[chart.root()], derivations): r = Tree(recoverfragments(d.key, chart, backtransform)) r = str(removefanoutmarkers(unbinarize(r))) mpp[r] = mpp.get(r, 0.0) + exp(-p) parsetrees.setdefault(r, []).append((t, p)) if debug: print(len(mpp), 'parsetrees', sum(map(len, parsetrees.values())), 'derivations') for t, tp in sorted(mpp.items(), key=itemgetter(1)): if debug: print(tp, t, '\nmatch:', t == str(tree)) if len(set(parsetrees[t])) != len(parsetrees[t]): print('chart:\n', chart) assert len(set(parsetrees[t])) == len(parsetrees[t]) if debug: for deriv, p in sorted(parsetrees[t], key=itemgetter(1)): print(' <= %6g %s' % (exp(-p), deriv)) elif debug: print('no parse\n', chart) if debug: print() tree = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int) Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))
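# Minimal sketch of the MPP (most probable parse) aggregation used in test_grammar() above: lazykbest yields derivations with negative log probabilities; the probabilities of derivations yielding the same tree are summed, and the tree with the largest total is the most probable parse.
def _mpp_example(derivations):
    """derivations: iterable of (treestr, neglogprob) pairs."""
    from math import exp
    mpp = {}
    for treestr, neglogprob in derivations:
        # accumulate the probability mass of each distinct parse tree
        mpp[treestr] = mpp.get(treestr, 0.0) + exp(-neglogprob)
    return max(mpp.items(), key=lambda item: item[1])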
def parse(): """Parse sentence and return a textual representation of a parse tree. Output is either in a HTML fragment or in plain text. To be invoked by an AJAX call.""" sent = request.args.get('sent', None) objfun = request.args.get('objfun', 'mpp') est = request.args.get('est', 'rfe') marg = request.args.get('marg', 'nbest') coarse = request.args.get('coarse', 'pcfg') html = 'html' in request.args lang = request.args.get('lang', 'detect') require = request.args.get('require', None) block = request.args.get('block', None) if not sent: return '' nbest = None if POSTAGS.match(sent): senttok, tags = zip(*(a.rsplit('/', 1) for a in sent.split())) else: senttok, tags = tuple(tokenize(sent)), None if not senttok or not 1 <= len(senttok) <= LIMIT: return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT) if lang == 'detect': lang = guesslang(senttok) elif lang not in PARSERS: return 'unknown language %r; languages: %r' % (lang, PARSERS.keys()) if require: require = tuple((label, tuple(indices)) for label, indices in sorted(json.loads(require))) if block: block = tuple((label, tuple(indices)) for label, indices in sorted(json.loads(block))) key = (senttok, tags, est, marg, objfun, coarse, lang, require, block) resp = CACHE.get(key) if resp is None: urlparams = dict(sent=sent, est=est, marg=marg, objfun=objfun, coarse=coarse, html=html) if require: urlparams['require'] = json.dumps(require) if block: urlparams['block'] = json.dumps(block) link = 'parse?' + url_encode(urlparams) PARSERS[lang].stages[-1].estimator = est PARSERS[lang].stages[-1].objective = objfun PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both') PARSERS[lang].stages[-1].sample = marg in ('sample', 'both') if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse: PARSERS[lang].stages[0].mode = ( 'pcfg' if coarse == 'pcfg-posterior' else coarse) if len(PARSERS[lang].stages) > 1: PARSERS[lang].stages[1].k = (1e-5 if coarse == 'pcfg-posterior' else 50) results = list(PARSERS[lang].parse( senttok, tags=tags, require=require, block=block)) if results[-1].noparse: parsetrees = [] result = 'no parse!' nbest = dep = depsvg = '' else: if SHOWMORPH: replacemorph(results[-1].parsetree) if SHOWFUNC: treebank.handlefunctions('add', results[-1].parsetree, pos=True) tree = str(results[-1].parsetree) prob = results[-1].prob parsetrees = results[-1].parsetrees or [] parsetrees = heapq.nlargest(10, parsetrees, key=itemgetter(1)) parsetrees_ = [] APP.logger.info('[%s] %s', probstr(prob), tree) tree = Tree.parse(tree, parse_leaf=int) result = Markup(DrawTree(tree, senttok).text( unicodelines=True, html=html, funcsep='-')) for tree, prob, x in parsetrees: tree = PARSERS[lang].postprocess(tree, senttok, -1)[0] if SHOWMORPH: replacemorph(tree) if SHOWFUNC: treebank.handlefunctions('add', tree, pos=True) parsetrees_.append((tree, prob, x)) if PARSERS[lang].headrules: xtree = PARSERS[lang].postprocess( parsetrees[0][0], senttok, -1)[0] dep = treebank.writedependencies(xtree, senttok, 'conll') depsvg = Markup(DrawDependencies.fromconll(dep).svg()) else: dep = depsvg = '' rid = randid() nbest = Markup('\n\n'.join('%d. 
[%s] ' '<a href=\'javascript: toggle("f%s%d"); \'>' 'derivation</a>\n' '<span id=f%s%d style="display: none; margin-left: 3em; ">' 'Fragments used in the highest ranked derivation' ' of this parse tree:\n%s</span>\n%s' % ( n + 1, probstr(prob), rid, n + 1, rid, n + 1, '\n\n'.join('%s\n%s' % (w, DrawTree(frag).text(unicodelines=True, html=html)) for frag, w in fragments or () # if frag.count('(') > 1 ), DrawTree(tree, senttok).text( unicodelines=True, html=html, funcsep='-')) for n, (tree, prob, fragments) in enumerate(parsetrees_))) msg = '\n'.join(stage.msg for stage in results) elapsed = [stage.elapsedtime for stage in results] elapsed = 'CPU time elapsed: %s => %gs' % ( ' '.join('%gs' % a for a in elapsed), sum(elapsed)) info = '\n'.join(('length: %d; lang=%s; est=%s; objfun=%s; marg=%s' % ( len(senttok), lang, est, objfun, marg), msg, elapsed, '10 most probable parse trees:', ''.join('%d. [%s] %s' % (n + 1, probstr(prob), writediscbrackettree(tree, senttok)) for n, (tree, prob, _) in enumerate(parsetrees)) + '\n')) CACHE.set(key, (sent, result, nbest, info, link, dep, depsvg), timeout=5000) else: (sent, result, nbest, info, link, dep, depsvg) = resp if html: return render_template('parsetree.html', sent=sent, result=result, nbest=nbest, info=info, link=link, dep=dep, depsvg=depsvg, randid=randid()) else: return Response('\n'.join((nbest, info, result)), mimetype='text/plain')
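# Hedged note on the require/block parameters of the parse() variant above: judging from the json.loads() calls, each is a JSON list of [label, [indices...]] pairs, e.g. require=[["NP", [0, 1]], ["VP", [2, 3]]], which is normalized to a sorted tuple of (label, tuple(indices)) pairs so that it can serve as part of the cache key.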
def parse(): """ Parse sentence and return a textual representation of a parse tree, in a HTML fragment or plain text. To be invoked by an AJAX call.""" sent = request.args.get('sent', None) est = request.args.get('est', 'dop1') marg = request.args.get('marg', 'nbest') objfun = request.args.get('objfun', 'mpp') coarse = request.args.get('coarse', None) html = 'html' in request.args lang = request.args.get('lang', 'detect') if not sent: return '' frags = nbest = None senttok = tokenize(sent) if not senttok or not 1 <= len(senttok) <= LIMIT: return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT) if lang == 'detect': lang = guesslang(senttok) elif lang not in PARSERS: return 'unknown language %r; languages: %r' % (lang, PARSERS.keys()) key = (senttok, est, marg, objfun, coarse, lang, html) if CACHE.get(key) is not None: return CACHE.get(key) link = url_encode(dict(sent=sent, est=est, marg=marg, objfun=objfun, coarse=coarse, html=html)) PARSERS[lang].stages[-1].estimator = est PARSERS[lang].stages[-1].objective = objfun PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both') PARSERS[lang].stages[-1].sample = marg in ('sample', 'both') if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse: PARSERS[lang].stages[0].mode = coarse PARSERS[lang].stages[1].k = 1e-5 if coarse == 'pcfg-posterior' else 50 results = list(PARSERS[lang].parse(senttok)) if results[-1].noparse: parsetrees = {} result = 'no parse!' frags = nbest = '' else: if PARSERS[lang].relationalrealizational: treebank.handlefunctions('add', results[-1].parsetree, pos=True) tree = str(results[-1].parsetree) prob = results[-1].prob parsetrees = results[-1].parsetrees or {} parsetrees = heapq.nlargest(10, parsetrees.items(), key=itemgetter(1)) fragments = results[-1].fragments or () APP.logger.info('[%s] %s' % (probstr(prob), tree)) tree = Tree.parse(tree, parse_leaf=int) result = Markup(DrawTree(tree, senttok, abbr=True).text( unicodelines=True, html=html)) frags = Markup('Phrasal fragments used in the most probable derivation' ' of the highest ranked parse tree:\n' + '\n\n'.join( DrawTree(Tree.parse(frag, parse_leaf=int), terminals).text( unicodelines=True, html=html) for frag, terminals in fragments)) nbest = Markup('\n\n'.join('%d. [%s]\n%s' % (n + 1, probstr(prob), DrawTree(PARSERS[lang].postprocess(tree)[0], senttok, abbr=True).text(unicodelines=True, html=html)) for n, (tree, prob) in enumerate(parsetrees))) msg = '\n'.join(stage.msg for stage in results) elapsed = [stage.elapsedtime for stage in results] elapsed = 'CPU time elapsed: %s => %gs' % ( ' '.join('%gs' % a for a in elapsed), sum(elapsed)) info = '\n'.join(('length: %d; lang=%s; est=%s; objfun=%s; marg=%s' % ( len(senttok), lang, est, objfun, marg), msg, elapsed, '10 most probable parse trees:', '\n'.join('%d. [%s] %s' % (n + 1, probstr(prob), tree) for n, (tree, prob) in enumerate(parsetrees)) + '\n')) if html: CACHE.set(key, render_template('parsetree.html', sent=sent, result=result, frags=frags, nbest=nbest, info=info, link=link, randid=randid()), timeout=5000) else: CACHE.set(key, Response('\n'.join((nbest, frags, info, result)), mimetype='text/plain'), timeout=5000) return CACHE.get(key)
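# Minimal sketch of the caching pattern shared by the parse() variants above: the request parameters form the cache key, and the rendered response is stored with a timeout so identical queries skip reparsing. Names here are illustrative, not part of the original module.
def _cached(cache, key, compute, timeout=5000):
    resp = cache.get(key)
    if resp is None:
        resp = compute()  # expensive work (parsing, rendering) happens once
        cache.set(key, resp, timeout=timeout)
    return resp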
def trees(form): """ Return visualization of parse trees in search results. """ # TODO: show context of x sentences around result, offer pagination. gotresults = False for n, (_textno, results, stderr) in enumerate( doqueries(form, lines=True)): if n == 0: # NB: we do not hide function or morphology tags when exporting url = 'trees?query=%s&texts=%s&engine=%s&export=1' % ( quote(form['query']), form['texts'], form.get('engine', 'tgrep2')) yield ('Query: %s\n' 'Trees (showing up to %d per text; ' 'export: <a href="%s">plain</a>, ' '<a href="%s">with line numbers</a>):\n' % ( stderr, TREELIMIT, url, url + '&linenos=1')) for m, line in enumerate(islice(results, TREELIMIT)): lineno, text, treestr, match = line.split(":::") if m == 0: gotresults = True yield ("==> %s: [<a href=\"javascript: toggle('n%d'); \">" "toggle</a>]\n<span id=n%d>" % (text, n + 1, n + 1)) if form.get('engine', 'tgrep2') == 'tgrep2': cnt = count() treestr = treestr.replace(" )", " -NONE-)") match = match.strip() if match.startswith('('): treestr = treestr.replace(match, '%s_HIGH %s' % tuple( match.split(None, 1))) else: match = ' %s)' % match treestr = treestr.replace(match, '_HIGH%s' % match) tree = Tree.parse(treestr, parse_leaf=lambda _: next(cnt)) sent = re.findall(r" +([^ ()]+)(?=[ )])", treestr) high = list(tree.subtrees(lambda n: n.label.endswith("_HIGH"))) if high: high = high.pop() high.label = high.label.rsplit("_", 1)[0] high = list(high.subtrees()) + high.leaves() elif form.get('engine', 'tgrep2') == 'xpath': tree, sent = treebank.alpinotree( ElementTree.fromstring(treestr)) # morphology='replace') highwords = re.findall('<node[^>]*begin="([0-9]+)"[^>]*/>', match) high = set(re.findall(r'\bid="(.+?)"', match)) high = list(tree.subtrees(lambda n: n.source[treebank.PARENT] in high or n.source[treebank.WORD].lstrip('#') in high)) high += [int(a) for a in highwords] try: treerepr = DrawTree(tree, sent, highlight=high).text( unicodelines=True, html=True) except ValueError as err: line = "#%s \nERROR: %s\n%s\n%s\n" % ( lineno, err, treestr, tree) else: line = "#%s\n%s\n" % (lineno, treerepr) yield line yield "</span>" if not gotresults: yield "No matches."
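# Sketch of the _HIGH marking trick used in trees() above (illustrative only): the matched constituent's label gets a temporary suffix so it can be located after the bracketing is parsed; the suffix is then stripped, and the subtree plus its leaves form the highlight set passed to DrawTree.
def _highlight_example():
    from discodop.tree import Tree
    tree = Tree.parse('(S (NP_HIGH (DT 0) (NN 1)) (VP (VBD 2)))',
            parse_leaf=int)
    high = list(tree.subtrees(lambda n: n.label.endswith('_HIGH')))
    if high:
        high = high.pop()
        high.label = high.label.rsplit('_', 1)[0]  # restore original label
        high = list(high.subtrees()) + high.leaves()
    return high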