def test_mergedicsnodes(self):
    # NOTE(review): name looks like a typo for 'test_mergediscnodes';
    # left unchanged because test runners discover it by this name.
    """Test that splitdiscnodes followed by mergediscnodes round-trips
    discontinuous trees, with and without origin markers."""
    # Discontinuous tree: VP spans 0-2,4-5 around VMFIN at 3.
    tree = Tree.parse(
        '(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4)'
        '(VVPP 5)) (VAINF 6)) (VMFIN 3))', parse_leaf=int)
    # Round-trip without origin markers restores the original tree.
    assert str(mergediscnodes(splitdiscnodes(tree))) == (
        '(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4) (VVPP 5)) '
        '(VAINF 6)) (VMFIN 3))')
    # Round-trip with markorigin=True must give the same result.
    assert str(mergediscnodes(splitdiscnodes(tree, markorigin=True))) == (
        '(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4) (VVPP 5)) '
        '(VAINF 6)) (VMFIN 3))')
    # Two interleaved X constituents: origin markers are required to
    # reconstruct which block belongs to which X.
    tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))', parse_leaf=int)
    assert str(mergediscnodes(splitdiscnodes(
        tree, markorigin=True))) == ('(S (X (A 0) (A 2)) (X (A 1) (A 3)))')
    # The split form labels each block with its originating component index.
    tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))', parse_leaf=int)
    assert str(splitdiscnodes(tree, markorigin=True)) == (
        '(S (X*0 (A 0)) (X*0 (A 1)) (X*1 (A 2)) (X*1 (A 3)))')
    # Without origin markers the two X constituents collapse into one.
    tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))', parse_leaf=int)
    assert str(mergediscnodes(
        splitdiscnodes(tree))) == ('(S (X (A 0) (A 1) (A 2) (A 3)))')
def brackettree(treestr, sent, brackets, strtermre):
    """Parse a single tree presented in bracket format.

    Handles both trees with word terminals and discontinuous trees whose
    terminals are integer indices; ``sent`` may be None or empty.

    :param treestr: the tree in bracket notation.
    :param sent: the sentence as a space-separated string, or None/empty;
        only consulted when the tree's terminals are all indices.
    :param brackets: two-character string with open/close bracket characters.
    :param strtermre: compiled regex matching a non-index terminal; used to
        decide which of the two input conventions applies.
    :returns: tuple ``(tree, sent)`` where sent is a list of token strings.
    """
    if strtermre.search(treestr):  # terminals are not all indices
        # Replace frontier non-terminals with ellipsis placeholders, then
        # collect the word terminals and renumber them with a counter.
        treestr = FRONTIERNTRE.sub(' ...)', treestr)
        sent = TERMINALSRE.findall(treestr)
        cnt = count()
        tree = Tree.parse(treestr, brackets=brackets,
                parse_leaf=lambda x: next(cnt))
    else:  # disc. trees with integer indices as terminals
        tree = Tree.parse(treestr, parse_leaf=int, brackets=brackets)
        # Fix: guard against sent being None (the docstring allows it);
        # previously ``sent.strip()`` raised AttributeError on None.
        # Also materialize the fallback as a list so both branches return
        # the same type (the map iterator could only be consumed once).
        sent = (sent.split() if sent and sent.strip()
                else [str(a) for a in range(max(tree.leaves()) + 1)])
    return tree, sent
def __init__(self, tree, sent=None, highlight=(), abbr=False):
    """Prepare a tree for drawing.

    :param tree: a Tree object or a string in bracket notation.
    :param sent: optional list of tokens; when None, a sentence is
        reconstructed from the tree's leaves.
    :param highlight: nodes to highlight (passed on to nodecoords).
    :param abbr: when True, truncate node labels longer than 5 characters.
    """
    self.tree = tree
    self.sent = sent
    if isinstance(tree, basestring):  # py2 idiom; accepts str/unicode
        # Leaves are parsed as indices only when a sentence is supplied.
        self.tree = Tree.parse(tree, parse_leaf=None
                if sent is None else int)
    if sent is None:
        leaves = self.tree.leaves()
        if (leaves and not any(len(a) == 0 for a in self.tree.subtrees())
                and all(isinstance(a, int) for a in leaves)):
            # All leaves are indices and there are no empty nodes:
            # use the indices themselves as tokens.
            self.sent = [str(a) for a in leaves]
        else:
            # this deals with empty nodes (frontier non-terminals)
            # and multiple/mixed terminals under non-terminals.
            # Work on a copy so the caller's tree is not mutated.
            self.tree = self.tree.copy(True)
            self.sent = []
            for a in self.tree.subtrees():
                if len(a) == 0:
                    # Frontier non-terminal: give it a fresh index and a
                    # None token as placeholder.
                    a.append(len(self.sent))
                    self.sent.append(None)
                elif any(not isinstance(b, Tree) for b in a):
                    # Replace each terminal with an index into self.sent.
                    for n, b in enumerate(a):
                        if not isinstance(b, Tree):
                            a[n] = len(self.sent)
                            self.sent.append('%s' % b)
    if abbr:
        # Copy before mutating labels, unless we already made a copy above.
        if self.tree is tree:
            self.tree = self.tree.copy(True)
        for n in self.tree.subtrees(lambda x: len(x.label) > 5):
            n.label = n.label[:4] + u'\u2026'  # unicode '...' ellipsis
    self.highlight = set()
    self.nodes, self.coords, self.edges = self.nodecoords(
        self.tree, self.sent, highlight)
def test():
    """ Simple demonstration. """
    # Two small trees differing by one node move: distance 2 expected
    # from both the original and the reimplemented algorithm.
    a = Tree.parse("(f (d (a 0) (c (b 1))) (e 2))", parse_leaf=int)
    b = Tree.parse("(f (c (d (a 0) (b 1)) (e 2)))", parse_leaf=int)
    result1 = treedist(a, b, debug=True)
    assert result1 == 2
    print('%s\n%s\ndistance: %d' % (a, b, result1))
    result2 = newtreedist(a, b, debug=True)
    assert result2 == 2
    print('%s\n%s\ndistance: %d' % (a, b, result2))
    # A slightly larger pair with expected distance 3.
    a = Tree.parse("(f (d (x (a 0)) (b 1) (c 2)) (z 3))", parse_leaf=int)
    b = Tree.parse("(f (c (d (a 0) (x (b 1)) (c 2)) (z 3)))", parse_leaf=int)
    result1 = treedist(a, b, debug=True)
    assert result1 == 3
    print('%s\n%s\ndistance: %d' % (a, b, result1))
    result2 = newtreedist(a, b, debug=True)
    assert result2 == 3
    print('%s\n%s\ndistance: %d' % (a, b, result2))
def test_grammar(debug=False):
    """Demonstrate grammar extraction."""
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers
    from discodop.disambiguation import getderivations, marginalize
    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(corpus.sents().values())
    # Binarize the first 10 trees and mark fan-out for LCFRS extraction.
    trees = [
        addfanoutmarkers(binarize(a.copy(True), horzmarkov=1))
        for a in list(corpus.trees().values())[:10]
    ]
    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')
    grammar = Grammar(dopreduction(trees[:2], sents[:2])[0],
            start=trees[0].label)
    if debug:
        print(grammar)
    _ = grammar.testgrammar()
    # Extract a double-DOP grammar from the same trees.
    grammarx, _backtransform, _, _ = doubledop(trees, sents,
            debug=False, numproc=1)
    if debug:
        print('\ndouble dop grammar')
    grammar = Grammar(grammarx, start=trees[0].label)
    grammar.getmapping(None, striplabelre=None,
            neverblockre=re.compile('^#[0-9]+|.+}<'),
            splitprune=False, markorigin=False)
    if debug:
        print(grammar)
    result, msg = grammar.testgrammar()
    assert result, 'RFE should sum to 1.\n%s' % msg
    # Parse each corpus sentence with the double-DOP grammar and
    # marginalize derivations into parse trees.
    for tree, sent in zip(corpus.trees().values(), sents):
        if debug:
            print('sentence:', ' '.join(a.encode('unicode-escape').decode()
                    for a in sent))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        if debug:
            print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')
        if chart:
            getderivations(chart, 100)
            _parses, _msg = marginalize('mpp', chart)
        elif debug:
            print('no parse\n', chart)
    if debug:
        print()
    # Regression check: a deep unary chain should not break extraction.
    tree = Tree.parse('(ROOT (S (F (E (S (C (B (A 0))))))))', parse_leaf=int)
    Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))
def test_mergedicsnodes(self):
    # NOTE(review): name looks like a typo for 'test_mergediscnodes';
    # kept because test discovery depends on the method name.
    """Round-trip splitdiscnodes/mergediscnodes on discontinuous trees."""
    tree = Tree.parse('(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4)'
            '(VVPP 5)) (VAINF 6)) (VMFIN 3))', parse_leaf=int)
    # Merging after splitting restores the original (no origin markers).
    assert str(mergediscnodes(splitdiscnodes(tree))) == (
        '(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4) (VVPP 5)) '
        '(VAINF 6)) (VMFIN 3))')
    # Same round-trip with origin markers.
    assert str(mergediscnodes(splitdiscnodes(tree, markorigin=True))) == (
        '(S (VP (VP (PP (APPR 0) (ART 1) (NN 2)) (CARD 4) (VVPP 5)) '
        '(VAINF 6)) (VMFIN 3))')
    # Interleaved constituents need origin markers to be reconstructed.
    tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))', parse_leaf=int)
    assert str(mergediscnodes(splitdiscnodes(tree, markorigin=True))) == (
        '(S (X (A 0) (A 2)) (X (A 1) (A 3)))')
    # The split encoding labels blocks with component indices (X*0, X*1).
    tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))', parse_leaf=int)
    assert str(splitdiscnodes(tree, markorigin=True)) == (
        '(S (X*0 (A 0)) (X*0 (A 1)) (X*1 (A 2)) (X*1 (A 3)))')
    # Without markers, interleaved constituents merge into one.
    tree = Tree.parse('(S (X (A 0) (A 2)) (X (A 1) (A 3)))', parse_leaf=int)
    assert str(mergediscnodes(splitdiscnodes(tree))) == (
        '(S (X (A 0) (A 1) (A 2) (A 3)))')
def noparse(self, stage, sent, tags, lastsuccessfulparse):
    """Return parse from previous stage or a dummy parse.

    :param stage: the current parsing stage (supplies the start symbol).
    :param sent: the tokenized sentence.
    :param tags: POS tags for the sentence, or None/empty.
    :param lastsuccessfulparse: tree from an earlier successful stage,
        or None.
    :returns: tuple ``(parsetree, prob, noparse)`` with prob fixed at 1.0
        and noparse always True.
    """
    # Prefer the parse produced by an earlier, successful stage.
    if lastsuccessfulparse is not None:
        return lastsuccessfulparse.copy(True), 1.0, True
    # Otherwise build a flat dummy parse for evaluation purposes,
    # using 'NONE' tags when no real tags are available.
    postags = tags if tags else ['NONE'] * len(sent)
    default = defaultparse(list(enumerate(postags)))
    dummy = Tree.parse('(%s %s)' % (stage.grammar.start, default),
            parse_leaf=int)
    return dummy, 1.0, True
def postprocess(self, treestr, stage=-1, derivs=None):
    """ Take parse tree and apply postprocessing.

    :param treestr: the parse tree in bracket notation with index leaves.
    :param stage: index of the stage whose settings to use (default: last).
    :param derivs: optional mapping from tree strings to fragments.
    :returns: tuple ``(parsetree, fragments, False)``.
    """
    parsetree = Tree.parse(treestr, parse_leaf=int)
    # NB: these transformations are applied in-place and their order
    # matters: split-node merging must precede the main unbinarization.
    if self.stages[stage].split:
        mergediscnodes(unbinarize(parsetree, childchar=':'))
    saveheads(parsetree, self.tailmarker)
    unbinarize(parsetree)
    removefanoutmarkers(parsetree)
    if self.relationalrealizational:
        # Undo the relational-realizational transform.
        parsetree = rrbacktransform(parsetree,
                self.relationalrealizational['adjunctionlabel'])
    if self.transformations:
        reversetransform(parsetree, self.transformations)
    # Look up fragments for this tree when derivations were supplied.
    fragments = derivs.get(treestr) if derivs else None
    return parsetree, fragments, False
def postprocess(self, treestr, stage=-1):
    """Take parse tree and apply postprocessing.

    :param treestr: the parse tree in bracket notation with index leaves.
    :param stage: index of the stage whose settings to use (default: last).
    :returns: tuple ``(parsetree, False)``.
    """
    parsetree = Tree.parse(treestr, parse_leaf=int)
    # In-place transformations; ordering matters (merge split nodes
    # before the main unbinarization pass).
    if self.stages[stage].split:
        mergediscnodes(unbinarize(parsetree, childchar=':',
                expandunary=False))
    saveheads(parsetree, self.binarization.tailmarker)
    unbinarize(parsetree, expandunary=False)
    removefanoutmarkers(parsetree)
    if self.relationalrealizational:
        # Undo the relational-realizational transform.
        parsetree = rrbacktransform(parsetree,
                self.relationalrealizational['adjunctionlabel'])
    if self.transformations:
        reversetransform(parsetree, self.transformations)
    return parsetree, False
def trees(self, query, subset=None, maxresults=10, nofunc=False, nomorph=False): subset = subset or self.files # %s the sentence number # %w complete tree in bracket notation # %h the matched subtree in bracket notation fmt = r'%s:::%w:::%h\n' result = [] jobs = {} for filename in subset: try: x, maxresults2 = self.cache['trees', query, filename, nofunc, nomorph] except KeyError: maxresults2 = 0 if not maxresults or maxresults > maxresults2: jobs[self._submit(lambda x: list(self._query( query, x, fmt, maxresults)), filename)] = filename else: result.extend(x[:maxresults]) for future in self._as_completed(jobs): filename = jobs[future] x = [] for sentno, line in future.result(): treestr, match = line.split(':::') treestr = filterlabels(treestr, nofunc, nomorph) treestr = treestr.replace(" )", " -NONE-)") cnt = count() if match.startswith('('): treestr = treestr.replace(match, '%s_HIGH %s' % tuple( match.split(None, 1)), 1) else: match = ' %s)' % match treestr = treestr.replace(match, '_HIGH%s' % match) tree = Tree.parse(treestr, parse_leaf=lambda _: next(cnt)) sent = re.findall(r" +([^ ()]+)(?=[ )])", treestr) high = list(tree.subtrees(lambda n: n.label.endswith("_HIGH"))) if high: high = high.pop() high.label = high.label.rsplit("_", 1)[0] high = list(high.subtrees()) + high.leaves() x.append((filename, sentno, tree, sent, high)) self.cache['trees', query, filename, nofunc, nomorph] = x, maxresults result.extend(x) return result
def test_balancedpunctraise(self):
    """Test that punctuation raising removes discontinuity caused by
    punctuation: after punctraise + balancedpunctraise all constituents
    should have fan-out 1."""
    # Tree with punctuation attached at the root, causing crossing
    # branches relative to the constituents.
    tree = ParentedTree.parse('(ROOT ($, 3) ($[ 7) ($[ 13) ($, 14) ($, 20)'
            ' (S (NP (ART 0) (ADJA 1) (NN 2) (NP (CARD 4) (NN 5) (PP'
            ' (APPR 6) (CNP (NN 8) (ADV 9) (ISU ($. 10) ($. 11)'
            ' ($. 12))))) (S (PRELS 15) (MPN (NE 16) (NE 17)) (ADJD 18)'
            ' (VVFIN 19))) (VVFIN 21) (ADV 22) (NP (ADJA 23) (NN 24)))'
            ' ($. 25))', parse_leaf=int)
    sent = ("Die zweite Konzertreihe , sechs Abende mit ' Orgel plus "
            ". . . ' , die Hayko Siemens musikalisch leitet , bietet "
            "wieder ungewoehnliche Kombinationen .".split())
    punctraise(tree, sent)
    balancedpunctraise(tree, sent)
    # All constituents must now be continuous (fan-out 1).
    assert max(map(fanout, addbitsets(tree).subtrees())) == 1
    # Sanity check: the same sentence without punctuation is already
    # continuous everywhere.
    nopunct = Tree.parse('(ROOT (S (NP (ART 0) (ADJA 1) (NN 2) (NP '
            '(CARD 3) (NN 4) (PP (APPR 5) (CNP (NN 6) (ADV 7)))) (S '
            '(PRELS 8) (MPN (NE 9) (NE 10)) (ADJD 11) (VVFIN 12))) '
            '(VVFIN 13) (ADV 14) (NP (ADJA 15) (NN 16))))', parse_leaf=int)
    assert max(map(fanout, addbitsets(nopunct).subtrees())) == 1
def parse():
    """Parse sentence and return a textual representation of a parse tree.

    Output is either in a HTML fragment or in plain text. To be invoked by
    an AJAX call."""
    # Read request parameters with sensible defaults.
    sent = request.args.get('sent', None)
    objfun = request.args.get('objfun', 'mpp')
    est = request.args.get('est', 'rfe')
    marg = request.args.get('marg', 'nbest')
    coarse = request.args.get('coarse', 'pcfg')
    html = 'html' in request.args
    lang = request.args.get('lang', 'detect')
    require = request.args.get('require', None)
    block = request.args.get('block', None)
    if not sent:
        return ''
    nbest = None
    # Accept pre-tagged input of the form word/TAG word/TAG ...
    if POSTAGS.match(sent):
        senttok, tags = zip(*(a.rsplit('/', 1) for a in sent.split()))
    else:
        senttok, tags = tuple(tokenize(sent)), None
    if not senttok or not 1 <= len(senttok) <= LIMIT:
        return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT)
    if lang == 'detect':
        lang = guesslang(senttok)
    elif lang not in PARSERS:
        return 'unknown language %r; languages: %r' % (lang, PARSERS.keys())
    # Normalize require/block constraints into hashable tuples so they
    # can be part of the cache key.
    if require:
        require = tuple((label, tuple(indices))
                for label, indices in sorted(json.loads(require)))
    if block:
        block = tuple((label, tuple(indices))
                for label, indices in sorted(json.loads(block)))
    key = (senttok, tags, est, marg, objfun, coarse, lang, require, block)
    resp = CACHE.get(key)
    if resp is None:
        urlparams = dict(sent=sent, est=est, marg=marg, objfun=objfun,
                coarse=coarse, html=html)
        if require:
            urlparams['require'] = json.dumps(require)
        if block:
            urlparams['block'] = json.dumps(block)
        link = 'parse?' + url_encode(urlparams)
        # Configure the last stage according to the request.
        PARSERS[lang].stages[-1].estimator = est
        PARSERS[lang].stages[-1].objective = objfun
        PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both')
        PARSERS[lang].stages[-1].sample = marg in ('sample', 'both')
        if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse:
            PARSERS[lang].stages[0].mode = (
                'pcfg' if coarse == 'pcfg-posterior' else coarse)
            if len(PARSERS[lang].stages) > 1:
                PARSERS[lang].stages[1].k = (1e-5
                        if coarse == 'pcfg-posterior' else 50)
        results = list(PARSERS[lang].parse(
            senttok, tags=tags, require=require, block=block))
        if results[-1].noparse:
            parsetrees = []
            result = 'no parse!'
            nbest = dep = depsvg = ''
        else:
            if SHOWMORPH:
                replacemorph(results[-1].parsetree)
            if SHOWFUNC:
                treebank.handlefunctions('add', results[-1].parsetree,
                        pos=True)
            tree = str(results[-1].parsetree)
            prob = results[-1].prob
            parsetrees = results[-1].parsetrees or []
            # Keep only the 10 most probable parse trees.
            parsetrees = heapq.nlargest(10, parsetrees, key=itemgetter(1))
            parsetrees_ = []
            APP.logger.info('[%s] %s', probstr(prob), tree)
            tree = Tree.parse(tree, parse_leaf=int)
            result = Markup(DrawTree(tree, senttok).text(
                unicodelines=True, html=html, funcsep='-'))
            # Postprocess each candidate tree for display.
            for tree, prob, x in parsetrees:
                tree = PARSERS[lang].postprocess(tree, senttok, -1)[0]
                if SHOWMORPH:
                    replacemorph(tree)
                if SHOWFUNC:
                    treebank.handlefunctions('add', tree, pos=True)
                parsetrees_.append((tree, prob, x))
            if PARSERS[lang].headrules:
                # Derive a dependency structure from the best parse.
                xtree = PARSERS[lang].postprocess(
                    parsetrees[0][0], senttok, -1)[0]
                dep = treebank.writedependencies(xtree, senttok, 'conll')
                depsvg = Markup(DrawDependencies.fromconll(dep).svg())
            else:
                dep = depsvg = ''
            rid = randid()
            # HTML for the n-best list; each entry has a toggleable span
            # showing the fragments of its best derivation.
            nbest = Markup('\n\n'.join('%d. [%s] '
                    '<a href=\'javascript: toggle("f%s%d"); \'>'
                    'derivation</a>\n'
                    '<span id=f%s%d style="display: none; margin-left: 3em; ">'
                    'Fragments used in the highest ranked derivation'
                    ' of this parse tree:\n%s</span>\n%s' % (
                        n + 1, probstr(prob), rid, n + 1, rid, n + 1,
                        '\n\n'.join('%s\n%s' % (w,
                            DrawTree(frag).text(unicodelines=True, html=html))
                            for frag, w in fragments or ()
                            # if frag.count('(') > 1
                            ),
                        DrawTree(tree, senttok).text(
                            unicodelines=True, html=html, funcsep='-'))
                    for n, (tree, prob, fragments) in enumerate(parsetrees_)))
        msg = '\n'.join(stage.msg for stage in results)
        elapsed = [stage.elapsedtime for stage in results]
        elapsed = 'CPU time elapsed: %s => %gs' % (
            ' '.join('%gs' % a for a in elapsed), sum(elapsed))
        info = '\n'.join(('length: %d; lang=%s; est=%s; objfun=%s; marg=%s' % (
            len(senttok), lang, est, objfun, marg), msg, elapsed,
            '10 most probable parse trees:',
            ''.join('%d. [%s] %s' % (n + 1, probstr(prob),
                    writediscbrackettree(tree, senttok))
                for n, (tree, prob, _) in enumerate(parsetrees)) + '\n'))
        CACHE.set(key, (sent, result, nbest, info, link, dep, depsvg),
                timeout=5000)
    else:
        (sent, result, nbest, info, link, dep, depsvg) = resp
    if html:
        return render_template('parsetree.html', sent=sent, result=result,
                nbest=nbest, info=info, link=link, dep=dep, depsvg=depsvg,
                randid=randid())
    else:
        return Response('\n'.join((nbest, info, result)),
                mimetype='text/plain')
def getfragments(trees, sents, numproc=1, iterate=False, complement=False):
    """ Get recurring fragments with exact counts in a single treebank.

    :returns: a dictionary whose keys are fragments as strings, and
        frequencies / indices as values.
    :param trees: a sequence of binarized Tree objects.
    :param sents: corresponding sentences (lists of tokens).
    :param numproc: number of worker processes; 0 means use all CPUs.
    :param iterate: also collect fragments of the extracted fragments.
    :param complement: passed through to the fragment extraction PARAMS.
    """
    if numproc == 0:
        numproc = cpu_count()
    numtrees = len(trees)
    assert numtrees
    mult = 1  # 3 if numproc > 1 else 1
    fragments = {}
    trees = trees[:]
    work = workload(numtrees, mult, numproc)
    PARAMS.update(disc=True, indices=True, approx=False, complete=False,
            quadratic=False, complement=complement)
    if numproc == 1:
        # Single-process: run everything in this process.
        initworkersimple(trees, list(sents))
        mymap = map
        myapply = APPLY
    else:
        logging.info("work division:\n%s", "\n".join(" %s: %r" % kv
                for kv in sorted(dict(numchunks=len(work),
                    numproc=numproc).items())))
        # start worker processes
        pool = Pool(processes=numproc, initializer=initworkersimple,
                initargs=(trees, list(sents)))
        mymap = pool.map
        myapply = pool.apply
    # collect recurring fragments
    logging.info("extracting recurring fragments")
    for a in mymap(worker, work):
        fragments.update(a)
    # add 'cover' fragments corresponding to single productions
    cover = myapply(coverfragworker, ())
    before = len(fragments)
    fragments.update(cover)
    logging.info("merged %d unseen cover fragments",
            len(fragments) - before)
    fragmentkeys = list(fragments)
    bitsets = [fragments[a] for a in fragmentkeys]
    # Divide the bitsets into roughly equal chunks for exact counting.
    countchunk = len(bitsets) // numproc + 1
    work = list(range(0, len(bitsets), countchunk))
    work = [(n, len(work), bitsets[a:a + countchunk])
            for n, a in enumerate(work)]
    logging.info("getting exact counts for %d fragments", len(bitsets))
    counts = []
    for a in mymap(exactcountworker, work):
        counts.extend(a)
    if numproc != 1:
        # Shut down the worker pool before any iteration phase.
        pool.close()
        pool.join()
        del pool
    if iterate:  # optionally collect fragments of fragments
        logging.info("extracting fragments of recurring fragments")
        PARAMS['complement'] = False  # needs to be turned off if it was on
        newfrags = fragments
        trees, sents = None, None
        ids = count()
        for _ in range(10):  # up to 10 iterations
            # Re-binarize the fragments so they can be treated as a
            # treebank in their own right.
            newtrees = [binarize(
                    introducepreterminals(Tree.parse(tree, parse_leaf=int),
                        ids=ids), childchar="}") for tree, _ in newfrags]
            # Replace frontier (None) tokens with fresh placeholder words.
            newsents = [["#%d" % next(ids) if word is None else word
                    for word in sent] for _, sent in newfrags]
            newfrags, newcounts = iteratefragments(
                    fragments, newtrees, newsents, trees, sents, numproc)
            if len(newfrags) == 0:
                break
            if trees is None:
                trees = []
                sents = []
            trees.extend(newtrees)
            sents.extend(newsents)
            fragmentkeys.extend(newfrags)
            counts.extend(newcounts)
            fragments.update(zip(newfrags, newcounts))
    logging.info("found %d fragments", len(fragmentkeys))
    return dict(zip(fragmentkeys, counts))
def parse():
    """ Parse sentence and return a textual representation of a parse tree,
    in a HTML fragment or plain text. To be invoked by an AJAX call."""
    # Read request parameters with defaults.
    sent = request.args.get('sent', None)
    est = request.args.get('est', 'dop1')
    marg = request.args.get('marg', 'nbest')
    objfun = request.args.get('objfun', 'mpp')
    coarse = request.args.get('coarse', None)
    html = 'html' in request.args
    lang = request.args.get('lang', 'detect')
    if not sent:
        return ''
    frags = nbest = None
    senttok = tokenize(sent)
    if not senttok or not 1 <= len(senttok) <= LIMIT:
        return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT)
    if lang == 'detect':
        lang = guesslang(senttok)
    elif lang not in PARSERS:
        return 'unknown language %r; languages: %r' % (lang, PARSERS.keys())
    key = (senttok, est, marg, objfun, coarse, lang, html)
    # Fix: fetch the cached response once instead of two separate lookups.
    cached = CACHE.get(key)
    if cached is not None:
        return cached
    link = url_encode(dict(sent=sent, est=est, marg=marg,
            objfun=objfun, coarse=coarse, html=html))
    # Configure the final stage according to the request.
    PARSERS[lang].stages[-1].estimator = est
    PARSERS[lang].stages[-1].objective = objfun
    PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both')
    PARSERS[lang].stages[-1].sample = marg in ('sample', 'both')
    if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse:
        PARSERS[lang].stages[0].mode = coarse
        # Fix: guard against single-stage parsers; unconditional access to
        # stages[1] raised IndexError (the other variants of this view
        # already check len(stages) > 1).
        if len(PARSERS[lang].stages) > 1:
            PARSERS[lang].stages[1].k = (1e-5
                    if coarse == 'pcfg-posterior' else 50)
    results = list(PARSERS[lang].parse(senttok))
    if results[-1].noparse:
        parsetrees = {}
        result = 'no parse!'
        frags = nbest = ''
    else:
        if PARSERS[lang].relationalrealizational:
            treebank.handlefunctions('add', results[-1].parsetree, pos=True)
        tree = str(results[-1].parsetree)
        prob = results[-1].prob
        parsetrees = results[-1].parsetrees or {}
        # Keep the 10 most probable parse trees.
        parsetrees = heapq.nlargest(10, parsetrees.items(),
                key=itemgetter(1))
        fragments = results[-1].fragments or ()
        # Fix: use lazy logger formatting instead of eager '%'.
        APP.logger.info('[%s] %s', probstr(prob), tree)
        tree = Tree.parse(tree, parse_leaf=int)
        result = Markup(DrawTree(tree, senttok, abbr=True).text(
            unicodelines=True, html=html))
        frags = Markup('Phrasal fragments used in the most probable'
                ' derivation of the highest ranked parse tree:\n'
                + '\n\n'.join(
                DrawTree(Tree.parse(frag, parse_leaf=int), terminals).text(
                    unicodelines=True, html=html)
                for frag, terminals in fragments))
        nbest = Markup('\n\n'.join('%d. [%s]\n%s' % (n + 1, probstr(prob),
                DrawTree(PARSERS[lang].postprocess(tree)[0], senttok,
                    abbr=True).text(unicodelines=True, html=html))
                for n, (tree, prob) in enumerate(parsetrees)))
    msg = '\n'.join(stage.msg for stage in results)
    elapsed = [stage.elapsedtime for stage in results]
    elapsed = 'CPU time elapsed: %s => %gs' % (
        ' '.join('%gs' % a for a in elapsed), sum(elapsed))
    info = '\n'.join(('length: %d; lang=%s; est=%s; objfun=%s; marg=%s' % (
        len(senttok), lang, est, objfun, marg), msg, elapsed,
        '10 most probable parse trees:',
        '\n'.join('%d. [%s] %s' % (n + 1, probstr(prob), tree)
            for n, (tree, prob) in enumerate(parsetrees)) + '\n'))
    # Cache the rendered response and return it.
    if html:
        CACHE.set(key, render_template('parsetree.html', sent=sent,
                result=result, frags=frags, nbest=nbest, info=info,
                link=link, randid=randid()), timeout=5000)
    else:
        CACHE.set(key, Response('\n'.join((nbest, frags, info, result)),
                mimetype='text/plain'), timeout=5000)
    return CACHE.get(key)
def test_grammar(debug=False):
    """Demonstrate grammar extraction."""
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers, removefanoutmarkers
    from discodop.disambiguation import recoverfragments
    from discodop.kbest import lazykbest
    from math import exp
    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(corpus.sents().values())
    # Binarize the first 10 trees and mark fan-out for LCFRS extraction.
    trees = [addfanoutmarkers(binarize(a.copy(True), horzmarkov=1))
            for a in list(corpus.trees().values())[:10]]
    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')
    grammar = Grammar(dopreduction(trees[:2], sents[:2])[0],
            start=trees[0].label)
    if debug:
        print(grammar)
    _ = grammar.testgrammar()
    # Extract a double-DOP grammar; backtransform recovers fragments.
    grammarx, backtransform, _, _ = doubledop(trees, sents,
            debug=debug, numproc=1)
    if debug:
        print('\ndouble dop grammar')
    grammar = Grammar(grammarx, start=trees[0].label)
    # NB: bytes pattern; this variant targets a bytes-based label API.
    grammar.getmapping(grammar, striplabelre=None,
            neverblockre=re.compile(b'^#[0-9]+|.+}<'),
            splitprune=False, markorigin=False)
    if debug:
        print(grammar)
    assert grammar.testgrammar()[0], "RFE should sum to 1."
    for tree, sent in zip(corpus.trees().values(), sents):
        if debug:
            print("sentence:", ' '.join(a.encode('unicode-escape').decode()
                    for a in sent))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        if debug:
            print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')
        if chart:
            # Recover parse trees from the k-best derivations and
            # accumulate their probabilities (MPP).
            mpp, parsetrees = {}, {}
            derivations, _ = lazykbest(chart, 1000, b'}<')
            for d, (t, p) in zip(chart.rankededges[chart.root()],
                    derivations):
                r = Tree(recoverfragments(d.key, chart, backtransform))
                r = str(removefanoutmarkers(unbinarize(r)))
                mpp[r] = mpp.get(r, 0.0) + exp(-p)
                parsetrees.setdefault(r, []).append((t, p))
            if debug:
                print(len(mpp), 'parsetrees',
                        sum(map(len, parsetrees.values())), 'derivations')
            for t, tp in sorted(mpp.items(), key=itemgetter(1)):
                if debug:
                    print(tp, t, '\nmatch:', t == str(tree))
                # Each derivation of a parse tree must be unique.
                if len(set(parsetrees[t])) != len(parsetrees[t]):
                    print('chart:\n', chart)
                    assert len(set(parsetrees[t])) == len(parsetrees[t])
                if debug:
                    for deriv, p in sorted(parsetrees[t], key=itemgetter(1)):
                        print(' <= %6g %s' % (exp(-p), deriv))
        elif debug:
            print('no parse\n', chart)
        if debug:
            print()
    # Regression check: deep unary chain should not break extraction.
    tree = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
    Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))
def test_grammar(debug=False):
    """Demonstrate grammar extraction."""
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers, removefanoutmarkers
    from discodop.disambiguation import recoverfragments
    from discodop.kbest import lazykbest
    from math import exp
    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(corpus.sents().values())
    # Binarize the first 10 trees and mark fan-out for LCFRS extraction.
    trees = [
        addfanoutmarkers(binarize(a.copy(True), horzmarkov=1))
        for a in list(corpus.trees().values())[:10]
    ]
    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')
    grammar = Grammar(dopreduction(trees[:2], sents[:2])[0],
            start=trees[0].label)
    if debug:
        print(grammar)
    _ = grammar.testgrammar()
    # Extract a double-DOP grammar; backtransform recovers fragments.
    grammarx, backtransform, _, _ = doubledop(trees, sents,
            debug=False, numproc=1)
    if debug:
        print('\ndouble dop grammar')
    grammar = Grammar(grammarx, start=trees[0].label)
    # NB: str pattern; this variant targets a str-based label API.
    grammar.getmapping(grammar, striplabelre=None,
            neverblockre=re.compile('^#[0-9]+|.+}<'),
            splitprune=False, markorigin=False)
    if debug:
        print(grammar)
    assert grammar.testgrammar()[0], "RFE should sum to 1."
    for tree, sent in zip(corpus.trees().values(), sents):
        if debug:
            print("sentence:", ' '.join(a.encode('unicode-escape').decode()
                    for a in sent))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        if debug:
            print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')
        if chart:
            # Recover parse trees from the k-best derivations and
            # accumulate their probabilities (MPP).
            mpp, parsetrees = {}, {}
            derivations, _ = lazykbest(chart, 1000, '}<')
            for d, (t, p) in zip(chart.rankededges[chart.root()],
                    derivations):
                r = Tree(recoverfragments(d.key, chart, backtransform))
                r = str(removefanoutmarkers(unbinarize(r)))
                mpp[r] = mpp.get(r, 0.0) + exp(-p)
                parsetrees.setdefault(r, []).append((t, p))
            if debug:
                print(len(mpp), 'parsetrees',
                        sum(map(len, parsetrees.values())), 'derivations')
            for t, tp in sorted(mpp.items(), key=itemgetter(1)):
                if debug:
                    print(tp, t, '\nmatch:', t == str(tree))
                # Each derivation of a parse tree must be unique.
                if len(set(parsetrees[t])) != len(parsetrees[t]):
                    print('chart:\n', chart)
                    assert len(set(parsetrees[t])) == len(parsetrees[t])
                if debug:
                    for deriv, p in sorted(parsetrees[t], key=itemgetter(1)):
                        print(' <= %6g %s' % (exp(-p), deriv))
        elif debug:
            print('no parse\n', chart)
        if debug:
            print()
    # Regression check: deep unary chain should not break extraction.
    tree = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
    Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))
def parse():
    """Parse sentence and return a textual representation of a parse tree.

    Output is either in a HTML fragment or in plain text. To be invoked by
    an AJAX call."""
    # Read request parameters with sensible defaults.
    sent = request.args.get('sent', None)
    objfun = request.args.get('objfun', 'mpp')
    est = request.args.get('est', 'rfe')
    marg = request.args.get('marg', 'nbest')
    coarse = request.args.get('coarse', 'pcfg')
    html = 'html' in request.args
    lang = request.args.get('lang', 'detect')
    require = request.args.get('require', None)
    block = request.args.get('block', None)
    if not sent:
        return ''
    nbest = None
    # Accept pre-tagged input of the form word/TAG word/TAG ...
    if POSTAGS.match(sent):
        senttok, tags = zip(*(a.rsplit('/', 1) for a in sent.split()))
    else:
        senttok, tags = tuple(tokenize(sent)), None
    if not senttok or not 1 <= len(senttok) <= LIMIT:
        return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT)
    if lang == 'detect':
        lang = guesslang(senttok)
    elif lang not in PARSERS:
        return 'unknown language %r; languages: %r' % (lang, PARSERS.keys())
    # Normalize require/block constraints into hashable tuples so they
    # can be part of the cache key.
    if require:
        require = tuple((label, tuple(indices))
                for label, indices in sorted(json.loads(require)))
    if block:
        block = tuple((label, tuple(indices))
                for label, indices in sorted(json.loads(block)))
    key = (senttok, tags, est, marg, objfun, coarse, lang, require, block)
    resp = CACHE.get(key)
    if resp is None:
        urlparams = dict(sent=sent, lang=lang, est=est, marg=marg,
                objfun=objfun, coarse=coarse, html=html)
        if require:
            urlparams['require'] = json.dumps(require)
        if block:
            urlparams['block'] = json.dumps(block)
        link = '?' + url_encode(urlparams)
        # Configure the final stage according to the request.
        PARSERS[lang].stages[-1].estimator = est
        PARSERS[lang].stages[-1].objective = objfun
        PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both')
        PARSERS[lang].stages[-1].sample = marg in ('sample', 'both')
        if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse:
            PARSERS[lang].stages[0].mode = ('pcfg'
                    if coarse == 'pcfg-posterior' else coarse)
            if len(PARSERS[lang].stages) > 1:
                PARSERS[lang].stages[1].k = (1e-5
                        if coarse == 'pcfg-posterior' else 50)
        results = list(PARSERS[lang].parse(senttok, tags=tags,
                require=require, block=block))
        if SHOWMORPH:
            replacemorph(results[-1].parsetree)
        if SHOWFUNC:
            treebank.handlefunctions('add', results[-1].parsetree, pos=True)
        tree = str(results[-1].parsetree)
        prob = results[-1].prob
        parsetrees = results[-1].parsetrees or []
        # Keep only the 10 most probable parse trees.
        parsetrees = heapq.nlargest(10, parsetrees, key=itemgetter(1))
        parsetrees_ = []
        LOG.info('[%s] %s', probstr(prob), tree)
        tree = Tree.parse(tree, parse_leaf=int)
        result = Markup(
            DrawTree(tree, senttok).text(unicodelines=True, html=html,
                funcsep='-'))
        # Postprocess each candidate tree for display.
        for tree, prob, x in parsetrees:
            tree = PARSERS[lang].postprocess(tree, senttok, -1)[0]
            if SHOWMORPH:
                replacemorph(tree)
            if SHOWFUNC:
                treebank.handlefunctions('add', tree, pos=True)
            parsetrees_.append((tree, prob, x))
        if PARSERS[lang].headrules:
            # Derive a dependency structure from the best parse.
            xtree = PARSERS[lang].postprocess(parsetrees[0][0],
                    senttok, -1)[0]
            dep = treebank.writedependencies(xtree, senttok, 'conll')
            depsvg = Markup(DrawDependencies.fromconll(dep).svg())
        else:
            dep = depsvg = ''
        rid = randid()
        # HTML for the n-best list; each entry has a toggleable span
        # showing the fragments of its best derivation.
        nbest = Markup('\n\n'.join(
            '%d. [%s] '
            '<a href=\'javascript: toggle("f%s%d"); \'>'
            'derivation</a>\n'
            '<span id=f%s%d style="display: none; margin-left: 3em; ">'
            'Fragments used in the highest ranked derivation'
            ' of this parse tree:\n%s</span>\n%s' % (
                n + 1, probstr(prob), rid, n + 1, rid, n + 1,
                '\n\n'.join(
                    '%s\n%s' % (w, DrawTree(frag).text(unicodelines=True,
                        html=html))
                    for frag, w in fragments or ()
                    # if frag.count('(') > 1
                    ),
                DrawTree(tree, senttok).text(
                    unicodelines=True, html=html, funcsep='-'))
            for n, (tree, prob, fragments) in enumerate(parsetrees_)))
        # Fragments of the best derivation, shown separately.
        deriv = Markup(
            'Fragments used in the highest ranked derivation'
            ' of best parse tree:\n%s' % (
                '\n\n'.join(
                    '%s\n%s' % (w, DrawTree(frag).text(unicodelines=True,
                        html=html))
                    for frag, w in parsetrees_[0][2] or ()
                    # if frag.count('(') > 1
                    ))) if parsetrees_ else ''
        msg = '\n'.join(stage.msg for stage in results)
        elapsed = [stage.elapsedtime for stage in results]
        elapsed = 'CPU time elapsed: %s => %gs' % (' '.join(
            '%gs' % a for a in elapsed), sum(elapsed))
        info = '\n'.join((
            'length: %d; lang=%s; est=%s; objfun=%s; marg=%s' % (
                len(senttok), lang, est, objfun, marg), msg, elapsed,
            '10 most probable parse trees:',
            ''.join('%d. [%s] %s' % (n + 1, probstr(prob),
                    writediscbrackettree(tree, senttok))
                for n, (tree, prob, _) in enumerate(parsetrees)) + '\n'))
        CACHE.set(key, (sent, result, nbest, deriv, info, link, dep, depsvg),
                timeout=5000)
    else:
        (sent, result, nbest, deriv, info, link, dep, depsvg) = resp
    if html:
        return render_template('parsetree.html', sent=sent, result=result,
                nbest=nbest, deriv=deriv, info=info, link=link, dep=dep,
                depsvg=depsvg, randid=randid())
    else:
        return Response('\n'.join((nbest, info, result)),
                mimetype='text/plain')
def parse():
    """Parse sentence and return a textual representation of a parse tree.

    Output is either in a HTML fragment or in plain text. To be invoked by
    an AJAX call."""
    # Read request parameters with defaults.
    sent = request.args.get('sent', None)
    est = request.args.get('est', 'rfe')
    marg = request.args.get('marg', 'nbest')
    objfun = request.args.get('objfun', 'mpp')
    coarse = request.args.get('coarse', None)
    html = 'html' in request.args
    lang = request.args.get('lang', 'detect')
    if not sent:
        return ''
    frags = nbest = None
    senttok = tokenize(sent)
    if not senttok or not 1 <= len(senttok) <= LIMIT:
        return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT)
    if lang == 'detect':
        lang = guesslang(senttok)
    elif lang not in PARSERS:
        return 'unknown language %r; languages: %r' % (lang, PARSERS.keys())
    key = (senttok, est, marg, objfun, coarse, lang)
    resp = CACHE.get(key)
    if resp is None:
        link = 'parse?' + url_encode(dict(sent=sent, est=est, marg=marg,
                objfun=objfun, coarse=coarse, html=html))
        # Configure the final stage according to the request.
        PARSERS[lang].stages[-1].estimator = est
        PARSERS[lang].stages[-1].objective = objfun
        PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both')
        PARSERS[lang].stages[-1].sample = marg in ('sample', 'both')
        if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse:
            PARSERS[lang].stages[0].mode = coarse
            # Fix: guard against single-stage parsers; unconditional
            # access to stages[1] raised IndexError (the other variants
            # of this view already check len(stages) > 1).
            if len(PARSERS[lang].stages) > 1:
                PARSERS[lang].stages[1].k = (1e-5
                        if coarse == 'pcfg-posterior' else 50)
        results = list(PARSERS[lang].parse(senttok))
        if results[-1].noparse:
            parsetrees = []
            result = 'no parse!'
            frags = nbest = ''
        else:
            if SHOWMORPH:
                # Replace POS labels with morphology tags for display.
                for node in results[-1].parsetree.subtrees(
                        lambda n: n and not isinstance(n[0], Tree)):
                    treebank.handlemorphology(
                        'replace', None, node, node.source)
                    node.label = node.label.replace('[]', '')
            if SHOWFUNC:
                treebank.handlefunctions('add', results[-1].parsetree,
                        pos=True)
            tree = str(results[-1].parsetree)
            prob = results[-1].prob
            parsetrees = results[-1].parsetrees or []
            # Keep only the 10 most probable parse trees.
            parsetrees = heapq.nlargest(10, parsetrees, key=itemgetter(1))
            parsetrees_ = []
            fragments = results[-1].fragments or ()
            APP.logger.info('[%s] %s', probstr(prob), tree)
            tree = Tree.parse(tree, parse_leaf=int)
            result = Markup(DrawTree(tree, senttok).text(
                unicodelines=True, html=html, funcsep='-'))
            frags = Markup('Phrasal fragments used in the most probable '
                    'derivation of the highest ranked parse tree:\n'
                    + '\n\n'.join(
                    DrawTree(frag).text(unicodelines=True, html=html)
                    for frag in fragments if frag.count('(') > 1))
            # Postprocess each candidate tree for display.
            for tree, prob, x in parsetrees:
                tree = PARSERS[lang].postprocess(tree, senttok, -1)[0]
                if SHOWMORPH:
                    for node in tree.subtrees(
                            lambda n: n and not isinstance(n[0], Tree)):
                        treebank.handlemorphology(
                            'replace', None, node, node.source)
                if SHOWFUNC:
                    treebank.handlefunctions('add', tree, pos=True)
                parsetrees_.append((tree, prob, x))
            nbest = Markup('\n\n'.join('%d. [%s]\n%s' % (n + 1,
                    probstr(prob),
                    DrawTree(tree, senttok).text(
                        unicodelines=True, html=html, funcsep='-'))
                    for n, (tree, prob, _) in enumerate(parsetrees_)))
        msg = '\n'.join(stage.msg for stage in results)
        elapsed = [stage.elapsedtime for stage in results]
        elapsed = 'CPU time elapsed: %s => %gs' % (
            ' '.join('%gs' % a for a in elapsed), sum(elapsed))
        info = '\n'.join(('length: %d; lang=%s; est=%s; objfun=%s; marg=%s' % (
            len(senttok), lang, est, objfun, marg), msg, elapsed,
            '10 most probable parse trees:',
            '\n'.join('%d. [%s] %s' % (n + 1, probstr(prob),
                    writediscbrackettree(tree, senttok))
                for n, (tree, prob, _) in enumerate(parsetrees)) + '\n'))
        CACHE.set(key, (sent, result, frags, nbest, info, link),
                timeout=5000)
    else:
        (sent, result, frags, nbest,  # pylint: disable=unpacking-non-sequence
                info, link) = resp
    if html:
        return render_template('parsetree.html', sent=sent, result=result,
                frags=frags, nbest=nbest, info=info, link=link,
                randid=randid())
    else:
        return Response('\n'.join((nbest, frags, info, result)),
                mimetype='text/plain')
def trees(form):
	""" Return visualization of parse trees in search results.

	Generator of HTML fragments: first a header with export links, then up
	to TREELIMIT rendered trees per text (matched subtree highlighted),
	and finally a closing message if nothing matched. """
	# TODO: show context of x sentences around result, offer pagination.
	gotresults = False
	for n, (_textno, results, stderr) in enumerate(
			doqueries(form, lines=True)):
		if n == 0:
			# NB: we do not hide function or morphology tags when exporting
			url = 'trees?query=%s&texts=%s&engine=%s&export=1' % (
					quote(form['query']), form['texts'],
					form.get('engine', 'tgrep2'))
			yield ('Query: %s\n'
					'Trees (showing up to %d per text; '
					'export: <a href="%s">plain</a>, '
					'<a href="%s">with line numbers</a>):\n' % (
						stderr, TREELIMIT, url, url + '&linenos=1'))
		for m, line in enumerate(islice(results, TREELIMIT)):
			# each result line is 'lineno:::text:::treestr:::match'
			lineno, text, treestr, match = line.split(":::")
			if m == 0:
				gotresults = True
				yield ("==> %s: [<a href=\"javascript: toggle('n%d'); \">"
						"toggle</a>]\n<span id=n%d>" % (text, n + 1, n + 1))
			if form.get('engine', 'tgrep2') == 'tgrep2':
				cnt = count()
				# mark empty terminals; then tag the matched subtree (or
				# terminal) with a _HIGH suffix so it can be found after
				# parsing the bracketing.
				treestr = treestr.replace(" )", " -NONE-)")
				match = match.strip()
				if match.startswith('('):
					treestr = treestr.replace(match, '%s_HIGH %s' % tuple(
							match.split(None, 1)))
				else:
					match = ' %s)' % match
					treestr = treestr.replace(match, '_HIGH%s' % match)
				tree = Tree.parse(treestr, parse_leaf=lambda _: next(cnt))
				sent = re.findall(r" +([^ ()]+)(?=[ )])", treestr)
				high = list(tree.subtrees(lambda n: n.label.endswith("_HIGH")))
				if high:
					# strip the marker again; highlight the whole subtree
					high = high.pop()
					high.label = high.label.rsplit("_", 1)[0]
					high = list(high.subtrees()) + high.leaves()
			elif form.get('engine', 'tgrep2') == 'xpath':
				tree, sent = treebank.alpinotree(
						ElementTree.fromstring(treestr))
						# morphology='replace')
				# collect highlighted nodes by id / word from the match XML
				highwords = re.findall('<node[^>]*begin="([0-9]+)"[^>]*/>',
						match)
				high = set(re.findall(r'\bid="(.+?)"', match))
				high = list(tree.subtrees(lambda n:
						n.source[treebank.PARENT] in high or
						n.source[treebank.WORD].lstrip('#') in high))
				high += [int(a) for a in highwords]
			try:
				treerepr = DrawTree(tree, sent, highlight=high).text(
						unicodelines=True, html=True)
			except ValueError as err:
				# show the problem instead of aborting the whole page
				line = "#%s \nERROR: %s\n%s\n%s\n" % (
						lineno, err, treestr, tree)
			else:
				line = "#%s\n%s\n" % (lineno, treerepr)
			yield line
		yield "</span>"
	if not gotresults:
		yield "No matches."
def test():
	"""Do some tests.

	Renders a collection of example trees (continuous and discontinuous,
	German/English/Dutch treebank samples) with DrawTree; trees are given
	with integer indices as leaves, sentences on matching lines.
	NOTE(review): continuation-line whitespace inside the literals is
	immaterial, as trees/sentences are tokenized on any whitespace."""
	trees = '''(ROOT (S (ADV 0) (VVFIN 1) (NP (PDAT 2) (NN 3)) (PTKNEG 4) \
(PP (APPRART 5) (NN 6) (NP (ART 7) (ADJA 8) (NN 9)))) ($. 10))
(S (NP (NN 1) (EX 3)) (VP (VB 0) (JJ 2)))
(S (VP (PDS 0) (ADV 3) (VVINF 4)) (PIS 2) (VMFIN 1))
(top (du (comp 0) (smain (noun 1) (verb 2) (inf (verb 8) (inf \
(adj 3) (pp (prep 4) (np (det 5) (noun 6))) (part 7) (verb 9) \
(pp (prep 10) (np (det 11) (noun 12) (pp (prep 13) (mwu \
(noun 14) (noun 15))))))))) (punct 16))
(top (smain (noun 0) (verb 1) (inf (verb 5) (inf (np (det 2) \
(adj 3) (noun 4)) (verb 6) (pp (prep 7) (noun 8))))) (punct 9))
(top (smain (noun 0) (verb 1) (noun 2) (inf (adv 3) (verb 4))) \
(punct 5))
(top (punct 5) (du (smain (noun 0) (verb 1) (ppart (np (det 2) \
(noun 3)) (verb 4))) (conj (sv1 (conj (noun 6) (vg 7) (np \
(det 8) (noun 9))) (verb 10) (noun 11) (part 12)) (vg 13) \
(sv1 (verb 14) (ti (comp 19) (inf (np (conj (det 15) (vg 16) \
(det 17)) (noun 18)) (verb 20)))))) (punct 21))
(top (punct 10) (punct 16) (punct 18) (smain (np (det 0) (noun 1) \
(pp (prep 2) (np (det 3) (noun 4)))) (verb 5) (adv 6) (np \
(noun 7) (noun 8)) (part 9) (np (det 11) (noun 12) (pp \
(prep 13) (np (det 14) (noun 15)))) (conj (vg 20) (ppres \
(adj 17) (pp (prep 22) (np (det 23) (adj 24) (noun 25)))) \
(ppres (adj 19)) (ppres (adj 21)))) (punct 26))
(top (punct 10) (punct 11) (punct 16) (smain (np (det 0) \
(noun 1)) (verb 2) (np (det 3) (noun 4)) (adv 5) (du (cp \
(comp 6) (ssub (noun 7) (verb 8) (inf (verb 9)))) (du \
(smain (noun 12) (verb 13) (adv 14) (part 15)) (noun 17)))) \
(punct 18) (punct 19))
(top (smain (noun 0) (verb 1) (inf (verb 8) (inf (verb 9) (inf \
(adv 2) (pp (prep 3) (noun 4)) (pp (prep 5) (np (det 6) \
(noun 7))) (verb 10))))) (punct 11))
(top (smain (noun 0) (verb 1) (pp (prep 2) (np (det 3) (adj 4) \
(noun 5) (rel (noun 6) (ssub (noun 7) (verb 10) (ppart \
(adj 8) (part 9) (verb 11))))))) (punct 12))
(top (smain (np (det 0) (noun 1)) (verb 2) (ap (adv 3) (num 4) \
(cp (comp 5) (np (det 6) (adj 7) (noun 8) (rel (noun 9) (ssub \
(noun 10) (verb 11) (pp (prep 12) (np (det 13) (adj 14) \
(adj 15) (noun 16))))))))) (punct 17))
(top (smain (np (det 0) (noun 1)) (verb 2) (adv 3) (pp (prep 4) \
(np (det 5) (noun 6)) (part 7))) (punct 8))
(top (punct 7) (conj (smain (noun 0) (verb 1) (np (det 2) \
(noun 3)) (pp (prep 4) (np (det 5) (noun 6)))) (smain \
(verb 8) (np (det 9) (num 10) (noun 11)) (part 12)) (vg 13) \
(smain (verb 14) (noun 15) (pp (prep 16) (np (det 17) \
(noun 18) (pp (prep 19) (np (det 20) (noun 21))))))) \
(punct 22))
(top (smain (np (det 0) (noun 1) (rel (noun 2) (ssub (np (num 3) \
(noun 4)) (adj 5) (verb 6)))) (verb 7) (ppart (verb 8) (pp \
(prep 9) (noun 10)))) (punct 11))
(top (conj (sv1 (np (det 0) (noun 1)) (verb 2) (ppart (verb 3))) \
(vg 4) (sv1 (verb 5) (pp (prep 6) (np (det 7) (adj 8) \
(noun 9))))) (punct 10))
(top (smain (noun 0) (verb 1) (np (det 2) (noun 3)) (inf (adj 4) \
(verb 5) (cp (comp 6) (ssub (noun 7) (adv 8) (verb 10) (ap \
(num 9) (cp (comp 11) (np (det 12) (adj 13) (noun 14) (pp \
(prep 15) (conj (np (det 16) (noun 17)) (vg 18) (np \
(noun 19))))))))))) (punct 20))
(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) \
(inf (verb 6) (conj (inf (pp (prep 2) (np (det 3) (noun 4))) \
(verb 7)) (inf (verb 9)) (vg 10) (inf (verb 11)))))) \
(punct 12))
(top (smain (verb 2) (noun 3) (adv 4) (ppart (np (det 0) \
(noun 1)) (verb 5))) (punct 6))
(top (conj (smain (np (det 0) (noun 1)) (verb 2) (adj 3) (pp \
(prep 4) (np (det 5) (noun 6)))) (vg 7) (smain (np (det 8) \
(noun 9) (pp (prep 10) (np (det 11) (noun 12)))) (verb 13) \
(pp (prep 14) (np (det 15) (noun 16))))) (punct 17))
(top (conj (smain (noun 0) (verb 1) (inf (ppart (np (noun 2) \
(noun 3)) (verb 4)) (verb 5))) (vg 6) (smain (noun 7) \
(inf (ppart (np (det 8) (noun 9)))))) (punct 10))
(A (B1 (t 6) (t 13)) (B2 (t 3) (t 7) (t 10)) (B3 (t 1) \
(t 9) (t 11) (t 14) (t 16)) (B4 (t 0) (t 5) (t 8)))
(A (B1 6 13) (B2 3 7 10) (B3 1 \
9 11 14 16) (B4 0 5 \
8))
(VP (VB 0) (PRT 2))
(VP (VP 0 3) (NP (PRP 1) (NN 2)))
(ROOT (S (VP_2 (PP (APPR 0) (ART 1) (NN 2) (PP (APPR 3) (ART 4) \
(ADJA 5) (NN 6))) (ADJD 10) (PP (APPR 11) (NN 12)) (VVPP 13)) \
(VAFIN 7) (NP (ART 8) (NN 9))) ($. 14))'''
	sents = '''Leider stehen diese Fragen nicht im Vordergrund der \
augenblicklichen Diskussion .
is Mary happy there
das muss man jetzt machen
Of ze had gewoon met haar vriendinnen rond kunnen slenteren in de \
buurt van Trafalgar Square .
Het had een prachtige dag kunnen zijn in Londen .
Cathy zag hen wild zwaaien .
Het was een spel geworden , zij en haar vriendinnen kozen iemand \
uit en probeerden zijn of haar nationaliteit te raden .
Elk jaar in het hoogseizoen trokken daar massa's toeristen \
voorbij , hun fototoestel in de aanslag , pratend , gillend \
en lachend in de vreemdste talen .
Haar vader stak zijn duim omhoog alsof hij wilde zeggen : " het \
komt wel goed , joch " .
Ze hadden languit naast elkaar op de strandstoelen kunnen gaan \
liggen .
Het hoorde bij de warme zomerdag die ze ginds achter had gelaten .
De oprijlaan was niet meer dan een hobbelige zandstrook die zich \
voortslingerde tussen de hoge grijze boomstammen .
Haar moeder kleefde bijna tegen het autoraampje aan .
Ze veegde de tranen uit haar ooghoeken , tilde haar twee koffers \
op en begaf zich in de richting van het landhuis .
Het meisje dat vijf keer juist raadde werd getrakteerd op ijs .
Haar neus werd platgedrukt en leek op een jonge champignon .
Cathy zag de BMW langzaam verdwijnen tot hij niet meer was dan \
een zilveren schijnsel tussen de bomen en struiken .
Ze had met haar moeder kunnen gaan winkelen , zwemmen of \
terrassen .
Dat werkwoord had ze zelf uitgevonden .
De middagzon hing klein tussen de takken en de schaduwen van de \
wolken drentelden over het gras .
Zij zou mams rug ingewreven hebben en mam de hare .
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
Mit einer Messe in der Sixtinischen Kapelle ist das Konklave \
offiziell zu Ende gegangen .'''
	# one tree / one sentence per (logical) line
	trees = [Tree.parse(a, parse_leaf=int) for a in trees.splitlines()]
	sents = [a.split() for a in sents.splitlines()]
	# sentences with None for unrealized positions (gaps in the yield)
	sents.extend([['Wake', None, 'up'],
			[None, 'your', 'friend', None]])
	for n, (tree, sent) in enumerate(zip(trees, sents)):
		drawtree = DrawTree(tree, sent)
		print('\ntree, sent', n, tree,
				' '.join('...' if a is None else a for a in sent),
				repr(drawtree), sep='\n')
		try:
			print(drawtree.text(unicodelines=True, ansi=True), sep='\n')
		except (UnicodeDecodeError, UnicodeEncodeError):
			# fall back to plain ASCII drawing on terminals w/o unicode
			print(drawtree.text(unicodelines=False, ansi=False), sep='\n')
def bitext():
	""" Bitext parsing with a synchronous CFG.

	Translation would require a special decoder (instead of normal kbest
	derivations where the whole sentence is given).

	Demonstrates two SCFGs: one induced from a toy parallel treebank, and
	one hand-written (binarized) grammar; both are checked with asserts on
	accepted/rejected bitext sentences. """
	print("bitext parsing with a synchronous CFG")
	# parallel trees: each preterminal dominates the source and target word
	# with their respective positions; '|' separates the two halves.
	trees = [Tree.parse(a, parse_leaf=int) for a in """\
(ROOT (S (NP (NNP (John 0) (John 7))) (VP (VB (misses 1) (manque 5))\
 (PP (IN (a` 6)) (NP (NNP (Mary 2) (Mary 4)))))) (SEP (| 3)))
(ROOT (S (NP (NNP (Mary 0) (Mary 4))) (VP (VB (likes 1) (aimes 5))\
 (NP (DT (la 6)) (NN (pizza 2) (pizza 7))))) (SEP (| 3)))""".split('\n')]
	sents = [["0"] * len(a.leaves()) for a in trees]
	for a in trees:
		treetransforms.binarize(a)
	compiled_scfg = Grammar(treebankgrammar(trees, sents))
	print("sentences:")
	for tree in trees:
		print(' '.join(w for _, w in sorted(tree.pos())))
	print("treebank:")
	for tree in trees:
		print(tree)
	print(compiled_scfg, "\n")
	print("correct translations:")
	assert parse(compiled_scfg, ["0"] * 7,
			"John likes Mary | John aimes Mary".split())
	assert parse(compiled_scfg, ["0"] * 9,
			"John misses pizza | la pizza manque a` John".split())
	print("incorrect translations:")
	assert not parse(compiled_scfg, ["0"] * 7,
			"John likes Mary | Mary aimes John".split())
	assert not parse(compiled_scfg, ["0"] * 9,
			"John misses pizza | John manque a` la pizza".split())

	# the following SCFG is taken from:
	# http://cdec-decoder.org/index.php?title=SCFG_translation
	# the grammar has been binarized and some new non-terminals had to be
	# introduced because terminals cannot appear in binary rules.
	lexicon = ("|", "ein", "ich", "Haus", "kleines", "grosses", "sah",
			"fand", "small", "little", "big", "large", "house", "shell",
			"a", "I", "saw", "found")
	# rules are ((lhs, rhs...), yield-function), weight; the final list
	# comprehension adds a lexical '_word -> Epsilon' rule per word.
	another_scfg = Grammar([
			((('DT', '_ein', '_a'), ((0, ), (1, ))), 0.5),
			((('JJ', '_kleines', '_small'), ((0, ), (1, ))), 0.1),
			((('JJ', '_kleines', '_little'), ((0, ), (1, ))), 0.9),
			((('JJ', '_grosses', '_big'), ((0, ), (1, ))), 0.8),
			((('JJ', '_grosses', '_large'), ((0, ), (1, ))), 0.2345),
			((('NN_house', '_Haus', '_house'), ((0, ), (1, ))), 1),
			((('NN_shell', '_Haus', '_shell'), ((0, ), (1, ))), 1),
			((('NP', '_ich', '_I'), ((0, ), (1, ), )), 0.6),
			((('NP', 'DT', 'NP|<JJ-NN>'), ((0, 1), (0, 1))), 0.5),
			((('NP|<JJ-NN>', 'JJ', 'NN_house'), ((0, 1), (0, 1))), 0.1),
			((('NP|<JJ-NN>', 'JJ', 'NN_shell'), ((0, 1), (0, 1))), 1.3),
			((('ROOT', 'S', '_|'), ((0, 1, 0), )), 1),
			((('S', 'NP', 'VP'), ((0, 1), (0, 1))), 0.2),
			((('VP', 'V', 'NP'), ((0, 1), (0, 1))), 0.1),
			((('V', '_sah', '_saw'), ((0, ), (1, ))), 0.4),
			((('V', '_fand', '_found'), ((0, ), (1, ))), 0.4)]
			+ [((('_%s' % word, 'Epsilon'), (word, )), 1)
				for word in lexicon])
	print(another_scfg)
	sents = [
			"ich sah ein kleines Haus | I saw a small house".split(),
			"ich sah ein kleines Haus | I saw a little house".split(),
			"ich sah ein kleines Haus | I saw a small shell".split(),
			"ich sah ein kleines Haus | I saw a little shell".split()]
	for sent in sents:
		assert parse(another_scfg, sent), sent
def parse():
	"""Parse sentence and return a textual representation of a parse tree.

	Output is either in a HTML fragment or in plain text. To be invoked by
	an AJAX call.

	Request parameters: sent (required), est, marg, objfun, coarse, html,
	lang ('detect' picks a parser by guessing the language). Results are
	cached under the full parameter tuple."""
	sent = request.args.get('sent', None)
	est = request.args.get('est', 'rfe')
	marg = request.args.get('marg', 'nbest')
	objfun = request.args.get('objfun', 'mpp')
	coarse = request.args.get('coarse', None)
	html = 'html' in request.args
	lang = request.args.get('lang', 'detect')
	if not sent:
		return ''
	frags = nbest = None
	senttok = tokenize(sent)
	if not senttok or not 1 <= len(senttok) <= LIMIT:
		return 'Sentence too long: %d words, max %d' % (len(senttok), LIMIT)
	if lang == 'detect':
		lang = guesslang(senttok)
	elif lang not in PARSERS:
		return 'unknown language %r; languages: %r' % (lang, PARSERS.keys())
	key = (senttok, est, marg, objfun, coarse, lang)
	resp = CACHE.get(key)
	if resp is None:
		link = 'parse?' + url_encode(dict(sent=sent, est=est, marg=marg,
				objfun=objfun, coarse=coarse, html=html))
		# configure the (global) parser according to the request parameters
		PARSERS[lang].stages[-1].estimator = est
		PARSERS[lang].stages[-1].objective = objfun
		PARSERS[lang].stages[-1].kbest = marg in ('nbest', 'both')
		PARSERS[lang].stages[-1].sample = marg in ('sample', 'both')
		if PARSERS[lang].stages[0].mode.startswith('pcfg') and coarse:
			PARSERS[lang].stages[0].mode = coarse
			PARSERS[lang].stages[1].k = (1e-5
					if coarse == 'pcfg-posterior' else 50)
		results = list(PARSERS[lang].parse(senttok))
		if results[-1].noparse:
			parsetrees = []
			result = 'no parse!'
			frags = nbest = ''
		else:
			if SHOWMORPH:
				replacemorph(results[-1].parsetree)
			if SHOWFUNC:
				treebank.handlefunctions('add', results[-1].parsetree,
						pos=True)
			tree = str(results[-1].parsetree)
			prob = results[-1].prob
			parsetrees = results[-1].parsetrees or []
			# keep the 10 most probable parse trees
			parsetrees = heapq.nlargest(10, parsetrees, key=itemgetter(1))
			parsetrees_ = []
			fragments = results[-1].fragments or ()
			APP.logger.info('[%s] %s', probstr(prob), tree)
			tree = Tree.parse(tree, parse_leaf=int)
			result = Markup(
					DrawTree(tree, senttok).text(unicodelines=True,
						html=html, funcsep='-'))
			frags = Markup(
					'Phrasal fragments used in the most probable '
					'derivation of the highest ranked parse tree:\n'
					+ '\n\n'.join(
						DrawTree(frag).text(unicodelines=True, html=html)
						for frag in fragments if frag.count('(') > 1))
			for tree, prob, x in parsetrees:
				tree = PARSERS[lang].postprocess(tree, senttok, -1)[0]
				if SHOWMORPH:
					replacemorph(tree)
				if SHOWFUNC:
					treebank.handlefunctions('add', tree, pos=True)
				parsetrees_.append((tree, prob, x))
			nbest = Markup('\n\n'.join(
					'%d. [%s]\n%s' % (n + 1, probstr(prob),
						DrawTree(tree, senttok).text(
							unicodelines=True, html=html, funcsep='-'))
					for n, (tree, prob, _) in enumerate(parsetrees_)))
		msg = '\n'.join(stage.msg for stage in results)
		elapsed = [stage.elapsedtime for stage in results]
		elapsed = 'CPU time elapsed: %s => %gs' % (' '.join(
				'%gs' % a for a in elapsed), sum(elapsed))
		info = '\n'.join(
				('length: %d; lang=%s; est=%s; objfun=%s; marg=%s' %
					(len(senttok), lang, est, objfun, marg), msg, elapsed,
					'10 most probable parse trees:',
					'\n'.join(
						'%d. [%s] %s' % (n + 1, probstr(prob),
							writediscbrackettree(tree, senttok))
						for n, (tree, prob, _) in enumerate(parsetrees))
					+ '\n'))
		CACHE.set(key, (sent, result, frags, nbest, info, link),
				timeout=5000)
	else:  # serve cached response; same tuple shape as stored above
		(
			sent, result, frags, nbest,  # pylint: disable=unpacking-non-sequence
			info, link) = resp  # pylint: disable=unpacking-non-sequence
	if html:
		return render_template('parsetree.html', sent=sent, result=result,
				frags=frags, nbest=nbest, info=info, link=link,
				randid=randid())
	else:
		return Response('\n'.join((nbest, frags, info, result)),
				mimetype='text/plain')
def test():
	""" Run some tests.

	Induces a plain treebank LCFRS, a DOP reduction, and a Double-DOP
	grammar from a small sample corpus; then parses the corpus sentences
	with the Double-DOP grammar and recovers parse trees/derivations. """
	from discodop import plcfrs
	from discodop.containers import Grammar
	from discodop.treebank import NegraCorpusReader
	from discodop.treetransforms import binarize, unbinarize, \
			addfanoutmarkers, removefanoutmarkers
	from discodop.disambiguation import recoverfragments
	from discodop.kbest import lazykbest
	from discodop.fragments import getfragments
	logging.basicConfig(level=logging.DEBUG, format='%(message)s')
	filename = "alpinosample.export"
	corpus = NegraCorpusReader('.', filename, punct='move')
	sents = list(corpus.sents().values())
	trees = [addfanoutmarkers(binarize(a.copy(True), horzmarkov=1))
			for a in list(corpus.parsed_sents().values())[:10]]

	print('plcfrs')
	lcfrs = Grammar(treebankgrammar(trees, sents), start=trees[0].label)
	print(lcfrs)

	print('dop reduction')
	grammar = Grammar(dopreduction(trees[:2], sents[:2])[0],
			start=trees[0].label)
	print(grammar)
	grammar.testgrammar()

	fragments = getfragments(trees, sents, 1)
	debug = '--debug' in sys.argv
	grammarx, backtransform, _ = doubledop(trees, fragments, debug=debug)
	print('\ndouble dop grammar')
	grammar = Grammar(grammarx, start=trees[0].label)
	grammar.getmapping(grammar, striplabelre=None,
			neverblockre=re.compile(b'^#[0-9]+|.+}<'),
			splitprune=False, markorigin=False)
	print(grammar)
	assert grammar.testgrammar(), "DOP1 should sum to 1."
	for tree, sent in zip(corpus.parsed_sents().values(), sents):
		print("sentence:", ' '.join(a.encode('unicode-escape').decode()
				for a in sent))
		chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
		print('\n', msg, end='')
		print("\ngold ", tree)
		print("double dop", end='')
		if chart:
			mpp = {}  # tree => summed probability over its derivations
			parsetrees = {}  # tree => list of its derivations
			derivations, _ = lazykbest(chart, 1000, b'}<')
			for d, (t, p) in zip(chart.rankededges[chart.root()],
					derivations):
				# map derivation back to a parse tree of the original
				# (unbinarized, fan-out markers removed) grammar
				r = Tree(recoverfragments(d.getkey(), chart,
						grammar, backtransform))
				r = str(removefanoutmarkers(unbinarize(r)))
				mpp[r] = mpp.get(r, 0.0) + exp(-p)
				parsetrees.setdefault(r, []).append((t, p))
			print(len(mpp), 'parsetrees',
					end='')
			print(sum(map(len, parsetrees.values())), 'derivations')
			for t, tp in sorted(mpp.items(), key=itemgetter(1)):
				print(tp, '\n', t, end='')
				print("match:", t == str(tree))
				# no duplicate derivations should be recovered
				assert len(set(parsetrees[t])) == len(parsetrees[t])
				if not debug:
					continue
				for deriv, p in sorted(parsetrees[t], key=itemgetter(1)):
					print(' <= %6g %s' % (exp(-p), deriv))
		else:
			print("no parse")
			print(chart)
		print()
	# grammar induction should also work on a single unary-chain tree
	tree = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
	Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))
def getfragments(trees, sents, numproc=1, disc=True,
		iterate=False, complement=False, indices=True, cover=True):
	"""Get recurring fragments with exact counts in a single treebank.

	:returns: a dictionary whose keys are fragments as strings, and
		indices as values. When ``disc`` is ``True``, keys are of the form
		``(frag, sent)`` where ``frag`` is a unicode string, and
		``sent`` is a list of words as unicode strings; when ``disc`` is
		``False``, keys are of the form ``frag`` where ``frag``
		is a unicode string.
	:param trees: a sequence of binarized Tree objects.
	:param numproc: number of processes to use; pass 0 to use detected #
		CPUs.
	:param disc: when disc=True, assume trees with discontinuous
		constituents.
	:param indices: passed on to the workers via PARAMS; when enabled,
		occurrence indices are collected for each fragment.
	:param cover: when enabled, also add 'cover' fragments for single
		grammar productions that were not found as recurring fragments.
	:param iterate, complement: see :func:`_fragments.extractfragments`"""
	if numproc == 0:
		numproc = cpu_count()
	numtrees = len(trees)
	if not numtrees:
		raise ValueError('no trees.')
	mult = 1  # 3 if numproc > 1 else 1
	fragments = {}
	trees = trees[:]
	work = workload(numtrees, mult, numproc)
	# global parameters read by the (possibly forked) workers
	PARAMS.update(disc=disc, indices=indices, approx=False, complete=False,
			complement=complement, debug=False, adjacent=False,
			twoterms=False)
	initworkersimple(trees, list(sents), disc)
	if numproc == 1:
		# single-process: run everything in this process
		mymap = map
		myapply = APPLY
	else:
		logging.info("work division:\n%s", "\n".join(" %s: %r" % kv
				for kv in sorted(dict(numchunks=len(work),
					numproc=numproc).items())))
		# start worker processes
		pool = Pool(processes=numproc, initializer=initworkersimple,
				initargs=(trees, list(sents), disc))
		mymap = pool.map
		myapply = pool.apply
	# collect recurring fragments
	logging.info("extracting recurring fragments")
	for a in mymap(worker, work):
		fragments.update(a)
	# add 'cover' fragments corresponding to single productions
	if cover:
		cover = myapply(coverfragworker, ())
		before = len(fragments)
		fragments.update(cover)
		logging.info("merged %d unseen cover fragments",
				len(fragments) - before)
	fragmentkeys = list(fragments)
	bitsets = [fragments[a] for a in fragmentkeys]
	# divide the bitsets into numproc chunks for exact counting
	countchunk = len(bitsets) // numproc + 1
	work = list(range(0, len(bitsets), countchunk))
	work = [(n, len(work), bitsets[a:a + countchunk])
			for n, a in enumerate(work)]
	logging.info("getting exact counts for %d fragments", len(bitsets))
	counts = []
	for a in mymap(exactcountworker, work):
		counts.extend(a)
	if numproc != 1:
		pool.close()
		pool.join()
		del pool
	if iterate:  # optionally collect fragments of fragments
		logging.info("extracting fragments of recurring fragments")
		PARAMS['complement'] = False  # needs to be turned off if it was on
		newfrags = fragments
		trees, sents = None, None
		ids = count()
		for _ in range(10):  # up to 10 iterations
			# turn the previous round's fragments into full trees/sents;
			# None words (frontier positions) get fresh '#<id>' tokens.
			newtrees = [binarize(
					introducepreterminals(Tree.parse(tree, parse_leaf=int),
						ids=ids), childchar="}")
					for tree, _ in newfrags]
			newsents = [["#%d" % next(ids) if word is None else word
					for word in sent] for _, sent in newfrags]
			newfrags, newcounts = iteratefragments(
					fragments, newtrees, newsents, trees, sents, numproc)
			if len(newfrags) == 0:
				break
			if trees is None:
				trees = []
				sents = []
			trees.extend(newtrees)
			sents.extend(newsents)
			fragmentkeys.extend(newfrags)
			counts.extend(newcounts)
			fragments.update(zip(newfrags, newcounts))
	logging.info("found %d fragments", len(fragmentkeys))
	if not disc:
		return {a.decode('utf-8'): b
				for a, b in zip(fragmentkeys, counts)}
	return {(a.decode('utf-8'), b): c
			for (a, b), c in zip(fragmentkeys, counts)}