def test_binarize(self):
    """Check binarize() output for several option sets.

    The same tree object is binarized repeatedly; after every case
    unbinarize() must give back a tree equal to the untouched reference.
    """
    bracketing = '(S (VP (PDS 0) (ADV 3) (VVINF 4)) (PIS 2) (VMFIN 1))'
    reference = Tree(bracketing)
    work = Tree(bracketing)

    # No horizontal context in the introduced labels.
    rendered = str(binarize(work, horzmarkov=0, tailmarker=''))
    assert rendered == (
        '(S (VP (PDS 0) (VP|<> (ADV 3) (VVINF 4))) (S|<> (PIS 2) '
        '(VMFIN 1)))')
    restored = unbinarize(work)
    assert restored == reference

    # One sibling recorded in the introduced labels.
    rendered = str(binarize(work, horzmarkov=1, tailmarker=''))
    assert rendered == (
        '(S (VP (PDS 0) (VP|<ADV> (ADV 3) (VVINF 4))) (S|<PIS> '
        '(PIS 2) (VMFIN 1)))')
    restored = unbinarize(work)
    assert restored == reference

    # Extra unary node at the rightmost child.
    rendered = str(binarize(
        work, horzmarkov=1, leftmostunary=False, rightmostunary=True,
        tailmarker=''))
    assert rendered == (
        '(S (VP (PDS 0) (VP|<ADV> (ADV 3) (VP|<VVINF> (VVINF 4)))) '
        '(S|<PIS> (PIS 2) (S|<VMFIN> (VMFIN 1))))')
    restored = unbinarize(work)
    assert restored == reference

    # Extra unary node at the leftmost child.
    rendered = str(binarize(
        work, horzmarkov=1, leftmostunary=True, rightmostunary=False,
        tailmarker=''))
    assert rendered == (
        '(S (S|<VP> (VP (VP|<PDS> (PDS 0) (VP|<ADV> (ADV 3) '
        '(VVINF 4)))) (S|<PIS> (PIS 2) (VMFIN 1))))')
    restored = unbinarize(work)
    assert restored == reference

    # Left factoring with two siblings recorded in the labels.
    rendered = str(binarize(
        work, factor='left', horzmarkov=2, tailmarker=''))
    assert rendered == (
        '(S (S|<PIS,VMFIN> (VP (VP|<ADV,VVINF> (PDS 0) (ADV 3)) '
        '(VVINF 4)) (PIS 2)) (VMFIN 1))')
    restored = unbinarize(work)
    assert restored == reference

    # A flat six-child tree with default markovization.
    flat = Tree('(S (A 0) (B 1) (C 2) (D 3) (E 4) (F 5))')
    rendered = str(binarize(flat, tailmarker='', reverse=False))
    assert rendered == (
        '(S (A 0) (S|<B,C,D,E,F> (B 1) (S|<C,D,E,F> (C 2) (S|<D,E,F> '
        '(D 3) (S|<E,F> (E 4) (F 5))))))')
def postprocess(self, treestr, stage=-1):
    """Turn a parse tree string into a tree and undo the transformations.

    :param treestr: bracketed tree string; leaves are parsed as ``int``.
    :param stage: index into ``self.stages`` selecting the stage whose
        ``split`` setting decides whether discontinuous nodes are merged.
    :returns: a tuple ``(tree, False)``.
    """
    tree = Tree.parse(treestr, parse_leaf=int)

    # For a split stage, first undo the ':'-marked binarization and merge
    # the resulting nodes back together.
    if self.stages[stage].split:
        partially_restored = unbinarize(
            tree, childchar=':', expandunary=False)
        mergediscnodes(partially_restored)

    # Heads are saved before the regular binarization is removed.
    tailmarker = self.binarization.tailmarker
    saveheads(tree, tailmarker)
    unbinarize(tree, expandunary=False)
    removefanoutmarkers(tree)

    if self.relationalrealizational:
        adjunctionlabel = self.relationalrealizational['adjunctionlabel']
        tree = rrbacktransform(tree, adjunctionlabel)

    if self.transformations:
        reversetransform(tree, self.transformations)

    return tree, False
def postprocess(self, treestr, stage=-1, derivs=None):
    """Turn a parse tree string into a tree and undo the transformations.

    :param treestr: bracketed tree string; leaves are parsed as ``int``.
    :param stage: index into ``self.stages`` selecting the stage whose
        ``split`` setting decides whether discontinuous nodes are merged.
    :param derivs: optional mapping that is queried with ``treestr``.
    :returns: a tuple ``(tree, fragments, False)``; ``fragments`` is
        ``derivs.get(treestr)``, or None when ``derivs`` is empty or None.
    """
    tree = Tree.parse(treestr, parse_leaf=int)

    # For a split stage, first undo the ':'-marked binarization and merge
    # the resulting nodes back together.
    if self.stages[stage].split:
        partially_restored = unbinarize(tree, childchar=':')
        mergediscnodes(partially_restored)

    # Heads are saved before the regular binarization is removed.
    saveheads(tree, self.tailmarker)
    unbinarize(tree)
    removefanoutmarkers(tree)

    if self.relationalrealizational:
        adjunctionlabel = self.relationalrealizational['adjunctionlabel']
        tree = rrbacktransform(tree, adjunctionlabel)

    if self.transformations:
        reversetransform(tree, self.transformations)

    if derivs:
        fragments = derivs.get(treestr)
    else:
        fragments = None
    return tree, fragments, False
def test_binarize(self):
    """Binarize a head-marked tree with several option sets.

    Each case compares the string form of the binarized tree with an
    expected bracketing and then checks that unbinarize() gives back a
    tree equal to the untouched reference.
    """
    bracketing = '(S (VP (PDS 0) (ADV 3) (VVINF 4)) (VMFIN 1) (PIS 2))'
    reference = Tree(bracketing)
    work = Tree(bracketing)
    work[1].type = HEAD  # set VMFIN as head

    # (keyword arguments for binarize, expected bracketing), in run order.
    cases = (
        (dict(horzmarkov=0),
            '(S (VP (PDS 0) (VP|<> (ADV 3) (VVINF 4))) (S|<> (VMFIN 1)'
            ' (PIS 2)))'),
        (dict(horzmarkov=1),
            '(S (VP (PDS 0) (VP|<ADV> (ADV 3) (VVINF 4))) (S|<VMFIN> '
            '(VMFIN 1) (PIS 2)))'),
        (dict(horzmarkov=1, leftmostunary=False, rightmostunary=True,
                headoutward=True),
            '(S (VP (PDS 0) (VP|<ADV> (ADV 3) (VP|<VVINF> (VVINF 4)))) '
            '(S|<VMFIN> (S|<VMFIN> (VMFIN 1)) (PIS 2)))'),
        (dict(horzmarkov=1, leftmostunary=True, rightmostunary=False,
                headoutward=True),
            '(S (S|<VP> (VP (VP|<PDS> (PDS 0) (VP|<ADV> (ADV 3) '
            '(VVINF 4)))) (S|<VMFIN> (VMFIN 1) (PIS 2))))'),
        (dict(factor='left', horzmarkov=2, headoutward=True),
            '(S (S|<VMFIN,PIS> (VP (VP|<PDS,ADV> (PDS 0) (ADV 3)) '
            '(VVINF 4)) (VMFIN 1)) (PIS 2))'),
    )
    for options, expected in cases:
        assert str(binarize(work, **options)) == expected
        # The tree must be restored before the next case reuses it.
        assert unbinarize(work) == reference

    # A flat six-child tree without an explicitly marked head.
    flat = Tree('(S (A 0) (B 1) (C 2) (D 3) (E 4) (F 5))')
    assert str(binarize(flat, headoutward=True)) == (
        '(S (A 0) (S|<B,C,D,E,F> (B 1) (S|<C,D,E,F> (C 2) (S|<D,E,F> '
        '(D 3) (S|<E,F> (E 4) (F 5))))))')
def parse(compiledgrammar, testsent, testtags=None):
    """Parse a sentence with a grammar and print the outcome.

    :param compiledgrammar: grammar object handed to ``plcfrs.parse``.
    :param testsent: the sentence to parse.
    :param testtags: optional tags, passed on as ``tags``; when given they
        are echoed instead of the words of ``testsent``.
    :returns: True if a chart was obtained, False otherwise.
    """
    chart, _ = plcfrs.parse(
        testsent, compiledgrammar, tags=testtags, exhaustive=True)

    # Echo the input as "index:token" pairs, staying on the same line.
    shown = testtags if testtags else testsent
    numbered = ' '.join("%d:%s" % pair for pair in enumerate(shown))
    print("input:", numbered, end=' ')

    if not chart:
        print("no parse!\n")
        return False

    print()
    derivations = kbest.lazykbest(chart, 10)[0]
    for derivation, weight in derivations:
        result = Tree(derivation)
        treetransforms.unbinarize(result)
        # The weight is printed as exp(-weight).
        print(exp(-weight), result)
    print()
    return True
def test_binarize(self):
    """Binarize a head-marked tree with several option sets.

    Every case is checked through a small local helper, which also
    verifies that unbinarize() restores a tree equal to the original.
    """
    bracketing = '(S (VP (PDS 0) (ADV 3) (VVINF 4)) (VMFIN 1) (PIS 2))'
    pristine = Tree(bracketing)
    subject = Tree(bracketing)
    sethead(subject[1])  # set VMFIN as head

    def roundtrip(expected, **options):
        """Binarize ``subject``, compare, then undo and compare again."""
        assert str(binarize(subject, **options)) == expected
        assert unbinarize(subject) == pristine

    roundtrip(
        '(S (VP (PDS 0) (VP|<> (ADV 3) (VVINF 4))) (S|<> (VMFIN 1)'
        ' (PIS 2)))',
        horzmarkov=0)
    roundtrip(
        '(S (VP (PDS 0) (VP|<ADV> (ADV 3) (VVINF 4))) (S|<VMFIN> '
        '(VMFIN 1) (PIS 2)))',
        horzmarkov=1)
    roundtrip(
        '(S (VP (PDS 0) (VP|<ADV> (ADV 3) (VP|<VVINF> (VVINF 4)))) '
        '(S|<VMFIN> (S|<VMFIN> (VMFIN 1)) (PIS 2)))',
        horzmarkov=1, leftmostunary=False, rightmostunary=True,
        headoutward=True)
    roundtrip(
        '(S (S|<VP> (VP (VP|<PDS> (PDS 0) (VP|<ADV> (ADV 3) '
        '(VVINF 4)))) (S|<VMFIN> (VMFIN 1) (PIS 2))))',
        horzmarkov=1, leftmostunary=True, rightmostunary=False,
        headoutward=True)
    roundtrip(
        '(S (S|<VMFIN,PIS> (VP (VP|<PDS,ADV> (PDS 0) (ADV 3)) '
        '(VVINF 4)) (VMFIN 1)) (PIS 2))',
        factor='left', horzmarkov=2, headoutward=True)

    # A flat six-child tree without an explicitly marked head; this case
    # has no round-trip check.
    flat = Tree('(S (A 0) (B 1) (C 2) (D 3) (E 4) (F 5))')
    assert str(binarize(flat, headoutward=True)) == (
        '(S (A 0) (S|<B,C,D,E,F> (B 1) (S|<C,D,E,F> (C 2) (S|<D,E,F> '
        '(D 3) (S|<E,F> (E 4) (F 5))))))')
def debinarize(fragments):
    """Debinarize fragments; fragments that fail to debinarize left as-is.

    :param fragments: iterable of fragments. When ``PARAMS['disc']`` is
        set, each item is a ``(tree, sent)`` pair; otherwise each item is
        the tree itself.
    :returns: a list with one entry per input fragment, in the same order
        and the same shape (pair or plain tree string) as the input.
    """
    result = []
    for origfrag in fragments:
        if PARAMS['disc']:
            frag, sent = origfrag
        else:
            frag = origfrag
        try:
            frag = str(unbinarize(Tree(frag)))
        except Exception:
            # Best effort: keep the fragment unchanged when it cannot be
            # debinarized. Only Exception is caught, so KeyboardInterrupt
            # and SystemExit still propagate.
            result.append(origfrag)
        else:
            result.append((frag, sent) if PARAMS['disc'] else frag)
    return result
# Parse every training sentence with its gold supertags and report the
# evaluation summary against the gold trees.
config.read(argv[1])
data = SupertagParseDataset(f"{config['Corpus']['filename']}.train")
from discodop.tree import ParentedTree, Tree
from discodop.treetransforms import unbinarize, removefanoutmarkers
from discodop.eval import Evaluator, readparam
from discodop.lexgrammar import SupertagGrammar
# NOTE(review): `load` is presumably pickle.load; if so, only grammar files
# from a trusted source should be loaded here.
# The context manager closes the grammar file once it has been read.
with open(f"{config['Corpus']['filename']}.grammar", "rb") as grammarfile:
    grammar = load(grammarfile)
i = 0
evaluator = Evaluator(readparam("proper.prm"))
for sentence in data:
    words = tuple(t.text for t in sentence)
    poss = tuple(t.get_tag("pos").value for t in sentence)
    # Each token gets exactly one candidate: its gold supertag, weight 0.0.
    tags = tuple(((t.get_tag("supertag").value, 0.0), ) for t in sentence)
    parses = grammar.parse(poss, tags, posmode=True)
    try:
        parse = next(parses)
    except StopIteration:
        # No parse found: fall back to a flat tree over the POS tags.
        # `poss` and `words` both hold one entry per token, so enumerating
        # `poss` yields the same indices as before, without reusing the
        # name of the sentence counter `i`.
        leaves = (f"({p} {idx})" for idx, p in enumerate(poss))
        parse = ParentedTree(f"(NOPARSE {' '.join(leaves)})")
    gold = ParentedTree(sentence.get_labels("tree")[0].value)
    gold = ParentedTree.convert(
        unbinarize(removefanoutmarkers(Tree.convert(gold))))
    parse = ParentedTree.convert(
        unbinarize(removefanoutmarkers(Tree.convert(parse))))
    # The evaluator receives copies of the trees and fresh word lists.
    evaluator.add(i, gold.copy(deep=True), list(words),
                  parse.copy(deep=True), list(words))
    i += 1
print(evaluator.summary())
def test_grammar(debug=False):
    """Demonstrate grammar extraction.

    Extracts a treebank grammar, a DOP reduction and a double-DOP grammar
    from the first ten trees of ``alpinosample.export``. Every sentence of
    the corpus is then parsed with the double-DOP grammar, and for each
    resulting parse tree the collected (derivation, weight) pairs are
    asserted to contain no duplicates.

    :param debug: when true, print grammars, charts and derivations.

    NOTE(review): ``binarize``, ``unbinarize``, ``Tree``, ``re`` and
    ``itemgetter`` are not imported in this function; presumably they are
    module-level names.
    """
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers, removefanoutmarkers
    from discodop.disambiguation import recoverfragments
    from discodop.kbest import lazykbest
    from math import exp
    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(corpus.sents().values())
    # Only the first ten trees are used; each is copied before binarizing.
    trees = [
        addfanoutmarkers(binarize(a.copy(True), horzmarkov=1))
        for a in list(corpus.trees().values())[:10]
    ]
    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')
    # The DOP reduction is built from the first two trees only.
    grammar = Grammar(dopreduction(trees[:2], sents[:2])[0],
                      start=trees[0].label)
    if debug:
        print(grammar)
    _ = grammar.testgrammar()
    # Of doubledop's four results only the rules and the backtransform
    # are used; its own debug output is always switched off here.
    grammarx, backtransform, _, _ = doubledop(trees, sents, debug=False,
                                              numproc=1)
    if debug:
        print('\ndouble dop grammar')
    grammar = Grammar(grammarx, start=trees[0].label)
    grammar.getmapping(grammar, striplabelre=None,
                       neverblockre=re.compile('^#[0-9]+|.+}<'),
                       splitprune=False, markorigin=False)
    if debug:
        print(grammar)
    assert grammar.testgrammar()[0], "RFE should sum to 1."
    for tree, sent in zip(corpus.trees().values(), sents):
        if debug:
            print("sentence:", ' '.join(a.encode('unicode-escape').decode()
                                        for a in sent))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        if debug:
            print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')
        if chart:
            # mpp: parse tree string -> summed exp(-p) of its derivations.
            # parsetrees: parse tree string -> list of (derivation, p).
            mpp, parsetrees = {}, {}
            derivations, _ = lazykbest(chart, 1000, '}<')
            for d, (t, p) in zip(chart.rankededges[chart.root()],
                                 derivations):
                r = Tree(recoverfragments(d.key, chart, backtransform))
                r = str(removefanoutmarkers(unbinarize(r)))
                mpp[r] = mpp.get(r, 0.0) + exp(-p)
                parsetrees.setdefault(r, []).append((t, p))
            if debug:
                print(len(mpp), 'parsetrees',
                      sum(map(len, parsetrees.values())), 'derivations')
            for t, tp in sorted(mpp.items(), key=itemgetter(1)):
                if debug:
                    print(tp, t, '\nmatch:', t == str(tree))
                # Show the chart before failing on duplicate derivations.
                if len(set(parsetrees[t])) != len(parsetrees[t]):
                    print('chart:\n', chart)
                    assert len(set(parsetrees[t])) == len(parsetrees[t])
                if debug:
                    for deriv, p in sorted(parsetrees[t], key=itemgetter(1)):
                        print(' <= %6g %s' % (exp(-p), deriv))
        elif debug:
            print('no parse\n', chart)
        if debug:
            print()
    # Grammar extraction from a tree that is a single chain of unary
    # nodes; the result is discarded, so this only checks that it runs.
    tree = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
    Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))
def evaluate(self,
             sentences: SupertagParseDataset,
             mini_batch_size: int = 32,
             num_workers: int = 1,
             embedding_storage_mode: str = "none",
             out_path=None,
             only_disc: str = "both",
             accuracy: str = "both",
             pos_accuracy: bool = True,
             return_loss: bool = True) -> Tuple[Result, float]:
    """
    Predicts supertags, pos tags and parse trees, and reports the
    prediction scores for a set of sentences.

    :param sentences: a ``DataSet`` of sentences. For each sentence a gold
        parse tree is expected as value of the `tree` label, as provided
        by ``SupertagParseDataset``.
    :param mini_batch_size: batch size passed to the ``DataLoader``.
    :param num_workers: number of workers passed to the ``DataLoader``.
    :param embedding_storage_mode: forwarded unchanged to
        ``self.predict``.
    :param out_path: not used by this method.
    :param only_disc: selects which constituents are evaluated. "both"
        reports both an evaluation of all constituents ("F1-all") and one
        of discontinuous constituents only ("F1-disc"); "param" takes the
        `DISC_ONLY` setting from ``self.evalparam``; "true" evaluates
        discontinuous constituents only; any other value evaluates all
        constituents.
    :param accuracy: either 'none', 'best', 'kbest' or 'both'. Determines
        if the accuracy is computed from the best, or k-best predicted
        tags.
    :param pos_accuracy: if set, reports acc. of predicted pos tags.
    :param return_loss: if set, the loss returned by ``self.predict`` is
        accumulated and averaged over the batches, otherwise the second
        component in the returned tuple is 0.
    :returns: tuple with evaluation ``Result``, where the main score is
        the "F1-all" score if it was computed and the "F1-disc" score
        otherwise, and the average loss per batch. All ratios (accuracies,
        coverage, average loss) use 0.0 in place of the quotient when
        their denominator is zero, instead of raising
        ``ZeroDivisionError``.
    :raises Exception: if no evaluator parameter file was specified.
    """
    from flair.datasets import DataLoader
    from discodop.tree import ParentedTree, Tree
    from discodop.treetransforms import unbinarize, removefanoutmarkers
    from discodop.eval import Evaluator, readparam
    from timeit import default_timer
    from collections import Counter

    if self.__evalparam__ is None:
        raise Exception(
            "Need to specify evaluator parameter file before evaluating")

    def ratio(numerator, denominator):
        """Divide, yielding 0.0 when nothing was counted."""
        return numerator / denominator if denominator else 0.0

    # One evaluator per reported F1 variant.
    if only_disc == "both":
        evaluators = {
            "F1-all": Evaluator({**self.evalparam, "DISC_ONLY": False}),
            "F1-disc": Evaluator({**self.evalparam, "DISC_ONLY": True}),
        }
    else:
        mode = self.evalparam["DISC_ONLY"] if only_disc == "param" else (
            only_disc == "true")
        strmode = "F1-disc" if mode else "F1-all"
        evaluators = {
            strmode: Evaluator({**self.evalparam, "DISC_ONLY": mode})
        }

    data_loader = DataLoader(sentences,
                             batch_size=mini_batch_size,
                             num_workers=num_workers)

    # predict supertags and parse trees; only this loop is timed
    eval_loss = 0
    start_time = default_timer()
    for batch in data_loader:
        loss = self.predict(batch,
                            embedding_storage_mode=embedding_storage_mode,
                            supertag_storage_mode=accuracy,
                            postag_storage_mode=pos_accuracy,
                            label_name='predicted',
                            return_loss=return_loss)
        eval_loss += loss if return_loss else 0
    end_time = default_timer()

    # Second pass: compare the stored predictions with the gold labels.
    i = 0
    batches = 0
    noparses = 0
    acc_ctr = Counter()
    for batch in data_loader:
        for sentence in batch:
            for token in sentence:
                if accuracy in ("kbest", "both") and \
                        token.get_tag("supertag").value in (
                            l.value for l in
                            token.get_tags_proba_dist('predicted-supertag')):
                    acc_ctr["kbest"] += 1
                if accuracy in ("best", "both") and \
                        token.get_tag("supertag").value == \
                        token.get_tag('predicted-supertag').value:
                    acc_ctr["best"] += 1
                if pos_accuracy and token.get_tag("pos").value == \
                        token.get_tag("predicted-pos").value:
                    acc_ctr["pos"] += 1
            acc_ctr["all"] += len(sentence)

            sent = [token.text for token in sentence]
            gold = Tree(sentence.get_labels("tree")[0].value)
            gold = ParentedTree.convert(
                unbinarize(removefanoutmarkers(gold)))
            parse = Tree(sentence.get_labels("predicted")[0].value)
            parse = ParentedTree.convert(
                unbinarize(removefanoutmarkers(parse)))
            if parse.label == "NOPARSE":
                noparses += 1
            # Every evaluator gets its own copies of trees and words.
            for evaluator in evaluators.values():
                evaluator.add(i, gold.copy(deep=True), list(sent),
                              parse.copy(deep=True), list(sent))
            i += 1
        batches += 1

    scores = {
        strmode: float_or_zero(evaluator.acc.scores()['lf'])
        for strmode, evaluator in evaluators.items()
    }
    if accuracy in ("both", "kbest"):
        scores["accuracy-kbest"] = ratio(acc_ctr["kbest"], acc_ctr["all"])
    if accuracy in ("both", "best"):
        scores["accuracy-best"] = ratio(acc_ctr["best"], acc_ctr["all"])
    if pos_accuracy:
        scores["accuracy-pos"] = ratio(acc_ctr["pos"], acc_ctr["all"])
    scores["coverage"] = 1 - ratio(noparses, i)
    scores["time"] = end_time - start_time

    return (Result(
        scores['F1-all'] if 'F1-all' in scores else scores['F1-disc'],
        "\t".join(f"{mode}" for mode in scores),
        "\t".join(f"{s}" for s in scores.values()),
        '\n\n'.join(evaluator.summary()
                    for evaluator in evaluators.values())),
        ratio(eval_loss, batches))
def test():
    """Run some tests.

    Extracts a treebank grammar, a DOP reduction and a double-DOP grammar
    from the first ten trees of ``alpinosample.export`` and prints them.
    Every sentence of the corpus is then parsed with the double-DOP
    grammar; the parse trees are printed, and for each one the collected
    (derivation, weight) pairs are asserted to contain no duplicates.

    Passing ``--debug`` on the command line additionally prints the
    individual derivations and is forwarded to ``doubledop``.

    NOTE(review): ``treebankgrammar``, ``dopreduction``, ``doubledop``,
    ``Tree``, ``exp``, ``itemgetter``, ``re``, ``sys`` and ``logging`` are
    not imported in this function; presumably they are module-level names.
    """
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import binarize, unbinarize, \
        addfanoutmarkers, removefanoutmarkers
    from discodop.disambiguation import recoverfragments
    from discodop.kbest import lazykbest
    from discodop.fragments import getfragments
    logging.basicConfig(level=logging.DEBUG, format='%(message)s')
    filename = "alpinosample.export"
    corpus = NegraCorpusReader('.', filename, punct='move')
    sents = list(corpus.sents().values())
    # Only the first ten trees are used; each is copied before binarizing.
    trees = [addfanoutmarkers(binarize(a.copy(True), horzmarkov=1))
             for a in list(corpus.parsed_sents().values())[:10]]
    print('plcfrs')
    lcfrs = Grammar(treebankgrammar(trees, sents), start=trees[0].label)
    print(lcfrs)
    print('dop reduction')
    # The DOP reduction is built from the first two trees only.
    grammar = Grammar(dopreduction(trees[:2], sents[:2])[0],
                      start=trees[0].label)
    print(grammar)
    grammar.testgrammar()
    fragments = getfragments(trees, sents, 1)
    debug = '--debug' in sys.argv
    grammarx, backtransform, _ = doubledop(trees, fragments, debug=debug)
    print('\ndouble dop grammar')
    grammar = Grammar(grammarx, start=trees[0].label)
    grammar.getmapping(grammar, striplabelre=None,
                       neverblockre=re.compile(b'^#[0-9]+|.+}<'),
                       splitprune=False, markorigin=False)
    print(grammar)
    # NOTE(review): this asserts on the whole return value of
    # testgrammar(); confirm that it is a plain truth value here.
    assert grammar.testgrammar(), "DOP1 should sum to 1."
    for tree, sent in zip(corpus.parsed_sents().values(), sents):
        print("sentence:", ' '.join(a.encode('unicode-escape').decode()
                                    for a in sent))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        print('\n', msg, end='')
        print("\ngold ", tree)
        print("double dop", end='')
        if chart:
            # mpp: parse tree string -> summed exp(-p) of its derivations.
            mpp = {}
            # parsetrees: parse tree string -> list of (derivation, p).
            parsetrees = {}
            derivations, _ = lazykbest(chart, 1000, b'}<')
            for d, (t, p) in zip(chart.rankededges[chart.root()],
                                 derivations):
                r = Tree(recoverfragments(d.getkey(), chart, grammar,
                                          backtransform))
                r = str(removefanoutmarkers(unbinarize(r)))
                mpp[r] = mpp.get(r, 0.0) + exp(-p)
                parsetrees.setdefault(r, []).append((t, p))
            print(len(mpp), 'parsetrees', end='')
            print(sum(map(len, parsetrees.values())), 'derivations')
            for t, tp in sorted(mpp.items(), key=itemgetter(1)):
                print(tp, '\n', t, end='')
                print("match:", t == str(tree))
                assert len(set(parsetrees[t])) == len(parsetrees[t])
                # Individual derivations are only listed in debug mode.
                if not debug:
                    continue
                for deriv, p in sorted(parsetrees[t], key=itemgetter(1)):
                    print(' <= %6g %s' % (exp(-p), deriv))
        else:
            print("no parse")
            print(chart)
        print()
    # Grammar extraction from a tree that is a single chain of unary
    # nodes; the result is discarded, so this only checks that it runs.
    tree = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
    Grammar(treebankgrammar([tree], [[str(a) for a in range(10)]]))
def test_grammar(debug=False):
    """Demonstrate grammar extraction.

    Builds a treebank grammar, a DOP reduction and a double-DOP grammar
    from the first ten trees of ``alpinosample.export``, parses every
    sentence of the corpus with the double-DOP grammar, and asserts that
    the (derivation, weight) pairs collected for each parse tree are
    free of duplicates.

    :param debug: when true, print grammars, charts and derivations; the
        flag is also forwarded to ``doubledop``.
    """
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers, removefanoutmarkers
    from discodop.disambiguation import recoverfragments
    from discodop.kbest import lazykbest
    from math import exp

    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(corpus.sents().values())

    # Binarized, fanout-marked copies of the first ten corpus trees.
    trees = []
    for original in list(corpus.trees().values())[:10]:
        binarized = binarize(original.copy(True), horzmarkov=1)
        trees.append(addfanoutmarkers(binarized))

    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')

    # DOP reduction over the first two trees only.
    reduction = dopreduction(trees[:2], sents[:2])
    grammar = Grammar(reduction[0], start=trees[0].label)
    if debug:
        print(grammar)
    _ = grammar.testgrammar()

    # Double-DOP grammar over all ten trees.
    grammarx, backtransform, _, _ = doubledop(
        trees, sents, debug=debug, numproc=1)
    if debug:
        print('\ndouble dop grammar')
    grammar = Grammar(grammarx, start=trees[0].label)
    neverblock = re.compile(b'^#[0-9]+|.+}<')
    grammar.getmapping(
        grammar, striplabelre=None, neverblockre=neverblock,
        splitprune=False, markorigin=False)
    if debug:
        print(grammar)
    assert grammar.testgrammar()[0], "RFE should sum to 1."

    for tree, sent in zip(corpus.trees().values(), sents):
        if debug:
            escaped = (word.encode('unicode-escape').decode()
                       for word in sent)
            print("sentence:", ' '.join(escaped))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        if debug:
            print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')
        if chart:
            # treemass: parse tree string -> summed exp(-weight).
            # derivsof: parse tree string -> list of (derivation, weight).
            treemass, derivsof = {}, {}
            derivations, _ = lazykbest(chart, 1000, b'}<')
            rankededges = chart.rankededges[chart.root()]
            for edge, (derivation, weight) in zip(rankededges, derivations):
                recovered = Tree(
                    recoverfragments(edge.key, chart, backtransform))
                treestr = str(removefanoutmarkers(unbinarize(recovered)))
                treemass[treestr] = treemass.get(treestr, 0.0) + exp(-weight)
                derivsof.setdefault(treestr, []).append((derivation, weight))
            if debug:
                print(len(treemass), 'parsetrees',
                      sum(map(len, derivsof.values())), 'derivations')
            for treestr, mass in sorted(treemass.items(), key=itemgetter(1)):
                if debug:
                    print(mass, treestr, '\nmatch:', treestr == str(tree))
                collected = derivsof[treestr]
                duplicate_free = len(set(collected)) == len(collected)
                if not duplicate_free:
                    # Show the chart before the assertion fails.
                    print('chart:\n', chart)
                assert duplicate_free
                if debug:
                    for deriv, p in sorted(collected, key=itemgetter(1)):
                        print(' <= %6g %s' % (exp(-p), deriv))
        elif debug:
            print('no parse\n', chart)
        if debug:
            print()

    # Grammar extraction from a tree that is a single chain of unary
    # nodes; the result is discarded.
    unarychain = Tree.parse(
        "(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
    Grammar(treebankgrammar([unarychain], [[str(a) for a in range(10)]]))