def testpunct():
    """Check that moving punctuation does not change the fan-out of any node.

    The sample corpus is read three times: with ``punct='move'``, with
    ``punct='remove'`` and with ``headrules=None`` (the last one is only
    printed as a diagnostic). For each sentence the phrasal nodes of the
    'move' and 'remove' variants are sorted on their lowest leaf, paired
    up, and each pair must have the same fan-out.

    Raises:
        AssertionError: if a pair of nodes differs in fan-out.
    """
    from discodop.treetransforms import addbitsets, fanout
    from discodop.treebank import NegraCorpusReader

    def phrasal(node):
        # Non-empty node whose first child is itself a Tree.
        return len(node) and isinstance(node[0], Tree)

    def lowestleaf(node):
        return min(node.leaves())

    filename = 'alpinosample.export'
    mangledtrees = NegraCorpusReader('.', filename, punct='move')
    strippedreader = NegraCorpusReader('.', filename, punct='remove')
    strippedtrees = list(strippedreader.parsed_sents().values())
    originalreader = NegraCorpusReader(
        '.', filename, headrules=None, encoding='iso-8859-1')
    originals = list(originalreader.parsed_sents().values())

    rows = zip(
        mangledtrees.parsed_sents().values(),
        mangledtrees.sents().values(),
        strippedtrees,
        originals)
    for n, (mangled, sent, stripped, original) in enumerate(rows):
        print(n, end='')
        movednodes = sorted(
            addbitsets(mangled).subtrees(phrasal), key=lowestleaf)
        strippednodes = sorted(
            addbitsets(stripped).subtrees(phrasal), key=lowestleaf)
        for a, b in zip(movednodes, strippednodes):
            fana, fanb = fanout(a), fanout(b)
            if fana != fanb:
                # Dump all variants of the sentence before failing.
                print(' '.join(sent))
                print(mangled)
                print(stripped)
                print(original)
            assert fana == fanb, '%d %d\n%s\n%s' % (fana, fanb, a, b)
    print()
def test_optimalbinarize():
    """Compare optimal binarization with plain right-to-left binarization.

    Trees in which every node has fan-out 1 are skipped. For the remaining
    trees the maximum ``complexityfanout`` over all subtrees of the optimal
    binarization must not exceed that of the regular binarization; this is
    checked once without head-driven binarization and once head-driven with
    ``h=1`` against ``horzmarkov=1``.

    Raises:
        AssertionError: if any tree violates either comparison.
    """
    from discodop.treetransforms import optimalbinarize, complexityfanout
    from discodop.treebank import NegraCorpusReader

    def worst(binarized):
        # Highest complexity/fan-out value found among the subtrees.
        return max(map(complexityfanout, binarized.subtrees()))

    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    total = violations = violationshd = 0
    # NOTE(review): [:-2000] drops the LAST 2000 trees, so on a corpus with
    # 2000 trees or fewer nothing is checked at all -- confirm whether
    # [:2000] was intended.
    selected = list(corpus.trees().values())[:-2000]
    for n, (tree, sent) in enumerate(zip(selected, corpus.sents().values())):
        withbits = addbitsets(tree)
        if not any(fanout(node) != 1 for node in withbits.subtrees()):
            continue
        print(n, tree, '\n', ' '.join(sent))
        total += 1

        optbin = optimalbinarize(
            tree.copy(True), headdriven=False, h=None, v=1)
        # undo head-ordering to get a normal right-to-left binarization
        normbin = addbitsets(binarize(canonicalize(Tree.convert(tree))))
        optcost, normcost = worst(optbin), worst(normbin)
        if optcost > normcost:
            print('non-hd\n', tree)
            print(optcost, optbin)
            print(normcost, normbin, '\n')
            violations += 1

        optbin = optimalbinarize(tree.copy(True), headdriven=True, h=1, v=1)
        normbin = addbitsets(binarize(Tree.convert(tree), horzmarkov=1))
        optcost, normcost = worst(optbin), worst(normbin)
        if optcost > normcost:
            print('hd\n', tree)
            print(optcost, optbin)
            print(normcost, normbin, '\n')
            violationshd += 1

    summary = (violations, total, violationshd, total)
    print('opt. bin. violations normal: %d / %d; hd: %d / %d' % summary)
    assert violations == violationshd == 0
def test_grammar(debug=False):
    """Extract several grammars from the sample corpus and parse with them.

    A treebank grammar (only built when ``debug`` is set), a DOP reduction
    of the first two trees and a double-DOP grammar of the first ten
    binarized trees are constructed. Every sentence of the corpus is then
    parsed exhaustively with the double-DOP grammar; when a chart is
    returned, derivations are extracted and marginalized with ``'mpp'``.

    Args:
        debug: when true, print grammars, sentences and parser messages.

    Raises:
        AssertionError: if ``testgrammar()`` reports failure for the
            double-DOP grammar.
    """
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers
    from discodop.disambiguation import getderivations, marginalize

    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(corpus.sents().values())
    firstten = list(corpus.trees().values())[:10]
    trees = [
        addfanoutmarkers(binarize(orig.copy(True), horzmarkov=1))
        for orig in firstten]

    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')
    reduction = dopreduction(trees[:2], sents[:2])[0]
    grammar = Grammar(reduction, start=trees[0].label)
    if debug:
        print(grammar)
    # Result deliberately ignored for the DOP reduction.
    grammar.testgrammar()

    grammarx, _backtransform, _, _ = doubledop(
        trees, sents, debug=False, numproc=1)
    if debug:
        print('\ndouble dop grammar')
    grammar = Grammar(grammarx, start=trees[0].label)
    grammar.getmapping(
        None,
        striplabelre=None,
        neverblockre=re.compile('^#[0-9]+|.+}<'),
        splitprune=False,
        markorigin=False)
    if debug:
        print(grammar)
    result, msg = grammar.testgrammar()
    assert result, 'RFE should sum to 1.\n%s' % msg

    for tree, sent in zip(corpus.trees().values(), sents):
        if debug:
            escaped = (
                word.encode('unicode-escape').decode() for word in sent)
            print('sentence:', ' '.join(escaped))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        if debug:
            print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')
        if not chart:
            if debug:
                print('no parse\n', chart)
        else:
            getderivations(chart, 100)
            _parses, _msg = marginalize('mpp', chart)
        if debug:
            print()

    # Grammar extraction from a single unary chain must not raise.
    chain = Tree.parse('(ROOT (S (F (E (S (C (B (A 0))))))))', parse_leaf=int)
    Grammar(treebankgrammar([chain], [[str(a) for a in range(10)]]))
def testsplit():
    """Verify that splitting and merging discontinuities gives the same trees.

    Every tree of the sample corpus is passed through ``splitdiscnodes``
    followed by ``mergediscnodes`` and compared with the input tree. The
    counts and percentages of matching and non-matching trees are printed.

    Raises:
        AssertionError: if at least one tree is not restored by the
            split + merge round trip.
    """
    from discodop.treebank import NegraCorpusReader
    correct = wrong = 0
    n = NegraCorpusReader('.', 'alpinosample.export')
    for tree in n.parsed_sents().values():
        if mergediscnodes(splitdiscnodes(tree)) == tree:
            correct += 1
        else:
            wrong += 1
    total = len(n.sents())
    print("correct", correct, "=", 100 * correct / total, "%")
    print("wrong", wrong, "=", 100 * wrong / total, "%")
    # Without this the "verification" could only print and never fail.
    assert wrong == 0, (
        '%d of %d trees changed after split + merge' % (wrong, total))
def test_splitdisc():
    """Check the split/merge round trip of discontinuous nodes.

    Each tree of the sample corpus is split with ``splitdiscnodes``, merged
    again with ``mergediscnodes`` and compared with the input tree. The
    tallies are printed as percentages of the number of sentences.

    Raises:
        AssertionError: if any tree differs after the round trip.
    """
    from discodop.treebank import NegraCorpusReader
    corpus = NegraCorpusReader('alpinosample.export')
    outcomes = [
        mergediscnodes(splitdiscnodes(tree)) == tree
        for tree in corpus.trees().values()]
    correct = sum(1 for same in outcomes if same)
    wrong = len(outcomes) - correct
    total = len(corpus.sents())
    correctpct = 100. * correct / total
    wrongpct = 100. * wrong / total
    print('disc. split-merge: correct', correct, '=', correctpct, '%')
    print('disc. split-merge: wrong', wrong, '=', wrongpct, '%')
    assert wrong == 0
def test_transform(self):
    """Round-trip the node transformations and compare bracketings.

    For each set of transformations the corpus is read afresh, every tree
    is transformed, and for the first 100 sentences the reverse-transformed
    tree must yield the same bracketings (after canonicalization) as the
    corresponding tree of an untransformed reader.

    Raises:
        AssertionError: if the bracketings before and after differ.
    """
    from discodop.treebanktransforms import (
        transform, reversetransform, bracketings)
    from discodop.treebank import NegraCorpusReader

    goldreader = NegraCorpusReader('alpinosample.export')
    candidates = (
        ('FUNC-NODE', ),
        ('MORPH-NODE', ),
        ('LEMMA-NODE', ),
        ('FUNC-NODE', 'MORPH-NODE', 'LEMMA-NODE'),
    )
    for transformations in candidates:
        # A separate reader supplies the trees handed to transform().
        freshreader = NegraCorpusReader('alpinosample.export')
        pairs = zip(
            freshreader.trees().values(), freshreader.sents().values())
        trees = [
            transform(tree, sent, transformations) for tree, sent in pairs]
        firsthundred = islice(zip(goldreader.trees().values(), trees), 100)
        for goldtree, changed in firsthundred:
            before = bracketings(canonicalize(goldtree))
            restored = reversetransform(changed.copy(True), transformations)
            after = bracketings(canonicalize(restored))
            assert before == after, (
                'mismatch with %r\nbefore: %r\nafter: %r' % (
                    transformations, before, after))
def test_punct():
    """Check that moving punctuation leaves the fan-out of nodes unchanged.

    The sample corpus is read with ``punct='move'``, with ``punct='remove'``
    and with ``headrules=None`` (the last one is only printed as a
    diagnostic). Per sentence, the phrasal nodes of the first two variants
    are sorted on their lowest leaf, paired, and compared on fan-out.

    Raises:
        AssertionError: if a pair of nodes differs in fan-out.
    """
    from discodop.treebank import NegraCorpusReader

    def phrasal(node):
        # Non-empty node whose first child is itself a Tree.
        return node and isinstance(node[0], Tree)

    def lowestleaf(node):
        return min(node.leaves())

    filename = 'alpinosample.export'
    mangledtrees = NegraCorpusReader(filename, punct='move')
    strippedreader = NegraCorpusReader(filename, punct='remove')
    strippedtrees = list(strippedreader.trees().values())
    originalreader = NegraCorpusReader(
        filename, headrules=None, encoding='iso-8859-1')
    originals = list(originalreader.trees().values())

    rows = zip(
        mangledtrees.trees().values(),
        mangledtrees.sents().values(),
        strippedtrees,
        originals)
    for n, (mangled, sent, stripped, original) in enumerate(rows):
        print(n, end='. ')
        movednodes = sorted(
            addbitsets(mangled).subtrees(phrasal), key=lowestleaf)
        strippednodes = sorted(
            addbitsets(stripped).subtrees(phrasal), key=lowestleaf)
        for a, b in zip(movednodes, strippednodes):
            fana, fanb = fanout(a), fanout(b)
            if fana != fanb:
                # Dump all variants of the sentence before failing.
                print(' '.join(sent))
                print(mangled)
                print(stripped)
                print(original)
            assert fana == fanb, '%d %d\n%s\n%s' % (fana, fanb, a, b)
    print()
def test_transforms():
    """Test reversibility of Tiger transformations.

    Applies the ``S-RC``, ``VP-GF`` and ``NP`` transformations to every tree
    of the sample corpus, reverses them, and for the first 100 sentences
    compares the bracketings of the canonicalized result with those of the
    corresponding tree from a second, untransformed reader. Mismatches are
    reported on stdout with precision and recall; nothing is asserted.
    """
    from discodop.treebanktransforms import (
        transform, reversetransform, bracketings)
    from discodop.treebank import NegraCorpusReader, handlefunctions
    headrules = None  # 'alpino.headrules'
    # Two readers: trees from `nn` are handed to transform(), trees from
    # `n` serve as the gold standard for the comparison.
    n = NegraCorpusReader('alpinosample.export', headrules=headrules)
    nn = NegraCorpusReader('alpinosample.export', headrules=headrules)
    transformations = ('S-RC', 'VP-GF', 'NP')
    trees = [
        transform(tree, sent, transformations)
        for tree, sent in zip(nn.trees().values(), nn.sents().values())]
    print('\ntransformed')
    correct = exact = e = 0
    # Sentence number to report unconditionally; -1 never matches.
    z = -1  # 825
    for a, b, c, d in islice(
            zip(n.trees().values(), n.sents().values(), trees, count()),
            100):
        transformc = reversetransform(c.copy(True), b, transformations)
        c1 = bracketings(canonicalize(a))
        c2 = bracketings(canonicalize(transformc))
        if c1 != c2 or e == z:
            gold, restored = set(c1), set(c2)
            overlap = len(gold & restored)
            # Precision is relative to the restored tree, recall to gold.
            precision = overlap / len(restored)
            recall = overlap / len(gold)
            if precision != 1.0 or recall != 1.0 or d == z:
                # .decode() keeps the tokens str; joining str with the
                # bytes returned by encode() raises TypeError on Python 3.
                print(d, ' '.join(
                    ':'.join(
                        (str(idx), word.encode('unicode-escape').decode()))
                    for idx, word in enumerate(b)))
                print('no match', precision, recall)
                print(len(c1), len(c2),
                      'gold-transformed', gold - restored,
                      'transformed-gold', restored - gold)
                print(a)
                print(transformc)
                handlefunctions('add', a)
                print(a, '\n', b, '\n\n')
            else:
                correct += 1
        else:
            exact += 1
            correct += 1
        e += 1
    print('matches', correct, '/', e, 100 * correct / e, '%')
    print('exact', exact)
def test_transform(self):
    """Round-trip the node transformations and compare bracketings.

    For each set of transformations the corpus is read afresh, every tree
    is transformed, and for the first 100 sentences the tree obtained by
    ``reversetransform`` (which also receives the sentence) must yield the
    same bracketings, after canonicalization, as the corresponding tree of
    an untransformed reader.

    Raises:
        AssertionError: if the bracketings before and after differ.
    """
    from discodop.treebanktransforms import (
        transform, reversetransform, bracketings)
    from discodop.treebank import NegraCorpusReader

    goldreader = NegraCorpusReader('alpinosample.export')
    candidates = (
        ('FUNC-NODE', ),
        ('MORPH-NODE', ),
        ('LEMMA-NODE', ),
        ('FUNC-NODE', 'MORPH-NODE', 'LEMMA-NODE'),
    )
    for transformations in candidates:
        # A separate reader supplies the trees handed to transform().
        freshreader = NegraCorpusReader('alpinosample.export')
        pairs = zip(
            freshreader.trees().values(), freshreader.sents().values())
        trees = [
            transform(tree, sent, transformations) for tree, sent in pairs]
        firsthundred = islice(
            zip(goldreader.sents().values(),
                goldreader.trees().values(),
                trees),
            100)
        for sent, goldtree, changed in firsthundred:
            before = bracketings(canonicalize(goldtree))
            restored = reversetransform(
                changed.copy(True), sent, transformations)
            after = bracketings(canonicalize(restored))
            assert before == after, (
                'mismatch with %r\nbefore: %r\nafter: %r' % (
                    transformations, before, after))
def testtransforms():
    """Test whether the Tiger transformations are reversible.

    Applies the ``S-RC``, ``VP-GF`` and ``NP`` transformations with
    ``transform`` to every tree of the sample corpus, undoes them with
    ``reversetransform``, and for the first 100 sentences compares the
    bracketings of the canonicalized result with those of the
    corresponding tree from a second, untransformed reader. Mismatches are
    reported on stdout with precision and recall; nothing is asserted.
    """
    from discodop.treetransforms import canonicalize
    from discodop.treebank import NegraCorpusReader, handlefunctions
    headrules = None  # 'alpino.headrules'
    # Two readers: trees from `nn` are handed to transform(), trees from
    # `n` serve as the gold standard for the comparison.
    n = NegraCorpusReader('.', 'alpinosample.export', headrules=headrules)
    nn = NegraCorpusReader('.', 'alpinosample.export', headrules=headrules)
    transformations = ('S-RC', 'VP-GF', 'NP')
    trees = [
        transform(tree, sent, transformations)
        for tree, sent in zip(
            nn.parsed_sents().values(), nn.sents().values())]
    print('\ntransformed')
    correct = exact = d = 0
    # Sentence number to report unconditionally; -1 never matches.
    z = -1  # 825
    for a, b, c in islice(
            zip(n.parsed_sents().values(), trees, n.sents().values()),
            100):
        transformb = reversetransform(b.copy(True), transformations)
        b1 = bracketings(canonicalize(a))
        b2 = bracketings(canonicalize(transformb))
        if b1 != b2 or d == z:
            gold, restored = set(b1), set(b2)
            overlap = len(gold & restored)
            # Precision is relative to the restored tree, recall to gold.
            precision = overlap / len(restored)
            recall = overlap / len(gold)
            if precision != 1.0 or recall != 1.0 or d == z:
                # .decode() keeps the tokens str; joining str with the
                # bytes returned by encode() raises TypeError on Python 3.
                print(d, ' '.join(
                    ':'.join(
                        (str(idx), word.encode('unicode-escape').decode()))
                    for idx, word in enumerate(c)))
                print('no match', precision, recall)
                print(len(b1), len(b2),
                      'gold-transformed', gold - restored,
                      'transformed-gold', restored - gold)
                print(a)
                print(transformb)
                handlefunctions('add', a)
                print(a)
                print(b)
                print()
            else:
                correct += 1
        else:
            exact += 1
            correct += 1
        d += 1
    print('matches', correct, '/', d, 100 * correct / d, '%')
    print('exact', exact)
def test_transforms():
    """Test reversibility of Tiger transformations.

    Applies the ``S-RC``, ``VP-GF`` and ``NP`` transformations to every tree
    of the sample corpus, reverses them, and for the first 100 sentences
    compares the bracketings of the canonicalized result with those of the
    corresponding tree from a second, untransformed reader. Mismatches are
    reported on stdout with precision and recall; nothing is asserted.
    """
    from discodop.treebanktransforms import (
        transform, reversetransform, bracketings)
    from discodop.treebank import NegraCorpusReader, handlefunctions
    headrules = None  # 'alpino.headrules'
    # Two readers: trees from `nn` are handed to transform(), trees from
    # `n` serve as the gold standard for the comparison.
    n = NegraCorpusReader('alpinosample.export', headrules=headrules)
    nn = NegraCorpusReader('alpinosample.export', headrules=headrules)
    transformations = ('S-RC', 'VP-GF', 'NP')
    trees = [
        transform(tree, sent, transformations)
        for tree, sent in zip(nn.trees().values(), nn.sents().values())]
    print('\ntransformed')
    correct = exact = e = 0
    # Sentence number to report unconditionally; -1 never matches.
    z = -1  # 825
    for a, b, c, d in islice(
            zip(n.trees().values(), n.sents().values(), trees, count()),
            100):
        transformc = reversetransform(c.copy(True), transformations)
        c1 = bracketings(canonicalize(a))
        c2 = bracketings(canonicalize(transformc))
        if c1 != c2 or e == z:
            gold, restored = set(c1), set(c2)
            overlap = len(gold & restored)
            # Precision is relative to the restored tree, recall to gold.
            precision = overlap / len(restored)
            recall = overlap / len(gold)
            if precision != 1.0 or recall != 1.0 or d == z:
                # .decode() keeps the tokens str; joining str with the
                # bytes returned by encode() raises TypeError on Python 3.
                print(d, ' '.join(
                    ':'.join(
                        (str(idx), word.encode('unicode-escape').decode()))
                    for idx, word in enumerate(b)))
                print('no match', precision, recall)
                print(len(c1), len(c2),
                      'gold-transformed', gold - restored,
                      'transformed-gold', restored - gold)
                print(a)
                print(transformc)
                handlefunctions('add', a)
                print(a, '\n', b, '\n\n')
            else:
                correct += 1
        else:
            exact += 1
            correct += 1
        e += 1
    print('matches', correct, '/', e, 100 * correct / e, '%')
    print('exact', exact)
def test_grammar(debug=False):
    """Extract several grammars from the sample corpus and parse with them.

    A treebank grammar (only built when ``debug`` is set), a DOP reduction
    of the first two trees and a double-DOP grammar of the first ten
    binarized trees are constructed. Every sentence is parsed exhaustively
    with the double-DOP grammar. For each chart the k-best derivations are
    mapped back to parse trees, ``exp(-p)`` is summed per distinct parse,
    and no (derivation, p) pair may occur twice for the same parse.

    Args:
        debug: when true, print grammars, sentences, parses and derivations.

    Raises:
        AssertionError: if ``testgrammar()`` reports failure for the
            double-DOP grammar, or if a parse has duplicate derivations.
    """
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers, removefanoutmarkers
    from discodop.disambiguation import recoverfragments
    from discodop.kbest import lazykbest
    from math import exp

    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(corpus.sents().values())
    firstten = list(corpus.trees().values())[:10]
    trees = [
        addfanoutmarkers(binarize(orig.copy(True), horzmarkov=1))
        for orig in firstten]

    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')
    reduction = dopreduction(trees[:2], sents[:2])[0]
    grammar = Grammar(reduction, start=trees[0].label)
    if debug:
        print(grammar)
    # Result deliberately ignored for the DOP reduction.
    grammar.testgrammar()

    grammarx, backtransform, _, _ = doubledop(
        trees, sents, debug=False, numproc=1)
    if debug:
        print('\ndouble dop grammar')
    grammar = Grammar(grammarx, start=trees[0].label)
    grammar.getmapping(
        grammar,
        striplabelre=None,
        neverblockre=re.compile('^#[0-9]+|.+}<'),
        splitprune=False,
        markorigin=False)
    if debug:
        print(grammar)
    assert grammar.testgrammar()[0], "RFE should sum to 1."

    for tree, sent in zip(corpus.trees().values(), sents):
        if debug:
            escaped = (
                word.encode('unicode-escape').decode() for word in sent)
            print("sentence:", ' '.join(escaped))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        if debug:
            print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')
        if chart:
            # mpp: parse string -> summed exp(-p);
            # parsetrees: parse string -> list of (derivation, p).
            mpp, parsetrees = {}, {}
            derivations, _ = lazykbest(chart, 1000, '}<')
            ranked = chart.rankededges[chart.root()]
            for edge, (deriv, cost) in zip(ranked, derivations):
                recovered = Tree(
                    recoverfragments(edge.key, chart, backtransform))
                parse = str(removefanoutmarkers(unbinarize(recovered)))
                mpp[parse] = mpp.get(parse, 0.0) + exp(-cost)
                parsetrees.setdefault(parse, []).append((deriv, cost))
            if debug:
                print(len(mpp), 'parsetrees',
                      sum(map(len, parsetrees.values())), 'derivations')
            for parse, prob in sorted(mpp.items(), key=itemgetter(1)):
                if debug:
                    print(prob, parse, '\nmatch:', parse == str(tree))
                derivs = parsetrees[parse]
                unique = len(set(derivs)) == len(derivs)
                if not unique:
                    # Show the chart before the assertion below fires.
                    print('chart:\n', chart)
                assert unique
                if debug:
                    for deriv, cost in sorted(derivs, key=itemgetter(1)):
                        print(' <= %6g %s' % (exp(-cost), deriv))
        elif debug:
            print('no parse\n', chart)
        if debug:
            print()

    # Grammar extraction from a single unary chain must not raise.
    chain = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
    Grammar(treebankgrammar([chain], [[str(a) for a in range(10)]]))
"""https://gist.github.com/andreasvc/7507135#file-tigersplit-py""" """ The train-test split described in Hall & Nivre (2008), Parsing Discontinuous Phrase Structure with Grammatical Functions. Corpus is divided in Sections 0-9, where sentence i is allocated to section i mod 10. For development train on sections 2-9; evaluate on section 1. For final evaluation (test) train on sections 1-9; evaluate on section 0. """ import io import os from discodop.treebank import NegraCorpusReader #corpus = NegraCorpusReader('tiger/corpus', 'tiger_release_aug07.export', #encoding='iso-8859-1') corpus = NegraCorpusReader('tiger21/corpus/tiger_release_aug07.export', encoding='iso-8859-1') #os.mkdir('tiger-split/') io.open('tiger21/tigertraindev.export', 'w', encoding='utf8').writelines( a for n, a in enumerate(corpus.blocks().values(), 1) if n % 10 > 1) io.open('tiger21/tigerdev.export', 'w', encoding='utf8').writelines( a for n, a in enumerate(corpus.blocks().values(), 1) if n % 10 == 1) io.open('tiger21/tigertraintest.export', 'w', encoding='utf8').writelines( a for n, a in enumerate(corpus.blocks().values(), 1) if n % 10 != 0) io.open('tiger21/tigertest.export', 'w', encoding='utf8').writelines( a for n, a in enumerate(corpus.blocks().values(), 1) if n % 10 == 0)
def test():
    """Build LCFRS, DOP-reduction and double-DOP grammars and parse with them.

    Grammars are extracted from the first ten binarized trees of the sample
    corpus and printed. Every sentence is then parsed exhaustively with the
    double-DOP grammar. For each chart the k-best derivations are mapped
    back to parse trees, ``exp(-p)`` is summed per distinct parse, and no
    (derivation, p) pair may occur twice for the same parse. Individual
    derivations are only listed when ``--debug`` is in ``sys.argv``.

    Raises:
        AssertionError: if ``testgrammar()`` returns a false value for the
            double-DOP grammar, or if a parse has duplicate derivations.
    """
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import (
        binarize, unbinarize, addfanoutmarkers, removefanoutmarkers)
    from discodop.disambiguation import recoverfragments
    from discodop.kbest import lazykbest
    from discodop.fragments import getfragments

    logging.basicConfig(level=logging.DEBUG, format='%(message)s')
    filename = "alpinosample.export"
    corpus = NegraCorpusReader('.', filename, punct='move')
    sents = list(corpus.sents().values())
    firstten = list(corpus.parsed_sents().values())[:10]
    trees = [
        addfanoutmarkers(binarize(orig.copy(True), horzmarkov=1))
        for orig in firstten]

    print('plcfrs')
    lcfrs = Grammar(treebankgrammar(trees, sents), start=trees[0].label)
    print(lcfrs)

    print('dop reduction')
    reduction = dopreduction(trees[:2], sents[:2])[0]
    grammar = Grammar(reduction, start=trees[0].label)
    print(grammar)
    grammar.testgrammar()

    fragments = getfragments(trees, sents, 1)
    debug = '--debug' in sys.argv
    grammarx, backtransform, _ = doubledop(trees, fragments, debug=debug)
    print('\ndouble dop grammar')
    grammar = Grammar(grammarx, start=trees[0].label)
    grammar.getmapping(
        grammar,
        striplabelre=None,
        neverblockre=re.compile(b'^#[0-9]+|.+}<'),
        splitprune=False,
        markorigin=False)
    print(grammar)
    assert grammar.testgrammar(), "DOP1 should sum to 1."

    for tree, sent in zip(corpus.parsed_sents().values(), sents):
        escaped = (word.encode('unicode-escape').decode() for word in sent)
        print("sentence:", ' '.join(escaped))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        print('\n', msg, end='')
        print("\ngold ", tree)
        print("double dop", end='')
        if not chart:
            print("no parse")
            print(chart)
        else:
            # mpp: parse string -> summed exp(-p);
            # parsetrees: parse string -> list of (derivation, p).
            mpp = {}
            parsetrees = {}
            derivations, _ = lazykbest(chart, 1000, b'}<')
            ranked = chart.rankededges[chart.root()]
            for edge, (deriv, cost) in zip(ranked, derivations):
                recovered = Tree(recoverfragments(
                    edge.getkey(), chart, grammar, backtransform))
                parse = str(removefanoutmarkers(unbinarize(recovered)))
                mpp[parse] = mpp.get(parse, 0.0) + exp(-cost)
                parsetrees.setdefault(parse, []).append((deriv, cost))
            print(len(mpp), 'parsetrees', end='')
            print(sum(map(len, parsetrees.values())), 'derivations')
            for parse, prob in sorted(mpp.items(), key=itemgetter(1)):
                print(prob, '\n', parse, end='')
                print("match:", parse == str(tree))
                derivs = parsetrees[parse]
                assert len(set(derivs)) == len(derivs)
                if debug:
                    for deriv, cost in sorted(derivs, key=itemgetter(1)):
                        print(' <= %6g %s' % (exp(-cost), deriv))
        print()

    # Grammar extraction from a single unary chain must not raise.
    chain = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
    Grammar(treebankgrammar([chain], [[str(a) for a in range(10)]]))
def test_grammar(debug=False):
    """Extract several grammars from the sample corpus and parse with them.

    A treebank grammar (only built when ``debug`` is set), a DOP reduction
    of the first two trees and a double-DOP grammar of the first ten
    binarized trees are constructed. Every sentence is parsed exhaustively
    with the double-DOP grammar. For each chart the k-best derivations are
    mapped back to parse trees, ``exp(-p)`` is summed per distinct parse,
    and no (derivation, p) pair may occur twice for the same parse.

    Args:
        debug: when true, print grammars, sentences, parses and
            derivations; the flag is also forwarded to ``doubledop``.

    Raises:
        AssertionError: if ``testgrammar()`` reports failure for the
            double-DOP grammar, or if a parse has duplicate derivations.
    """
    from discodop.grammar import treebankgrammar, dopreduction, doubledop
    from discodop import plcfrs
    from discodop.containers import Grammar
    from discodop.treebank import NegraCorpusReader
    from discodop.treetransforms import addfanoutmarkers, removefanoutmarkers
    from discodop.disambiguation import recoverfragments
    from discodop.kbest import lazykbest
    from math import exp

    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    sents = list(corpus.sents().values())
    firstten = list(corpus.trees().values())[:10]
    trees = [
        addfanoutmarkers(binarize(orig.copy(True), horzmarkov=1))
        for orig in firstten]

    if debug:
        print('plcfrs\n', Grammar(treebankgrammar(trees, sents)))
        print('dop reduction')
    reduction = dopreduction(trees[:2], sents[:2])[0]
    grammar = Grammar(reduction, start=trees[0].label)
    if debug:
        print(grammar)
    # Result deliberately ignored for the DOP reduction.
    grammar.testgrammar()

    grammarx, backtransform, _, _ = doubledop(
        trees, sents, debug=debug, numproc=1)
    if debug:
        print('\ndouble dop grammar')
    grammar = Grammar(grammarx, start=trees[0].label)
    grammar.getmapping(
        grammar,
        striplabelre=None,
        neverblockre=re.compile(b'^#[0-9]+|.+}<'),
        splitprune=False,
        markorigin=False)
    if debug:
        print(grammar)
    assert grammar.testgrammar()[0], "RFE should sum to 1."

    for tree, sent in zip(corpus.trees().values(), sents):
        if debug:
            escaped = (
                word.encode('unicode-escape').decode() for word in sent)
            print("sentence:", ' '.join(escaped))
        chart, msg = plcfrs.parse(sent, grammar, exhaustive=True)
        if debug:
            print('\n', msg, '\ngold ', tree, '\n', 'double dop', end='')
        if chart:
            # mpp: parse string -> summed exp(-p);
            # parsetrees: parse string -> list of (derivation, p).
            mpp, parsetrees = {}, {}
            derivations, _ = lazykbest(chart, 1000, b'}<')
            ranked = chart.rankededges[chart.root()]
            for edge, (deriv, cost) in zip(ranked, derivations):
                recovered = Tree(
                    recoverfragments(edge.key, chart, backtransform))
                parse = str(removefanoutmarkers(unbinarize(recovered)))
                mpp[parse] = mpp.get(parse, 0.0) + exp(-cost)
                parsetrees.setdefault(parse, []).append((deriv, cost))
            if debug:
                print(len(mpp), 'parsetrees',
                      sum(map(len, parsetrees.values())), 'derivations')
            for parse, prob in sorted(mpp.items(), key=itemgetter(1)):
                if debug:
                    print(prob, parse, '\nmatch:', parse == str(tree))
                derivs = parsetrees[parse]
                unique = len(set(derivs)) == len(derivs)
                if not unique:
                    # Show the chart before the assertion below fires.
                    print('chart:\n', chart)
                assert unique
                if debug:
                    for deriv, cost in sorted(derivs, key=itemgetter(1)):
                        print(' <= %6g %s' % (exp(-cost), deriv))
        elif debug:
            print('no parse\n', chart)
        if debug:
            print()

    # Grammar extraction from a single unary chain must not raise.
    chain = Tree.parse("(ROOT (S (F (E (S (C (B (A 0))))))))", parse_leaf=int)
    Grammar(treebankgrammar([chain], [[str(a) for a in range(10)]]))