def testpunct():
    """Verify that punctuation movement does not increase fan-out.

    Reads 'alpinosample.export' three ways (punctuation moved, removed,
    and untouched originals) and asserts that corresponding phrasal
    subtrees of the moved and removed versions have equal fan-out.
    Raises AssertionError (with both fan-outs and subtrees) on mismatch.
    """
    from discodop.treetransforms import addbitsets, fanout
    from discodop.treebank import NegraCorpusReader

    def phrasal(x):
        """Predicate: node is non-empty and its first child is a Tree."""
        # was a lambda assigned to a name (PEP 8 E731); same truth value
        return len(x) and isinstance(x[0], Tree)

    filename = 'alpinosample.export'
    mangledtrees = NegraCorpusReader('.', filename, punct='move')
    nopunct = list(NegraCorpusReader('.', filename,
            punct='remove').parsed_sents().values())
    originals = list(NegraCorpusReader('.', filename, headrules=None,
            encoding='iso-8859-1').parsed_sents().values())
    # loop variable renamed from 'nopunct' so it no longer shadows the
    # list of punctuation-free trees built above; likewise the sort-key
    # lambdas use 'node' instead of re-binding 'n'.
    for n, mangled, sent, nopuncttree, original in zip(count(),
            mangledtrees.parsed_sents().values(),
            mangledtrees.sents().values(), nopunct, originals):
        print(n, end='')
        # compare subtrees pairwise, aligned by leftmost leaf index
        for a, b in zip(sorted(addbitsets(mangled).subtrees(phrasal),
                key=lambda node: min(node.leaves())),
                sorted(addbitsets(nopuncttree).subtrees(phrasal),
                key=lambda node: min(node.leaves()))):
            if fanout(a) != fanout(b):
                # dump full context before the assertion fires
                print(' '.join(sent))
                print(mangled)
                print(nopuncttree)
                print(original)
            assert fanout(a) == fanout(b), '%d %d\n%s\n%s' % (
                    fanout(a), fanout(b), a, b)
    print()
def test_punct():
    """Verify that punctuation movement does not increase fan-out."""
    from discodop.treebank import NegraCorpusReader

    def phrasal(node):
        """Predicate selecting nodes whose first child is a Tree."""
        return node and isinstance(node[0], Tree)

    def byleftmostleaf(node):
        """Sort key: index of a node's leftmost leaf."""
        return min(node.leaves())

    filename = 'alpinosample.export'
    moved = NegraCorpusReader(filename, punct='move')
    stripped = list(
            NegraCorpusReader(filename, punct='remove').trees().values())
    plain = list(NegraCorpusReader(filename, headrules=None,
            encoding='iso-8859-1').trees().values())
    rows = zip(moved.trees().values(), moved.sents().values(),
            stripped, plain)
    for n, (mangled, sent, nopunct, original) in enumerate(rows):
        print(n, end='. ')
        # align phrasal subtrees of both versions by leftmost leaf and
        # demand equal fan-out for every pair
        mangledsub = sorted(addbitsets(mangled).subtrees(phrasal),
                key=byleftmostleaf)
        nopunctsub = sorted(addbitsets(nopunct).subtrees(phrasal),
                key=byleftmostleaf)
        for a, b in zip(mangledsub, nopunctsub):
            if fanout(a) != fanout(b):
                # show the offending sentence and all three trees
                print(' '.join(sent))
                print(mangled)
                print(nopunct)
                print(original)
            assert fanout(a) == fanout(b), '%d %d\n%s\n%s' % (
                    fanout(a), fanout(b), a, b)
    print()
def test_optimalbinarize():
    """Verify that all optimal parsing complexities are lower than or
    equal to the complexities of right-to-left binarizations."""
    from discodop.treetransforms import optimalbinarize, complexityfanout
    from discodop.treebank import NegraCorpusReader
    corpus = NegraCorpusReader('alpinosample.export', punct='move')
    total = violations = violationshd = 0
    # NOTE(review): [:-2000] drops the last 2000 trees; on a corpus with
    # fewer than 2000 sentences this yields an empty list and the loop
    # body never runs, making the final assert vacuous — confirm the
    # cutoff is intentional for this sample corpus.
    for n, (tree, sent) in enumerate(zip(list(
            corpus.trees().values())[:-2000], corpus.sents().values())):
        t = addbitsets(tree)
        # skip trees where every node already has fan-out 1; the choice
        # of binarization cannot make a difference for those
        if all(fanout(x) == 1 for x in t.subtrees()):
            continue
        print(n, tree, '\n', ' '.join(sent))
        total += 1
        # optimal binarization, not head-driven
        optbin = optimalbinarize(tree.copy(True), headdriven=False,
                h=None, v=1)
        # undo head-ordering to get a normal right-to-left binarization
        normbin = addbitsets(binarize(canonicalize(Tree.convert(tree))))
        # the optimal binarization must not exceed the baseline's
        # maximum complexity/fan-out
        if (max(map(complexityfanout, optbin.subtrees()))
                > max(map(complexityfanout, normbin.subtrees()))):
            print('non-hd\n', tree)
            print(max(map(complexityfanout, optbin.subtrees())), optbin)
            print(max(map(complexityfanout, normbin.subtrees())),
                    normbin, '\n')
            violations += 1
        # same comparison with head-driven binarization (h=1)
        optbin = optimalbinarize(tree.copy(True), headdriven=True,
                h=1, v=1)
        normbin = addbitsets(binarize(Tree.convert(tree), horzmarkov=1))
        if (max(map(complexityfanout, optbin.subtrees()))
                > max(map(complexityfanout, normbin.subtrees()))):
            print('hd\n', tree)
            print(max(map(complexityfanout, optbin.subtrees())), optbin)
            print(max(map(complexityfanout, normbin.subtrees())),
                    normbin, '\n')
            violationshd += 1
    print('opt. bin. violations normal: %d / %d; hd: %d / %d' % (
            violations, total, violationshd, total))
    assert violations == violationshd == 0
def new_flatten(tree, sent, ids):
    """ Auxiliary function for Double-DOP.

    Remove internal nodes from a tree and read off its binarized
    productions. Aside from returning productions, also return tree with
    lexical and frontier nodes replaced by a templating symbol '%s'.
    Input is a tree and sentence, as well as an iterator which yields
    unique IDs for non-terminals introduced by the binarization;
    output is a tuple (prods, frag). Trees are in the form of strings.

    NB: this version is currently not used.

    #>>> ids = count()
    #>>> sent = [None, ',', None, '.']
    #>>> tree = "(ROOT (S_2 0 2) (ROOT|<$,>_2 ($, 1) ($. 3)))"
    #>>> new_flatten(tree, sent, ids)
    #([(('ROOT', 'ROOT}<0>', '$.@.'), ((0, 1),)),
    #(('ROOT}<0>', 'S_2', '$,@,'), ((0, 1, 0),)),
    #(('$,@,', 'Epsilon'), (',',)), (('$.@.', 'Epsilon'), ('.',))],
    #'(S_2 {0}) (ROOT|<$,>_2 ($, {1}) ($. {2}))',
    #['(S_2 ', 0, ') (ROOT|<$,>_2 ($, ', 1, ') ($. ', 2 '))'])
    """
    from discodop.treetransforms import factorconstituent, addbitsets

    def repl(x):
        """ Add information to a frontier or terminal:

        :frontiers: ``(label indices)``
        :terminals: ``(tag@word idx)``
        """
        n = x.group(2)  # index w/leading space
        nn = int(n)
        if sent[nn] is None:
            return x.group(0)  # frontier node: keep (label indices) as-is
        word = quotelabel(sent[nn])
        # terminal: rewrite as (tag@word idx)
        return "(%s@%s%s)" % (x.group(1), word, n)

    if tree.count(' ') == 1:
        # single-token tree: nothing to flatten
        return lcfrs_productions(addbitsets(tree), sent), ([str(tree)], [])
    # give terminals unique POS tags
    prod = FRONTIERORTERM.sub(repl, tree)
    # remove internal nodes, reorder frontiers/terminals by leaf index
    prod = "%s %s)" % (prod[:prod.index(' ')], ' '.join(x.group(0)
            for x in sorted(FRONTIERORTERM.finditer(prod),
            key=lambda x: int(x.group(2)))))
    prods = lcfrs_productions(factorconstituent(addbitsets(prod), "}",
            factor='left', markfanout=True, ids=ids, threshold=2), sent)
    # remember original order of frontiers / terminals for template
    order = [int(x.group(2)) for x in FRONTIERORTERM.finditer(prod)]
    # ensure string, split around substitution sites.
    treeparts = FRONTIERORTERM_new.split(str(tree))
    return prods, (treeparts, order)
def treebankfanout(trees):
    """ Get maximal fan-out of a list of trees. """
    def candidates():
        """Yield (fanout, tree index) for every non-unary node."""
        for idx, tree in enumerate(trees):
            for node in addbitsets(tree).subtrees(lambda x: len(x) > 1):
                yield fanout(node), idx

    try:
        return max(candidates())
    except ValueError:
        # max over empty sequence: 'treebank' may only have unary
        # productions
        return 1, 0
def treebankfanout(trees):
    """Get maximal fan-out of a list of trees."""
    from discodop.treetransforms import addbitsets, fanout
    best = None
    for idx, tree in enumerate(trees):
        for node in addbitsets(tree).subtrees(lambda x: len(x) > 1):
            pair = (fanout(node), idx)
            if best is None or pair > best:
                best = pair
    # no non-unary node found: 'treebank' may only have unary prods
    return (1, 0) if best is None else best
def test_balancedpunctraise(self):
    """Verify that raising punctuation into the tree leaves every node
    at fan-out 1, i.e. introduces no discontinuities."""
    # Tree whose punctuation tokens (leaves 3, 7, 10-14, 20, 25) are
    # attached under ROOT or inside the ISU cluster.
    tree = ParentedTree.parse('(ROOT ($, 3) ($[ 7) ($[ 13) ($, 14) ($, 20)'
            ' (S (NP (ART 0) (ADJA 1) (NN 2) (NP (CARD 4) (NN 5) (PP'
            ' (APPR 6) (CNP (NN 8) (ADV 9) (ISU ($. 10) ($. 11)'
            ' ($. 12))))) (S (PRELS 15) (MPN (NE 16) (NE 17)) (ADJD 18)'
            ' (VVFIN 19))) (VVFIN 21) (ADV 22) (NP (ADJA 23) (NN 24)))'
            ' ($. 25))', parse_leaf=int)
    sent = ("Die zweite Konzertreihe , sechs Abende mit ' Orgel plus "
            ". . . ' , die Hayko Siemens musikalisch leitet , bietet "
            "wieder ungewoehnliche Kombinationen .".split())
    punctraise(tree, sent)
    balancedpunctraise(tree, sent)
    # after raising, the tree must be continuous: maximal fan-out is 1
    assert max(map(fanout, addbitsets(tree).subtrees())) == 1
    # sanity check: the same tree with punctuation removed (and leaves
    # renumbered accordingly) is continuous as well
    nopunct = Tree.parse('(ROOT (S (NP (ART 0) (ADJA 1) (NN 2) (NP '
            '(CARD 3) (NN 4) (PP (APPR 5) (CNP (NN 6) (ADV 7)))) (S '
            '(PRELS 8) (MPN (NE 9) (NE 10)) (ADJD 11) (VVFIN 12))) '
            '(VVFIN 13) (ADV 14) (NP (ADJA 15) (NN 16))))', parse_leaf=int)
    assert max(map(fanout, addbitsets(nopunct).subtrees())) == 1
def flatten(tree, sent, ids, backtransform, binarized):
    """Auxiliary function for Double-DOP.

    Remove internal nodes from a tree and read off the (binarized)
    productions of the resulting flattened tree. Aside from returning
    productions, also return tree with lexical and frontier nodes replaced
    by a templating symbol '{n}' where n is an index.
    Input is a tree and sentence, as well as an iterator which yields
    unique IDs for non-terminals introduced by the binarization;
    output is a tuple (prods, frag). Trees are in the form of strings.

    >>> ids = UniqueIDs()
    >>> sent = [None, ',', None, '.']
    >>> tree = "(ROOT (S_2 0 2) (ROOT|<$,>_2 ($, 1) ($. 3)))"
    >>> flatten(tree, sent, ids, {}, True)  # doctest: +NORMALIZE_WHITESPACE
    ([(('ROOT', 'ROOT}<0>', '$.@.'), ((0, 1),)),
    (('ROOT}<0>', 'S_2', '$,@,'), ((0, 1, 0),)),
    (('$,@,', 'Epsilon'), (',',)), (('$.@.', 'Epsilon'), ('.',))],
    '(ROOT {0} (ROOT|<$,>_2 {1} {2}))')
    >>> flatten(tree, sent, ids, {}, False)  # doctest: +NORMALIZE_WHITESPACE
    ([(('ROOT', 'S_2', '$,@,', '$.@.'), ((0, 1, 0, 2),)),
    (('$,@,', 'Epsilon'), (',',)), (('$.@.', 'Epsilon'), ('.',))],
    '(ROOT {0} (ROOT|<$,>_2 {1} {2}))')"""
    from discodop.treetransforms import factorconstituent, addbitsets

    def repl(x):
        """Add information to a frontier or terminal node.

        :frontiers: ``(label indices)``
        :terminals: ``(tag@word idx)``"""
        n = x.group(2)  # index w/leading space
        nn = int(n)
        if sent[nn] is None:
            return x.group(0)  # frontier: keep (label indices) as-is
        word = quotelabel(sent[nn])
        # terminal: rewrite as (tag@word idx)
        return "(%s@%s%s)" % (x.group(1), word, n)

    if tree.count(' ') == 1:
        # single-token tree: no internal structure to remove
        return lcfrsproductions(addbitsets(tree), sent), str(tree)
    # give terminals unique POS tags
    prod = FRONTIERORTERM.sub(repl, tree)
    # remove internal nodes, reorder frontiers/terminals by leaf index
    prod = "%s %s)" % (prod[:prod.index(' ')], ' '.join(x.group(0)
            for x in sorted(FRONTIERORTERM.finditer(prod),
            key=lambda x: int(x.group(2)))))
    tmp = addbitsets(prod)
    if binarized:
        # only binarize the flattened fragment when requested
        tmp = factorconstituent(tmp, "}", factor='left', markfanout=True,
                markyf=True, ids=ids, threshold=2)
    prods = lcfrsproductions(tmp, sent)
    # remember original order of frontiers / terminals for template
    order = {x.group(2): "{%d}" % n
            for n, x in enumerate(FRONTIERORTERM.finditer(prod))}
    # mark substitution sites and ensure string.
    newtree = FRONTIERORTERM.sub(lambda x: order[x.group(2)], tree)
    prod = prods[0]
    if prod in backtransform:
        # normally, rules of fragments are disambiguated by binarization
        # IDs. In case there's a fragment with only one or two frontier
        # nodes, we add an artificial node.
        newlabel = "%s}<%s>%s" % (prod[0][0], next(ids),
                '' if len(prod[1]) == 1 else '_%d' % len(prod[1]))
        prod1 = ((prod[0][0], newlabel) + prod[0][2:], prod[1])
        # we have to determine fanout of the first nonterminal
        # on the right hand side
        prod2 = ((newlabel, prod[0][1]),
                tuple((0,) for component in prod[1]
                for a in component if a == 0))
        prods[:1] = [prod1, prod2]
    return prods, str(newtree)
def flatten(tree, sent, ids):
    """ Auxiliary function for Double-DOP.

    Remove internal nodes from a tree and read off the binarized
    productions of the resulting flattened tree. Aside from returning
    productions, also return tree with lexical and frontier nodes replaced
    by a templating symbol '{n}' where n is an index.
    Input is a tree and sentence, as well as an iterator which yields
    unique IDs for non-terminals introduced by the binarization;
    output is a tuple (prods, frag). Trees are in the form of strings.

    >>> ids = UniqueIDs()
    >>> sent = [None, ',', None, '.']
    >>> tree = "(ROOT (S_2 0 2) (ROOT|<$,>_2 ($, 1) ($. 3)))"
    >>> flatten(tree, sent, ids)
    ([(('ROOT', 'ROOT}<0>', '$.@.'), ((0, 1),)),
    (('ROOT}<0>', 'S_2', '$,@,'), ((0, 1, 0),)),
    (('$,@,', 'Epsilon'), (',',)), (('$.@.', 'Epsilon'), ('.',))],
    '(ROOT {0} (ROOT|<$,>_2 {1} {2}))')
    >>> flatten("(NN 0)", ["foo"], ids)
    ([(('NN', 'Epsilon'), ('foo',))], '(NN 0)')
    >>> flatten(r"(S (S|<VP> (S|<NP> (NP (ART 0) (CNP (CNP|<TRUNC> "
    ... "(TRUNC 1) (CNP|<KON> (KON 2) (CNP|<NN> (NN 3)))))) (S|<VAFIN> "
    ... "(VAFIN 4))) (VP (VP|<ADV> (ADV 5) (VP|<NP> (NP (ART 6) (NN 7)) "
    ... "(VP|<NP> (NP_2 8 10) (VP|<VVPP> (VVPP 9))))))))",
    ... ['Das', 'Garten-', 'und', 'Friedhofsamt', 'hatte', 'kuerzlich',
    ... 'dem', 'Ortsbeirat', None, None, None], ids)
    ([(('S', 'S}<8>_2', 'VVPP'), ((0, 1, 0),)),
    (('S}<8>_2', 'S}<7>', 'NP_2'), ((0, 1), (1,))),
    (('S}<7>', 'S}<6>', 'NN@Ortsbeirat'), ((0, 1),)),
    (('S}<6>', 'S}<5>', 'ART@dem'), ((0, 1),)),
    (('S}<5>', 'S}<4>', 'ADV@kuerzlich'), ((0, 1),)),
    (('S}<4>', 'S}<3>', 'VAFIN@hatte'), ((0, 1),)),
    (('S}<3>', 'S}<2>', 'NN@Friedhofsamt'), ((0, 1),)),
    (('S}<2>', 'S}<1>', 'KON@und'), ((0, 1),)),
    (('S}<1>', 'ART@Das', 'TRUNC@Garten-'), ((0, 1),)),
    (('ART@Das', 'Epsilon'), ('Das',)),
    (('TRUNC@Garten-', 'Epsilon'), ('Garten-',)),
    (('KON@und', 'Epsilon'), ('und',)),
    (('NN@Friedhofsamt', 'Epsilon'), ('Friedhofsamt',)),
    (('VAFIN@hatte', 'Epsilon'), ('hatte',)),
    (('ADV@kuerzlich', 'Epsilon'), ('kuerzlich',)),
    (('ART@dem', 'Epsilon'), ('dem',)),
    (('NN@Ortsbeirat', 'Epsilon'), ('Ortsbeirat',))],
    '(S (S|<VP> (S|<NP> (NP {0} (CNP (CNP|<TRUNC> {1} (CNP|<KON> {2} \
(CNP|<NN> {3}))))) (S|<VAFIN> {4})) (VP (VP|<ADV> {5} (VP|<NP> \
(NP {6} {7}) (VP|<NP> {8} (VP|<VVPP> {9})))))))')
    >>> flatten("(S|<VP>_2 (VP_3 (VP|<NP>_3 (NP 0) (VP|<ADV>_2 "
    ... "(ADV 2) (VP|<VVPP> (VVPP 4))))) (S|<VAFIN> (VAFIN 1)))",
    ... (None, None, None, None, None), ids)
    ([(('S|<VP>_2', 'S|<VP>_2}<10>', 'VVPP'), ((0,), (1,))),
    (('S|<VP>_2}<10>', 'S|<VP>_2}<9>', 'ADV'), ((0, 1),)),
    (('S|<VP>_2}<9>', 'NP', 'VAFIN'), ((0, 1),))],
    '(S|<VP>_2 (VP_3 (VP|<NP>_3 {0} (VP|<ADV>_2 {2} (VP|<VVPP> {3})))) \
(S|<VAFIN> {1}))')
    """
    from discodop.treetransforms import factorconstituent, addbitsets

    def repl(x):
        """ Add information to a frontier or terminal:

        :frontiers: ``(label indices)``
        :terminals: ``(tag@word idx)``
        """
        n = x.group(2)  # index w/leading space
        nn = int(n)
        if sent[nn] is None:
            return x.group(0)  # frontier: keep (label indices) as-is
        word = quotelabel(sent[nn])
        # terminal: rewrite as (tag@word idx)
        return "(%s@%s%s)" % (x.group(1), word, n)

    if tree.count(' ') == 1:
        # single-token tree: no internal structure to remove
        return lcfrs_productions(addbitsets(tree), sent), str(tree)
    # give terminals unique POS tags
    prod = FRONTIERORTERM.sub(repl, tree)
    # remove internal nodes, reorder frontiers/terminals by leaf index
    prod = "%s %s)" % (prod[:prod.index(' ')], ' '.join(x.group(0)
            for x in sorted(FRONTIERORTERM.finditer(prod),
            key=lambda x: int(x.group(2)))))
    prods = lcfrs_productions(factorconstituent(addbitsets(prod), "}",
            factor='left', markfanout=True, markyf=True, ids=ids,
            threshold=2), sent)
    # remember original order of frontiers / terminals for template
    order = {x.group(2): "{%d}" % n
            for n, x in enumerate(FRONTIERORTERM.finditer(prod))}
    # mark substitution sites and ensure string.
    newtree = FRONTIERORTERM.sub(lambda x: order[x.group(2)], tree)
    return prods, str(newtree)