def test_multiroot(self):
    """Round-trip the tree from ``multi_dep_tree()`` through induction and parsing.

    For each top-level labeling ('strict', 'child') combined with each
    recursive partitioning (direct extraction, 'fanout-1', left branching),
    a grammar is induced from the single tree, the fixed POS sequence
    ``pA pB pC pD pE`` is parsed, and the hybrid tree rebuilt from the best
    derivation is asserted to equal the original tree.
    """
    tree = multi_dep_tree()
    term_pos = the_terminal_labeling_factory().get_strategy('pos').token_label
    fanout_1 = the_recursive_partitioning_factory().get_partitioning(
        'fanout-1')

    for top_level in ('strict', 'child'):
        nont_labeling = the_labeling_factory().create_simple_labeling_strategy(
            top_level, 'pos+deprel')

        for partitioning in ([direct_extraction], fanout_1, [left_branching]):
            _, grammar = induce_grammar(
                [tree], nont_labeling, term_pos, partitioning, 'START')
            print(grammar)

            parser = LCFRS_parser(grammar, 'pA pB pC pD pE'.split(' '))
            print(parser.best_derivation_tree())

            # Work on a deep copy so the reference tree's own tokens keep
            # their edge labels for the final comparison.
            blank_tokens = copy.deepcopy(tree.full_token_yield())
            for tok in blank_tokens:
                tok.set_edge_label('_')

            rebuilt = parser.dcp_hybrid_tree_best_derivation(
                HybridTree(), blank_tokens, True, construct_conll_token)
            print(rebuilt)
            self.assertEqual(tree, rebuilt)
def test_conll_grammar_induction():
    """Induce a grammar from TEST_FILE and parse the trees of TEST_FILE_MODIFIED.

    Both corpora are read with ``parse_conll_corpus`` and passed through
    ``disconnect_punctuation``.  The grammar is induced with 'child'/'pos'
    nonterminal labeling, 'pos' terminal labeling and direct extraction.
    For every tree of the second corpus the original CoNLL string, the
    parsed tokens and the CoNLL string of the reconstructed hybrid tree are
    printed; nothing is asserted.
    """
    training_trees = disconnect_punctuation(
        parse_conll_corpus(TEST_FILE, False))

    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
    nonterminal_labeling = the_labeling_factory(
    ).create_simple_labeling_strategy('child', 'pos')
    _, grammar = d_i.induce_grammar(
        training_trees, nonterminal_labeling, terminal_labeling.token_label,
        [direct_extraction], 'START')

    parse_trees = disconnect_punctuation(
        parse_conll_corpus(TEST_FILE_MODIFIED, False))

    ignore_punctuation = True
    for tree in parse_trees:
        parser_input = terminal_labeling.prepare_parser_input(
            tree.token_yield())
        parser = LCFRS_parser(grammar, parser_input)

        # Blank the edge labels on a copy so the input tree stays intact.
        blank_tokens = copy.deepcopy(tree.full_token_yield())
        for tok in blank_tokens:
            tok.set_edge_label('_')

        h_tree = parser.dcp_hybrid_tree_best_derivation(
            HybridTree(), blank_tokens, ignore_punctuation,
            construct_conll_token)

        print('input -> hybrid-tree -> output')
        print(tree_to_conll_str(tree))
        print('parsed tokens')
        print([str(tok) for tok in h_tree.full_token_yield()])
        print('test_parser output')
        print(tree_to_conll_str(h_tree))
def test_single_root_induction(self):
    """Induce a grammar from two hybrid trees, parse with it and linearize it.

    The trees come from ``hybrid_tree_1()`` and ``hybrid_tree_2()``.  The
    test prints the recursive partitioning of the first tree, the maximum
    fanout over the grammar's nonterminals, the grammar, the best derivation
    for the POS sequence ``NP N V V``, and the hybrid tree reconstructed from
    that derivation.  It then exercises ``DCP_string`` edge labels and writes
    the linearized grammar to stdout.  Nothing is asserted.
    """
    tree = hybrid_tree_1()
    tree2 = hybrid_tree_2()

    print(tree.recursive_partitioning())

    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
    # Induction uses the 'empty'/'pos' labeling; 'child'/'pos+deprel' is only
    # used for the linearization at the end of this test.
    induction_labeling = the_labeling_factory(
    ).create_simple_labeling_strategy('empty', 'pos')
    _, grammar = induce_grammar(
        [tree, tree2], induction_labeling, terminal_labeling.token_label,
        [direct_extraction], 'START')

    fanouts = [grammar.fanout(nont) for nont in grammar.nonts()]
    print(max(fanouts))
    print(grammar)

    parser = LCFRS_parser(grammar, 'NP N V V'.split(' '))
    print(parser.best_derivation_tree())

    # Pair each word form with its POS tag to build the tokens handed to the
    # tree reconstruction.
    forms = 'Piet Marie helpen lezen'.split(' ')
    pos_tags = 'NP N V V'.split(' ')
    tokens = list(map(construct_conll_token, forms, pos_tags))

    hybrid_tree = parser.dcp_hybrid_tree_best_derivation(
        HybridTree(), tokens, True, construct_conll_token)
    print([str(tok) for tok in hybrid_tree.full_token_yield()])
    print(hybrid_tree)

    # Check that an edge label can be set on and read back from a DCP_string.
    dcp_string = DCP_string("foo")
    dcp_string.set_edge_label("bar")
    print(dcp_string, dcp_string.edge_label())

    linearization_labeling = the_labeling_factory(
    ).create_simple_labeling_strategy('child', 'pos+deprel')
    linearize(
        grammar,
        linearization_labeling,
        the_terminal_labeling_factory().get_strategy('pos'),
        sys.stdout)