def setUp(self):
    """Build the shared fixture tree for the sentence 'Piet Marie helpen lezen .'.

    Structure: 'helpen' (VP) is the root with children 'lezen' (V) and
    'Piet' (NP); 'Marie' (N) hangs below 'lezen'.  The final punctuation
    node is part of the ordered yield but left unattached.
    """
    fixture = HybridTree()
    # (node id, form, POS) triples, inserted in the original order.
    node_specs = [
        ("v1", "Piet", "NP"),
        ("v21", "Marie", "N"),
        ("v", "helpen", "VP"),
        ("v2", "lezen", "V"),
    ]
    for node_id, form, pos in node_specs:
        fixture.add_node(node_id, construct_conll_token(form, pos), True)
    fixture.add_child("v", "v2")
    fixture.add_child("v", "v1")
    fixture.add_child("v2", "v21")
    # Punctuation: ordered (True) but not connected to the tree (False).
    fixture.add_node("v3", construct_conll_token(".", "Punc"), True, False)
    fixture.add_to_root("v")
    self.tree = fixture
def fall_back_left_branching(forms, poss):
    """Build a trivial fall-back dependency tree over the given tokens.

    Token 0 becomes the root; every subsequent token is attached as a
    child of its immediate predecessor, producing a single left-to-right
    chain.  Each token gets the placeholder edge label '_'.

    :param forms: word forms of the sentence
    :param poss: POS tags, aligned with *forms*
    :return: the resulting chain-shaped HybridTree
    """
    chain = HybridTree()
    position = 0
    for word, tag in zip(forms, poss):
        leaf = construct_conll_token(word, tag)
        leaf.set_edge_label('_')
        chain.add_node(position, leaf, True)
        if position:
            # Attach to the previous token in the chain.
            chain.add_child(position - 1, position)
        else:
            chain.add_to_root(position)
        position += 1
    return chain
def test_grammar_export(self):
    """Induce a grammar from two fixture trees, export it to GF format,
    compile it, parse a POS sequence with GFParser, and reconstruct a
    hybrid tree from the best derivation's DCP evaluation."""
    tree = hybrid_tree_1()
    tree2 = hybrid_tree_2()
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
    _, grammar = induce_grammar(
        [tree, tree2],
        the_labeling_factory().create_simple_labeling_strategy(
            'empty', 'pos'),
        # the_labeling_factory().create_simple_labeling_strategy('child', 'pos+deprel'),
        terminal_labeling.token_label, [direct_extraction], 'START')
    # Diagnostic output: maximal fanout over all nonterminals, then the grammar.
    print(max([grammar.fanout(nont) for nont in grammar.nonts()]))
    print(grammar)
    prefix = '/tmp/'
    name = 'tmpGrammar'
    name_ = export(grammar, prefix, name)
    # GF compilation must succeed (exit status 0).
    self.assertEqual(0, compile_gf_grammar(prefix, name_))
    GFParser.preprocess_grammar(grammar)
    # Parse the POS yield of the second fixture sentence.
    string = ["NP", "N", "V", "V", "V"]
    parser = GFParser(grammar, string)
    self.assertTrue(parser.recognized())
    der = parser.best_derivation_tree()
    self.assertTrue(
        der.check_integrity_recursive(der.root_id(), grammar.start()))
    print(der)
    print(
        derivation_to_hybrid_tree(der, string,
                                  "Piet Marie helpen lezen leren".split(),
                                  construct_conll_token))
    # Evaluate the sDCP component and rebuild a hybrid tree from it.
    dcp = DCP_evaluator(der).getEvaluation()
    h_tree_2 = HybridTree()
    token_sequence = [
        construct_conll_token(form, lemma)
        for form, lemma in zip('Piet Marie helpen lezen leren'.split(' '),
                               'NP N V V V'.split(' '))
    ]
    dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                      construct_conll_token)
    print(h_tree_2)
def test_cfg_parser(self):
    """Induce a grammar via the cfg recursive partitioning and verify
    that both the generic LCFRS parser and the dedicated CFG parser
    recognize the same POS string and produce integrity-checked
    derivations that evaluate to a hybrid tree."""
    tree = hybrid_tree_1()
    tree2 = hybrid_tree_2()
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
    (_, grammar) = induce_grammar(
        [tree, tree2],
        the_labeling_factory().create_simple_labeling_strategy(
            'empty', 'pos'), terminal_labeling.token_label, [cfg], 'START')
    # Run the identical scenario through both parser implementations.
    for parser_class in [LCFRS_parser, CFGParser]:
        parser_class.preprocess_grammar(grammar)
        string = ["NP", "N", "V", "V", "V"]
        parser = parser_class(grammar, string)
        self.assertTrue(parser.recognized())
        der = parser.best_derivation_tree()
        self.assertTrue(
            der.check_integrity_recursive(der.root_id(), grammar.start()))
        print(der)
        print(
            derivation_to_hybrid_tree(
                der, string, "Piet Marie helpen lezen leren".split(),
                construct_conll_token))
        # Reconstruct a hybrid tree from the derivation's DCP evaluation.
        dcp = DCP_evaluator(der).getEvaluation()
        h_tree_2 = HybridTree()
        token_sequence = [
            construct_conll_token(form, lemma) for form, lemma in zip(
                'Piet Marie helpen lezen leren'.split(' '),
                'NP N V V V'.split(' '))
        ]
        dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                          construct_conll_token)
        print(h_tree_2)
def generic_parsing_test(self, parser_type, limit_train, limit_test,
                         compare_order):
    """Induce a grammar from a slice of the tiger CoNLL corpus, then
    re-parse the test slice (here identical to the training slice) with
    *parser_type*, checking that every enumerated derivation evaluates
    to the original tree and that derivations are pairwise different.

    :param parser_type: parser class; must offer preprocess_grammar,
        recognized, and all_derivation_trees
    :param limit_train: number of training trees to read
    :param limit_test: number of test trees to read
    :param compare_order: whether tree comparison is order-sensitive
    """
    def filter_by_id(n, trees):
        # Yield only trees whose running index is contained in n.
        # NOTE(review): currently unused within this test.
        j = 0
        for tree in trees:
            if j in n:
                yield tree
            j += 1
    #params
    train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
    test = train
    # test = 'res/dependency_conll/german/tiger/test/german_tiger_test.conll'
    trees = parse_conll_corpus(train, False, limit_train)
    primary_labelling = the_labeling_factory(
    ).create_simple_labeling_strategy("childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    start = 'START'
    recursive_partitioning = [cfg]
    (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling,
                                             term_labelling.token_label,
                                             recursive_partitioning, start)
    parser_type.preprocess_grammar(grammar_prim, term_labelling)
    # Re-read the corpus for testing (the first generator is exhausted).
    trees = parse_conll_corpus(test, False, limit_test)
    count_derivs = {}
    no_complete_match = 0
    for i, tree in enumerate(trees):
        print("Parsing tree for ", i, file=stderr)
        print(tree, file=stderr)
        parser = parser_type(grammar_prim, tree)
        self.assertTrue(parser.recognized())
        count_derivs[i] = 0
        print("Found derivations for ", i, file=stderr)
        j = 0  # NOTE(review): never incremented or read afterwards
        derivations = []
        for der in parser.all_derivation_trees():
            self.assertTrue(
                der.check_integrity_recursive(der.root_id(), start))
            print(count_derivs[i], file=stderr)
            print(der, file=stderr)
            output_tree = HybridTree()
            tokens = tree.token_yield()  # NOTE(review): unused below
            the_yield = der.compute_yield()
            # print >>stderr, the_yield
            # Rebuild tokens from the derivation's POS yield ('_' forms).
            tokens2 = list(
                map(lambda pos: construct_conll_token('_', pos),
                    the_yield))
            dcp_to_hybridtree(output_tree,
                              DCP_evaluator(der).getEvaluation(),
                              tokens2,
                              False,
                              construct_conll_token,
                              reorder=False)
            print(tree, file=stderr)
            print(output_tree, file=stderr)
            # Each derivation must reproduce the gold tree.
            self.compare_hybrid_trees(tree, output_tree, compare_order)
            count_derivs[i] += 1
            derivations.append(der)
        # All enumerated derivations must be pairwise distinct.
        self.assertTrue(
            sDCPParserTest.pairwise_different(
                derivations, sDCPParserTest.compare_derivations))
        self.assertEqual(len(derivations), count_derivs[i])
        if count_derivs[i] == 0:
            no_complete_match += 1
    for key in count_derivs:
        print(key, count_derivs[key])
    print("# trees with no complete match:", no_complete_match)
def parse_with_pgf(grammar, forms, poss, bin):
    """Parse a POS sequence with a compiled PGF grammar and convert the
    resulting constituent parse tree into a dependency HybridTree.

    The graphviz rendering of the PGF parse tree is parsed line by line
    to reconstruct the constituent tree; afterwards heads are assigned
    per constituent and a CoNLL-style dependency tree is assembled.

    :type grammar: PGF
    :param forms: word forms of the sentence
    :param poss: POS tags aligned with *forms*
    :param bin: name prefix of the concrete language inside the PGF
        (NOTE(review): shadows the builtin ``bin``)
    :return: the dependency tree, or None if parsing fails
    :rtype: HybridTree
    """
    lcfrs = grammar.languages[bin + 'grammargfconcrete']
    # sentence = "ADJD ADV _COMMA_ KOUS ADV PIS PROAV VVINF VMFIN _PUNCT_"
    sentence = ' '.join(map(escape, poss))
    try:
        i = lcfrs.parse(sentence, n=1)
        p, e = next(i)
    except (StopIteration, pgf.ParseError):
        # No parse found (or parser error): signal failure to the caller.
        return None
    # print_ast(gr, e, 0)
    s = lcfrs.graphvizParseTree(e)
    assert isinstance(s, str)
    s_ = s.splitlines()  # NOTE(review): unused; loop below re-splits s
    tree = HybridTree()
    # print s
    i = 0  # index of the next terminal (leaf) encountered in the rendering
    for line in s.splitlines():
        # Node lines look like:  n<ID>[label="<LABEL>"]
        match = re.search(r'^\s*(n\d+)\[label="([^\s]+)"\]\s*$', line)
        if match:
            node_id = match.group(1)
            label = match.group(2)
            # Node ids >= 100000 denote terminals in the graphviz output.
            order = int(node_id[1:]) >= 100000
            if order:
                assert escape(poss[i]) == label
                tree.add_node(
                    node_id,
                    construct_constituent_token(form=forms[i],
                                                pos=poss[i],
                                                terminal=True), True)
                i += 1
            else:
                tree.add_node(
                    node_id,
                    construct_constituent_token(form=label,
                                                pos='_',
                                                terminal=False), False)
            # print node_id, label
            if label == 'VROOT1':
                tree.add_to_root(node_id)
            continue
        # Edge lines look like:  n<PARENT> -- n<CHILD>
        match = re.search(r'^ (n\d+) -- (n\d+)\s*$', line)
        if match:
            parent = match.group(1)
            child = match.group(2)
            tree.add_child(parent, child)
            # print line
            # print parent, child
            continue
    # print tree
    # The reconstructed yield must match the input POS sequence.
    assert poss == [token.pos() for token in tree.token_yield()]
    # print the_yield
    dep_tree = HybridTree()
    # head_table: constituent node -> 1-based index of its head token.
    head_table = defaultdict(lambda: None)
    # attachment_point: leaf -> highest "real" (non-artificial) ancestor.
    attachment_point = defaultdict(lambda: None)
    for i, node in enumerate(tree.id_yield()):
        token = tree.node_token(node)
        dep_token = construct_conll_token(token.form(),
                                          un_escape(token.pos()))
        # Start from the grandparent: skip the leaf's immediate preterminal.
        current = tree.parent(node)
        current = tree.parent(current)
        while current:
            current_label = tree.node_token(current).category()
            # Labels matching \d+X\d+ are artificial binarization nodes;
            # keep climbing past them to the first real constituent.
            if not re.search(r'\d+X\d+$', current_label):
                s = un_escape(current_label)
                if s == 'TOP1':
                    s = 'ROOT1'
                # Strip the trailing fanout digit to get the edge label.
                dep_token.set_edge_label(s[:-1])
                head_table[current] = i + 1
                attachment_point[node] = current
                break
            else:
                current = tree.parent(current)
        dep_tree.add_node(i + 1, dep_token, order=True)
    # print head_table
    # Attach each dependency node to the head of the nearest ancestor
    # constituent that has one; otherwise make it a root.
    for node, dep_node in zip(tree.id_yield(), dep_tree.id_yield()):
        node = tree.parent(attachment_point[node])
        while node:
            if head_table[node]:
                dep_tree.add_child(head_table[node], dep_node)
                break
            node = tree.parent(node)
        if not node:
            dep_tree.add_to_root(dep_node)
    # print "dep_tree"
    # print dep_tree
    # print ' '.join(['(' + token.form() + '/' + token.deprel() + ')' for token in dep_tree.token_yield()])
    return dep_tree
def test_single_root_induction(self):
    """Induce a grammar from both fixture trees via direct extraction,
    parse 'NP N V V', rebuild the hybrid tree from the best derivation,
    exercise DCP_string edge labels, and linearize the grammar.

    (A large block of commented-out exploratory code for labeling and
    recursive-partitioning experiments was removed here.)
    """
    tree = hybrid_tree_1()
    tree2 = hybrid_tree_2()
    print(tree.recursive_partitioning())
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
    (_, grammar) = induce_grammar(
        [tree, tree2],
        the_labeling_factory().create_simple_labeling_strategy(
            'empty', 'pos'),
        # the_labeling_factory().create_simple_labeling_strategy('child', 'pos+deprel'),
        terminal_labeling.token_label, [direct_extraction], 'START')
    # Diagnostics: maximal fanout over all nonterminals, then the grammar.
    print(max([grammar.fanout(nont) for nont in grammar.nonts()]))
    print(grammar)
    parser = LCFRS_parser(grammar, 'NP N V V'.split(' '))
    print(parser.best_derivation_tree())
    tokens = [
        construct_conll_token(form, pos) for form, pos in zip(
            'Piet Marie helpen lezen'.split(' '), 'NP N V V'.split(' '))
    ]
    hybrid_tree = HybridTree()
    hybrid_tree = parser.dcp_hybrid_tree_best_derivation(
        hybrid_tree, tokens, True, construct_conll_token)
    print(list(map(str, hybrid_tree.full_token_yield())))
    print(hybrid_tree)
    # Smoke-test DCP_string edge labels.
    string = "foo"
    dcp_string = DCP_string(string)
    dcp_string.set_edge_label("bar")
    print(dcp_string, dcp_string.edge_label())
    # Linearize the grammar to stdout with a child/pos+deprel labeling.
    linearize(
        grammar,
        the_labeling_factory().create_simple_labeling_strategy(
            'child', 'pos+deprel'),
        the_terminal_labeling_factory().get_strategy('pos'), sys.stdout)
def test_fst_compilation_left(self):
    """Compile a left-branching grammar to a WFST (pynini), compose it
    with the POS string automaton, check the shortest path's rule
    sequence, and verify the ReversePolishDerivation against the
    LeftBranchingFSTParser's best derivation."""
    # Skip silently when pynini is unavailable.
    if not test_pynini:
        return
    tree = hybrid_tree_1()
    tree2 = hybrid_tree_2()
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
    (_, grammar) = induce_grammar(
        [tree, tree2],
        the_labeling_factory().create_simple_labeling_strategy(
            'empty', 'pos'), terminal_labeling.token_label,
        [left_branching], 'START')
    fst, rules = compile_wfst_from_left_branching_grammar(grammar)
    print(repr(fst))
    symboltable = fst.input_symbols()
    string = ["NP", "N", "V", "V", "V"]
    # The linear-chain acceptor for the POS string must serialize exactly.
    fsa = fsa_from_list_of_symbols(string, symboltable)
    self.assertEqual(
        fsa.text().decode('utf-8'),
        '0\t1\tNP\tNP\n1\t2\tN\tN\n2\t3\tV\tV\n3\t4\tV\tV\n4\t5\tV\tV\n5\n'
    )
    b = compose(fsa, fst)
    print(b.text(symboltable, symboltable))
    print("Shortest path probability", end=' ')
    best = shortestpath(b)
    best.topsort()
    # self.assertAlmostEquals(pow(e, -float(shortestdistance(best)[-1])), 1.80844898756e-05)
    print(best.text())
    # The shortest path must spell out this exact rule index sequence.
    polish_rules = retrieve_rules(best)
    self.assertSequenceEqual(polish_rules, [1, 2, 3, 4, 5, 4, 9, 4, 7, 8])
    polish_rules = list(map(rules.index_object, polish_rules))
    for rule in polish_rules:
        print(rule)
    print()
    # Drop the final rule: reverse-polish derivations exclude it.
    der = ReversePolishDerivation(polish_rules[0:-1])
    self.assertTrue(der.check_integrity_recursive(der.root_id()))
    print(der)
    # Cross-check against the dedicated FST parser.
    LeftBranchingFSTParser.preprocess_grammar(grammar)
    parser = LeftBranchingFSTParser(grammar, string)
    der_ = parser.best_derivation_tree()
    print(der_)
    self.assertTrue(der_.check_integrity_recursive(der_.root_id()))
    print(
        derivation_to_hybrid_tree(der, string,
                                  "Piet Marie helpen lezen leren".split(),
                                  construct_conll_token))
    print(
        derivation_to_hybrid_tree(der_, string,
                                  "Piet Marie helpen lezen leren".split(),
                                  construct_conll_token))
    # Rebuild a hybrid tree from the WFST-derived derivation.
    dcp = DCP_evaluator(der).getEvaluation()
    h_tree_2 = HybridTree()
    token_sequence = [
        construct_conll_token(form, lemma)
        for form, lemma in zip('Piet Marie helpen lezen leren'.split(' '),
                               'NP N V V V'.split(' '))
    ]
    dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                      construct_conll_token)
    print(h_tree_2)
def test_fst_compilation_right(self):
    """Compile a right-branching grammar to a WFST (pynini), enumerate
    all composition paths comparing each resulting hybrid tree against
    the second fixture tree, and check the shortest path's weight and
    rule sequence."""
    # Skip silently when pynini is unavailable.
    if not test_pynini:
        return
    tree = hybrid_tree_1()
    tree2 = hybrid_tree_2()
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
    (_, grammar) = induce_grammar(
        [tree, tree2],
        the_labeling_factory().create_simple_labeling_strategy(
            'empty', 'pos'), terminal_labeling.token_label,
        [right_branching], 'START')
    a, rules = compile_wfst_from_right_branching_grammar(grammar)
    print(repr(a))
    symboltable = a.input_symbols()
    string = 'NP N V V V'.split(' ')
    # NOTE(review): word order 'helpen leren lezen' differs from the
    # other tests' 'helpen lezen leren' — presumably matches tree2's
    # yield for the h == tree2 comparison below; confirm.
    token_sequence = [
        construct_conll_token(form, lemma) for form, lemma in zip(
            'Piet Marie helpen leren lezen'.split(' '), string)
    ]
    fsa = fsa_from_list_of_symbols(string, symboltable)
    # The linear-chain acceptor for the POS string must serialize exactly.
    self.assertEqual(
        '0\t1\tNP\tNP\n1\t2\tN\tN\n2\t3\tV\tV\n3\t4\tV\tV\n4\t5\tV\tV\n5\n',
        fsa.text().decode('utf-8'))
    b = compose(fsa, a)
    print(b.input_symbols())
    for i in b.input_symbols():
        print(i)
    print("Input Composition")
    print(b.text(symboltable, symboltable).decode('utf-8'))
    # Walk every path of the composition; each yields a Polish-notation
    # rule sequence whose evaluated tree is compared against tree2.
    i = 0
    for path in paths(b):
        print(i, "th path:", path, end=' ')
        r = list(map(rules.index_object, path))
        d = PolishDerivation(r[1::])
        dcp = DCP_evaluator(d).getEvaluation()
        h = HybridTree()
        dcp_to_hybridtree(h, dcp, token_sequence, False,
                          construct_conll_token)
        h.reorder()
        if h == tree2:
            print("correct")
        else:
            print("incorrect")
        i += 1
    stats = defaultdict(lambda: 0)
    local_rule_stats(b, stats, 15)
    print(stats)
    print("Shortest path probability")
    best = shortestpath(b)
    best.topsort()
    # Shortest-distance weight converts to this parse probability.
    self.assertAlmostEqual(1.80844898756e-05,
                           pow(e, -float(shortestdistance(best)[-1])))
    print(best.text())
    # The shortest path must spell out this exact rule index sequence.
    polish_rules = retrieve_rules(best)
    self.assertSequenceEqual(polish_rules, [8, 7, 1, 6, 2, 5, 3, 10, 3, 3])
    polish_rules = list(map(rules.index_object, polish_rules))
    print(polish_rules)
    # Drop the leading rule: Polish derivations exclude it.
    der = PolishDerivation(polish_rules[1::])
    print(der)
    print(
        derivation_to_hybrid_tree(der, string,
                                  "Piet Marie helpen lezen leren".split(),
                                  construct_conll_token))
    dcp = DCP_evaluator(der).getEvaluation()
    h_tree_2 = HybridTree()
    dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                      construct_conll_token)
    print(h_tree_2)