def setUp(self):
    self.tree = HybridTree()
    self.tree.add_node("v1", construct_conll_token("Piet", "NP"), True)
    self.tree.add_node("v21", construct_conll_token("Marie", "N"), True)
    self.tree.add_node("v", construct_conll_token("helpen", "VP"), True)
    self.tree.add_node("v2", construct_conll_token("lezen", "V"), True)
    self.tree.add_child("v", "v2")
    self.tree.add_child("v", "v1")
    self.tree.add_child("v2", "v21")
    self.tree.add_node("v3", construct_conll_token(".", "Punc"), True, False)
    self.tree.add_to_root("v")

def test_multiroot(self):
    tree = multi_dep_tree()
    term_pos = the_terminal_labeling_factory().get_strategy('pos').token_label
    fanout_1 = the_recursive_partitioning_factory().get_partitioning('fanout-1')
    for top_level_labeling_strategy in ['strict', 'child']:
        labeling_strategy = the_labeling_factory().create_simple_labeling_strategy(
            top_level_labeling_strategy, 'pos+deprel')
        for recursive_partitioning in [[direct_extraction], fanout_1, [left_branching]]:
            (_, grammar) = induce_grammar([tree], labeling_strategy, term_pos,
                                          recursive_partitioning, 'START')
            print(grammar)

            parser = LCFRS_parser(grammar, 'pA pB pC pD pE'.split(' '))
            print(parser.best_derivation_tree())

            cleaned_tokens = copy.deepcopy(tree.full_token_yield())
            for token in cleaned_tokens:
                token.set_edge_label('_')
            hybrid_tree = HybridTree()
            hybrid_tree = parser.dcp_hybrid_tree_best_derivation(
                hybrid_tree, cleaned_tokens, True, construct_conll_token)
            print(hybrid_tree)
            self.assertEqual(tree, hybrid_tree)

def test_minimum_risk_parsing(self):
    limit_train = 20
    limit_test = 10
    train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
    test = train
    parser_type = GFParser_k_best
    # test = '../../res/dependency_conll/german/tiger/test/german_tiger_test.conll'
    trees = parse_conll_corpus(train, False, limit_train)
    primary_labelling = the_labeling_factory().create_simple_labeling_strategy("childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    start = 'START'
    recursive_partitioning = [cfg]

    (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling,
                                             term_labelling.token_label,
                                             recursive_partitioning, start)

    parser_type.preprocess_grammar(grammar_prim)
    tree_yield = term_labelling.prepare_parser_input

    trees = parse_conll_corpus(test, False, limit_test)

    for i, tree in enumerate(trees):
        print("Parsing sentence ", i, file=stderr)

        parser = parser_type(grammar_prim, tree_yield(tree.token_yield()), k=50)
        self.assertTrue(parser.recognized())

        derivations = [der for der in parser.k_best_derivation_trees()]
        print("# derivations: ", len(derivations), file=stderr)
        h_trees = []
        weights = []
        derivation_list = []

        for weight, der in derivations:
            self.assertTrue(der not in derivation_list)
            derivation_list.append(der)

            dcp = DCP_evaluator(der).getEvaluation()
            h_tree = HybridTree()
            cleaned_tokens = copy.deepcopy(tree.full_token_yield())
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_conll_token)

            h_trees.append(h_tree)
            weights.append(weight)

        min_risk_tree = compute_minimum_risk_tree(h_trees, weights)
        if min_risk_tree != h_trees[0]:
            print(h_trees[0])
            print(min_risk_tree)

def fall_back_left_branching_token(clean_tokens):
    tree = HybridTree()
    for i, token in enumerate(clean_tokens):
        token.set_edge_label('_')
        tree.add_node(i, token, True)
        if i == 0:
            tree.add_to_root(i)
        else:
            tree.add_child(i - 1, i)
    return tree

def fall_back_left_branching(forms, poss):
    tree = HybridTree()
    for i, (form, pos) in enumerate(zip(forms, poss)):
        token = construct_conll_token(form, pos)
        token.set_edge_label('_')
        tree.add_node(i, token, True)
        if i == 0:
            tree.add_to_root(i)
        else:
            tree.add_child(i - 1, i)
    return tree

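# Added usage sketch (not part of the original module): the fallback attaches
# every token to its left neighbour, so the result is a left-branching chain
# rooted in the first token.  Forms and POS tags below are invented.
def _demo_fall_back_left_branching():
    tree = fall_back_left_branching(['Piet', 'slaapt', '.'], ['NP', 'V', 'Punc'])
    # The token yield preserves the input order.
    assert [token.form() for token in tree.token_yield()] == ['Piet', 'slaapt', '.']
    print(tree_to_conll_str(tree))
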
def test_grammar_export(self):
    tree = hybrid_tree_1()
    tree2 = hybrid_tree_2()
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

    _, grammar = induce_grammar(
        [tree, tree2],
        the_labeling_factory().create_simple_labeling_strategy('empty', 'pos'),
        # the_labeling_factory().create_simple_labeling_strategy('child', 'pos+deprel'),
        terminal_labeling.token_label,
        [direct_extraction],
        'START')
    print(max([grammar.fanout(nont) for nont in grammar.nonts()]))
    print(grammar)

    prefix = '/tmp/'
    name = 'tmpGrammar'

    name_ = export(grammar, prefix, name)

    self.assertEqual(0, compile_gf_grammar(prefix, name_))

    GFParser.preprocess_grammar(grammar)

    string = ["NP", "N", "V", "V", "V"]

    parser = GFParser(grammar, string)

    self.assertTrue(parser.recognized())

    der = parser.best_derivation_tree()
    self.assertTrue(der.check_integrity_recursive(der.root_id(), grammar.start()))

    print(der)

    print(derivation_to_hybrid_tree(der, string,
                                    "Piet Marie helpen lezen leren".split(),
                                    construct_conll_token))

    dcp = DCP_evaluator(der).getEvaluation()
    h_tree_2 = HybridTree()
    token_sequence = [construct_conll_token(form, lemma) for form, lemma in
                      zip('Piet Marie helpen lezen leren'.split(' '),
                          'NP N V V V'.split(' '))]
    dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False, construct_conll_token)

    print(h_tree_2)

def test_dcp_evaluation_with_induced_dependency_grammar(self):
    tree = hybrid_tree_1()
    print(tree)
    tree2 = hybrid_tree_2()
    print(tree2)
    # print(tree.recursive_partitioning())

    labeling = the_labeling_factory().create_simple_labeling_strategy('child', 'pos')
    term_pos = the_terminal_labeling_factory().get_strategy('pos').token_label
    (_, grammar) = induce_grammar([tree, tree2], labeling, term_pos,
                                  [direct_extraction], 'START')

    # print(grammar)
    self.assertEqual(grammar.well_formed(), None)
    self.assertEqual(grammar.ordered()[0], True)
    # print(max([grammar.fanout(nont) for nont in grammar.nonts()]))
    print(grammar)

    parser = Parser(grammar, 'NP N V V'.split(' '))

    self.assertEqual(parser.recognized(), True)

    for item in parser.successful_root_items():
        der = Derivation()
        derivation_tree(der, item, None)
        print(der)

        hybrid_tree = derivation_to_hybrid_tree(
            der, 'NP N V V'.split(' '),
            'Piet Marie helpen lezen'.split(' '),
            construct_constituent_token)
        print(hybrid_tree)

        dcp = DCP_evaluator(der).getEvaluation()
        h_tree_2 = HybridTree()
        token_sequence = [construct_conll_token(form, lemma) for form, lemma in
                          zip('Piet Marie helpen lezen'.split(' '),
                              'NP N V V'.split(' '))]
        dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                          construct_conll_token)

def test_cfg_parser(self):
    tree = hybrid_tree_1()
    tree2 = hybrid_tree_2()
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

    (_, grammar) = induce_grammar(
        [tree, tree2],
        the_labeling_factory().create_simple_labeling_strategy('empty', 'pos'),
        terminal_labeling.token_label, [cfg], 'START')

    for parser_class in [LCFRS_parser, CFGParser]:
        parser_class.preprocess_grammar(grammar)

        string = ["NP", "N", "V", "V", "V"]

        parser = parser_class(grammar, string)

        self.assertTrue(parser.recognized())

        der = parser.best_derivation_tree()
        self.assertTrue(der.check_integrity_recursive(der.root_id(), grammar.start()))

        print(der)

        print(derivation_to_hybrid_tree(der, string,
                                        "Piet Marie helpen lezen leren".split(),
                                        construct_conll_token))

        dcp = DCP_evaluator(der).getEvaluation()
        h_tree_2 = HybridTree()
        token_sequence = [construct_conll_token(form, lemma) for form, lemma in
                          zip('Piet Marie helpen lezen leren'.split(' '),
                              'NP N V V V'.split(' '))]
        dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                          construct_conll_token)

        print(h_tree_2)

def test_basic_sdcp_parsing_dependency(self):
    tree1 = hybrid_tree_1()
    tree2 = hybrid_tree_2()
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

    (_, grammar) = induce_grammar(
        [tree1, tree2],
        the_labeling_factory().create_simple_labeling_strategy('empty', 'pos'),
        terminal_labeling.token_label, [cfg], 'START')

    print("grammar induced. Printing rules...", file=stderr)

    for rule in grammar.rules():
        print(rule, file=stderr)

    parser_type = LCFRS_sDCP_Parser

    print("preprocessing grammar", file=stderr)

    parser_type.preprocess_grammar(grammar, terminal_labeling)

    print("invoking parser", file=stderr)

    parser = parser_type(grammar, tree1)

    print("listing derivations", file=stderr)

    for der in parser.all_derivation_trees():
        print(der)
        output_tree = HybridTree()
        tokens = tree1.token_yield()
        dcp_to_hybridtree(output_tree, DCP_evaluator(der).getEvaluation(),
                          tokens, False, construct_conll_token)
        print(tree1)
        print(output_tree)

    print("completed test", file=stderr)

def parse_conll_corpus(path, ignore_punctuation, limit=sys.maxsize, start=0):
    """
    :param path: path to corpus
    :type path: str
    :param ignore_punctuation: exclude punctuation from tree structure
    :type ignore_punctuation: bool
    :param limit: stop generation after limit trees
    :type limit: int
    :param start: start generation with start'th tree
    :type start: int
    :return: a series of hybrid trees read from file
    :rtype: __generator[HybridTree]
    :raise Exception: unexpected input in corpus file
    Lazily parses a dependency corpus (in CoNLL format) and generates GeneralHybridTrees.
    """
    with open(path) as file_content:
        tree_count = 0

        while tree_count < limit:
            tree = None

            try:
                line = next(file_content)
                while line.startswith('#'):
                    line = next(file_content)
            except StopIteration:
                break

            match = CONLL_LINE.match(line)
            while match:
                if match.group(1) == '1':
                    tree_count += 1
                    tree = HybridTree('tree' + str(tree_count))

                node_id = match.group(1)
                form = match.group(2)
                lemma = match.group(3)
                cpos = match.group(4)
                pos = match.group(5)
                feats = match.group(6)
                parent = match.group(7)
                deprel = match.group(8)

                # We ignore information about multi-word tokens, as present
                # in the UD version of the Prague Dependency Treebank.
                if MULTI_TOKEN.search(node_id):
                    pass
                else:
                    # If punctuation is to be ignored, we remove it from the
                    # hybrid tree. Punctuation according to definition,
                    # cf. http://ilk.uvt.nl/conll/software.html#eval
                    # if not ignore_punctuation or form.translate(no_translation, string.punctuation):
                    tree.add_node(node_id,
                                  CoNLLToken(form, lemma, cpos, pos, feats, deprel),
                                  True, True)
                    if parent != '0':
                        tree.add_child(parent, node_id)
                    # else:
                    #     tree.add_node(node_id, CoNLLToken(form, lemma, pos, fine_grained_pos, feats, deprel), True, False)
                    # TODO: If punctuation is ignored and the root is punctuation,
                    # TODO: it is added to the tree anyhow.
                    if parent == '0':
                        tree.add_to_root(node_id)

                try:
                    line = next(file_content)
                    while line.startswith('#'):
                        line = next(file_content)
                    match = CONLL_LINE.search(line)
                except StopIteration:
                    line = ''
                    match = None

            # Assume empty line, otherwise raise exception
            match = EMPTY_LINE.match(line)
            if not match:
                raise Exception("Unexpected input in CoNLL corpus file.")

            if tree:
                # basic sanity checks
                if not tree.root:
                    # FIXME: ignoring punctuation may lead to malformed trees
                    print("non-rooted")
                    if ignore_punctuation:
                        continue
                    raise Exception
                # elif root > 1:
                #     FIXME: the Turkish corpus contains trees with more than one
                #     FIXME: root; currently, they are ignored
                #     continue
                elif tree.n_nodes() != len(tree.id_yield()) \
                        or len(tree.nodes()) != len(tree.full_yield()):
                    # FIXME: ignoring punctuation may lead to malformed trees
                    if ignore_punctuation:
                        continue
                    raise Exception(
                        '{4}: connected nodes: {0}, total nodes: {1}, full yield: {2}, connected yield: {3}'.format(
                            str(tree.n_nodes()), str(len(tree.nodes())),
                            str(len(tree.full_yield())), str(len(tree.id_yield())),
                            tree.sent_label()))
                if tree_count > start:
                    yield tree

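# Added usage sketch (not part of the original module): the reader is a
# generator, so trees are only materialized on demand.  The corpus path is
# hypothetical.
def _demo_parse_conll_corpus():
    for tree in parse_conll_corpus('res/dependency_conll/sample.conll',
                                   ignore_punctuation=False, limit=2):
        print(tree.sent_label(), 'with', len(tree.full_yield()), 'tokens')
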
def query_result_tree(connection, exp, tree_id):
    """
    :param connection: database connection
    :param exp: experiment id
    :param tree_id: tree id
    :rtype: (str, HybridTree)
    :return: parse status and resulting hybrid tree
    """
    cursor = connection.cursor()
    result_tree_ids = cursor.execute(
        '''SELECT rt_id, status FROM result_trees WHERE exp_id = ? AND t_id = ?''',
        (exp, tree_id)).fetchall()

    # parse:
    if result_tree_ids:
        assert len(result_tree_ids) == 1
        result_tree_id, status = result_tree_ids[0]
        if status in ["parse", "fallback"]:
            name = cursor.execute('''SELECT name FROM trees WHERE t_id = ?''',
                                  (tree_id,)).fetchall()[0][0]
            tree_nodes = cursor.execute(
                '''SELECT tree_nodes.sent_position, label, pos, result_tree_nodes.head, result_tree_nodes.deprel
                   FROM result_tree_nodes
                   JOIN result_trees
                     ON result_tree_nodes.rt_id = result_trees.rt_id
                   JOIN tree_nodes
                     ON result_trees.t_id = tree_nodes.t_id
                    AND result_tree_nodes.sent_position = tree_nodes.sent_position
                   WHERE result_tree_nodes.rt_id = ?''', (result_tree_id,))
            tree = HybridTree(name)
            for i, label, pos, head, deprel in tree_nodes:
                if deprel is None:
                    deprel = 'UNKNOWN'
                token = CoNLLToken(label, '_', pos, pos, '_', deprel)
                tree.add_node(str(i), token, True, True)
                if head == 0:
                    tree.add_to_root(str(i))
                else:
                    tree.add_child(str(head), str(i))
            assert tree.root != []
            return status, tree
    # legacy: no entry found
    else:
        status = "simple_fallback"

    # Create a left-branching tree without labels as default strategy
    tree_nodes = cursor.execute(
        '''SELECT tree_nodes.sent_position, label, pos
           FROM tree_nodes
           WHERE tree_nodes.t_id = ?''', (tree_id,)).fetchall()

    left_branch = lambda x: x - 1
    right_branch = lambda x: x + 1
    strategy = left_branch

    length = len(tree_nodes)
    tree = HybridTree()
    for i, label, pos in tree_nodes:
        token = CoNLLToken(label, '_', pos, pos, '_', '_')
        tree.add_node(str(i), token, True, True)
        parent = strategy(i)
        if (parent == 0 and strategy == left_branch) \
                or (parent == length + 1 and strategy == right_branch):
            tree.add_to_root(str(i))
        else:
            tree.add_child(str(parent), str(i))
    assert tree.root != []
    return status, tree

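# Added usage sketch (not part of the original module).  The database file and
# the ids are hypothetical; the schema (result_trees, trees, tree_nodes) is
# the one queried above.
def _demo_query_result_tree():
    import sqlite3
    connection = sqlite3.connect('experiments.db')
    status, tree = query_result_tree(connection, exp=1, tree_id=42)
    print(status)
    print(tree)
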
def disconnect_punctuation(trees):
    """
    :param trees: corpus of hybrid trees
    :type trees: __generator[HybridTree]
    :return: corpus of hybrid trees
    :rtype: __generator[GeneralHybridTree]
    Lazily disconnect punctuation from each hybrid tree in a corpus of hybrid trees.
    """
    for tree in trees:
        tree2 = HybridTree(tree.sent_label())
        for root_id in tree.root:
            if not is_punctuation(tree.node_token(root_id).form()):
                tree2.add_to_root(root_id)

        for id in tree.full_yield():
            token = tree.node_token(id)
            if not is_punctuation(token.form()):
                parent = tree.parent(id)
                # Skip punctuation ancestors when reattaching the node.
                while parent and parent not in tree.root \
                        and is_punctuation(tree.node_token(parent).form()):
                    parent = tree.parent(parent)
                if parent and is_punctuation(tree.node_token(parent).form()):
                    tree2.add_to_root(id)
                else:
                    tree2.add_child(parent, id)
                tree2.add_node(id, token, True, True)
            else:
                tree2.add_node(id, token, True, False)

        if tree2:
            # basic sanity checks
            if not tree2.root \
                    and len(tree2.id_yield()) == 0 \
                    and len(tree2.nodes()) == len(tree2.full_yield()):
                # tree consists only of punctuation
                continue
            elif not tree2.root \
                    or tree2.n_nodes() != len(tree2.id_yield()) \
                    or len(tree2.nodes()) != len(tree2.full_yield()):
                print(tree)
                print(tree2)
                print(tree2.sent_label())
                print("Root:", tree2.root)
                print("Nodes: ", tree2.n_nodes())
                print("Id_yield:", len(tree2.id_yield()), tree2.id_yield())
                print("Nodes: ", len(tree2.nodes()))
                print("full yield: ", len(tree2.full_yield()))
                raise Exception()
            yield tree2

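# Added usage sketch (not part of the original module): both corpus readers
# are lazy generators, so they compose into a pipeline.  The corpus path is
# hypothetical.
def _demo_disconnect_punctuation():
    trees = parse_conll_corpus('res/dependency_conll/sample.conll', False, limit=5)
    for tree in disconnect_punctuation(trees):
        # Punctuation remains in the full yield but is disconnected from
        # the tree structure.
        print(tree.sent_label(),
              len(tree.full_yield()) - len(tree.id_yield()),
              'disconnected tokens')
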
def hybrid_tree_2():
    tree2 = HybridTree()
    tree2.add_node('v1', CoNLLToken('Piet', '_', 'NP', 'NP', '_', 'SBJ'), True)
    tree2.add_node('v211', CoNLLToken('Marie', '_', 'N', 'N', '_', 'OBJ'), True)
    tree2.add_node('v', CoNLLToken('helpen', '_', 'V', 'V', '_', 'ROOT'), True)
    tree2.add_node('v2', CoNLLToken('leren', '_', 'V', 'V', '_', 'VBI'), True)
    tree2.add_node('v21', CoNLLToken('lezen', '_', 'V', 'V', '_', 'VFIN'), True)
    tree2.add_child('v', 'v2')
    tree2.add_child('v', 'v1')
    tree2.add_child('v2', 'v21')
    tree2.add_child('v21', 'v211')
    tree2.add_to_root('v')
    tree2.reorder()
    return tree2

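# Added illustration (not part of the original source): hybrid_tree_2 encodes
# the dependency structure
#
#     helpen (v, ROOT)
#     |-- Piet  (v1, SBJ)
#     `-- leren (v2, VBI)
#         `-- lezen (v21, VFIN)
#             `-- Marie (v211, OBJ)
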
def generic_parsing_test(self, parser_type, limit_train, limit_test, compare_order):
    def filter_by_id(n, trees):
        j = 0
        for tree in trees:
            if j in n:
                yield tree
            j += 1

    # params
    train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
    test = train
    # test = 'res/dependency_conll/german/tiger/test/german_tiger_test.conll'
    trees = parse_conll_corpus(train, False, limit_train)
    primary_labelling = the_labeling_factory().create_simple_labeling_strategy("childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    start = 'START'
    recursive_partitioning = [cfg]

    (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling,
                                             term_labelling.token_label,
                                             recursive_partitioning, start)

    parser_type.preprocess_grammar(grammar_prim, term_labelling)

    trees = parse_conll_corpus(test, False, limit_test)

    count_derivs = {}
    no_complete_match = 0

    for i, tree in enumerate(trees):
        print("Parsing tree for ", i, file=stderr)
        print(tree, file=stderr)

        parser = parser_type(grammar_prim, tree)
        self.assertTrue(parser.recognized())
        count_derivs[i] = 0
        print("Found derivations for ", i, file=stderr)

        derivations = []

        for der in parser.all_derivation_trees():
            self.assertTrue(der.check_integrity_recursive(der.root_id(), start))

            print(count_derivs[i], file=stderr)
            print(der, file=stderr)

            output_tree = HybridTree()

            the_yield = der.compute_yield()
            # print(the_yield, file=stderr)
            tokens2 = list(map(lambda pos: construct_conll_token('_', pos), the_yield))

            dcp_to_hybridtree(output_tree, DCP_evaluator(der).getEvaluation(),
                              tokens2, False, construct_conll_token,
                              reorder=False)
            print(tree, file=stderr)
            print(output_tree, file=stderr)

            self.compare_hybrid_trees(tree, output_tree, compare_order)
            count_derivs[i] += 1
            derivations.append(der)

        self.assertTrue(
            sDCPParserTest.pairwise_different(
                derivations, sDCPParserTest.compare_derivations))
        self.assertEqual(len(derivations), count_derivs[i])

        if count_derivs[i] == 0:
            no_complete_match += 1

    for key in count_derivs:
        print(key, count_derivs[key])

    print("# trees with no complete match:", no_complete_match)

def do_parsing(grammar_prim, limit, ignore_punctuation, recompile=True, preprocess_path=None):
    trees = parse_conll_corpus(test, False, limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)

    total_time = 0.0

    load_preprocess = preprocess_path
    if recompile or (not os.path.isfile(parser_type.resolve_path(preprocess_path))):
        load_preprocess = None

    parser = parser_type(grammar_prim,
                         save_preprocess=preprocess_path,
                         load_preprocess=load_preprocess)

    with open(result, 'w') as result_file:
        failures = 0
        for tree in trees:
            if len(tree.id_yield()) > limit:
                continue
            time_stamp = time.clock()

            parser.set_input(tree_yield(tree.token_yield()))
            parser.parse()
            # if not parser.recognized():
            #     parser = parser_type(grammar_second, tree_yield(tree.token_yield()))
            # if not parser.recognized():
            #     parser = parser_type(grammar_tern, tree_yield(tree.token_yield()))
            time_stamp = time.clock() - time_stamp
            total_time += time_stamp

            cleaned_tokens = copy.deepcopy(tree.full_token_yield())
            for token in cleaned_tokens:
                token.set_edge_label('_')

            h_tree = HybridTree(tree.sent_label())

            if parser_type == GFParser_k_best and parser.recognized():
                der_to_tree = lambda der: dcp_to_hybridtree(
                    HybridTree(), DCP_evaluator(der).getEvaluation(),
                    copy.deepcopy(tree.full_token_yield()), False,
                    construct_conll_token)
                h_tree = parser.best_trees(der_to_tree)[0][0]
            elif parser_type == CFGParser \
                    or parser_type == GFParser \
                    or parser_type == LeftBranchingFSTParser \
                    or parser_type == RightBranchingFSTParser:
                h_tree = parser.dcp_hybrid_tree_best_derivation(
                    h_tree, cleaned_tokens, ignore_punctuation,
                    construct_conll_token)
            else:
                h_tree = None

            if h_tree:
                result_file.write(tree_to_conll_str(h_tree))
                result_file.write('\n\n')
            else:
                failures += 1
                forms = [token.form() for token in tree.full_token_yield()]
                poss = [token.pos() for token in tree.full_token_yield()]
                result_file.write(tree_to_conll_str(fall_back_left_branching(forms, poss)))
                result_file.write('\n\n')

            parser.clear()

    print("parse failures", failures)
    print("parse time", total_time)

    print("eval.pl", "no punctuation")
    p = subprocess.Popen(["perl", "../util/eval.pl", "-g", test, "-s", result, "-q"])
    p.communicate()
    print("eval.pl", "punctuation")
    p = subprocess.Popen(["perl", "../util/eval.pl", "-g", test, "-s", result, "-q", "-p"])
    p.communicate()

def test_recursive_partitioning_transformation(self):
    tree = HybridTree("mytree")
    ids = ['a', 'b', 'c', 'd']
    for f in ids:
        tree.add_node(f, CoNLLToken(f, '_', '_', '_', '_', '_'), True, True)
        if f != 'a':
            tree.add_child('a', f)
    tree.add_to_root('a')

    print(tree)
    self.assertEqual([token.form() for token in tree.token_yield()], ids)
    self.assertEqual(tree.recursive_partitioning(),
                     (set([0, 1, 2, 3]),
                      [(set([0]), []), (set([1]), []),
                       (set([2]), []), (set([3]), [])]))
    print(tree.recursive_partitioning())

    [fanout_1] = the_recursive_partitioning_factory().get_partitioning('fanout-1')

    print(fanout_1(tree))

def main(limit=100000, ignore_punctuation=False):
    if PARSER_TYPE.__name__ != 'GFParser':
        print('GFParser not found, using', PARSER_TYPE.__name__, 'instead!')
        print('Please install Grammatical Framework to reproduce the experiments.')

    test_limit = 10000
    trees = parse_conll_corpus(TRAIN, False, limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)
    (n_trees, grammar_prim) = d_i.induce_grammar(trees, PRIMARY_LABELLING,
                                                 TERMINAL_LABELLING.token_label,
                                                 RECURSIVE_PARTITIONING, START)
    PARSER_TYPE.preprocess_grammar(grammar_prim)

    trees = parse_conll_corpus(TRAIN, False, limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)
    (n_trees, grammar_second) = d_i.induce_grammar(trees, SECONDARY_LABELLING,
                                                   TERMINAL_LABELLING.token_label,
                                                   RECURSIVE_PARTITIONING, START)
    PARSER_TYPE.preprocess_grammar(grammar_second)

    trees = parse_conll_corpus(TRAIN, False, limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)
    (n_trees, grammar_tern) = d_i.induce_grammar(trees, TERNARY_LABELLING,
                                                 TERMINAL_LABELLING.token_label,
                                                 RECURSIVE_PARTITIONING, START)
    PARSER_TYPE.preprocess_grammar(grammar_tern)

    trees = parse_conll_corpus(TEST, False, test_limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)

    total_time = 0.0

    with open(RESULT, 'w') as result_file:
        failures = 0
        for tree in trees:
            time_stamp = time.clock()

            the_parser = PARSER_TYPE(grammar_prim, TREE_YIELD(tree.token_yield()))
            if not the_parser.recognized():
                the_parser = PARSER_TYPE(grammar_second, TREE_YIELD(tree.token_yield()))
            if not the_parser.recognized():
                the_parser = PARSER_TYPE(grammar_tern, TREE_YIELD(tree.token_yield()))
            time_stamp = time.clock() - time_stamp
            total_time += time_stamp

            cleaned_tokens = copy.deepcopy(tree.full_token_yield())
            for token in cleaned_tokens:
                token.set_edge_label('_')

            h_tree = HybridTree(tree.sent_label())
            h_tree = the_parser.dcp_hybrid_tree_best_derivation(
                h_tree, cleaned_tokens, ignore_punctuation, construct_conll_token)

            if h_tree:
                result_file.write(tree_to_conll_str(h_tree))
                result_file.write('\n\n')
            else:
                failures += 1
                forms = [token.form() for token in tree.full_token_yield()]
                poss = [token.pos() for token in tree.full_token_yield()]
                result_file.write(tree_to_conll_str(fall_back_left_branching(forms, poss)))
                result_file.write('\n\n')

    print("parse failures", failures)
    print("parse time", total_time)

    print("eval.pl", "no punctuation")
    p = subprocess.Popen(["perl", "util/eval.pl", "-g", TEST, "-s", RESULT, "-q"])
    p.communicate()

    print("eval.pl", "punctuation")
    p = subprocess.Popen(["perl", "util/eval.pl", "-g", TEST, "-s", RESULT, "-q", "-p"])
    p.communicate()

def parse_sentences_from_file(grammar,
                              parser_type,
                              experiment,
                              connection,
                              path,
                              tree_yield,
                              max_length=sys.maxsize,
                              limit=sys.maxsize,
                              quiet=False,
                              ignore_punctuation=True,
                              root_default_deprel=None,
                              disconnected_default_deprel=None):
    """
    :rtype: None
    :type grammar: LCFRS
    :param path: file path for test corpus (dependency treebank in CoNLL format)
    :type path: str
    :param tree_yield: parse on words or POS or ...
    :type tree_yield: GeneralHybridTree -> list[str]
    :param max_length: don't parse sentences with yield > max_length
    :type max_length: int
    :param limit: only parse the first limit sentences of the corpus
    :type limit: int
    :param quiet: suppress status information
    :type quiet: bool
    :param ignore_punctuation: exclude punctuation from parsing
    :type ignore_punctuation: bool
    Parse sentences from corpus and compare derived dependency structure with gold standard information.
    """
    if not quiet:
        print("Building lookahead tables for grammar")
    parser_type.preprocess_grammar(grammar)

    experiment_database.set_experiment_test_corpus(connection, experiment, path)

    if not quiet:
        if max_length != sys.maxsize:
            s = ', ignoring sentences with length > ' + str(max_length)
        else:
            s = ''
        print('Start parsing sentences' + s)

    trees = parse_conll_corpus(path, False, limit)
    trees = add_trees_to_db(path, connection, trees)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)

    (UAS, LAS, UEM, LEM) = (0, 0, 0, 0)
    parse = 0
    no_parse = 0
    n_gaps_gold = 0
    n_gaps_test = 0
    skipped = 0
    start_at = time.clock()
    for tree in trees:
        if len(tree.id_yield()) > max_length:
            skipped += 1
            continue
        time_stamp = time.clock()

        parser = parser_type(grammar, tree_yield(tree.token_yield()))
        time_stamp = time.clock() - time_stamp

        cleaned_tokens = copy.deepcopy(tree.full_token_yield())
        for token in cleaned_tokens:
            token.set_edge_label('_')
        h_tree = HybridTree(tree.sent_label())

        h_tree = parser.dcp_hybrid_tree_best_derivation(
            h_tree, cleaned_tokens, ignore_punctuation, construct_conll_token)

        if h_tree:
            experiment_database.add_result_tree(
                connection, h_tree, path, experiment, 1, parser.best(),
                time_stamp, 'parse', root_default_deprel,
                disconnected_default_deprel)
            n_gaps_gold += tree.n_gaps()
            n_gaps_test += h_tree.n_gaps()
            parse += 1
            (dUAS, dLAS, dUEM, dLEM) = score_cmp_dep_trees(tree, h_tree)
            UAS += dUAS
            LAS += dLAS
            UEM += dUEM
            LEM += dLEM
        else:
            experiment_database.no_parse_result(
                connection, tree.sent_label(), path, experiment, time_stamp,
                "no_parse")
            no_parse += 1

    end_at = time.clock()
    total = parse + no_parse
    if not quiet:
        print('Parsed ' + str(parse) + ' out of ' + str(total) +
              ' (skipped ' + str(skipped) + ')')
        print('fail: ', no_parse)
        if parse > 0:
            print('UAS: ', UAS / parse)
            print('LAS: ', LAS / parse)
            print('UEM: ', UEM / parse)
            print('LEM: ', LEM / parse)
            print('n gaps (gold): ', n_gaps_gold * 1.0 / parse)
            print('n gaps (test): ', n_gaps_test * 1.0 / parse)
        print('parse time: ', end_at - start_at, 's')
        print()

def parse_with_pgf(grammar, forms, poss, bin):
    """
    :type grammar: PGF
    :return: dependency tree derived from the PGF parse, or None on parse failure
    :rtype: HybridTree
    """
    lcfrs = grammar.languages[bin + 'grammargfconcrete']

    # sentence = "ADJD ADV _COMMA_ KOUS ADV PIS PROAV VVINF VMFIN _PUNCT_"
    sentence = ' '.join(map(escape, poss))

    try:
        i = lcfrs.parse(sentence, n=1)
        p, e = next(i)
    except (StopIteration, pgf.ParseError):
        return None

    # print_ast(gr, e, 0)
    s = lcfrs.graphvizParseTree(e)
    assert isinstance(s, str)

    tree = HybridTree()

    # print(s)
    i = 0
    for line in s.splitlines():
        match = re.search(r'^\s*(n\d+)\[label="([^\s]+)"\]\s*$', line)
        if match:
            node_id = match.group(1)
            label = match.group(2)
            order = int(node_id[1:]) >= 100000
            if order:
                assert escape(poss[i]) == label
                tree.add_node(node_id,
                              construct_constituent_token(form=forms[i],
                                                          pos=poss[i],
                                                          terminal=True),
                              True)
                i += 1
            else:
                tree.add_node(node_id,
                              construct_constituent_token(form=label,
                                                          pos='_',
                                                          terminal=False),
                              False)
            # print(node_id, label)
            if label == 'VROOT1':
                tree.add_to_root(node_id)
            continue
        match = re.search(r'^ (n\d+) -- (n\d+)\s*$', line)
        if match:
            parent = match.group(1)
            child = match.group(2)
            tree.add_child(parent, child)
            # print(line)
            # print(parent, child)
            continue

    # print(tree)

    assert poss == [token.pos() for token in tree.token_yield()]

    dep_tree = HybridTree()
    head_table = defaultdict(lambda: None)
    attachment_point = defaultdict(lambda: None)
    for i, node in enumerate(tree.id_yield()):
        token = tree.node_token(node)
        dep_token = construct_conll_token(token.form(), un_escape(token.pos()))
        current = tree.parent(node)
        current = tree.parent(current)
        while current:
            current_label = tree.node_token(current).category()
            if not re.search(r'\d+X\d+$', current_label):
                s = un_escape(current_label)
                if s == 'TOP1':
                    s = 'ROOT1'
                dep_token.set_edge_label(s[:-1])
                head_table[current] = i + 1
                attachment_point[node] = current
                break
            else:
                current = tree.parent(current)
        dep_tree.add_node(i + 1, dep_token, order=True)

    # print(head_table)

    for node, dep_node in zip(tree.id_yield(), dep_tree.id_yield()):
        node = tree.parent(attachment_point[node])
        while node:
            if head_table[node]:
                dep_tree.add_child(head_table[node], dep_node)
                break
            node = tree.parent(node)
        if not node:
            dep_tree.add_to_root(dep_node)

    # print("dep_tree")
    # print(dep_tree)
    # print(' '.join(['(' + token.form() + '/' + token.deprel() + ')' for token in dep_tree.token_yield()]))
    return dep_tree

def test_recursive_partition(self):
    self.assertEqual(
        PartitionBuilder(choice_function=choose_min,
                         split_function=spans_split).string_partition(tree=HybridTree()),
        (set(), []))

def build_score_validator(baseline_grammar, grammarInfo, nont_map, storageManager,
                          term_labelling, parser, corpus_validation, validationMethod):
    validator = PyCandidateScoreValidator(grammarInfo, storageManager, validationMethod)

    # parser = GFParser(baseline_grammar)
    tree_count = 0
    der_count = 0
    for gold_tree in corpus_validation.get_trees():
        tree_count += 1
        parser.set_input(term_labelling.prepare_parser_input(gold_tree.token_yield()))
        parser.parse()
        derivations = map(lambda x: x[1], parser.k_best_derivation_trees())
        manager = PyDerivationManager(baseline_grammar, nont_map)
        manager.convert_derivations_to_hypergraphs(derivations)
        scores = []

        gold_labels = {}
        gold_heads = {}

        for position, id in enumerate(gold_tree.id_yield()):
            parent_id = gold_tree.parent(id)
            gold_labels[position] = gold_tree.node_token(id).deprel()
            if parent_id is None:
                assert id in gold_tree.root
                gold_heads[position] = 0
            else:
                gold_heads[position] = gold_tree.id_yield().index(parent_id) + 1

        derivations = parser.k_best_derivation_trees()
        for _, der in derivations:
            der_count += 1

            h_tree = HybridTree()
            cleaned_tokens = copy.deepcopy(gold_tree.full_token_yield())
            dcp = DCP_evaluator(der).getEvaluation()
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_conll_token)

            las, uas, lac = 0, 0, 0
            for position, id in enumerate(h_tree.id_yield()):
                parent_id = h_tree.parent(id)
                if parent_id is None:
                    assert id in h_tree.root
                    head = 0
                else:
                    head = h_tree.id_yield().index(parent_id) + 1
                label = h_tree.node_token(id).deprel()

                if gold_heads[position] == head:
                    uas += 1
                if gold_labels[position] == label:
                    lac += 1
                if gold_heads[position] == head and gold_labels[position] == label:
                    las += 1

            if validationMethod == "LAS":
                scores.append(las)
            elif validationMethod == "UAS":
                scores.append(uas)
            elif validationMethod == "LAC":
                scores.append(lac)

        max_score = len(gold_tree.id_yield())
        validator.add_scored_candidates(manager, scores, max_score)
        print(tree_count, max_score, scores)
        parser.clear()

    print("trees used for validation ", tree_count, "with",
          der_count * 1.0 / tree_count, "derivations on average")
    return validator

def do_parsing(grammar, test_corpus, term_labelling, result, grammar_identifier,
               parser_type, k_best, minimum_risk=False, oracle_parse=False,
               recompile=True, reparse=False, dir=None, opt=None):
    tree_yield = term_labelling.prepare_parser_input
    result_path = result(grammar_identifier)
    minimum_risk_path = result(grammar_identifier, 'min_risk')
    oracle_parse_path = result(grammar_identifier, 'oracle_file')

    total_time = 0.0

    preprocess_path = [os.path.join(dir, grammar_identifier), "gf_grammar"]
    # print(preprocess_path)
    load_preprocess = preprocess_path
    if parser_type not in [GFParser, GFParser_k_best, Coarse_to_fine_parser] \
            or recompile \
            or (not os.path.isfile(GFParser.resolve_path(preprocess_path))):
        load_preprocess = None
    if parser_type in [GFParser, GFParser_k_best, Coarse_to_fine_parser] \
            and not os.path.isdir(os.path.join(dir, grammar_identifier)):
        os.makedirs(os.path.join(dir, grammar_identifier))

    if parser_type == GFParser_k_best:
        parser = GFParser_k_best(grammar,
                                 save_preprocessing=preprocess_path,
                                 load_preprocessing=load_preprocess,
                                 k=k_best)
    elif parser_type == Coarse_to_fine_parser:
        parser = Coarse_to_fine_parser(grammar,
                                       base_parser_type=GFParser_k_best,
                                       la=opt["latentAnnotation"],
                                       grammarInfo=opt["grammarInfo"],
                                       nontMap=opt["nontMap"],
                                       save_preprocessing=preprocess_path,
                                       load_preprocessing=load_preprocess,
                                       k=k_best)
    else:
        parser = parser_type(grammar,
                             save_preprocess=preprocess_path,
                             load_preprocess=load_preprocess)

    if recompile or reparse \
            or not os.path.isfile(result_path) \
            or (minimum_risk and not os.path.isfile(minimum_risk_path)) \
            or (oracle_parse and not os.path.isfile(oracle_parse_path)):
        result_dirs = map(lambda path: os.path.split(path)[0],
                          [result_path, minimum_risk_path, oracle_parse_path])
        for result_dir in result_dirs:
            if not os.path.isdir(result_dir):
                os.makedirs(result_dir)

        with open(result_path, 'w') as result_file, \
                open(minimum_risk_path, 'w') as minimum_risk_file, \
                open(oracle_parse_path, 'w') as oracle_parse_file:
            failures = 0
            for tree in test_corpus.get_trees():
                time_stamp = time.clock()

                parser.set_input(tree_yield(tree.token_yield()))
                parser.parse()
                # if not parser.recognized():
                #     parser = parser_type(grammar_second, tree_yield(tree.token_yield()))
                # if not parser.recognized():
                #     parser = parser_type(grammar_tern, tree_yield(tree.token_yield()))
                time_stamp = time.clock() - time_stamp
                total_time += time_stamp

                cleaned_tokens = copy.deepcopy(tree.full_token_yield())
                for token in cleaned_tokens:
                    token.set_edge_label('_')

                h_tree = HybridTree(tree.sent_label())

                if parser_type in [GFParser_k_best, Coarse_to_fine_parser] \
                        and parser.recognized():
                    if minimum_risk or oracle_parse:
                        h_trees = []
                        weights = []

                        for weight, der in parser.k_best_derivation_trees():
                            dcp = DCP_evaluator(der).getEvaluation()
                            h_tree = HybridTree()
                            cleaned_tokens = copy.deepcopy(tree.full_token_yield())
                            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                                              construct_conll_token)

                            h_trees.append(h_tree)
                            weights.append(weight)

                        if minimum_risk:
                            h_tree_min_risk = compute_minimum_risk_tree(h_trees, weights)
                        if oracle_parse:
                            h_tree_oracle = compute_oracle_tree(h_trees, tree)

                    der_to_tree = lambda der: dcp_to_hybridtree(
                        HybridTree(), DCP_evaluator(der).getEvaluation(),
                        copy.deepcopy(tree.full_token_yield()), False,
                        construct_conll_token)
                    # h_tree = parser.best_trees(der_to_tree)[0][0]
                    h_tree = HybridTree(tree.sent_label())
                    h_tree = parser.dcp_hybrid_tree_best_derivation(
                        h_tree, copy.deepcopy(tree.full_token_yield()),
                        ignore_punctuation, construct_conll_token)
                elif parser_type == CFGParser \
                        or parser_type == GFParser \
                        or parser_type == LeftBranchingFSTParser \
                        or parser_type == RightBranchingFSTParser:
                    h_tree = parser.dcp_hybrid_tree_best_derivation(
                        h_tree, cleaned_tokens, ignore_punctuation,
                        construct_conll_token)
                else:
                    h_tree = None

                if h_tree:
                    result_file.write(tree_to_conll_str(h_tree))
                    result_file.write('\n\n')
                    if minimum_risk and parser_type in [GFParser_k_best,
                                                        Coarse_to_fine_parser]:
                        minimum_risk_file.write(tree_to_conll_str(h_tree_min_risk))
                        minimum_risk_file.write('\n\n')
                    if oracle_parse and parser_type in [GFParser_k_best,
                                                        Coarse_to_fine_parser]:
                        oracle_parse_file.write(tree_to_conll_str(h_tree_oracle))
                        oracle_parse_file.write('\n\n')
                else:
                    failures += 1
                    forms = [token.form() for token in tree.full_token_yield()]
                    poss = [token.pos() for token in tree.full_token_yield()]
                    fall_back = tree_to_conll_str(fall_back_left_branching(forms, poss))
                    files = [result_file]
                    if minimum_risk:
                        files.append(minimum_risk_file)
                    if oracle_parse:
                        files.append(oracle_parse_file)
                    for file in files:
                        file.write(fall_back)
                        file.write('\n\n')

                parser.clear()

        print("parse failures", failures)
        print("parse time", total_time)

        if parser_type == GFParser_k_best:
            print("best parse results")
        else:
            print("viterbi parse results")
        eval_pl_call(test_corpus._path, result_path)
        if oracle_parse:
            print("\noracle parse results")
            eval_pl_call(test_corpus._path, oracle_parse_path)
        if minimum_risk:
            print("\nminimum risk results")
            eval_pl_call(test_corpus._path, minimum_risk_path)

    return parser

def multi_const_tree():
    tree = HybridTree("multi")
    tree.add_node('1.1', ConstituentTerminal('A', 'pA'), True, True)
    tree.add_node('2.1', ConstituentTerminal('B', 'pB'), True, True)
    tree.add_node('1.2', ConstituentTerminal('C', 'pC'), True, True)
    tree.add_node('2.2', ConstituentTerminal('D', 'pD'), True, True)
    tree.add_node('1', ConstituentCategory('E'), False, True)
    tree.add_node('2', ConstituentCategory('F'), False, True)
    for p in ['2', '1']:
        tree.add_to_root(p)
        for c in ['1', '2']:
            tree.add_child(p, p + '.' + c)
    return tree

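# Added illustration (not part of the original source): multi_const_tree has
# the two roots E ('1') and F ('2'); assuming the terminal order follows the
# insertion order A, B, C, D, both constituents cover discontinuous spans:
#
#     E ('1') -> A ('1.1', pA), C ('1.2', pC)   i.e. positions 1 and 3
#     F ('2') -> B ('2.1', pB), D ('2.2', pD)   i.e. positions 2 and 4
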
def test_best_trees(self):
    limit_train = 5000
    limit_test = 100
    train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
    test = train
    parser_type = GFParser_k_best
    # test = '../../res/dependency_conll/german/tiger/test/german_tiger_test.conll'
    trees = parse_conll_corpus(train, False, limit_train)
    primary_labelling = the_labeling_factory().create_simple_labeling_strategy("child", "pos+deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    start = 'START'
    recursive_partitioning = [cfg]

    (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling,
                                             term_labelling.token_label,
                                             recursive_partitioning, start)

    parser_type.preprocess_grammar(grammar_prim)
    tree_yield = term_labelling.prepare_parser_input

    trees = parse_conll_corpus(test, False, limit_test)

    for i, tree in enumerate(trees):
        print("Parsing sentence ", i, file=stderr)

        parser = parser_type(grammar_prim, tree_yield(tree.token_yield()), k=200)

        self.assertTrue(parser.recognized())

        viterbi_weight = parser.viterbi_weight()
        viterbi_deriv = parser.viterbi_derivation()

        der_to_tree = lambda der: dcp_to_hybridtree(
            HybridTree(), DCP_evaluator(der).getEvaluation(),
            copy.deepcopy(tree.full_token_yield()), False,
            construct_conll_token)

        viterbi_tree = der_to_tree(viterbi_deriv)

        ordered_parse_trees = parser.best_trees(der_to_tree)

        best_tree, best_weight, best_witnesses = ordered_parse_trees[0]

        for j, (parsed_tree, _, _) in enumerate(ordered_parse_trees):
            if parsed_tree == tree:
                print("Gold tree is ", j + 1, " in best tree list", file=stderr)
                break

        if viterbi_tree != best_tree and viterbi_weight != best_weight:
            print("viterbi and k-best tree differ", file=stderr)
            print("viterbi: ", viterbi_weight, file=stderr)
            print("k-best: ", best_weight, best_witnesses, file=stderr)
            if False:
                print(viterbi_tree, file=stderr)
                print(tree_to_conll_str(viterbi_tree), file=stderr)
                print(best_tree, file=stderr)
                print(tree_to_conll_str(best_tree), file=stderr)
                print("gold tree", file=stderr)
                print(tree, file=stderr)
                print(tree_to_conll_str(tree), file=stderr)

def multi_dep_tree():
    tree = HybridTree('multi')
    tree.add_node('1', CoNLLToken('A', '_', 'pA', 'pA', '_', 'dA'), True)
    tree.add_node('211', CoNLLToken('B', '_', 'pB', 'pB', '_', 'dB'), True)
    tree.add_node('11', CoNLLToken('C', '_', 'pC', 'pC', '_', 'dC'), True)
    tree.add_node('2', CoNLLToken('D', '_', 'pD', 'pD', '_', 'dD'), True)
    tree.add_node('21', CoNLLToken('E', '_', 'pE', 'pE', '_', 'dE'), True)
    tree.add_to_root('2')
    tree.add_to_root('1')
    for c in ['21', '211']:
        tree.add_child('2', c)
    tree.add_child('1', '11')
    tree.reorder()
    return tree

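# Added illustration (not part of the original source): multi_dep_tree is a
# forest with the two roots
#
#     A (node '1', dA) -> child C (node '11', dC)
#     D (node '2', dD) -> children E (node '21', dE) and B (node '211', dB)
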
def test_k_best_parsing(self):
    limit_train = 20
    limit_test = 10
    train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
    test = train
    parser_type = GFParser_k_best
    # test = '../../res/dependency_conll/german/tiger/test/german_tiger_test.conll'
    trees = parse_conll_corpus(train, False, limit_train)
    primary_labelling = the_labeling_factory().create_simple_labeling_strategy("childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    start = 'START'
    recursive_partitioning = [cfg]

    (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling,
                                             term_labelling.token_label,
                                             recursive_partitioning, start)

    parser_type.preprocess_grammar(grammar_prim)
    tree_yield = term_labelling.prepare_parser_input

    trees = parse_conll_corpus(test, False, limit_test)

    for i, tree in enumerate(trees):
        print("Parsing sentence ", i, file=stderr)

        parser = parser_type(grammar_prim, tree_yield(tree.token_yield()), k=50)
        self.assertTrue(parser.recognized())

        derivations = [der for der in parser.k_best_derivation_trees()]
        print("# derivations: ", len(derivations), file=stderr)
        h_trees = []
        current_weight = 0
        weights = []
        derivation_list = []

        for weight, der in derivations:
            # print(exp(-weight), file=stderr)
            # print(der, file=stderr)

            self.assertTrue(der not in derivation_list)
            derivation_list.append(der)

            # TODO: this should hold, but it looks like a GF bug!
            # self.assertGreaterEqual(weight, current_weight)
            current_weight = weight

            dcp = DCP_evaluator(der).getEvaluation()
            h_tree = HybridTree()
            cleaned_tokens = copy.deepcopy(tree.full_token_yield())
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_conll_token)

            h_trees.append(h_tree)
            weights.append(weight)

            # print(h_tree, file=stderr)

        # print a matrix indicating which derivations result
        # in the same hybrid tree
        for k, h_tree1 in enumerate(h_trees):
            for h_tree2 in h_trees:
                if h_tree1 == h_tree2:
                    print("x", end=' ', file=stderr)
                else:
                    print("", end=' ', file=stderr)
            print(weights[k], file=stderr)
        print(file=stderr)

def derivation_to_hybrid_tree(der, poss, ordered_labels, construct_token, disconnected=None):
    """
    :param der: derivation tree
    :type der: LCFRSDerivation
    :param poss: list of POS tags
    :type poss: list[str]
    :param ordered_labels: list of words
    :type ordered_labels: list[str]
    :param disconnected: list of positions in ordered_labels that are disconnected
    :type disconnected: list[object]
    :rtype: GeneralHybridTree
    Turn a derivation tree into a hybrid tree. Assumes that poss and
    ordered_labels have equal length.
    """
    if not disconnected:
        disconnected = []
    tree = HybridTree()
    j = 1
    for i in range(len(ordered_labels)):
        token = construct_token(ordered_labels[i], poss[i], True)
        if i in disconnected:
            tree.add_node("d" + str(i), token, True, False)
        else:
            tree.add_node("c" + str(j), token, True, True)
            j += 1
    for id in der.ids():
        token = construct_token(der.getRule(id).lhs().nont(), '_', False)
        tree.add_node(id, token)
        for child in der.child_ids(id):
            tree.add_child(id, child)
        for position in der.terminal_positions(id):
            tree.add_child(id, "c" + str(position))
    tree.add_to_root(der.root_id())
    tree.reorder()
    return tree

def test_single_root_induction(self):
    tree = hybrid_tree_1()
    # print tree.children("v")
    # print tree
    #
    # for id_set in ['v v1 v2 v21'.split(' '), 'v1 v2'.split(' '),
    #                'v v21'.split(' '), ['v'], ['v1'], ['v2'], ['v21']]:
    #     print id_set, 'top:', top(tree, id_set), 'bottom:', bottom(tree, id_set)
    #     print id_set, 'top_max:', max(tree, top(tree, id_set)), 'bottom_max:', max(tree, bottom(tree, id_set))
    #
    # print "some rule"
    # for mem, arg in [(-1, 0), (0, 0), (1, 0)]:
    #     print create_DCP_rule(mem, arg,
    #                           top_max(tree, ['v', 'v1', 'v2', 'v21']),
    #                           bottom_max(tree, ['v', 'v1', 'v2', 'v21']),
    #                           [(top_max(tree, l), bottom_max(tree, l)) for l in [['v1', 'v2'], ['v', 'v21']]])
    #
    # print "some other rule"
    # for mem, arg in [(-1, 1), (1, 0)]:
    #     print create_DCP_rule(mem, arg,
    #                           top_max(tree, ['v1', 'v2']),
    #                           bottom_max(tree, ['v1', 'v2']),
    #                           [(top_max(tree, l), bottom_max(tree, l)) for l in [['v1'], ['v2']]])
    #
    # print 'strict:', strict_labeling(tree, top_max(tree, ['v', 'v21']), bottom_max(tree, ['v', 'v21']))
    # print 'child:', child_labeling(tree, top_max(tree, ['v', 'v21']), bottom_max(tree, ['v', 'v21']))
    # print '---'
    # print 'strict:', strict_labeling(tree, top_max(tree, ['v1', 'v21']), bottom_max(tree, ['v1', 'v21']))
    # print 'child:', child_labeling(tree, top_max(tree, ['v1', 'v21']), bottom_max(tree, ['v1', 'v21']))
    # print '---'
    # print 'strict:', strict_labeling(tree, top_max(tree, ['v', 'v1', 'v21']), bottom_max(tree, ['v', 'v1', 'v21']))
    # print 'child:', child_labeling(tree, top_max(tree, ['v', 'v1', 'v21']), bottom_max(tree, ['v', 'v1', 'v21']))

    tree2 = hybrid_tree_2()
    # print tree2.children("v")
    # print tree2
    #
    # print 'siblings v211', tree2.siblings('v211')
    # print top(tree2, ['v', 'v1', 'v211'])
    # print top_max(tree2, ['v', 'v1', 'v211'])
    #
    # print '---'
    # print 'strict:', strict_labeling(tree2, top_max(tree2, ['v', 'v1', 'v211']), bottom_max(tree2, ['v', 'v11', 'v211']))
    # print 'child:', child_labeling(tree2, top_max(tree2, ['v', 'v1', 'v211']), bottom_max(tree2, ['v', 'v11', 'v211']))

    # rec_par = ('v v1 v2 v21'.split(' '),
    #            [('v1 v2'.split(' '), [(['v1'], []), (['v2'], [])]),
    #             ('v v21'.split(' '), [(['v'], []), (['v21'], [])])])
    #
    # grammar = LCFRS(nonterminal_str(tree, top_max(tree, rec_par[0]), bottom_max(tree, rec_par[0]), 'strict'))
    #
    # add_rules_to_grammar_rec(tree, rec_par, grammar, 'child')
    #
    # grammar.make_proper()
    # print grammar

    print(tree.recursive_partitioning())

    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

    (_, grammar) = induce_grammar(
        [tree, tree2],
        the_labeling_factory().create_simple_labeling_strategy('empty', 'pos'),
        # the_labeling_factory().create_simple_labeling_strategy('child', 'pos+deprel'),
        terminal_labeling.token_label, [direct_extraction], 'START')
    print(max([grammar.fanout(nont) for nont in grammar.nonts()]))
    print(grammar)

    parser = LCFRS_parser(grammar, 'NP N V V'.split(' '))
    print(parser.best_derivation_tree())

    tokens = [construct_conll_token(form, pos) for form, pos in
              zip('Piet Marie helpen lezen'.split(' '), 'NP N V V'.split(' '))]
    hybrid_tree = HybridTree()
    hybrid_tree = parser.dcp_hybrid_tree_best_derivation(
        hybrid_tree, tokens, True, construct_conll_token)
    print(list(map(str, hybrid_tree.full_token_yield())))
    print(hybrid_tree)

    string = "foo"
    dcp_string = DCP_string(string)
    dcp_string.set_edge_label("bar")
    print(dcp_string, dcp_string.edge_label())

    linearize(grammar,
              the_labeling_factory().create_simple_labeling_strategy('child', 'pos+deprel'),
              the_terminal_labeling_factory().get_strategy('pos'),
              sys.stdout)

def test_fst_compilation_left(self):
    if not test_pynini:
        return
    tree = hybrid_tree_1()
    tree2 = hybrid_tree_2()
    terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

    (_, grammar) = induce_grammar(
        [tree, tree2],
        the_labeling_factory().create_simple_labeling_strategy('empty', 'pos'),
        terminal_labeling.token_label, [left_branching], 'START')

    fst, rules = compile_wfst_from_left_branching_grammar(grammar)

    print(repr(fst))

    symboltable = fst.input_symbols()

    string = ["NP", "N", "V", "V", "V"]

    fsa = fsa_from_list_of_symbols(string, symboltable)
    self.assertEqual(
        fsa.text().decode('utf-8'),
        '0\t1\tNP\tNP\n1\t2\tN\tN\n2\t3\tV\tV\n3\t4\tV\tV\n4\t5\tV\tV\n5\n')

    b = compose(fsa, fst)

    print(b.text(symboltable, symboltable))

    print("Shortest path probability", end=' ')
    best = shortestpath(b)
    best.topsort()
    # self.assertAlmostEquals(pow(e, -float(shortestdistance(best)[-1])), 1.80844898756e-05)
    print(best.text())

    polish_rules = retrieve_rules(best)
    self.assertSequenceEqual(polish_rules, [1, 2, 3, 4, 5, 4, 9, 4, 7, 8])

    polish_rules = list(map(rules.index_object, polish_rules))

    for rule in polish_rules:
        print(rule)
    print()

    der = ReversePolishDerivation(polish_rules[0:-1])
    self.assertTrue(der.check_integrity_recursive(der.root_id()))

    print(der)

    LeftBranchingFSTParser.preprocess_grammar(grammar)
    parser = LeftBranchingFSTParser(grammar, string)
    der_ = parser.best_derivation_tree()

    print(der_)
    self.assertTrue(der_.check_integrity_recursive(der_.root_id()))

    print(derivation_to_hybrid_tree(der, string,
                                    "Piet Marie helpen lezen leren".split(),
                                    construct_conll_token))

    print(derivation_to_hybrid_tree(der_, string,
                                    "Piet Marie helpen lezen leren".split(),
                                    construct_conll_token))

    dcp = DCP_evaluator(der).getEvaluation()
    h_tree_2 = HybridTree()
    token_sequence = [construct_conll_token(form, lemma) for form, lemma in
                      zip('Piet Marie helpen lezen leren'.split(' '),
                          'NP N V V V'.split(' '))]
    dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False, construct_conll_token)

    print(h_tree_2)

def trainAndEval(strategy, labelling1, labelling2, fanout, parser_type, train,
                 test, cDT, parseStrings, ignore_punctuation=False):
    file = open('results.txt', 'a')
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    recursive_partitioning = d_i.the_recursive_partitioning_factory().get_partitioning(
        'fanout-' + str(fanout) + strategy)
    primary_labelling = d_l.the_labeling_factory().create_simple_labeling_strategy(
        labelling1, labelling2)

    trees = parse_conll_corpus(train, False, train_limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)
    (n_trees, grammar) = d_i.induce_grammar(trees, primary_labelling,
                                            term_labelling.token_label,
                                            recursive_partitioning, start)

    # write current transformation strategy and hyperparameters to results.txt
    if strategy == '':
        file.write('rtl ' + labelling1 + ' ' + labelling2 +
                   ' maximal fanout:' + fanout)
    else:
        splitList = strategy.split('-')
        if splitList[1] == 'left':
            file.write('ltr ' + labelling1 + ' ' + labelling2 +
                       ' maximal fanout:' + fanout)
        elif splitList[1] == 'random':
            file.write('random seed:' + splitList[2] + ' ' + labelling1 + ' ' +
                       labelling2 + ' maximal fanout:' + fanout)
        elif splitList[1] == 'no':
            if splitList[4] == 'random':
                file.write('nnont fallback:random seed:' + splitList[5] + ' ' +
                           labelling1 + ' ' + labelling2 +
                           ' maximal fanout:' + fanout)
            elif splitList[4] == 'ltr':
                file.write('nnont fallback:ltr ' + labelling1 + ' ' + labelling2 +
                           ' maximal fanout:' + fanout)
            elif splitList[4] == 'rtl':
                file.write('nnont fallback:rtl ' + labelling1 + ' ' + labelling2 +
                           ' maximal fanout:' + fanout)
            else:
                file.write('nnont fallback:argmax ' + labelling1 + ' ' + labelling2 +
                           ' maximal fanout:' + fanout)
        else:  # argmax
            file.write('argmax ' + labelling1 + ' ' + labelling2 +
                       ' maximal fanout:' + fanout)
    file.write('\n')

    res = ''
    res += '#nonts:' + str(len(grammar.nonts()))
    res += ' #rules:' + str(len(grammar.rules()))

    file.write(res)
    res = ''

    # The following code counts the number of derivations for a hypergraph
    # (tree parser required).
    if cDT == True:
        tree_parser.preprocess_grammar(grammar, term_labelling)

        trees = parse_conll_corpus(train, False, train_limit)
        if ignore_punctuation:
            trees = disconnect_punctuation(trees)

        derCount = 0
        derMax = 0
        for tree in trees:
            parser = tree_parser(grammar, tree)  # if tree parser is used
            der = parser.count_derivation_trees()
            if der > derMax:
                derMax = der
            derCount += der

        res += "\n#derivation trees: average: " + str(1.0 * derCount / n_trees)
        res += " maximal: " + str(derMax)

    file.write(res)
    res = ''

    total_time = 0.0

    # The following code works for string parsers for evaluating
    if parseStrings == True:
        parser_type.preprocess_grammar(grammar)

        trees = parse_conll_corpus(test, False, test_limit)
        if ignore_punctuation:
            trees = disconnect_punctuation(trees)

        i = 0
        with open(result, 'w') as result_file:
            failures = 0
            for tree in trees:
                time_stamp = time.clock()
                i += 1
                # if i % 100 == 0:
                #     print('.', end='')
                #     sys.stdout.flush()

                parser = parser_type(grammar, tree_yield(tree.token_yield()))
                time_stamp = time.clock() - time_stamp
                total_time += time_stamp

                cleaned_tokens = copy.deepcopy(tree.full_token_yield())
                for token in cleaned_tokens:
                    token.set_edge_label('_')

                h_tree = HybridTree(tree.sent_label())
                h_tree = parser.dcp_hybrid_tree_best_derivation(
                    h_tree, cleaned_tokens, ignore_punctuation,
                    construct_conll_token)

                if h_tree:
                    result_file.write(tree_to_conll_str(h_tree))
                    result_file.write('\n\n')
                else:
                    failures += 1
                    result_file.write(tree_to_conll_str(
                        fall_back_left_branching_token(cleaned_tokens)))
                    result_file.write('\n\n')

        res += "\nattachment scores:\nno punctuation: "
        out = subprocess.check_output(
            ["perl", "../util/eval.pl", "-g", test, "-s", result, "-q"]).decode('utf-8')
        match = re.search(r'[^=]*= (\d+\.\d+)[^=]*= (\d+\.\d+).*', out)
        res += ' labelled:' + match.group(1)    # labeled attachment score
        res += ' unlabelled:' + match.group(2)  # unlabeled attachment score
        res += "\npunctuation: "
        out = subprocess.check_output(
            ["perl", "../util/eval.pl", "-g", test, "-s", result, "-q", "-p"]).decode('utf-8')
        match = re.search(r'[^=]*= (\d+\.\d+)[^=]*= (\d+\.\d+).*', out)
        res += ' labelled:' + match.group(1)
        res += ' unlabelled:' + match.group(2)

    res += "\nparse time: " + str(total_time)
    file.write(res)
    file.write('\n\n\n')
    file.close()