def test_basic_sdcp_parsing_constituency(self):
    """Induce a grammar from two constituent trees and list derivations.

    A grammar is extracted from ``constituent_tree_1()`` and
    ``constituent_tree_2()`` (fanout 1, ``FormTerminals`` labeling), made
    proper and printed to stderr.  ``LCFRS_sDCP_Parser`` is then run on the
    first tree and on the tree from ``constituent_tree_1_pos_stripped()``;
    every derivation is printed together with the input tree and the tree
    rebuilt from the evaluated DCP term.

    The test contains no assertions: it succeeds when nothing raises.
    """

    def show_derivations(active_parser, source_tree):
        # Print each derivation, then the input tree next to the tree
        # reconstructed from the derivation's DCP evaluation.
        for derivation in active_parser.all_derivation_trees():
            print(derivation)
            rebuilt = ConstituentTree(source_tree.sent_label())
            fresh_tokens = [
                construct_constituent_token(tok.form(), '--', True)
                for tok in source_tree.token_yield()
            ]
            evaluation = DCP_evaluator(derivation).getEvaluation()
            dcp_to_hybridtree(rebuilt, evaluation, fresh_tokens, False,
                              construct_constituent_token)
            print(source_tree)
            print(rebuilt)

    tree1 = constituent_tree_1()
    tree2 = constituent_tree_2()
    tree3 = constituent_tree_1_pos_stripped()

    labeling = FormTerminals()  # [tree1, tree2], 1, filter=["VP"])

    max_fanout = 1
    grammar = LCFRS('START')
    for training_tree in (tree1, tree2):
        structure = training_tree.unlabelled_structure()
        partitioning = fanout_limited_partitioning(structure, max_fanout)
        grammar.add_gram(
            fringe_extract_lcfrs(training_tree, partitioning, naming='child',
                                 term_labeling=labeling))
    grammar.make_proper()

    print("grammar induced. Printing rules...", file=stderr)
    for grammar_rule in grammar.rules():
        print(grammar_rule, file=stderr)

    parser_cls = LCFRS_sDCP_Parser

    print("preprocessing grammar", file=stderr)
    parser_cls.preprocess_grammar(grammar, labeling, debug=True)

    print("invoking parser", file=stderr)
    first_parser = parser_cls(grammar, tree1)

    print("listing derivations", file=stderr)
    show_derivations(first_parser, tree1)

    second_parser = parser_cls(grammar, tree3)
    print(second_parser.recognized())
    show_derivations(second_parser, tree3)

    print("completed test", file=stderr)
def test_negra_dag_small_grammar(self):
    """Extract a grammar from 100 DAG sentences, parse them, dump results.

    Requires two export corpora on disk (the DAG corpus and its binarized
    variant).  If either is missing, the command needed to create it is
    printed and the test fails on the following existence assertion.

    For the sentences named "1" .. "100" a grammar is extracted from the
    binarized DAGs and made proper.  Each sentence's POS sequence is then
    parsed with ``DiscodopKbestParser`` (k=1); the best derivation is
    evaluated into a ``HybridDag`` and serialized into a temporary
    ``.export`` file whose path is printed at the end.

    Asserts that both corpus files exist, that each DAG and its binarized
    counterpart have token yields of equal length, and that every sentence
    is recognized by the parser.
    """
    DAG_CORPUS = 'res/tiger/tiger_full_with_sec_edges.export'
    DAG_CORPUS_BIN = 'res/tiger/tiger_full_with_sec_edges_bin_h1_v1.export'
    names = [str(i) for i in range(1, 101)]

    if not os.path.exists(DAG_CORPUS):
        print('run the following command to create an export corpus with dags:')
        print('\tPYTHONPATH=. util/tiger_dags_to_negra.py '
              + 'res/tiger/tiger_release_aug07.corrected.16012013.xml '
              + DAG_CORPUS + ' 1 50474')
    self.assertTrue(os.path.exists(DAG_CORPUS))

    if not os.path.exists(DAG_CORPUS_BIN):
        print('run the following command to binarize the export corpus with dags:')
        print("discodop treetransforms --binarize -v 1 -h 1 "
              + DAG_CORPUS + " " + DAG_CORPUS_BIN)
        # The binarized corpus is not generated here; the printed command
        # has to be run manually before the test can pass.
    self.assertTrue(os.path.exists(DAG_CORPUS_BIN))

    # NOTE(review): `np` is whatever module the file binds to that name; it
    # provides the corpus reader/serializer used here -- confirm at the
    # import site.
    corpus = np.sentence_names_to_hybridtrees(names, DAG_CORPUS, secedge=True)
    corpus_bin = np.sentence_names_to_hybridtrees(names, DAG_CORPUS_BIN,
                                                  secedge=True)

    grammar = LCFRS(start="START")
    for hybrid_dag, hybrid_dag_bin in zip(corpus, corpus_bin):
        # Binarization must not change the number of tokens in a sentence.
        self.assertEqual(len(hybrid_dag.token_yield()),
                         len(hybrid_dag_bin.token_yield()))
        dag_grammar = direct_extract_lcfrs_from_prebinarized_corpus(
            hybrid_dag_bin)
        grammar.add_gram(dag_grammar)
    grammar.make_proper()

    print("Extracted LCFRS/DCP-hybrid grammar with %i nonterminals and %i rules"
          % (len(grammar.nonts()), len(grammar.rules())))

    parser = DiscodopKbestParser(grammar, k=1)

    # mkstemp returns an already-open OS-level descriptor.  Wrap that
    # descriptor instead of discarding it and reopening the path, so it is
    # closed when the `with` block exits rather than leaked.
    fd, RESULT_FILE = tempfile.mkstemp(prefix='parser_results_',
                                       suffix='.export')
    with os.fdopen(fd, 'w') as results:
        for hybrid_dag in corpus:
            poss = [token.pos() for token in hybrid_dag.token_yield()]
            parser.set_input(poss)
            parser.parse()
            self.assertTrue(parser.recognized())
            der = parser.best_derivation_tree()

            dcp_term = DCP_evaluator(der).getEvaluation()
            dag_eval = HybridDag(hybrid_dag.sent_label())
            # The token yield is deep-copied before being handed to
            # dcp_to_hybriddag, so the corpus DAG's own tokens are not
            # passed in directly.
            dcp_to_hybriddag(dag_eval, dcp_term,
                             copy.deepcopy(hybrid_dag.token_yield()), False,
                             construct_token=construct_constituent_token)
            lines = np.serialize_hybridtrees_to_negra(
                [dag_eval], 1, 500, use_sentence_names=True)
            for line in lines:
                print(line, end='', file=results)
            parser.clear()

    print("Wrote results to %s" % RESULT_FILE)