def test_negra_to_dag_parsing(self):
    """Round-trip one sentence through NEGRA export, parsing and reducts.

    Steps, in order:

    1. Read sentence ``s26954`` as a deep syntax graph and serialize it
       to a temporary NEGRA ``.export`` file.
    2. Binarize that file with the external ``discodop treetransforms``
       tool.
    3. Load both files as hybrid DAGs and check token counts and the
       ``top``/``bottom`` node sets of the binarized DAG.
    4. Extract an LCFRS from the binarized DAG, parse its own token
       yield, evaluate the DCP and rebuild a hybrid DAG from the result.
    5. Compute reducts with the Schick parser (external ``java``
       process), read them back as RTGs and check the integrity of the
       enumerated derivations.

    Requires ``discodop`` and ``java`` on the PATH and the corpus file
    under ``res/tiger/``; writes several files below ``/tmp``.
    """
    names = list(map(str, [26954]))

    # mkstemp returns an already-open descriptor; wrap it so that it is
    # closed together with the file object instead of being leaked.
    primary_fd, primary_file = tempfile.mkstemp(suffix='.export')
    with os.fdopen(primary_fd, mode='w') as pf:
        for s in names:
            dsg = tp.sentence_names_to_deep_syntax_graphs(
                ["s" + s],
                "res/tiger/tiger_s%s.xml" % s,
                hold=False,
                ignore_puntcuation=False)[0]
            # Drop the first character of the label.
            # NOTE(review): presumably the leading "s" of the sentence
            # name -- confirm against the corpus reader.
            dsg.set_label(dsg.label[1:])
            lines = np.serialize_hybrid_dag_to_negra(
                [dsg], 0, 500, use_sentence_names=True)
            print(''.join(lines), file=pf)

    # Only the path is needed (discodop writes the file itself), so the
    # descriptor is closed immediately.
    binarized_fd, binarized_file = tempfile.mkstemp(suffix='.export')
    os.close(binarized_fd)
    subprocess.call([
        "discodop", "treetransforms", "--binarize", "-v", "1", "-h", "1",
        primary_file, binarized_file
    ])

    print(primary_file)
    print(binarized_file)

    corpus = np.sentence_names_to_hybridtrees(
        names, primary_file, secedge=True)
    corpus2 = np.sentence_names_to_hybridtrees(
        names, binarized_file, secedge=True)

    dag = corpus[0]
    print(dag)
    # unittest assertion instead of a bare assert, which -O would strip.
    self.assertIsInstance(dag, HybridDag)
    self.assertEqual(8, len(dag.token_yield()))
    for token in dag.token_yield():
        print(token.form() + '/' + token.pos(), end=' ')
    print()

    dag_bin = corpus2[0]
    print(dag_bin)
    for token in dag_bin.token_yield():
        print(token.form() + '/' + token.pos(), end=' ')
    print()
    self.assertEqual(8, len(dag_bin.token_yield()))

    for node in dag_bin.nodes():
        print(node, dag_bin.node_token(node))
    print()

    print(top(dag_bin, {'500', '101', '102'}))
    self.assertSetEqual({'101', '500'},
                        top(dag_bin, {'500', '101', '102'}))
    print(bottom(dag_bin, {'500', '101', '102'}))
    self.assertSetEqual({'502'}, bottom(dag_bin, {'500', '101', '102'}))

    nont_labeling = BasicNonterminalLabeling()
    term_labeling = FormTerminals()  # PosTerminals()
    grammar = direct_extract_lcfrs_from_prebinarized_corpus(
        dag_bin, term_labeling, nont_labeling)
    for rule in grammar.rules():
        print(rule.get_idx(), rule)

    print("Testing LCFRS parsing and DCP evaluation".center(80, '='))

    parser = LCFRS_parser(grammar)
    parser_input = term_labeling.prepare_parser_input(
        dag_bin.token_yield())
    print(parser_input)
    parser.set_input(parser_input)
    parser.parse()
    self.assertTrue(parser.recognized())

    der = parser.best_derivation_tree()
    print(der)

    dcp_term = DCP_evaluator(der).getEvaluation()
    print(dcp_term[0])

    dag_eval = HybridDag(dag_bin.sent_label())
    dcp_to_hybriddag(dag_eval, dcp_term,
                     copy.deepcopy(dag_bin.token_yield()), False,
                     construct_token=construct_constituent_token)
    print(dag_eval)

    for node in dag_eval.nodes():
        token = dag_eval.node_token(node)
        token_type = token.type()
        if token_type == "CONSTITUENT-CATEGORY":
            label = token.category()
        elif token_type == "CONSTITUENT-TERMINAL":
            label = token.form(), token.pos()
        else:
            # Never reuse the label of the previous node (or hit an
            # unbound name) for an unexpected token type.
            label = None
        print(node, label, dag_eval.children(node),
              dag_eval.sec_children(node), dag_eval.sec_parents(node))

    lines = np.serialize_hybridtrees_to_negra(
        [dag_eval], 1, 500, use_sentence_names=True)
    for line in lines:
        print(line, end='')
    print()

    with open(primary_file) as pcf:
        for line in pcf:
            print(line, end='')

    print('Testing reduct computation with Schick parser'.center(80, '='))

    grammar_path = '/tmp/lcfrs_dcp_grammar.gr'
    derivation_manager = PyDerivationManager(grammar)

    with open(grammar_path, 'w') as grammar_file:
        _, terminal_enc = linearize(
            grammar, nont_labeling, term_labeling, grammar_file,
            delimiter=' : ',
            nonterminal_encoder=derivation_manager.get_nonterminal_map())

    print(np.negra_to_json(dag, terminal_enc, term_labeling))
    json_data = np.export_corpus_to_json([dag], terminal_enc, term_labeling)

    corpus_path = '/tmp/json_dags.json'
    with open(corpus_path, 'w') as data_file:
        json.dump(json_data, data_file)

    # Start from an empty reduct directory on every run.
    reduct_dir = '/tmp/schick_parser_reducts'
    if os.path.isdir(reduct_dir):
        shutil.rmtree(reduct_dir)
    os.makedirs(reduct_dir)

    # Pass an argument vector directly: no shell is needed, and paths
    # are not subject to shell word splitting.
    command = [
        "java", "-jar", os.path.join("util", SCHICK_PARSER_JAR),
        'reduct', '-g', grammar_path, '-t', corpus_path,
        "--input-format", "json", "-o", reduct_dir
    ]
    # The context manager closes the pipe and waits for the process even
    # if echoing its output raises.
    with subprocess.Popen(command, stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT) as p:
        print("stdout", p.stdout.name)
        # readline() returns b'' only at EOF, so this echoes everything
        # the parser writes without polling.
        for nextline in iter(p.stdout.readline, b''):
            print(nextline.decode('unicode_escape'), end='')
    self.assertEqual(0, p.returncode)

    def decode_nonterminals(s):
        return derivation_manager.get_nonterminal_map().index_object(
            int(s))

    # Reduct files are numbered from 1.
    rtgs = [
        read_rtg(os.path.join(reduct_dir, str(i) + '.gra'),
                 symbol_offset=-1,
                 rule_prefix='r',
                 process_nonterminal=decode_nonterminals)
        for i in range(1, len(corpus) + 1)
    ]

    print("Reduct RTG")
    for rule in rtgs[0].rules:
        print(rule.lhs, "->", rule.symbol, rule.rhs)

    derivation_manager.get_nonterminal_map().print_index()
    derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
    derivation_manager.serialize(
        bytes('/tmp/reduct_manager.trace', encoding='utf8'))

    derivations = [
        LCFRSDerivationWrapper(der)
        for der in derivation_manager.enumerate_derivations(0, grammar)
    ]
    self.assertGreaterEqual(len(derivations), 1)

    # Labelled with the 1-based number of the last reduct file read.
    print("Sentence", len(corpus))
    for der in derivations:
        print(der)
        self.assertTrue(
            der.check_integrity_recursive(der.root_id(), grammar.start()))
def test_json_corpus_grammar_export(self):
    """Export a DOG grammar and corpus to JSON and validate the reducts.

    Steps, in order:

    1. Read sentences ``s1`` .. ``s50`` from ``res/tiger/tiger_8000.xml``
       as deep syntax graphs and induce a grammar from them.
    2. Write the grammar, the corpus and the terminal enumerator to
       files below ``/tmp``.
    3. Run the Schick parser's ``dog-reduct`` command (external ``java``
       process) and read one RTG per sentence from its output directory.
    4. For every derivation of every sentence, check that the evaluated
       graph matches the label-projected original: same sentence, an
       isomorphism between the graphs, and matching synchronization.

    Requires ``java`` on the PATH and the corpus file under
    ``res/tiger/``.
    """
    start = 1
    stop = 50
    # path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
    path = "res/tiger/tiger_8000.xml"
    exclude = []
    sentence_names = [
        's' + str(i) for i in range(start, stop + 1) if i not in exclude
    ]
    dsgs = sentence_names_to_deep_syntax_graphs(
        sentence_names, path, hold=False)

    rec_part_strategy = \
        the_recursive_partitioning_factory().get_partitioning('cfg')[0]

    def label_edge(edge):
        # Terminal edges are labelled by their POS tag, all others keep
        # their label unchanged.
        if isinstance(edge.label, ConstituentTerminal):
            return edge.label.pos()
        return edge.label

    def nonterminal_labeling(nodes, dsg):
        return simple_labeling(nodes, dsg, label_edge)

    term_labeling_token = PosTerminals()

    def term_labeling(token):
        if isinstance(token, ConstituentTerminal):
            return term_labeling_token.token_label(token)
        return token

    grammar = induction_on_a_corpus(
        dsgs, rec_part_strategy, nonterminal_labeling, term_labeling)
    grammar.make_proper()

    terminals = Enumerator()

    data = export_dog_grammar_to_json(grammar, terminals)
    grammar_path = '/tmp/json_grammar.json'
    with open(grammar_path, 'w') as grammar_file:
        json.dump(data, grammar_file)

    corpus_path = '/tmp/json_corpus.json'
    with open(corpus_path, 'w') as corpus_file:
        json.dump(
            export_corpus_to_json(
                dsgs, terminals, terminal_labeling=term_labeling),
            corpus_file)

    with open('/tmp/enumerator.enum', 'w') as enum_file:
        terminals.print_index(enum_file)

    # Start from an empty reduct directory on every run.
    reduct_dir = '/tmp/reduct_grammars'
    if os.path.isdir(reduct_dir):
        shutil.rmtree(reduct_dir)
    os.makedirs(reduct_dir)

    # Pass an argument vector directly: no shell is needed, and paths
    # are not subject to shell word splitting.
    command = [
        "java", "-jar", os.path.join("util", SCHICK_PARSER_JAR),
        'dog-reduct', '-g', grammar_path, '-t', corpus_path,
        "-o", reduct_dir
    ]
    # The context manager closes the pipe and waits for the process even
    # if echoing its output raises.
    with subprocess.Popen(command, stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT) as p:
        print("stdout", p.stdout.name)
        # readline() returns b'' only at EOF, so this echoes everything
        # the parser writes without polling.
        for nextline in iter(p.stdout.readline, b''):
            print(nextline.decode('unicode_escape'), end='')
    self.assertEqual(0, p.returncode)

    # Reduct files are numbered from 1; reuse reduct_dir so that the
    # directory name is defined in exactly one place.
    rtgs = [
        read_rtg(os.path.join(reduct_dir, str(i) + '.gra'))
        for i in range(1, len(dsgs) + 1)
    ]

    derivation_manager = PyDerivationManager(grammar)
    derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
    derivation_manager.serialize(
        bytes('/tmp/reduct_manager.trace', encoding='utf8'))

    def project_token(token):
        # Reduce terminals to their POS tag; leave everything else as is.
        if isinstance(token, ConstituentTerminal):
            return token.pos()
        return token

    # rtgs holds exactly one entry per dsg, so the dsg position doubles
    # as the index of its reduct in the derivation manager.
    for i, dsg in enumerate(dsgs):
        derivations = [
            LCFRSDerivationWrapper(der)
            for der in derivation_manager.enumerate_derivations(i, grammar)
        ]
        self.assertGreaterEqual(len(derivations), 1)

        # Only ambiguous sentences are printed.
        if len(derivations) > 1:
            print("Sentence", i)
            for der in derivations:
                print(der)

        for der in derivations:
            dog, sync = dog_evaluation(der)
            dsg2 = DeepSyntaxGraph(der.compute_yield(), dog, sync)

            # NOTE(review): the projection is repeated for every
            # derivation; it is presumably a no-op after the first pass,
            # but project_labels is opaque here -- confirm before
            # hoisting it out of the loop.
            dsg.dog.project_labels(project_token)
            dsg.sentence = list(map(project_token, dsg.sentence))

            self.assertEqual(dsg.sentence, dsg2.sentence)
            morphs = dsg.dog.compute_isomorphism(dsg2.dog)
            self.assertIsNotNone(morphs)
            # Translate the original synchronization through the node
            # mapping before comparing; unmapped nodes stay as they are.
            self.assertListEqual(
                [[morphs[0].get(node, node) for node in syncs]
                 for syncs in dsg.synchronization],
                dsg2.synchronization)