def test_fallback_labeling(self):
    """Round-trip a frequency-biased terminal labeling through JSON.

    Builds the labeling from a slice of the TIGER corpus, serializes it to
    a JSON stream, deserializes it, and checks that the restored instance
    assigns the same labels as the original.
    """
    corpus_path = "res/TIGER/tiger21/tigertraindev_root_attach.export"
    sentence_ids = [str(n) for n in range(50) if n % 10 > 1]
    trees = np.sentence_names_to_hybridtrees(sentence_ids, corpus_path,
                                             disconnect_punctuation=False)
    labeling = tl.FrequencyBiasedTerminalLabeling(
        tl.FormTerminals(), tl.PosTerminals(), corpus=trees, threshold=2)
    print(labeling.fine_label_count)

    token1 = mt.ConstituentTerminal('Milliardär', 'NN')
    token2 = mt.ConstituentTerminal('Tisch', 'NN')
    label1 = labeling.token_label(token1)
    label2 = labeling.token_label(token2)

    # serialize to an in-memory JSON stream and read it back
    buffer = io.StringIO()
    json.dump(labeling.serialize(), buffer)
    buffer.seek(0)
    print(buffer.getvalue())
    restored = tl.deserialize_labeling(json.load(buffer))

    self.assertTrue(isinstance(restored, labeling.__class__))
    self.assertEqual(label1, restored.token_label(token1))
    self.assertEqual(label2, restored.token_label(token2))
def read_corpus_export(self, resource, mode="STANDARD", skip_normalization=False):
    """
    Read a constituent-tree corpus from a NEGRA export file.

    :type resource: CorpusFile
    :param mode: either STANDARD or DISCO-DOP (handles variation in NEGRA format)
    :type mode: str
    :param skip_normalization: If normalization is skipped even if set in induction settings.
    :type skip_normalization: bool
    :return: corpus of constituent trees
    """
    # a missing filter means "accept every sentence"
    if resource.filter is not None:
        sentence_filter = resource.filter
    else:
        sentence_filter = lambda _: True

    path = resource.path
    # optionally run the corpus through the normalization pipeline first
    if self.induction_settings.normalize and not skip_normalization:
        path = self.normalize_corpus(path, src='export', dest='export',
                                     renumber=False)

    # encoding = "iso-8859-1"
    encoding = "utf-8"
    selected_names = {
        str(i)
        for i in range(resource.start, resource.end + 1)
        if i not in resource.exclude and sentence_filter(i)
    }
    return np.sentence_names_to_hybridtrees(
        selected_names,
        path,
        enc=encoding,
        disconnect_punctuation=self.induction_settings.disconnect_punctuation,
        add_vroot=True,
        mode=mode)
def test_suffix_labeling(self):
    """Check that a Suffix terminal labeling survives (de)serialization.

    Bug fix: the round-trip labels are now queried from the *deserialized*
    instance (``instance2``); previously ``labeling.token_label`` was called
    for both lists, so the equality assertion compared the original labeling
    with itself and deserialization was never actually verified.
    """
    file = "res/TIGER/tiger21/tigertraindev_root_attach.export"
    corpus = np.sentence_names_to_hybridtrees(
        [str(x) for x in range(50) if x % 10 > 1], file,
        disconnect_punctuation=False)
    labeling = tl.Suffix(trees=corpus, threshold=2)

    # sample tokens covering case variation, punctuation, digits,
    # foreign material, and a frequent function word
    tokens = [mt.ConstituentTerminal('Tisch', 'NN'),
              mt.ConstituentTerminal('TISCH', 'NN'),
              mt.ConstituentTerminal('§"$&(-.,', 'XY'),
              mt.ConstituentTerminal('1975', 'CARD'),
              mt.ConstituentTerminal('stronghold', 'FM'),
              mt.ConstituentTerminal('den', 'ART')]
    label = [labeling.token_label(t) for t in tokens]

    serialization = labeling.serialize()
    print(serialization)
    instance2 = tl.deserialize_labeling(serialization)

    # query the deserialized labeling for the round-trip comparison
    label2 = [instance2.token_label(t) for t in tokens]
    print(label)
    self.assertEqual(label, label2)
    self.assertTrue(isinstance(instance2, labeling.__class__))
def test_brown_cluster_labeling(self):
    """Smoke test: label a handful of tokens with a Brown-cluster labeling.

    No assertions — this only exercises construction and ``token_label``
    and prints the resulting labels for inspection.
    """
    clustering = "clustering/tiger_final.clustering"
    file = "res/TIGER/tiger21/tigertraindev_root_attach.export"
    corpus = np.sentence_names_to_hybridtrees(
        [str(x) for x in range(50) if x % 10 > 1], file,
        disconnect_punctuation=False)
    unk_strat = tl.UNKStrategySuffix(2)
    labeling = tl.BrownCluster(clustering=clustering,
                               trees=corpus,
                               unk_strategy=unk_strat,
                               cluster_occurence_threshold=100)
    samples = [('Auskunft', 'NN'), ('1962', 'NN'), ('§"$&(-.,', 'XY'),
               ('1975', 'CARD'), ('um', 'FM'), ('den', 'ART')]
    label = [labeling.token_label(mt.ConstituentTerminal(form, pos))
             for form, pos in samples]
    print(label)
def test_unk4_labeling(self):
    """Round-trip UNK4 terminal labelings through serialization.

    Checks both configurations (``use_pos=False`` and ``use_pos=True``):
    labels produced by the original labeling must match those produced by
    the deserialized copy, which must also be of the same class.

    Refactor: the five sample token-label calls were duplicated four times
    in the original; they are now a shared token list plus a local helper,
    and the two ``use_pos`` configurations run in one loop. Behavior and
    assertions are unchanged.
    """
    file = "res/TIGER/tiger21/tigertraindev_root_attach.export"
    corpus = np.sentence_names_to_hybridtrees(
        [str(x) for x in range(50) if x % 10 > 1], file,
        disconnect_punctuation=False)

    # sample tokens: known forms, all-caps, punctuation, umlaut, rare word
    tokens = [mt.ConstituentTerminal('Tisch', 'NN'),
              mt.ConstituentTerminal('TISCH', 'NN'),
              mt.ConstituentTerminal('§"$&(-.,', 'NN'),
              mt.ConstituentTerminal('Ätsch', 'NN'),
              mt.ConstituentTerminal('Milliardär', 'NN')]

    def labels_of(lab):
        # one label per sample token, in fixed order
        return [lab.token_label(t) for t in tokens]

    for use_pos in (False, True):
        labeling = tl.UNK4(trees=corpus, threshold=2, use_pos=use_pos)
        label = labels_of(labeling)
        serialization = labeling.serialize()
        print(serialization)
        instance2 = tl.deserialize_labeling(serialization)
        label2 = labels_of(instance2)
        print(label)
        self.assertEqual(label, label2)
        self.assertTrue(isinstance(instance2, labeling.__class__))
def main():
    """Copy a slice of a NEGRA export corpus, stripping virtual roots.

    Command-line arguments: ``<inpath> <outpath> <begin> <end>`` where
    ``begin``/``end`` are the inclusive sentence-number range to copy.
    """
    inpath = sys.argv[1]
    outpath = sys.argv[2]
    begin = int(sys.argv[3])
    end = int(sys.argv[4])
    print(inpath)
    sent_ids = [str(i) for i in range(begin, end + 1)]
    corpus = sentence_names_to_hybridtrees(sent_ids, inpath)
    # Bug fix: map() is lazy in Python 3, so the original
    # `map(lambda x: x.strip_vroot(), corpus)` never executed and the
    # virtual roots were never stripped. Use an explicit loop to force
    # the side effect.
    for tree in corpus:
        tree.strip_vroot()
    with codecs.open(outpath, mode='w', encoding="utf-8") as file:
        lines = serialize_hybridtrees_to_negra(corpus, begin, 2000)
        for line in lines:
            # Bug fix: the original checked `isinstance(line, unicode)`,
            # which raises NameError on Python 3; `str` is the only text
            # type that needs checking here.
            if not isinstance(line, str):
                print(line)
            try:
                file.write(line)
            except UnicodeEncodeError:
                print(line, type(line))
                raise
def test_negra_to_dag_parsing(self):
    """End-to-end DAG pipeline: export a TIGER DAG to NEGRA format,
    binarize it with discodop, extract an LCFRS from the binarized DAG,
    parse the POS yield, evaluate the DCP term back to a DAG, and
    re-serialize the result.

    NOTE(review): an extended method with the same name appears later in
    this file; if both live in the same class, the later definition
    shadows this one — confirm which version is intended.
    """
    pass  # no-op; possibly a leftover from stubbing out the test
    names = list(map(str, [26954]))
    fd_, primary_file = tempfile.mkstemp(suffix='.export')
    with open(primary_file, mode='w') as pf:
        for s in names:
            # read the deep syntax graph for sentence s from the TIGER XML
            dsg = tp.sentence_names_to_deep_syntax_graphs(
                [s],
                "res/tiger/tiger_s%s.xml" % s,
                hold=False,
                ignore_puntcuation=False)[0]
            # drop the first character of the label (presumably an 's'
            # prefix so NEGRA sentence names are numeric — TODO confirm)
            dsg.set_label(dsg.label[1:])
            lines = np.serialize_hybrid_dag_to_negra(
                [dsg], 0, 500, use_sentence_names=True)
            print(''.join(lines), file=pf)
    _, binarized_file = tempfile.mkstemp(suffix='.export')
    # binarize with markovization h=1, v=1 (requires discodop on PATH)
    subprocess.call([
        "discodop", "treetransforms", "--binarize", "-v", "1", "-h", "1",
        primary_file, binarized_file
    ])
    print(primary_file)
    print(binarized_file)
    corpus = np.sentence_names_to_hybridtrees(names, primary_file,
                                              secedge=True)
    corpus2 = np.sentence_names_to_hybridtrees(names, binarized_file,
                                               secedge=True)
    dag = corpus[0]
    print(dag)
    assert isinstance(dag, HybridDag)
    self.assertEqual(8, len(dag.token_yield()))
    for token in dag.token_yield():
        print(token.form() + '/' + token.pos(), end=' ')
    print()
    dag_bin = corpus2[0]
    print(dag_bin)
    for token in dag_bin.token_yield():
        print(token.form() + '/' + token.pos(), end=' ')
    print()
    # binarization must not change the terminal yield
    self.assertEqual(8, len(dag_bin.token_yield()))
    for node, token in zip(
            dag_bin.nodes(),
            list(map(str, map(dag_bin.node_token, dag_bin.nodes())))):
        print(node, token)
    print()
    # check top/bottom node computations on a sample node set
    print(top(dag_bin, {'500', '101', '102'}))
    self.assertSetEqual({'101', '500'}, top(dag_bin, {'500', '101', '102'}))
    print(bottom(dag_bin, {'500', '101', '102'}))
    self.assertSetEqual({'502'}, bottom(dag_bin, {'500', '101', '102'}))
    grammar = direct_extract_lcfrs_from_prebinarized_corpus(dag_bin)
    print(grammar)
    parser = LCFRS_parser(grammar)
    # parse the POS sequence of the binarized DAG
    poss = list(map(lambda x: x.pos(), dag_bin.token_yield()))
    print(poss)
    parser.set_input(poss)
    parser.parse()
    self.assertTrue(parser.recognized())
    der = parser.best_derivation_tree()
    print(der)
    dcp_term = DCP_evaluator(der).getEvaluation()
    print(dcp_term[0])
    dag_eval = HybridDag(dag_bin.sent_label())
    # evaluate the DCP term back into a hybrid DAG over the original tokens
    dcp_to_hybriddag(dag_eval, dcp_term,
                     copy.deepcopy(dag_bin.token_yield()), False,
                     construct_token=construct_constituent_token)
    print(dag_eval)
    for node in dag_eval.nodes():
        token = dag_eval.node_token(node)
        if token.type() == "CONSTITUENT-CATEGORY":
            label = token.category()
        elif token.type() == "CONSTITUENT-TERMINAL":
            label = token.form(), token.pos()
        print(node, label, dag_eval.children(node),
              dag_eval.sec_children(node), dag_eval.sec_parents(node))
    # serialize the evaluated DAG and dump it next to the original file
    lines = np.serialize_hybridtrees_to_negra([dag_eval], 1, 500,
                                              use_sentence_names=True)
    for line in lines:
        print(line, end='')
    print()
    with open(primary_file) as pcf:
        for line in pcf:
            print(line, end='')
def test_negra_dag_small_grammar(self):
    """Extract a single LCFRS/DCP grammar from 100 binarized TIGER DAGs,
    then parse the unbinarized corpus with it and write the results in
    NEGRA export format to a temporary file.

    Requires the two corpus files below to exist; helpful commands for
    creating them are printed otherwise.
    """
    DAG_CORPUS = 'res/tiger/tiger_full_with_sec_edges.export'
    DAG_CORPUS_BIN = 'res/tiger/tiger_full_with_sec_edges_bin_h1_v1.export'
    names = list([str(i) for i in range(1, 101)])
    if not os.path.exists(DAG_CORPUS):
        print(
            'run the following command to create an export corpus with dags:'
        )
        print('\tPYTHONPATH=. util/tiger_dags_to_negra.py ' +
              'res/tiger/tiger_release_aug07.corrected.16012013.xml ' +
              DAG_CORPUS + ' 1 50474')
    self.assertTrue(os.path.exists(DAG_CORPUS))
    if not os.path.exists(DAG_CORPUS_BIN):
        print(
            'run the following command to binarize the export corpus with dags:'
        )
        print("discodop treetransforms --binarize -v 1 -h 1 " + DAG_CORPUS +
              " " + DAG_CORPUS_BIN)
        # _, DAG_CORPUS_BIN = tempfile.mkstemp(prefix='corpus_bin_', suffix='.export')
        # subprocess.call(["discodop", "treetransforms", "--binarize", "-v", "1", "-h", "1", DAG_CORPUS, DAG_CORPUS_BIN])
    self.assertTrue(os.path.exists(DAG_CORPUS_BIN))
    corpus = np.sentence_names_to_hybridtrees(names, DAG_CORPUS,
                                              secedge=True)
    corpus_bin = np.sentence_names_to_hybridtrees(names, DAG_CORPUS_BIN,
                                                  secedge=True)
    # accumulate per-DAG grammars into one grammar
    grammar = LCFRS(start="START")
    for hybrid_dag, hybrid_dag_bin in zip(corpus, corpus_bin):
        # binarization must preserve the terminal yield
        self.assertEqual(len(hybrid_dag.token_yield()),
                         len(hybrid_dag_bin.token_yield()))
        dag_grammar = direct_extract_lcfrs_from_prebinarized_corpus(
            hybrid_dag_bin)
        grammar.add_gram(dag_grammar)
    grammar.make_proper()
    print(
        "Extracted LCFRS/DCP-hybrid grammar with %i nonterminals and %i rules"
        % (len(grammar.nonts()), len(grammar.rules())))
    parser = DiscodopKbestParser(grammar, k=1)
    _, RESULT_FILE = tempfile.mkstemp(prefix='parser_results_',
                                      suffix='.export')
    with open(RESULT_FILE, 'w') as results:
        for hybrid_dag in corpus:
            # parse the POS sequence of each (unbinarized) DAG
            poss = list(map(lambda x: x.pos(), hybrid_dag.token_yield()))
            parser.set_input(poss)
            parser.parse()
            self.assertTrue(parser.recognized())
            der = parser.best_derivation_tree()
            # evaluate the DCP term back into a hybrid DAG
            dcp_term = DCP_evaluator(der).getEvaluation()
            dag_eval = HybridDag(hybrid_dag.sent_label())
            dcp_to_hybriddag(dag_eval, dcp_term,
                             copy.deepcopy(hybrid_dag.token_yield()), False,
                             construct_token=construct_constituent_token)
            lines = np.serialize_hybridtrees_to_negra(
                [dag_eval], 1, 500, use_sentence_names=True)
            for line in lines:
                print(line, end='', file=results)
            parser.clear()
    print("Wrote results to %s" % RESULT_FILE)
def test_something(self):
    """Cross-check direct LCFRS extraction from a prebinarized corpus:
    every tree must be reproduced exactly when parsed with the grammar
    extracted from its own binarized counterpart; additionally the merged
    grammar must be accepted by the discodop k-best parser.
    """
    normal_corpus = 'res/tiger/tiger_8000.export'
    binarized_corpus = 'res/tiger/tiger_8000_bin.export'
    limit = 55000
    # limit = 30
    corpus_bin = sentence_names_to_hybridtrees(
        {str(x) for x in range(limit)},
        binarized_corpus,
        disconnect_punctuation=False,
        add_vroot=True,
        mode="DISCO-DOP")
    corpus = sentence_names_to_hybridtrees({str(x) for x in range(limit)},
                                           normal_corpus,
                                           disconnect_punctuation=False,
                                           add_vroot=True,
                                           mode="DISCO-DOP")
    term_labeling = terminal_labeling(corpus, threshold=4)
    grammar = None
    for htree, htree_bin in zip(corpus, corpus_bin):
        # print(htree_bin)
        try:
            htree_grammar = direct_extract_lcfrs_from_prebinarized_corpus(
                htree_bin, term_labeling=term_labeling)
        except Exception as e:
            # dump the offending tree before re-raising, for debugging
            print(e)
            print(htree_bin)
            print(htree_bin.nodes())
            print(htree_bin.word_yield())
            raise e
        # print(htree_grammar)
        parser_input = term_labeling.prepare_parser_input(
            htree.token_yield())
        p = LCFRS_sDCP_Parser(htree_grammar,
                              terminal_labelling=term_labeling)
        p.set_input(htree)
        p.parse()
        # p = LCFRS_parser(htree_grammar, parser_input)
        self.assertTrue(p.recognized())
        derivs = list(p.all_derivation_trees())
        # print("derivations:", len(derivs))
        for der in derivs:
            # every derivation must evaluate back to the original tree
            dcp = DCP_evaluator(der).getEvaluation()
            sys_tree = HybridTree(htree.sent_label())
            sys_tree = dcp_to_hybridtree(
                sys_tree,
                dcp,
                deepcopy(htree.token_yield()),
                ignore_punctuation=False,
                construct_token=construct_constituent_token)
            # print(sys_tree)
            # print(htree == sys_tree)
            # print(der)
            if htree != sys_tree:
                # print a diff-friendly dump before failing
                print(htree.sent_label())
                print(htree)
                print(sys_tree)
            self.assertEqual(htree, sys_tree)
        # merge this tree's grammar into the accumulated grammar
        if grammar is None:
            grammar = htree_grammar
        else:
            grammar.add_gram(htree_grammar)
        htree_grammar.make_proper()
        try:
            disco_parser = DiscodopKbestParser(htree_grammar)
        except ValueError as ve:
            # dump context for the grammar that discodop rejected
            print(ve)
            print(htree.sent_label())
            print(htree)
            print(htree_bin)
            print(htree_grammar)
            raise ve
    grammar.make_proper()
    disco_parser = DiscodopKbestParser(grammar)
def test_negra_to_dag_parsing(self):
    """Extended end-to-end DAG pipeline: serialize a TIGER DAG to NEGRA
    export, binarize with discodop, extract an LCFRS (with explicit
    nonterminal/terminal labelings), parse and re-serialize; then compute
    reducts with the external Schick parser (a Java jar), read the
    resulting RTGs back, and enumerate the derivations they encode.
    """
    names = list(map(str, [26954]))
    fd_, primary_file = tempfile.mkstemp(suffix='.export')
    with open(primary_file, mode='w') as pf:
        for s in names:
            # read the deep syntax graph for sentence "s<num>" from XML
            dsg = tp.sentence_names_to_deep_syntax_graphs(
                ["s" + s],
                "res/tiger/tiger_s%s.xml" % s,
                hold=False,
                ignore_puntcuation=False)[0]
            # strip the leading 's' so the NEGRA sentence name is numeric
            dsg.set_label(dsg.label[1:])
            lines = np.serialize_hybrid_dag_to_negra(
                [dsg], 0, 500, use_sentence_names=True)
            print(''.join(lines), file=pf)
    _, binarized_file = tempfile.mkstemp(suffix='.export')
    # binarize with markovization h=1, v=1 (requires discodop on PATH)
    subprocess.call([
        "discodop", "treetransforms", "--binarize", "-v", "1", "-h", "1",
        primary_file, binarized_file
    ])
    print(primary_file)
    print(binarized_file)
    corpus = np.sentence_names_to_hybridtrees(names, primary_file,
                                              secedge=True)
    corpus2 = np.sentence_names_to_hybridtrees(names, binarized_file,
                                               secedge=True)
    dag = corpus[0]
    print(dag)
    assert isinstance(dag, HybridDag)
    self.assertEqual(8, len(dag.token_yield()))
    for token in dag.token_yield():
        print(token.form() + '/' + token.pos(), end=' ')
    print()
    dag_bin = corpus2[0]
    print(dag_bin)
    for token in dag_bin.token_yield():
        print(token.form() + '/' + token.pos(), end=' ')
    print()
    # binarization must not change the terminal yield
    self.assertEqual(8, len(dag_bin.token_yield()))
    for node, token in zip(
            dag_bin.nodes(),
            list(map(str, map(dag_bin.node_token, dag_bin.nodes())))):
        print(node, token)
    print()
    # check top/bottom node computations on a sample node set
    print(top(dag_bin, {'500', '101', '102'}))
    self.assertSetEqual({'101', '500'}, top(dag_bin, {'500', '101', '102'}))
    print(bottom(dag_bin, {'500', '101', '102'}))
    self.assertSetEqual({'502'}, bottom(dag_bin, {'500', '101', '102'}))
    nont_labeling = BasicNonterminalLabeling()
    term_labeling = FormTerminals()  # PosTerminals()
    grammar = direct_extract_lcfrs_from_prebinarized_corpus(
        dag_bin, term_labeling, nont_labeling)
    # print(grammar)
    for rule in grammar.rules():
        print(rule.get_idx(), rule)
    print("Testing LCFRS parsing and DCP evaluation".center(80, '='))
    parser = LCFRS_parser(grammar)
    parser_input = term_labeling.prepare_parser_input(
        dag_bin.token_yield())
    print(parser_input)
    parser.set_input(parser_input)
    parser.parse()
    self.assertTrue(parser.recognized())
    der = parser.best_derivation_tree()
    print(der)
    dcp_term = DCP_evaluator(der).getEvaluation()
    print(dcp_term[0])
    dag_eval = HybridDag(dag_bin.sent_label())
    # evaluate the DCP term back into a hybrid DAG over the original tokens
    dcp_to_hybriddag(dag_eval, dcp_term,
                     copy.deepcopy(dag_bin.token_yield()), False,
                     construct_token=construct_constituent_token)
    print(dag_eval)
    for node in dag_eval.nodes():
        token = dag_eval.node_token(node)
        if token.type() == "CONSTITUENT-CATEGORY":
            label = token.category()
        elif token.type() == "CONSTITUENT-TERMINAL":
            label = token.form(), token.pos()
        print(node, label, dag_eval.children(node),
              dag_eval.sec_children(node), dag_eval.sec_parents(node))
    lines = np.serialize_hybridtrees_to_negra([dag_eval], 1, 500,
                                              use_sentence_names=True)
    for line in lines:
        print(line, end='')
    print()
    with open(primary_file) as pcf:
        for line in pcf:
            print(line, end='')
    print('Testing reduct computation with Schick parser'.center(80, '='))
    grammar_path = '/tmp/lcfrs_dcp_grammar.gr'
    derivation_manager = PyDerivationManager(grammar)
    # linearize the grammar to disk for the external parser
    with open(grammar_path, 'w') as grammar_file:
        nonterminal_enc, terminal_enc = linearize(
            grammar,
            nont_labeling,
            term_labeling,
            grammar_file,
            delimiter=' : ',
            nonterminal_encoder=derivation_manager.get_nonterminal_map())
    print(np.negra_to_json(dag, terminal_enc, term_labeling))
    json_data = np.export_corpus_to_json([dag], terminal_enc, term_labeling)
    corpus_path = '/tmp/json_dags.json'
    with open(corpus_path, 'w') as data_file:
        json.dump(json_data, data_file)
    # start from a clean reduct output directory
    reduct_dir = '/tmp/schick_parser_reducts'
    if os.path.isdir(reduct_dir):
        shutil.rmtree(reduct_dir)
    os.makedirs(reduct_dir)
    p = subprocess.Popen([
        ' '.join([
            "java", "-jar",
            os.path.join("util", SCHICK_PARSER_JAR), 'reduct', '-g',
            grammar_path, '-t', corpus_path, "--input-format", "json",
            "-o", reduct_dir
        ])
    ],
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)
    print("stdout", p.stdout.name)
    # stream the external parser's output until the process terminates
    while True:
        nextline = p.stdout.readline()
        if nextline == b'' and p.poll() is not None:
            break
        print(nextline.decode('unicode_escape'), end='')
        # sys.stdout.write(nextline)
        # sys.stdout.flush()
    p.wait()
    p.stdout.close()
    self.assertEqual(0, p.returncode)
    rtgs = []

    def decode_nonterminals(s):
        # map an integer index back to the corresponding nonterminal
        return derivation_manager.get_nonterminal_map().index_object(
            int(s))

    # read one reduct RTG per corpus sentence (files are 1-indexed)
    for i in range(1, len(corpus) + 1):
        rtgs.append(
            read_rtg(os.path.join(reduct_dir, str(i) + '.gra'),
                     symbol_offset=-1,
                     rule_prefix='r',
                     process_nonterminal=decode_nonterminals))
    print("Reduct RTG")
    for rule in rtgs[0].rules:
        print(rule.lhs, "->", rule.symbol, rule.rhs)
    derivation_manager.get_nonterminal_map().print_index()
    derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
    derivation_manager.serialize(
        bytes('/tmp/reduct_manager.trace', encoding='utf8'))
    derivations = [
        LCFRSDerivationWrapper(der)
        for der in derivation_manager.enumerate_derivations(0, grammar)
    ]
    self.assertGreaterEqual(len(derivations), 1)
    if len(derivations) >= 1:
        print("Sentence", i)
        for der in derivations:
            print(der)
            # every enumerated derivation must be well-formed w.r.t. grammar
            self.assertTrue(
                der.check_integrity_recursive(der.root_id(),
                                              grammar.start()))
def get_whole_corpus(self, n=N_NEGRA_SENTENCES):
    """Load the first ``n + 1`` NEGRA sentences as hybrid trees."""
    sentence_names = [num_to_name(num) for num in range(n + 1)]
    return sentence_names_to_hybridtrees(names=sentence_names,
                                         path=NEGRA_PATH)
def get_shortest_tree(self):
    """Return the NEGRA tree with the fewest yield nodes."""
    sentence_names = [num_to_name(num) for num in range(N_NEGRA_SENTENCES)]
    corpus = sentence_names_to_hybridtrees(names=sentence_names,
                                           path=NEGRA_PATH)
    return min(corpus, key=lambda t: t.n_yield_nodes())
def get_trees_for_single_sentence(id=37):
    """Load the hybrid tree(s) for a single NEGRA sentence.

    NOTE: the parameter name shadows the builtin ``id`` but is kept
    unchanged for keyword-argument compatibility with existing callers.
    """
    return sentence_names_to_hybridtrees(names=[num_to_name(id)],
                                         path=NEGRA_PATH)