def test_induction_from_corpus_tree(self):
    """Induce a grammar from one Tiger corpus graph and re-parse its sentence."""
    graph = sentence_names_to_deep_syntax_graphs(["s26954"], "res/tiger/tiger_s26954.xml", hold=False)[0]

    def label_edge(edge):
        # Terminal edges are labeled by their POS tag, all others by their plain label.
        return edge.label.pos() if isinstance(edge.label, ConstituentTerminal) else edge.label

    def labeling(nodes, dsg):
        return simple_labeling(nodes, dsg, label_edge)

    term_labeling_token = PosTerminals()

    def term_labeling(token):
        return term_labeling_token.token_label(token) if isinstance(token, ConstituentTerminal) else token

    # Partition the graph with the CFG strategy, decompose, and induce the grammar.
    strategy = the_recursive_partitioning_factory().get_partitioning('cfg')[0]
    partition = strategy(graph)
    decomposition = compute_decomposition(graph, partition)
    grammar = induce_grammar_from(graph, partition, decomposition,
                                  labeling=labeling, terminal_labeling=term_labeling)
    print(grammar)

    # The induced grammar must recognize the very sentence it was induced from.
    parser = LCFRS_parser(grammar)
    parser.set_input(term_labeling_token.prepare_parser_input(graph.sentence))
    parser.parse()
    self.assertTrue(parser.recognized())
    derivation = parser.best_derivation_tree()
    self.assertNotEqual(derivation, None)
def test_json_grammar_export(self):
    """Export graphs, a bigraph, an induced grammar, and a corpus as JSON files under /tmp."""

    def dump_json(obj, path):
        # Serialize one object to the given JSON file.
        with open(path, 'w') as file:
            json.dump(obj, file)

    terminals = Enumerator()

    acyclic = build_acyclic_dog()
    dump_json(acyclic.export_graph_json(terminals), '/tmp/json_graph_1.json')

    dsg = build_dsg()
    dump_json(dsg.export_bihypergraph_json(terminals), '/tmp/json_bigraph_1.json')

    rule_dog = dog_se()
    dump_json(rule_dog.export_graph_json(terminals), '/tmp/json_nonterminal_graph_1.json')

    terminals.print_index()

    # Induce a grammar from a freshly built DSG using right-branching partitioning.
    dsg = build_dsg()
    strategy = the_recursive_partitioning_factory().get_partitioning('right-branching')[0]
    partition = strategy(dsg)
    decomposition = compute_decomposition(dsg, partition)
    grammar = induce_grammar_from(dsg, partition, decomposition,
                                  labeling=simple_labeling, terminal_labeling=str)
    print(grammar)

    dump_json(export_dog_grammar_to_json(grammar, terminals), '/tmp/json_grammar.json')
    dump_json(export_corpus_to_json([dsg], terminals), '/tmp/json_corpus.json')
def test_induction_with_labeling_strategies(self):
    """Induce a grammar with simple labeling, parse, and check graph/sentence alignment."""
    dsg = build_dsg()
    strategy = the_recursive_partitioning_factory().get_partitioning('right-branching')[0]
    partition = strategy(dsg)
    decomposition = compute_decomposition(dsg, partition)
    grammar = induce_grammar_from(dsg, partition, decomposition,
                                  labeling=simple_labeling, terminal_labeling=str)
    print(grammar)

    parser = LCFRS_parser(grammar)
    # ["Sie", "entwickelt", "und", "druckt", "Verpackungen", "und", "Etiketten"]
    parser.set_input(dsg.sentence)
    parser.parse()
    self.assertTrue(parser.recognized())
    derivation = parser.best_derivation_tree()
    self.assertNotEqual(derivation, None)

    # Evaluating the derivation must reproduce the original DOG and one sync entry per token.
    dog, sync_list = dog_evaluation(derivation)
    self.assertEqual(dog, dsg.dog)
    self.assertEqual(len(sync_list), len(dsg.sentence))
    # print(dog)
    # print(sync)
    # print(sync_list)

    # Under the isomorphism, each token's graph positions map onto its sync entry.
    morphism, _ = dsg.dog.compute_isomorphism(dog)
    for i in range(len(dsg.sentence)):
        mapped = [morphism[node] for node in dsg.get_graph_position(i)]
        self.assertListEqual(mapped, sync_list[i])
def test_multiroot(self):
    """Grammar induction round-trips a multi-rooted dependency tree for several strategies."""
    tree = multi_dep_tree()
    term_pos = the_terminal_labeling_factory().get_strategy('pos').token_label
    fanout_1 = the_recursive_partitioning_factory().get_partitioning('fanout-1')

    for naming in ['strict', 'child']:
        labeling_strategy = the_labeling_factory().create_simple_labeling_strategy(naming, 'pos+deprel')
        for partitioning in [[direct_extraction], fanout_1, [left_branching]]:
            (_, grammar) = induce_grammar([tree], labeling_strategy, term_pos, partitioning, 'START')
            print(grammar)

            parser = LCFRS_parser(grammar, 'pA pB pC pD pE'.split(' '))
            print(parser.best_derivation_tree())

            # Strip edge labels before reconstruction; the DCP evaluation restores them.
            cleaned_tokens = copy.deepcopy(tree.full_token_yield())
            for token in cleaned_tokens:
                token.set_edge_label('_')

            hybrid_tree = HybridTree()
            hybrid_tree = parser.dcp_hybrid_tree_best_derivation(
                hybrid_tree, cleaned_tokens, True, construct_conll_token)
            print(hybrid_tree)
            # The reconstructed tree must equal the original.
            self.assertEqual(tree, hybrid_tree)
def run_experiment(db_file, training_corpus, test_corpus, do_parse, ignore_punctuation, length_limit,
                   labeling, terminal_labeling, partitioning, root_default_deprel,
                   disconnected_default_deprel, max_training, max_test):
    """Induce a grammar from a training corpus and optionally parse a test corpus.

    Results are recorded in the experiment database at ``db_file``.  Exits the
    process with status 1 if any of the strategy names cannot be resolved.
    """
    # Resolve the nonterminal labeling strategy; names are dash-separated,
    # e.g. 'strict-pos' (simple) or longer chains (complex).
    labeling_choices = labeling.split('-')
    if len(labeling_choices) == 2:
        nont_labelling = label.the_labeling_factory().create_simple_labeling_strategy(
            labeling_choices[0], labeling_choices[1])
    elif len(labeling_choices) > 2:
        nont_labelling = label.the_labeling_factory().create_complex_labeling_strategy(labeling_choices)
        # labeling == 'strict-pos-leaf:dep':
        # labeling == 'child-pos-leaf:dep':
    else:
        print("Error: Invalid labeling strategy: " + labeling)
        exit(1)

    rec_par = the_recursive_partitioning_factory().get_partitioning(partitioning)
    if rec_par is None:
        print("Error: Invalid recursive partitioning strategy: " + partitioning)
        exit(1)

    term_labeling_strategy = the_terminal_labeling_factory().get_strategy(terminal_labeling)
    if term_labeling_strategy is None:
        # Fixed copy-paste bug: this message previously blamed the recursive
        # partitioning strategy instead of the terminal labeling strategy.
        print("Error: Invalid terminal labeling strategy: " + terminal_labeling)
        exit(1)

    parser_type = the_parser_factory().getParser(partitioning)
    if parser_type is None:
        print("Error: Invalid parser type: " + partitioning)
        exit(1)

    connection = experiment_database.initialize_database(db_file)
    grammar, experiment = induce_grammar_from_file(
        training_corpus, connection, nont_labelling, term_labeling_strategy, rec_par,
        max_training, False, 'START', ignore_punctuation)
    if do_parse:
        parse_sentences_from_file(grammar, parser_type, experiment, connection, test_corpus,
                                  term_labeling_strategy.prepare_parser_input, length_limit,
                                  max_test, False, ignore_punctuation,
                                  root_default_deprel, disconnected_default_deprel)
    experiment_database.finalize_database(connection)
def main():
    """Configure and run a supertagging experiment on the NEGRA split."""
    induction_settings = InductionSettings()
    # Fixed strategy name: the original used 'fanout-2-left-to_right' (with an
    # underscore), which does not match the dash-separated naming used by every
    # other factory lookup in this file ('fanout-2-left-to-right',
    # 'fanout-1-left-to-right') and would make the lookup fail.
    induction_settings.recursive_partitioning = \
        the_recursive_partitioning_factory().get_partitioning('fanout-2-left-to-right')
    induction_settings.naming_scheme = 'child'

    train, dev, test, test_input = setup_corpus_resources(split='NEGRA')

    experiment = SuppertaggingExperiment()
    experiment.resources[TRAINING] = train
    experiment.resources[VALIDATION] = dev
    experiment.resources[TESTING] = test
    experiment.resources[TESTING_INPUT] = test_input
    experiment.run_experiment()
def test_recursive_partitioning_transformation(self):
    """A flat 4-node tree yields the expected trivial recursive partitioning."""
    tree = HybridTree("mytree")
    ids = ['a', 'b', 'c', 'd']
    for node_id in ids:
        tree.add_node(node_id, CoNLLToken(node_id, '_', '_', '_', '_', '_'), True, True)
        if node_id != 'a':
            tree.add_child('a', node_id)
    tree.add_to_root('a')
    print(tree)

    self.assertEqual([token.form() for token in tree.token_yield()], ids)
    # Root span covers all positions; each child is a singleton leaf partition.
    expected = ({0, 1, 2, 3}, [({position}, []) for position in range(4)])
    self.assertEqual(tree.recursive_partitioning(), expected)
    print(tree.recursive_partitioning())

    [fanout_1] = the_recursive_partitioning_factory().get_partitioning('fanout-1')
    print(fanout_1(tree))
def main(split, test_mode=False, quick=False, unk_threshold=4,
         recursive_partitioning="fanout-2-left-to-right",
         nonterminal_naming_scheme="child",
         seed=0, threads=8, em_epochs=20, em_epochs_sm=20, sm_cycles=4,
         merge_percentage=50.0, predicted_pos=False,
         parsing_mode=MULTI_OBJECTIVES, parsing_limit=False, k_best=500,
         directory=None):
    """Run a constituent split/merge experiment for the given corpus split.

    Configures induction, EM/split-merge training, and one of several
    disco-dop parsing regimes selected by ``parsing_mode``.

    :raises ValueError: if ``parsing_mode`` is not one of the known modes.
    """
    induction_settings = InductionSettings()
    induction_settings.recursive_partitioning \
        = the_recursive_partitioning_factory().get_partitioning(recursive_partitioning)[0]
    induction_settings.normalize = True
    induction_settings.disconnect_punctuation = False
    induction_settings.naming_scheme = nonterminal_naming_scheme
    induction_settings.isolate_pos = True

    experiment = ConstituentSMExperiment(induction_settings, directory=directory)
    experiment.organizer.seed = seed
    experiment.organizer.em_epochs = em_epochs
    experiment.organizer.em_epochs_sm = em_epochs_sm
    experiment.organizer.validator_type = "SIMPLE"
    experiment.organizer.max_sm_cycles = sm_cycles
    experiment.organizer.disable_split_merge = False
    experiment.organizer.disable_em = False
    experiment.organizer.merge_percentage = merge_percentage
    experiment.organizer.merge_type = "PERCENT"
    experiment.organizer.threads = threads

    train, dev, test, test_input = setup_corpus_resources(split, not test_mode, quick,
                                                          test_pred=predicted_pos,
                                                          test_second_half=TEST_SECOND_HALF)
    experiment.resources[TRAINING] = train
    experiment.resources[VALIDATION] = dev
    experiment.resources[TESTING] = test
    experiment.resources[TESTING_INPUT] = test_input

    if "km2003" in split:
        # Undo the km2003 WSJ transformations before evaluation.
        experiment.eval_postprocess_options = ("--reversetransforms=km2003wsj",)
    if parsing_limit:
        experiment.max_sentence_length_for_parsing = 40

    experiment.k_best = k_best
    experiment.backoff = True
    experiment.disco_dop_params["pruning_k"] = 50000
    experiment.read_stage_file()

    # only effective if no terminal labeling was read from stage file
    if experiment.terminal_labeling is None:
        experiment.set_terminal_labeling(
            terminal_labeling(experiment.read_corpus(experiment.resources[TRAINING]),
                              threshold=unk_threshold))

    if parsing_mode == MULTI_OBJECTIVES:
        # Single run; disco-dop computes all objectives in one pass.
        experiment.parsing_mode = "discodop-multi-method"
        experiment.resources[RESULT] = ScorerAndWriter(experiment,
                                                       directory=experiment.directory,
                                                       logger=experiment.logger,
                                                       secondary_scores=3)
        experiment.run_experiment()
    elif parsing_mode == BASE_GRAMMAR:
        # Parse with the unrefined base grammar (1-best, no weight projection).
        experiment.k_best = 1
        experiment.organizer.project_weights_before_parsing = False
        experiment.parsing_mode = "k-best-rerank-disco-dop"
        experiment.resources[RESULT] = ScorerAndWriter(experiment,
                                                       directory=experiment.directory,
                                                       logger=experiment.logger)
        experiment.run_experiment()
    elif parsing_mode == MAX_RULE_PRODUCT_ONLY:
        experiment.resources[RESULT] = ScorerAndWriter(experiment,
                                                       directory=experiment.directory,
                                                       logger=experiment.logger)
        experiment.parsing_mode = "max-rule-prod-disco-dop"
        experiment.run_experiment()
    elif parsing_mode == MULTI_OBJECTIVES_INDEPENDENT:
        # Run each objective as a separate experiment, with a fresh scorer per run.
        experiment.parsing_mode = "latent-viterbi-disco-dop"
        experiment.run_experiment()
        experiment.parsing_mode = "k-best-rerank-disco-dop"
        experiment.resources[RESULT] = ScorerAndWriter(experiment,
                                                       directory=experiment.directory,
                                                       logger=experiment.logger)
        experiment.run_experiment()
        experiment.resources[RESULT] = ScorerAndWriter(experiment,
                                                       directory=experiment.directory,
                                                       logger=experiment.logger)
        experiment.parsing_mode = "variational-disco-dop"
        experiment.run_experiment()
        experiment.resources[RESULT] = ScorerAndWriter(experiment,
                                                       directory=experiment.directory,
                                                       logger=experiment.logger)
        experiment.parsing_mode = "max-rule-prod-disco-dop"
        experiment.run_experiment()
    else:
        # Fixed typo in the error message ("mod" -> "mode").
        raise ValueError("Invalid parsing mode: ", parsing_mode)
# --- Experiment configuration (script fragment) ---
# NOTE(review): relies on names defined earlier in the script
# (build_corpus, train_corpus, train_exclude, validation_genetic_start,
# validation_genetic_size, FormPosTerminalsUnk) — not visible here.

# Validation corpus for the genetic-algorithm search, drawn from the
# German SPMRL gold training set.
validation_genetic_path = '../res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/train/train.German.gold.xml'
validation_genetic_corpus = build_corpus(validation_genetic_path, validation_genetic_start,
                                         validation_genetic_size, train_exclude)

# Test section: 100 sentences taken from the German SPMRL gold dev set.
test_start = 40475
test_limit = test_start + 100
print("test_start =", test_start)
print("test_limit =", test_limit)
test_exclude = train_exclude
test_path = '../res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/dev/dev.German.gold.xml'
test_corpus = build_corpus(test_path, test_start, test_limit, test_exclude)

# Terminal labeling: word form + POS, with forms seen fewer than 20 times
# replaced by an unknown token (presumably — confirm against FormPosTerminalsUnk).
terminal_labeling = FormPosTerminalsUnk(train_corpus, 20)
recursive_partitioning = the_recursive_partitioning_factory().get_partitioning(
    'fanout-1-left-to-right')[0]
max_length = 2000

# EM / split-merge training hyperparameters.
em_epochs = 5
seed = 0
merge_percentage = 50.0
sm_cycles = 2
threads = 10
smoothing_factor = 0.05
split_randomization = 5.0
scc_merger_threshold = -0.2

# Genetic-algorithm search parameters.
genetic_initial = 2
genetic_population = 3
genetic_cycles = 2
def test_json_corpus_grammar_export(self):
    """Export an induced grammar + corpus to JSON, compute reducts with the
    external Schick parser (Java), and check each reduct's derivations
    against the original corpus graph.
    """
    # Corpus slice: sentences s1..s50 from the small Tiger sample.
    start = 1
    stop = 50
    # path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
    path = "res/tiger/tiger_8000.xml"
    exclude = []
    dsgs = sentence_names_to_deep_syntax_graphs(
        ['s' + str(i) for i in range(start, stop + 1) if i not in exclude]
        , path
        , hold=False)

    rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('cfg')[0]

    def label_edge(edge):
        # POS tag for terminal edges, plain label otherwise.
        if isinstance(edge.label, ConstituentTerminal):
            return edge.label.pos()
        else:
            return edge.label

    nonterminal_labeling = lambda nodes, dsg: simple_labeling(nodes, dsg, label_edge)

    term_labeling_token = PosTerminals()

    def term_labeling(token):
        if isinstance(token, ConstituentTerminal):
            return term_labeling_token.token_label(token)
        else:
            return token

    grammar = induction_on_a_corpus(dsgs, rec_part_strategy, nonterminal_labeling, term_labeling)
    grammar.make_proper()

    # Serialize grammar, corpus, and the terminal enumerator for the Java tool.
    terminals = Enumerator()
    data = export_dog_grammar_to_json(grammar, terminals)
    grammar_path = '/tmp/json_grammar.json'
    with open(grammar_path, 'w') as file:
        json.dump(data, file)

    corpus_path = '/tmp/json_corpus.json'
    with open(corpus_path, 'w') as file:
        json.dump(export_corpus_to_json(dsgs, terminals, terminal_labeling=term_labeling), file)

    with open('/tmp/enumerator.enum', 'w') as file:
        terminals.print_index(file)

    # Fresh output directory for the per-sentence reduct grammars.
    reduct_dir = '/tmp/reduct_grammars'
    if os.path.isdir(reduct_dir):
        shutil.rmtree(reduct_dir)
    os.makedirs(reduct_dir)

    # Run the external reduct computation; stderr is merged into stdout so the
    # echo loop below shows everything the tool prints.
    p = subprocess.Popen([' '.join(
        ["java", "-jar",
         os.path.join("util", SCHICK_PARSER_JAR), 'dog-reduct',
         '-g', grammar_path,
         '-t', corpus_path,
         "-o", reduct_dir])], shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    print("stdout", p.stdout.name)

    # Stream the subprocess output line by line until it terminates.
    while True:
        nextline = p.stdout.readline()
        if nextline == b'' and p.poll() is not None:
            break
        print(nextline.decode('unicode_escape'), end='')
        # sys.stdout.write(nextline)
        # sys.stdout.flush()

    p.wait()
    p.stdout.close()
    self.assertEqual(0, p.returncode)

    # Read back one reduct grammar per sentence (files are 1-indexed).
    rtgs = []
    for i in range(1, len(dsgs) + 1):
        rtgs.append(read_rtg('/tmp/reduct_grammars/' + str(i) + '.gra'))

    derivation_manager = PyDerivationManager(grammar)
    derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
    derivation_manager.serialize(bytes('/tmp/reduct_manager.trace', encoding='utf8'))

    # Project terminal tokens to their POS tag for comparison.
    f = lambda token: token.pos() if isinstance(token, ConstituentTerminal) else token

    for i, (rtg, dsg) in enumerate(zip(rtgs, dsgs)):
        derivations = [LCFRSDerivationWrapper(der)
                       for der in derivation_manager.enumerate_derivations(i, grammar)]
        self.assertGreaterEqual(len(derivations), 1)
        if len(derivations) > 1:
            print("Sentence", i)
            for der in derivations:
                print(der)
        for der in derivations:
            # Evaluate the derivation and compare the resulting DSG with the
            # (label-projected) corpus graph.
            dog, sync = dog_evaluation(der)
            dsg2 = DeepSyntaxGraph(der.compute_yield(), dog, sync)
            # NOTE(review): mutates the corpus graph in place; harmless here
            # because projection is idempotent, but worth confirming.
            dsg.dog.project_labels(f)
            dsg.sentence = list(map(f, dsg.sentence))
            self.assertEqual(dsg.sentence, dsg2.sentence)
            morphs = dsg.dog.compute_isomorphism(dsg2.dog)
            self.assertFalse(morphs is None)
            # Synchronization must agree up to the computed node isomorphism.
            self.assertListEqual([[morphs[0].get(node, node) for node in syncs]
                                  for syncs in dsg.synchronization],
                                 dsg2.synchronization)
    pass
def test_induction_on_a_corpus(self):
    """Induce a grammar from 50 Tiger sentences, re-parse each sentence, and
    report predicate-argument scores (labeled frames and dependencies).
    """
    # Set to True to render mismatching graphs for manual inspection.
    interactive = False
    start = 1
    stop = 50
    path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
    # path = "res/tiger/tiger_8000.xml"
    exclude = []
    dsgs = sentence_names_to_deep_syntax_graphs(
        ['s' + str(i) for i in range(start, stop + 1) if i not in exclude]
        , path
        , hold=False)

    rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('cfg')[0]

    def label_edge(edge):
        # POS tag for terminal edges, plain label otherwise.
        if isinstance(edge.label, ConstituentTerminal):
            return edge.label.pos()
        else:
            return edge.label

    nonterminal_labeling = lambda nodes, dsg: simple_labeling(nodes, dsg, label_edge)

    term_labeling_token = PosTerminals()

    def term_labeling(token):
        if isinstance(token, ConstituentTerminal):
            return term_labeling_token.token_label(token)
        else:
            return token

    grammar = induction_on_a_corpus(dsgs, rec_part_strategy, nonterminal_labeling,
                                    term_labeling, normalize=True)
    grammar.make_proper()

    parser = CFGParser(grammar)
    scorer = PredicateArgumentScoring()

    for dsg in dsgs:
        # Every training sentence must be recognized by the induced grammar.
        parser.set_input(term_labeling_token.prepare_parser_input(dsg.sentence))
        parser.parse()
        self.assertTrue(parser.recognized())
        derivation = parser.best_derivation_tree()

        # Build the parsed DSG from the best derivation.
        dog, sync = dog_evaluation(derivation)
        dsg2 = DeepSyntaxGraph(dsg.sentence, dog, sync)
        # Project corpus-graph labels to POS so both sides are comparable.
        f = lambda token: token.pos() if isinstance(token, ConstituentTerminal) else token
        dsg.dog.project_labels(f)
        parser.clear()

        # Accumulate precision/recall over frames with at least one argument.
        scorer.add_accuracy_frames(
            dsg.labeled_frames(guard=lambda x: len(x[1]) > 0),
            dsg2.labeled_frames(guard=lambda x: len(x[1]) > 0)
        )

        # print('dsg: ', dsg.dog, '\n', [dsg.get_graph_position(i) for i in range(len(dsg.sentence))],
        #       '\n\n parsed: ', dsg2.dog, '\n', [dsg2.get_graph_position(i+1) for i in range(len(dsg2.sentence))])
        # print()
        if interactive:
            if dsg.label == 's50':
                pass
            if dsg.dog != dog:
                z1 = render_and_view_dog(dsg.dog, "corpus_" + dsg.label)
                z2 = render_and_view_dog(dog, "parsed_" + dsg.label)
                z1.communicate()
                z2.communicate()

    # Summary over the whole corpus slice.
    print("Labeled frames:")
    print("P", scorer.labeled_frame_scorer.precision(),
          "R", scorer.labeled_frame_scorer.recall(),
          "F1", scorer.labeled_frame_scorer.fmeasure())
    print("Labeled dependencies:")
    print("P", scorer.labeled_dependency_scorer.precision(),
          "R", scorer.labeled_dependency_scorer.recall(),
          "F1", scorer.labeled_dependency_scorer.fmeasure())