Code example #1
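Induces a grammar from a single TIGER corpus tree (sentence s26954), labeling nonterminal edges and terminals by POS, then checks that an LCFRS parser recognizes the sentence and produces a best derivation.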
    def test_induction_from_corpus_tree(self):
        dsg = sentence_names_to_deep_syntax_graphs(["s26954"], "res/tiger/tiger_s26954.xml", hold=False)[0]

        def label_edge(edge):
            if isinstance(edge.label, ConstituentTerminal):
                return edge.label.pos()
            else:
                return edge.label
        labeling = lambda nodes, dsg: simple_labeling(nodes, dsg, label_edge)

        term_labeling_token = PosTerminals()

        def term_labeling(token):
            if isinstance(token, ConstituentTerminal):
                return term_labeling_token.token_label(token)
            else:
                return token

        rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('cfg')[0]
        rec_part = rec_part_strategy(dsg)
        dcmp = compute_decomposition(dsg, rec_part)

        grammar = induce_grammar_from(dsg, rec_part, dcmp, labeling=labeling, terminal_labeling=term_labeling)

        print(grammar)

        parser = LCFRS_parser(grammar)
        parser.set_input(term_labeling_token.prepare_parser_input(dsg.sentence))
        parser.parse()
        self.assertTrue(parser.recognized())

        derivation = parser.best_derivation_tree()
        self.assertNotEqual(derivation, None)
Code example #2
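Exports an acyclic DOG, a deep syntax graph (DSG), and a rule DOG to JSON files under /tmp, then induces a grammar with a right-branching recursive partitioning and writes the grammar and a one-sentence corpus to JSON as well.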
    def test_json_grammar_export(self):
        dog = build_acyclic_dog()
        terminals = Enumerator()
        data = dog.export_graph_json(terminals)
        with open('/tmp/json_graph_1.json', 'w') as file:
            json.dump(data, file)

        dsg = build_dsg()
        data = dsg.export_bihypergraph_json(terminals)
        with open('/tmp/json_bigraph_1.json', 'w') as file:
            json.dump(data, file)

        rule_dog = dog_se()
        data2 = rule_dog.export_graph_json(terminals)
        with open('/tmp/json_nonterminal_graph_1.json', 'w') as file:
            json.dump(data2, file)

        terminals.print_index()

        dsg = build_dsg()
        rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('right-branching')[0]
        rec_part = rec_part_strategy(dsg)
        dcmp = compute_decomposition(dsg, rec_part)

        grammar = induce_grammar_from(dsg, rec_part, dcmp, labeling=simple_labeling, terminal_labeling=str)

        print(grammar)
        data = export_dog_grammar_to_json(grammar, terminals)
        with open('/tmp/json_grammar.json', 'w') as file:
            json.dump(data, file)

        with open('/tmp/json_corpus.json', 'w') as file:
            json.dump(export_corpus_to_json([dsg], terminals), file)
Code example #3
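Induces a grammar from a hand-built DSG using simple labeling, parses its sentence, evaluates the best derivation back into a DOG, and verifies via graph isomorphism that the synchronization lists match the original graph positions.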
    def test_induction_with_labeling_strategies(self):
        dsg = build_dsg()
        rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('right-branching')[0]
        rec_part = rec_part_strategy(dsg)
        dcmp = compute_decomposition(dsg, rec_part)

        grammar = induce_grammar_from(dsg, rec_part, dcmp, labeling=simple_labeling, terminal_labeling=str)
        print(grammar)

        parser = LCFRS_parser(grammar)
        parser.set_input(dsg.sentence)  # ["Sie", "entwickelt", "und", "druckt", "Verpackungen", "und", "Etiketten"]
        parser.parse()
        self.assertTrue(parser.recognized())

        derivation = parser.best_derivation_tree()
        self.assertNotEqual(derivation, None)

        dog, sync_list = dog_evaluation(derivation)
        self.assertEqual(dog, dsg.dog)

        self.assertEqual(len(sync_list), len(dsg.sentence))
        # print(dog)
        # print(sync)
        # print(sync_list)

        morphism, _ = dsg.dog.compute_isomorphism(dog)

        for i in range(len(dsg.sentence)):
            self.assertListEqual(list(map(lambda x: morphism[x], dsg.get_graph_position(i))), sync_list[i])
Code example #4
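Induces grammars from a multi-rooted dependency tree for each combination of two top-level labeling strategies and three recursive partitionings, then checks that the parsed hybrid tree equals the original.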
    def test_multiroot(self):
        tree = multi_dep_tree()
        term_pos = the_terminal_labeling_factory().get_strategy(
            'pos').token_label
        fanout_1 = the_recursive_partitioning_factory().get_partitioning(
            'fanout-1')
        for top_level_labeling_strategy in ['strict', 'child']:
            labeling_strategy = the_labeling_factory(
            ).create_simple_labeling_strategy(top_level_labeling_strategy,
                                              'pos+deprel')
            for recursive_partitioning in [[direct_extraction], fanout_1,
                                           [left_branching]]:
                (_, grammar) = induce_grammar([tree], labeling_strategy,
                                              term_pos, recursive_partitioning,
                                              'START')
                print(grammar)

                parser = LCFRS_parser(grammar, 'pA pB pC pD pE'.split(' '))
                print(parser.best_derivation_tree())

                cleaned_tokens = copy.deepcopy(tree.full_token_yield())
                for token in cleaned_tokens:
                    token.set_edge_label('_')
                hybrid_tree = HybridTree()
                hybrid_tree = parser.dcp_hybrid_tree_best_derivation(
                    hybrid_tree, cleaned_tokens, True, construct_conll_token)
                print(hybrid_tree)
                self.assertEqual(tree, hybrid_tree)
Code example #5
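Command-line experiment driver: resolves the labeling, recursive partitioning, terminal labeling, and parser strategies from string arguments, induces a grammar from the training corpus, optionally parses the test corpus, and stores results in an experiment database.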
def run_experiment(db_file, training_corpus, test_corpus, do_parse,
                   ignore_punctuation, length_limit, labeling,
                   terminal_labeling, partitioning, root_default_deprel,
                   disconnected_default_deprel, max_training, max_test):
    labeling_choices = labeling.split('-')
    if len(labeling_choices) == 2:
        nont_labelling = label.the_labeling_factory(
        ).create_simple_labeling_strategy(labeling_choices[0],
                                          labeling_choices[1])
    elif len(labeling_choices) > 2:
        nont_labelling = label.the_labeling_factory(
        ).create_complex_labeling_strategy(labeling_choices)
        # labeling == 'strict-pos-leaf:dep':
        # labeling == 'child-pos-leaf:dep':
    else:
        print("Error: Invalid labeling strategy: " + labeling)
        exit(1)

    rec_par = the_recursive_partitioning_factory().get_partitioning(
        partitioning)
    if rec_par is None:
        print("Error: Invalid recursive partitioning strategy: " +
              partitioning)
        exit(1)

    term_labeling_strategy = the_terminal_labeling_factory().get_strategy(
        terminal_labeling)
    if term_labeling_strategy is None:
        print("Error: Invalid recursive partitioning strategy: " +
              partitioning)
        exit(1)

    parser_type = the_parser_factory().getParser(partitioning)
    if parser_type is None:
        print("Error: Invalid parser type: " + partitioning)
        exit(1)

    connection = experiment_database.initialize_database(db_file)
    grammar, experiment = induce_grammar_from_file(
        training_corpus, connection, nont_labelling, term_labeling_strategy,
        rec_par, max_training, False, 'START', ignore_punctuation)
    if do_parse:
        parse_sentences_from_file(grammar, parser_type, experiment, connection,
                                  test_corpus,
                                  term_labeling_strategy.prepare_parser_input,
                                  length_limit, max_test, False,
                                  ignore_punctuation, root_default_deprel,
                                  disconnected_default_deprel)
    experiment_database.finalize_database(connection)
Code example #6
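Minimal experiment setup: configures the recursive partitioning and naming scheme, loads the NEGRA corpus split, and runs a supertagging experiment.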
def main():
    induction_settings = InductionSettings()
    # get_partitioning returns a list of strategies; take the single one, as in the other examples
    induction_settings.recursive_partitioning = \
        the_recursive_partitioning_factory().get_partitioning('fanout-2-left-to-right')[0]
    induction_settings.naming_scheme = 'child'

    train, dev, test, test_input = setup_corpus_resources(split='NEGRA')

    experiment = SuppertaggingExperiment()
    experiment.resources[TRAINING] = train
    experiment.resources[VALIDATION] = dev
    experiment.resources[TESTING] = test
    experiment.resources[TESTING_INPUT] = test_input

    experiment.run_experiment()
Code example #7
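Builds a small flat HybridTree of four nodes, checks its token yield and recursive partitioning, and applies the fanout-1 partitioning strategy to it.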
    def test_recursive_partitioning_transformation(self):
        tree = HybridTree("mytree")
        ids = ['a', 'b', 'c', 'd']
        for f in ids:
            tree.add_node(f, CoNLLToken(f, '_', '_', '_', '_', '_'), True,
                          True)
            if f != 'a':
                tree.add_child('a', f)
        tree.add_to_root('a')

        print(tree)
        self.assertEqual([token.form() for token in tree.token_yield()], ids)
        self.assertEqual(tree.recursive_partitioning(),
                         (set([0, 1, 2, 3]), [(set([0]), []), (set([1]), []),
                                              (set([2]), []), (set([3]), [])]))
        print(tree.recursive_partitioning())

        [fanout_1] = the_recursive_partitioning_factory().get_partitioning('fanout-1')

        print(fanout_1(tree))
Code example #8
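Full split/merge constituent parsing experiment: configures induction and EM/split-merge settings, loads the corpus resources, and dispatches to one of several disco-dop parsing modes (multi-objective, base grammar, max-rule-product only, or independent multi-objectives).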
def main(split,
         test_mode=False,
         quick=False,
         unk_threshold=4,
         recursive_partitioning="fanout-2-left-to-right",
         nonterminal_naming_scheme="child",
         seed=0,
         threads=8,
         em_epochs=20,
         em_epochs_sm=20,
         sm_cycles=4,
         merge_percentage=50.0,
         predicted_pos=False,
         parsing_mode=MULTI_OBJECTIVES,
         parsing_limit=False,
         k_best=500,
         directory=None
         ):
    induction_settings = InductionSettings()
    induction_settings.recursive_partitioning \
        = the_recursive_partitioning_factory().get_partitioning(recursive_partitioning)[0]
    induction_settings.normalize = True
    induction_settings.disconnect_punctuation = False
    induction_settings.naming_scheme = nonterminal_naming_scheme
    induction_settings.isolate_pos = True

    experiment = ConstituentSMExperiment(induction_settings, directory=directory)
    experiment.organizer.seed = seed
    experiment.organizer.em_epochs = em_epochs
    experiment.organizer.em_epochs_sm = em_epochs_sm
    experiment.organizer.validator_type = "SIMPLE"
    experiment.organizer.max_sm_cycles = sm_cycles

    experiment.organizer.disable_split_merge = False
    experiment.organizer.disable_em = False
    experiment.organizer.merge_percentage = merge_percentage
    experiment.organizer.merge_type = "PERCENT"
    experiment.organizer.threads = threads

    train, dev, test, test_input = setup_corpus_resources(split,
                                                          not test_mode,
                                                          quick,
                                                          test_pred=predicted_pos,
                                                          test_second_half=TEST_SECOND_HALF)
    experiment.resources[TRAINING] = train
    experiment.resources[VALIDATION] = dev
    experiment.resources[TESTING] = test
    experiment.resources[TESTING_INPUT] = test_input

    if "km2003" in split:
        experiment.eval_postprocess_options = ("--reversetransforms=km2003wsj",)

    if parsing_limit:
        experiment.max_sentence_length_for_parsing = 40

    experiment.k_best = k_best
    experiment.backoff = True

    experiment.disco_dop_params["pruning_k"] = 50000
    experiment.read_stage_file()

    # only effective if no terminal labeling was read from stage file
    if experiment.terminal_labeling is None:
        experiment.set_terminal_labeling(terminal_labeling(experiment.read_corpus(experiment.resources[TRAINING]),
                                                           threshold=unk_threshold))

    if parsing_mode == MULTI_OBJECTIVES:
        experiment.parsing_mode = "discodop-multi-method"
        experiment.resources[RESULT] = ScorerAndWriter(experiment,
                                                       directory=experiment.directory,
                                                       logger=experiment.logger,
                                                       secondary_scores=3)
        experiment.run_experiment()
    elif parsing_mode == BASE_GRAMMAR:
        experiment.k_best = 1
        experiment.organizer.project_weights_before_parsing = False
        experiment.parsing_mode = "k-best-rerank-disco-dop"
        experiment.resources[RESULT] = ScorerAndWriter(experiment,
                                                       directory=experiment.directory,
                                                       logger=experiment.logger)
        experiment.run_experiment()
    elif parsing_mode == MAX_RULE_PRODUCT_ONLY:
        experiment.resources[RESULT] = ScorerAndWriter(experiment,
                                                       directory=experiment.directory,
                                                       logger=experiment.logger)
        experiment.parsing_mode = "max-rule-prod-disco-dop"
        experiment.run_experiment()
    elif parsing_mode == MULTI_OBJECTIVES_INDEPENDENT:
        experiment.parsing_mode = "latent-viterbi-disco-dop"
        experiment.run_experiment()

        experiment.parsing_mode = "k-best-rerank-disco-dop"
        experiment.resources[RESULT] = ScorerAndWriter(experiment,
                                                       directory=experiment.directory,
                                                       logger=experiment.logger)
        experiment.run_experiment()

        experiment.resources[RESULT] = ScorerAndWriter(experiment,
                                                       directory=experiment.directory,
                                                       logger=experiment.logger)
        experiment.parsing_mode = "variational-disco-dop"
        experiment.run_experiment()

        experiment.resources[RESULT] = ScorerAndWriter(experiment,
                                                       directory=experiment.directory,
                                                       logger=experiment.logger)
        experiment.parsing_mode = "max-rule-prod-disco-dop"
        experiment.run_experiment()
    else:
        raise ValueError("Invalid parsing mode: " + str(parsing_mode))
Code example #9
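Configuration snippet from a genetic-algorithm experiment script: corpus paths and slices for validation and test data, plus hyperparameters for EM training, split/merge, and the genetic search. Variables such as train_corpus, train_exclude, and the validation slice bounds are defined earlier in the script.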
validation_genetic_path = '../res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/train/train.German.gold.xml'
validation_genetic_corpus = build_corpus(validation_genetic_path,
                                         validation_genetic_start,
                                         validation_genetic_size,
                                         train_exclude)

test_start = 40475
test_limit = test_start + 100
print("test_start =", test_start)
print("test_limit =", test_limit)
test_exclude = train_exclude
test_path = '../res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/dev/dev.German.gold.xml'
test_corpus = build_corpus(test_path, test_start, test_limit, test_exclude)

terminal_labeling = FormPosTerminalsUnk(train_corpus, 20)
recursive_partitioning = the_recursive_partitioning_factory().get_partitioning(
    'fanout-1-left-to-right')[0]

max_length = 2000
em_epochs = 5
seed = 0
merge_percentage = 50.0
sm_cycles = 2
threads = 10
smoothing_factor = 0.05
split_randomization = 5.0

scc_merger_threshold = -0.2

genetic_initial = 2
genetic_population = 3
genetic_cycles = 2
Code example #10
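End-to-end test: induces a grammar from the first 50 TIGER sentences, exports grammar and corpus to JSON, runs the Schick parser's dog-reduct tool as a subprocess, reads the resulting reduct RTGs back in, and checks that every enumerated derivation reproduces the original DSG up to isomorphism.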
    def test_json_corpus_grammar_export(self):
        start = 1
        stop = 50
        # path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
        path = "res/tiger/tiger_8000.xml"
        exclude = []
        dsgs = sentence_names_to_deep_syntax_graphs(
            ['s' + str(i) for i in range(start, stop + 1) if i not in exclude]
            , path
            , hold=False)

        rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('cfg')[0]

        def label_edge(edge):
            if isinstance(edge.label, ConstituentTerminal):
                return edge.label.pos()
            else:
                return edge.label

        nonterminal_labeling = lambda nodes, dsg: simple_labeling(nodes, dsg, label_edge)

        term_labeling_token = PosTerminals()

        def term_labeling(token):
            if isinstance(token, ConstituentTerminal):
                return term_labeling_token.token_label(token)
            else:
                return token

        grammar = induction_on_a_corpus(dsgs, rec_part_strategy, nonterminal_labeling, term_labeling)
        grammar.make_proper()

        terminals = Enumerator()

        data = export_dog_grammar_to_json(grammar, terminals)
        grammar_path = '/tmp/json_grammar.json'
        with open(grammar_path, 'w') as file:
            json.dump(data, file)

        corpus_path = '/tmp/json_corpus.json'
        with open(corpus_path, 'w') as file:
            json.dump(export_corpus_to_json(dsgs, terminals, terminal_labeling=term_labeling), file)

        with open('/tmp/enumerator.enum', 'w') as file:
            terminals.print_index(file)

        reduct_dir = '/tmp/reduct_grammars'
        if os.path.isdir(reduct_dir):
            shutil.rmtree(reduct_dir)
        os.makedirs(reduct_dir)
        p = subprocess.Popen([' '.join(
            ["java", "-jar", os.path.join("util", SCHICK_PARSER_JAR), 'dog-reduct', '-g', grammar_path, '-t',
             corpus_path, "-o", reduct_dir])], shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

        print("stdout", p.stdout.name)

        while True:
            nextline = p.stdout.readline()
            if nextline == b'' and p.poll() is not None:
                break
            print(nextline.decode('unicode_escape'), end='')
            # sys.stdout.write(nextline)
            # sys.stdout.flush()

        p.wait()
        p.stdout.close()
        self.assertEqual(0, p.returncode)

        rtgs = []
        for i in range(1, len(dsgs) + 1):
            rtgs.append(read_rtg('/tmp/reduct_grammars/' + str(i) + '.gra'))

        derivation_manager = PyDerivationManager(grammar)
        derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
        derivation_manager.serialize(bytes('/tmp/reduct_manager.trace', encoding='utf8'))

        f = lambda token: token.pos() if isinstance(token, ConstituentTerminal) else token

        for i, (rtg, dsg) in enumerate(zip(rtgs, dsgs)):
            derivations = [LCFRSDerivationWrapper(der) for der in derivation_manager.enumerate_derivations(i, grammar)]
            self.assertGreaterEqual(len(derivations), 1)
            if len(derivations) > 1:
                print("Sentence", i)
                for der in derivations:
                    print(der)

            for der in derivations:
                dog, sync = dog_evaluation(der)
                dsg2 = DeepSyntaxGraph(der.compute_yield(), dog, sync)
                dsg.dog.project_labels(f)
                dsg.sentence = list(map(f, dsg.sentence))
                self.assertEqual(dsg.sentence, dsg2.sentence)
                morphs = dsg.dog.compute_isomorphism(dsg2.dog)
                self.assertFalse(morphs is None)
                self.assertListEqual([[morphs[0].get(node, node) for node in syncs]
                                      for syncs in dsg.synchronization], dsg2.synchronization)
Code example #11
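Induces a grammar from the first 50 TIGER sentences, parses each sentence with a CFG parser, and scores the resulting deep syntax graphs against the gold graphs on labeled frames and labeled dependencies.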
    def test_induction_on_a_corpus(self):
        interactive = False
        start = 1
        stop = 50
        path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
        # path = "res/tiger/tiger_8000.xml"
        exclude = []
        dsgs = sentence_names_to_deep_syntax_graphs(
            ['s' + str(i) for i in range(start, stop + 1) if i not in exclude]
            , path
            , hold=False)

        rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('cfg')[0]

        def label_edge(edge):
            if isinstance(edge.label, ConstituentTerminal):
                return edge.label.pos()
            else:
                return edge.label
        nonterminal_labeling = lambda nodes, dsg: simple_labeling(nodes, dsg, label_edge)

        term_labeling_token = PosTerminals()
        def term_labeling(token):
            if isinstance(token, ConstituentTerminal):
                return term_labeling_token.token_label(token)
            else:
                return token

        grammar = induction_on_a_corpus(dsgs, rec_part_strategy, nonterminal_labeling, term_labeling, normalize=True)
        grammar.make_proper()

        parser = CFGParser(grammar)

        scorer = PredicateArgumentScoring()

        for dsg in dsgs:
            parser.set_input(term_labeling_token.prepare_parser_input(dsg.sentence))
            parser.parse()
            self.assertTrue(parser.recognized())
            derivation = parser.best_derivation_tree()
            dog, sync = dog_evaluation(derivation)
            dsg2 = DeepSyntaxGraph(dsg.sentence, dog, sync)

            f = lambda token: token.pos() if isinstance(token, ConstituentTerminal) else token
            dsg.dog.project_labels(f)
            parser.clear()

            scorer.add_accuracy_frames(
                dsg.labeled_frames(guard=lambda x: len(x[1]) > 0),
                dsg2.labeled_frames(guard=lambda x: len(x[1]) > 0)
            )

            # print('dsg: ', dsg.dog, '\n', [dsg.get_graph_position(i) for i in range(len(dsg.sentence))],
            # '\n\n parsed: ', dsg2.dog, '\n', [dsg2.get_graph_position(i+1) for i in range(len(dsg2.sentence))])
            # print()
            if interactive:
                if dsg.label == 's50':
                    pass
                if dsg.dog != dog:
                    z1 = render_and_view_dog(dsg.dog, "corpus_" + dsg.label)
                    z2 = render_and_view_dog(dog, "parsed_" + dsg.label)
                    z1.communicate()
                    z2.communicate()

        print("Labeled frames:")
        print("P", scorer.labeled_frame_scorer.precision(), "R", scorer.labeled_frame_scorer.recall(),
              "F1", scorer.labeled_frame_scorer.fmeasure())
        print("Labeled dependencies:")
        print("P", scorer.labeled_dependency_scorer.precision(), "R", scorer.labeled_dependency_scorer.recall(),
              "F1", scorer.labeled_dependency_scorer.fmeasure())