def test_basic_split_merge(self):
        """Smoke-test split/merge training on a grammar induced from two trees.

        A grammar is induced from ``hybrid_tree_1()`` and ``hybrid_tree_2()``,
        passed to ``split_merge_training``, and the rules of every grammar
        produced by the training are written to stderr. The test contains no
        assertions; it only fails if one of the calls raises.
        """
        corpus = (hybrid_tree_1(), hybrid_tree_2())
        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
        nonterminal_labeling = \
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos')

        _, grammar = induce_grammar(
            list(corpus), nonterminal_labeling, terminal_labeling.token_label,
            [cfg], 'START')

        # Dump the induced base grammar for manual inspection.
        for base_rule in grammar.rules():
            print(base_rule, file=stderr)

        print("call S/M Training", file=stderr)

        # NOTE(review): the positional 3 and 5 are tuning arguments of
        # split_merge_training; their meaning is not visible here — confirm
        # against its signature.
        refined_grammars = split_merge_training(
            grammar, terminal_labeling, list(corpus), 3, 5,
            merge_threshold=0.5, debug=False)

        # One numbered rule listing per resulting grammar, separated by a
        # blank line.
        for refined in refined_grammars:
            for index, refined_rule in enumerate(refined.rules()):
                print(index, refined_rule, file=stderr)
            print(file=stderr)

        print("finished S/M Training", file=stderr)
    def test_trace_serialization(self):
        """Check that serialized reduct traces survive a load/serialize cycle.

        The traces computed for two hybrid trees are written to
        ``/tmp/reducts.p``, loaded into a fresh ``PySDCPTraceManager`` and
        written again to ``/tmp/reducts2.p``. Both files must be identical
        line by line and must have the same number of lines.
        """
        tree = hybrid_tree_1()
        tree2 = hybrid_tree_2()
        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        (_, grammar) = induce_grammar(
            [tree, tree2],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'), terminal_labeling.token_label, [cfg], 'START')

        for rule in grammar.rules():
            print(rule, file=stderr)

        trace = compute_reducts(grammar, [tree, tree2], terminal_labeling)
        trace.serialize(b"/tmp/reducts.p")

        # The same grammar object is reused for the manager that loads the
        # traces back from disk.
        grammar_load = grammar
        trace2 = PySDCPTraceManager(grammar_load, terminal_labeling)
        trace2.load_traces_from_file(b"/tmp/reducts.p")
        trace2.serialize(b"/tmp/reducts2.p")

        with open(b"/tmp/reducts.p", "r") as f1, open(b"/tmp/reducts2.p",
                                                      "r") as f2:
            lines1 = f1.readlines()
            lines2 = f2.readlines()

        for line_no, (e1, e2) in enumerate(zip(lines1, lines2), start=1):
            self.assertEqual(
                e1, e2, "serialized traces differ in line %d" % line_no)

        # zip() stops at the shorter input, so without this check a file that
        # is merely a prefix of the other (or empty) would pass unnoticed.
        self.assertEqual(
            len(lines1), len(lines2),
            "serialized trace files have different numbers of lines")
    def test_basic_em_training(self):
        """Smoke-test EM training on a grammar induced from two trees.

        Induces a grammar, computes the reducts (traces) for both trees and
        runs ``PyEMTrainer.em_training`` for 10 epochs. The grammar rules are
        printed to stderr before and after training. There are no assertions;
        the test only fails if one of the calls raises.
        """
        corpus = (hybrid_tree_1(), hybrid_tree_2())
        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
        nonterminal_labeling = \
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos')

        _, grammar = induce_grammar(
            list(corpus), nonterminal_labeling, terminal_labeling.token_label,
            [cfg], 'START')

        # Rules before training.
        for initial_rule in grammar.rules():
            print(initial_rule, file=stderr)

        print("compute reducts", file=stderr)

        trace = compute_reducts(grammar, list(corpus), terminal_labeling)

        print("call em Training", file=stderr)
        em_trainer = PyEMTrainer(trace)
        em_trainer.em_training(grammar, n_epochs=10)

        print("finished em Training", file=stderr)

        # Rules after training (presumably em_training updates the grammar in
        # place — confirm against PyEMTrainer).
        for trained_rule in grammar.rules():
            print(trained_rule, file=stderr)
    # Example #4
    # 0
    def test_grammar_export(self):
        """Export an induced grammar, compile it and parse a sentence with it.

        A grammar is induced from two hybrid trees with ``direct_extraction``,
        exported to ``/tmp/`` and compiled via ``compile_gf_grammar`` (which
        must return 0). The grammar is then used by ``GFParser`` to parse the
        terminal sequence ``NP N V V V``; the best derivation is checked for
        integrity and converted back to hybrid trees, which are printed.
        """
        corpus = (hybrid_tree_1(), hybrid_tree_2())
        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        # Alternative nonterminal labeling that was tried here:
        # the_labeling_factory().create_simple_labeling_strategy('child', 'pos+deprel')
        nonterminal_labeling = \
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos')

        _, grammar = induce_grammar(
            list(corpus), nonterminal_labeling, terminal_labeling.token_label,
            [direct_extraction], 'START')

        fanouts = [grammar.fanout(nont) for nont in grammar.nonts()]
        print(max(fanouts))
        print(grammar)

        export_dir = '/tmp/'
        base_name = 'tmpGrammar'
        exported_name = export(grammar, export_dir, base_name)

        compile_status = compile_gf_grammar(export_dir, exported_name)
        self.assertEqual(0, compile_status)

        GFParser.preprocess_grammar(grammar)

        terminals = ["NP", "N", "V", "V", "V"]
        parser = GFParser(grammar, terminals)
        self.assertTrue(parser.recognized())

        derivation = parser.best_derivation_tree()
        is_consistent = derivation.check_integrity_recursive(
            derivation.root_id(), grammar.start())
        self.assertTrue(is_consistent)

        print(derivation)

        sentence = "Piet Marie helpen lezen leren"
        reconstructed = derivation_to_hybrid_tree(
            derivation, terminals, sentence.split(), construct_conll_token)
        print(reconstructed)

        # Second route: evaluate the DCP of the derivation and build a hybrid
        # tree from the evaluation result.
        evaluation = DCP_evaluator(derivation).getEvaluation()

        evaluated_tree = HybridTree()
        token_sequence = []
        for form, label in zip(sentence.split(' '), 'NP N V V V'.split(' ')):
            token_sequence.append(construct_conll_token(form, label))
        dcp_to_hybridtree(evaluated_tree, evaluation, token_sequence, False,
                          construct_conll_token)

        print(evaluated_tree)
    def test_dcp_evaluation_with_induced_dependency_grammar(self):
        """Parse with an induced grammar and evaluate the DCP of each result.

        A grammar is induced from two hybrid trees ('child'/'pos' labeling,
        ``direct_extraction``) and must be well-formed and ordered. The
        terminal sequence ``NP N V V`` must be recognized; for every
        successful root item a derivation is built, converted to a hybrid
        tree (printed) and its DCP evaluation is turned into a second hybrid
        tree.
        """
        first_tree = hybrid_tree_1()
        print(first_tree)

        second_tree = hybrid_tree_2()
        print(second_tree)

        labeling = the_labeling_factory().create_simple_labeling_strategy(
            'child', 'pos')
        pos_strategy = the_terminal_labeling_factory().get_strategy('pos')
        term_pos = pos_strategy.token_label
        _, grammar = induce_grammar(
            [first_tree, second_tree], labeling, term_pos,
            [direct_extraction], 'START')

        # well_formed() is expected to return None, and the first component
        # of ordered() is expected to equal True.
        self.assertEqual(grammar.well_formed(), None)
        self.assertEqual(grammar.ordered()[0], True)
        print(grammar)

        terminals = 'NP N V V'
        sentence = 'Piet Marie helpen lezen'

        parser = Parser(grammar, terminals.split(' '))
        self.assertEqual(parser.recognized(), True)

        for root_item in parser.successful_root_items():
            derivation = Derivation()
            derivation_tree(derivation, root_item, None)
            print(derivation)

            reconstructed = derivation_to_hybrid_tree(
                derivation, terminals.split(' '), sentence.split(' '),
                construct_constituent_token)
            print(reconstructed)

            # Build a second tree from the DCP evaluation; it is constructed
            # but neither printed nor asserted on.
            evaluation = DCP_evaluator(derivation).getEvaluation()
            evaluated_tree = HybridTree()
            token_sequence = []
            for form, label in zip(sentence.split(' '), terminals.split(' ')):
                token_sequence.append(construct_conll_token(form, label))
            dcp_to_hybridtree(evaluated_tree, evaluation, token_sequence,
                              False, construct_conll_token)
    def test_basic_sdcp_parsing_dependency(self):
        """Smoke-test ``LCFRS_sDCP_Parser`` on a tree from the training set.

        A grammar is induced from two hybrid trees and preprocessed for the
        parser, which is then run on the first tree. For every derivation the
        DCP evaluation is converted to a hybrid tree and printed next to the
        input tree. There are no assertions; the test only fails if a call
        raises.
        """
        first_tree = hybrid_tree_1()
        second_tree = hybrid_tree_2()

        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')
        nonterminal_labeling = \
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos')

        _, grammar = induce_grammar(
            [first_tree, second_tree], nonterminal_labeling,
            terminal_labeling.token_label, [cfg], 'START')

        print("grammar induced. Printing rules...", file=stderr)

        for induced_rule in grammar.rules():
            print(induced_rule, file=stderr)

        print("preprocessing grammar", file=stderr)

        LCFRS_sDCP_Parser.preprocess_grammar(grammar, terminal_labeling)

        print("invoking parser", file=stderr)

        parser = LCFRS_sDCP_Parser(grammar, first_tree)

        print("listing derivations", file=stderr)

        for derivation in parser.all_derivation_trees():
            print(derivation)
            rebuilt_tree = HybridTree()
            # The token yield of the input tree is fetched anew for every
            # derivation.
            input_tokens = first_tree.token_yield()
            evaluation = DCP_evaluator(derivation).getEvaluation()
            dcp_to_hybridtree(rebuilt_tree, evaluation, input_tokens, False,
                              construct_conll_token)
            print(first_tree)
            print(rebuilt_tree)

        print("completed test", file=stderr)