コード例 #1
0
 def setUp(self):
     self.tree = HybridTree()
     self.tree.add_node("v1", construct_conll_token("Piet", "NP"), True)
     self.tree.add_node("v21", construct_conll_token("Marie", "N"), True)
     self.tree.add_node("v", construct_conll_token("helpen", "VP"), True)
     self.tree.add_node("v2", construct_conll_token("lezen", "V"), True)
     self.tree.add_child("v", "v2")
     self.tree.add_child("v", "v1")
     self.tree.add_child("v2", "v21")
     self.tree.add_node("v3", construct_conll_token(".", "Punc"), True,
                        False)
     self.tree.add_to_root("v")
コード例 #2
0
def fall_back_left_branching(forms, poss):
    tree = HybridTree()
    for i, (form, pos) in enumerate(zip(forms, poss)):
        token = construct_conll_token(form, pos)
        token.set_edge_label('_')
        tree.add_node(i, token, True)
        if i == 0:
            tree.add_to_root(i)
        else:
            tree.add_child(i - 1, i)
    return tree
コード例 #3
0
    def test_grammar_export(self):
        tree = hybrid_tree_1()
        tree2 = hybrid_tree_2()
        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        _, grammar = induce_grammar(
            [tree, tree2],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'),
            # the_labeling_factory().create_simple_labeling_strategy('child', 'pos+deprel'),
            terminal_labeling.token_label,
            [direct_extraction],
            'START')
        print(max([grammar.fanout(nont) for nont in grammar.nonts()]))
        print(grammar)

        prefix = '/tmp/'
        name = 'tmpGrammar'

        name_ = export(grammar, prefix, name)

        self.assertEqual(0, compile_gf_grammar(prefix, name_))

        GFParser.preprocess_grammar(grammar)

        string = ["NP", "N", "V", "V", "V"]

        parser = GFParser(grammar, string)

        self.assertTrue(parser.recognized())

        der = parser.best_derivation_tree()
        self.assertTrue(
            der.check_integrity_recursive(der.root_id(), grammar.start()))

        print(der)

        print(
            derivation_to_hybrid_tree(der, string,
                                      "Piet Marie helpen lezen leren".split(),
                                      construct_conll_token))

        dcp = DCP_evaluator(der).getEvaluation()

        h_tree_2 = HybridTree()
        token_sequence = [
            construct_conll_token(form, lemma)
            for form, lemma in zip('Piet Marie helpen lezen leren'.split(' '),
                                   'NP N V V V'.split(' '))
        ]
        dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                          construct_conll_token)

        print(h_tree_2)
コード例 #4
0
    def test_cfg_parser(self):
        tree = hybrid_tree_1()
        tree2 = hybrid_tree_2()
        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        (_, grammar) = induce_grammar(
            [tree, tree2],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'), terminal_labeling.token_label, [cfg], 'START')

        for parser_class in [LCFRS_parser, CFGParser]:

            parser_class.preprocess_grammar(grammar)

            string = ["NP", "N", "V", "V", "V"]

            parser = parser_class(grammar, string)

            self.assertTrue(parser.recognized())

            der = parser.best_derivation_tree()
            self.assertTrue(
                der.check_integrity_recursive(der.root_id(), grammar.start()))

            print(der)

            print(
                derivation_to_hybrid_tree(
                    der, string, "Piet Marie helpen lezen leren".split(),
                    construct_conll_token))

            dcp = DCP_evaluator(der).getEvaluation()

            h_tree_2 = HybridTree()
            token_sequence = [
                construct_conll_token(form, lemma) for form, lemma in zip(
                    'Piet Marie helpen lezen leren'.split(' '),
                    'NP N V V V'.split(' '))
            ]
            dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                              construct_conll_token)

            print(h_tree_2)
コード例 #5
0
    def generic_parsing_test(self, parser_type, limit_train, limit_test,
                             compare_order):
        def filter_by_id(n, trees):
            j = 0
            for tree in trees:
                if j in n:
                    yield tree
                j += 1

        #params
        train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
        test = train
        # test = 'res/dependency_conll/german/tiger/test/german_tiger_test.conll'
        trees = parse_conll_corpus(train, False, limit_train)
        primary_labelling = the_labeling_factory(
        ).create_simple_labeling_strategy("childtop", "deprel")
        term_labelling = the_terminal_labeling_factory().get_strategy('pos')
        start = 'START'
        recursive_partitioning = [cfg]

        (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling,
                                                 term_labelling.token_label,
                                                 recursive_partitioning, start)

        parser_type.preprocess_grammar(grammar_prim, term_labelling)

        trees = parse_conll_corpus(test, False, limit_test)

        count_derivs = {}
        no_complete_match = 0

        for i, tree in enumerate(trees):
            print("Parsing tree for ", i, file=stderr)

            print(tree, file=stderr)

            parser = parser_type(grammar_prim, tree)
            self.assertTrue(parser.recognized())
            count_derivs[i] = 0

            print("Found derivations for ", i, file=stderr)
            j = 0

            derivations = []

            for der in parser.all_derivation_trees():
                self.assertTrue(
                    der.check_integrity_recursive(der.root_id(), start))

                print(count_derivs[i], file=stderr)
                print(der, file=stderr)

                output_tree = HybridTree()
                tokens = tree.token_yield()

                the_yield = der.compute_yield()
                # print >>stderr, the_yield
                tokens2 = list(
                    map(lambda pos: construct_conll_token('_', pos),
                        the_yield))

                dcp_to_hybridtree(output_tree,
                                  DCP_evaluator(der).getEvaluation(),
                                  tokens2,
                                  False,
                                  construct_conll_token,
                                  reorder=False)
                print(tree, file=stderr)
                print(output_tree, file=stderr)

                self.compare_hybrid_trees(tree, output_tree, compare_order)
                count_derivs[i] += 1
                derivations.append(der)

            self.assertTrue(
                sDCPParserTest.pairwise_different(
                    derivations, sDCPParserTest.compare_derivations))
            self.assertEqual(len(derivations), count_derivs[i])

            if count_derivs[i] == 0:
                no_complete_match += 1

        for key in count_derivs:
            print(key, count_derivs[key])

        print("# trees with no complete match:", no_complete_match)
コード例 #6
0
def parse_with_pgf(grammar, forms, poss, bin):
    """"
    :type grammar: PGF
    :return:
    :rtype:
    """
    lcfrs = grammar.languages[bin + 'grammargfconcrete']

    # sentence = "ADJD ADV _COMMA_ KOUS ADV PIS PROAV VVINF VMFIN _PUNCT_"
    sentence = ' '.join(map(escape, poss))

    try:
        i = lcfrs.parse(sentence, n=1)
        p, e = next(i)
    except (StopIteration, pgf.ParseError):
        return None

    # print_ast(gr, e, 0)
    s = lcfrs.graphvizParseTree(e)
    assert isinstance(s, str)
    s_ = s.splitlines()

    tree = HybridTree()

    # print s
    i = 0
    for line in s.splitlines():
        match = re.search(r'^\s*(n\d+)\[label="([^\s]+)"\]\s*$', line)
        if match:
            node_id = match.group(1)
            label = match.group(2)
            order = int(node_id[1:]) >= 100000
            if order:
                assert escape(poss[i]) == label
                tree.add_node(
                    node_id,
                    construct_constituent_token(form=forms[i],
                                                pos=poss[i],
                                                terminal=True), True)
                i += 1
            else:
                tree.add_node(
                    node_id,
                    construct_constituent_token(form=label,
                                                pos='_',
                                                terminal=False), False)
            # print node_id, label
            if label == 'VROOT1':
                tree.add_to_root(node_id)
            continue
        match = re.search(r'^  (n\d+) -- (n\d+)\s*$', line)
        if match:
            parent = match.group(1)
            child = match.group(2)
            tree.add_child(parent, child)
            # print line
            # print parent, child
            continue

    # print tree

    assert poss == [token.pos() for token in tree.token_yield()]
    # print the_yield

    dep_tree = HybridTree()
    head_table = defaultdict(lambda: None)
    attachment_point = defaultdict(lambda: None)
    for i, node in enumerate(tree.id_yield()):
        token = tree.node_token(node)
        dep_token = construct_conll_token(token.form(), un_escape(token.pos()))
        current = tree.parent(node)
        current = tree.parent(current)
        while current:
            current_label = tree.node_token(current).category()
            if not re.search(r'\d+X\d+$', current_label):
                s = un_escape(current_label)
                if s == 'TOP1':
                    s = 'ROOT1'
                dep_token.set_edge_label(s[:-1])
                head_table[current] = i + 1
                attachment_point[node] = current
                break
            else:
                current = tree.parent(current)
        dep_tree.add_node(i + 1, dep_token, order=True)

    # print head_table

    for node, dep_node in zip(tree.id_yield(), dep_tree.id_yield()):
        node = tree.parent(attachment_point[node])
        while node:
            if head_table[node]:
                dep_tree.add_child(head_table[node], dep_node)
                break
            node = tree.parent(node)
        if not node:
            dep_tree.add_to_root(dep_node)

    # print "dep_tree"
    # print dep_tree
    # print ' '.join(['(' + token.form() + '/' + token.deprel() + ')' for token in dep_tree.token_yield()])
    return dep_tree
コード例 #7
0
    def test_single_root_induction(self):
        tree = hybrid_tree_1()
        # print tree.children("v")
        # print tree
        #
        # for id_set in ['v v1 v2 v21'.split(' '), 'v1 v2'.split(' '),
        # 'v v21'.split(' '), ['v'], ['v1'], ['v2'], ['v21']]:
        # print id_set, 'top:', top(tree, id_set), 'bottom:', bottom(tree, id_set)
        # print id_set, 'top_max:', max(tree, top(tree, id_set)), 'bottom_max:', max(tree, bottom(tree, id_set))
        #
        # print "some rule"
        # for mem, arg in [(-1, 0), (0,0), (1,0)]:
        # print create_DCP_rule(mem, arg, top_max(tree, ['v','v1','v2','v21']), bottom_max(tree, ['v','v1','v2','v21']),
        # [(top_max(tree, l), bottom_max(tree, l)) for l in [['v1', 'v2'], ['v', 'v21']]])
        #
        #
        # print "some other rule"
        # for mem, arg in [(-1,1),(1,0)]:
        # print create_DCP_rule(mem, arg, top_max(tree, ['v1','v2']), bottom_max(tree, ['v1','v2']),
        # [(top_max(tree, l), bottom_max(tree, l)) for l in [['v1'], ['v2']]])
        #
        # print 'strict:' , strict_labeling(tree, top_max(tree, ['v','v21']), bottom_max(tree, ['v','v21']))
        # print 'child:' , child_labeling(tree, top_max(tree, ['v','v21']), bottom_max(tree, ['v','v21']))
        # print '---'
        # print 'strict: ', strict_labeling(tree, top_max(tree, ['v1','v21']), bottom_max(tree, ['v1','v21']))
        # print 'child: ', child_labeling(tree, top_max(tree, ['v1','v21']), bottom_max(tree, ['v1','v21']))
        # print '---'
        # print 'strict:' , strict_labeling(tree, top_max(tree, ['v','v1', 'v21']), bottom_max(tree, ['v','v1', 'v21']))
        # print 'child:' , child_labeling(tree, top_max(tree, ['v','v1', 'v21']), bottom_max(tree, ['v','v1', 'v21']))

        tree2 = hybrid_tree_2()

        # print tree2.children("v")
        # print tree2
        #
        # print 'siblings v211', tree2.siblings('v211')
        # print top(tree2, ['v','v1', 'v211'])
        # print top_max(tree2, ['v','v1', 'v211'])
        #
        # print '---'
        # print 'strict:' , strict_labeling(tree2, top_max(tree2, ['v','v1', 'v211']), bottom_max(tree2, ['v','v11', 'v211']))
        # print 'child:' , child_labeling(tree2, top_max(tree2, ['v','v1', 'v211']), bottom_max(tree2, ['v','v11', 'v211']))

        # rec_par = ('v v1 v2 v21'.split(' '),
        # [('v1 v2'.split(' '), [(['v1'],[]), (['v2'],[])])
        #                ,('v v21'.split(' '), [(['v'],[]), (['v21'],[])])
        #            ])
        #
        # grammar = LCFRS(nonterminal_str(tree, top_max(tree, rec_par[0]), bottom_max(tree, rec_par[0]), 'strict'))
        #
        # add_rules_to_grammar_rec(tree, rec_par, grammar, 'child')
        #
        # grammar.make_proper()
        # print grammar

        print(tree.recursive_partitioning())

        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        (_, grammar) = induce_grammar(
            [tree, tree2],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'),
            # the_labeling_factory().create_simple_labeling_strategy('child', 'pos+deprel'),
            terminal_labeling.token_label,
            [direct_extraction],
            'START')
        print(max([grammar.fanout(nont) for nont in grammar.nonts()]))
        print(grammar)

        parser = LCFRS_parser(grammar, 'NP N V V'.split(' '))
        print(parser.best_derivation_tree())

        tokens = [
            construct_conll_token(form, pos) for form, pos in zip(
                'Piet Marie helpen lezen'.split(' '), 'NP N V V'.split(' '))
        ]
        hybrid_tree = HybridTree()
        hybrid_tree = parser.dcp_hybrid_tree_best_derivation(
            hybrid_tree, tokens, True, construct_conll_token)
        print(list(map(str, hybrid_tree.full_token_yield())))
        print(hybrid_tree)

        string = "foo"
        dcp_string = DCP_string(string)
        dcp_string.set_edge_label("bar")
        print(dcp_string, dcp_string.edge_label())

        linearize(
            grammar,
            the_labeling_factory().create_simple_labeling_strategy(
                'child', 'pos+deprel'),
            the_terminal_labeling_factory().get_strategy('pos'), sys.stdout)
コード例 #8
0
    def test_fst_compilation_left(self):
        if not test_pynini:
            return
        tree = hybrid_tree_1()
        tree2 = hybrid_tree_2()
        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        (_, grammar) = induce_grammar(
            [tree, tree2],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'), terminal_labeling.token_label,
            [left_branching], 'START')

        fst, rules = compile_wfst_from_left_branching_grammar(grammar)

        print(repr(fst))

        symboltable = fst.input_symbols()

        string = ["NP", "N", "V", "V", "V"]

        fsa = fsa_from_list_of_symbols(string, symboltable)
        self.assertEqual(
            fsa.text().decode('utf-8'),
            '0\t1\tNP\tNP\n1\t2\tN\tN\n2\t3\tV\tV\n3\t4\tV\tV\n4\t5\tV\tV\n5\n'
        )

        b = compose(fsa, fst)

        print(b.text(symboltable, symboltable))

        print("Shortest path probability", end=' ')
        best = shortestpath(b)
        best.topsort()
        # self.assertAlmostEquals(pow(e, -float(shortestdistance(best)[-1])), 1.80844898756e-05)
        print(best.text())

        polish_rules = retrieve_rules(best)
        self.assertSequenceEqual(polish_rules, [1, 2, 3, 4, 5, 4, 9, 4, 7, 8])

        polish_rules = list(map(rules.index_object, polish_rules))

        for rule in polish_rules:
            print(rule)
        print()

        der = ReversePolishDerivation(polish_rules[0:-1])
        self.assertTrue(der.check_integrity_recursive(der.root_id()))

        print(der)

        LeftBranchingFSTParser.preprocess_grammar(grammar)
        parser = LeftBranchingFSTParser(grammar, string)
        der_ = parser.best_derivation_tree()

        print(der_)
        self.assertTrue(der_.check_integrity_recursive(der_.root_id()))

        print(
            derivation_to_hybrid_tree(der, string,
                                      "Piet Marie helpen lezen leren".split(),
                                      construct_conll_token))

        print(
            derivation_to_hybrid_tree(der_, string,
                                      "Piet Marie helpen lezen leren".split(),
                                      construct_conll_token))

        dcp = DCP_evaluator(der).getEvaluation()

        h_tree_2 = HybridTree()
        token_sequence = [
            construct_conll_token(form, lemma)
            for form, lemma in zip('Piet Marie helpen lezen leren'.split(' '),
                                   'NP N V V V'.split(' '))
        ]
        dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                          construct_conll_token)

        print(h_tree_2)
コード例 #9
0
    def test_fst_compilation_right(self):
        if not test_pynini:
            return
        tree = hybrid_tree_1()
        tree2 = hybrid_tree_2()
        terminal_labeling = the_terminal_labeling_factory().get_strategy('pos')

        (_, grammar) = induce_grammar(
            [tree, tree2],
            the_labeling_factory().create_simple_labeling_strategy(
                'empty', 'pos'), terminal_labeling.token_label,
            [right_branching], 'START')

        a, rules = compile_wfst_from_right_branching_grammar(grammar)

        print(repr(a))

        symboltable = a.input_symbols()

        string = 'NP N V V V'.split(' ')

        token_sequence = [
            construct_conll_token(form, lemma) for form, lemma in zip(
                'Piet Marie helpen leren lezen'.split(' '), string)
        ]

        fsa = fsa_from_list_of_symbols(string, symboltable)
        self.assertEqual(
            '0\t1\tNP\tNP\n1\t2\tN\tN\n2\t3\tV\tV\n3\t4\tV\tV\n4\t5\tV\tV\n5\n',
            fsa.text().decode('utf-8'))

        b = compose(fsa, a)

        print(b.input_symbols())
        for i in b.input_symbols():
            print(i)

        print("Input Composition")
        print(b.text(symboltable, symboltable).decode('utf-8'))

        i = 0
        for path in paths(b):
            print(i, "th path:", path, end=' ')
            r = list(map(rules.index_object, path))
            d = PolishDerivation(r[1::])
            dcp = DCP_evaluator(d).getEvaluation()
            h = HybridTree()
            dcp_to_hybridtree(h, dcp, token_sequence, False,
                              construct_conll_token)
            h.reorder()
            if h == tree2:
                print("correct")
            else:
                print("incorrect")
            i += 1

        stats = defaultdict(lambda: 0)
        local_rule_stats(b, stats, 15)

        print(stats)

        print("Shortest path probability")
        best = shortestpath(b)
        best.topsort()
        self.assertAlmostEqual(1.80844898756e-05,
                               pow(e, -float(shortestdistance(best)[-1])))
        print(best.text())

        polish_rules = retrieve_rules(best)
        self.assertSequenceEqual(polish_rules, [8, 7, 1, 6, 2, 5, 3, 10, 3, 3])

        polish_rules = list(map(rules.index_object, polish_rules))

        print(polish_rules)

        der = PolishDerivation(polish_rules[1::])

        print(der)

        print(
            derivation_to_hybrid_tree(der, string,
                                      "Piet Marie helpen lezen leren".split(),
                                      construct_conll_token))

        dcp = DCP_evaluator(der).getEvaluation()

        h_tree_2 = HybridTree()
        dcp_to_hybridtree(h_tree_2, dcp, token_sequence, False,
                          construct_conll_token)

        print(h_tree_2)