Example #1
    def test_corpus_split_merge_training(self):
        train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
        limit_train = 100
        test = train
        # test = '../../res/dependency_conll/german/tiger/test/german_tiger_test.conll'
        trees = parse_conll_corpus(train, False, limit_train)
        primary_labelling = the_labeling_factory().create_simple_labeling_strategy("childtop", "deprel")
        term_labelling = the_terminal_labeling_factory().get_strategy('pos')
        start = 'START'
        recursive_partitioning = [cfg]

        (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling, term_labelling.token_label,
                                                 recursive_partitioning, start)

        # for rule in grammar.rules():
        #    print >>stderr, rule

        trees = parse_conll_corpus(train, False, limit_train)
        print("call S/M Training", file=stderr)

        new_grammars = split_merge_training(grammar_prim, term_labelling, trees, 4, 10, tie_breaking=True, init="equal",
                                            sigma=0.05, seed=50, merge_threshold=0.1)

        print("finished S/M Training", file=stderr)

        for new_grammar in new_grammars:
            for i, rule in enumerate(new_grammar.rules()):
                print(i, rule, file=stderr)
            print(file=stderr)
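
The test only prints the rules of each refined grammar to stderr. As a hedged follow-up, the grammars could be persisted with the standard pickle module, as Example #11 does for its baseline grammar (the /tmp paths here are made up for illustration):

import pickle

# Hypothetical continuation of the test above: save each split/merge
# stage of the refined grammar for later use.
for cycle, new_grammar in enumerate(new_grammars):
    with open('/tmp/sm_grammar_%d.pkl' % cycle, 'wb') as f:
        pickle.dump(new_grammar, f)
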
Example #2
    def test_corpus_em_training(self):
        train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
        limit_train = 200
        test = train
        # test = '../../res/dependency_conll/german/tiger/test/german_tiger_test.conll'
        trees = parse_conll_corpus(train, False, limit_train)
        primary_labelling = the_labeling_factory(
        ).create_simple_labeling_strategy("childtop", "deprel")
        term_labelling = the_terminal_labeling_factory().get_strategy('pos')
        start = 'START'
        recursive_partitioning = [cfg]

        (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling,
                                                 term_labelling.token_label,
                                                 recursive_partitioning, start)

        # for rule in grammar.rules():
        #    print >>stderr, rule

        trees = parse_conll_corpus(train, False, limit_train)

        print("compute reducts", file=stderr)

        trace = compute_reducts(grammar_prim, trees, term_labelling)

        print("call em Training", file=stderr)
        emTrainer = PyEMTrainer(trace)
        emTrainer.em_training(grammar_prim,
                              20,
                              tie_breaking=True,
                              init="equal",
                              sigma=0.05,
                              seed=50)

        print("finished em Training", file=stderr)
Example #3
    def test_minimum_risk_parsing(self):
        limit_train = 20
        limit_test = 10
        train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
        test = train
        parser_type = GFParser_k_best
        # test = '../../res/dependency_conll/german/tiger/test/german_tiger_test.conll'
        trees = parse_conll_corpus(train, False, limit_train)
        primary_labelling = the_labeling_factory(
        ).create_simple_labeling_strategy("childtop", "deprel")
        term_labelling = the_terminal_labeling_factory().get_strategy('pos')
        start = 'START'
        recursive_partitioning = [cfg]

        (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling,
                                                 term_labelling.token_label,
                                                 recursive_partitioning, start)

        parser_type.preprocess_grammar(grammar_prim)
        tree_yield = term_labelling.prepare_parser_input

        trees = parse_conll_corpus(test, False, limit_test)

        for i, tree in enumerate(trees):
            print("Parsing sentence ", i, file=stderr)

            # print >>stderr, tree

            parser = parser_type(grammar_prim,
                                 tree_yield(tree.token_yield()),
                                 k=50)

            self.assertTrue(parser.recognized())

            derivations = list(parser.k_best_derivation_trees())
            print("# derivations: ", len(derivations), file=stderr)
            h_trees = []
            current_weight = 0
            weights = []
            derivation_list = []
            for weight, der in derivations:

                self.assertNotIn(der, derivation_list)

                derivation_list.append(der)

                dcp = DCP_evaluator(der).getEvaluation()
                h_tree = HybridTree()
                cleaned_tokens = copy.deepcopy(tree.full_token_yield())
                dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                                  construct_conll_token)

                h_trees.append(h_tree)
                weights.append(weight)

            min_risk_tree = compute_minimum_risk_tree(h_trees, weights)
            if min_risk_tree != h_trees[0]:
                print(h_trees[0])
                print(min_risk_tree)
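
compute_minimum_risk_tree selects, from the weighted k-best list, the tree with the lowest expected loss against the other candidates; the loss it uses is not visible in this example. A self-contained sketch of the idea with a caller-supplied loss function (note that the k-best weights above are negative log probabilities, cf. the exp(-weight) comment in Example #12, hence the conversion):

from math import exp

def minimum_risk(candidates, weights, loss):
    # turn negative log weights into (unnormalised) probabilities
    probs = [exp(-w) for w in weights]
    best, best_risk = None, float('inf')
    for candidate in candidates:
        risk = sum(p * loss(candidate, other)
                   for p, other in zip(probs, candidates))
        if risk < best_risk:
            best, best_risk = candidate, risk
    return best

# toy usage with 0/1 loss: probability-weighted majority voting
print(minimum_risk(['a', 'b', 'a'], [0.1, 0.5, 0.2],
                   lambda x, y: 0 if x == y else 1))  # prints a
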
Example #4
def dbtest():
    connection = openDatabase(dbfile)
    connection.text_factory = str

    create_experiment_table(connection)

    training_corpus = test_file
    test_corpus = test_file
    experiment = add_experiment(connection, 'term_pos', 'child_pos',
                                'direct_extraction', False, training_corpus,
                                test_corpus, time.time(), None)

    c = connection.cursor()
    for row in c.execute('SELECT * FROM experiments'):
        print(row)

    create_tree_table(connection)
    create_tree_node_table(connection)

    for tree in conll_parse.parse_conll_corpus(test_corpus, False):
        add_tree(connection, tree, test_corpus)

    for row in c.execute('SELECT * FROM trees'):
        print(row)

    for row2 in c.execute('SELECT * FROM tree_nodes'):
        print(row2)

    print()

    create_result_tree_table(connection)
    create_result_tree_node_table(connection)
    time_stamp = time.perf_counter()  # time.clock() was removed in Python 3.8
    for tree in conll_parse.parse_conll_corpus(test_file_modified, False):
        add_result_tree(connection, tree, test_corpus, experiment, 1, 0.142,
                        time.perf_counter() - time_stamp, "parse")
        time_stamp = time.perf_counter()

    for row3 in c.execute('SELECT  * FROM result_tree_nodes'):
        print(row3, type(row3[0]).__name__)

    print()

    print(experiment, type(experiment).__name__)

    for row4 in c.execute(
            '''SELECT * FROM result_trees INNER JOIN result_tree_nodes ON result_trees.rt_id = result_tree_nodes.rt_id WHERE exp_id = ?''',
        (experiment, )):
        print(row4)

    connection.close()
Example #5
    def test_wsj(self):
        corpus = "res/wsj_dependency/24.conll"
        trees = parse_conll_corpus(corpus, False, 5000)
        trees = disconnect_punctuation(trees)

        x = sum(1 for _ in trees)

        self.assertEqual(1346, x)
Example #6
    def test_tiger(self):
        corpus = "res/dependency_conll/german/tiger/test/german_tiger_test.conll"
        trees = parse_conll_corpus(corpus, False, 5000)
        trees = disconnect_punctuation(trees)

        x = sum(1 for _ in trees)

        self.assertEqual(357, x)
Example #7
    def get_trees(self):
        if self._trees is not None:
            for tree in self._trees:
                yield tree
        else:
            trees = []
            for tree in length_limit(
                    parse_conll_corpus(self._path,
                                       False,
                                       limit=self._end,
                                       start=self._start), self._max_length):
                trees.append(tree)
                yield tree
            # commit the cache only after a full pass, so an abandoned
            # iteration cannot leave a truncated corpus behind
            self._trees = trees
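
get_trees memoises the corpus: the first traversal parses the CoNLL file and fills the cache, every later call replays the cached trees. The same pattern in a self-contained form (the class and names are hypothetical stand-ins):

class CachingCorpus:
    def __init__(self, source):
        self._source = source  # zero-argument callable returning an iterator
        self._trees = None     # filled after the first complete traversal

    def get_trees(self):
        if self._trees is not None:
            yield from self._trees
        else:
            trees = []
            for tree in self._source():
                trees.append(tree)
                yield tree
            self._trees = trees  # cache only after a full pass

corpus = CachingCorpus(lambda: iter(["tree1", "tree2"]))
assert list(corpus.get_trees()) == list(corpus.get_trees())
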
Example #8
def obtain_derivations(grammar, term_labelling):
    # build parser
    tree_yield = term_labelling.prepare_parser_input
    parser = GFParser_k_best(grammar, k=50)

    # parse sentences
    trees = parse_conll_corpus(test, False, limit_test)
    for i, tree in enumerate(trees):
        print("Parsing sentence ", i, file=stderr)

        parser.set_input(tree_yield(tree.token_yield()))
        parser.parse()

        derivations = list(parser.k_best_derivation_trees())

        print("# derivations: ", len(derivations), file=stderr)
        parser.clear()

        # each entry is a (weight, derivation) pair; keep only the derivation
        for weight, der in derivations:
            yield der
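
As the unpacking in Example #12 shows, k_best_derivation_trees() yields (weight, derivation) pairs, so the loop above keeps only the derivation component. Because obtain_derivations is itself a generator, a caller can cap the number of derivations lazily with the standard library, e.g. (assuming grammar and term_labelling are in scope as in Example #10):

from itertools import islice

# consume at most ten derivations without building the full list
first_ten = list(islice(obtain_derivations(grammar, term_labelling), 10))
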
Example #9
def main(limit=100000, ignore_punctuation=False):
    if PARSER_TYPE.__name__ != 'GFParser':
        print('GFParser not found, using', PARSER_TYPE.__name__, 'instead!')
        print('Please install grammatical framework to reproduce experiments.')

    test_limit = 10000
    trees = parse_conll_corpus(TRAIN, False, limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)
    (n_trees, grammar_prim) = d_i.induce_grammar(trees, PRIMARY_LABELLING, TERMINAL_LABELLING.token_label,
                                                 RECURSIVE_PARTITIONING, START)
    PARSER_TYPE.preprocess_grammar(grammar_prim)

    trees = parse_conll_corpus(TRAIN, False, limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)
    (n_trees, grammar_second) = d_i.induce_grammar(trees, SECONDARY_LABELLING, TERMINAL_LABELLING.token_label,
                                                   RECURSIVE_PARTITIONING, START)
    PARSER_TYPE.preprocess_grammar(grammar_second)

    trees = parse_conll_corpus(TRAIN, False, limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)
    (n_trees, grammar_tern) = d_i.induce_grammar(trees, TERNARY_LABELLING, TERMINAL_LABELLING.token_label,
                                                 RECURSIVE_PARTITIONING, START)
    PARSER_TYPE.preprocess_grammar(grammar_tern)

    trees = parse_conll_corpus(TEST, False, test_limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)

    total_time = 0.0

    with open(RESULT, 'w') as result_file:
        failures = 0
        for tree in trees:
            time_stamp = time.perf_counter()  # time.clock() was removed in Python 3.8

            the_parser = PARSER_TYPE(grammar_prim, TREE_YIELD(tree.token_yield()))
            if not the_parser.recognized():
                the_parser = PARSER_TYPE(grammar_second, TREE_YIELD(tree.token_yield()))
            if not the_parser.recognized():
                the_parser = PARSER_TYPE(grammar_tern, TREE_YIELD(tree.token_yield()))
            time_stamp = time.perf_counter() - time_stamp
            total_time += time_stamp

            cleaned_tokens = copy.deepcopy(tree.full_token_yield())
            for token in cleaned_tokens:
                token.set_edge_label('_')
            h_tree = HybridTree(tree.sent_label())
            h_tree = the_parser.dcp_hybrid_tree_best_derivation(h_tree, cleaned_tokens, ignore_punctuation,
                                                                construct_conll_token)

            if h_tree:
                result_file.write(tree_to_conll_str(h_tree))
                result_file.write('\n\n')
            else:
                failures += 1
                forms = [token.form() for token in tree.full_token_yield()]
                poss = [token.pos() for token in tree.full_token_yield()]
                result_file.write(tree_to_conll_str(fall_back_left_branching(forms, poss)))
                result_file.write('\n\n')

    print("parse failures", failures)
    print("parse time", total_time)

    print("eval.pl", "no punctuation")
    p = subprocess.Popen(["perl", "util/eval.pl", "-g", TEST, "-s", RESULT, "-q"])
    p.communicate()
    print("eval.pl", "punctuation")
    p = subprocess.Popen(
        ["perl", "util/eval.pl", "-g", TEST, "-s", RESULT, "-q", "-p"])
    p.communicate()
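
main retries each sentence with progressively coarser grammars (primary, secondary, then ternary labelling) and only falls back to a left-branching tree when all three fail. The retry cascade can be factored into a helper; a minimal sketch with hypothetical names:

def parse_with_fallback(grammars, make_input, parser_type):
    # try each grammar in order of preference; return the first parser
    # that recognises the input, or the last (unsuccessful) attempt
    parser = None
    for grammar in grammars:
        parser = parser_type(grammar, make_input())
        if parser.recognized():
            break
    return parser

Inside the loop above this would read: the_parser = parse_with_fallback([grammar_prim, grammar_second, grammar_tern], lambda: TREE_YIELD(tree.token_yield()), PARSER_TYPE).
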
Example #10
def main():
    # induce grammar from a corpus
    trees = parse_conll_corpus(train, False, limit_train)
    nonterminal_labelling = the_labeling_factory(
    ).create_simple_labeling_strategy("childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    start = 'START'
    recursive_partitioning = [cfg]
    _, grammar = induce_grammar(trees, nonterminal_labelling,
                                term_labelling.token_label,
                                recursive_partitioning, start)

    # compute some derivations
    derivations = obtain_derivations(grammar, term_labelling)

    # create derivation manager and add derivations
    manager = PyDerivationManager(grammar)
    manager.convert_derivations_to_hypergraphs(derivations)
    manager.serialize(b"/tmp/derivations.txt")

    # build and configure split/merge trainer and supplementary objects

    rule_to_nonterminals = []
    for i in range(0, len(grammar.rule_index())):
        rule = grammar.rule_index(i)
        nonts = [
            manager.get_nonterminal_map().object_index(rule.lhs().nont())
        ] + [
            manager.get_nonterminal_map().object_index(nont)
            for nont in rule.rhs()
        ]
        rule_to_nonterminals.append(nonts)

    grammarInfo = PyGrammarInfo(grammar, manager.get_nonterminal_map())
    storageManager = PyStorageManager()
    builder = PySplitMergeTrainerBuilder(manager, grammarInfo)
    builder.set_em_epochs(20)
    builder.set_percent_merger(60.0)

    splitMergeTrainer = builder.build()

    latentAnnotation = [
        build_PyLatentAnnotation_initial(grammar, grammarInfo, storageManager)
    ]

    for i in range(max_cycles + 1):
        latentAnnotation.append(
            splitMergeTrainer.split_merge_cycle(latentAnnotation[-1]))
        # pickle.dump(map(lambda la: la.serialize(), latentAnnotation), open(sm_info_path, 'wb'))
        smGrammar = build_sm_grammar(latentAnnotation[i],
                                     grammar,
                                     grammarInfo,
                                     rule_pruning=0.0001,
                                     rule_smoothing=0.01)
        print("Cycle: ", i, "Rules: ", len(smGrammar.rules()))

        if parsing:
            parser = GFParser(smGrammar)

            trees = parse_conll_corpus(test, False, limit_test)
            for tree in trees:
                parser.set_input(
                    term_labelling.prepare_parser_input(tree.token_yield()))
                parser.parse()
                if parser.recognized():
                    print(
                        derivation_to_hybrid_tree(
                            parser.best_derivation_tree(),
                            [token.pos() for token in tree.token_yield()],
                            [token.form() for token in tree.token_yield()],
                            construct_constituent_token))
Example #11
def main(limit=300,
         ignore_punctuation=False,
         baseline_path=baseline_path,
         recompileGrammar=True,
         retrain=True,
         parsing=True,
         seed=1337):
    max_length = 20
    trees = length_limit(parse_conll_corpus(train, False, limit), max_length)

    if recompileGrammar or not os.path.isfile(baseline_path):
        (n_trees,
         baseline_grammar) = d_i.induce_grammar(trees, empty_labelling,
                                                term_labelling.token_label,
                                                recursive_partitioning, start)
        pickle.dump(baseline_grammar, open(baseline_path, 'wb'))
    else:
        baseline_grammar = pickle.load(open(baseline_path, 'rb'))

    test_limit = 10000
    print("Rules: ", len(baseline_grammar.rules()))

    if parsing:
        do_parsing(baseline_grammar, test_limit, ignore_punctuation,
                   recompileGrammar, [dir, "baseline_gf_grammar"])

    em_trained = pickle.load(open(baseline_path, 'rb'))
    if recompileGrammar or not os.path.isfile(reduct_path):
        trees = length_limit(parse_conll_corpus(train, False, limit),
                             max_length)
        trace = compute_reducts(em_trained, trees, term_labelling)
        trace.serialize(reduct_path)
    else:
        print("loading trace")
        trace = PySDCPTraceManager(em_trained, term_labelling)
        trace.load_traces_from_file(reduct_path)

    discr = False
    if discr:
        if recompileGrammar or not os.path.isfile(reduct_path_discr):
            trees = length_limit(parse_conll_corpus(train, False, limit),
                                 max_length)
            trace_discr = compute_LCFRS_reducts(
                em_trained,
                trees,
                term_labelling,
                nonterminal_map=trace.get_nonterminal_map())
            trace_discr.serialize(reduct_path_discr)
        else:
            print("loading trace discriminative")
            trace_discr = PyLCFRSTraceManager(em_trained,
                                              trace.get_nonterminal_map())
            trace_discr.load_traces_from_file(reduct_path_discr)

    n_epochs = 20
    init = "rfe"
    tie_breaking = True
    em_trained_path_ = em_trained_path(n_epochs, init, tie_breaking)

    if recompileGrammar or retrain or not os.path.isfile(em_trained_path_):
        emTrainer = PyEMTrainer(trace)
        emTrainer.em_training(em_trained,
                              n_epochs=n_epochs,
                              init=init,
                              tie_breaking=tie_breaking,
                              seed=seed)
        pickle.dump(em_trained, open(em_trained_path_, 'wb'))
    else:
        em_trained = pickle.load(open(em_trained_path_, 'rb'))

    if parsing:
        do_parsing(em_trained, test_limit, ignore_punctuation, recompileGrammar
                   or retrain, [dir, "em_trained_gf_grammar"])

    grammarInfo = PyGrammarInfo(baseline_grammar, trace.get_nonterminal_map())
    storageManager = PyStorageManager()

    builder = PySplitMergeTrainerBuilder(trace, grammarInfo)
    builder.set_em_epochs(n_epochs)
    builder.set_split_randomization(1.0, seed + 1)
    if discr:
        builder.set_discriminative_expector(trace_discr,
                                            maxScale=10,
                                            threads=1)
    else:
        builder.set_simple_expector(threads=1)
    splitMergeTrainer = builder.set_percent_merger(65.0).build()

    if (not recompileGrammar) and (
            not retrain) and os.path.isfile(sm_info_path):
        print("Loading splits and weights of LA rules")
        # list() is required: a bare map object cannot be indexed or appended to
        latentAnnotation = list(
            map(
                lambda t: build_PyLatentAnnotation(t[0], t[1], t[2],
                                                   grammarInfo, storageManager),
                pickle.load(open(sm_info_path, 'rb'))))
    else:
        latentAnnotation = [
            build_PyLatentAnnotation_initial(em_trained, grammarInfo,
                                             storageManager)
        ]

    max_cycles = 4
    reparse = False
    # parsing = False
    for i in range(max_cycles + 1):
        if i < len(latentAnnotation):
            if reparse:
                smGrammar = latentAnnotation[i].build_sm_grammar(
                    baseline_grammar,
                    grammarInfo,
                    rule_pruning=0.0001,
                    rule_smoothing=0.01)
                print("Cycle: ", i, "Rules: ", len(smGrammar.rules()))
                do_parsing(smGrammar, test_limit, ignore_punctuation,
                           recompileGrammar or retrain,
                           [dir, "sm_cycles" + str(i) + "_gf_grammar"])
        else:
            # setting the seed to achieve reproducibility in case of continued training
            splitMergeTrainer.reset_random_seed(seed + i + 1)
            latentAnnotation.append(
                splitMergeTrainer.split_merge_cycle(latentAnnotation[-1]))
            # a list comprehension, not map(): map objects cannot be pickled
            pickle.dump([la.serialize() for la in latentAnnotation],
                        open(sm_info_path, 'wb'))
            smGrammar = latentAnnotation[i].build_sm_grammar(
                baseline_grammar,
                grammarInfo,
                rule_pruning=0.0001,
                rule_smoothing=0.1)
            print("Cycle: ", i, "Rules: ", len(smGrammar.rules()))
            if parsing:
                do_parsing(smGrammar, test_limit, ignore_punctuation,
                           recompileGrammar or retrain,
                           [dir, "sm_cycles" + str(i) + "_gf_grammar"])
Example #12
    def test_k_best_parsing(self):
        limit_train = 20
        limit_test = 10
        train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
        test = train
        parser_type = GFParser_k_best
        # test = '../../res/dependency_conll/german/tiger/test/german_tiger_test.conll'
        trees = parse_conll_corpus(train, False, limit_train)
        primary_labelling = the_labeling_factory(
        ).create_simple_labeling_strategy("childtop", "deprel")
        term_labelling = the_terminal_labeling_factory().get_strategy('pos')
        start = 'START'
        recursive_partitioning = [cfg]

        (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling,
                                                 term_labelling.token_label,
                                                 recursive_partitioning, start)

        parser_type.preprocess_grammar(grammar_prim)
        tree_yield = term_labelling.prepare_parser_input

        trees = parse_conll_corpus(test, False, limit_test)

        for i, tree in enumerate(trees):
            print("Parsing sentence ", i, file=stderr)

            # print >>stderr, tree

            parser = parser_type(grammar_prim,
                                 tree_yield(tree.token_yield()),
                                 k=50)

            self.assertTrue(parser.recognized())

            derivations = list(parser.k_best_derivation_trees())
            print("# derivations: ", len(derivations), file=stderr)
            h_trees = []
            current_weight = 0
            weights = []
            derivation_list = []
            for weight, der in derivations:
                # print >>stderr, exp(-weight)
                # print >>stderr, der

                self.assertNotIn(der, derivation_list)

                derivation_list.append(der)

                # TODO this should hold, but it looks like a GF bug!
                # self.assertGreaterEqual(weight, current_weight)
                current_weight = weight

                dcp = DCP_evaluator(der).getEvaluation()
                h_tree = HybridTree()
                cleaned_tokens = copy.deepcopy(tree.full_token_yield())
                dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                                  construct_conll_token)

                h_trees.append(h_tree)
                weights.append(weight)

                # print >>stderr, h_tree

            # print a matrix indicating which derivations result
            # in the same hybrid tree
            for i, h_tree1 in enumerate(h_trees):
                for h_tree2 in h_trees:
                    if h_tree1 == h_tree2:
                        print("x", end=' ', file=stderr)
                    else:
                        print("", end=' ', file=stderr)
                print(weights[i], file=stderr)
            print(file=stderr)
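
The x/blank matrix marks which of the k best derivations evaluate to the same hybrid tree. The same information can be computed as index classes using only ==, so HybridTree does not need to be hashable (self-contained sketch):

def equivalence_classes(items):
    classes = []
    for i, item in enumerate(items):
        for cls in classes:
            if items[cls[0]] == item:
                cls.append(i)
                break
        else:
            classes.append([i])
    return classes

assert equivalence_classes(['a', 'b', 'a']) == [[0, 2], [1]]
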
Example #13
    def test_best_trees(self):
        limit_train = 5000
        limit_test = 100
        train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
        test = train
        parser_type = GFParser_k_best
        # test = '../../res/dependency_conll/german/tiger/test/german_tiger_test.conll'
        trees = parse_conll_corpus(train, False, limit_train)
        primary_labelling = the_labeling_factory(
        ).create_simple_labeling_strategy("child", "pos+deprel")
        term_labelling = the_terminal_labeling_factory().get_strategy('pos')
        start = 'START'
        recursive_partitioning = [cfg]

        (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling,
                                                 term_labelling.token_label,
                                                 recursive_partitioning, start)

        parser_type.preprocess_grammar(grammar_prim)
        tree_yield = term_labelling.prepare_parser_input

        trees = parse_conll_corpus(test, False, limit_test)

        for i, tree in enumerate(trees):
            print("Parsing sentence ", i, file=stderr)

            parser = parser_type(grammar_prim,
                                 tree_yield(tree.token_yield()),
                                 k=200)

            self.assertTrue(parser.recognized())

            viterbi_weight = parser.viterbi_weight()
            viterbi_deriv = parser.viterbi_derivation()

            der_to_tree = lambda der: dcp_to_hybridtree(
                HybridTree(),
                DCP_evaluator(der).getEvaluation(),
                copy.deepcopy(tree.full_token_yield()), False,
                construct_conll_token)

            viterbi_tree = der_to_tree(viterbi_deriv)

            ordered_parse_trees = parser.best_trees(der_to_tree)

            best_tree, best_weight, best_witnesses = ordered_parse_trees[0]

            for i, (parsed_tree, _, _) in enumerate(ordered_parse_trees):
                if parsed_tree == tree:
                    print("Gold tree is ",
                          i + 1,
                          " in best tree list",
                          file=stderr)
                    break

            if (viterbi_tree != best_tree
                    and viterbi_weight != best_weight):
                print("viterbi and k-best tree differ", file=stderr)
                print("viterbi: ", viterbi_weight, file=stderr)
                print("k-best: ", best_weight, best_witnesses, file=stderr)
                if False:
                    print(viterbi_tree, file=stderr)
                    print(tree_to_conll_str(viterbi_tree), file=stderr)
                    print(best_tree, file=stderr)
                    print(tree_to_conll_str(best_tree), file=stderr)
                    print("gold tree", file=stderr)
                    print(tree, file=stderr)
                    print(tree_to_conll_str(tree), file=stderr)
Example #14
def induce_grammar_from_file(path,
                             connection,
                             nont_labelling,
                             term_labelling,
                             recursive_partitioning,
                             limit=sys.maxsize,
                             quiet=False,
                             start='START',
                             ignore_punctuation=True):
    """
    :param path: path to dependency corpus in CoNLL format
    :type path: str
    :param connection: database connection
    :type connection: Connection
    :param nont_labelling: nonterminal labeling strategy
    :type nont_labelling: AbstractLabeling
    :param term_labelling: GeneralHybridTree, NodeId -> str
    :type term_labelling: GeneralHybridTree, str -> str
    :param recursive_partitioning: GeneralHybridTree -> RecursivePartitioning
    :type recursive_partitioning: GeneralHybridTree -> [str], unknown
    :param limit: use only the first _limit_ trees for grammar induction
    :type limit: int
    :param quiet: suppress status output
    :type quiet: bool
    :param start: start nonterminal of the grammar
    :type start: str
    :param ignore_punctuation: exclude punctuation from the grammar
    :type ignore_punctuation: bool
    :rtype: LCFRS, int
    Extract an LCFRS/sDCP-Hybrid Grammar from a dependency corpus in CoNLL format.
    """

    experiment = experiment_database.add_experiment(
        connection, str(term_labelling), str(nont_labelling),
        ','.join([rec_par.__name__ for rec_par in recursive_partitioning]),
        ignore_punctuation, path, '', time.time(), None)

    if not quiet:
        print('Inducing grammar')
        print('file: ' + path)
        print('Nonterminal labelling strategy: ', nont_labelling.__str__())
        print('Terminal labelling strategy:    ', str(term_labelling))
        print(
            'Recursive partitioning strategy:',
            ','.join([rec_par.__name__ for rec_par in recursive_partitioning]))
        print('limit:                          ', str(limit))
        print('Ignoring punctuation            ', ignore_punctuation)
    start_clock = time.perf_counter()  # time.clock() was removed in Python 3.8

    trees = parse_conll_corpus(path, False, limit)
    trees = add_trees_to_db(path, connection, trees)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)
    (n_trees, grammar) = d_i.induce_grammar(trees, nont_labelling,
                                            term_labelling.token_label,
                                            recursive_partitioning, start)

    end_clock = time.perf_counter()
    if not quiet:
        print('Number of trees:                ', str(n_trees))
        print('Number of nonterminals:         ', len(grammar.nonts()))
        print('Number of rules:                ', len(grammar.rules()))
        print('Total size:                     ', grammar.size())
        print('Fanout:                         ',
              max(map(grammar.fanout, grammar.nonts())))
        print('Induction time:                 ', end_clock - start_clock,
              'seconds')

    print(experiment)
    experiment_database.add_grammar(connection, grammar, experiment)
    with open('.tmp/grammar-' + str(experiment) + '.gra', 'w') as grammar_output:
        linearize(grammar, nont_labelling, term_labelling, grammar_output)

    assert grammar.ordered()
    return grammar, experiment
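
A hedged usage sketch for the function above: the connection is assumed to be a plain sqlite3 connection (dbtest in Example #4 opens its database via a helper and sets text_factory the same way), and the factories are the ones used throughout these examples:

import sqlite3

connection = sqlite3.connect('experiments.db')
connection.text_factory = str
grammar, experiment = induce_grammar_from_file(
    'res/dependency_conll/german/tiger/train/german_tiger_train.conll',
    connection,
    the_labeling_factory().create_simple_labeling_strategy("childtop", "deprel"),
    the_terminal_labeling_factory().get_strategy('pos'),
    [cfg],
    limit=100)
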
Example #15
def parse_sentences_from_file(grammar,
                              parser_type,
                              experiment,
                              connection,
                              path,
                              tree_yield,
                              max_length=sys.maxsize,
                              limit=sys.maxsize,
                              quiet=False,
                              ignore_punctuation=True,
                              root_default_deprel=None,
                              disconnected_default_deprel=None):
    """
    :rtype: None
    :type grammar: LCFRS
    :param path: file path for test corpus (dependency grammar in CoNLL format)
    :type path: str
    :param tree_yield: parse on words or POS, etc.
    :type tree_yield: GeneralHybridTree -> list[str]
    :param max_length: don't parse sentences with yield > max_length
    :type max_length: int
    :param limit:      only parse the first *limit* sentences of the corpus
    :type limit: int
    :param quiet:      suppress status output
    :type quiet: bool
    :param ignore_punctuation: exclude punctuation from parsing
    :type ignore_punctuation: bool
    
    Parse sentences from corpus and compare derived dependency structure with gold standard information.
    """
    if not quiet:
        print("Building lookahead tables for grammar")
    # preprocessing must happen regardless of verbosity
    parser_type.preprocess_grammar(grammar)

    experiment_database.set_experiment_test_corpus(connection, experiment,
                                                   path)

    if not quiet:
        if max_length != sys.maxsize:
            s = ', ignoring sentences with length > ' + str(max_length)
        else:
            s = ''
        print('Start parsing sentences' + s)

    trees = parse_conll_corpus(path, False, limit)
    trees = add_trees_to_db(path, connection, trees)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)

    (UAS, LAS, UEM, LEM) = (0, 0, 0, 0)
    parse = 0
    no_parse = 0
    n_gaps_gold = 0
    n_gaps_test = 0
    skipped = 0
    start_at = time.perf_counter()  # time.clock() was removed in Python 3.8
    for tree in trees:
        if len(tree.id_yield()) > max_length:
            skipped += 1
            continue
        time_stamp = time.perf_counter()

        parser = parser_type(grammar, tree_yield(tree.token_yield()))
        time_stamp = time.perf_counter() - time_stamp

        cleaned_tokens = copy.deepcopy(tree.full_token_yield())
        for token in cleaned_tokens:
            token.set_edge_label('_')
        h_tree = HybridTree(tree.sent_label())
        h_tree = parser.dcp_hybrid_tree_best_derivation(
            h_tree, cleaned_tokens, ignore_punctuation, construct_conll_token)

        if h_tree:
            experiment_database.add_result_tree(connection, h_tree,
                                                path, experiment, 1,
                                                parser.best(), time_stamp,
                                                'parse', root_default_deprel,
                                                disconnected_default_deprel)
            n_gaps_gold += tree.n_gaps()
            n_gaps_test += h_tree.n_gaps()
            parse += 1
            (dUAS, dLAS, dUEM, dLEM) = score_cmp_dep_trees(tree, h_tree)
            UAS += dUAS
            LAS += dLAS
            UEM += dUEM
            LEM += dLEM
        else:
            experiment_database.no_parse_result(connection, tree.sent_label(),
                                                path, experiment, time_stamp,
                                                "no_parse")
            no_parse += 1

    end_at = time.perf_counter()
    total = parse + no_parse
    if not quiet:
        print('Parsed ' + str(parse) + ' out of ' + str(total) + ' (skipped ' +
              str(skipped) + ')')
        print('fail: ', no_parse)
        if parse > 0:
            print('UAS: ', UAS / parse)
            print('LAS: ', LAS / parse)
            print('UEM: ', UEM / parse)
            print('LEM: ', LEM / parse)
            print('n gaps (gold): ', n_gaps_gold * 1.0 / parse)
            print('n gaps (test): ', n_gaps_test * 1.0 / parse)
        print('parse time: ', end_at - start_at, 's')
        print()
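
The bookkeeping averages per-sentence UAS/LAS/UEM/LEM over successfully parsed sentences only (failures are counted separately). Factored into a self-contained helper, with one score tuple per parsed sentence as produced by score_cmp_dep_trees:

def average_scores(score_tuples):
    # score_tuples: one (UAS, LAS, UEM, LEM) tuple per parsed sentence
    n = len(score_tuples)
    if n == 0:
        return None
    return tuple(sum(column) / n for column in zip(*score_tuples))

assert average_scores([(1.0, 1.0, 1, 1), (0.5, 0.0, 0, 0)]) == \
    (0.75, 0.5, 0.5, 0.5)
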
Example #16
def do_parsing(grammar_prim,
               limit,
               ignore_punctuation,
               recompile=True,
               preprocess_path=None):
    trees = parse_conll_corpus(test, False, limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)

    total_time = 0.0

    load_preprocess = preprocess_path
    if recompile or (not os.path.isfile(
            parser_type.resolve_path(preprocess_path))):
        load_preprocess = None

    parser = parser_type(grammar_prim,
                         save_preprocess=preprocess_path,
                         load_preprocess=load_preprocess)

    with open(result, 'w') as result_file:
        failures = 0
        for tree in trees:
            if len(tree.id_yield()) > limit:
                continue
            time_stamp = time.perf_counter()  # time.clock() was removed in Python 3.8

            parser.set_input(tree_yield(tree.token_yield()))
            parser.parse()
            # if not parser.recognized():
            #     parser = parser_type(grammar_second, tree_yield(tree.token_yield()))
            # if not parser.recognized():
            #     parser = parser_type(grammar_tern, tree_yield(tree.token_yield()))
            time_stamp = time.perf_counter() - time_stamp
            total_time += time_stamp

            cleaned_tokens = copy.deepcopy(tree.full_token_yield())
            for token in cleaned_tokens:
                token.set_edge_label('_')

            h_tree = HybridTree(tree.sent_label())

            if parser_type == GFParser_k_best and parser.recognized():
                der_to_tree = lambda der: dcp_to_hybridtree(
                    HybridTree(),
                    DCP_evaluator(der).getEvaluation(),
                    copy.deepcopy(tree.full_token_yield()), False,
                    construct_conll_token)
                h_tree = parser.best_trees(der_to_tree)[0][0]
            elif parser_type == CFGParser \
                     or parser_type == GFParser \
                     or parser_type == LeftBranchingFSTParser \
                     or parser_type == RightBranchingFSTParser:
                h_tree = parser.dcp_hybrid_tree_best_derivation(
                    h_tree, cleaned_tokens, ignore_punctuation,
                    construct_conll_token)
            else:
                h_tree = None

            if h_tree:
                result_file.write(tree_to_conll_str(h_tree))
                result_file.write('\n\n')
            else:
                failures += 1
                forms = [token.form() for token in tree.full_token_yield()]
                poss = [token.pos() for token in tree.full_token_yield()]
                result_file.write(
                    tree_to_conll_str(fall_back_left_branching(forms, poss)))
                result_file.write('\n\n')

            parser.clear()

    print("parse failures", failures)
    print("parse time", total_time)

    print("eval.pl", "no punctuation")
    p = subprocess.Popen(
        ["perl", "../util/eval.pl", "-g", test, "-s", result, "-q"])
    p.communicate()
    print("eval.pl", "punctation")
    p = subprocess.Popen(
        ["perl", "../util/eval.pl", "-g", test, "-s", result, "-q", "-p"])
    p.communicate()
Example #17
    def generic_parsing_test(self, parser_type, limit_train, limit_test,
                             compare_order):
        def filter_by_id(n, trees):
            j = 0
            for tree in trees:
                if j in n:
                    yield tree
                j += 1

        #params
        train = 'res/dependency_conll/german/tiger/train/german_tiger_train.conll'
        test = train
        # test = 'res/dependency_conll/german/tiger/test/german_tiger_test.conll'
        trees = parse_conll_corpus(train, False, limit_train)
        primary_labelling = the_labeling_factory(
        ).create_simple_labeling_strategy("childtop", "deprel")
        term_labelling = the_terminal_labeling_factory().get_strategy('pos')
        start = 'START'
        recursive_partitioning = [cfg]

        (n_trees, grammar_prim) = induce_grammar(trees, primary_labelling,
                                                 term_labelling.token_label,
                                                 recursive_partitioning, start)

        parser_type.preprocess_grammar(grammar_prim, term_labelling)

        trees = parse_conll_corpus(test, False, limit_test)

        count_derivs = {}
        no_complete_match = 0

        for i, tree in enumerate(trees):
            print("Parsing tree for ", i, file=stderr)

            print(tree, file=stderr)

            parser = parser_type(grammar_prim, tree)
            self.assertTrue(parser.recognized())
            count_derivs[i] = 0

            print("Found derivations for ", i, file=stderr)
            j = 0

            derivations = []

            for der in parser.all_derivation_trees():
                self.assertTrue(
                    der.check_integrity_recursive(der.root_id(), start))

                print(count_derivs[i], file=stderr)
                print(der, file=stderr)

                output_tree = HybridTree()

                the_yield = der.compute_yield()
                # print >>stderr, the_yield
                tokens2 = [
                    construct_conll_token('_', pos) for pos in the_yield
                ]

                dcp_to_hybridtree(output_tree,
                                  DCP_evaluator(der).getEvaluation(),
                                  tokens2,
                                  False,
                                  construct_conll_token,
                                  reorder=False)
                print(tree, file=stderr)
                print(output_tree, file=stderr)

                self.compare_hybrid_trees(tree, output_tree, compare_order)
                count_derivs[i] += 1
                derivations.append(der)

            self.assertTrue(
                sDCPParserTest.pairwise_different(
                    derivations, sDCPParserTest.compare_derivations))
            self.assertEqual(len(derivations), count_derivs[i])

            if count_derivs[i] == 0:
                no_complete_match += 1

        for key in count_derivs:
            print(key, count_derivs[key])

        print("# trees with no complete match:", no_complete_match)
Example #18
def trainAndEval(strategy,
                 labelling1,
                 labelling2,
                 fanout,
                 parser_type,
                 train,
                 test,
                 cDT,
                 parseStrings,
                 ignore_punctuation=False):
    file = open('results.txt', 'a')
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    recursive_partitioning = d_i.the_recursive_partitioning_factory(
    ).get_partitioning('fanout-' + str(fanout) + strategy)
    primary_labelling = d_l.the_labeling_factory(
    ).create_simple_labeling_strategy(labelling1, labelling2)

    trees = parse_conll_corpus(train, False, train_limit)
    if ignore_punctuation:
        trees = disconnect_punctuation(trees)
    (n_trees, grammar) = d_i.induce_grammar(trees, primary_labelling,
                                            term_labelling.token_label,
                                            recursive_partitioning, start)

    # write current transformation strategy and hyperparameters to results.txt
    if strategy == '':
        file.write('rtl ' + labelling1 + ' ' + labelling2 +
                   '    maximal fanout:' + str(fanout))
    else:
        splitList = strategy.split('-')
        if splitList[1] == 'left':
            file.write('ltr ' + labelling1 + ' ' + labelling2 +
                       '    maximal fanout:' + str(fanout))
        elif splitList[1] == 'random':
            file.write('random seed:' + splitList[2] + ' ' + labelling1 + ' ' +
                       labelling2 + ' maximal fanout:' + str(fanout))
        elif splitList[1] == 'no':
            if splitList[4] == 'random':
                file.write('nnont fallback:random seed:' + splitList[5] + ' ' +
                           labelling1 + ' ' + labelling2 + ' maximal fanout:' +
                           str(fanout))
            elif splitList[4] == 'ltr':
                file.write('nnont fallback:ltr' + ' ' + labelling1 + ' ' +
                           labelling2 + ' maximal fanout:' + str(fanout))
            elif splitList[4] == 'rtl':
                file.write('nnont fallback:rtl' + ' ' + labelling1 + ' ' +
                           labelling2 + ' maximal fanout:' + str(fanout))
            else:
                file.write('nnont fallback:argmax' + ' ' + labelling1 + ' ' +
                           labelling2 + ' maximal fanout:' + str(fanout))
        else:  # argmax
            file.write('argmax ' + labelling1 + ' ' + labelling2 +
                       ' maximal fanout:' + str(fanout))
    file.write('\n')

    res = ''

    res += '#nonts:' + str(len(grammar.nonts()))
    res += ' #rules:' + str(len(grammar.rules()))

    file.write(res)
    res = ''

    # The following code is to count the number of derivations for a hypergraph (tree parser required)
    if cDT:
        tree_parser.preprocess_grammar(grammar, term_labelling)

        trees = parse_conll_corpus(train, False, train_limit)
        if ignore_punctuation:
            trees = disconnect_punctuation(trees)

        derCount = 0
        derMax = 0
        for tree in trees:
            parser = tree_parser(grammar, tree)  # if tree parser is used
            der = parser.count_derivation_trees()
            if der > derMax:
                derMax = der
            derCount += der

        res += "\n#derivation trees:  average: " + str(
            1.0 * derCount / n_trees)
        res += " maximal: " + str(derMax)
    file.write(res)

    res = ''
    total_time = 0.0

    # The following code works for string parsers for evaluating
    if parseStrings:
        parser_type.preprocess_grammar(grammar)

        trees = parse_conll_corpus(test, False, test_limit)
        if ignore_punctuation:
            trees = disconnect_punctuation(trees)

        i = 0
        with open(result, 'w') as result_file:
            failures = 0
            for tree in trees:
                time_stamp = time.perf_counter()  # time.clock() was removed in Python 3.8
                i += 1
                #if (i % 100 == 0):
                #print '.',
                #sys.stdout.flush()

                parser = parser_type(grammar, tree_yield(tree.token_yield()))

                time_stamp = time.perf_counter() - time_stamp
                total_time += time_stamp

                cleaned_tokens = copy.deepcopy(tree.full_token_yield())
                for token in cleaned_tokens:
                    token.set_edge_label('_')
                h_tree = HybridTree(tree.sent_label())
                h_tree = parser.dcp_hybrid_tree_best_derivation(
                    h_tree, cleaned_tokens, ignore_punctuation,
                    construct_conll_token)

                if h_tree:
                    result_file.write(tree_to_conll_str(h_tree))
                    result_file.write('\n\n')
                else:
                    failures += 1
                    result_file.write(
                        tree_to_conll_str(
                            fall_back_left_branching_token(cleaned_tokens)))
                    result_file.write('\n\n')

        res += "\nattachment scores:\nno punctuation: "
        out = subprocess.check_output(
            ["perl", "../util/eval.pl", "-g", test, "-s", result, "-q"],
            universal_newlines=True)
        match = re.search(r'[^=]*= (\d+\.\d+)[^=]*= (\d+\.\d+).*', out)
        res += ' labelled:' + match.group(1)  # labeled attachment score
        res += ' unlabelled:' + match.group(2)  # unlabeled attachment score
        res += "\npunctuation: "
        out = subprocess.check_output(
            ["perl", "../util/eval.pl", "-g", test, "-s", result, "-q", "-p"],
            universal_newlines=True)
        match = re.search(r'[^=]*= (\d+\.\d+)[^=]*= (\d+\.\d+).*', out)
        res += ' labelled:' + match.group(1)
        res += ' unlabelled:' + match.group(2)

        res += "\nparse time: " + str(total_time)

    file.write(res)
    file.write('\n\n\n')
    file.close()
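
The attachment scores are scraped from eval.pl's textual output, so the regular expression is tied to that format. A slightly more defensive variant that tolerates bytes output and a failed match (the pattern itself is taken from the example and remains a guess at the exact format):

import re

def parse_eval_scores(out):
    text = out.decode('utf-8') if isinstance(out, bytes) else out
    match = re.search(r'[^=]*= (\d+\.\d+)[^=]*= (\d+\.\d+)', text)
    if match is None:
        return None
    return float(match.group(1)), float(match.group(2))  # labelled, unlabelled

assert parse_eval_scores(b'LAS = 85.30 ... UAS = 88.10') == (85.3, 88.1)
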