def build_score_validator(self, resource):
    """Create the organizer's candidate score validator from a corpus.

    A new ``PyCandidateScoreValidator`` is stored in
    ``self.organizer.validator``. Each object read from *resource* is fed
    to ``self.parser``; every derivation obtained for it is post-processed
    and scored against the gold object, and the scores are registered with
    the validator together with a derivation manager built from the same
    derivations. One progress record per object is printed to
    ``self.logger``.

    If ``self.parsing_timeout`` is truthy, derivations come from
    ``self._compute_derivations_with_timeout``; otherwise the parser is
    run directly and its k-best derivation trees are used.

    :param resource: passed unchanged to ``self.read_corpus``.
    """
    self.organizer.validator = PyCandidateScoreValidator(
        self.organizer.grammarInfo,
        self.organizer.storageManager,
        self.score_name,
    )

    gold_objects = self.read_corpus(resource)
    # Running total of scored derivations (not reported by this method).
    der_count = 0
    timed_out = False

    if self.parsing_timeout:
        # Keep a reference to the manager for the whole run: the shared
        # dict is a proxy served by the manager's process.
        timeout_manager = multiprocessing.Manager()
        return_dict = timeout_manager.dict()

    for obj_count, gold in enumerate(gold_objects, start=1):
        self.parser.set_input(self.parsing_preprocess(gold))

        if self.parsing_timeout:
            timed_out, weighted_derivations = \
                self._compute_derivations_with_timeout(return_dict)
        else:
            self.parser.parse()
            weighted_derivations = self.parser.k_best_derivation_trees()
        # Every entry carries the derivation at index 1.
        derivations = [entry[1] for entry in weighted_derivations]

        manager = PyDerivationManager(self.base_grammar,
                                      self.organizer.nonterminal_map)
        manager.convert_derivations_to_hypergraphs(derivations)

        # scores[i] belongs to derivations[i], i.e. to the i-th derivation
        # handed to the manager above.
        scores = [
            self.score_object(
                self.parsing_postprocess(self.obtain_sentence(gold), der),
                gold)
            for der in derivations
        ]
        der_count += len(derivations)

        self.organizer.validator.add_scored_candidates(
            manager, scores, self.max_score)

        # Progress marker: 't' = timeout, '.' = scored, '-' = no derivation.
        if timed_out:
            token = 't'
        elif scores:
            token = '.'
        else:
            token = '-'
        print(token, end='', file=self.logger)

        if scores:
            print(obj_count, 'max', max(scores), 'firsts', scores[0:10],
                  file=self.logger)
        else:
            print(obj_count, 'max 00.00', '[]', file=self.logger)

        self.parser.clear()
Esempio n. 2
0
def main():
    """Induce a grammar, refine it with split/merge cycles, optionally parse.

    The function
      1. induces a grammar from the training corpus,
      2. obtains derivations and serializes them via a derivation manager,
      3. runs ``max_cycles + 1`` split/merge cycles, printing the rule
         count of the grammar built for each cycle, and
      4. if the module-level ``parsing`` flag is truthy, parses the test
         corpus with each such grammar and prints the recognized trees.

    All corpus locations, limits and flags are read from names defined
    outside this function.
    """
    # --- grammar induction --------------------------------------------
    training_trees = parse_conll_corpus(train, False, limit_train)
    labeling_factory = the_labeling_factory()
    nonterminal_labelling = labeling_factory.create_simple_labeling_strategy(
        "childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    start_symbol = 'START'
    partitioning_strategies = [cfg]
    _, grammar = induce_grammar(training_trees, nonterminal_labelling,
                                term_labelling.token_label,
                                partitioning_strategies, start_symbol)

    # --- derivations --------------------------------------------------
    derivations = obtain_derivations(grammar, term_labelling)
    manager = PyDerivationManager(grammar)
    manager.convert_derivations_to_hypergraphs(derivations)
    manager.serialize(b"/tmp/derivations.txt")

    # --- rule -> nonterminal index table ------------------------------
    # NOTE(review): this table is not read again inside this function;
    # it is still built because the effect of object_index() on the
    # nonterminal map cannot be judged from here.
    rule_to_nonterminals = []
    rule_total = len(grammar.rule_index())
    for rule_position in range(rule_total):
        rule = grammar.rule_index(rule_position)
        lhs_index = manager.get_nonterminal_map().object_index(
            rule.lhs().nont())
        rhs_indices = [
            manager.get_nonterminal_map().object_index(nont)
            for nont in rule.rhs()
        ]
        rule_to_nonterminals.append([lhs_index] + rhs_indices)

    # --- split/merge trainer ------------------------------------------
    grammarInfo = PyGrammarInfo(grammar, manager.get_nonterminal_map())
    storageManager = PyStorageManager()
    trainer_builder = PySplitMergeTrainerBuilder(manager, grammarInfo)
    trainer_builder.set_em_epochs(20)
    trainer_builder.set_percent_merger(60.0)
    trainer = trainer_builder.build()

    annotations = [
        build_PyLatentAnnotation_initial(grammar, grammarInfo, storageManager)
    ]

    for cycle in range(max_cycles + 1):
        refined = trainer.split_merge_cycle(annotations[-1])
        annotations.append(refined)

        # The grammar for this cycle is built from the annotation the
        # cycle started from (index ``cycle``), not from ``refined``.
        sm_grammar = build_sm_grammar(annotations[cycle],
                                      grammar,
                                      grammarInfo,
                                      rule_pruning=0.0001,
                                      rule_smoothing=0.01)
        print("Cycle: ", cycle, "Rules: ", len(sm_grammar.rules()))

        if not parsing:
            continue

        parser = GFParser(sm_grammar)
        for tree in parse_conll_corpus(test, False, limit_test):
            parser_input = term_labelling.prepare_parser_input(
                tree.token_yield())
            parser.set_input(parser_input)
            parser.parse()
            if not parser.recognized():
                continue

            best_derivation = parser.best_derivation_tree()
            pos_tags = [token.pos() for token in tree.token_yield()]
            forms = [token.form() for token in tree.token_yield()]
            print(
                derivation_to_hybrid_tree(best_derivation, pos_tags, forms,
                                          construct_constituent_token))
Esempio n. 3
0
def _gold_heads_and_labels(gold_tree):
    """Return ``(heads, labels)`` dicts for *gold_tree*, keyed by position.

    ``heads[position]`` is 0 for a node without parent, otherwise the
    1-based position of the parent within ``gold_tree.id_yield()``.
    ``labels[position]`` is the ``deprel()`` of the node's token.
    """
    # Fetch the yield once instead of once per token.
    yield_ids = gold_tree.id_yield()
    heads = {}
    labels = {}
    for position, node_id in enumerate(yield_ids):
        parent_id = gold_tree.parent(node_id)
        labels[position] = gold_tree.node_token(node_id).deprel()
        if parent_id is None:
            assert node_id in gold_tree.root
            heads[position] = 0
        else:
            heads[position] = yield_ids.index(parent_id) + 1
    return heads, labels


def _attachment_counts(h_tree, gold_heads, gold_labels):
    """Compare *h_tree* with the gold heads/labels position by position.

    :return: ``(las, uas, lac)`` -- the number of positions where both head
        and label match, where the head matches, and where the label
        matches, respectively.
    :raises KeyError: if *h_tree* yields a position unknown to the gold
        dictionaries.
    """
    yield_ids = h_tree.id_yield()
    las, uas, lac = 0, 0, 0
    for position, node_id in enumerate(yield_ids):
        parent_id = h_tree.parent(node_id)
        if parent_id is None:
            assert node_id in h_tree.root
            head = 0
        else:
            head = yield_ids.index(parent_id) + 1
        label = h_tree.node_token(node_id).deprel()

        head_matches = gold_heads[position] == head
        label_matches = gold_labels[position] == label
        if head_matches:
            uas += 1
        if label_matches:
            lac += 1
        if head_matches and label_matches:
            las += 1
    return las, uas, lac


def build_score_validator(baseline_grammar, grammarInfo, nont_map,
                          storageManager, term_labelling, parser,
                          corpus_validation, validationMethod):
    """Build a ``PyCandidateScoreValidator`` from a validation corpus.

    Every tree from ``corpus_validation.get_trees()`` is parsed with
    *parser*. Each k-best derivation is converted into a hybrid tree and
    compared with the gold tree; the resulting per-derivation scores are
    added to the validator together with a derivation manager holding the
    same derivations, using the length of the gold tree's ``id_yield()`` as
    the maximum score.

    :param baseline_grammar: grammar passed to each ``PyDerivationManager``.
    :param grammarInfo: passed to the validator constructor.
    :param nont_map: nonterminal map passed to each ``PyDerivationManager``.
    :param storageManager: passed to the validator constructor.
    :param term_labelling: provides ``prepare_parser_input`` for the parser.
    :param parser: parser used for every gold tree; cleared after each one.
    :param corpus_validation: object providing ``get_trees()``.
    :param validationMethod: ``"LAS"``, ``"UAS"`` or ``"LAC"`` selects which
        count is used as score. For any other value no score is recorded
        for a derivation.
    :return: the populated validator.
    """
    validator = PyCandidateScoreValidator(grammarInfo, storageManager,
                                          validationMethod)

    tree_count = 0
    der_count = 0
    for gold_tree in corpus_validation.get_trees():
        tree_count += 1
        parser.set_input(
            term_labelling.prepare_parser_input(gold_tree.token_yield()))
        parser.parse()

        # Query the k-best list exactly once and materialize it, so that
        # the derivations stored in the manager and the derivations scored
        # below are the same objects in the same order.
        derivations = [entry[1] for entry in parser.k_best_derivation_trees()]
        manager = PyDerivationManager(baseline_grammar, nont_map)
        manager.convert_derivations_to_hypergraphs(derivations)

        gold_heads, gold_labels = _gold_heads_and_labels(gold_tree)

        scores = []
        for der in derivations:
            der_count += 1
            h_tree = HybridTree()
            # Work on a copy so the gold tree's tokens stay untouched.
            cleaned_tokens = copy.deepcopy(gold_tree.full_token_yield())
            dcp = DCP_evaluator(der).getEvaluation()
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_conll_token)

            las, uas, lac = _attachment_counts(h_tree, gold_heads,
                                               gold_labels)

            if validationMethod == "LAS":
                scores.append(las)
            elif validationMethod == "UAS":
                scores.append(uas)
            elif validationMethod == "LAC":
                scores.append(lac)

        max_score = len(gold_tree.id_yield())
        validator.add_scored_candidates(manager, scores, max_score)
        print(tree_count, max_score, scores)
        parser.clear()

    # An empty corpus must not abort with ZeroDivisionError; report 0.0.
    average = der_count * 1.0 / tree_count if tree_count else 0.0
    print("trees used for validation ", tree_count, "with",
          average, "derivations on average")

    return validator