def build_score_validator(self, resource):
    """Populate ``self.organizer.validator`` with scored k-best candidates.

    For every gold object in the validation corpus read from *resource*, the
    configured parser produces k-best derivations; each derivation is
    post-processed into a result, scored against the gold object with
    ``self.score_object``, and registered with the validator together with the
    derivation hypergraphs.

    Side effects: replaces ``self.organizer.validator``, writes progress
    characters and per-object score summaries to ``self.logger``, and clears
    the parser after every object.
    """
    self.organizer.validator = PyCandidateScoreValidator(
        self.organizer.grammarInfo, self.organizer.storageManager,
        self.score_name)
    corpus_validation = self.read_corpus(resource)
    obj_count = 0   # number of gold objects processed
    der_count = 0   # total number of derivations scored (across all objects)
    timeout = False
    if self.parsing_timeout:
        # Shared dict used to collect results from the parsing subprocess
        # when a timeout limit is configured.
        timeout_manager = multiprocessing.Manager()
        return_dict = timeout_manager.dict()
    for gold in corpus_validation:
        obj_count += 1
        self.parser.set_input(self.parsing_preprocess(gold))
        if self.parsing_timeout:
            # NOTE(review): `timeout` keeps the value from the last iteration
            # until reassigned here; with parsing_timeout unset it stays False.
            timeout, derivations_ = self._compute_derivations_with_timeout(
                return_dict)
            # k-best entries are (weight, derivation) pairs — keep derivations.
            derivations = list(map(lambda x: x[1], derivations_))
        else:
            self.parser.parse()
            derivations = list(
                map(lambda x: x[1], self.parser.k_best_derivation_trees()))
        manager = PyDerivationManager(self.base_grammar,
                                      self.organizer.nonterminal_map)
        manager.convert_derivations_to_hypergraphs(derivations)
        scores = []
        # derivations = self.parser.k_best_derivation_trees()
        for der in derivations:
            der_count += 1
            result = self.parsing_postprocess(self.obtain_sentence(gold), der)
            score = self.score_object(result, gold)
            scores.append(score)
        self.organizer.validator.add_scored_candidates(manager, scores,
                                                       self.max_score)
        # print(obj_count, self.max_score, scores)
        # Progress marker: 't' = timed out, '.' = scored candidates, '-' = none.
        token = 't' if timeout else ('.' if scores else '-')
        print(token, end='', file=self.logger)
        if scores:
            print(obj_count, 'max', max(scores), 'firsts', scores[0:10],
                  file=self.logger)
        else:
            print(obj_count, 'max 00.00', '[]', file=self.logger)
        self.parser.clear()
def main():
    """Induce an LCFRS/sDCP grammar from a CoNLL corpus, refine it with
    split/merge EM training, and optionally parse a test corpus with the
    refined grammar after each cycle.

    Relies on module-level configuration (``train``, ``test``,
    ``limit_train``, ``limit_test``, ``max_cycles``, ``parsing``,
    ``sm_info_path`` — assumed defined elsewhere in this file; TODO confirm).
    """
    # induce grammar from a corpus
    trees = parse_conll_corpus(train, False, limit_train)
    nonterminal_labelling = the_labeling_factory(
    ).create_simple_labeling_strategy("childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    start = 'START'
    recursive_partitioning = [cfg]
    _, grammar = induce_grammar(trees, nonterminal_labelling,
                                term_labelling.token_label,
                                recursive_partitioning, start)

    # compute some derivations
    derivations = obtain_derivations(grammar, term_labelling)

    # create derivation manager and add derivations
    manager = PyDerivationManager(grammar)
    manager.convert_derivations_to_hypergraphs(derivations)
    manager.serialize(b"/tmp/derivations.txt")

    # build and configure split/merge trainer and supplementary objects
    # Map every rule to the internal indices of its LHS and RHS nonterminals.
    rule_to_nonterminals = []
    for i in range(0, len(grammar.rule_index())):
        rule = grammar.rule_index(i)
        nonts = [
            manager.get_nonterminal_map().object_index(rule.lhs().nont())
        ] + [
            manager.get_nonterminal_map().object_index(nont)
            for nont in rule.rhs()
        ]
        rule_to_nonterminals.append(nonts)
    grammarInfo = PyGrammarInfo(grammar, manager.get_nonterminal_map())
    storageManager = PyStorageManager()
    builder = PySplitMergeTrainerBuilder(manager, grammarInfo)
    builder.set_em_epochs(20)
    builder.set_percent_merger(60.0)
    splitMergeTrainer = builder.build()

    latentAnnotation = [
        build_PyLatentAnnotation_initial(grammar, grammarInfo, storageManager)
    ]
    for i in range(max_cycles + 1):
        latentAnnotation.append(
            splitMergeTrainer.split_merge_cycle(latentAnnotation[-1]))
        # pickle.dump(map(lambda la: la.serialize(), latentAnnotation), open(sm_info_path, 'wb'))
        # NOTE(review): latentAnnotation[i] is the annotation from BEFORE this
        # cycle (the freshly trained one is latentAnnotation[i + 1]) — confirm
        # this lag is intentional.
        smGrammar = build_sm_grammar(latentAnnotation[i],
                                     grammar,
                                     grammarInfo,
                                     rule_pruning=0.0001,
                                     rule_smoothing=0.01)
        print("Cycle: ", i, "Rules: ", len(smGrammar.rules()))
        if parsing:
            # Parse the test corpus with the current refined grammar and print
            # the best derivation of every recognized sentence.
            parser = GFParser(smGrammar)
            trees = parse_conll_corpus(test, False, limit_test)
            for tree in trees:
                parser.set_input(
                    term_labelling.prepare_parser_input(tree.token_yield()))
                parser.parse()
                if parser.recognized():
                    print(
                        derivation_to_hybrid_tree(
                            parser.best_derivation_tree(),
                            [token.pos() for token in tree.token_yield()],
                            [token.form() for token in tree.token_yield()],
                            construct_constituent_token))
def build_score_validator(baseline_grammar, grammarInfo, nont_map,
                          storageManager, term_labelling, parser,
                          corpus_validation, validationMethod):
    """Build a ``PyCandidateScoreValidator`` from a validation corpus.

    Every gold tree is parsed with *parser*; each k-best derivation is turned
    into a hybrid tree and scored against the gold dependency annotation using
    *validationMethod* ("LAS", "UAS", or "LAC"). The derivation hypergraphs
    and their scores are registered with the validator.

    :param baseline_grammar: grammar the derivations belong to
    :param grammarInfo: PyGrammarInfo for the grammar
    :param nont_map: nonterminal index map matching `grammarInfo`
    :param storageManager: PyStorageManager instance
    :param term_labelling: strategy used to prepare parser input
    :param parser: parser supporting set_input/parse/k_best_derivation_trees
    :param corpus_validation: corpus object exposing ``get_trees()``
    :param validationMethod: one of "LAS", "UAS", "LAC"
    :return: the populated validator
    """
    validator = PyCandidateScoreValidator(grammarInfo, storageManager,
                                          validationMethod)
    # parser = GFParser(baseline_grammar)
    tree_count = 0
    der_count = 0
    for gold_tree in corpus_validation.get_trees():
        tree_count += 1
        parser.set_input(
            term_labelling.prepare_parser_input(gold_tree.token_yield()))
        parser.parse()
        derivations = map(lambda x: x[1], parser.k_best_derivation_trees())
        manager = PyDerivationManager(baseline_grammar, nont_map)
        manager.convert_derivations_to_hypergraphs(derivations)
        scores = []

        # Gold heads (1-based position; 0 = root) and dependency labels,
        # keyed by linear token position.
        gold_ids = gold_tree.id_yield()  # hoisted: .index() below scans it
        gold_labels = {}
        gold_heads = {}
        for position, node_id in enumerate(gold_ids):
            parent_id = gold_tree.parent(node_id)
            gold_labels[position] = gold_tree.node_token(node_id).deprel()
            if parent_id is None:
                assert node_id in gold_tree.root
                gold_heads[position] = 0
            else:
                gold_heads[position] = gold_ids.index(parent_id) + 1

        derivations = parser.k_best_derivation_trees()
        for _, der in derivations:
            der_count += 1
            h_tree = HybridTree()
            # deepcopy: dcp_to_hybridtree mutates the token list.
            cleaned_tokens = copy.deepcopy(gold_tree.full_token_yield())
            dcp = DCP_evaluator(der).getEvaluation()
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_conll_token)
            las, uas, lac = 0, 0, 0
            h_ids = h_tree.id_yield()  # hoisted out of the per-token loop
            for position, node_id in enumerate(h_ids):
                parent_id = h_tree.parent(node_id)
                if parent_id is None:
                    assert node_id in h_tree.root
                    head = 0
                else:
                    head = h_ids.index(parent_id) + 1
                label = h_tree.node_token(node_id).deprel()
                if gold_heads[position] == head:
                    uas += 1
                if gold_labels[position] == label:
                    lac += 1
                if gold_heads[position] == head \
                        and gold_labels[position] == label:
                    las += 1
            if validationMethod == "LAS":
                scores.append(las)
            elif validationMethod == "UAS":
                scores.append(uas)
            elif validationMethod == "LAC":
                scores.append(lac)

        max_score = len(gold_ids)
        validator.add_scored_candidates(manager, scores, max_score)
        print(tree_count, max_score, scores)
        parser.clear()
    # Guard against an empty validation corpus (previously raised
    # ZeroDivisionError when tree_count == 0).
    average = der_count * 1.0 / tree_count if tree_count else 0.0
    print("trees used for validation ", tree_count, "with", average,
          "derivations on average")
    return validator