def test_basic_sdcp_parsing_constituency(self):
    """Induce an LCFRS from two constituent trees, then parse with the sDCP parser.

    Exercises the full pipeline: grammar induction via fringe extraction,
    grammar preprocessing, hybrid-tree parsing, and reconstruction of an
    output tree from each derivation's DCP evaluation.
    """
    tree1 = constituent_tree_1()
    tree2 = constituent_tree_2()
    tree3 = constituent_tree_1_pos_stripped()
    terminal_labeling = FormTerminals()  # [tree1, tree2], 1, filter=["VP"])
    fanout = 1

    # Induce one grammar per training tree and merge them.
    grammar = LCFRS('START')
    for tree in [tree1, tree2]:
        tree_part = tree.unlabelled_structure()
        part = fanout_limited_partitioning(tree_part, fanout)
        tree_grammar = fringe_extract_lcfrs(tree, part, naming='child',
                                            term_labeling=terminal_labeling)
        grammar.add_gram(tree_grammar)
    grammar.make_proper()
    print("grammar induced. Printing rules...", file=stderr)
    for rule in grammar.rules():
        print(rule, file=stderr)

    parser_type = LCFRS_sDCP_Parser
    print("preprocessing grammar", file=stderr)
    parser_type.preprocess_grammar(grammar, terminal_labeling, debug=True)

    # Parse the first training tree and rebuild an output tree per derivation.
    print("invoking parser", file=stderr)
    parser = parser_type(grammar, tree1)
    print("listing derivations", file=stderr)
    for der in parser.all_derivation_trees():
        print(der)
        output_tree = ConstituentTree(tree1.sent_label())
        # POS tags are replaced by '--'; the parser must recover structure anyway.
        tokens = [construct_constituent_token(token.form(), '--', True)
                  for token in tree1.token_yield()]
        dcp_to_hybridtree(output_tree, DCP_evaluator(der).getEvaluation(),
                          tokens, False, construct_constituent_token)
        print(tree1)
        print(output_tree)

    # Same procedure for the POS-stripped variant of tree1.
    parser = parser_type(grammar, tree3)
    print(parser.recognized())
    for der in parser.all_derivation_trees():
        print(der)
        output_tree = ConstituentTree(tree3.sent_label())
        tokens = [construct_constituent_token(token.form(), '--', True)
                  for token in tree3.token_yield()]
        dcp_to_hybridtree(output_tree, DCP_evaluator(der).getEvaluation(),
                          tokens, False, construct_constituent_token)
        print(tree3)
        print(output_tree)
    print("completed test", file=stderr)
def parsing_postprocess(self, sentence, derivation, label=None):
    """Convert a parser derivation into a :class:`ConstituentTree`.

    :param sentence: 4-tuple ``(full_yield, id_yield, full_token_yield,
        token_yield)`` describing the input sentence
    :param derivation: derivation tree produced by the parser
    :param label: sentence label for the resulting tree
    :return: the reconstructed constituent tree
    """
    full_yield, id_yield, full_token_yield, token_yield = sentence

    dcp_tree = ConstituentTree(label)
    # 1-based positions of punctuation: tokens that occur in the full yield
    # but are not attached to the tree proper (not in id_yield).
    punctuation_positions = [i + 1 for i, idx in enumerate(full_yield)
                             if idx not in id_yield]
    # Deep-copy so DCP evaluation cannot mutate the caller's tokens.
    cleaned_tokens = copy.deepcopy(full_token_yield)
    dcp = DCP_evaluator(derivation).getEvaluation()
    dcp_to_hybridtree(dcp_tree, dcp, cleaned_tokens, False,
                      construct_constituent_token,
                      punct_positions=punctuation_positions)
    # BUGFIX: was `if True or self.strip_vroot:` — a debug leftover that
    # unconditionally stripped the virtual root and made the flag dead.
    if self.strip_vroot:
        dcp_tree.strip_vroot()
    return dcp_tree
def test_induction_and_parsing_with_pos_recovery(self):
    """Induce a grammar with isolated POS rules and check POS tags are recovered.

    The parser input carries only word forms ('--' as POS); after parsing,
    the reconstructed tree must reproduce both form and POS of every token.
    """
    naming = 'child'

    def rec_part(tree):
        # Left-branching recursive partitioning over the tree's yield.
        return left_branching_partitioning(len(tree.id_yield()))

    tree = self.tree
    tree.add_to_root("VP1")
    print(tree)
    grammar = fringe_extract_lcfrs(tree, rec_part(tree), naming=naming,
                                   isolate_pos=True,
                                   term_labeling=FormTerminals())
    print(grammar)
    parser = LCFRS_parser(grammar)
    # Feed word forms only; POS must be recovered by the grammar.
    parser.set_input([token.form() for token in tree.token_yield()])
    parser.parse()
    self.assertTrue(parser.recognized())
    derivation = parser.best_derivation_tree()

    e = DCP_evaluator(derivation)
    dcp_term = e.getEvaluation()
    print(str(dcp_term[0]))

    t = ConstituentTree()
    dcp_to_hybridtree(
        t, dcp_term,
        [construct_constituent_token(token.form(), '--', True)
         for token in tree.token_yield()],
        ignore_punctuation=False,
        construct_token=construct_constituent_token)
    print(t)

    # Round-trip check: same number of tokens, same forms and POS tags.
    self.assertEqual(len(tree.token_yield()), len(t.token_yield()))
    for tok1, tok2 in zip(tree.token_yield(), t.token_yield()):
        self.assertEqual(tok1.form(), tok2.form())
        self.assertEqual(tok1.pos(), tok2.pos())
def tree2():
    """Build a small example tree with crossing branches over the words a b d c.

    Nodes r1/A and r2/B each dominate a discontinuous pair of leaves under
    the root r0/C.
    """
    t = ConstituentTree("1")
    # Leaves '0'..'3' carry POS tags derived from their word ("P" + word).
    for idx, word in enumerate(["a", "b", "d", "c"]):
        t.add_leaf(str(idx), "P" + word, word)
    # Label the inner nodes.
    for node_id, label in (('r0', 'C'), ('r1', 'A'), ('r2', 'B')):
        t.set_label(node_id, label)
    t.add_to_root('r0')
    # Wire up the structure: r1 spans leaves 0 and 3, r2 spans 1 and 2.
    edges = (('r0', 'r1'), ('r0', 'r2'),
             ('r1', '0'), ('r1', '3'),
             ('r2', '1'), ('r2', '2'))
    for parent, child in edges:
        t.add_child(parent, child)
    print(t, t.word_yield())
    return t
def constituent_tree_1():
    """Constituent tree for sentence s1: "hat schnell gearbeitet ."

    The V node dominates the discontinuous pair f1/f3; punctuation f4 is
    left unattached.
    """
    t = ConstituentTree("s1")
    # Leaves in sentence order.
    for leaf_id, pos, form in (("f1", "VP", "hat"),
                               ("f2", "ADV", "schnell"),
                               ("f3", "VP", "gearbeitet")):
        t.add_leaf(leaf_id, pos, form)
    t.add_punct("f4", "PUNC", ".")
    # Inner node V spans the discontinuous leaves f1 and f3.
    t.set_label("V", "V")
    t.add_child("V", "f1")
    t.add_child("V", "f3")
    t.set_label("ADV", "ADV")
    t.add_child("ADV", "f2")
    # VP is the root, dominating V and ADV.
    t.set_label("VP", "VP")
    t.add_child("VP", "V")
    t.add_child("VP", "ADV")
    t.add_to_root("VP")
    return t
def constituent_tree_2():
    """Constituent tree for sentence s2: "John hit the Ball ."

    A plain projective S -> N VP, VP -> V NP structure; punctuation l5 is
    left unattached.
    """
    t = ConstituentTree("s2")
    # Leaves in sentence order.
    for leaf_id, pos, form in (("l1", "N", "John"),
                               ("l2", "V", "hit"),
                               ("l3", "D", "the"),
                               ("l4", "N", "Ball")):
        t.add_leaf(leaf_id, pos, form)
    t.add_punct("l5", "PUNC", ".")
    # NP spans determiner and noun.
    t.set_label("NP", "NP")
    t.add_child("NP", "l3")
    t.add_child("NP", "l4")
    # VP spans verb and NP.
    t.set_label("VP", "VP")
    t.add_child("VP", "l2")
    t.add_child("VP", "NP")
    # S is the root over subject and VP.
    t.set_label("S", "S")
    t.add_child("S", "l1")
    t.add_child("S", "VP")
    t.add_to_root("S")
    return t
def setUp(self):
    """Construct the fixture trees.

    ``self.tree`` and ``self.tree2`` share the same discontinuous
    VP/ADVP shape over different words; only ``self.tree2`` is rooted.
    ``self.tree3`` is a flat NP over four leaves.
    """

    def _vp_fixture(sent_id, fin_form, fin_morph, adv_form, part_form,
                    rooted):
        # Shared builder for the two parallel VP trees; the exact call
        # sequence matches the former hand-written construction.
        t = ConstituentTree(sent_id)
        t.add_leaf("f1", "VAFIN", fin_form, morph=fin_morph)
        t.add_leaf("f2", "ADV", adv_form, morph=[("degree", "Pos")])
        t.add_leaf("f3", "VVPP", part_form)
        t.add_punct("f4", "PUNC", ".")
        t.add_child("VP2", "f1")
        t.add_child("VP2", "f3")
        t.add_child("ADVP", "f2")
        t.add_child("VP1", "VP2")
        t.add_child("VP1", "ADVP")
        t.set_label("VP2", "VP")
        t.set_label("VP1", "VP")
        t.set_label("ADVP", "ADVP")
        if rooted:
            t.add_to_root("VP1")
        return t

    # s1 is deliberately left unrooted; tests attach VP1 themselves.
    self.tree = _vp_fixture(
        "s1", "hat",
        [("number", "Sg"), ("person", "3"), ("tense", "Past"), ("mood", "Ind")],
        "schnell", "gearbeitet", rooted=False)
    self.tree2 = _vp_fixture(
        "s2", "haben",
        [("number", "Pl"), ("person", "3"), ("tense", "Past"), ("mood", "Ind")],
        "gut", "gekocht", rooted=True)

    # s3: flat NP over four NK-labelled leaves.
    self.tree3 = ConstituentTree("s3")
    for leaf_id, pos, form, morph in (
            ("f1", "ADJA", "Allgemeiner", [("number", "Sg")]),
            ("f2", "ADJA", "Deutscher", [("degree", "Pos"), ("number", "Sg")]),
            ("f3", "NN", "Fahrrad", [("number", "Sg"), ("gender", "Neut")]),
            ("f4", "NN", "Club", [("number", "Sg"), ("gender", "Neut")])):
        self.tree3.add_leaf(leaf_id, pos, form, edge="NK", morph=morph)
    for i in range(1, 5):
        self.tree3.add_child("NP", "f" + str(i))
    self.tree3.set_label("NP", "NP")
    self.tree3.add_to_root("NP")
class ConstituentTreeTest(unittest.TestCase):
    """Tests for ConstituentTree accessors and LCFRS grammar induction.

    NOTE(review): a second class with this same name appears later in the
    file; the later definition shadows this one at import time — confirm
    whether both are meant to exist.
    """

    def test_something(self):
        """Smoke test: exercise the ConstituentTree inspection API and print results."""
        tree = self.tree
        print("rooted", tree.root)
        tree.add_to_root("VP1")
        print("rooted", tree.root)
        print(tree)
        print("sent label", tree.sent_label())
        print("leaves", tree.leaves())
        print("is leaf (leaves)", [(x, tree.is_leaf(x)) for (x, _, _) in tree.leaves()])
        print("is leaf (internal)", [(x, tree.is_leaf(x)) for x in tree.ids()])
        print("leaf index", [(x, tree.leaf_index(x)) for x in ["f1", "f2", "f3"]])
        print("pos yield", tree.pos_yield())
        print("word yield", tree.word_yield())
        # reentrant
        # parent
        print("ids", tree.ids())
        # reorder
        print("n nodes", tree.n_nodes())
        print("n gaps", tree.n_gaps())
        print("fringe VP", tree.fringe("VP"))
        print("fringe V", tree.fringe("V"))
        print("empty fringe", tree.empty_fringe())
        print("complete?", tree.complete())
        print("max n spans", tree.max_n_spans())
        print("unlabelled structure", tree.unlabelled_structure())
        print("labelled spans", tree.labelled_spans())

    def test_induction(self):
        """Induce grammars with and without POS isolation and merge them,
        checking that feature logs are combined via add_gram."""
        naming = 'child'

        def rec_part(tree):
            # Left-branching recursive partitioning over the tree's yield.
            return left_branching_partitioning(len(tree.id_yield()))
            # return fanout_k_left_to_right(tree, 1)

        tree = self.tree
        tree.add_to_root("VP1")
        feature_log1 = defaultdict(lambda: 0)
        grammar = fringe_extract_lcfrs(tree, rec_part(tree),
                                       feature_logging=feature_log1, naming=naming)
        for key in feature_log1:
            print(key, feature_log1[key])
        print(grammar)
        # Second grammar from the same tree, but with POS rules isolated.
        feats = defaultdict(lambda: 0)
        grammar_ = fringe_extract_lcfrs(tree, rec_part(tree), isolate_pos=True,
                                        feature_logging=feats, naming=naming)
        print(grammar_)
        for key in feats:
            print(key, feats[key])
        print("Adding 2nd grammar to first")
        # Merging also merges the two feature logs.
        grammar.add_gram(grammar_, feature_logging=(feature_log1, feats))
        for idx in range(0, len(grammar.rules())):
            print(idx, grammar.rule_index(idx))
        print("Adding 3rd grammar to first")
        feats3 = defaultdict(lambda: 0)
        grammar3 = fringe_extract_lcfrs(self.tree2, rec_part(self.tree2),
                                        isolate_pos=True, feature_logging=feats3,
                                        naming=naming)
        grammar.add_gram(grammar3, feature_logging=(feature_log1, feats3))
        print()
        for idx in range(0, len(grammar.rules())):
            print(idx, grammar.rule_index(idx))
        print()
        print("New feature log")
        print()
        for key in feature_log1:
            print(key, feature_log1[key])
        grammar.make_proper()
        build_nont_splits_dict(grammar, feature_log1, nonterminals=Enumerator())
        print(grammar.rule_index(0))
        print(grammar.rule_index(2))

    def test_markovized_induction(self):
        """Induce a grammar with markovized nonterminal naming (v=2, h=0)."""
        naming = 'strict-markov-v-2-h-0'

        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))
            # return fanout_k_left_to_right(tree, 1)

        tree = self.tree
        tree.add_to_root("VP1")
        print(tree)
        grammar = fringe_extract_lcfrs(tree, rec_part(tree), naming=naming,
                                       isolate_pos=True)
        print(grammar)

    def test_induction_2(self):
        """Induce a grammar from tree3 and build nonterminal splits from
        the logged features (pos_cat_feats feature function)."""
        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))

        features = defaultdict(lambda: 0)
        grammar = fringe_extract_lcfrs(self.tree3, rec_part(self.tree3),
                                       naming="child", feature_logging=features,
                                       isolate_pos=True)
        grammar.make_proper()
        if False:
            # Disabled debug dump: per-rule features keyed by rule index.
            for idx in range(0, len(grammar.rules())):
                print(grammar.rule_index(idx))
                for key in features:
                    if key[0] == idx:
                        print(key, features[key])
                print()
        # Print only the non-rule-indexed feature entries.
        for key in features:
            if type(key[0]) == int:
                continue
            print(key, features[key])
        nont_splits, root_weights, rule_weights, _ = build_nont_splits_dict(
            grammar, features, nonterminals=Enumerator(),
            feat_function=pos_cat_feats, debug=True)
        print(nont_splits)
        print(root_weights)
        print(rule_weights)

    def setUp(self):
        """Build fixture trees s1 (unrooted), s2 (rooted), and s3 (flat NP)."""
        tree = ConstituentTree("s1")
        tree.add_leaf("f1", "VAFIN", "hat",
                      morph=[("number", "Sg"), ("person", "3"),
                             ("tense", "Past"), ("mood", "Ind")])
        tree.add_leaf("f2", "ADV", "schnell", morph=[("degree", "Pos")])
        tree.add_leaf("f3", "VVPP", "gearbeitet")
        tree.add_punct("f4", "PUNC", ".")
        tree.add_child("VP2", "f1")
        tree.add_child("VP2", "f3")
        tree.add_child("ADVP", "f2")
        tree.add_child("VP1", "VP2")
        tree.add_child("VP1", "ADVP")
        tree.set_label("VP2", "VP")
        tree.set_label("VP1", "VP")
        tree.set_label("ADVP", "ADVP")
        # s1 is deliberately left unrooted; tests attach VP1 themselves.
        self.tree = tree

        tree2 = ConstituentTree("s2")
        tree2.add_leaf("f1", "VAFIN", "haben",
                       morph=[("number", "Pl"), ("person", "3"),
                              ("tense", "Past"), ("mood", "Ind")])
        tree2.add_leaf("f2", "ADV", "gut", morph=[("degree", "Pos")])
        tree2.add_leaf("f3", "VVPP", "gekocht")
        tree2.add_punct("f4", "PUNC", ".")
        tree2.add_child("VP2", "f1")
        tree2.add_child("VP2", "f3")
        tree2.add_child("ADVP", "f2")
        tree2.add_child("VP1", "VP2")
        tree2.add_child("VP1", "ADVP")
        tree2.set_label("VP2", "VP")
        tree2.set_label("VP1", "VP")
        tree2.set_label("ADVP", "ADVP")
        tree2.add_to_root("VP1")
        self.tree2 = tree2

        # s3: flat NP over four NK-labelled leaves.
        self.tree3 = ConstituentTree("s3")
        self.tree3.add_leaf("f1", "ADJA", "Allgemeiner", edge="NK",
                            morph=[("number", "Sg")])
        self.tree3.add_leaf("f2", "ADJA", "Deutscher", edge="NK",
                            morph=[("degree", "Pos"), ("number", "Sg")])
        self.tree3.add_leaf("f3", "NN", "Fahrrad", edge="NK",
                            morph=[("number", "Sg"), ("gender", "Neut")])
        self.tree3.add_leaf("f4", "NN", "Club", edge="NK",
                            morph=[("number", "Sg"), ("gender", "Neut")])
        for i in range(1, 5):
            self.tree3.add_child("NP", "f" + str(i))
        self.tree3.set_label("NP", "NP")
        self.tree3.add_to_root("NP")
def flat_dummy_constituent_tree(token_yield, full_token_yield, dummy_label, dummy_root, label=None, gold_pos=True):
    """
    :param token_yield: connected yield of a parse tree
    :type token_yield: list[ConstituentTerminal]
    :param full_token_yield: full yield of the parse tree
    :type full_token_yield: list[ConstituentTerminal]
    :param dummy_label: unused (kept for interface compatibility with
        dummy_constituent_tree)
    :param dummy_root: label of the root node
    :param label: sentence label of the tree
    :param gold_pos: if True use each token's gold POS, else '--'
    :return: dummy constituent tree
    :rtype: ConstituentTree
    generates a flat dummy tree for a given yield where all nodes are attached under the root
    """
    tree = ConstituentTree(label)

    # generate root node
    root_id = 'n_root'
    tree.add_node(root_id, ConstituentCategory(dummy_root))
    tree.add_to_root(root_id)
    parent = root_id

    # create all leaves and punctuation.
    # BUGFIX: previously used full_token_yield.index(token) per token, which
    # returns the FIRST occurrence for duplicate tokens (wrong positions) and
    # makes the loop O(n^2); enumerate() yields each token's true position.
    for idx, token in enumerate(full_token_yield):
        pos = token.pos() if gold_pos else '--'
        if token not in token_yield:
            # token is punctuation: not part of the connected yield
            tree.add_punct(idx, pos, token.form())
        else:
            tree.add_leaf(idx, pos, token.form(),
                          morph=token.morph_feats(), lemma=token.lemma())
            tree.add_child(parent, idx)
    return tree
def dummy_constituent_tree(token_yield, full_token_yield, dummy_label, dummy_root, label=None):
    """
    :param token_yield: connected yield of a parse tree
    :type token_yield: list[ConstituentTerminal]
    :param full_token_yield: full yield of the parse tree
    :type full_token_yield: list[ConstituentTerminal]
    :return: dummy constituent tree
    :rtype: ConstituentTree
    generates a dummy tree for a given yield using dummy_label as inner node symbol
    """
    tree = ConstituentTree(label)

    # create all leaves and punctuation
    # NOTE(review): full_token_yield.index(token) returns the first matching
    # position; positions are wrong if equal tokens occur twice — confirm
    # tokens are unique in practice.
    for token in full_token_yield:
        if token not in token_yield:
            tree.add_punct(full_token_yield.index(token), token.pos(), token.form())
        else:
            tree.add_leaf(full_token_yield.index(token), token.pos(), token.form())

    # generate root node
    root_id = 'n0'
    tree.add_node(root_id, ConstituentCategory(dummy_root))
    tree.add_to_root(root_id)
    parent = root_id

    if len(token_yield) > 1:
        i = 1
        # generate inner nodes of branching tree:
        # each inner node 'n<i>' gets one leaf and the next inner node as
        # children, producing a right-leaning spine; the last inner node
        # receives the final two leaves directly.
        for token in token_yield[:-2]:
            node = ConstituentCategory(str(dummy_label))
            tree.add_node('n' + str(i), node)
            tree.add_child(parent, 'n' + str(i))
            tree.add_child(parent, full_token_yield.index(token))
            parent = 'n' + str(i)
            i += 1
        token = token_yield[len(token_yield) - 2]
        tree.add_child(parent, full_token_yield.index(token))
        token = token_yield[len(token_yield) - 1]
        tree.add_child(parent, full_token_yield.index(token))
    elif len(token_yield) == 1:
        # single-token yield: attach the lone leaf directly under the root
        tree.add_child(parent, full_token_yield.index(token_yield[0]))
    return tree
def main():
    """Experimental driver: compare corpus trees against a tiny hand-built
    system tree using discodop-style scoring and the TreeComparator.

    NOTE(review): paths and sentence names are hard-coded; this looks like a
    scratch/debug script rather than production code.
    """
    # train_path = '../res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/train/train.German.gold.xml'
    # corpus = sentence_names_to_hybridtrees(["s" + str(i) for i in range(1, 10)], file_name=train_path, hold=False)
    train_path = '../res/SPMRL_SHARED_2014_NO_ARABIC/GERMAN_SPMRL/gold/xml/dev/dev.German.gold.xml'
    names = ["s" + str(i) for i in range(40675, 40700)]
    # NOTE(review): the previous assignment is dead — overwritten to a single
    # sentence here, presumably for debugging.
    names = ['s40564']
    corpus = sentence_names_to_hybridtrees(names, path=train_path, hold=False)
    cp = TreeComparator()

    # Minimal hand-built "system" tree: PN node over the corpus's first token
    # plus a detached punctuation mark.
    tree_sys = ConstituentTree()
    tree_sys.add_node('0', ConstituentCategory('PN'))
    tree_sys.add_node('1', corpus[0].token_yield()[0], order=True)
    tree_sys.add_punct("3", '$.', '.')
    tree_sys.add_to_root('0')
    tree_sys.add_child('0', '1')

    param = build_param()

    for i, hybridtree in enumerate(corpus):
        print(i)
        # discotree = convert_tree(hybridtree)
        tree, sent = convert_tree(hybridtree)
        tree2, sent2 = convert_tree(tree_sys)
        if i == 11:
            pass  # NOTE(review): leftover breakpoint hook; no effect
        # print(discotree)
        # print(discotree.draw())
        # print(DrawTree(discotree, discotree.sent))
        print(DrawTree(tree, sent))
        print(' '.join(map(lambda x: x.form(), hybridtree.full_token_yield())))
        print(DrawTree(tree2, sent2))
        print(tree[::-1])
        print('POS', tree.pos())
        result = TreePairResult(i, tree, sent, tree2, sent2, param)
        print(result.scores())
        # NOTE(review): compares the gold tree with ITSELF — presumably a
        # sanity check that the comparator reports a perfect score; confirm.
        print("Comparator: ", cp.compare_hybridtrees(hybridtree, hybridtree))
def build_score_validator(baseline_grammar, grammarInfo, nont_map, storageManager, term_labelling, parser, corpus_validation, validationMethod):
    """Score the k-best derivations of every validation tree and feed them
    into a candidate-score validator.

    :param baseline_grammar: grammar used to interpret derivations
    :param parser: a parser already constructed for baseline_grammar
    :param corpus_validation: iterable of gold trees
    :param validationMethod: one of "F1", "Precision", "Recall"
    :raises ValueError: for an unknown validationMethod
    :return: the populated PyCandidateScoreValidator
    """
    validator = PyCandidateScoreValidator(grammarInfo, storageManager, validationMethod)

    # parser = GFParser(baseline_grammar)
    tree_count = 0
    der_count = 0
    for gold_tree in corpus_validation:
        tree_count += 1
        parser.set_input(
            term_labelling.prepare_parser_input(gold_tree.token_yield()))
        parser.parse()
        derivations = [der for _, der in parser.k_best_derivation_trees()]
        manager = PyDerivationManager(baseline_grammar, nont_map)
        manager.convert_hypergraphs(derivations)
        scores = []

        # Gold labelled spans; span lists are converted to hashable tuples.
        relevant = set([tuple(t) for t in gold_tree.labelled_spans()])

        for der in derivations:
            der_count += 1

            # Reconstruct the candidate tree from the derivation's DCP.
            h_tree = ConstituentTree()
            cleaned_tokens = copy.deepcopy(gold_tree.full_token_yield())
            dcp = DCP_evaluator(der).getEvaluation()
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_constituent_token)

            retrieved = set([tuple(t) for t in h_tree.labelled_spans()])
            inters = retrieved & relevant

            # in case of parse failure there are two options here:
            #   - parse failure -> no spans at all, thus precision = 1
            #   - parse failure -> a dummy tree with all spans wrong, thus precision = 0
            precision = 1.0 * len(inters) / len(retrieved) \
                if len(retrieved) > 0 else 0
            recall = 1.0 * len(inters) / len(relevant) \
                if len(relevant) > 0 else 0
            fmeasure = 2.0 * precision * recall / (precision + recall) \
                if precision + recall > 0 else 0

            if validationMethod == "F1":
                scores.append(fmeasure)
            elif validationMethod == "Precision":
                scores.append(precision)
            elif validationMethod == "Recall":
                scores.append(recall)
            else:
                # BUGFIX: was `raise ()`, which itself fails with
                # "TypeError: exceptions must derive from BaseException".
                raise ValueError("unknown validation method: %s" % validationMethod)

        validator.add_scored_candidates(manager, scores,
                                        1.0 if len(relevant) > 0 else 0.0)
        # print(tree_count, scores)
        parser.clear()

    # BUGFIX: guard the average against an empty validation corpus
    # (previously a ZeroDivisionError).
    if tree_count > 0:
        print("trees used for validation ", tree_count, "with",
              der_count * 1.0 / tree_count, "derivations on average")
    else:
        print("trees used for validation ", 0)
    return validator
def do_parsing(parser, corpus):
    """Parse every suitable tree of the corpus and report labelled-span accuracy.

    Parse failures are penalized and replaced by a right-branching dummy tree.

    NOTE(review): relies on module-level globals `max_length`,
    `terminal_labeling`, `parse_results`, `parse_results_prefix` and
    `parse_results_suffix` — confirm they are defined by the surrounding
    experiment script.

    :param parser: a parser instance with set_input/parse/recognized/clear
    :param corpus: iterable of gold constituent trees
    :return: the accumulated ParseAccuracyPenalizeFailures object
    """
    accuracy = ParseAccuracyPenalizeFailures()
    system_trees = []

    start_at = time.time()

    n = 0

    for tree in corpus:
        # Skip malformed trees and those outside the length bound.
        if not tree.complete() \
                or tree.empty_fringe() \
                or not 0 < len(tree.word_yield()) <= max_length:
            continue
        parser.set_input(
            terminal_labeling.prepare_parser_input(tree.token_yield()))
        parser.parse()
        if not parser.recognized():
            # Parse failure: count every gold span as missed and substitute
            # a dummy tree so the output corpus stays aligned.
            relevant = tree.labelled_spans()
            accuracy.add_failure(relevant)
            system_trees.append(
                dummy_constituent_tree(tree.token_yield(),
                                       tree.full_token_yield(), "NP", "S"))
            # print('failure', tree.sent_label()) # for testing
        else:
            n += 1
            dcp_tree = ConstituentTree()
            # 1-based positions of punctuation (in full yield, not attached).
            punctuation_positions = [
                i + 1 for i, idx in enumerate(tree.full_yield())
                if idx not in tree.id_yield()
            ]
            dcp_tree = parser.dcp_hybrid_tree_best_derivation(
                dcp_tree, tree.full_token_yield(), False,
                construct_constituent_token,
                punctuation_positions=punctuation_positions)

            retrieved = dcp_tree.labelled_spans()
            relevant = tree.labelled_spans()
            accuracy.add_accuracy(retrieved, relevant)
            system_trees.append(dcp_tree)

        parser.clear()

    end_at = time.time()
    print('Parsed:', n)
    if accuracy.n() > 0:
        print('Recall:', accuracy.recall())
        print('Precision:', accuracy.precision())
        print('F-measure:', accuracy.fmeasure())
        print('Parse failures:', accuracy.n_failures())
    else:
        print('No successful parsing')
    print('time:', end_at - start_at)
    print('')

    name = parse_results
    # do not overwrite existing result files
    i = 1
    while os.path.isfile(
            os.path.join(parse_results_prefix, name + parse_results_suffix)):
        i += 1
        name = parse_results + '_' + str(i)

    # NOTE(review): `path` is computed but the export below is disabled, so
    # it is currently unused.
    path = os.path.join(parse_results_prefix, name + parse_results_suffix)

    #
    # with open(path, 'w') as result_file:
    #     print('Exporting parse trees of length <=', max_length, 'to', str(path))
    #     map(lambda x: x.strip_vroot(), system_trees)
    #     result_file.writelines(hybridtrees_to_sentence_names(system_trees, test_start, max_length))

    return accuracy
def test_stanford_unking_scheme(self):
    """End-to-end test of StanfordUNKing terminal labelling.

    Induces a grammar whose terminals are unking labels, parses the
    training sentence, checks token round-tripping, builds smoothed
    lexical rules, and verifies train/test-mode labelling of an
    unknown token.
    """
    naming = 'child'

    def rec_part(tree):
        # Left-branching recursive partitioning over the tree's yield.
        return left_branching_partitioning(len(tree.id_yield()))

    tree = self.tree
    tree.add_to_root("VP1")
    print(tree)
    terminal_labeling = StanfordUNKing([tree])
    grammar = fringe_extract_lcfrs(tree, rec_part(tree), naming=naming,
                                   isolate_pos=True,
                                   term_labeling=terminal_labeling)
    print(grammar)
    parser = LCFRS_parser(grammar)
    parser.set_input([token.form() for token in tree.token_yield()])
    parser.parse()
    self.assertTrue(parser.recognized())
    derivation = parser.best_derivation_tree()

    e = DCP_evaluator(derivation)
    dcp_term = e.getEvaluation()
    print(str(dcp_term[0]))

    t = ConstituentTree()
    dcp_to_hybridtree(
        t, dcp_term,
        [construct_constituent_token(token.form(), '--', True)
         for token in tree.token_yield()],
        ignore_punctuation=False,
        construct_token=construct_constituent_token)
    print(t)

    # Round-trip check: same tokens, same forms and POS tags.
    self.assertEqual(len(tree.token_yield()), len(t.token_yield()))
    for tok1, tok2 in zip(tree.token_yield(), t.token_yield()):
        self.assertEqual(tok1.form(), tok2.form())
        self.assertEqual(tok1.pos(), tok2.pos())

    rules = terminal_labeling.create_smoothed_rules()
    print(rules)

    # Derive new lexical LCFRS rules from the smoothed (tag, form) table,
    # matching each terminal rule's DCP head POS against the table's tags.
    new_rules = {}
    for rule in grammar.rules():
        if rule.rhs() == []:
            # terminal rule: exactly one DCP component with one terminal
            assert len(rule.dcp()) == 1
            dcp = rule.dcp()[0]
            assert len(dcp.rhs()) == 1
            term = dcp.rhs()[0]
            head = term.head()
            pos = head.pos()

            for tag, form in rules:
                if tag == pos:
                    lhs = LCFRS_lhs(rule.lhs().nont())
                    lhs.add_arg([form])
                    new_rules[lhs, dcp] = rules[tag, form]

    for lhs, dcp in new_rules:
        print(str(lhs), str(dcp), new_rules[(lhs, dcp)])

    # Training mode: seen form kept, unseen form mapped to the unk token.
    tokens = [construct_constituent_token('hat', '--', True),
              construct_constituent_token('HAT', '--', True)]
    self.assertEqual(terminal_labeling.token_label(tokens[0]), 'hat')
    self.assertEqual(terminal_labeling.token_label(tokens[1]), '_UNK')
    # Test mode: signature-based mapping recovers the known lowercase form.
    terminal_labeling.test_mode = True
    self.assertEqual(terminal_labeling.token_label(tokens[0]), 'hat')
    self.assertEqual(terminal_labeling.token_label(tokens[1]), 'hat')
class ConstituentTreeTest(unittest.TestCase):
    """Tests for ConstituentTree accessors, LCFRS induction variants, POS
    recovery during parsing, and the Stanford unking scheme.

    NOTE(review): a class with this same name is also defined earlier in
    the file; this later definition shadows the earlier one at import
    time — confirm whether both are meant to exist.
    """

    def test_basic_tree_methods(self):
        """Smoke test: exercise the ConstituentTree inspection API and print results."""
        tree = self.tree
        print("rooted", tree.root)
        tree.add_to_root("VP1")
        print("rooted", tree.root)
        print(tree)
        print("sent label", tree.sent_label())
        print("leaves", tree.leaves())
        print("is leaf (leaves)", [(x, tree.is_leaf(x)) for (x, _, _) in tree.leaves()])
        print("is leaf (internal)", [(x, tree.is_leaf(x)) for x in tree.ids()])
        print("leaf index", [(x, tree.leaf_index(x)) for x in ["f1", "f2", "f3"]])
        print("pos yield", tree.pos_yield())
        print("word yield", tree.word_yield())
        # reentrant
        # parent
        print("ids", tree.ids())
        # reorder
        print("n nodes", tree.n_nodes())
        print("n gaps", tree.n_gaps())
        print("fringe VP", tree.fringe("VP"))
        print("fringe V", tree.fringe("V"))
        print("empty fringe", tree.empty_fringe())
        print("complete?", tree.complete())
        print("max n spans", tree.max_n_spans())
        print("unlabelled structure", tree.unlabelled_structure())
        print("labelled spans", tree.labelled_spans())

    def test_induction(self):
        """Induce grammars with and without POS isolation and merge them,
        checking that feature logs are combined via add_gram."""
        naming = 'child'

        def rec_part(tree):
            # Left-branching recursive partitioning over the tree's yield.
            return left_branching_partitioning(len(tree.id_yield()))
            # return fanout_k_left_to_right(tree, 1)

        tree = self.tree
        tree.add_to_root("VP1")
        feature_log1 = defaultdict(lambda: 0)
        grammar = fringe_extract_lcfrs(tree, rec_part(tree),
                                       feature_logging=feature_log1, naming=naming)
        for key in feature_log1:
            print(key, feature_log1[key])
        print(grammar)
        # Second grammar from the same tree, but with POS rules isolated.
        feats = defaultdict(lambda: 0)
        grammar_ = fringe_extract_lcfrs(tree, rec_part(tree), isolate_pos=True,
                                        feature_logging=feats, naming=naming)
        print(grammar_)
        for key in feats:
            print(key, feats[key])
        print("Adding 2nd grammar to first")
        # Merging also merges the two feature logs.
        grammar.add_gram(grammar_, feature_logging=(feature_log1, feats))
        for idx in range(0, len(grammar.rules())):
            print(idx, grammar.rule_index(idx))
        print("Adding 3rd grammar to first")
        feats3 = defaultdict(lambda: 0)
        grammar3 = fringe_extract_lcfrs(self.tree2, rec_part(self.tree2),
                                        isolate_pos=True, feature_logging=feats3,
                                        naming=naming)
        grammar.add_gram(grammar3, feature_logging=(feature_log1, feats3))
        print()
        for idx in range(0, len(grammar.rules())):
            print(idx, grammar.rule_index(idx))
        print()
        print("New feature log")
        print()
        for key in feature_log1:
            print(key, feature_log1[key])
        grammar.make_proper()
        build_nont_splits_dict(grammar, feature_log1, nonterminals=Enumerator())
        print(grammar.rule_index(0))
        print(grammar.rule_index(2))

    def test_markovized_induction(self):
        """Induce a grammar with markovized nonterminal naming (v=2, h=0)."""
        naming = 'strict-markov-v-2-h-0'

        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))
            # return fanout_k_left_to_right(tree, 1)

        tree = self.tree
        tree.add_to_root("VP1")
        print(tree)
        grammar = fringe_extract_lcfrs(tree, rec_part(tree), naming=naming,
                                       isolate_pos=True)
        print(grammar)

    def test_induction_and_parsing_with_pos_recovery(self):
        """Induce a grammar with isolated POS rules and check POS tags are
        recovered when parsing form-only input."""
        naming = 'child'

        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))

        tree = self.tree
        tree.add_to_root("VP1")
        print(tree)
        grammar = fringe_extract_lcfrs(tree, rec_part(tree), naming=naming,
                                       isolate_pos=True,
                                       term_labeling=FormTerminals())
        print(grammar)
        parser = LCFRS_parser(grammar)
        # Feed word forms only; POS must be recovered by the grammar.
        parser.set_input([token.form() for token in tree.token_yield()])
        parser.parse()
        self.assertTrue(parser.recognized())
        derivation = parser.best_derivation_tree()

        e = DCP_evaluator(derivation)
        dcp_term = e.getEvaluation()
        print(str(dcp_term[0]))

        t = ConstituentTree()
        dcp_to_hybridtree(
            t, dcp_term,
            [construct_constituent_token(token.form(), '--', True)
             for token in tree.token_yield()],
            ignore_punctuation=False,
            construct_token=construct_constituent_token)
        print(t)

        # Round-trip check: same tokens, same forms and POS tags.
        self.assertEqual(len(tree.token_yield()), len(t.token_yield()))
        for tok1, tok2 in zip(tree.token_yield(), t.token_yield()):
            self.assertEqual(tok1.form(), tok2.form())
            self.assertEqual(tok1.pos(), tok2.pos())

    def test_stanford_unking_scheme(self):
        """End-to-end test of StanfordUNKing terminal labelling: induction,
        parsing, smoothed-rule construction, and unk-token behavior."""
        naming = 'child'

        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))

        tree = self.tree
        tree.add_to_root("VP1")
        print(tree)
        terminal_labeling = StanfordUNKing([tree])
        grammar = fringe_extract_lcfrs(tree, rec_part(tree), naming=naming,
                                       isolate_pos=True,
                                       term_labeling=terminal_labeling)
        print(grammar)
        parser = LCFRS_parser(grammar)
        parser.set_input([token.form() for token in tree.token_yield()])
        parser.parse()
        self.assertTrue(parser.recognized())
        derivation = parser.best_derivation_tree()

        e = DCP_evaluator(derivation)
        dcp_term = e.getEvaluation()
        print(str(dcp_term[0]))

        t = ConstituentTree()
        dcp_to_hybridtree(
            t, dcp_term,
            [construct_constituent_token(token.form(), '--', True)
             for token in tree.token_yield()],
            ignore_punctuation=False,
            construct_token=construct_constituent_token)
        print(t)

        self.assertEqual(len(tree.token_yield()), len(t.token_yield()))
        for tok1, tok2 in zip(tree.token_yield(), t.token_yield()):
            self.assertEqual(tok1.form(), tok2.form())
            self.assertEqual(tok1.pos(), tok2.pos())

        rules = terminal_labeling.create_smoothed_rules()
        print(rules)

        # Derive new lexical LCFRS rules from the smoothed (tag, form) table,
        # matching each terminal rule's DCP head POS against the table's tags.
        new_rules = {}
        for rule in grammar.rules():
            if rule.rhs() == []:
                # terminal rule: exactly one DCP component with one terminal
                assert len(rule.dcp()) == 1
                dcp = rule.dcp()[0]
                assert len(dcp.rhs()) == 1
                term = dcp.rhs()[0]
                head = term.head()
                pos = head.pos()

                for tag, form in rules:
                    if tag == pos:
                        lhs = LCFRS_lhs(rule.lhs().nont())
                        lhs.add_arg([form])
                        new_rules[lhs, dcp] = rules[tag, form]

        for lhs, dcp in new_rules:
            print(str(lhs), str(dcp), new_rules[(lhs, dcp)])

        # Training mode: seen form kept, unseen form mapped to the unk token.
        tokens = [construct_constituent_token('hat', '--', True),
                  construct_constituent_token('HAT', '--', True)]
        self.assertEqual(terminal_labeling.token_label(tokens[0]), 'hat')
        self.assertEqual(terminal_labeling.token_label(tokens[1]), '_UNK')
        # Test mode: signature-based mapping recovers the known lowercase form.
        terminal_labeling.test_mode = True
        self.assertEqual(terminal_labeling.token_label(tokens[0]), 'hat')
        self.assertEqual(terminal_labeling.token_label(tokens[1]), 'hat')

    def test_induction_with_spans(self):
        """Induce a grammar with span-annotated child naming."""
        naming = 'child-spans'

        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))
            # return fanout_k_left_to_right(tree, 1)

        tree = self.tree
        tree.add_to_root("VP1")
        print(tree)
        grammar = fringe_extract_lcfrs(tree, rec_part(tree), naming=naming,
                                       isolate_pos=True)
        print(grammar)

    def test_induction_2(self):
        """Induce a grammar from tree3 and build nonterminal splits from
        the logged features (pos_cat_feats feature function)."""
        def rec_part(tree):
            return left_branching_partitioning(len(tree.id_yield()))

        features = defaultdict(lambda: 0)
        grammar = fringe_extract_lcfrs(self.tree3, rec_part(self.tree3),
                                       naming="child", feature_logging=features,
                                       isolate_pos=True)
        grammar.make_proper()
        if False:
            # Disabled debug dump: per-rule features keyed by rule index.
            for idx in range(0, len(grammar.rules())):
                print(grammar.rule_index(idx))
                for key in features:
                    if key[0] == idx:
                        print(key, features[key])
                print()
        # Print only the non-rule-indexed feature entries.
        for key in features:
            if type(key[0]) == int:
                continue
            print(key, features[key])
        nont_splits, root_weights, rule_weights, _ = build_nont_splits_dict(
            grammar, features, nonterminals=Enumerator(),
            feat_function=pos_cat_feats, debug=True)
        print(nont_splits)
        print(root_weights)
        print(rule_weights)

    def setUp(self):
        """Build fixture trees s1 (unrooted), s2 (rooted), and s3 (flat NP)."""
        tree = ConstituentTree("s1")
        tree.add_leaf("f1", "VAFIN", "hat",
                      morph=[("number", "Sg"), ("person", "3"),
                             ("tense", "Past"), ("mood", "Ind")])
        tree.add_leaf("f2", "ADV", "schnell", morph=[("degree", "Pos")])
        tree.add_leaf("f3", "VVPP", "gearbeitet")
        tree.add_punct("f4", "PUNC", ".")
        tree.add_child("VP2", "f1")
        tree.add_child("VP2", "f3")
        tree.add_child("ADVP", "f2")
        tree.add_child("VP1", "VP2")
        tree.add_child("VP1", "ADVP")
        tree.set_label("VP2", "VP")
        tree.set_label("VP1", "VP")
        tree.set_label("ADVP", "ADVP")
        # s1 is deliberately left unrooted; tests attach VP1 themselves.
        self.tree = tree

        tree2 = ConstituentTree("s2")
        tree2.add_leaf("f1", "VAFIN", "haben",
                       morph=[("number", "Pl"), ("person", "3"),
                              ("tense", "Past"), ("mood", "Ind")])
        tree2.add_leaf("f2", "ADV", "gut", morph=[("degree", "Pos")])
        tree2.add_leaf("f3", "VVPP", "gekocht")
        tree2.add_punct("f4", "PUNC", ".")
        tree2.add_child("VP2", "f1")
        tree2.add_child("VP2", "f3")
        tree2.add_child("ADVP", "f2")
        tree2.add_child("VP1", "VP2")
        tree2.add_child("VP1", "ADVP")
        tree2.set_label("VP2", "VP")
        tree2.set_label("VP1", "VP")
        tree2.set_label("ADVP", "ADVP")
        tree2.add_to_root("VP1")
        self.tree2 = tree2

        # s3: flat NP over four NK-labelled leaves.
        self.tree3 = ConstituentTree("s3")
        self.tree3.add_leaf("f1", "ADJA", "Allgemeiner", edge="NK",
                            morph=[("number", "Sg")])
        self.tree3.add_leaf("f2", "ADJA", "Deutscher", edge="NK",
                            morph=[("degree", "Pos"), ("number", "Sg")])
        self.tree3.add_leaf("f3", "NN", "Fahrrad", edge="NK",
                            morph=[("number", "Sg"), ("gender", "Neut")])
        self.tree3.add_leaf("f4", "NN", "Club", edge="NK",
                            morph=[("number", "Sg"), ("gender", "Neut")])
        for i in range(1, 5):
            self.tree3.add_child("NP", "f" + str(i))
        self.tree3.set_label("NP", "NP")
        self.tree3.add_to_root("NP")
def sentence_names_to_hybridtrees(names, path, enc="utf-8", disconnect_punctuation=True, add_vroot=False, mode="STANDARD", secedge=False):
    """
    :param names: list of sentence identifiers
    :type names: list[str]
    :param path: path to corpus
    :type path: str
    :param enc: file encoding
    :type enc: str
    :param disconnect_punctuation: disconnect
    :type disconnect_punctuation: bool
    :param add_vroot: adds a virtual root node labelled 'VROOT'
    :type add_vroot: bool
    :param mode: either 'STANDARD' (no lemma field) or 'DISCODOP' (lemma field)
    :type mode: str
    :param secedge: add secondary edges
    :type secedge: bool
    :return: list of constituent structures (HybridTrees or HybridDags) from file_name whose names are in names
    """
    negra = codecs.open(expanduser(path), encoding=enc)
    trees = []
    tree = None          # currently-built tree; None while outside a wanted sentence
    name = ''            # name of the sentence currently being read
    n_leaves = 0         # running leaf counter for the current sentence
    for line in negra:
        # A discodop header switches the line format for the rest of the file.
        match_mode = DISCODOP_HEADER.match(line)
        if match_mode:
            mode = "DISCO-DOP"
            continue
        match_sent_start = BOS.search(line)
        match_sent_end = EOS.match(line)
        # NOTE(review): if mode is neither "STANDARD" nor "DISCO-DOP",
        # match_nont/match_term stay unbound and the code below raises
        # NameError — confirm mode values are restricted upstream.
        if mode == "STANDARD":
            match_nont = \
                STANDARD_NONTERMINAL.match(line)
            match_term = \
                STANDARD_TERMINAL.match(line)
        elif mode == "DISCO-DOP":
            match_nont = DISCODOP_NONTERMINAL.match(line)
            match_term = DISCODOP_TERMINAL.match(line)

        if match_sent_start:
            this_name = match_sent_start.group(1)
            # Only build trees for the requested sentence names.
            if this_name in names:
                name = this_name
                if secedge:
                    tree = HybridDag(name)
                else:
                    tree = ConstituentTree(name)
                n_leaves = 0
                if add_vroot:
                    tree.set_label('0', 'VROOT')
                    tree.add_to_root('0')
        elif match_sent_end:
            this_name = match_sent_end.group(1)
            if name == this_name:
                # Finish the current tree and stop collecting lines.
                tree.reorder()
                trees += [tree]
                tree = None
        elif tree:
            if match_nont:
                # Nonterminal line: id, category, edge label, parent id.
                id = match_nont.group(1)
                # DISCO-DOP format has one extra (lemma) field, shifting groups.
                if mode == "STANDARD":
                    OFFSET = 0
                else:
                    OFFSET = 1
                nont = match_nont.group(2 + OFFSET)
                edge = match_nont.group(4 + OFFSET)
                parent = match_nont.group(5 + OFFSET)
                # print(match_nont.groups(), len(match_nont.groups()))
                secedges = [] if not secedge or match_nont.group(6 + OFFSET) is None else \
                    match_nont.group(6 + OFFSET).split()
                tree.add_node(id, ConstituentCategory(nont), False, True)
                tree.node_token(id).set_edge_label(edge)
                if parent == '0' and not add_vroot:
                    tree.add_to_root(id)
                else:
                    tree.add_child(parent, id)
                if secedge and secedges:
                    # print(secedges)
                    # NOTE(review): secedges holds (label, parent) pairs, but
                    # range(0, len(secedges) // 2, 2) visits only the first
                    # half of the indices — for more than two pairs some are
                    # skipped; likely intended range(0, len(secedges), 2).
                    for sei in range(0, len(secedges) // 2, 2):
                        # sec_label = secedges[sei]
                        assert secedges[sei] == edge
                        sec_parent = secedges[sei + 1]
                        tree.add_sec_child(sec_parent, id)
            elif match_term:
                # Terminal line: word, POS, edge label, parent id.
                if mode == "STANDARD":
                    OFFSET = 0
                else:
                    OFFSET = 1
                word = match_term.group(1)
                pos = match_term.group(2 + OFFSET)
                edge = match_term.group(4 + OFFSET)
                parent = match_term.group(5 + OFFSET)
                # print(match_term.groups(), len(match_term.groups()))
                secedges = [] if not secedge or match_term.group(6 + OFFSET) is None else \
                    match_term.group(6 + OFFSET).split()
                n_leaves += 1
                # Leaf ids start at 101 to avoid clashing with node ids.
                leaf_id = str(100 + n_leaves)
                if parent == '0' and disconnect_punctuation:
                    # Root-attached terminals are treated as punctuation.
                    tree.add_punct(leaf_id, pos, word)
                else:
                    if parent == '0' and not add_vroot:
                        tree.add_to_root(leaf_id)
                    else:
                        tree.add_child(parent, leaf_id)
                    token = ConstituentTerminal(word, pos, edge, None, '--')
                    tree.add_node(leaf_id, token, True, True)
                    tree.node_token(leaf_id).set_edge_label(edge)
                    if secedge and secedges:
                        # print(secedges)
                        # NOTE(review): same suspicious half-range as above.
                        for sei in range(0, len(secedges) // 2, 2):
                            # sec_label = secedges[sei]
                            assert secedges[sei] == edge
                            sec_parent = secedges[sei + 1]
                            tree.add_sec_child(sec_parent, leaf_id)
    negra.close()
    return trees