def induce_grammar_from(dsg, rec_par, decomp, labeling=(lambda x, y: str(x)),
                        terminal_labeling=id, terminal_labeling_lcfrs=None,
                        start="START", normalize=True, enforce_outputs=True):
    if terminal_labeling_lcfrs is None:
        terminal_labeling_lcfrs = terminal_labeling
    lcfrs = LCFRS(start=start)
    ordered_nodes = dsg.dog.ordered_nodes()
    rhs_nont = induce_grammar_rec(lcfrs, dsg, rec_par, decomp, labeling,
                                  terminal_labeling, terminal_labeling_lcfrs,
                                  normalize, enforce_outputs,
                                  ordered_nodes=ordered_nodes)
    rhs_top = dsg.dog.top(decomp[0])

    # construct a chain rule from START to the initial nonterminal of the decomposition
    # LCFRS part
    lcfrs_lhs = LCFRS_lhs(start)
    lcfrs_lhs.add_arg([LCFRS_var(0, 0)])

    # DOG part
    dog = DirectedOrderedGraph()
    assert len(dsg.dog.inputs) == 0
    assert not enforce_outputs or len(dsg.dog.outputs) > 0
    for i in range(len(rhs_top)):
        dog.add_node(i)
    for output in dsg.dog.outputs:
        dog.add_to_outputs(rhs_top.index(output))
    dog.add_nonterminal_edge([], [i for i in range(len(rhs_top))], enforce_outputs)

    # no sync
    sync = []

    lcfrs.add_rule(lcfrs_lhs, [rhs_nont], weight=1.0, dcp=[dog, sync])
    return lcfrs
def induce(method=direct_extract_lcfrs):
    merged_gram = LCFRS(start=induction_start)
    first = first_training_sentence()
    last = last_training_sentence()
    print('Inducing grammar from', CORPUS, first, '-', last,
          'using method', method.__name__)
    n = do_range(first, last,
                 lambda tree: add_gram(tree, merged_gram, method),
                 lambda tree: tree.complete() and not tree.empty_fringe())
    print('Trained on size:', n)
    merged_gram.make_proper()
    return merged_gram
def induction_on_a_corpus(dsgs, rec_part_strategy, nonterminal_labeling, terminal_labeling,
                          start="START", normalize=True):
    grammar = LCFRS(start=start)
    for dsg in dsgs:
        rec_part = rec_part_strategy(dsg)
        # if calc_fanout(rec_part) > 1 or calc_rank(rec_part) > 2:
        #     rec_part = rec_part_strategy(dsg)
        #     assert False
        decomp = compute_decomposition(dsg, rec_part)
        dsg_grammar = induce_grammar_from(dsg, rec_part, decomp, nonterminal_labeling,
                                          terminal_labeling, terminal_labeling,
                                          start, normalize)
        grammar.add_gram(dsg_grammar)
    return grammar
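# Hedged usage sketch, not part of the original code: `dsgs` is a corpus of
# dependency structure graphs, and `a_rec_part_strategy` and `a_nonterminal_labeling`
# are hypothetical placeholders for the callables expected above; `str` stands in
# for a terminal labeling (the induction above defaults to `id`).
def example_corpus_induction(dsgs, a_rec_part_strategy, a_nonterminal_labeling):
    grammar = induction_on_a_corpus(dsgs, a_rec_part_strategy,
                                    a_nonterminal_labeling, str)
    grammar.make_proper()  # normalize weights, as the other induction drivers do
    return grammar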
def build_grammar(self):
    grammar = LCFRS("S")
    lhs1 = LCFRS_lhs("S")
    lhs1.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
    rule_1 = grammar.add_rule(lhs1, ["S", "S"])
    lhs2 = LCFRS_lhs("S")
    lhs2.add_arg(["a"])
    rule_2 = grammar.add_rule(lhs2, [])
    lhs3 = LCFRS_lhs("A")
    lhs3.add_arg(["a"])
    rule_3 = grammar.add_rule(lhs3, [])
    return grammar, rule_1.get_idx(), rule_2.get_idx()
def build_paper_grammar():
    grammar = LCFRS("S")
    # rule 0
    lhs = LCFRS_lhs("B")
    lhs.add_arg(["a"])
    grammar.add_rule(lhs, [])
    # rule 1
    lhs = LCFRS_lhs("S")
    lhs.add_arg([LCFRS_var(0, 0)])
    grammar.add_rule(lhs, ["B"])
    # rule 2
    lhs = LCFRS_lhs("B")
    lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
    grammar.add_rule(lhs, ["B", "B"])
    grammar.make_proper()
    return grammar
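# A minimal sanity check, not part of the original code (assumption: the
# DiscodopKbestParser API used in the tests below is available here):
# build_paper_grammar derives exactly the non-empty strings over {"a"},
# via S -> B and B -> B B | "a".
def example_parse_paper_grammar():
    grammar = build_paper_grammar()
    parser = DiscodopKbestParser(grammar, k=1)
    parser.set_input(["a"] * 4)
    parser.parse()
    assert parser.recognized()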
def test_basic_sdcp_parsing_constituency(self):
    tree1 = constituent_tree_1()
    tree2 = constituent_tree_2()
    tree3 = constituent_tree_1_pos_stripped()
    terminal_labeling = FormTerminals()  # [tree1, tree2], 1, filter=["VP"])
    fanout = 1

    grammar = LCFRS('START')
    for tree in [tree1, tree2]:
        tree_part = tree.unlabelled_structure()
        part = fanout_limited_partitioning(tree_part, fanout)
        tree_grammar = fringe_extract_lcfrs(tree, part, naming='child',
                                            term_labeling=terminal_labeling)
        grammar.add_gram(tree_grammar)
    grammar.make_proper()

    print("grammar induced. Printing rules...", file=stderr)
    for rule in grammar.rules():
        print(rule, file=stderr)

    parser_type = LCFRS_sDCP_Parser
    print("preprocessing grammar", file=stderr)
    parser_type.preprocess_grammar(grammar, terminal_labeling, debug=True)

    print("invoking parser", file=stderr)
    parser = parser_type(grammar, tree1)

    print("listing derivations", file=stderr)
    for der in parser.all_derivation_trees():
        print(der)
        output_tree = ConstituentTree(tree1.sent_label())
        tokens = [construct_constituent_token(token.form(), '--', True)
                  for token in tree1.token_yield()]
        dcp_to_hybridtree(output_tree, DCP_evaluator(der).getEvaluation(), tokens, False,
                          construct_constituent_token)
        print(tree1)
        print(output_tree)

    parser = parser_type(grammar, tree3)
    print(parser.recognized())
    for der in parser.all_derivation_trees():
        print(der)
        output_tree = ConstituentTree(tree3.sent_label())
        tokens = [construct_constituent_token(token.form(), '--', True)
                  for token in tree3.token_yield()]
        dcp_to_hybridtree(output_tree, DCP_evaluator(der).getEvaluation(), tokens, False,
                          construct_constituent_token)
        print(tree3)
        print(output_tree)

    print("completed test", file=stderr)
def induce(trees, partition_builder, terminal_labeling, nonterminal_counts, start=START):
    grammar = LCFRS(start=start)
    n_trees = len(trees)
    for i, tree in enumerate(trees):
        # if not i % 1000:
        #     print(f'starting induction on tree {i} out of {n_trees}')
        # skip trees with an empty yield
        if tree.n_yield_nodes():
            partition = partition_builder(tree=tree)
            # print(pretty_print_partition(partition=partition))
            __rec_induce(tree=tree, grammar=grammar, string_partition=partition,
                         terminal_labeling=terminal_labeling,
                         nonterminal_counts=nonterminal_counts)
    return grammar
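# Hedged usage sketch, not part of the original code: `trees`, `a_partition_builder`,
# and `a_terminal_labeling` are hypothetical placeholders, and treating
# nonterminal_counts as a dict-like counter filled in by __rec_induce is an
# assumption based on the signature above.
from collections import defaultdict

def example_string_induction(trees, a_partition_builder, a_terminal_labeling):
    nonterminal_counts = defaultdict(int)
    grammar = induce(trees, a_partition_builder, a_terminal_labeling, nonterminal_counts)
    grammar.make_proper()
    return grammar, nonterminal_counts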
def induce_grammar(self, corpus, start="START"):
    grammar = LCFRS(start=start)
    for obj in corpus:
        obj = self.preprocess_before_induction(obj)
        obj_grammar, features = self.induce_from(obj)
        if obj_grammar is None:
            continue
        if features is None:
            grammar.add_gram(obj_grammar, None)
        else:
            grammar.add_gram(obj_grammar, (self.feature_log, features))
    self.postprocess_grammar(grammar)
    self.base_grammar = grammar
    _, path = tempfile.mkstemp(suffix=".base.grammar", dir=self.directory)
    with open(path, 'wb') as f:
        pickle.dump(self.base_grammar, f)
    self.stage_dict["base_grammar"] = path
def direct_extract_lcfrs_from_prebinarized_corpus(tree,
                                                  term_labeling=PosTerminals(),
                                                  nont_labeling=BasicNonterminalLabeling(),
                                                  isolate_pos=True):
    gram = LCFRS(start=START)
    root = tree.root[0]
    if root in tree.full_yield():
        lhs = LCFRS_lhs(START)
        label = term_labeling.token_label(tree.node_token(root))
        lhs.add_arg([label])
        dcp_rule = DCP_rule(DCP_var(-1, 0),
                            [DCP_term(DCP_index(0, edge_label=tree.node_token(root).edge()), [])])
        gram.add_rule(lhs, [], dcp=[dcp_rule])
    else:
        first, _, _ = direct_extract_lcfrs_prebinarized_recur(tree, root, gram, term_labeling,
                                                              nont_labeling, isolate_pos)
        lhs = LCFRS_lhs(START)
        lhs.add_arg([LCFRS_var(0, 0)])
        dcp_rule = DCP_rule(DCP_var(-1, 0), [DCP_var(0, 0)])
        gram.add_rule(lhs, [first], dcp=[dcp_rule])
    return gram
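# Hedged sketch, not part of the original code, mirroring the corpus loop in
# test_negra_dag_small_grammar below: merge the per-tree grammars extracted from
# a prebinarized corpus into one proper grammar. `corpus_bin` is a hypothetical
# list of prebinarized hybrid trees/dags.
def example_extract_from_corpus(corpus_bin):
    grammar = LCFRS(start=START)
    for tree in corpus_bin:
        grammar.add_gram(direct_extract_lcfrs_from_prebinarized_corpus(tree))
    grammar.make_proper()
    return grammar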
def induce_grammar(trees, nont_labelling, term_labelling, recursive_partitioning, start_nont='START'):
    """
    Top-level method to induce an LCFRS/DCP-hybrid grammar for dependency parsing.
    :param trees: corpus of HybridTree (i.e. a list, or a generator for lazy IO)
    :type trees: __generator[HybridTree]
    :type nont_labelling: AbstractLabeling
    :param term_labelling: HybridTree, NodeId -> str
    :param recursive_partitioning: HybridTree -> RecursivePartitioning
    :type start_nont: str
    :rtype: (int, LCFRS)
    """
    grammar = LCFRS(start_nont)
    n_trees = 0
    for tree in trees:
        n_trees += 1
        for rec_par in recursive_partitioning:
            match = re.search(r'no_new_nont', rec_par.__name__)
            if match:
                rec_par_int = rec_par(tree, grammar.nonts(), nont_labelling)
            else:
                rec_par_int = rec_par(tree)
            rec_par_nodes = tree.node_id_rec_par(rec_par_int)
            (_, _, nont_name) = add_rules_to_grammar_rec(tree, rec_par_nodes, grammar,
                                                         nont_labelling, term_labelling)
            # add a chain rule from the start symbol to the topmost nonterminal of the hybrid tree
            lhs = LCFRS_lhs(start_nont)
            lhs.add_arg([LCFRS_var(0, 0)])
            rhs = [nont_name]
            dcp_rule = DCP_rule(DCP_var(-1, 0), [DCP_var(0, 0)])
            grammar.add_rule(lhs, rhs, 1.0, [dcp_rule])
    grammar.make_proper()
    return n_trees, grammar
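# Hedged usage sketch (everything except induce_grammar is a hypothetical
# placeholder): dependency-grammar induction from a lazily read corpus with a
# single recursive-partitioning strategy.
def example_dependency_induction(tree_generator, nont_labelling, term_labelling,
                                 recursive_partitioning):
    n_trees, grammar = induce_grammar(tree_generator, nont_labelling, term_labelling,
                                      [recursive_partitioning], start_nont='START')
    print('Induced grammar from', n_trees, 'trees with',
          len(grammar.nonts()), 'nonterminals and', len(grammar.rules()), 'rules')
    return grammar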
def test_negra_dag_small_grammar(self):
    DAG_CORPUS = 'res/tiger/tiger_full_with_sec_edges.export'
    DAG_CORPUS_BIN = 'res/tiger/tiger_full_with_sec_edges_bin_h1_v1.export'
    names = list([str(i) for i in range(1, 101)])

    if not os.path.exists(DAG_CORPUS):
        print('run the following command to create an export corpus with dags:')
        print('\tPYTHONPATH=. util/tiger_dags_to_negra.py ' +
              'res/tiger/tiger_release_aug07.corrected.16012013.xml ' +
              DAG_CORPUS + ' 1 50474')
    self.assertTrue(os.path.exists(DAG_CORPUS))

    if not os.path.exists(DAG_CORPUS_BIN):
        print('run the following command to binarize the export corpus with dags:')
        print("discodop treetransforms --binarize -v 1 -h 1 " + DAG_CORPUS + " " + DAG_CORPUS_BIN)
        # _, DAG_CORPUS_BIN = tempfile.mkstemp(prefix='corpus_bin_', suffix='.export')
        # subprocess.call(["discodop", "treetransforms", "--binarize", "-v", "1", "-h", "1",
        #                  DAG_CORPUS, DAG_CORPUS_BIN])
    self.assertTrue(os.path.exists(DAG_CORPUS_BIN))

    corpus = np.sentence_names_to_hybridtrees(names, DAG_CORPUS, secedge=True)
    corpus_bin = np.sentence_names_to_hybridtrees(names, DAG_CORPUS_BIN, secedge=True)

    grammar = LCFRS(start="START")
    for hybrid_dag, hybrid_dag_bin in zip(corpus, corpus_bin):
        self.assertEqual(len(hybrid_dag.token_yield()), len(hybrid_dag_bin.token_yield()))
        dag_grammar = direct_extract_lcfrs_from_prebinarized_corpus(hybrid_dag_bin)
        grammar.add_gram(dag_grammar)
    grammar.make_proper()
    print("Extracted LCFRS/DCP-hybrid grammar with %i nonterminals and %i rules"
          % (len(grammar.nonts()), len(grammar.rules())))

    parser = DiscodopKbestParser(grammar, k=1)

    _, RESULT_FILE = tempfile.mkstemp(prefix='parser_results_', suffix='.export')
    with open(RESULT_FILE, 'w') as results:
        for hybrid_dag in corpus:
            poss = list(map(lambda x: x.pos(), hybrid_dag.token_yield()))
            parser.set_input(poss)
            parser.parse()
            self.assertTrue(parser.recognized())
            der = parser.best_derivation_tree()

            dcp_term = DCP_evaluator(der).getEvaluation()
            dag_eval = HybridDag(hybrid_dag.sent_label())
            dcp_to_hybriddag(dag_eval, dcp_term, copy.deepcopy(hybrid_dag.token_yield()), False,
                             construct_token=construct_constituent_token)
            lines = np.serialize_hybridtrees_to_negra([dag_eval], 1, 500, use_sentence_names=True)
            for line in lines:
                print(line, end='', file=results)
            parser.clear()
    print("Wrote results to %s" % RESULT_FILE)
def __test_projection(self, split_weights, goal_weights, merge_method=False):
    grammar = LCFRS("S")
    # rule 0
    lhs = LCFRS_lhs("S")
    lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
    grammar.add_rule(lhs, ["A", "A"])
    # rule 1
    lhs = LCFRS_lhs("A")
    lhs.add_arg(["a"])
    grammar.add_rule(lhs, [])
    # rule 2
    lhs = LCFRS_lhs("A")
    lhs.add_arg(["b"])
    grammar.add_rule(lhs, [], weight=2.0)
    grammar.make_proper()
    # print(grammar)

    nonterminal_map = Enumerator()
    grammarInfo = PyGrammarInfo(grammar, nonterminal_map)
    storageManager = PyStorageManager()
    la = build_PyLatentAnnotation([1, 2], [1.0], split_weights, grammarInfo, storageManager)

    # parser = LCFRS_parser(grammar)
    # parser.set_input(["a", "b"])
    # parser.parse()
    # der = parser.best_derivation_tree()
    # print(la.serialize())

    if merge_method:
        la.project_weights(grammar, grammarInfo)
    else:
        splits, _, _ = la.serialize()
        merge_sources = [[[split for split in range(0, splits[nont_idx])]]
                         for nont_idx in range(0, nonterminal_map.get_counter())]
        # print("Projecting to fine grammar LA", file=self.logger)
        coarse_la = la.project_annotation_by_merging(grammarInfo, merge_sources, debug=False)
        coarse_la.project_weights(grammar, grammarInfo)
    # print(grammar)

    for i in range(3):
        self.assertAlmostEqual(grammar.rule_index(i).weight(), goal_weights[i])
def build_nm_grammar():
    grammar = LCFRS("START")
    # rule 0
    lhs = LCFRS_lhs("START")
    lhs.add_arg([LCFRS_var(0, 0)])
    grammar.add_rule(lhs, ["S"])
    # rule 1
    lhs = LCFRS_lhs("S")
    lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0), LCFRS_var(0, 1), LCFRS_var(1, 1)])
    grammar.add_rule(lhs, ["N", "M"])

    for nont, term in [("A", "a"), ("B", "b"), ("C", "c"), ("D", "d")]:
        # rule 2
        lhs = LCFRS_lhs(nont)
        lhs.add_arg([term])
        grammar.add_rule(lhs, [])

    for nont, nont_, c1, c2 in [("N", "N'", "A", "C"), ("M", "M'", "B", "D")]:
        # rule 3
        lhs = LCFRS_lhs(nont)
        lhs.add_arg([LCFRS_var(0, 0)])
        lhs.add_arg([LCFRS_var(1, 0)])
        grammar.add_rule(lhs, [c1, c2])
        # rule 4
        lhs = LCFRS_lhs(nont)
        lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
        lhs.add_arg([LCFRS_var(0, 1)])
        grammar.add_rule(lhs, [nont_, c1])
        # rule 5
        lhs = LCFRS_lhs(nont_)
        lhs.add_arg([LCFRS_var(0, 0)])
        lhs.add_arg([LCFRS_var(0, 1), LCFRS_var(1, 0)])
        grammar.add_rule(lhs, [nont, c2])

    grammar.make_proper()
    return grammar
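# A hedged sanity check, not part of the original code (assumption: the
# DiscodopKbestParser API used in the tests in this section): build_nm_grammar
# describes the language { a^n b^m c^n d^m : n, m >= 1 }, where N derives the
# string tuple (a^n, c^n), M derives (b^m, d^m), and rule 1 interleaves their
# components.
def example_parse_nm_grammar():
    grammar = build_nm_grammar()
    parser = DiscodopKbestParser(grammar, k=1)
    parser.set_input(["a", "a", "b", "c", "c", "d"])  # n = 2, m = 1
    parser.parse()
    assert parser.recognized()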
def test_la_viterbi_parsing_3(self):
    grammar = LCFRS("S")
    # rule 0
    lhs = LCFRS_lhs("B")
    lhs.add_arg(["a"])
    grammar.add_rule(lhs, [], 0.25)
    # rule 1
    lhs = LCFRS_lhs("A")
    lhs.add_arg(["a"])
    grammar.add_rule(lhs, [], 0.5)
    # rule 2
    lhs = LCFRS_lhs("S")
    lhs.add_arg([LCFRS_var(0, 0)])
    grammar.add_rule(lhs, ["B"], 1.0)
    # rule 3
    lhs = LCFRS_lhs("A")
    lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
    grammar.add_rule(lhs, ["A", "B"], 0.5)
    # rule 4
    lhs = LCFRS_lhs("B")
    lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
    grammar.add_rule(lhs, ["A", "B"], 0.75)
    grammar.make_proper()

    inp = ["a"] * 3
    nontMap = Enumerator()
    gi = PyGrammarInfo(grammar, nontMap)
    sm = PyStorageManager()
    print(nontMap.object_index("S"))
    print(nontMap.object_index("B"))
    la = build_PyLatentAnnotation_initial(grammar, gi, sm)

    parser = DiscodopKbestParser(grammar, la=la, nontMap=nontMap, grammarInfo=gi,
                                 latent_viterbi_mode=True)
    parser.set_input(inp)
    parser.parse()
    self.assertTrue(parser.recognized())
    der = parser.latent_viterbi_derivation(True)
    print(der)

    der2 = None
    for w, der_ in parser.k_best_derivation_trees():
        if der2 is None:
            der2 = der_
        print(w, der_)
    print(der2)
def build_grammar():
    grammar = LCFRS("START")
    # rule 0
    lhs = LCFRS_lhs("START")
    lhs.add_arg([LCFRS_var(0, 0)])
    grammar.add_rule(lhs, ["S"])
    # rule 1
    lhs = LCFRS_lhs("S")
    lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
    grammar.add_rule(lhs, ["S", "S"])
    # rule 1.5
    lhs = LCFRS_lhs("S")
    lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
    grammar.add_rule(lhs, ["S", "S"], dcp=["1.5"])
    # rule 2
    lhs = LCFRS_lhs("S")
    lhs.add_arg(["a"])
    grammar.add_rule(lhs, [])
    # rule 3
    lhs = LCFRS_lhs("S")
    lhs.add_arg(["b"])
    grammar.add_rule(lhs, [], weight=2.0)
    # rule 4
    lhs = LCFRS_lhs("S")
    lhs.add_arg(["b"])
    grammar.add_rule(lhs, [], dcp=["4"])
    # rule 5
    lhs = LCFRS_lhs("A")
    lhs.add_arg(["a"])
    grammar.add_rule(lhs, [])
    grammar.make_proper()
    return grammar
def test_projection_based_parser_k_best_hack(self):
    grammar = LCFRS("S")
    # rule 0
    lhs = LCFRS_lhs("B")
    lhs.add_arg(["a"])
    grammar.add_rule(lhs, [], 0.25)
    # rule 1
    lhs = LCFRS_lhs("A")
    lhs.add_arg(["a"])
    grammar.add_rule(lhs, [], 0.5)
    # rule 2
    lhs = LCFRS_lhs("S")
    lhs.add_arg([LCFRS_var(0, 0)])
    grammar.add_rule(lhs, ["B"], 1.0)
    # rule 3
    lhs = LCFRS_lhs("A")
    lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
    grammar.add_rule(lhs, ["A", "B"], 0.5)
    # rule 4
    lhs = LCFRS_lhs("B")
    lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
    grammar.add_rule(lhs, ["A", "B"], 0.75)
    grammar.make_proper()

    inp = ["a"] * 3
    nontMap = Enumerator()
    gi = PyGrammarInfo(grammar, nontMap)
    sm = PyStorageManager()
    la = build_PyLatentAnnotation_initial(grammar, gi, sm)

    parser = Coarse_to_fine_parser(grammar, la, gi, nontMap, base_parser_type=GFParser_k_best)
    parser.set_input(inp)
    parser.parse()
    self.assertTrue(parser.recognized())

    der = parser.max_rule_product_derivation()
    print(der)

    der = parser.best_derivation_tree()
    print(der)

    for node in der.ids():
        print(der.getRule(node), der.spanned_ranges(node))
def main():
    # # induce or load grammar
    # if not os.path.isfile(grammar_path):
    #     grammar = LCFRS('START')
    #     for tree in train_corpus:
    #         if not tree.complete() or tree.empty_fringe():
    #             continue
    #         part = recursive_partitioning(tree)
    #         tree_grammar = fringe_extract_lcfrs(tree, part, naming='child', term_labeling=terminal_labeling)
    #         grammar.add_gram(tree_grammar)
    #     grammar.make_proper()
    #     pickle.dump(grammar, open(grammar_path, 'wb'))
    # else:
    #     grammar = pickle.load(open(grammar_path, 'rb'))

    grammar = LCFRS('START')
    for tree in train_corpus:
        if not tree.complete() or tree.empty_fringe():
            continue
        part = recursive_partitioning(tree)
        tree_grammar = fringe_extract_lcfrs(tree, part, naming='child', term_labeling=terminal_labeling)
        grammar.add_gram(tree_grammar)
    grammar.make_proper()

    # # compute or load reducts
    # if not os.path.isfile(reduct_path):
    #     traceTrain = compute_reducts(grammar, train_corpus, terminal_labeling)
    #     traceTrain.serialize(reduct_path)
    # else:
    #     traceTrain = PySDCPTraceManager(grammar, terminal_labeling)
    #     traceTrain.load_traces_from_file(reduct_path)

    traceTrain = compute_reducts(grammar, train_corpus, terminal_labeling)
    traceValidationGenetic = compute_reducts(grammar, validation_genetic_corpus, terminal_labeling)
    traceValidation = compute_reducts(grammar, validation_corpus, terminal_labeling)

    # prepare EM training
    grammarInfo = PyGrammarInfo(grammar, traceTrain.get_nonterminal_map())
    if not grammarInfo.check_for_consistency():
        print("[Genetic] GrammarInfo is not consistent!")

    storageManager = PyStorageManager()

    em_builder = PySplitMergeTrainerBuilder(traceTrain, grammarInfo)
    em_builder.set_em_epochs(em_epochs)
    em_builder.set_simple_expector(threads=threads)
    emTrainer = em_builder.build()

    # randomize initial weights and do EM training
    la_no_splits = build_PyLatentAnnotation_initial(grammar, grammarInfo, storageManager)
    la_no_splits.add_random_noise(seed=seed)
    emTrainer.em_train(la_no_splits)
    la_no_splits.project_weights(grammar, grammarInfo)

    # emTrainerOld = PyEMTrainer(traceTrain)
    # emTrainerOld.em_training(grammar, 30, "rfe", tie_breaking=True)

    # compute parses for validation set
    baseline_parser = GFParser_k_best(grammar, k=k_best)
    validator = build_score_validator(grammar, grammarInfo, traceTrain.get_nonterminal_map(),
                                      storageManager, terminal_labeling, baseline_parser,
                                      validation_corpus, validationMethod)
    del baseline_parser

    # prepare SM training
    builder = PySplitMergeTrainerBuilder(traceTrain, grammarInfo)
    builder.set_em_epochs(em_epochs)
    builder.set_split_randomization(1.0, seed + 1)
    builder.set_simple_expector(threads=threads)
    builder.set_score_validator(validator, validationDropIterations)
    builder.set_smoothing_factor(smoothingFactor=smoothing_factor)
    builder.set_split_randomization(percent=split_randomization)
    splitMergeTrainer = builder.set_scc_merger(threshold=scc_merger_threshold, threads=threads).build()

    splitMergeTrainer.setMaxDrops(validationDropIterations, mode="smoothing")
    splitMergeTrainer.setEMepochs(em_epochs, mode="smoothing")

    # set initial latent annotations
    latentAnnotations = []
    for i in range(0, genetic_initial):
        splitMergeTrainer.reset_random_seed(seed + i + 1)
        la = splitMergeTrainer.split_merge_cycle(la_no_splits)
        if not la.check_for_validity():
            print('[Genetic] Initial LA', i, 'is not consistent! (See details before.)')
        if not la.is_proper():
            print('[Genetic] Initial LA', i, 'is not proper!')
        heapq.heappush(latentAnnotations,
                       (evaluate_la(grammar, grammarInfo, la, traceValidationGenetic,
                                    validation_genetic_corpus), i, la))
        print('[Genetic] added initial LA', i)

    (fBest, idBest, laBest) = min(latentAnnotations)
    validation_score = evaluate_la(grammar, grammarInfo, laBest, traceValidation, test_corpus)
    print("[Genetic] Started with best F-Score (Test) of", validation_score,
          "from Annotation", idBest)

    geneticCount = genetic_initial
    random.seed(seed)

    for gen_round in range(1, genetic_cycles + 1):
        print("[Genetic] Starting Recombination Round", gen_round)
        # newpopulation = list(latentAnnotations)
        newpopulation = []
        # cross all candidates
        for leftIndex in range(0, len(latentAnnotations)):
            (fLeft, idLeft, left) = latentAnnotations[leftIndex]
            # TODO: How to determine NTs to keep?

            # do SM training on the selected candidate
            print("[Genetic] do SM-training on", idLeft, "and create LA", geneticCount)
            la = splitMergeTrainer.split_merge_cycle(left)
            if not la.check_for_validity():
                print('[Genetic] Split/Merge introduced invalid weights into LA', geneticCount)
            if not la.is_proper():
                print('[Genetic] Split/Merge introduced problems with properness of LA', geneticCount)

            fscore = evaluate_la(grammar, grammarInfo, la, traceValidationGenetic,
                                 validation_genetic_corpus)
            print("[Genetic] LA", geneticCount, "has F-score:", fscore)
            heapq.heappush(newpopulation, (fscore, geneticCount, la))
            geneticCount += 1

        latentAnnotations = heapq.nsmallest(genetic_population,
                                            latentAnnotations + newpopulation)
        heapq.heapify(latentAnnotations)
        (fBest, idBest, laBest) = min(latentAnnotations)
        validation_score = evaluate_la(grammar, grammarInfo, laBest, traceValidation, test_corpus)
        print("[Genetic] Best LA", idBest, "has F-Score (Test) of", validation_score)