Example #1
def induce_grammar_from(dsg, rec_par, decomp, labeling=(lambda x, y: str(x)), terminal_labeling=id, terminal_labeling_lcfrs=None, start="START",
                        normalize=True, enforce_outputs=True):
    if terminal_labeling_lcfrs is None:
        terminal_labeling_lcfrs = terminal_labeling
    lcfrs = LCFRS(start=start)
    ordered_nodes = dsg.dog.ordered_nodes()
    rhs_nont = induce_grammar_rec(lcfrs, dsg, rec_par, decomp, labeling, terminal_labeling,
                                  terminal_labeling_lcfrs, normalize, enforce_outputs,
                                  ordered_nodes=ordered_nodes)
    rhs_top = dsg.dog.top(decomp[0])

    # construct a chain rule from START to initial nonterminal of decomposition
    # LCFRS part
    lcfrs_lhs = LCFRS_lhs(start)
    lcfrs_lhs.add_arg([LCFRS_var(0, 0)])

    # DOG part
    dog = DirectedOrderedGraph()
    assert len(dsg.dog.inputs) == 0
    assert not enforce_outputs or len(dsg.dog.outputs) > 0
    for i in range(len(rhs_top)):
        dog.add_node(i)
    for output in dsg.dog.outputs:
        dog.add_to_outputs(rhs_top.index(output))
    dog.add_nonterminal_edge([], [i for i in range(len(rhs_top))], enforce_outputs)

    # no sync
    sync = []
    lcfrs.add_rule(lcfrs_lhs, [rhs_nont], weight=1.0, dcp=[dog, sync])

    return lcfrs
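A minimal, self-contained sketch of the START chain-rule pattern used above, assuming only the LCFRS, LCFRS_lhs, and LCFRS_var classes as they appear throughout these examples ("NP" is a hypothetical nonterminal):

# LCFRS_var(i, j) refers to the j-th spanned component of the i-th RHS nonterminal.
grammar = LCFRS(start="START")
lhs = LCFRS_lhs("START")
lhs.add_arg([LCFRS_var(0, 0)])             # START covers the single component of NP
grammar.add_rule(lhs, ["NP"], weight=1.0)  # chain rule START -> NP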
Example #2
def induce(method=direct_extract_lcfrs):
    merged_gram = LCFRS(start=induction_start)
    first = first_training_sentence()
    last = last_training_sentence()
    print('Inducing grammar from', CORPUS, first, '-', last,
          'using method', method.__name__)
    n = do_range(first, last, lambda tree: add_gram(tree, merged_gram, method),
                 lambda tree: tree.complete() and not tree.empty_fringe())
    print('Trained on size:', n)
    merged_gram.make_proper()
    return merged_gram
Example #3
def induction_on_a_corpus(dsgs, rec_part_strategy, nonterminal_labeling, terminal_labeling, start="START",
                          normalize=True):
    grammar = LCFRS(start=start)
    for dsg in dsgs:
        rec_part = rec_part_strategy(dsg)
        # if calc_fanout(rec_part) > 1 or calc_rank(rec_part) > 2:
        #     rec_part = rec_part_strategy(dsg)
        #     assert False
        decomp = compute_decomposition(dsg, rec_part)
        dsg_grammar = induce_grammar_from(dsg, rec_part, decomp, nonterminal_labeling, terminal_labeling, terminal_labeling, start,
                                          normalize)
        grammar.add_gram(dsg_grammar)
    return grammar
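A hedged usage sketch for the corpus-level loop above; dsgs, rec_part_strategy, and both labeling functions are assumed to be supplied by the caller, and make_proper() (called after merging in the other examples on this page) is assumed to normalize the merged rule weights:

merged = induction_on_a_corpus(dsgs, rec_part_strategy,
                               nonterminal_labeling, terminal_labeling)
merged.make_proper()          # normalize weights after merging per-DSG grammars
for rule in merged.rules():   # inspect the induced rules
    print(rule)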
Example #4
    def build_grammar(self):
        grammar = LCFRS("S")

        lhs1 = LCFRS_lhs("S")
        lhs1.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
        rule_1 = grammar.add_rule(lhs1, ["S", "S"])

        lhs2 = LCFRS_lhs("S")
        lhs2.add_arg(["a"])
        rule_2 = grammar.add_rule(lhs2, [])

        lhs3 = LCFRS_lhs("A")
        lhs3.add_arg(["a"])
        rule_3 = grammar.add_rule(lhs3, [])

        return grammar, rule_1.get_idx(), rule_2.get_idx()
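The returned indices can be fed back into grammar.rule_index, the lookup used in Example #12 below; a hedged sketch, with builder standing in for whatever test class defines build_grammar:

grammar, idx1, idx2 = builder.build_grammar()
print(grammar.rule_index(idx1))  # retrieve the "S -> S S" rule by its index
print(grammar.rule_index(idx2))  # retrieve the "S -> a" rule by its index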
Example #5
    def build_paper_grammar():
        grammar = LCFRS("S")
        # rule 0
        lhs = LCFRS_lhs("B")
        lhs.add_arg(["a"])
        grammar.add_rule(lhs, [])

        # rule 1
        lhs = LCFRS_lhs("S")
        lhs.add_arg([LCFRS_var(0, 0)])
        grammar.add_rule(lhs, ["B"])

        # rule 2
        lhs = LCFRS_lhs("B")
        lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
        grammar.add_rule(lhs, ["B", "B"])

        grammar.make_proper()
        return grammar
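A hedged check of the resulting weights, assuming (as the projection test in Example #12 suggests) that make_proper() renormalizes weights per LHS nonterminal and that rule indices follow insertion order: B has two rules with raw weight 1 each, so both should end up at 0.5, while the single S-rule keeps weight 1.0.

grammar = build_paper_grammar()
print(grammar.rule_index(0).weight())  # B -> a;   expected 0.5 under the assumption above
print(grammar.rule_index(2).weight())  # B -> B B; expected 0.5
print(grammar.rule_index(1).weight())  # S -> B;   expected 1.0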
Example #6
    def test_basic_sdcp_parsing_constituency(self):
        tree1 = constituent_tree_1()
        tree2 = constituent_tree_2()
        tree3 = constituent_tree_1_pos_stripped()

        terminal_labeling = FormTerminals()
        fanout = 1

        grammar = LCFRS('START')
        for tree in [tree1, tree2]:
            tree_part = tree.unlabelled_structure()
            part = fanout_limited_partitioning(tree_part, fanout)
            tree_grammar = fringe_extract_lcfrs(tree, part, naming='child', term_labeling=terminal_labeling)
            grammar.add_gram(tree_grammar)
        grammar.make_proper()

        print("grammar induced. Printing rules...", file=stderr)

        for rule in grammar.rules():
            print(rule, file=stderr)

        parser_type = LCFRS_sDCP_Parser

        print("preprocessing grammar", file=stderr)

        parser_type.preprocess_grammar(grammar, terminal_labeling, debug=True)

        print("invoking parser", file=stderr)

        parser = parser_type(grammar, tree1)

        print("listing derivations", file=stderr)

        for der in parser.all_derivation_trees():
            print(der)
            output_tree = ConstituentTree(tree1.sent_label())
            tokens = [construct_constituent_token(token.form(), '--', True) for token in tree1.token_yield()]
            dcp_to_hybridtree(output_tree, DCP_evaluator(der).getEvaluation(), tokens, False,
                              construct_constituent_token)
            print(tree1)
            print(output_tree)

        parser = parser_type(grammar, tree3)
        print(parser.recognized())
        for der in parser.all_derivation_trees():
            print(der)
            output_tree = ConstituentTree(tree3.sent_label())
            tokens = [construct_constituent_token(token.form(), '--', True) for token in tree3.token_yield()]
            dcp_to_hybridtree(output_tree, DCP_evaluator(der).getEvaluation(), tokens, False,
                              construct_constituent_token)
            print(tree3)
            print(output_tree)

        print("completed test", file=stderr)
Example #7
def induce(trees,
           partition_builder,
           terminal_labeling,
           nonterminal_counts,
           start=START):
    grammar = LCFRS(start=start)
    n_trees = len(trees)
    for i, tree in enumerate(trees):
        # if not i % 1000:
        #     print(f'starting induction on tree {i} out of {n_trees}')
        # tree contains nodes
        if tree.n_yield_nodes():
            partition = partition_builder(tree=tree)
            # print(pretty_print_partition(partition=partition))
            __rec_induce(tree=tree,
                         grammar=grammar,
                         string_partition=partition,
                         terminal_labeling=terminal_labeling,
                         nonterminal_counts=nonterminal_counts)
    return grammar
Example #8
    def induce_grammar(self, corpus, start="START"):
        grammar = LCFRS(start=start)
        for obj in corpus:
            obj = self.preprocess_before_induction(obj)
            obj_grammar, features = self.induce_from(obj)
            if obj_grammar is None:
                continue
            if features is None:
                grammar.add_gram(obj_grammar, None)
            else:
                grammar.add_gram(obj_grammar, (self.feature_log, features))
        self.postprocess_grammar(grammar)
        self.base_grammar = grammar
        _, path = tempfile.mkstemp(suffix=".base.grammar", dir=self.directory)
        with open(path, 'wb') as f:
            pickle.dump(self.base_grammar, f)
        self.stage_dict["base_grammar"] = path
Example #9
def direct_extract_lcfrs_from_prebinarized_corpus(tree,
                                                  term_labeling=PosTerminals(),
                                                  nont_labeling=BasicNonterminalLabeling(),
                                                  isolate_pos=True):
    gram = LCFRS(start=START)
    root = tree.root[0]
    if root in tree.full_yield():
        lhs = LCFRS_lhs(START)
        label = term_labeling.token_label(tree.node_token(root))
        lhs.add_arg([label])
        dcp_rule = DCP_rule(DCP_var(-1, 0), [DCP_term(DCP_index(0, edge_label=tree.node_token(root).edge()), [])])
        gram.add_rule(lhs, [], dcp=[dcp_rule])
    else:
        first, _, _ = direct_extract_lcfrs_prebinarized_recur(tree, root, gram, term_labeling, nont_labeling, isolate_pos)
        lhs = LCFRS_lhs(START)
        lhs.add_arg([LCFRS_var(0, 0)])
        dcp_rule = DCP_rule(DCP_var(-1, 0), [DCP_var(0, 0)])
        gram.add_rule(lhs, [first], dcp=[dcp_rule])
    return gram
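Example #11 below uses this extractor in context; the per-tree call pattern reduces to the sketch here, with hybrid_tree_bin standing for one prebinarized hybrid tree:

merged = LCFRS(start=START)
tree_gram = direct_extract_lcfrs_from_prebinarized_corpus(hybrid_tree_bin)
merged.add_gram(tree_gram)  # merge the per-tree grammar into the corpus grammar
merged.make_proper()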
Example #10
def induce_grammar(trees, nont_labelling, term_labelling, recursive_partitioning, start_nont='START'):
    """
    :rtype: LCFRS
    :param trees: corpus of HybridTree (i.e. list (or Generator for lazy IO))
    :type trees: __generator[HybridTree]
    :type nont_labelling: AbstractLabeling
    :param term_labelling: HybridTree, NodeId -> str
    :param recursive_partitioning: HybridTree -> RecursivePartitioning
    :type start_nont: str
    :rtype: int, LCFRS

    Top level method to induce an LCFRS/DCP-hybrid grammar for dependency parsing.
    """
    grammar = LCFRS(start_nont)
    n_trees = 0
    for tree in trees:
        n_trees += 1
        for rec_par in recursive_partitioning:
            match = re.search(r'no_new_nont', rec_par.__name__)
            if match:
                rec_par_int = rec_par(tree, grammar.nonts(), nont_labelling)
            else:
                rec_par_int = rec_par(tree)

            rec_par_nodes = tree.node_id_rec_par(rec_par_int)

            (_, _, nont_name) = add_rules_to_grammar_rec(tree, rec_par_nodes, grammar, nont_labelling, term_labelling)

            # Add rule from top start symbol to top most nonterminal for the hybrid tree
            lhs = LCFRS_lhs(start_nont)
            lhs.add_arg([LCFRS_var(0, 0)])
            rhs = [nont_name]
            dcp_rule = DCP_rule(DCP_var(-1, 0), [DCP_var(0, 0)])

            grammar.add_rule(lhs, rhs, 1.0, [dcp_rule])

    grammar.make_proper()
    return n_trees, grammar
Example #11
    def test_negra_dag_small_grammar(self):
        DAG_CORPUS = 'res/tiger/tiger_full_with_sec_edges.export'
        DAG_CORPUS_BIN = 'res/tiger/tiger_full_with_sec_edges_bin_h1_v1.export'
        names = [str(i) for i in range(1, 101)]
        if not os.path.exists(DAG_CORPUS):
            print(
                'run the following command to create an export corpus with dags:'
            )
            print('\tPYTHONPATH=. util/tiger_dags_to_negra.py ' +
                  'res/tiger/tiger_release_aug07.corrected.16012013.xml ' +
                  DAG_CORPUS + ' 1 50474')
        self.assertTrue(os.path.exists(DAG_CORPUS))

        if not os.path.exists(DAG_CORPUS_BIN):
            print(
                'run the following command to binarize the export corpus with dags:'
            )
            print("discodop treetransforms --binarize -v 1 -h 1 " +
                  DAG_CORPUS + " " + DAG_CORPUS_BIN)
            # _, DAG_CORPUS_BIN = tempfile.mkstemp(prefix='corpus_bin_', suffix='.export')
            # subprocess.call(["discodop", "treetransforms", "--binarize", "-v", "1", "-h", "1", DAG_CORPUS, DAG_CORPUS_BIN])
        self.assertTrue(os.path.exists(DAG_CORPUS_BIN))
        corpus = np.sentence_names_to_hybridtrees(names,
                                                  DAG_CORPUS,
                                                  secedge=True)
        corpus_bin = np.sentence_names_to_hybridtrees(names,
                                                      DAG_CORPUS_BIN,
                                                      secedge=True)

        grammar = LCFRS(start="START")

        for hybrid_dag, hybrid_dag_bin in zip(corpus, corpus_bin):
            self.assertEqual(len(hybrid_dag.token_yield()),
                             len(hybrid_dag_bin.token_yield()))

            dag_grammar = direct_extract_lcfrs_from_prebinarized_corpus(
                hybrid_dag_bin)
            grammar.add_gram(dag_grammar)

        grammar.make_proper()
        print(
            "Extracted LCFRS/DCP-hybrid grammar with %i nonterminals and %i rules"
            % (len(grammar.nonts()), len(grammar.rules())))

        parser = DiscodopKbestParser(grammar, k=1)

        _, RESULT_FILE = tempfile.mkstemp(prefix='parser_results_',
                                          suffix='.export')

        with open(RESULT_FILE, 'w') as results:
            for hybrid_dag in corpus:

                poss = list(map(lambda x: x.pos(), hybrid_dag.token_yield()))
                parser.set_input(poss)
                parser.parse()
                self.assertTrue(parser.recognized())
                der = parser.best_derivation_tree()

                dcp_term = DCP_evaluator(der).getEvaluation()
                dag_eval = HybridDag(hybrid_dag.sent_label())
                dcp_to_hybriddag(dag_eval,
                                 dcp_term,
                                 copy.deepcopy(hybrid_dag.token_yield()),
                                 False,
                                 construct_token=construct_constituent_token)
                lines = np.serialize_hybridtrees_to_negra(
                    [dag_eval], 1, 500, use_sentence_names=True)
                for line in lines:
                    print(line, end='', file=results)
                parser.clear()

        print("Wrote results to %s" % RESULT_FILE)
Example #12
    def __test_projection(self,
                          split_weights,
                          goal_weights,
                          merge_method=False):
        grammar = LCFRS("S")
        # rule 0
        lhs = LCFRS_lhs("S")
        lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
        grammar.add_rule(lhs, ["A", "A"])

        # rule 1
        lhs = LCFRS_lhs("A")
        lhs.add_arg(["a"])
        grammar.add_rule(lhs, [])

        lhs = LCFRS_lhs("A")
        lhs.add_arg(["b"])
        grammar.add_rule(lhs, [], weight=2.0)

        grammar.make_proper()
        # print(grammar)

        nonterminal_map = Enumerator()
        grammarInfo = PyGrammarInfo(grammar, nonterminal_map)
        storageManager = PyStorageManager()

        la = build_PyLatentAnnotation([1, 2], [1.0], split_weights,
                                      grammarInfo, storageManager)

        # parser = LCFRS_parser(grammar)
        # parser.set_input(["a", "b"])
        # parser.parse()
        # der = parser.best_derivation_tree()

        # print(la.serialize())
        if merge_method:
            la.project_weights(grammar, grammarInfo)
        else:
            splits, _, _ = la.serialize()
            # one merge group per nonterminal, collapsing all of its splits
            merge_sources = [[list(range(splits[nont_idx]))]
                             for nont_idx in range(nonterminal_map.get_counter())]

            # print("Projecting to fine grammar LA", file=self.logger)
            coarse_la = la.project_annotation_by_merging(grammarInfo,
                                                         merge_sources,
                                                         debug=False)
            coarse_la.project_weights(grammar, grammarInfo)

        # print(grammar)
        for i in range(3):
            self.assertAlmostEqual(
                grammar.rule_index(i).weight(), goal_weights[i])
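For reference, merge_sources is a nested list: one entry per nonterminal, each entry a list of merge groups, each group listing the latent splits that collapse into a single annotation. A hypothetical one-nonterminal illustration:

# nonterminal 0 has four latent splits; merge them pairwise into two annotations
merge_sources = [[[0, 1], [2, 3]]]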
Example #13
    def build_nm_grammar():
        grammar = LCFRS("START")
        # rule 0
        lhs = LCFRS_lhs("START")
        lhs.add_arg([LCFRS_var(0, 0)])
        grammar.add_rule(lhs, ["S"])

        # rule 1
        lhs = LCFRS_lhs("S")
        lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0), LCFRS_var(0, 1), LCFRS_var(1, 1)])
        grammar.add_rule(lhs, ["N", "M"])

        for nont, term in [("A", "a"), ("B", "b"), ("C", "c"), ("D", "d")]:
            # rule 2
            lhs = LCFRS_lhs(nont)
            lhs.add_arg([term])
            grammar.add_rule(lhs, [])

        for nont, nont_, c1, c2 in [("N", "N'", "A", "C"), ("M", "M'", "B", "D")]:
            # rule 3
            lhs = LCFRS_lhs(nont)
            lhs.add_arg([LCFRS_var(0, 0)])
            lhs.add_arg([LCFRS_var(1, 0)])
            grammar.add_rule(lhs, [c1, c2])

            # rule 4
            lhs = LCFRS_lhs(nont)
            lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
            lhs.add_arg([LCFRS_var(0, 1)])
            grammar.add_rule(lhs, [nont_, c1])

            # rule 5
            lhs = LCFRS_lhs(nont_)
            lhs.add_arg([LCFRS_var(0, 0)])
            lhs.add_arg([LCFRS_var(0, 1), LCFRS_var(1, 0)])
            grammar.add_rule(lhs, [nont, c2])

        grammar.make_proper()
        return grammar
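The grammar above derives the cross-serial language {a^n b^m c^n d^m : n, m >= 1}: N yields the string pair (a^n, c^n), M yields (b^m, d^m), and the fan-out-2 S-rule interleaves their components. A hedged sanity check using the LCFRS_parser interface sketched in the comments of Example #12:

parser = LCFRS_parser(build_nm_grammar())
parser.set_input(list("aabccd"))  # a^2 b^1 c^2 d^1
parser.parse()
print(parser.recognized())        # expected: True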
Example #14
    def test_la_viterbi_parsing_3(self):
        grammar = LCFRS("S")

        # rule 0
        lhs = LCFRS_lhs("B")
        lhs.add_arg(["a"])
        grammar.add_rule(lhs, [], 0.25)

        # rule 1
        lhs = LCFRS_lhs("A")
        lhs.add_arg(["a"])
        grammar.add_rule(lhs, [], 0.5)

        # rule 2
        lhs = LCFRS_lhs("S")
        lhs.add_arg([LCFRS_var(0, 0)])
        grammar.add_rule(lhs, ["B"], 1.0)

        # rule 3
        lhs = LCFRS_lhs("A")
        lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
        grammar.add_rule(lhs, ["A", "B"], 0.5)

        # rule 4
        lhs = LCFRS_lhs("B")
        lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
        grammar.add_rule(lhs, ["A", "B"], 0.75)

        grammar.make_proper()

        inp = ["a"] * 3

        nontMap = Enumerator()
        gi = PyGrammarInfo(grammar, nontMap)
        sm = PyStorageManager()
        print(nontMap.object_index("S"))
        print(nontMap.object_index("B"))

        la = build_PyLatentAnnotation_initial(grammar, gi, sm)
        parser = DiscodopKbestParser(grammar, la=la, nontMap=nontMap, grammarInfo=gi, latent_viterbi_mode=True)
        parser.set_input(inp)
        parser.parse()
        self.assertTrue(parser.recognized())
        der = parser.latent_viterbi_derivation(True)
        print(der)

        der2 = None

        for w, der_ in parser.k_best_derivation_trees():
            if der2 is None:
                der2 = der_
            print(w, der_)

        print(der2)
Example #15
    def build_grammar():
        grammar = LCFRS("START")
        # rule 0
        lhs = LCFRS_lhs("START")
        lhs.add_arg([LCFRS_var(0, 0)])
        grammar.add_rule(lhs, ["S"])

        # rule 1
        lhs = LCFRS_lhs("S")
        lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
        grammar.add_rule(lhs, ["S", "S"])

        # rule 1.5
        lhs = LCFRS_lhs("S")
        lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
        grammar.add_rule(lhs, ["S", "S"], dcp=["1.5"])

        # rule 2
        lhs = LCFRS_lhs("S")
        lhs.add_arg(["a"])
        grammar.add_rule(lhs, [])

        # rule 3
        lhs = LCFRS_lhs("S")
        lhs.add_arg(["b"])
        grammar.add_rule(lhs, [], weight=2.0)

        # rule 4
        lhs = LCFRS_lhs("S")
        lhs.add_arg(["b"])
        grammar.add_rule(lhs, [], dcp=["4"])

        # rule 5
        lhs = LCFRS_lhs("A")
        lhs.add_arg(["a"])
        grammar.add_rule(lhs, [])

        grammar.make_proper()
        return grammar
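Rules 1 and 1.5 (likewise rules 3 and 4) share LHS and RHS and differ only in their dcp attachment, so they are stored as separate rules. Assuming make_proper() renormalizes per LHS nonterminal and rule indices follow insertion order, the five S-rules (raw weights 1, 1, 1, 2, 1) sum to 6, so the weight-2.0 rule should end up at 1/3; a hedged check:

grammar = build_grammar()
print(grammar.rule_index(4).weight())  # S -> "b" with raw weight 2.0; expected 2/6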
Example #16
    def test_projection_based_parser_k_best_hack(self):
        grammar = LCFRS("S")

        # rule 0
        lhs = LCFRS_lhs("B")
        lhs.add_arg(["a"])
        grammar.add_rule(lhs, [], 0.25)

        # rule 1
        lhs = LCFRS_lhs("A")
        lhs.add_arg(["a"])
        grammar.add_rule(lhs, [], 0.5)

        # rule 2
        lhs = LCFRS_lhs("S")
        lhs.add_arg([LCFRS_var(0, 0)])
        grammar.add_rule(lhs, ["B"], 1.0)

        # rule 3
        lhs = LCFRS_lhs("A")
        lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
        grammar.add_rule(lhs, ["A", "B"], 0.5)

        # rule 4
        lhs = LCFRS_lhs("B")
        lhs.add_arg([LCFRS_var(0, 0), LCFRS_var(1, 0)])
        grammar.add_rule(lhs, ["A", "B"], 0.75)

        grammar.make_proper()

        inp = ["a"] * 3
        nontMap = Enumerator()
        gi = PyGrammarInfo(grammar, nontMap)
        sm = PyStorageManager()
        la = build_PyLatentAnnotation_initial(grammar, gi, sm)

        parser = Coarse_to_fine_parser(grammar,
                                       la,
                                       gi,
                                       nontMap,
                                       base_parser_type=GFParser_k_best)
        parser.set_input(inp)
        parser.parse()
        self.assertTrue(parser.recognized())
        der = parser.max_rule_product_derivation()
        print(der)

        der = parser.best_derivation_tree()
        print(der)

        for node in der.ids():
            print(der.getRule(node), der.spanned_ranges(node))
Example #17
def main():
    # # induce or load grammar
    # if not os.path.isfile(grammar_path):
    #     grammar = LCFRS('START')
    #     for tree in train_corpus:
    #         if not tree.complete() or tree.empty_fringe():
    #             continue
    #         part = recursive_partitioning(tree)
    #         tree_grammar = fringe_extract_lcfrs(tree, part, naming='child', term_labeling=terminal_labeling)
    #         grammar.add_gram(tree_grammar)
    #     grammar.make_proper()
    #     pickle.dump(grammar, open(grammar_path, 'wb'))
    # else:
    #     grammar = pickle.load(open(grammar_path, 'rb'))

    grammar = LCFRS('START')
    for tree in train_corpus:
        if not tree.complete() or tree.empty_fringe():
            continue
        part = recursive_partitioning(tree)
        tree_grammar = fringe_extract_lcfrs(tree,
                                            part,
                                            naming='child',
                                            term_labeling=terminal_labeling)
        grammar.add_gram(tree_grammar)
    grammar.make_proper()

    # # compute or load reducts
    # if not os.path.isfile(reduct_path):
    #     traceTrain = compute_reducts(grammar, train_corpus, terminal_labeling)
    #     traceTrain.serialize(reduct_path)
    # else:
    #     traceTrain = PySDCPTraceManager(grammar, terminal_labeling)
    #     traceTrain.load_traces_from_file(reduct_path)

    traceTrain = compute_reducts(grammar, train_corpus, terminal_labeling)
    traceValidationGenetic = compute_reducts(grammar,
                                             validation_genetic_corpus,
                                             terminal_labeling)
    traceValidation = compute_reducts(grammar, validation_corpus,
                                      terminal_labeling)

    # prepare EM training
    grammarInfo = PyGrammarInfo(grammar, traceTrain.get_nonterminal_map())
    if not grammarInfo.check_for_consistency():
        print("[Genetic] GrammarInfo is not consistent!")

    storageManager = PyStorageManager()

    em_builder = PySplitMergeTrainerBuilder(traceTrain, grammarInfo)
    em_builder.set_em_epochs(em_epochs)
    em_builder.set_simple_expector(threads=threads)
    emTrainer = em_builder.build()

    # randomize initial weights and do em training
    la_no_splits = build_PyLatentAnnotation_initial(grammar, grammarInfo,
                                                    storageManager)
    la_no_splits.add_random_noise(seed=seed)
    emTrainer.em_train(la_no_splits)
    la_no_splits.project_weights(grammar, grammarInfo)

    # emTrainerOld = PyEMTrainer(traceTrain)
    # emTrainerOld.em_training(grammar, 30, "rfe", tie_breaking=True)

    # compute parses for validation set
    baseline_parser = GFParser_k_best(grammar, k=k_best)
    validator = build_score_validator(grammar, grammarInfo,
                                      traceTrain.get_nonterminal_map(),
                                      storageManager, terminal_labeling,
                                      baseline_parser, validation_corpus,
                                      validationMethod)
    del baseline_parser

    # prepare SM training
    builder = PySplitMergeTrainerBuilder(traceTrain, grammarInfo)
    builder.set_em_epochs(em_epochs)
    builder.set_split_randomization(1.0, seed + 1)
    builder.set_simple_expector(threads=threads)
    builder.set_score_validator(validator, validationDropIterations)
    builder.set_smoothing_factor(smoothingFactor=smoothing_factor)
    builder.set_split_randomization(percent=split_randomization)
    splitMergeTrainer = builder.set_scc_merger(threshold=scc_merger_threshold,
                                               threads=threads).build()

    splitMergeTrainer.setMaxDrops(validationDropIterations, mode="smoothing")
    splitMergeTrainer.setEMepochs(em_epochs, mode="smoothing")

    # set initial latent annotation
    latentAnnotations = []
    for i in range(0, genetic_initial):
        splitMergeTrainer.reset_random_seed(seed + i + 1)
        la = splitMergeTrainer.split_merge_cycle(la_no_splits)
        if not la.check_for_validity():
            print('[Genetic] Initial LA', i,
                  'is not consistent! (See details before)')
        if not la.is_proper():
            print('[Genetic] Initial LA', i, 'is not proper!')
        heapq.heappush(
            latentAnnotations,
            (evaluate_la(grammar, grammarInfo, la, traceValidationGenetic,
                         validation_genetic_corpus), i, la))
        print('[Genetic]    added initial LA', i)
    (fBest, idBest, laBest) = min(latentAnnotations)
    validation_score = evaluate_la(grammar, grammarInfo, laBest,
                                   traceValidation, test_corpus)
    print("[Genetic] Started with best F-Score (Test) of", validation_score,
          "from Annotation ", idBest)

    geneticCount = genetic_initial
    random.seed(seed)
    for cycle in range(1, genetic_cycles + 1):
        print("[Genetic] Starting Recombination Round", cycle)
        # newpopulation = list(latentAnnotations)
        newpopulation = []
        # Cross all candidates!
        for (fLeft, idLeft, left) in latentAnnotations:
            # TODO: How to determine NTs to keep?

            # do SM-Training
            print("[Genetic] do SM-training on", idLeft, "and create LA",
                  geneticCount)
            la = splitMergeTrainer.split_merge_cycle(left)
            if not la.check_for_validity():
                print(
                    '[Genetic] Split/Merge introduced invalid weights into LA',
                    geneticCount)
            if not la.is_proper():
                print(
                    '[Genetic] Split/Merge introduced problems with properness of LA',
                    geneticCount)

            fscore = evaluate_la(grammar, grammarInfo, la,
                                 traceValidationGenetic,
                                 validation_genetic_corpus)
            print("[Genetic] LA", geneticCount, "has F-score: ", fscore)
            heapq.heappush(newpopulation, (fscore, geneticCount, la))
            geneticCount += 1
        heapq.heapify(newpopulation)
        latentAnnotations = heapq.nsmallest(
            genetic_population, heapq.merge(latentAnnotations, newpopulation))
        heapq.heapify(latentAnnotations)
        (fBest, idBest, laBest) = min(latentAnnotations)
        validation_score = evaluate_la(grammar, grammarInfo, laBest,
                                       traceValidation, test_corpus)
        print("[Genetic] Best LA", idBest, "has F-Score (Test) of ",
              validation_score)
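The selection step above relies on standard heapq behavior over (score, id, annotation) triples, where a lower score counts as better; a self-contained sketch of just that pattern:

import heapq

population = []                                  # heap of (score, id, annotation)
heapq.heappush(population, (0.42, 0, "la0"))     # tuples compare by score first
heapq.heappush(population, (0.37, 1, "la1"))
best_score, best_id, best_la = min(population)   # lowest score = current best
newcomers = [(0.35, 2, "la2")]
POP_SIZE = 2
# keep the POP_SIZE best of old and new; nsmallest accepts any iterable
population = heapq.nsmallest(POP_SIZE, population + newcomers)
heapq.heapify(population)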