    def do_em_training(self):
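        """Run plain EM on the training reducts and project the trained weights onto the base grammar."""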
        em_builder = PySplitMergeTrainerBuilder(
            self.organizer.training_reducts, self.organizer.grammarInfo)
        em_builder.set_em_epochs(self.organizer.em_epochs)
        em_builder.set_simple_expector(threads=self.organizer.threads)
        em_builder.set_scc_merger(self.organizer.merge_threshold)
        em_builder.set_scc_merge_threshold_function(
            self.organizer.merge_interpolation_factor)
        self.organizer.emTrainer = emTrainer = em_builder.build()

        initial_la = self.create_initial_la()

        emTrainer.em_train(initial_la)
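        # if projecting the trained weights back onto the base grammar fails, dump the
        # offending nonterminal, its splits, and its rule weights before re-raising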
        try:
            initial_la.project_weights(self.base_grammar,
                                       self.organizer.grammarInfo)
        except Exception as exc:
            nont_idx = exc.args[0]
            splits, root_weights, rule_weights = initial_la.serialize()
            nont = self.organizer.nonterminal_map.index_object(nont_idx)
            print(nont, nont_idx, splits[nont_idx], file=self.logger)
            for rule in self.base_grammar.lhs_nont_to_rules(nont):
                print(rule, rule_weights[rule.get_idx()], file=self.logger)
            raise

        self.organizer.latent_annotations[0] = initial_la
        self.organizer.last_sm_cycle = 0
        self.save_current_la()

    def prepare_split_merge_trainer(self):
        # prepare SM training
        builder = PySplitMergeTrainerBuilder(self.organizer.training_reducts,
                                             self.organizer.grammarInfo)
        builder.set_em_epochs(self.organizer.em_epochs_sm)
        builder.set_simple_expector(threads=self.organizer.threads)
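        # attach a validator over held-out data (if configured); validationDropIterations
        # limits how many iterations with a dropping validation score are tolerated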
        if self.organizer.validator_type == "SCORE":
            builder.set_score_validator(
                self.organizer.validator,
                self.organizer.validationDropIterations)
        elif self.organizer.validator_type == "SIMPLE":
            builder.set_simple_validator(
                self.organizer.validation_reducts,
                self.organizer.validationDropIterations)
        builder.set_smoothing_factor(
            smoothingFactor=self.organizer.smoothing_factor,
            smoothingFactorUnary=self.organizer.smoothing_factor_unary)
        builder.set_split_randomization(
            percent=self.organizer.split_randomization,
            seed=self.organizer.seed + 1)

        # set merger
        if self.organizer.merge_type == "SCC":
            builder.set_scc_merger(self.organizer.merge_threshold)
        elif self.organizer.merge_type == "THRESHOLD":
            builder.set_threshold_merger(self.organizer.merge_threshold)
        else:
            builder.set_percent_merger(self.organizer.merge_percentage)

        self.custom_sm_options(builder)
        self.organizer.splitMergeTrainer = builder.build()

        if self.organizer.validator_type in ["SCORE", "SIMPLE"]:
            self.organizer.splitMergeTrainer.setMaxDrops(
                self.organizer.validationDropIterations, mode="smoothing")
            self.organizer.splitMergeTrainer.setMinEpochs(
                self.organizer.min_epochs)
            self.organizer.splitMergeTrainer.setMinEpochs(
                self.organizer.min_epochs_smoothing, mode="smoothing")
            self.organizer.splitMergeTrainer.setIgnoreFailures(
                self.organizer.ignore_failures_smoothing, mode="smoothing")
        self.organizer.splitMergeTrainer.setEMepochs(
            self.organizer.em_epochs_sm, mode="smoothing")

def main():
    # induce grammar from a corpus
    trees = parse_conll_corpus(train, False, limit_train)
    nonterminal_labelling = the_labeling_factory().create_simple_labeling_strategy(
        "childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    start = 'START'
    recursive_partitioning = [cfg]
    _, grammar = induce_grammar(trees, nonterminal_labelling,
                                term_labelling.token_label,
                                recursive_partitioning, start)

    # compute some derivations
    derivations = obtain_derivations(grammar, term_labelling)

    # create derivation manager and add derivations
    manager = PyDerivationManager(grammar)
    manager.convert_derivations_to_hypergraphs(derivations)
    manager.serialize(b"/tmp/derivations.txt")

    # build and configure split/merge trainer and supplementary objects

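    # map every rule to the integer indices of its LHS and RHS nonterminals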
    rule_to_nonterminals = []
    for i in range(0, len(grammar.rule_index())):
        rule = grammar.rule_index(i)
        nonts = [
            manager.get_nonterminal_map().object_index(rule.lhs().nont())
        ] + [
            manager.get_nonterminal_map().object_index(nont)
            for nont in rule.rhs()
        ]
        rule_to_nonterminals.append(nonts)

    grammarInfo = PyGrammarInfo(grammar, manager.get_nonterminal_map())
    storageManager = PyStorageManager()
    builder = PySplitMergeTrainerBuilder(manager, grammarInfo)
    builder.set_em_epochs(20)
    builder.set_percent_merger(60.0)

    splitMergeTrainer = builder.build()

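    # start from the trivial latent annotation (each nonterminal still unsplit)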
    latentAnnotation = [
        build_PyLatentAnnotation_initial(grammar, grammarInfo, storageManager)
    ]

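    # run split/merge cycles; after each cycle build a smoothed grammar, report its size,
    # and optionally parse the test corpus with it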
    for i in range(max_cycles + 1):
        latentAnnotation.append(
            splitMergeTrainer.split_merge_cycle(latentAnnotation[-1]))
        # pickle.dump(map(lambda la: la.serialize(), latentAnnotation), open(sm_info_path, 'wb'))
        smGrammar = build_sm_grammar(latentAnnotation[i],
                                     grammar,
                                     grammarInfo,
                                     rule_pruning=0.0001,
                                     rule_smoothing=0.01)
        print("Cycle: ", i, "Rules: ", len(smGrammar.rules()))

        if parsing:
            parser = GFParser(smGrammar)

            trees = parse_conll_corpus(test, False, limit_test)
            for tree in trees:
                parser.set_input(
                    term_labelling.prepare_parser_input(tree.token_yield()))
                parser.parse()
                if parser.recognized():
                    print(
                        derivation_to_hybrid_tree(
                            parser.best_derivation_tree(),
                            [token.pos() for token in tree.token_yield()],
                            [token.form() for token in tree.token_yield()],
                            construct_constituent_token))
def main():
    # # induce or load grammar
    # if not os.path.isfile(grammar_path):
    #     grammar = LCFRS('START')
    #     for tree in train_corpus:
    #         if not tree.complete() or tree.empty_fringe():
    #             continue
    #         part = recursive_partitioning(tree)
    #         tree_grammar = fringe_extract_lcfrs(tree, part, naming='child', term_labeling=terminal_labeling)
    #         grammar.add_gram(tree_grammar)
    #     grammar.make_proper()
    #     pickle.dump(grammar, open(grammar_path, 'wb'))
    # else:
    #     grammar = pickle.load(open(grammar_path, 'rb'))

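    # induce an LCFRS: extract a small grammar from every complete training tree and merge them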
    grammar = LCFRS('START')
    for tree in train_corpus:
        if not tree.complete() or tree.empty_fringe():
            continue
        part = recursive_partitioning(tree)
        tree_grammar = fringe_extract_lcfrs(tree,
                                            part,
                                            naming='child',
                                            term_labeling=terminal_labeling)
        grammar.add_gram(tree_grammar)
    grammar.make_proper()

    # # compute or load reducts
    # if not os.path.isfile(reduct_path):
    #     traceTrain = compute_reducts(grammar, train_corpus, terminal_labeling)
    #     traceTrain.serialize(reduct_path)
    # else:
    #     traceTrain = PySDCPTraceManager(grammar, terminal_labeling)
    #     traceTrain.load_traces_from_file(reduct_path)

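    # compute reducts (per-sentence intersections of the grammar with the corpus)
    # for the training set and both validation sets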
    traceTrain = compute_reducts(grammar, train_corpus, terminal_labeling)
    traceValidationGenetic = compute_reducts(grammar,
                                             validation_genetic_corpus,
                                             terminal_labeling)
    traceValidation = compute_reducts(grammar, validation_corpus,
                                      terminal_labeling)

    # prepare EM training
    grammarInfo = PyGrammarInfo(grammar, traceTrain.get_nonterminal_map())
    if not grammarInfo.check_for_consistency():
        print("[Genetic] GrammarInfo is not consistent!")

    storageManager = PyStorageManager()

    em_builder = PySplitMergeTrainerBuilder(traceTrain, grammarInfo)
    em_builder.set_em_epochs(em_epochs)
    em_builder.set_simple_expector(threads=threads)
    emTrainer = em_builder.build()

    # randomize initial weights and do em training
    la_no_splits = build_PyLatentAnnotation_initial(grammar, grammarInfo,
                                                    storageManager)
    la_no_splits.add_random_noise(seed=seed)
    emTrainer.em_train(la_no_splits)
    la_no_splits.project_weights(grammar, grammarInfo)

    # emTrainerOld = PyEMTrainer(traceTrain)
    # emTrainerOld.em_training(grammar, 30, "rfe", tie_breaking=True)

    # compute parses for validation set
    baseline_parser = GFParser_k_best(grammar, k=k_best)
    validator = build_score_validator(grammar, grammarInfo,
                                      traceTrain.get_nonterminal_map(),
                                      storageManager, terminal_labeling,
                                      baseline_parser, validation_corpus,
                                      validationMethod)
    del baseline_parser

    # prepare SM training
    builder = PySplitMergeTrainerBuilder(traceTrain, grammarInfo)
    builder.set_em_epochs(em_epochs)
    builder.set_split_randomization(1.0, seed + 1)
    builder.set_simple_expector(threads=threads)
    builder.set_score_validator(validator, validationDropIterations)
    builder.set_smoothing_factor(smoothingFactor=smoothing_factor)
    builder.set_split_randomization(percent=split_randomization)
    splitMergeTrainer = builder.set_scc_merger(threshold=scc_merger_threshold,
                                               threads=threads).build()

    splitMergeTrainer.setMaxDrops(validationDropIterations, mode="smoothing")
    splitMergeTrainer.setEMepochs(em_epochs, mode="smoothing")

    # set initial latent annotation
    latentAnnotations = []
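    # candidate pool kept as a heap keyed by the genetic validation score; min() picks the best candidate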
    for i in range(0, genetic_initial):
        splitMergeTrainer.reset_random_seed(seed + i + 1)
        la = splitMergeTrainer.split_merge_cycle(la_no_splits)
        if not la.check_for_validity():
            print('[Genetic] Initial LA', i,
                  'is not consistent! (See details before)')
        if not la.is_proper():
            print('[Genetic] Initial LA', i, 'is not proper!')
        heapq.heappush(
            latentAnnotations,
            (evaluate_la(grammar, grammarInfo, la, traceValidationGenetic,
                         validation_genetic_corpus), i, la))
        print('[Genetic]    added initial LA', i)
    (fBest, idBest, laBest) = min(latentAnnotations)
    validation_score = evaluate_la(grammar, grammarInfo, laBest,
                                   traceValidation, test_corpus)
    print("[Genetic] Started with best F-Score (Test) of", validation_score,
          "from Annotation ", idBest)

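    # genetic loop: in each round run another split/merge cycle on every candidate,
    # then keep the genetic_population best annotations from the old and new pools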
    geneticCount = genetic_initial
    random.seed(seed)
    for round in range(1, genetic_cycles + 1):
        print("[Genetic] Starting Recombination Round ", round)
        # newpopulation = list(latentAnnotations)
        newpopulation = []
        # Cross all candidates!
        for leftIndex in range(0, len(latentAnnotations)):
            (fLeft, idLeft, left) = latentAnnotations[leftIndex]
            # TODO: How to determine NTs to keep?

            # do SM-Training
            print("[Genetic] do SM-training on", idLeft, "and create LA",
                  geneticCount)
            la = splitMergeTrainer.split_merge_cycle(left)
            if not la.check_for_validity():
                print(
                    '[Genetic] Split/Merge introduced invalid weights into LA',
                    geneticCount)
            if not la.is_proper():
                print(
                    '[Genetic] Split/Merge introduced problems with properness of LA',
                    geneticCount)

            fscore = evaluate_la(grammar, grammarInfo, la,
                                 traceValidationGenetic,
                                 validation_genetic_corpus)
            print("[Genetic] LA", geneticCount, "has F-score: ", fscore)
            heapq.heappush(newpopulation, (fscore, geneticCount, la))
            geneticCount += 1
        heapq.heapify(newpopulation)
        latentAnnotations = heapq.nsmallest(
            genetic_population, heapq.merge(latentAnnotations, newpopulation))
        heapq.heapify(latentAnnotations)
        (fBest, idBest, laBest) = min(latentAnnotations)
        validation_score = evaluate_la(grammar, grammarInfo, laBest,
                                       traceValidation, test_corpus)
        print("[Genetic] Best LA", idBest, "has F-Score (Test) of ",
              validation_score)

def run_experiment(rec_part_strategy,
                   nonterminal_labeling,
                   exp,
                   reorder_children,
                   binarize=True):
    start = 1
    stop = 7000

    test_start = 7001
    test_stop = 7200

    # path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
    corpus_path = "res/tiger/tiger_8000.xml"
    exclude = []
    train_dsgs = sentence_names_to_deep_syntax_graphs(
        ['s' + str(i) for i in range(start, stop + 1) if i not in exclude],
        corpus_path,
        hold=False,
        reorder_children=reorder_children)
    test_dsgs = sentence_names_to_deep_syntax_graphs(
        [
            's' + str(i)
            for i in range(test_start, test_stop + 1) if i not in exclude
        ],
        corpus_path,
        hold=False,
        reorder_children=reorder_children)

    # Grammar induction
    term_labeling_token = PosTerminals()

    def term_labeling(token):
        if isinstance(token, ConstituentTerminal):
            return term_labeling_token.token_label(token)
        else:
            return token

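    # optionally binarize the training graphs, marking introduced categories with a '-BAR'
    # suffix so they can be stripped again (debinarized) after parsing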
    if binarize:

        def modify_token(token):
            if isinstance(token, ConstituentCategory):
                token_new = deepcopy(token)
                token_new.set_category(token.category() + '-BAR')
                return token_new
            elif isinstance(token, str):
                return token + '-BAR'
            else:
                assert False

        train_dsgs = [
            dsg.binarize(bin_modifier=modify_token) for dsg in train_dsgs
        ]

        def is_bin(token):
            if isinstance(token, ConstituentCategory):
                if token.category().endswith('-BAR'):
                    return True
            elif isinstance(token, str):
                if token.endswith('-BAR'):
                    return True
            return False

        def debinarize(dsg):
            return dsg.debinarize(is_bin=is_bin)

    else:
        # no binarization was applied, so debinarization is the identity
        def debinarize(dsg):
            return dsg

    grammar = induction_on_a_corpus(train_dsgs, rec_part_strategy,
                                    nonterminal_labeling, term_labeling)
    grammar.make_proper()

    print("Nonterminals", len(grammar.nonts()), "Rules", len(grammar.rules()))

    parser = GFParser_k_best(grammar, k=500)
    return do_parsing(parser,
                      test_dsgs,
                      term_labeling_token,
                      oracle=True,
                      debinarize=debinarize)

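    # note: the code below is not reached because of the early return above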
    # Compute reducts, i.e., intersect grammar with each training dsg
    basedir = path.join('/tmp/dog_experiments', 'exp' + str(exp))
    reduct_dir = path.join(basedir, 'reduct_grammars')

    terminal_map = Enumerator()
    if not os.path.isdir(basedir):
        os.makedirs(basedir)
    data = export_dog_grammar_to_json(grammar, terminal_map)
    grammar_path = path.join(basedir, 'grammar.json')
    with open(grammar_path, 'w') as file:
        json.dump(data, file)

    corpus_path = path.join(basedir, 'corpus.json')
    with open(corpus_path, 'w') as file:
        json.dump(
            export_corpus_to_json(train_dsgs,
                                  terminal_map,
                                  terminal_labeling=term_labeling), file)

    with open(path.join(basedir, 'enumerator.enum'), 'w') as file:
        terminal_map.print_index(file)

    if os.path.isdir(reduct_dir):
        shutil.rmtree(reduct_dir)
    os.makedirs(reduct_dir)
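    # invoke the external Schick parser (dog-reduct) to intersect the exported grammar
    # with each training corpus item; the reducts are written to reduct_dir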
    p = subprocess.Popen([
        ' '.join([
            "java", "-jar",
            os.path.join("util", SCHICK_PARSER_JAR), 'dog-reduct', '-g',
            grammar_path, '-t', corpus_path, "-o", reduct_dir
        ])
    ],
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)

    while True:
        nextline = p.stdout.readline().decode()  # stdout is a byte stream; decode before printing
        if nextline == '' and p.poll() is not None:
            break
        sys.stdout.write(nextline)
        sys.stdout.flush()

    p.wait()
    p.stdout.close()

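    # read back the per-sentence regular tree grammars (reducts) produced above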
    rtgs = []
    for i in range(1, len(train_dsgs) + 1):
        rtgs.append(read_rtg(path.join(reduct_dir, str(i) + '.gra')))

    derivation_manager = PyDerivationManager(grammar)
    derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
    derivation_manager.serialize(path.join(basedir, 'reduct_manager.trace'))

    # Training
    ## prepare EM training
    em_epochs = 20
    seed = 0
    smoothing_factor = 0.01
    split_randomization = 0.01
    sm_cycles = 2
    merge_percentage = 50.0
    grammarInfo = PyGrammarInfo(grammar,
                                derivation_manager.get_nonterminal_map())
    storageManager = PyStorageManager()

    em_builder = PySplitMergeTrainerBuilder(derivation_manager, grammarInfo)
    em_builder.set_em_epochs(em_epochs)
    em_builder.set_simple_expector(threads=THREADS)
    emTrainer = em_builder.build()

    # randomize initial weights and do em training
    la_no_splits = build_PyLatentAnnotation_initial(grammar, grammarInfo,
                                                    storageManager)
    la_no_splits.add_random_noise(seed=seed)
    emTrainer.em_train(la_no_splits)
    la_no_splits.project_weights(grammar, grammarInfo)

    do_parsing(CFGParser(grammar), test_dsgs, term_labeling_token)
    return
    ## prepare SM training
    builder = PySplitMergeTrainerBuilder(derivation_manager, grammarInfo)
    builder.set_em_epochs(em_epochs)
    builder.set_split_randomization(1.0, seed + 1)
    builder.set_simple_expector(threads=THREADS)
    builder.set_smoothing_factor(smoothingFactor=smoothing_factor)
    builder.set_split_randomization(percent=split_randomization)
    # builder.set_scc_merger(-0.2)
    builder.set_percent_merger(merge_percentage)
    splitMergeTrainer = builder.build()

    # splitMergeTrainer.setMaxDrops(validationDropIterations, mode="smoothing")
    splitMergeTrainer.setEMepochs(em_epochs, mode="smoothing")

    # set initial latent annotation
    latentAnnotation = [la_no_splits]

    # carry out split/merge training and do parsing
    parsing_method = "filter-ctf"
    # parsing_method = "single-best-annotation"
    k_best = 50
    for i in range(1, sm_cycles + 1):
        splitMergeTrainer.reset_random_seed(seed + i + 1)
        latentAnnotation.append(
            splitMergeTrainer.split_merge_cycle(latentAnnotation[-1]))
        print("Cycle: ", i)
        if parsing_method == "single-best-annotation":
            smGrammar = latentAnnotation[i].build_sm_grammar(
                grammar, grammarInfo, rule_pruning=0.0001, rule_smoothing=0.1)
            print("Rules in smoothed grammar: ", len(smGrammar.rules()))
            parser = GFParser(smGrammar)
        elif parsing_method == "filter-ctf":
            latentAnnotation[-1].project_weights(grammar, grammarInfo)
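            # coarse-to-fine: parse with the projected base grammar first, then refine
            # the resulting items with the latent annotation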
            parser = Coarse_to_fine_parser(
                grammar,
                latentAnnotation[-1],
                grammarInfo,
                derivation_manager.get_nonterminal_map(),
                base_parser_type=GFParser_k_best,
                k=k_best)
        else:
            raise Exception("unknown parsing method: " + parsing_method)
        do_parsing(parser, test_dsgs, term_labeling_token)
        del parser