Example #1
 def initialize_parser(self):
     if "disco-dop" in self.parsing_mode:
         self.parser = DiscodopKbestParser(grammar=self.base_grammar,
                                           k=self.k_best,
                                           beam_beta=self.disco_dop_params["beam_beta"],
                                           beam_delta=self.disco_dop_params["beam_delta"],
                                           pruning_k=self.disco_dop_params["pruning_k"],
                                           cfg_ctf=self.disco_dop_params["cfg_ctf"])
     else:
         self.parser = GFParser_k_best(grammar=self.base_grammar, k=self.k_best,
                                       save_preprocessing=(self.directory, "gfgrammar"))
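Example #1 (and Example #2 below) reads several tuning values from self.disco_dop_params and dispatches on self.parsing_mode, but neither snippet shows how those fields are set up. The following is a minimal sketch of the configuration they assume; the dictionary keys match the lookups above, while the concrete values and the surrounding assignments are illustrative assumptions, not taken from the original project:

# Hypothetical setup, e.g. in the experiment's __init__ (values are placeholders):
self.k_best = 50                          # number of derivations requested from the k-best parser
self.parsing_mode = "k-best-disco-dop"    # any mode containing "disco-dop" selects DiscodopKbestParser
self.disco_dop_params = {
    "beam_beta": 1e-4,     # beam threshold (assumed value)
    "beam_delta": 50,      # beam width delta (assumed value)
    "pruning_k": 10000,    # items kept during coarse-stage pruning (assumed value)
    "cfg_ctf": True,       # enable the CFG coarse-to-fine stage (assumed value)
}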
Example #2
 def initialize_parser(self):
     save_preprocess = (self.directory, "mygrammar")
     k = 1 if not self.organizer.disable_split_merge or self.oracle_parsing else self.k_best
     if "disco-dop" in self.parsing_mode:
         self.parser = DiscodopKbestParser(grammar=self.base_grammar, k=self.k_best,
                                           cfg_ctf=self.disco_dop_params["cfg_ctf"],
                                           pruning_k=self.disco_dop_params["pruning_k"],
                                           beam_beta=self.disco_dop_params["beam_beta"],
                                           beam_delta=self.disco_dop_params["beam_delta"]
                                          )
     else:
         self.parser = GFParser_k_best(self.base_grammar, save_preprocessing=save_preprocess, k=k)
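Example #3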
def obtain_derivations(grammar, term_labelling):
    # build parser
    tree_yield = term_labelling.prepare_parser_input  # bound method, applied to each token yield below
    parser = GFParser_k_best(grammar, k=50)

    # parse sentences
    trees = parse_conll_corpus(test, False, limit_test)
    for i, tree in enumerate(trees):
        print("Parsing sentence ", i, file=stderr)

        parser.set_input(tree_yield(tree.token_yield()))
        parser.parse()

        derivations = list(parser.k_best_derivation_trees())

        print("# derivations: ", len(derivations), file=stderr)
        parser.clear()

        for der in derivations:
            yield der[1]
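Example #4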
    def prepare_sm_parser(self):
        last_la = self.organizer.latent_annotations[
            self.organizer.last_sm_cycle]
        if self.parsing_mode == "discodop-multi-method":
            if self.organizer.project_weights_before_parsing:
                self.project_weights()
            self.parser = DiscodopKbestParser(
                self.base_grammar,
                k=self.k_best,
                la=last_la,
                nontMap=self.organizer.nonterminal_map,
                variational=False,
                sum_op=False,
                cfg_ctf=self.disco_dop_params["cfg_ctf"],
                beam_beta=self.disco_dop_params["beam_beta"],
                beam_delta=self.disco_dop_params["beam_delta"],
                pruning_k=self.disco_dop_params["pruning_k"],
                grammarInfo=self.organizer.grammarInfo,
                projection_mode=False,
                latent_viterbi_mode=True,
                secondaries=[
                    "VARIATIONAL", "MAX-RULE-PRODUCT", "LATENT-RERANK"
                ])
            self.parser.k_best_reranker = Coarse_to_fine_parser(
                self.base_grammar,
                last_la,
                self.organizer.grammarInfo,
                self.organizer.nonterminal_map,
                base_parser=self.parser)

        elif self.parsing_mode == "best-latent-derivation":
            grammar = build_sm_grammar(last_la,
                                       self.base_grammar,
                                       self.organizer.grammarInfo,
                                       rule_pruning=0.0001,
                                       rule_smoothing=0.1)
            self.parser = GFParser_k_best(grammar=grammar,
                                          k=1,
                                          save_preprocessing=(self.directory,
                                                              "gfgrammar"))
        elif self.parsing_mode in {
                method + engine
                for method in {"k-best-rerank", "latent-viterbi"}
                for engine in {"-GF", "-disco-dop", ""}
        }:
            if self.organizer.project_weights_before_parsing:
                self.project_weights()
            if "disco-dop" in self.parsing_mode:
                engine = DiscodopKbestParser(
                    grammar=self.base_grammar,
                    k=self.k_best,
                    la=last_la,
                    nontMap=self.organizer.nonterminal_map,
                    grammarInfo=self.organizer.grammarInfo,
                    cfg_ctf=self.disco_dop_params["cfg_ctf"],
                    beam_beta=self.disco_dop_params["beam_beta"],
                    beam_delta=self.disco_dop_params["beam_delta"],
                    pruning_k=self.disco_dop_params["pruning_k"],
                    latent_viterbi_mode="latent-viterbi" in self.parsing_mode)
            else:
                engine = GFParser_k_best(grammar=self.base_grammar,
                                         k=self.k_best,
                                         heuristics=self.heuristics,
                                         save_preprocessing=(self.directory,
                                                             "gfgrammar"))
            if "latent-viterbi" in self.parsing_mode:
                self.parser = engine
            else:
                self.parser = Coarse_to_fine_parser(
                    self.base_grammar,
                    last_la,
                    self.organizer.grammarInfo,
                    self.organizer.nonterminal_map,
                    base_parser=engine)
        elif self.parsing_mode in {
                method + "%s" % engine
                for method in {"max-rule-prod", "max-rule-sum", "variational"}
                for engine in {"-GF", "-disco-dop", ""}
        }:
            if self.organizer.project_weights_before_parsing:
                self.project_weights()
            if "GF" in self.parsing_mode:
                self.parser = Coarse_to_fine_parser(
                    self.base_grammar,
                    last_la,
                    self.organizer.grammarInfo,
                    nontMap=self.organizer.nonterminal_map,
                    base_parser_type=GFParser_k_best,
                    k=self.k_best,
                    heuristics=self.heuristics,
                    save_preprocessing=(self.directory, "gfgrammar"),
                    mode=self.parsing_mode,
                    variational="variational" in self.parsing_mode,
                    sum_op="sum" in self.parsing_mode)
            else:
                self.parser = DiscodopKbestParser(
                    self.base_grammar,
                    k=self.k_best,
                    la=last_la,
                    nontMap=self.organizer.nonterminal_map,
                    variational="variational" in self.parsing_mode,
                    sum_op="sum" in self.parsing_mode,
                    cfg_ctf=self.disco_dop_params["cfg_ctf"],
                    beam_beta=self.disco_dop_params["beam_beta"],
                    beam_delta=self.disco_dop_params["beam_delta"],
                    pruning_k=self.disco_dop_params["pruning_k"],
                    grammarInfo=self.organizer.grammarInfo,
                    projection_mode=True)

        else:
            raise ValueError("Unknown parsing mode %s" % self.parsing_mode)
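The two set comprehensions in Example #4 accept whole families of mode strings. A quick way to list exactly which values of self.parsing_mode each branch matches, reconstructing only the comprehensions shown above:

methods = {"k-best-rerank", "latent-viterbi"}
engines = {"-GF", "-disco-dop", ""}
print(sorted(m + e for m in methods for e in engines))
# ['k-best-rerank', 'k-best-rerank-GF', 'k-best-rerank-disco-dop',
#  'latent-viterbi', 'latent-viterbi-GF', 'latent-viterbi-disco-dop']

methods = {"max-rule-prod", "max-rule-sum", "variational"}
print(sorted(m + e for m in methods for e in engines))
# ['max-rule-prod', 'max-rule-prod-GF', 'max-rule-prod-disco-dop',
#  'max-rule-sum', 'max-rule-sum-GF', 'max-rule-sum-disco-dop',
#  'variational', 'variational-GF', 'variational-disco-dop']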
Example #5
def main():
    # # induce or load grammar
    # if not os.path.isfile(grammar_path):
    #     grammar = LCFRS('START')
    #     for tree in train_corpus:
    #         if not tree.complete() or tree.empty_fringe():
    #             continue
    #         part = recursive_partitioning(tree)
    #         tree_grammar = fringe_extract_lcfrs(tree, part, naming='child', term_labeling=terminal_labeling)
    #         grammar.add_gram(tree_grammar)
    #     grammar.make_proper()
    #     pickle.dump(grammar, open(grammar_path, 'wb'))
    # else:
    #     grammar = pickle.load(open(grammar_path, 'rb'))

    grammar = LCFRS('START')
    for tree in train_corpus:
        if not tree.complete() or tree.empty_fringe():
            continue
        part = recursive_partitioning(tree)
        tree_grammar = fringe_extract_lcfrs(tree,
                                            part,
                                            naming='child',
                                            term_labeling=terminal_labeling)
        grammar.add_gram(tree_grammar)
    grammar.make_proper()

    # # compute or load reducts
    # if not os.path.isfile(reduct_path):
    #     traceTrain = compute_reducts(grammar, train_corpus, terminal_labeling)
    #     traceTrain.serialize(reduct_path)
    # else:
    #     traceTrain = PySDCPTraceManager(grammar, terminal_labeling)
    #     traceTrain.load_traces_from_file(reduct_path)

    traceTrain = compute_reducts(grammar, train_corpus, terminal_labeling)
    traceValidationGenetic = compute_reducts(grammar,
                                             validation_genetic_corpus,
                                             terminal_labeling)
    traceValidation = compute_reducts(grammar, validation_corpus,
                                      terminal_labeling)

    # prepare EM training
    grammarInfo = PyGrammarInfo(grammar, traceTrain.get_nonterminal_map())
    if not grammarInfo.check_for_consistency():
        print("[Genetic] GrammarInfo is not consistent!")

    storageManager = PyStorageManager()

    em_builder = PySplitMergeTrainerBuilder(traceTrain, grammarInfo)
    em_builder.set_em_epochs(em_epochs)
    em_builder.set_simple_expector(threads=threads)
    emTrainer = em_builder.build()

    # randomize initial weights and do em training
    la_no_splits = build_PyLatentAnnotation_initial(grammar, grammarInfo,
                                                    storageManager)
    la_no_splits.add_random_noise(seed=seed)
    emTrainer.em_train(la_no_splits)
    la_no_splits.project_weights(grammar, grammarInfo)

    # emTrainerOld = PyEMTrainer(traceTrain)
    # emTrainerOld.em_training(grammar, 30, "rfe", tie_breaking=True)

    # compute parses for validation set
    baseline_parser = GFParser_k_best(grammar, k=k_best)
    validator = build_score_validator(grammar, grammarInfo,
                                      traceTrain.get_nonterminal_map(),
                                      storageManager, terminal_labeling,
                                      baseline_parser, validation_corpus,
                                      validationMethod)
    del baseline_parser

    # prepare SM training
    builder = PySplitMergeTrainerBuilder(traceTrain, grammarInfo)
    builder.set_em_epochs(em_epochs)
    builder.set_split_randomization(1.0, seed + 1)
    builder.set_simple_expector(threads=threads)
    builder.set_score_validator(validator, validationDropIterations)
    builder.set_smoothing_factor(smoothingFactor=smoothing_factor)
    builder.set_split_randomization(percent=split_randomization)
    splitMergeTrainer = builder.set_scc_merger(threshold=scc_merger_threshold,
                                               threads=threads).build()

    splitMergeTrainer.setMaxDrops(validationDropIterations, mode="smoothing")
    splitMergeTrainer.setEMepochs(em_epochs, mode="smoothing")

    # set initial latent annotation
    latentAnnotations = []
    for i in range(0, genetic_initial):
        splitMergeTrainer.reset_random_seed(seed + i + 1)
        la = splitMergeTrainer.split_merge_cycle(la_no_splits)
        if not la.check_for_validity():
            print('[Genetic] Initial LA', i,
                  'is not consistent! (See details before)')
        if not la.is_proper():
            print('[Genetic] Initial LA', i, 'is not proper!')
        heapq.heappush(
            latentAnnotations,
            (evaluate_la(grammar, grammarInfo, la, traceValidationGenetic,
                         validation_genetic_corpus), i, la))
        print('[Genetic]    added initial LA', i)
    (fBest, idBest, laBest) = min(latentAnnotations)
    validation_score = evaluate_la(grammar, grammarInfo, laBest,
                                   traceValidation, test_corpus)
    print("[Genetic] Started with best F-Score (Test) of", validation_score,
          "from Annotation ", idBest)

    geneticCount = genetic_initial
    random.seed(seed)
    for round in range(1, genetic_cycles + 1):
        print("[Genetic] Starting Recombination Round ", round)
        # newpopulation = list(latentAnnotations)
        newpopulation = []
        # Cross all candidates!
        for leftIndex in range(0, len(latentAnnotations)):
            (fLeft, idLeft, left) = latentAnnotations[leftIndex]
            # TODO: How to determine NTs to keep?

            # do SM-Training
            print("[Genetic] do SM-training on", idLeft, "and create LA",
                  geneticCount)
            la = splitMergeTrainer.split_merge_cycle(left)
            if not la.check_for_validity():
                print(
                    '[Genetic] Split/Merge introduced invalid weights into LA',
                    geneticCount)
            if not la.is_proper():
                print(
                    '[Genetic] Split/Merge introduced problems with properness of LA',
                    geneticCount)

            fscore = evaluate_la(grammar, grammarInfo, la,
                                 traceValidationGenetic,
                                 validation_genetic_corpus)
            print("[Genetic] LA", geneticCount, "has F-score: ", fscore)
            heapq.heappush(newpopulation, (fscore, geneticCount, la))
            geneticCount += 1
        heapq.heapify(newpopulation)
        latentAnnotations = heapq.nsmallest(
            genetic_population, heapq.merge(latentAnnotations, newpopulation))
        heapq.heapify(latentAnnotations)
        (fBest, idBest, laBest) = min(latentAnnotations)
        validation_score = evaluate_la(grammar, grammarInfo, laBest,
                                       traceValidation, test_corpus)
        print("[Genetic] Best LA", idBest, "has F-Score (Test) of ",
              validation_score)
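Example #6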
def run_experiment(rec_part_strategy,
                   nonterminal_labeling,
                   exp,
                   reorder_children,
                   binarize=True):
    start = 1
    stop = 7000

    test_start = 7001
    test_stop = 7200

    # path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
    corpus_path = "res/tiger/tiger_8000.xml"
    exclude = []
    train_dsgs = sentence_names_to_deep_syntax_graphs(
        ['s' + str(i) for i in range(start, stop + 1) if i not in exclude],
        corpus_path,
        hold=False,
        reorder_children=reorder_children)
    test_dsgs = sentence_names_to_deep_syntax_graphs(
        [
            's' + str(i)
            for i in range(test_start, test_stop + 1) if i not in exclude
        ],
        corpus_path,
        hold=False,
        reorder_children=reorder_children)

    # Grammar induction
    term_labeling_token = PosTerminals()

    def term_labeling(token):
        if isinstance(token, ConstituentTerminal):
            return term_labeling_token.token_label(token)
        else:
            return token

    if binarize:

        def modify_token(token):
            if isinstance(token, ConstituentCategory):
                token_new = deepcopy(token)
                token_new.set_category(token.category() + '-BAR')
                return token_new
            elif isinstance(token, str):
                return token + '-BAR'
            else:
                assert False

        train_dsgs = [
            dsg.binarize(bin_modifier=modify_token) for dsg in train_dsgs
        ]

        def is_bin(token):
            if isinstance(token, ConstituentCategory):
                if token.category().endswith('-BAR'):
                    return True
            elif isinstance(token, str):
                if token.endswith('-BAR'):
                    return True
            return False

        def debinarize(dsg):
            return dsg.debinarize(is_bin=is_bin)

    else:
        debinarize = lambda dsg: dsg  # identity: grammar was not binarized, nothing to undo

    grammar = induction_on_a_corpus(train_dsgs, rec_part_strategy,
                                    nonterminal_labeling, term_labeling)
    grammar.make_proper()

    print("Nonterminals", len(grammar.nonts()), "Rules", len(grammar.rules()))

    parser = GFParser_k_best(grammar, k=500)
    return do_parsing(parser,
                      test_dsgs,
                      term_labeling_token,
                      oracle=True,
                      debinarize=debinarize)
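    # NOTE: the unconditional return above ends run_experiment here; the reduct
    # computation and split/merge training below are unreachable as written.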

    # Compute reducts, i.e., intersect grammar with each training dsg
    basedir = path.join('/tmp/dog_experiments', 'exp' + str(exp))
    reduct_dir = path.join(basedir, 'reduct_grammars')

    terminal_map = Enumerator()
    if not os.path.isdir(basedir):
        os.makedirs(basedir)
    data = export_dog_grammar_to_json(grammar, terminal_map)
    grammar_path = path.join(basedir, 'grammar.json')
    with open(grammar_path, 'w') as file:
        json.dump(data, file)

    corpus_path = path.join(basedir, 'corpus.json')
    with open(corpus_path, 'w') as file:
        json.dump(
            export_corpus_to_json(train_dsgs,
                                  terminal_map,
                                  terminal_labeling=term_labeling), file)

    with open(path.join(basedir, 'enumerator.enum'), 'w') as file:
        terminal_map.print_index(file)

    if os.path.isdir(reduct_dir):
        shutil.rmtree(reduct_dir)
    os.makedirs(reduct_dir)
    p = subprocess.Popen([
        ' '.join([
            "java", "-jar",
            os.path.join("util", SCHICK_PARSER_JAR), 'dog-reduct', '-g',
            grammar_path, '-t', corpus_path, "-o", reduct_dir
        ])
    ],
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)

    while True:
        nextline = p.stdout.readline()
        # p.stdout yields bytes; stop once the stream is exhausted and the process has exited
        if not nextline and p.poll() is not None:
            break
        sys.stdout.write(nextline.decode())
        sys.stdout.flush()

    p.wait()
    p.stdout.close()

    rtgs = []
    for i in range(1, len(train_dsgs) + 1):
        rtgs.append(read_rtg(path.join(reduct_dir, str(i) + '.gra')))

    derivation_manager = PyDerivationManager(grammar)
    derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
    derivation_manager.serialize(path.join(basedir, 'reduct_manager.trace'))

    # Training
    ## prepare EM training
    em_epochs = 20
    seed = 0
    smoothing_factor = 0.01
    split_randomization = 0.01
    sm_cycles = 2
    merge_percentage = 50.0
    grammarInfo = PyGrammarInfo(grammar,
                                derivation_manager.get_nonterminal_map())
    storageManager = PyStorageManager()

    em_builder = PySplitMergeTrainerBuilder(derivation_manager, grammarInfo)
    em_builder.set_em_epochs(em_epochs)
    em_builder.set_simple_expector(threads=THREADS)
    emTrainer = em_builder.build()

    # randomize initial weights and do em training
    la_no_splits = build_PyLatentAnnotation_initial(grammar, grammarInfo,
                                                    storageManager)
    la_no_splits.add_random_noise(seed=seed)
    emTrainer.em_train(la_no_splits)
    la_no_splits.project_weights(grammar, grammarInfo)

    do_parsing(CFGParser(grammar), test_dsgs, term_labeling_token)
    return
    ## prepare SM training
    builder = PySplitMergeTrainerBuilder(derivation_manager, grammarInfo)
    builder.set_em_epochs(em_epochs)
    builder.set_split_randomization(1.0, seed + 1)
    builder.set_simple_expector(threads=THREADS)
    builder.set_smoothing_factor(smoothingFactor=smoothing_factor)
    builder.set_split_randomization(percent=split_randomization)
    # builder.set_scc_merger(-0.2)
    builder.set_percent_merger(merge_percentage)
    splitMergeTrainer = builder.build()

    # splitMergeTrainer.setMaxDrops(validationDropIterations, mode="smoothing")
    splitMergeTrainer.setEMepochs(em_epochs, mode="smoothing")

    # set initial latent annotation
    latentAnnotation = [la_no_splits]

    # carry out split/merge training and do parsing
    parsing_method = "filter-ctf"
    # parsing_method = "single-best-annotation"
    k_best = 50
    for i in range(1, sm_cycles + 1):
        splitMergeTrainer.reset_random_seed(seed + i + 1)
        latentAnnotation.append(
            splitMergeTrainer.split_merge_cycle(latentAnnotation[-1]))
        print("Cycle: ", i)
        if parsing_method == "single-best-annotation":
            smGrammar = latentAnnotation[i].build_sm_grammar(
                grammar, grammarInfo, rule_pruning=0.0001, rule_smoothing=0.1)
            print("Rules in smoothed grammar: ", len(smGrammar.rules()))
            parser = GFParser(smGrammar)
        elif parsing_method == "filter-ctf":
            latentAnnotation[-1].project_weights(grammar, grammarInfo)
            parser = Coarse_to_fine_parser(
                grammar,
                latentAnnotation[-1],
                grammarInfo,
                derivation_manager.get_nonterminal_map(),
                base_parser_type=GFParser_k_best,
                k=k_best)
        else:
            raise Exception("unknown parsing method: %s" % parsing_method)
        do_parsing(parser, test_dsgs, term_labeling_token)
        del parser
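Example #7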
 def initialize_parser(self):
     self.parser = GFParser_k_best(self.base_grammar, k=self.k_best)