def __projection_based_derivation_tree(self, la, variational=False, op=prod):
        manager = PyDerivationManager(self.grammar, self.nontMap)
        derivations = [der for _, der in self.base_parser.k_best_derivation_trees()]
        manager.convert_derivations_to_hypergraphs(derivations)
        manager.set_io_cycle_limit(200)
        manager.set_io_precision(0.000001)
        self.debug = False      # force the flags for this call, overriding any configured values
        self.log_mode = True
        edge_weights = py_edge_weight_projection(la, manager, variational=variational, debug=self.debug,
                                                 log_mode=self.log_mode)
        der = manager.viterbi_derivation(0, edge_weights, self.grammar, op=op, log_mode=self.log_mode)
        if der is None:
            if True or self.debug:  # 'True or' keeps these diagnostics on regardless of self.debug
                nans = 0
                infs = 0
                zeros = 0
                for weight in edge_weights:
                    if math.isnan(weight):
                        nans += 1
                    if math.isinf(weight):
                        infs += 1
                    if weight == 0.0:
                        zeros += 1
                print("[", len(edge_weights), nans, infs, zeros, "]")
                if len(edge_weights) < 200:
                    print("orig:", edge_weights)
                    edge_weights = py_edge_weight_projection(la, manager, variational=variational, debug=True, log_mode=self.log_mode)
                    print("1:", edge_weights)
                    edge_weights = py_edge_weight_projection(la, manager, variational=variational, debug=True,
                                                             log_mode=self.log_mode)
                    print("2:", edge_weights)

            print("p", end="")
            _, der = next(self.k_best_derivation_trees())
        return der
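
# The failure branch above counts NaN, infinite, and zero entries in the edge-weight
# vector to diagnose a failed projection. A minimal, standalone sketch of that
# diagnostic; summarize_edge_weights is our own name, not part of the library.
import math

def summarize_edge_weights(edge_weights):
    """Return (total, nans, infs, zeros) for a vector of edge weights."""
    nans = sum(1 for w in edge_weights if math.isnan(w))
    infs = sum(1 for w in edge_weights if math.isinf(w))
    zeros = sum(1 for w in edge_weights if w == 0.0)
    return len(edge_weights), nans, infs, zeros

# Example: a projection that produced unusable weights.
# summarize_edge_weights([0.5, float("nan"), float("inf"), 0.0]) == (4, 1, 1, 1)
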
    def build_score_validator(self, resource):
        self.organizer.validator = PyCandidateScoreValidator(
            self.organizer.grammarInfo, self.organizer.storageManager,
            self.score_name)

        corpus_validation = self.read_corpus(resource)
        obj_count = 0
        der_count = 0
        timeout = False

        if self.parsing_timeout:
            timeout_manager = multiprocessing.Manager()
            return_dict = timeout_manager.dict()

        for gold in corpus_validation:
            obj_count += 1
            self.parser.set_input(self.parsing_preprocess(gold))

            if self.parsing_timeout:
                timeout, derivations_ = self._compute_derivations_with_timeout(
                    return_dict)
                derivations = list(map(lambda x: x[1], derivations_))
            else:
                self.parser.parse()
                derivations = list(
                    map(lambda x: x[1], self.parser.k_best_derivation_trees()))

            manager = PyDerivationManager(self.base_grammar,
                                          self.organizer.nonterminal_map)
            manager.convert_derivations_to_hypergraphs(derivations)
            scores = []

            # derivations = self.parser.k_best_derivation_trees()
            for der in derivations:
                der_count += 1
                result = self.parsing_postprocess(self.obtain_sentence(gold),
                                                  der)
                score = self.score_object(result, gold)
                scores.append(score)

            self.organizer.validator.add_scored_candidates(
                manager, scores, self.max_score)
            # print(obj_count, self.max_score, scores)
            token = 't' if timeout else ('.' if scores else '-')
            print(token, end='', file=self.logger)
            if scores:
                print(obj_count,
                      'max',
                      max(scores),
                      'firsts',
                      scores[0:10],
                      file=self.logger)
            else:
                print(obj_count, 'max 00.00', '[]', file=self.logger)
            self.parser.clear()
    def __projection_based_derivation_tree(self, la, variational=False, op=prod):
        if self.nontMap is None:
            print("A nonterminal map is required for weight projection based parsing!")
            return None
        manager = PyDerivationManager(self.grammar, self.nontMap)
        manager.convert_chart_to_hypergraph(self.chart, self.disco_grammar, debug=False)
        if self.grammarInfo is not None:
            assert manager.is_consistent_with_grammar(self.grammarInfo)
        manager.set_io_cycle_limit(200)
        manager.set_io_precision(0.000001)

        if not isinstance(la, list):
            la = [la]

        edge_weights = None

        for l in la:
            edge_weights_l = py_edge_weight_projection(l, manager, variational=variational, debug=self.debug,
                                                 log_mode=self.log_mode)
            if edge_weights is None:
                edge_weights = edge_weights_l
            else:
                if self.log_mode:
                    edge_weights = [w1 + w2 for w1, w2 in zip(edge_weights, edge_weights_l)]
                else:
                    edge_weights = [op(w1, w2) for w1, w2 in zip(edge_weights, edge_weights_l)]

        if self.debug:
            nans = 0
            infs = 0
            zeros = 0
            for weight in edge_weights:
                if math.isnan(weight):  # NaN never compares equal to itself, so use math.isnan
                    nans += 1
                if math.isinf(weight):
                    infs += 1
                if weight == 0.0:
                    zeros += 1
            print("[", len(edge_weights), nans, infs, zeros, "]")
            if len(edge_weights) < 100:
                print(edge_weights)

        der = manager.viterbi_derivation(0, edge_weights, self.grammar, op=op, log_mode=self.log_mode)
        if der is None:
            print("p", end="")
            der = self.latent_viterbi_derivation(debug=self.debug)
        if der is not None:
            der = LCFRSDerivationWrapper(der)
        if der is None:
            _, der = next(self.k_best_derivation_trees())
        return der
    def latent_viterbi_derivation(self, debug=False):
        manager = PyDerivationManager(self.grammar, self.nontMap)
        manager.convert_chart_to_hypergraph(self.chart, self.disco_grammar, debug=False)
        if debug:
            manager.serialize(b'/tmp/my_debug_hypergraph.hg')
        if isinstance(self.la, list):
            la = self.la[0]
        else:
            la = self.la
        vit_der = manager.latent_viterbi_derivation(0, la, self.grammar, debug=debug)
        # if len(self.input) < 15 and not debug:
        #     for weight, der in self.k_best_derivation_trees():
        #         if der != vit_der:
        #             print(weight, der, vit_der)
        #             vit_der2 = self.latent_viterbi_derivation(debug=True)
        #             print("vit2", vit_der2)
        #             if vit_der2 != vit_der:
        #                 print("first and second viterbi derivation differ")
        #             if vit_der2 == der:
        #                 print("second viterbi derivation = 1-best-disco-dop derivation")
        #         print("##############################", flush=True)
        #         break
        #         # raise Exception("too much to read")
        if vit_der is not None:
            vit_der = LCFRSDerivationWrapper(vit_der)
        return vit_der
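
# When several latent annotations are passed, the projection-based method above folds
# their edge-weight vectors component-wise: sums in log space, op (typically a product)
# otherwise. A standalone sketch of that combination step; combine_edge_weights and the
# use of operator.mul in place of prod are our assumptions, not the library's definitions.
from operator import mul

def combine_edge_weights(weight_vectors, op=mul, log_mode=False):
    """Fold several edge-weight vectors into one, component-wise."""
    combined = None
    for vec in weight_vectors:
        if combined is None:
            combined = list(vec)
        elif log_mode:
            combined = [w1 + w2 for w1, w2 in zip(combined, vec)]
        else:
            combined = [op(w1, w2) for w1, w2 in zip(combined, vec)]
    return combined

# Probability space multiplies, log space adds:
# combine_edge_weights([[0.5, 0.5], [0.5, 0.25]]) == [0.25, 0.125]
# combine_edge_weights([[-1.0], [-2.0]], log_mode=True) == [-3.0]
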
    def test_something(self):
        grammar, r1, r2 = self.build_grammar()
        nont_map = Enumerator()
        grammarInfo = PyGrammarInfo(grammar, nont_map)

        def w(x):
            return "S", x

        rtg = RTG(w(3))
        rtg.construct_and_add_rule(w(3), r1, [w(1), w(2)])
        rtg.construct_and_add_rule(w(3), r1, [w(2), w(1)])
        rtg.construct_and_add_rule(w(2), r1, [w(1), w(1)])
        rtg.construct_and_add_rule(w(1), r2, [])

        rtg2 = RTG(("A", 3))

        rtg3 = RTG(w(3))
        rtg3.construct_and_add_rule(w(3), r1, [w(1), w(2)])
        rtg3.construct_and_add_rule(w(3), r1, [w(2), w(1)])
        rtg3.construct_and_add_rule(w(2), r2, [w(1), w(1)])
        rtg3.construct_and_add_rule(w(1), r2, [])

        traces = PyDerivationManager(grammar, nont_map)
        traces.convert_rtgs_to_hypergraphs([rtg, rtg2, rtg3])

        self.assertTrue(
            traces.is_consistent_with_grammar(grammarInfo, traceId=0))
        self.assertFalse(
            traces.is_consistent_with_grammar(grammarInfo, traceId=1))
        self.assertFalse(
            traces.is_consistent_with_grammar(grammarInfo, traceId=2))
def main():
    # induce grammar from a corpus
    trees = parse_conll_corpus(train, False, limit_train)
    nonterminal_labelling = the_labeling_factory(
    ).create_simple_labeling_strategy("childtop", "deprel")
    term_labelling = the_terminal_labeling_factory().get_strategy('pos')
    start = 'START'
    recursive_partitioning = [cfg]
    _, grammar = induce_grammar(trees, nonterminal_labelling,
                                term_labelling.token_label,
                                recursive_partitioning, start)

    # compute some derivations
    derivations = obtain_derivations(grammar, term_labelling)

    # create derivation manager and add derivations
    manager = PyDerivationManager(grammar)
    manager.convert_derivations_to_hypergraphs(derivations)
    manager.serialize(b"/tmp/derivations.txt")

    # build and configure split/merge trainer and supplementary objects

    rule_to_nonterminals = []
    for i in range(0, len(grammar.rule_index())):
        rule = grammar.rule_index(i)
        nonts = [
            manager.get_nonterminal_map().object_index(rule.lhs().nont())
        ] + [
            manager.get_nonterminal_map().object_index(nont)
            for nont in rule.rhs()
        ]
        rule_to_nonterminals.append(nonts)

    grammarInfo = PyGrammarInfo(grammar, manager.get_nonterminal_map())
    storageManager = PyStorageManager()
    builder = PySplitMergeTrainerBuilder(manager, grammarInfo)
    builder.set_em_epochs(20)
    builder.set_percent_merger(60.0)

    splitMergeTrainer = builder.build()

    latentAnnotation = [
        build_PyLatentAnnotation_initial(grammar, grammarInfo, storageManager)
    ]

    for i in range(max_cycles + 1):
        latentAnnotation.append(
            splitMergeTrainer.split_merge_cycle(latentAnnotation[-1]))
        # pickle.dump(map(lambda la: la.serialize(), latentAnnotation), open(sm_info_path, 'wb'))
        smGrammar = build_sm_grammar(latentAnnotation[i],
                                     grammar,
                                     grammarInfo,
                                     rule_pruning=0.0001,
                                     rule_smoothing=0.01)
        print("Cycle: ", i, "Rules: ", len(smGrammar.rules()))

        if parsing:
            parser = GFParser(smGrammar)

            trees = parse_conll_corpus(test, False, limit_test)
            for tree in trees:
                parser.set_input(
                    term_labelling.prepare_parser_input(tree.token_yield()))
                parser.parse()
                if parser.recognized():
                    print(
                        derivation_to_hybrid_tree(
                            parser.best_derivation_tree(),
                            [token.pos() for token in tree.token_yield()],
                            [token.form() for token in tree.token_yield()],
                            construct_constituent_token))
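
# Several snippets use the nonterminal map as a bidirectional index
# (object_index: nonterminal -> int, index_object: int -> nonterminal).
# A tiny stand-in with the same two methods, for illustration only; it is not the
# library's Enumerator, and the real class may start counting at a different index.
class SimpleEnumerator:
    """Assigns consecutive indices to objects on first sight."""
    def __init__(self):
        self._obj_to_idx = {}
        self._idx_to_obj = []

    def object_index(self, obj):
        if obj not in self._obj_to_idx:
            self._obj_to_idx[obj] = len(self._idx_to_obj)
            self._idx_to_obj.append(obj)
        return self._obj_to_idx[obj]

    def index_object(self, idx):
        return self._idx_to_obj[idx]

# enum = SimpleEnumerator(); enum.object_index("S") == 0; enum.index_object(0) == "S"
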
    def test_individual_parsing_stages(self):
        grammar = self.build_grammar()

        for r in transform_grammar(grammar):
            pprint(r)

        rule_list = list(transform_grammar(grammar))
        pprint(rule_list)
        disco_grammar = Grammar(rule_list, start=grammar.start())
        print(disco_grammar)

        inp = ["a"] * 3
        estimates = 'SXlrgaps', getestimates(disco_grammar, 40, grammar.start())
        print(type(estimates))
        chart, msg = parse(inp, disco_grammar, estimates=estimates)
        print(chart)
        print(msg)
        chart.filter()
        print("filtered chart")
        print(disco_grammar.nonterminals)
        print(type(disco_grammar.nonterminals))

        print(chart)
        # print(help(chart))

        root = chart.root()
        print("root", root, type(root))
        print(chart.indices(root))
        print(chart.itemstr(root))
        print(chart.stats())
        print("root label", chart.label(root))
        print(root, chart.itemid1(chart.label(root), chart.indices(root)))
        for i in range(1, chart.numitems() + 1):
            print(i, chart.label(i), chart.indices(i), chart.numedges(i))
            if True or len(chart.indices(i)) > 1:  # 'True or' disables the filter, so every item is printed
                for edge_num in range(chart.numedges(i)):
                    edge = chart.getEdgeForItem(i, edge_num)
                    if isinstance(edge, tuple):
                        print("\t", disco_grammar.nonterminalstr(chart.label(i)) + "[" + str(i) + "]", "->", ' '.join([disco_grammar.nonterminalstr(chart.label(j)) + "[" + str(j) + "]" for j in [edge[1], edge[2]] if j != 0]))
                    else:
                        print("\t", disco_grammar.nonterminalstr(chart.label(i)) + "[" + str(i) + "]", "->", inp[edge])
        print(chart.getEdgeForItem(root, 0))
        # print(lazykbest(chart, 5))

        manager = PyDerivationManager(grammar)
        manager.convert_chart_to_hypergraph(chart, disco_grammar, debug=True)

        file = tempfile.mktemp()
        print(file)
        manager.serialize(bytes(file, encoding="utf-8"))

        gi = PyGrammarInfo(grammar, manager.get_nonterminal_map())
        sm = PyStorageManager()
        la = build_PyLatentAnnotation_initial(grammar, gi, sm)

        vec = py_edge_weight_projection(la, manager, variational=True, debug=True, log_mode=False)
        print(vec)
        self.assertEqual([1.0, 1.0, 1.0, 0.5, 0.5, 0.5, 0.5, 0.25, 0.25, 0.25, 0.25, 1.0], vec)

        vec = py_edge_weight_projection(la, manager, variational=False, debug=True, log_mode=False)
        print(vec)
        self.assertEqual([1.0, 1.0, 1.0, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 1.0], vec)

        der = manager.viterbi_derivation(0, vec, grammar)
        print(der)

        # print(disco_grammar.rulenos)
        # print(disco_grammar.numrules)
        # print(disco_grammar.lexicalbylhs)
        # print(disco_grammar.lexicalbyword)
        # print(disco_grammar.lexicalbynum)
        # print(disco_grammar.origrules, type(disco_grammar.origrules))
        # print(disco_grammar.numbinary)
        # print(disco_grammar.numunary)
        # print(disco_grammar.toid)
        # print(disco_grammar.tolabel)
        # print(disco_grammar.bitpar)
        # striplabelre = re.compile(r'-\d+$')
        # msg = disco_grammar.getmapping(None, None)
        # disco_grammar.getrulemapping(disco_grammar, striplabelre)
        # mapping = disco_grammar.rulemapping
        # print(mapping)
        # for idx, group in enumerate(mapping):
        #     print("Index", idx)
        #     for elem in group:
        #         print(grammar.rule_index(elem))

        # for _, item in zip(range(20), chart.parseforest):
        #     edge = chart.parseforest[item]
        #     print(item, item.binrepr(), item.__repr__(), item.lexidx())
        #     print(type(edge))
        for _ in range(5):
            vec2 = py_edge_weight_projection(la, manager, debug=True, log_mode=True)
            print(vec2)
    def test_negra_to_dag_parsing(self):
        names = list(map(str, [26954]))

        fd_, primary_file = tempfile.mkstemp(suffix='.export')
        with open(primary_file, mode='w') as pf:

            for s in names:
                dsg = tp.sentence_names_to_deep_syntax_graphs(
                    ["s" + s],
                    "res/tiger/tiger_s%s.xml" % s,
                    hold=False,
                    ignore_puntcuation=False)[0]
                dsg.set_label(dsg.label[1:])
                lines = np.serialize_hybrid_dag_to_negra(
                    [dsg], 0, 500, use_sentence_names=True)
                print(''.join(lines), file=pf)

        _, binarized_file = tempfile.mkstemp(suffix='.export')
        subprocess.call([
            "discodop", "treetransforms", "--binarize", "-v", "1", "-h", "1",
            primary_file, binarized_file
        ])

        print(primary_file)
        print(binarized_file)

        corpus = np.sentence_names_to_hybridtrees(names,
                                                  primary_file,
                                                  secedge=True)
        corpus2 = np.sentence_names_to_hybridtrees(names,
                                                   binarized_file,
                                                   secedge=True)
        dag = corpus[0]
        print(dag)

        assert isinstance(dag, HybridDag)
        self.assertEqual(8, len(dag.token_yield()))
        for token in dag.token_yield():
            print(token.form() + '/' + token.pos(), end=' ')
        print()

        dag_bin = corpus2[0]
        print(dag_bin)

        for token in dag_bin.token_yield():
            print(token.form() + '/' + token.pos(), end=' ')
        print()
        self.assertEqual(8, len(dag_bin.token_yield()))

        for node, token in zip(
                dag_bin.nodes(),
                list(map(str, map(dag_bin.node_token, dag_bin.nodes())))):
            print(node, token)

        print()
        print(top(dag_bin, {'500', '101', '102'}))
        self.assertSetEqual({'101', '500'}, top(dag_bin,
                                                {'500', '101', '102'}))
        print(bottom(dag_bin, {'500', '101', '102'}))
        self.assertSetEqual({'502'}, bottom(dag_bin, {'500', '101', '102'}))

        nont_labeling = BasicNonterminalLabeling()
        term_labeling = FormTerminals()  # PosTerminals()

        grammar = direct_extract_lcfrs_from_prebinarized_corpus(
            dag_bin, term_labeling, nont_labeling)
        # print(grammar)

        for rule in grammar.rules():
            print(rule.get_idx(), rule)

        print("Testing LCFRS parsing and DCP evaluation".center(80, '='))

        parser = LCFRS_parser(grammar)

        parser_input = term_labeling.prepare_parser_input(
            dag_bin.token_yield())
        print(parser_input)
        parser.set_input(parser_input)

        parser.parse()

        self.assertTrue(parser.recognized())

        der = parser.best_derivation_tree()
        print(der)

        dcp_term = DCP_evaluator(der).getEvaluation()

        print(dcp_term[0])

        dag_eval = HybridDag(dag_bin.sent_label())
        dcp_to_hybriddag(dag_eval,
                         dcp_term,
                         copy.deepcopy(dag_bin.token_yield()),
                         False,
                         construct_token=construct_constituent_token)

        print(dag_eval)
        for node in dag_eval.nodes():
            token = dag_eval.node_token(node)
            if token.type() == "CONSTITUENT-CATEGORY":
                label = token.category()
            elif token.type() == "CONSTITUENT-TERMINAL":
                label = token.form(), token.pos()

            print(node, label, dag_eval.children(node),
                  dag_eval.sec_children(node), dag_eval.sec_parents(node))

        lines = np.serialize_hybridtrees_to_negra([dag_eval],
                                                  1,
                                                  500,
                                                  use_sentence_names=True)
        for line in lines:
            print(line, end='')

        print()

        with open(primary_file) as pcf:
            for line in pcf:
                print(line, end='')

        print('Testing reduct computation with Schick parser'.center(80, '='))

        grammar_path = '/tmp/lcfrs_dcp_grammar.gr'
        derivation_manager = PyDerivationManager(grammar)

        with open(grammar_path, 'w') as grammar_file:
            nonterminal_enc, terminal_enc = linearize(
                grammar,
                nont_labeling,
                term_labeling,
                grammar_file,
                delimiter=' : ',
                nonterminal_encoder=derivation_manager.get_nonterminal_map())

        print(np.negra_to_json(dag, terminal_enc, term_labeling))
        json_data = np.export_corpus_to_json([dag], terminal_enc,
                                             term_labeling)

        corpus_path = '/tmp/json_dags.json'
        with open(corpus_path, 'w') as data_file:
            json.dump(json_data, data_file)

        reduct_dir = '/tmp/schick_parser_reducts'
        if os.path.isdir(reduct_dir):
            shutil.rmtree(reduct_dir)
        os.makedirs(reduct_dir)

        p = subprocess.Popen(
            [' '.join(["java", "-jar", os.path.join("util", SCHICK_PARSER_JAR),
                       'reduct', '-g', grammar_path, '-t', corpus_path,
                       "--input-format", "json", "-o", reduct_dir])],
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT)
        print("stdout", p.stdout.name)

        while True:
            nextline = p.stdout.readline()
            if nextline == b'' and p.poll() is not None:
                break
            print(nextline.decode('unicode_escape'), end='')
            # sys.stdout.write(nextline)
            # sys.stdout.flush()

        p.wait()
        p.stdout.close()
        self.assertEqual(0, p.returncode)
        rtgs = []

        def decode_nonterminals(s):
            return derivation_manager.get_nonterminal_map().index_object(
                int(s))

        for i in range(1, len(corpus) + 1):
            rtgs.append(
                read_rtg(os.path.join(reduct_dir,
                                      str(i) + '.gra'),
                         symbol_offset=-1,
                         rule_prefix='r',
                         process_nonterminal=decode_nonterminals))

        print("Reduct RTG")
        for rule in rtgs[0].rules:
            print(rule.lhs, "->", rule.symbol, rule.rhs)

        derivation_manager.get_nonterminal_map().print_index()
        derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
        derivation_manager.serialize(
            bytes('/tmp/reduct_manager.trace', encoding='utf8'))
        derivations = [
            LCFRSDerivationWrapper(der)
            for der in derivation_manager.enumerate_derivations(0, grammar)
        ]
        self.assertGreaterEqual(len(derivations), 1)

        if len(derivations) >= 1:
            print("Sentence", i)
            for der in derivations:
                print(der)
                self.assertTrue(
                    der.check_integrity_recursive(der.root_id(),
                                                  grammar.start()))
def build_score_validator(baseline_grammar, grammarInfo, nont_map,
                          storageManager, term_labelling, parser,
                          corpus_validation, validationMethod):
    validator = PyCandidateScoreValidator(grammarInfo, storageManager,
                                          validationMethod)

    # parser = GFParser(baseline_grammar)
    tree_count = 0
    der_count = 0
    for gold_tree in corpus_validation:
        tree_count += 1
        parser.set_input(
            term_labelling.prepare_parser_input(gold_tree.token_yield()))
        parser.parse()
        derivations = [der for _, der in parser.k_best_derivation_trees()]
        manager = PyDerivationManager(baseline_grammar, nont_map)
        manager.convert_derivations_to_hypergraphs(derivations)
        scores = []

        relevant = set([tuple(t) for t in gold_tree.labelled_spans()])

        for der in derivations:
            der_count += 1

            h_tree = ConstituentTree()
            cleaned_tokens = copy.deepcopy(gold_tree.full_token_yield())
            dcp = DCP_evaluator(der).getEvaluation()
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_constituent_token)

            retrieved = set([tuple(t) for t in h_tree.labelled_spans()])
            inters = retrieved & relevant

            # in case of parse failure there are two options here:
            #   - parse failure -> no spans at all, thus precision = 1
            #   - parse failure -> a dummy tree with all spans wrong, thus precision = 0
            # the code below takes the second option: an empty retrieved set scores 0

            precision = 1.0 * len(inters) / len(retrieved) \
                if len(retrieved) > 0 else 0
            recall = 1.0 * len(inters) / len(relevant) \
                if len(relevant) > 0 else 0
            fmeasure = 2.0 * precision * recall / (precision + recall) \
                if precision + recall > 0 else 0

            if validationMethod == "F1":
                scores.append(fmeasure)
            elif validationMethod == "Precision":
                scores.append(precision)
            elif validationMethod == "Recall":
                scores.append(recall)
            else:
                raise ValueError("unknown validation method: " + validationMethod)

        validator.add_scored_candidates(manager, scores,
                                        1.0 if len(relevant) > 0 else 0.0)
        # print(tree_count, scores)
        parser.clear()

    print("trees used for validation ", tree_count, "with",
          der_count * 1.0 / tree_count, "derivations on average")

    return validator
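
# The per-derivation score above is span-based precision/recall/F1. A standalone
# sketch of that computation with the same conventions (an empty retrieved set
# scores 0); span_prf is a hypothetical helper name.
def span_prf(retrieved, relevant):
    """Precision, recall, F1 over sets of labelled spans."""
    inters = retrieved & relevant
    precision = len(inters) / len(retrieved) if retrieved else 0.0
    recall = len(inters) / len(relevant) if relevant else 0.0
    fmeasure = 2.0 * precision * recall / (precision + recall) \
        if precision + recall > 0 else 0.0
    return precision, recall, fmeasure

# One of two retrieved spans is correct, one of two relevant spans is found:
# span_prf({("NP", 0, 2), ("VP", 2, 5)}, {("NP", 0, 2), ("PP", 3, 5)}) == (0.5, 0.5, 0.5)
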
def build_score_validator(baseline_grammar, grammarInfo, nont_map,
                          storageManager, term_labelling, parser,
                          corpus_validation, validationMethod):
    validator = PyCandidateScoreValidator(grammarInfo, storageManager,
                                          validationMethod)

    # parser = GFParser(baseline_grammar)
    tree_count = 0
    der_count = 0
    for gold_tree in corpus_validation.get_trees():
        tree_count += 1
        parser.set_input(
            term_labelling.prepare_parser_input(gold_tree.token_yield()))
        parser.parse()
        derivations = map(lambda x: x[1], parser.k_best_derivation_trees())
        manager = PyDerivationManager(baseline_grammar, nont_map)
        manager.convert_derivations_to_hypergraphs(derivations)
        scores = []

        gold_labels = {}
        gold_heads = {}

        for position, id in enumerate(gold_tree.id_yield()):
            parent_id = gold_tree.parent(id)
            gold_labels[position] = gold_tree.node_token(id).deprel()
            if parent_id is None:
                assert id in gold_tree.root
                gold_heads[position] = 0
            else:
                gold_heads[position] = gold_tree.id_yield().index(
                    parent_id) + 1

        derivations = parser.k_best_derivation_trees()
        for _, der in derivations:
            der_count += 1
            h_tree = HybridTree()
            cleaned_tokens = copy.deepcopy(gold_tree.full_token_yield())
            dcp = DCP_evaluator(der).getEvaluation()
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_conll_token)

            las, uas, lac = 0, 0, 0
            for position, id in enumerate(h_tree.id_yield()):
                parent_id = h_tree.parent(id)
                if parent_id is None:
                    assert id in h_tree.root
                    head = 0
                else:
                    head = h_tree.id_yield().index(parent_id) + 1
                label = h_tree.node_token(id).deprel()

                if gold_heads[position] == head:
                    uas += 1
                if gold_labels[position] == label:
                    lac += 1
                if gold_heads[position] == head and gold_labels[
                        position] == label:
                    las += 1

            if validationMethod == "LAS":
                scores.append(las)
            elif validationMethod == "UAS":
                scores.append(uas)
            elif validationMethod == "LAC":
                scores.append(lac)

        max_score = len(gold_tree.id_yield())
        validator.add_scored_candidates(manager, scores, max_score)
        print(tree_count, max_score, scores)
        parser.clear()

    print("trees used for validation ", tree_count, "with",
          der_count * 1.0 / tree_count, "derivations on average")

    return validator
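
# The dependency variant above counts labelled (LAS), unlabelled (UAS), and
# label-only (LAC) matches per token. A standalone sketch of that per-sentence
# count; attachment_counts is a hypothetical helper over position-indexed dicts.
def attachment_counts(gold_heads, gold_labels, pred_heads, pred_labels):
    """Return (las, uas, lac) counts for one sentence."""
    las = uas = lac = 0
    for pos in gold_heads:
        head_ok = gold_heads[pos] == pred_heads.get(pos)
        label_ok = gold_labels[pos] == pred_labels.get(pos)
        uas += head_ok
        lac += label_ok
        las += head_ok and label_ok
    return las, uas, lac

# Token 0: correct head, wrong label; token 1: fully correct.
# attachment_counts({0: 2, 1: 0}, {0: "SB", 1: "ROOT"},
#                   {0: 2, 1: 0}, {0: "OA", 1: "ROOT"}) == (1, 2, 1)
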
    def test_json_corpus_grammar_export(self):
        start = 1
        stop = 50
        # path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
        path = "res/tiger/tiger_8000.xml"
        exclude = []
        dsgs = sentence_names_to_deep_syntax_graphs(
            ['s' + str(i) for i in range(start, stop + 1) if i not in exclude],
            path,
            hold=False)

        rec_part_strategy = the_recursive_partitioning_factory().get_partitioning('cfg')[0]

        def label_edge(edge):
            if isinstance(edge.label, ConstituentTerminal):
                return edge.label.pos()
            else:
                return edge.label

        nonterminal_labeling = lambda nodes, dsg: simple_labeling(nodes, dsg, label_edge)

        term_labeling_token = PosTerminals()

        def term_labeling(token):
            if isinstance(token, ConstituentTerminal):
                return term_labeling_token.token_label(token)
            else:
                return token

        grammar = induction_on_a_corpus(dsgs, rec_part_strategy, nonterminal_labeling, term_labeling)
        grammar.make_proper()

        terminals = Enumerator()

        data = export_dog_grammar_to_json(grammar, terminals)
        grammar_path = '/tmp/json_grammar.json'
        with open(grammar_path, 'w') as file:
            json.dump(data, file)

        corpus_path = '/tmp/json_corpus.json'
        with open(corpus_path, 'w') as file:
            json.dump(export_corpus_to_json(dsgs, terminals, terminal_labeling=term_labeling), file)

        with open('/tmp/enumerator.enum', 'w') as file:
            terminals.print_index(file)

        reduct_dir = '/tmp/reduct_grammars'
        if os.path.isdir(reduct_dir):
            shutil.rmtree(reduct_dir)
        os.makedirs(reduct_dir)
        p = subprocess.Popen([' '.join(
            ["java", "-jar", os.path.join("util", SCHICK_PARSER_JAR), 'dog-reduct', '-g', grammar_path, '-t',
             corpus_path, "-o", reduct_dir])], shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

        print("stdout", p.stdout.name)

        while True:
            nextline = p.stdout.readline()
            if nextline == b'' and p.poll() is not None:
                break
            print(nextline.decode('unicode_escape'), end='')
            # sys.stdout.write(nextline)
            # sys.stdout.flush()

        p.wait()
        p.stdout.close()
        self.assertEqual(0, p.returncode)

        rtgs = []
        for i in range(1, len(dsgs) + 1):
            rtgs.append(read_rtg('/tmp/reduct_grammars/' + str(i) + '.gra'))

        derivation_manager = PyDerivationManager(grammar)
        derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
        derivation_manager.serialize(bytes('/tmp/reduct_manager.trace', encoding='utf8'))

        f = lambda token: token.pos() if isinstance(token, ConstituentTerminal) else token

        for i, (rtg, dsg) in enumerate(zip(rtgs, dsgs)):
            derivations = [LCFRSDerivationWrapper(der) for der in derivation_manager.enumerate_derivations(i, grammar)]
            self.assertGreaterEqual(len(derivations), 1)
            if len(derivations) > 1:
                print("Sentence", i)
                for der in derivations:
                    print(der)

            for der in derivations:
                dog, sync = dog_evaluation(der)
                dsg2 = DeepSyntaxGraph(der.compute_yield(), dog, sync)
                dsg.dog.project_labels(f)
                dsg.sentence = list(map(f, dsg.sentence))
                self.assertEqual(dsg.sentence, dsg2.sentence)
                morphs = dsg.dog.compute_isomorphism(dsg2.dog)
                self.assertFalse(morphs is None)
                self.assertListEqual([[morphs[0].get(node, node) for node in syncs]
                                      for syncs in dsg.synchronization], dsg2.synchronization)
        pass
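
# Both reduct computations shell out to the Schick parser and stream its output
# line by line. A reusable sketch of that pattern using only the standard library;
# stream_subprocess is our own helper name.
import subprocess

def stream_subprocess(cmd):
    """Run a shell command, echo its byte output line by line, return the exit code."""
    p = subprocess.Popen(cmd, shell=True,
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    while True:
        line = p.stdout.readline()
        if line == b'' and p.poll() is not None:
            break
        print(line.decode('unicode_escape'), end='')
    p.wait()
    p.stdout.close()
    return p.returncode

# e.g. returncode = stream_subprocess("java -jar util/<SCHICK_PARSER_JAR> reduct -g ... -t ... -o ...")
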
def run_experiment(rec_part_strategy,
                   nonterminal_labeling,
                   exp,
                   reorder_children,
                   binarize=True):
    start = 1
    stop = 7000

    test_start = 7001
    test_stop = 7200

    # path = "res/tiger/tiger_release_aug07.corrected.16012013.utf8.xml"
    corpus_path = "res/tiger/tiger_8000.xml"
    exclude = []
    train_dsgs = sentence_names_to_deep_syntax_graphs(
        ['s' + str(i) for i in range(start, stop + 1) if i not in exclude],
        corpus_path,
        hold=False,
        reorder_children=reorder_children)
    test_dsgs = sentence_names_to_deep_syntax_graphs(
        [
            's' + str(i)
            for i in range(test_start, test_stop + 1) if i not in exclude
        ],
        corpus_path,
        hold=False,
        reorder_children=reorder_children)

    # Grammar induction
    term_labeling_token = PosTerminals()

    def term_labeling(token):
        if isinstance(token, ConstituentTerminal):
            return term_labeling_token.token_label(token)
        else:
            return token

    if binarize:

        def modify_token(token):
            if isinstance(token, ConstituentCategory):
                token_new = deepcopy(token)
                token_new.set_category(token.category() + '-BAR')
                return token_new
            elif isinstance(token, str):
                return token + '-BAR'
            else:
                assert False

        train_dsgs = [
            dsg.binarize(bin_modifier=modify_token) for dsg in train_dsgs
        ]

        def is_bin(token):
            if isinstance(token, ConstituentCategory):
                if token.category().endswith('-BAR'):
                    return True
            elif isinstance(token, str):
                if token.endswith('-BAR'):
                    return True
            return False

        def debinarize(dsg):
            return dsg.debinarize(is_bin=is_bin)

    else:
        def debinarize(dsg):  # identity; the builtin id() would return an int, not the graph
            return dsg

    grammar = induction_on_a_corpus(train_dsgs, rec_part_strategy,
                                    nonterminal_labeling, term_labeling)
    grammar.make_proper()

    print("Nonterminals", len(grammar.nonts()), "Rules", len(grammar.rules()))

    parser = GFParser_k_best(grammar, k=500)
    # NOTE: early return for this experiment; the reduct/EM/split-merge code below is never reached
    return do_parsing(parser,
                      test_dsgs,
                      term_labeling_token,
                      oracle=True,
                      debinarize=debinarize)

    # Compute reducts, i.e., intersect grammar with each training dsg
    basedir = path.join('/tmp/dog_experiments', 'exp' + str(exp))
    reduct_dir = path.join(basedir, 'reduct_grammars')

    terminal_map = Enumerator()
    if not os.path.isdir(basedir):
        os.makedirs(basedir)
    data = export_dog_grammar_to_json(grammar, terminal_map)
    grammar_path = path.join(basedir, 'grammar.json')
    with open(grammar_path, 'w') as file:
        json.dump(data, file)

    corpus_path = path.join(basedir, 'corpus.json')
    with open(corpus_path, 'w') as file:
        json.dump(
            export_corpus_to_json(train_dsgs,
                                  terminal_map,
                                  terminal_labeling=term_labeling), file)

    with open(path.join(basedir, 'enumerator.enum'), 'w') as file:
        terminal_map.print_index(file)

    if os.path.isdir(reduct_dir):
        shutil.rmtree(reduct_dir)
    os.makedirs(reduct_dir)
    p = subprocess.Popen([
        ' '.join([
            "java", "-jar",
            os.path.join("util", SCHICK_PARSER_JAR), 'dog-reduct', '-g',
            grammar_path, '-t', corpus_path, "-o", reduct_dir
        ])
    ],
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)

    while True:
        nextline = p.stdout.readline()
        if nextline == b'' and p.poll() is not None:  # stdout delivers bytes, so compare against b''
            break
        print(nextline.decode('unicode_escape'), end='')

    p.wait()
    p.stdout.close()

    rtgs = []
    for i in range(1, len(train_dsgs) + 1):
        rtgs.append(read_rtg(path.join(reduct_dir, str(i) + '.gra')))

    derivation_manager = PyDerivationManager(grammar)
    derivation_manager.convert_rtgs_to_hypergraphs(rtgs)
    derivation_manager.serialize(bytes(path.join(basedir, 'reduct_manager.trace'), encoding='utf8'))

    # Training
    ## prepare EM training
    em_epochs = 20
    seed = 0
    smoothing_factor = 0.01
    split_randomization = 0.01
    sm_cycles = 2
    merge_percentage = 50.0
    grammarInfo = PyGrammarInfo(grammar,
                                derivation_manager.get_nonterminal_map())
    storageManager = PyStorageManager()

    em_builder = PySplitMergeTrainerBuilder(derivation_manager, grammarInfo)
    em_builder.set_em_epochs(em_epochs)
    em_builder.set_simple_expector(threads=THREADS)
    emTrainer = em_builder.build()

    # randomize initial weights and do em training
    la_no_splits = build_PyLatentAnnotation_initial(grammar, grammarInfo,
                                                    storageManager)
    la_no_splits.add_random_noise(seed=seed)
    emTrainer.em_train(la_no_splits)
    la_no_splits.project_weights(grammar, grammarInfo)

    do_parsing(CFGParser(grammar), test_dsgs, term_labeling_token)
    return  # NOTE: early return; the split/merge training below is currently disabled
    ## prepare SM training
    builder = PySplitMergeTrainerBuilder(derivation_manager, grammarInfo)
    builder.set_em_epochs(em_epochs)
    builder.set_split_randomization(1.0, seed + 1)
    builder.set_simple_expector(threads=THREADS)
    builder.set_smoothing_factor(smoothingFactor=smoothing_factor)
    builder.set_split_randomization(percent=split_randomization)
    # builder.set_scc_merger(-0.2)
    builder.set_percent_merger(merge_percentage)
    splitMergeTrainer = builder.build()

    # splitMergeTrainer.setMaxDrops(validationDropIterations, mode="smoothing")
    splitMergeTrainer.setEMepochs(em_epochs, mode="smoothing")

    # set initial latent annotation
    latentAnnotation = [la_no_splits]

    # carry out split/merge training and do parsing
    parsing_method = "filter-ctf"
    # parsing_method = "single-best-annotation"
    k_best = 50
    for i in range(1, sm_cycles + 1):
        splitMergeTrainer.reset_random_seed(seed + i + 1)
        latentAnnotation.append(
            splitMergeTrainer.split_merge_cycle(latentAnnotation[-1]))
        print("Cycle: ", i)
        if parsing_method == "single-best-annotation":
            smGrammar = latentAnnotation[i].build_sm_grammar(
                grammar, grammarInfo, rule_pruning=0.0001, rule_smoothing=0.1)
            print("Rules in smoothed grammar: ", len(smGrammar.rules()))
            parser = GFParser(smGrammar)
        elif parsing_method == "filter-ctf":
            latentAnnotation[-1].project_weights(grammar, grammarInfo)
            parser = Coarse_to_fine_parser(
                grammar,
                latentAnnotation[-1],
                grammarInfo,
                derivation_manager.get_nonterminal_map(),
                base_parser_type=GFParser_k_best,
                k=k_best)
        else:
            raise Exception("unknown parsing method: " + parsing_method)
        do_parsing(parser, test_dsgs, term_labeling_token)
        del parser