def _iter_heads_and_labels(tree):
    """Yield ``(position, head, label)`` for every node in ``tree.id_yield()``.

    ``position`` is the 0-based index of the node in the yield, ``head`` is
    the 1-based yield position of the node's parent (``0`` if the node has no
    parent) and ``label`` is the ``deprel()`` of the node's token.
    """
    id_yield = tree.id_yield()
    # First-occurrence lookup table: same result as ``id_yield.index(...)``
    # but built once instead of rescanning the yield for every node.
    position_of = {}
    for position, node_id in enumerate(id_yield):
        position_of.setdefault(node_id, position)

    for position, node_id in enumerate(id_yield):
        parent_id = tree.parent(node_id)
        if parent_id is None:
            assert node_id in tree.root
            head = 0
        else:
            head = position_of[parent_id] + 1
        yield position, head, tree.node_token(node_id).deprel()


def build_score_validator(baseline_grammar, grammarInfo, nont_map,
                          storageManager, term_labelling, parser,
                          corpus_validation, validationMethod):
    """Build a ``PyCandidateScoreValidator`` from k-best parses of a corpus.

    For every tree in ``corpus_validation`` the sentence is parsed with
    ``parser``, the k-best derivations are converted to hypergraphs, and each
    derivation is scored against the gold tree by counting matching
    heads/labels per yield position.

    :param baseline_grammar: grammar handed to ``PyDerivationManager``
    :param grammarInfo: passed through to ``PyCandidateScoreValidator``
    :param nont_map: passed through to ``PyDerivationManager``
    :param storageManager: passed through to ``PyCandidateScoreValidator``
    :param term_labelling: provides ``prepare_parser_input`` for the token
        yield of each gold tree
    :param parser: parser offering ``set_input``, ``parse``,
        ``k_best_derivation_trees`` and ``clear``
    :param corpus_validation: corpus offering ``get_trees()``
    :param validationMethod: ``"LAS"`` (head and label match), ``"UAS"``
        (head match) or ``"LAC"`` (label match)
    :return: the populated validator
    """
    validator = PyCandidateScoreValidator(grammarInfo, storageManager,
                                          validationMethod)

    tree_count = 0
    der_count = 0
    for gold_tree in corpus_validation.get_trees():
        tree_count += 1
        parser.set_input(
            term_labelling.prepare_parser_input(gold_tree.token_yield()))
        parser.parse()

        # Materialise the k-best list once so that the hypergraphs stored in
        # the manager and the scores computed below refer to the very same
        # derivations, in the same order.
        k_best = list(parser.k_best_derivation_trees())
        manager = PyDerivationManager(baseline_grammar, nont_map)
        manager.convert_derivations_to_hypergraphs(
            [derivation for _, derivation in k_best])

        gold_heads = {}
        gold_labels = {}
        for position, head, label in _iter_heads_and_labels(gold_tree):
            gold_heads[position] = head
            gold_labels[position] = label

        scores = []
        for _, derivation in k_best:
            der_count += 1
            h_tree = HybridTree()
            # Deep copy: the tokens are handed to the tree construction and
            # the gold tree's own tokens must stay untouched.
            cleaned_tokens = copy.deepcopy(gold_tree.full_token_yield())
            dcp = DCP_evaluator(derivation).getEvaluation()
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_conll_token)

            las, uas, lac = 0, 0, 0
            for position, head, label in _iter_heads_and_labels(h_tree):
                head_matches = gold_heads[position] == head
                label_matches = gold_labels[position] == label
                if head_matches:
                    uas += 1
                if label_matches:
                    lac += 1
                if head_matches and label_matches:
                    las += 1

            # NOTE(review): any other validationMethod appends nothing, so
            # ``scores`` ends up shorter than the derivation list -- confirm
            # whether that is intended.
            if validationMethod == "LAS":
                scores.append(las)
            elif validationMethod == "UAS":
                scores.append(uas)
            elif validationMethod == "LAC":
                scores.append(lac)

        # The best achievable score is one point per yield position.
        max_score = len(gold_tree.id_yield())
        validator.add_scored_candidates(manager, scores, max_score)
        print(tree_count, max_score, scores)
        parser.clear()

    # An empty corpus must not crash the summary with a division by zero.
    average = der_count * 1.0 / tree_count if tree_count else 0.0
    print("trees used for validation ", tree_count, "with",
          average, "derivations on average")

    return validator
# Example #2
# 0
def parse_with_pgf(grammar, forms, poss, bin):
    """Parse a POS sequence with a PGF grammar and return a dependency tree.

    The first parse offered by the concrete grammar
    ``bin + 'grammargfconcrete'`` is rendered with ``graphvizParseTree``,
    read back into a constituent ``HybridTree`` and then converted into a
    dependency ``HybridTree`` whose nodes are numbered ``1..n`` in sentence
    order.

    :param grammar: grammar whose ``languages`` mapping holds the concrete
        grammar
    :type grammar: PGF
    :param forms: word forms, aligned with ``poss``
    :param poss: POS tags; their escaped forms, joined by spaces, are the
        parser input
    :param bin: prefix of the concrete grammar's name
    :return: the dependency tree, or ``None`` if the parser yields no parse
        or raises ``pgf.ParseError``
    :rtype: HybridTree
    """
    lcfrs = grammar.languages[bin + 'grammargfconcrete']

    # e.g. "ADJD ADV _COMMA_ KOUS ADV PIS PROAV VVINF VMFIN _PUNCT_"
    sentence = ' '.join(map(escape, poss))

    try:
        parses = lcfrs.parse(sentence, n=1)
        _, expression = next(parses)
    except (StopIteration, pgf.ParseError):
        return None

    graph = lcfrs.graphvizParseTree(expression)
    assert isinstance(graph, str)

    node_pattern = re.compile(r'^\s*(n\d+)\[label="([^\s]+)"\]\s*$')
    edge_pattern = re.compile(r'^  (n\d+) -- (n\d+)\s*$')
    split_label_pattern = re.compile(r'\d+X\d+$')

    # Step 1: rebuild the constituent tree from the graphviz description.
    tree = HybridTree()
    terminal_index = 0
    for line in graph.splitlines():
        node_match = node_pattern.search(line)
        if node_match:
            node_id, label = node_match.groups()
            # Node numbers >= 100000 are treated as terminals.
            # NOTE(review): presumably the numbering scheme used by GF's
            # graphviz output for leaves -- confirm.
            if int(node_id[1:]) >= 100000:
                assert escape(poss[terminal_index]) == label
                tree.add_node(
                    node_id,
                    construct_constituent_token(form=forms[terminal_index],
                                                pos=poss[terminal_index],
                                                terminal=True), True)
                terminal_index += 1
            else:
                tree.add_node(
                    node_id,
                    construct_constituent_token(form=label,
                                                pos='_',
                                                terminal=False), False)
            if label == 'VROOT1':
                tree.add_to_root(node_id)
            continue

        edge_match = edge_pattern.search(line)
        if edge_match:
            parent, child = edge_match.groups()
            tree.add_child(parent, child)

    assert poss == [token.pos() for token in tree.token_yield()]

    # Step 2: create one dependency node per terminal.  Each terminal is the
    # head of the closest ancestor (starting at its grandparent) whose
    # category does not end in ``<digits>X<digits>``; that ancestor's
    # category also provides the edge label.
    dep_tree = HybridTree()
    head_table = {}  # constituent node -> 1-based position of its head
    attachment_point = {}  # terminal node -> constituent node it heads
    for position, node in enumerate(tree.id_yield(), start=1):
        token = tree.node_token(node)
        dep_token = construct_conll_token(token.form(), un_escape(token.pos()))
        current = tree.parent(tree.parent(node))
        while current:
            current_label = tree.node_token(current).category()
            if not split_label_pattern.search(current_label):
                edge_label = un_escape(current_label)
                if edge_label == 'TOP1':
                    edge_label = 'ROOT1'
                # Drop the last character of the category.
                # NOTE(review): presumably the trailing digit, as in
                # 'TOP1' / 'ROOT1' -- confirm.
                dep_token.set_edge_label(edge_label[:-1])
                head_table[current] = position
                attachment_point[node] = current
                break
            current = tree.parent(current)
        dep_tree.add_node(position, dep_token, order=True)

    # Step 3: attach every dependency node to the head of the closest
    # ancestor (above its attachment point) that has one; nodes without such
    # an ancestor become roots.
    for node, dep_node in zip(tree.id_yield(), dep_tree.id_yield()):
        ancestor = tree.parent(attachment_point.get(node))
        while ancestor:
            head = head_table.get(ancestor)
            if head:
                dep_tree.add_child(head, dep_node)
                break
            ancestor = tree.parent(ancestor)
        if not ancestor:
            dep_tree.add_to_root(dep_node)

    return dep_tree