Beispiel #1
0
def labels_precision_recall(system_output_path,
                            gold_path,
                            labels=("nsubj", "nsubjpass"),
                            fine_grained_deprels=True):
    """Compute precision and recall restricted to a set of dependency labels.

    Trees are read from both paths with ``udtree.from_files`` and compared
    pairwise with ``match_tree_attachments`` (labeled matching is always on).

    Args:
        system_output_path: Path(s) of the system output, passed to
            ``udtree.from_files``.
        gold_path: Path(s) of the gold standard, passed to
            ``udtree.from_files``.
        labels: Container of labels to evaluate; only membership tests
            (``in``) are performed on it. The default is an immutable tuple
            so the default value can never be modified between calls.
        fine_grained_deprels: Forwarded unchanged to
            ``match_tree_attachments``.

    Returns:
        A ``(precision, recall)`` tuple. Either value is ``NaN`` when its
        denominator is zero (no system predictions / no gold instances for
        the selected labels).
    """
    system = udtree.from_files(system_output_path)
    gold = udtree.from_files(gold_path)
    system_correct, system_incorrect, gold_count = 0, 0, 0
    # NOTE(review): zip() stops at the shorter input, so a system file with
    # fewer/more trees than the gold file is silently truncated.
    for system_tree, gold_tree in zip(system, gold):
        tree_correct, tree_incorrect = match_tree_attachments(
            system_tree, gold_tree, True,
            fine_grained_deprels=fine_grained_deprels)

        for _, _, system_label, _, _ in tree_correct:
            if system_label in labels:
                # A correct attachment with a selected label counts both as
                # a hit and as one gold instance.
                system_correct += 1
                gold_count += 1

        for _, _, system_label, _, gold_label in tree_incorrect:
            # For a wrong attachment the two labels are checked
            # independently: the gold side feeds recall's denominator, the
            # system side feeds precision's denominator.
            if gold_label in labels:
                gold_count += 1
            if system_label in labels:
                system_incorrect += 1

    system_total = system_correct + system_incorrect
    if system_total == 0:
        precision = float("NaN")
    else:
        precision = system_correct / system_total

    if gold_count == 0:
        recall = float("NaN")
    else:
        recall = system_correct / gold_count

    return precision, recall
Beispiel #2
0
def weighted_las(system_output_path, gold_path, weights):
    """Return a label-weighted attachment score.

    Each attachment reported by ``match_tree_attachments`` (labeled matching,
    ``fine_grained_deprels=False``) contributes ``weights[gold_label]``,
    where ``gold_label`` is the fifth field of the attachment record.

    Args:
        system_output_path: Path(s) of the system output, passed to
            ``udtree.from_files``.
        gold_path: Path(s) of the gold standard, passed to
            ``udtree.from_files``.
        weights: Object indexed by gold label (``weights[gold_label]``); a
            missing label propagates whatever error that lookup raises.

    Returns:
        Weight of correct attachments divided by the total weight, or
        ``NaN`` when the total weight is zero.
    """
    predicted_trees = udtree.from_files(system_output_path)
    reference_trees = udtree.from_files(gold_path)

    hit_weight = 0
    miss_weight = 0
    for predicted, reference in zip(predicted_trees, reference_trees):
        matched = match_tree_attachments(predicted, reference, True,
                                         fine_grained_deprels=False)
        hits, misses = matched

        # Accumulate one weight at a time so the summation order is fixed.
        for _, _, _, _, gold_label in hits:
            hit_weight += weights[gold_label]
        for _, _, _, _, gold_label in misses:
            miss_weight += weights[gold_label]

    total_weight = hit_weight + miss_weight
    if total_weight == 0:
        return float("NaN")
    return hit_weight / total_weight
Beispiel #3
0
def attachment_score(system_output_path, gold_path, labeled=True, fine_grained_deprels=True, include_punct=False):
    """Return the share of attachments that ``match_tree_attachments`` reports as correct.

    Args:
        system_output_path: Path(s) of the system output, passed to
            ``udtree.from_files``.
        gold_path: Path(s) of the gold standard, passed to
            ``udtree.from_files`` (and to ``is_only_punctuation`` when
            punctuation is excluded).
        labeled: Forwarded as the third positional argument of
            ``match_tree_attachments``.
        fine_grained_deprels: Forwarded unchanged to
            ``match_tree_attachments``.
        include_punct: When false, the deprels returned by
            ``is_only_punctuation(gold_path)`` plus ``'punct'`` are passed
            as ``ignore_deprels``; when true, ``ignore_deprels`` is ``None``.

    Returns:
        ``correct / (correct + incorrect)``, or ``NaN`` if nothing was
        counted.
    """
    if include_punct:
        ignored_deprels = None
    else:
        ignored_deprels = is_only_punctuation(gold_path).union({'punct'})

    predicted_trees = udtree.from_files(system_output_path)
    reference_trees = udtree.from_files(gold_path)

    n_hits = 0
    n_misses = 0
    for predicted, reference in zip(predicted_trees, reference_trees):
        hits, misses = match_tree_attachments(
            predicted, reference, labeled,
            fine_grained_deprels=fine_grained_deprels,
            ignore_deprels=ignored_deprels)
        n_hits += len(hits)
        n_misses += len(misses)

    n_total = n_hits + n_misses
    return float("NaN") if n_total == 0 else n_hits / n_total
Beispiel #4
0
def root_distance_las(system_output_path, gold_path, include_punct=False):
    """Return an attachment score weighted by ``1 / root_distance``.

    Every attachment reported by ``match_tree_attachments`` (labeled
    matching, ``fine_grained_deprels=False``) contributes
    ``1 / root_distance(gold_tree, index)``, where ``index`` is the first
    field of the attachment record.

    Args:
        system_output_path: Path(s) of the system output, passed to
            ``udtree.from_files``.
        gold_path: Path(s) of the gold standard, passed to
            ``udtree.from_files`` (and to ``is_only_punctuation`` when
            punctuation is excluded).
        include_punct: When false, the deprels returned by
            ``is_only_punctuation(gold_path)`` plus ``'punct'`` are passed
            as ``ignore_deprels``; when true, ``ignore_deprels`` is ``None``.

    Returns:
        Weighted correct mass divided by the total weighted mass, or ``NaN``
        when the total is zero.

    Raises:
        ZeroDivisionError: If ``root_distance`` returns 0 for a counted token.
    """
    if include_punct:
        ignored_deprels = None
    else:
        ignored_deprels = is_only_punctuation(gold_path).union({'punct'})

    predicted_trees = udtree.from_files(system_output_path)
    reference_trees = udtree.from_files(gold_path)

    hit_mass = 0
    miss_mass = 0
    for predicted, reference in zip(predicted_trees, reference_trees):
        hits, misses = match_tree_attachments(predicted, reference, True,
                                              fine_grained_deprels=False,
                                              ignore_deprels=ignored_deprels)

        # Accumulate term by term so the summation order is fixed.
        for token_index, _, _, _, _ in hits:
            hit_mass += 1 / root_distance(reference, token_index)
        for token_index, _, _, _, _ in misses:
            miss_mass += 1 / root_distance(reference, token_index)

    total_mass = hit_mass + miss_mass
    if total_mass == 0:
        return float("NaN")
    return hit_mass / total_mass
Beispiel #5
0
def get_nonprojectivity_ratios():
    """Return, per language, the ratio of trees selected by ``get_np_trees``.

    Paths come from ``lang_utils.get_ud_paths`` for the hard-coded UD 1.2
    resource directory (train split, ``conllu`` format, ``coarse=False``).

    Returns:
        Dict mapping each language key to
        ``len(get_np_trees(trees)) / len(trees)``.

    Raises:
        ZeroDivisionError: If a language yields no trees at all.
    """
    paths_by_lang = lang_utils.get_ud_paths(
        '../resources/universaldependencies1-2/universal-dependencies-1.2/',
        type_='train', format_='conllu', coarse=False)

    tree_totals = {}
    selected_trees = {}
    for lang, path in paths_by_lang.items():
        # Materialise the trees: they are both counted and filtered.
        loaded = list(udtree.from_files(path))
        tree_totals[lang] = len(loaded)
        selected_trees[lang] = get_np_trees(loaded)

    return {lang: len(found) / tree_totals[lang]
            for lang, found in selected_trees.items()}
Beispiel #6
0
def is_only_punctuation(gold_path):
    """Return the deprels that occur exclusively on punctuation-only tokens.

    A token counts as punctuation-only when every character is in
    ``string.punctuation`` (ASCII punctuation; an empty token trivially
    qualifies). A deprel is returned only if *every* token carrying it in
    the gold trees is punctuation-only.

    Args:
        gold_path: Path(s) of the gold standard, passed to
            ``udtree.from_files``.

    Returns:
        A set of deprel values.
    """
    punct_chars = set(string.punctuation)
    only_punct = {}

    for gold_tree in udtree.from_files(gold_path):
        for token, deprel in zip(gold_tree.tokens, gold_tree.deprels):
            token_is_punct = all(char in punct_chars for char in token)
            # Once a deprel has been seen on a non-punctuation token it must
            # stay disqualified. Plain assignment would let a later
            # punctuation token flip it back, making the result depend on
            # whichever token happened to come last.
            only_punct[deprel] = only_punct.get(deprel, True) and token_is_punct

    return {deprel for deprel, flag in only_punct.items() if flag}
Beispiel #7
0
def convert(to_convert_files):
    """Write the trees of each language to a CoNLL-X file.

    ``fine_grained_deprels``, ``project_base`` and ``join`` are not
    parameters; they are resolved from the surrounding scope.

    For every language the output goes to
    ``join(project_base, "UD_" + lang, file_name)``. The file name is
    derived from the first input file's base name (text before the first
    dot), except for Czech with more than one input file, which uses the
    fixed stem ``cs-ud-train``. The suffix is ``.conllx`` or, when
    ``fine_grained_deprels`` is false, ``.coarse_deprels.conllx``.

    Args:
        to_convert_files: Mapping from language name to a sequence of input
            file paths (indexed with ``files[0]`` and passed to
            ``udtree.from_files``).
    """
    for lang, files in to_convert_files.items():
        file_ending = ".conllx"
        if not fine_grained_deprels:
            file_ending = ".coarse_deprels.conllx"

        if lang == "Czech" and len(files) > 1:
            file_name = "cs-ud-train" + file_ending
        else:
            file_name = files[0].split("/")[-1].split(".")[0] + file_ending

        outfile = join(project_base, "UD_" + lang, file_name)
        trees = udtree.from_files(files)
        # Write UTF-8 explicitly: the platform default encoding may be
        # unable to represent non-ASCII treebank text.
        with open(outfile, "w", encoding="utf-8") as w:
            for tree in trees:
                # Blank out the DEPS and MISC fields before serialising.
                for word in tree.sentence_structure:
                    word['deps'] = None
                    word['misc'] = None
                if not any(tree.postags):  # Copy CPOSTAG to POSTAG
                    tree.postags = tree.cpostags
                w.write("\n".join(tree.to_conllx_format(fine_grained_deprels=fine_grained_deprels)) + "\n\n")
Beispiel #8
0
    for i, head in zip(tree.ids, tree.heads):
        if head-1 < 0:
            continue
        else:
            for j, inner_head in zip(tree.ids[i:head-1], tree.heads[i:head-1]):
                if inner_head < i or inner_head > head:
                    is_non_projective = True
    return is_non_projective

def get_np_trees(trees):
    """Return a list of the trees for which ``is_non_projective`` is truthy.

    Args:
        trees: Iterable of trees; input order is preserved.

    Returns:
        A new list containing the selected trees.
    """
    return [candidate for candidate in trees if is_non_projective(candidate)]

def get_non-projectivity_ratios():
    lang_trees = lang_utils.get_ud_paths('../resources/universaldependencies1-2/universal-dependencies-1.2/', type_='train', format_='conllu', coarse=False)

    np = dict()
    tot = dict()
    for lang, path in lang_trees.items():
        trees = list(udtree.from_files(path))
        tot[lang] = len(trees)
        np[lang] = get_np_trees(trees)

    rel = {}
    for lang, n in zip(np.keys(), np.values()):
        rel[lang] =  len(n) / tot[lang]