def labels_precision_recall(system_output_path, gold_path, labels=("nsubj", "nsubjpass"), fine_grained_deprels=True):
    """Compute precision and recall restricted to a set of dependency labels.

    System and gold trees are read with ``udtree.from_files`` and compared
    pairwise, in labeled mode, with ``match_tree_attachments``. Each entry of
    the two sequences it returns is unpacked as a 5-tuple whose third element
    is used as the system label and whose fifth element is used as the gold
    label.

    Args:
        system_output_path: Location of the system output, handed to
            ``udtree.from_files``.
        gold_path: Location of the gold standard, handed to
            ``udtree.from_files``.
        labels: Container of labels to evaluate; only membership tests
            (``in``) are performed on it. The default is an immutable tuple
            so that it cannot be modified between calls.
        fine_grained_deprels: Forwarded unchanged to
            ``match_tree_attachments``.

    Returns:
        A ``(precision, recall)`` tuple. Precision is NaN when the system
        produced none of the requested labels; recall is NaN when none of
        the requested labels was counted on the gold side.
    """
    system = udtree.from_files(system_output_path)
    gold = udtree.from_files(gold_path)

    system_correct = system_incorrect = gold_count = 0
    for system_tree, gold_tree in zip(system, gold):
        tree_correct, tree_incorrect = match_tree_attachments(
            system_tree, gold_tree, True, fine_grained_deprels=fine_grained_deprels
        )

        # A correct attachment with a requested label is both a true positive
        # and one occurrence of the label on the gold side.
        for _, _, system_label, _, _ in tree_correct:
            if system_label in labels:
                system_correct += 1
                gold_count += 1

        # For an incorrect attachment the two sides are counted separately:
        # the gold label feeds the recall denominator, the system label the
        # precision denominator.
        for _, _, system_label, _, gold_label in tree_incorrect:
            if gold_label in labels:
                gold_count += 1
            if system_label in labels:
                system_incorrect += 1

    system_count = system_correct + system_incorrect
    precision = system_correct / system_count if system_count else float("NaN")
    recall = system_correct / gold_count if gold_count else float("NaN")
    return precision, recall
def weighted_las(system_output_path, gold_path, weights):
    """Return a labeled attachment score in which every token is weighted.

    Trees from both inputs are read with ``udtree.from_files`` and compared
    pairwise by ``match_tree_attachments`` (labeled, with
    ``fine_grained_deprels=False``). Every returned entry contributes
    ``weights[gold_label]``, where the gold label is the fifth element of the
    entry.

    Args:
        system_output_path: Location of the system output.
        gold_path: Location of the gold standard.
        weights: Object indexed by gold label to obtain a token's weight.

    Returns:
        Weight of the correct entries divided by the weight of all entries,
        or NaN when the total weight is zero.
    """
    predicted_trees = udtree.from_files(system_output_path)
    reference_trees = udtree.from_files(gold_path)

    hit_weight = 0
    miss_weight = 0
    for predicted, reference in zip(predicted_trees, reference_trees):
        hits, misses = match_tree_attachments(
            predicted, reference, True, fine_grained_deprels=False
        )
        # Accumulate entry by entry so the summation order stays fixed.
        for _, _, _, _, label in hits:
            hit_weight += weights[label]
        for _, _, _, _, label in misses:
            miss_weight += weights[label]

    total_weight = hit_weight + miss_weight
    return float("NaN") if total_weight == 0 else hit_weight / total_weight
def attachment_score(system_output_path, gold_path, labeled=True, fine_grained_deprels=True, include_punct=False):
    """Return the share of attachments that the system got right.

    Args:
        system_output_path: Location of the system output.
        gold_path: Location of the gold standard.
        labeled: Forwarded to ``match_tree_attachments`` as its third
            positional argument.
        fine_grained_deprels: Forwarded to ``match_tree_attachments``.
        include_punct: When falsy, the relations reported by
            ``is_only_punctuation(gold_path)`` together with ``'punct'`` are
            passed as ``ignore_deprels``; when truthy, ``None`` is passed.

    Returns:
        ``correct / (correct + incorrect)`` over all compared trees, or NaN
        when nothing was counted.
    """
    if include_punct:
        ignored = None
    else:
        ignored = set.union(is_only_punctuation(gold_path), {'punct'})

    predicted_trees = udtree.from_files(system_output_path)
    reference_trees = udtree.from_files(gold_path)

    n_hits = 0
    n_misses = 0
    for predicted, reference in zip(predicted_trees, reference_trees):
        hits, misses = match_tree_attachments(
            predicted,
            reference,
            labeled,
            fine_grained_deprels=fine_grained_deprels,
            ignore_deprels=ignored,
        )
        n_hits += len(hits)
        n_misses += len(misses)

    n_scored = n_hits + n_misses
    if n_scored == 0:
        return float("NaN")
    return n_hits / n_scored
def root_distance_las(system_output_path, gold_path, include_punct=False):
    """Return a labeled attachment score weighted by inverse root distance.

    Every entry returned by ``match_tree_attachments`` (labeled, with
    ``fine_grained_deprels=False``) contributes
    ``1 / root_distance(gold_tree, index)``, where ``index`` is the first
    element of the entry.
    NOTE(review): ``root_distance`` is not visible here; presumably it is the
    token's distance from the root of the gold tree - confirm.

    Args:
        system_output_path: Location of the system output.
        gold_path: Location of the gold standard.
        include_punct: When falsy, the relations reported by
            ``is_only_punctuation(gold_path)`` together with ``'punct'`` are
            passed as ``ignore_deprels``; when truthy, ``None`` is passed.

    Returns:
        Weighted share of correct entries, or NaN when the total weight is
        zero.
    """
    if include_punct:
        ignored = None
    else:
        ignored = set.union(is_only_punctuation(gold_path), {'punct'})

    predicted_trees = udtree.from_files(system_output_path)
    reference_trees = udtree.from_files(gold_path)

    hit_weight = 0
    miss_weight = 0
    for predicted, reference in zip(predicted_trees, reference_trees):
        hits, misses = match_tree_attachments(
            predicted,
            reference,
            True,
            fine_grained_deprels=False,
            ignore_deprels=ignored,
        )
        # Accumulate entry by entry so the summation order stays fixed.
        for position, _, _, _, _ in hits:
            hit_weight += 1 / root_distance(reference, position)
        for position, _, _, _, _ in misses:
            miss_weight += 1 / root_distance(reference, position)

    total_weight = hit_weight + miss_weight
    return float("NaN") if total_weight == 0 else hit_weight / total_weight
def get_nonprojectivity_ratios(ud_root='../resources/universaldependencies1-2/universal-dependencies-1.2/'):
    """Return, per language, the fraction of training trees that are non-projective.

    Training files are located with ``lang_utils.get_ud_paths`` (CoNLL-U
    format, ``coarse=False``) and the non-projective trees are selected with
    ``get_np_trees``.

    Args:
        ud_root: Directory handed to ``lang_utils.get_ud_paths``. The default
            is the path that used to be hard-coded, so existing calls without
            arguments behave as before.

    Returns:
        Dict mapping each language to ``non-projective trees / all trees``.
        A language whose files yield no trees maps to NaN rather than raising
        ``ZeroDivisionError``, in line with the NaN convention of the scoring
        functions in this file.
    """
    lang_trees = lang_utils.get_ud_paths(ud_root, type_='train', format_='conllu', coarse=False)

    ratios = {}
    for lang, path in lang_trees.items():
        trees = list(udtree.from_files(path))
        if not trees:
            ratios[lang] = float("NaN")
            continue
        # Only the count is needed, so the selected trees are not retained
        # beyond this iteration.
        ratios[lang] = len(get_np_trees(trees)) / len(trees)
    return ratios
def is_only_punctuation(gold_path):
    """Return the dependency relations that are used on punctuation tokens only.

    A token counts as punctuation when every one of its characters is in
    ``string.punctuation`` (ASCII punctuation; an empty token therefore also
    counts). A relation is reported only if *all* tokens carrying it are
    punctuation.

    Previously the per-relation flag was overwritten by every token, so the
    result reflected only the last token seen for each relation; a relation
    used on ordinary words could be reported just because its final
    occurrence was a punctuation mark.

    Args:
        gold_path: Location of the gold standard, handed to
            ``udtree.from_files``.

    Returns:
        Set of relations whose tokens are all punctuation.
    """
    puncts = set(string.punctuation)
    is_punct = {}
    for gold_tree in udtree.from_files(gold_path):
        for token, deprel in zip(gold_tree.tokens, gold_tree.deprels):
            token_is_punct = all(char in puncts for char in token)
            # Combine with the earlier verdict: one non-punctuation token
            # disqualifies the relation for good.
            is_punct[deprel] = is_punct.get(deprel, True) and token_is_punct
    return {deprel for deprel, val in is_punct.items() if val}
def convert(to_convert_files):
    """Write the trees of every language to a CoNLL-X file.

    ``fine_grained_deprels``, ``project_base`` and ``join`` are not defined in
    this function; they are taken from the surrounding scope.
    NOTE(review): ``join`` is presumably ``os.path.join`` - confirm.

    For each language the output is written to
    ``join(project_base, "UD_" + lang, file_name)``. The file name is the part
    of the first input file's base name before its first dot, except for
    Czech with more than one input file, where ``cs-ud-train`` is used. The
    ending is ``.conllx``, or ``.coarse_deprels.conllx`` when
    ``fine_grained_deprels`` is falsy.

    Args:
        to_convert_files: Mapping from language name to a list of input file
            paths; the list is handed to ``udtree.from_files``.
    """
    for lang, files in to_convert_files.items():
        file_ending = ".conllx"
        if not fine_grained_deprels:
            file_ending = ".coarse_deprels.conllx"

        if lang == "Czech" and len(files) > 1:
            file_name = "cs-ud-train" + file_ending
        else:
            file_name = files[0].split("/")[-1].split(".")[0] + file_ending

        outfile = join(project_base, "UD_" + lang, file_name)
        trees = udtree.from_files(files)

        # Explicit UTF-8: without it the platform default encoding is used,
        # which can fail on or corrupt non-ASCII word forms.
        with open(outfile, "w", encoding="utf-8") as w:
            for tree in trees:
                # Blank the two fields on every word before serialising.
                for word in tree.sentence_structure:
                    word['deps'] = None
                    word['misc'] = None
                if not any(tree.postags):
                    # Copy CPOSTAG to POSTAG
                    tree.postags = tree.cpostags
                # Sentences are separated by an empty line.
                w.write("\n".join(tree.to_conllx_format(fine_grained_deprels=fine_grained_deprels)) + "\n\n")
for i, head in zip(tree.ids, tree.heads): if head-1 < 0: continue else: for j, inner_head in zip(tree.ids[i:head-1], tree.heads[i:head-1]): if inner_head < i or inner_head > head: is_non_projective = True return is_non_projective def get_np_trees(trees): np = [] for tree in trees: if is_non_projective(tree): np.append(tree) return np def get_non-projectivity_ratios(): lang_trees = lang_utils.get_ud_paths('../resources/universaldependencies1-2/universal-dependencies-1.2/', type_='train', format_='conllu', coarse=False) np = dict() tot = dict() for lang, path in lang_trees.items(): trees = list(udtree.from_files(path)) tot[lang] = len(trees) np[lang] = get_np_trees(trees) rel = {} for lang, n in zip(np.keys(), np.values()): rel[lang] = len(n) / tot[lang]