def save_tsv(filename, essay_code, sentences, dist, label_preds):
    """
    Write the predicted structure of one essay to a tab-separated file.

    One header row is written, followed by one row per sentence with the
    columns: essay code, unit id (1-based), text, target, relation, drop_flag.

    Args:
        filename (str): path of the file to (over)write
        essay_code (str): value written in the first column of every row
        sentences (list[str]): sentence texts, in order
        dist (list[int]): per-sentence offset to the target sentence; the
            target unit id is ``i + 1 + dist[i]``, and an offset of 0 means
            the sentence points to itself (root)
        label_preds (list[str]): relation labels, consumed in order, one for
            each sentence that is neither dropped nor a root

    Raises:
        AssertionError: if not every entry of ``label_preds`` was consumed
    """
    header = ["essay code", "unit id", "text", "target", "relation", "drop_flag"]
    rep = TreeBuilder(dist)
    component_labels = rep.auto_component_labels(AC_breakdown=True)

    label_idx = 0

    # the context manager closes the file even if building a row fails
    with open(filename, "w") as f:
        f.write("\t".join(header) + "\n")

        for i, sentence in enumerate(sentences):
            unit_id = i + 1
            if component_labels[i] == "non-arg comp.":
                # dropped unit: no target, no relation
                target, relation, drop_flag = "", "", "TRUE"
            else:
                target_id = unit_id + dist[i]
                if target_id == unit_id:  # point to itself, i.e., root
                    target, relation = "", ""
                else:  # not root
                    target = str(target_id)
                    relation = label_preds[label_idx]
                    label_idx += 1
                drop_flag = "FALSE"

            row = [essay_code, str(unit_id), sentence, target, relation, drop_flag]
            f.write("\t".join(row) + "\n")

    # every predicted label must have been matched to a non-root argumentative unit
    assert label_idx == len(label_preds)
Esempio n. 2
0
def structured_output_quality(links) -> (List, float, float, float, List):
    """
    Infer component labels automatically from the structure and summarise
    how tree-like the structures are.

    Args:
        links: one entry per essay; each entry is handed to ``TreeBuilder``

    Returns:
        tuple of five values, in this order:
            component_labels (list): result of
                ``auto_component_labels(AC_breakdown=True)`` for every essay
            tree_ratio (float): fraction of essays whose structure is a tree
                (0.0 when ``links`` is empty)
            avg_depth (float): mean depth over the tree-forming essays
                (0.0 when none forms a tree)
            avg_leaf_prop (float): mean leaf proportion over the tree-forming
                essays (0.0 when none forms a tree)
            all_depths (list): depth of each tree-forming essay, in input order
    """
    component_labels = []
    all_depths = []
    n_trees = 0
    depth_sum = 0
    leaf_prop_sum = 0

    for essay_links in links:
        rep = TreeBuilder(essay_links)
        component_labels.append(rep.auto_component_labels(AC_breakdown=True))

        if rep.is_tree():
            n_trees += 1

            # evaluate this only when the output forms a tree
            depth, leaf_prop = rep.tree_depth_and_leaf_proportion()
            depth_sum += depth
            all_depths.append(depth)
            leaf_prop_sum += leaf_prop

    n_essays = len(links)

    # guard the divisions: the ratios are undefined without essays / trees
    tree_ratio = float(n_trees) / float(n_essays) if n_essays else 0.0
    avg_depth = float(depth_sum) / float(n_trees) if n_trees else 0.0
    avg_leaf_prop = float(leaf_prop_sum) / float(n_trees) if n_trees else 0.0

    return component_labels, tree_ratio, avg_depth, avg_leaf_prop, all_depths
def create_pairwise_data(sentences, dist):
    """
    Build (source, target) sentence pairs for pairwise link labelling.

    A pair is emitted for every sentence that is not labelled
    "non-arg comp." and that does not point to itself (i.e., is not a root).

    Args:
        sentences (list[str])
        dist (list[int]): per-sentence offset to the target sentence

    Returns:
        list[tuple(str,str)]
    """
    labels = TreeBuilder(dist).auto_component_labels(AC_breakdown=True)

    pairs = []
    for idx, source in enumerate(sentences):
        # non-argumentative units take no part in any link
        if labels[idx] == "non-arg comp.":
            continue

        target_idx = idx + dist[idx]
        if target_idx == idx:
            # the sentence points to itself, i.e., it is a root
            continue

        pairs.append((source, sentences[target_idx]))
    return pairs
Esempio n. 4
0
        # assertion to check whether we have included non-arg-units here
        assert (len(sentences) == len(rel_distances))
        assert (len(sentences) == len(rel_labels))

        # determine where to save the file
        if args.split:
            split_folder = check_train_or_test(split_info, essay.essay_code)
            assert (split_folder != None)
            split_folder = split_folder + "/"
        else:
            split_folder = ""  # no split information provided

        # component labels
        rep = TreeBuilder(rel_distances)
        component_labels = rep.auto_component_labels(AC_breakdown=True)

        # save to file
        save_content_to_file(
            args.out_dir + "linking/" + split_folder.lower() +
            essay.essay_code + ".sentences", sentences)
        save_content_to_file(
            args.out_dir + "linking/" + split_folder.lower() +
            essay.essay_code + ".vectors", vectors.tolist())
        save_content_to_file(
            args.out_dir + "linking/" + split_folder.lower() +
            essay.essay_code + ".rel_distances", rel_distances)
        # save_content_to_file(args.out_dir + "linking/" + split_folder.lower() + essay.essay_code + ".rel_labels", rel_labels)
        save_content_to_file(
            args.out_dir + "linking/" + split_folder.lower() +
            essay.essay_code + ".component_labels", component_labels)