from collections import defaultdict
from itertools import count

from nltk.metrics.agreement import AnnotationTask


def compute_agreement(df):
    """
    Compute the agreement (Fleiss' Kappa) for each answer field
    :param df: Data frame grouped by HITId
    """
    update_types = ["weakener", "strengthener"]
    props = [
        "gibberish_understandable_grammatical", "relevant", "correct",
        "explains"
    ]

    for prop in props:
        data = []
        task_id = 0
        value_id = defaultdict(count().__next__)

        for update_type in update_types:
            for rat_idx in range(6):
                curr = df[[
                    f"Answer.{update_type}_rationale{rat_idx+1}_{prop}"
                ]]

                # [(annotator, task_id, ans)]
                data += [(str(worker_id + 1), str(task_id + idx),
                          str(value_id[ans])) for idx, row in curr.iterrows()
                         for worker_id, ans in enumerate(row[0])]

                task_id = len(data)

        curr_agreement = AnnotationTask(data=data)
        fleiss = curr_agreement.multi_kappa()
        print(
            f"Property: {prop}, Fleiss' Kappa: {fleiss:.3f} ({get_kappa_interpretation(fleiss)})"
        )
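
For reference, a minimal self-contained sketch of the (coder, item, label) triples that AnnotationTask consumes; the worker ids and labels below are made up for illustration, not taken from the HIT data above.

from nltk.metrics.agreement import AnnotationTask

# Hypothetical example: three workers labelling two items on a yes/no property.
toy_data = [
    ("w1", "item0", "yes"), ("w2", "item0", "yes"), ("w3", "item0", "no"),
    ("w1", "item1", "no"),  ("w2", "item1", "no"),  ("w3", "item1", "no"),
]
toy_task = AnnotationTask(data=toy_data)
print(f"Fleiss' Kappa: {toy_task.multi_kappa():.3f}")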
Example 2
def calculate_round_kappa(round_estimates=[]):
    from nltk.metrics.agreement import AnnotationTask

    # Compute the distance between two estimate categories as the normalised difference of their ranks
    def distance_cal(v1, v2):
        # all estimate categories: 1 hour, half a day, one day, half a week, one week, two weeks,
        # and more than two weeks (-1)
        labels = ['1.0', '4.0', '8.0', '20.0', '40.0', '80.0', '-1.0']
        i1 = labels.index(v1)
        i2 = labels.index(v2)
        return abs(i1 - i2) / 6

    # prepare estimate for the annotation task
    data, i = [], 1
    for estimate in round_estimates:
        data.append(["c" + str(i), 1, str(estimate)])
        i += 1

    task = AnnotationTask(data=data, distance=distance_cal)
    agreement_level = task.multi_kappa()
    return agreement_level
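
A quick usage sketch with made-up estimates; the values must come from the label list inside distance_cal.

# Hypothetical call: five estimates from one planning round.
example_round = [1.0, 4.0, 4.0, 8.0, 40.0]
print(calculate_round_kappa(example_round))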
Example 3
allcoders = data.columns
experts = ['KEY', 'MG', 'MS', 'TM']
novices = ['KEY', 'CK', 'GK', 'RM']

cols = novices

# Total values
taskdata = []
for coder in cols:
    for i in data[coder].index:
        taskdata.append([coder, i, data[coder][i]])

ratingtask = AnnotationTask(data=taskdata)
print("kappa " + str(ratingtask.kappa()))
print("fleiss " + str(ratingtask.multi_kappa()))
print("alpha " + str(ratingtask.alpha()))
print("scotts " + str(ratingtask.pi()))

# Pairwise values
similarities = []
for coders in itertools.product(cols, repeat=2):
    if coders[0] == coders[1]:
        similarities.append(1)
    else:
        taskdata = []
        for coder in coders:
            for i in data[coder].index:
                taskdata.append([coder, i, data[coder][i]])

        ratingtask = AnnotationTask(data=taskdata)
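        # (The original snippet is cut off here.) A plausible continuation,
        # assuming each pairwise score is Cohen's kappa appended to the list:
        similarities.append(ratingtask.kappa())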
Example 4
                                if single_edge2 not in edges_current_annot:
                                    if (anonym_annot, single_edge2, dummy_label) not in iaa_data:  # to avoid duplicates
                                        iaa_data.append((anonym_annot, single_edge2, dummy_label))
                                    
                        else:
                            # Disagreements on edge (and consequently also on label)
                            if edge2 not in summed_results[annotator][text]:
                                if (anonym_annot,edge2,dummy_label) not in iaa_data: 	 # to avoid duplicates
                                    iaa_data.append((anonym_annot, edge2, dummy_label))
                       
    return iaa_data

#text = "text3"
#annotators = ["beata", "elena", "julia"] # "text3"

text = "text6" 
annotators = ["beata", "julia","mats"] # "text6"

dummy_label = frozenset(["CORR"])
flexible = False
add_missing = False		# True = V1, False = V2
iaa_data = create_iaa_data(summed_results, text, annotators, dummy_label, flexible, add_missing)

#print(iaa_data[:3])

task = AnnotationTask(data=iaa_data, distance=jaccard_distance)

print("**** Inter-annotator agreement for", text, "****")
print("Avg agreement:\t\t\t\t", round(task.avg_Ao(), 3))          # Average observed agreement across all coders and items.
print("Fleiss (multi_kappa):\t\t", round(task.multi_kappa(), 3))  # (Davies and Fleiss 1982)
print("Krippendorff's alpha:\t\t", round(task.alpha(), 3))        # (Krippendorff 1980)
Example 5
    print('Number of spans removed from task: ' + str(len(spans_list)))
    tags_list = [tup for tup in res if tup[1] not in spans_list]
    return tags_list



#week1
a,a_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week1\\a.xml')
b,b_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week1\\b.xml')
c,c_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week1\\c.xml')
d,d_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week1\\d.xml')
tags_list = [tags_to_task(a_tags, 'a'), tags_to_task(b_tags, 'b'), tags_to_task(c_tags, 'c'), tags_to_task(d_tags, 'd')]
t_l = combine_data(tags_list)
week1 = AnnotationTask(data=t_l)
print('Week 1 cross tags agreement:')
print(week1.multi_kappa())

#week2
a,a_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week2\\a.xml')
b,b_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week2\\b.xml')
c,c_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week2\\c.xml')
d,d_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week2\\d.xml')
tags_list = [tags_to_task(a_tags, 'a'), tags_to_task(b_tags, 'b'), tags_to_task(c_tags, 'c'), tags_to_task(d_tags, 'd')]
t_l = combine_data(tags_list)
week2 = AnnotationTask(data=t_l)
print('Week 2 cross tags agreement:')
print(week2.multi_kappa())

#week3_A
a,a_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week3\\A\\A1.xml')
b,b_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week3\\A\\A2.xml')
Example 6
import csv

from nltk.metrics.agreement import AnnotationTask


def song_row_to_annotation_entries(row):
    song = row[0]
    results = row[1:]

    return [(song, coder, category) for (coder, category) in enumerate(results)]

def balify(annotation_entry):
    song, coder, category = annotation_entry
    can_bal = category in ('b', '2')

    return (coder, song, can_bal)

def lindify(annotation_entry):
    song, coder, category = annotation_entry
    can_lindy = category in ('l', '2')

    return (coder, song, can_lindy)

if __name__ == "__main__":
    with open('results.csv', 'r') as csvfile:
        results = []

        csv_rows = list(csv.reader(csvfile))
        for row in csv_rows:
            results += song_row_to_annotation_entries(row)

        bal_annotation = AnnotationTask(data=[balify(r) for r in results])
        lindy_annotation = AnnotationTask(data=[lindify(r) for r in results])

        print("Bal agreement: " + str(bal_annotation.multi_kappa()))
        print("Lindy agreement: " + str(lindy_annotation.multi_kappa()))
Example 7
    return (description_triples, action_triples, relation_triples)

def xmls_to_triples(filenames):
    total_annotators = len(filenames)
    description_triples = []
    action_triples = []
    relation_triples = []
    tagged_lines_count = get_tagged_lines(filenames)
    def tag_filter(tag):
        tagged_start_line_count = tagged_lines_count[tag.attrib["span_start_line"]]
        tagged_end_line_count = tagged_lines_count[tag.attrib["span_end_line"]]
        return tagged_start_line_count == total_annotators and tagged_end_line_count == total_annotators
    for filename in filenames:
        triples = xml_to_triples(filename, tag_filter)
        description_triples.extend(triples[0])
        action_triples.extend(triples[1])
        relation_triples.extend(triples[2])
    return (description_triples, action_triples, relation_triples)

filenames = ["xml_by_annotater/keren.xml",
             "xml_by_annotater/kristen.xml",
             "xml_by_annotater/jingdi.xml"]

triples = xmls_to_triples(filenames)

for n in range(3):
    task = AnnotationTask(data=triples[n])
    print(task.C)  # the set of coders in this task
    print("kappa:", task.multi_kappa())
    print("alpha:", task.alpha())
Example 8
def test_agreement_statistics():
    """Tests agreement statistics functions against those found in NLTK:
        https://www.nltk.org/api/nltk.metrics.html#module-nltk.metrics.agreement

    Compares the values of agreement statistics with those found in:
        Artstein, R. and Poesio, M. (2005) Kappa 3 = Alpha (or Beta) University of Essex NLE Technote

    Data is in:
        artstein_poesio_example.txt
    """

    file_path = os.path.join("label_data", "artstein_poesio_example.txt")

    # Distance function for weighted agreement stats
    def test_distance_func(label_a, label_b):
        if label_a == label_b:
            return 0
        elif (label_a == 'ireq'
              and label_b == 'stat') or (label_b == 'ireq'
                                         and label_a == 'stat'):
            return 1
        else:
            return 0.5

    # Gets individual user labels
    def get_user_labels(path):
        with open(path, 'r') as file:
            a_stat = [0] * 100
            a_ireq = [0] * 100
            a_chck = [0] * 100

            b_stat = [0] * 100
            b_ireq = [0] * 100
            b_chck = [0] * 100

            for line in file:
                usr = line.split()[0]
                ind = int(line.split()[1])
                lbl = line.split()[2]
                if usr == 'a':
                    if lbl == 'chck':
                        a_chck[ind - 1] += 1
                    elif lbl == 'stat':
                        a_stat[ind - 1] += 1
                    elif lbl == 'ireq':
                        a_ireq[ind - 1] += 1

                elif usr == 'b':
                    if lbl == 'chck':
                        b_chck[ind - 1] += 1
                    elif lbl == 'stat':
                        b_stat[ind - 1] += 1
                    elif lbl == 'ireq':
                        b_ireq[ind - 1] += 1

            a_data = {'stat': a_stat, 'ireq': a_ireq, 'chck': a_chck}
            a_frame = pd.DataFrame(a_data)
            b_data = {'stat': b_stat, 'ireq': b_ireq, 'chck': b_chck}
            b_frame = pd.DataFrame(b_data)
            example_users_dict = {'a': a_frame, 'b': b_frame}
        return example_users_dict

    # NLTK stats
    nltk_stats = AnnotationTask(data=[x.split() for x in open(file_path)])
    print("nltk:")
    print("multi-Pi - " + str(nltk_stats.pi()))
    print("multi-kappa - " + str(nltk_stats.multi_kappa()))
    print("alpha - " + str(nltk_stats.alpha()))

    # Stats from my functions
    example_users = get_user_labels(file_path)
    print("Mine:")
    print("Multi-Pi - {0:.4f}".format(multi_pi(example_users)))
    print("multi-kappa - {0:.4f}".format(multi_kappa(example_users)))
    print("alpha - {0:.4f}".format(alpha(example_users, test_distance_func)))
    print("alpha prime - {0:.4f}".format(
        alpha_prime(example_users, test_distance_func)))
    print("beta - {0:.4f}".format(beta(example_users, test_distance_func)))

    # Expected values from Artstein and Poesio
    print("Expected:")
    print("mulit-Pi - " + str(0.7995))
    print("mulit-kappa - " + str(0.8013))
    print("alpha - " + str(0.8156))
    print("alpha prime - " + str(0.8146))
    print("beta - " + str(0.8163))

    # Test bias
    uniform_path = os.path.join("label_data", "bias_uniform.txt")
    unequal_path = os.path.join("label_data", "bias_unequal.txt")
    b_uniform = get_user_labels(uniform_path)
    b_unequal = get_user_labels(unequal_path)

    print("Bias with example_users:")
    print("alpha - {0:.4f}".format(alpha(example_users, test_distance_func)))
    print("beta - {0:.4f}".format(beta(example_users, test_distance_func)))
    print("Bias - {0:.4f}".format(bias(example_users, test_distance_func)))

    # Test uniform first
    print("Bias with uniform:")
    print("alpha - {0:.4f}".format(alpha(b_uniform, test_distance_func)))
    print("beta - {0:.4f}".format(beta(b_uniform, test_distance_func)))
    print("Bias - {0:.4f}".format(bias(b_uniform, test_distance_func)))

    print("Bias with unequal:")
    print("alpha - {0:.4f}".format(alpha(b_unequal, test_distance_func)))
    print("beta - {0:.4f}".format(beta(b_unequal, test_distance_func)))
    print("Bias - {0:.4f}".format(bias(b_unequal, test_distance_func)))