def compute_agreement(df):
    """Report Fleiss' Kappa agreement for every annotated answer property.

    For each property, the judgements from all weakener/strengthener
    rationale columns are flattened into (coder, item, label) string
    triples, fed to NLTK's AnnotationTask, and the resulting multi_kappa
    is printed together with a verbal interpretation.

    :param df: Data frame grouped by HITId
    """
    update_types = ["weakener", "strengthener"]
    props = [
        "gibberish_understandable_grammatical",
        "relevant",
        "correct",
        "explains",
    ]
    for prop in props:
        triples = []
        item_offset = 0
        # Lazily assigns a small integer id to each distinct answer value.
        label_ids = defaultdict(count().__next__)
        for update_type in update_types:
            for rat_idx in range(6):
                column = f"Answer.{update_type}_rationale{rat_idx+1}_{prop}"
                curr = df[[column]]
                # One (annotator, task_id, answer) triple per judgement; the
                # row index is shifted by item_offset so ids stay unique
                # across columns.
                for idx, row in curr.iterrows():
                    for worker_id, ans in enumerate(row[0]):
                        triples.append(
                            (str(worker_id + 1),
                             str(item_offset + idx),
                             str(label_ids[ans]))
                        )
                item_offset = len(triples)
        curr_agreement = AnnotationTask(data=triples)
        fleiss = curr_agreement.multi_kappa()
        print(
            f"Property: {prop}, Fleiss' Kappa: {fleiss:.3f} ({get_kappa_interpretation(fleiss)})"
        )
def calculate_round_kappa(round_estimates=None):
    """Compute the inter-rater agreement level for one estimation round.

    Each estimate is treated as a label from a distinct synthetic coder on
    a single shared item, and agreement is measured with multi_kappa
    (Davies and Fleiss) using a distance proportional to how far apart two
    estimate categories are.

    :param round_estimates: iterable of estimate values; each must
        stringify to one of the known categories below. Defaults to an
        empty sequence. (Fix: the original used a mutable default ``[]``;
        replaced with the ``None`` sentinel idiom — behavior is unchanged.)
    :return: the multi_kappa agreement level as a float
    """
    from nltk.metrics.agreement import AnnotationTask

    if round_estimates is None:
        round_estimates = []

    # All estimate categories: 1 hour, half a day, one day, half a week,
    # one week, two weeks, and more than two weeks (-1).
    labels = ['1.0', '4.0', '8.0', '20.0', '40.0', '80.0', '-1.0']

    # Distance between two estimate categories as a difference ratio:
    # adjacent categories differ by 1/6, the two extremes by 1.
    def distance_cal(v1, v2):
        return abs(labels.index(v1) - labels.index(v2)) / 6

    # One synthetic coder ("c1", "c2", ...) per estimate, all judging item 1.
    data = [["c" + str(i), 1, str(estimate)]
            for i, estimate in enumerate(round_estimates, start=1)]

    task = AnnotationTask(data=data, distance=distance_cal)
    return task.multi_kappa()
# NOTE(review): module-level script — `data` (a DataFrame-like table of coder
# ratings), `AnnotationTask` and `itertools` come from elsewhere in the file.
allcoders = data.columns
experts = ['KEY', 'MG', 'MS', 'TM']
novices = ['KEY', 'CK', 'GK', 'RM']
cols = novices  # coder group currently under evaluation

# Total values: agreement statistics over the whole selected coder group.
taskdata = []
for coder in cols:
    for i in data[coder].index:
        # (coder, item, label) triples in the format AnnotationTask expects.
        taskdata.append([coder, i, data[coder][i]])
ratingtask = AnnotationTask(data=taskdata)
print("kappa " + str(ratingtask.kappa()))
print("fleiss " + str(ratingtask.multi_kappa()))
print("alpha " + str(ratingtask.alpha()))
print("scotts " + str(ratingtask.pi()))

# Pairwise values: one statistic per ordered coder pair.
similarities = []
for coders in itertools.product(cols, repeat=2):
    if coders[0] == coders[1]:
        # A coder trivially agrees perfectly with themselves.
        similarities.append(1)
    else:
        taskdata = []
        for coder in coders:
            for i in data[coder].index:
                taskdata.append([coder, i, data[coder][i]])
        ratingtask = AnnotationTask(data=taskdata)
        # NOTE(review): the pairwise statistic appended from this task lies
        # beyond this view — the snippet is truncated here.
# NOTE(review): continuation of a helper (presumably create_iaa_data) whose
# def line is outside this view; indentation below is reconstructed.
        if single_edge2 not in edges_current_annot:
            if (anonym_annot,edge2,dummy_label) not in iaa_data: # to avoid duplicates
                # NOTE(review): the membership test above uses edge2 but the
                # appended tuple uses single_edge2 — possibly a latent bug;
                # confirm against the original repository.
                iaa_data.append((anonym_annot, single_edge2, dummy_label))
        else: # Disagreements on edge (and consequently also on label)
            if edge2 not in summed_results[annotator][text]:
                if (anonym_annot,edge2,dummy_label) not in iaa_data: # to avoid duplicates
                    iaa_data.append((anonym_annot, edge2, dummy_label))
    return iaa_data

# Python 2 script: selects a text and its annotators, builds the IAA data,
# and prints agreement statistics with a Jaccard distance over frozensets.
#text = "text3"
#annotators = ["beata", "elena", "julia"] # "text3"
text = "text6"
annotators = ["beata", "julia","mats"] # "text6"
dummy_label = frozenset(["CORR"])  # placeholder label for unannotated edges
flexible = False
add_missing = False # True = V1, False = V2

iaa_data = create_iaa_data(summed_results, text, annotators, dummy_label, flexible, add_missing)
#print iaa_data[:3]

task = AnnotationTask(data=iaa_data,distance=jaccard_distance)
print "**** Inter-annotator agreement for", text, "****"
print "Avg agreement:\t\t\t\t", round(task.avg_Ao(),3) # Average observed agreement across all coders and items.
print "Fleiss (multi_kappa):\t\t", round(task.multi_kappa(),3) # (Davies and Fleiss 1982)
print "Krippendorff's alpha:\t\t", round(task.alpha(),3) # (Krippendorff 1980)
# NOTE(review): tail of a filtering helper whose def line is outside this
# view; `res` and `spans_list` are its locals.
    print('Number of spans removed from task: ' + str(len(spans_list)))
    # Keep only the tags whose span was not flagged for removal.
    tags_list = [tup for tup in res if tup[1] not in spans_list]
    return tags_list

#week1
a,a_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week1\\a.xml')
b,b_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week1\\b.xml')
c,c_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week1\\c.xml')
d,d_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week1\\d.xml')
# Convert each annotator's tags into AnnotationTask triples, then merge.
tags_list = [tags_to_task(a_tags, 'a'), tags_to_task(b_tags, 'b'), tags_to_task(c_tags, 'c'), tags_to_task(d_tags, 'd')]
t_l = combine_data(tags_list)
week1 = AnnotationTask(data=t_l)
print('Week 1 cross tags agreement:')
print(week1.multi_kappa())

#week2
a,a_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week2\\a.xml')
b,b_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week2\\b.xml')
c,c_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week2\\c.xml')
d,d_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week2\\d.xml')
tags_list = [tags_to_task(a_tags, 'a'), tags_to_task(b_tags, 'b'), tags_to_task(c_tags, 'c'), tags_to_task(d_tags, 'd')]
t_l = combine_data(tags_list)
week2 = AnnotationTask(data=t_l)
print('Week 2 cross tags agreement:')
print(week2.multi_kappa())

#week3_A
a,a_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week3\\A\\A1.xml')
b,b_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week3\\A\\A2.xml')
# NOTE(review): snippet truncated — week3 processing continues beyond this view.
# NOTE(review): tail of song_row_to_annotation_entries (def outside view):
# expands one CSV row into (song, coder, category) triples, one per rating
# column after the song name.
    song = row[0]
    results = row[1:]
    return [(song, coder, category)
            for (coder, category) in enumerate(results)]

def balify(annotation_entry):
    """Recast a (song, coder, category) triple as a (coder, song, bool) triple.

    The boolean is True when category is 'b' or '2' — presumably "Bal" or
    "both styles"; confirm against the annotation guidelines.
    """
    song, coder, category = annotation_entry
    can_bal = category in ('b', '2')
    return (coder, song, can_bal)

def lindify(annotation_entry):
    """Recast a (song, coder, category) triple as a (coder, song, bool) triple.

    The boolean is True when category is 'l' or '2' — presumably "Lindy" or
    "both styles"; confirm against the annotation guidelines.
    """
    song, coder, category = annotation_entry
    can_lindy = category in ('l', '2')
    return (coder, song, can_lindy)

if __name__ == "__main__":
    with open('results.csv', 'r') as csvfile:
        results = []
        csv_rows = list(csv.reader(csvfile))
        for row in csv_rows:
            results += song_row_to_annotation_entries(row)
    # One AnnotationTask per binary question; multi_kappa is Fleiss-style
    # agreement across all coders.
    bal_annotation = AnnotationTask(data=[balify(r) for r in results])
    lindy_annotation = AnnotationTask(data=[lindify(r) for r in results])
    print("Bal agreement: " + str(bal_annotation.multi_kappa()))
    print("Lindy agreement: " + str(lindy_annotation.multi_kappa()))
# NOTE(review): tail of xml_to_triples (def outside this view).
    return (description_triples, action_triples, relation_triples)

def xmls_to_triples(filenames):
    """Collect annotation triples of each kind across all annotators' XMLs.

    Only tags whose start line and end line were tagged by every annotator
    are kept, so each retained item carries a judgement from all coders.

    :param filenames: one XML file per annotator
    :return: (description_triples, action_triples, relation_triples)
    """
    total_annotators = len(filenames)
    description_triples = []
    action_triples = []
    relation_triples = []
    tagged_lines_count = get_tagged_lines(filenames)

    def tag_filter(tag):
        # Keep a tag only if all annotators tagged both its start and end line.
        tagged_start_line_count = tagged_lines_count[tag.attrib["span_start_line"]]
        tagged_end_line_count = tagged_lines_count[tag.attrib["span_end_line"]]
        return tagged_start_line_count == total_annotators and tagged_end_line_count == total_annotators

    for filename in filenames:
        triples = xml_to_triples(filename, tag_filter)
        description_triples.extend(triples[0])
        action_triples.extend(triples[1])
        relation_triples.extend(triples[2])
    return (description_triples, action_triples, relation_triples)

filenames = ["xml_by_annotater/keren.xml", "xml_by_annotater/kristen.xml", "xml_by_annotater/jingdi.xml"]
triples = xmls_to_triples(filenames)
for n in range(3):
    # One AnnotationTask per triple kind: descriptions, actions, relations.
    task = AnnotationTask(data=triples[n])
    print(task.C)
    # NOTE(review): the label says "kappa" but multi_kappa (the Fleiss-style
    # multi-coder statistic) is computed — intentional? confirm.
    print("kappa:", task.multi_kappa())
    print("alpha:", task.alpha())
def test_agreement_statistics():
    """Tests agreement statistics functions against those found in NLTK:
    https://www.nltk.org/api/nltk.metrics.html#module-nltk.metrics.agreement

    Compares the values of agreement statistics with those found in:
    Artstein, R. and Poesio, M. (2005) Kappa 3 = Alpha (or Beta)
    University of Essex NLE Technote

    Data is in: artstein_poesio_example.txt
    """
    file_path = os.path.join("label_data", "artstein_poesio_example.txt")

    # Distance function for weighted agreement stats: identical labels are
    # distance 0, the ireq/stat pair is maximally distant (1), every other
    # differing pair counts as 0.5.
    def test_distance_func(label_a, label_b):
        if label_a == label_b:
            return 0
        elif (label_a == 'ireq' and label_b == 'stat') or (label_b == 'ireq' and label_a == 'stat'):
            return 1
        else:
            return 0.5

    # Gets individual user labels: builds one 100-item indicator column per
    # (user, label) pair, then one DataFrame per user keyed by label.
    def get_user_labels(path):
        with open(path, 'r') as file:
            a_stat = [0] * 100
            a_ireq = [0] * 100
            a_chck = [0] * 100
            b_stat = [0] * 100
            b_ireq = [0] * 100
            b_chck = [0] * 100
            for line in file:
                # Each line is: <user> <item-index (1-based)> <label>
                usr = line.split()[0]
                ind = int(line.split()[1])
                lbl = line.split()[2]
                if usr == 'a':
                    if lbl == 'chck':
                        a_chck[ind - 1] += 1
                    elif lbl == 'stat':
                        a_stat[ind - 1] += 1
                    elif lbl == 'ireq':
                        a_ireq[ind - 1] += 1
                elif usr == 'b':
                    if lbl == 'chck':
                        b_chck[ind - 1] += 1
                    elif lbl == 'stat':
                        b_stat[ind - 1] += 1
                    elif lbl == 'ireq':
                        b_ireq[ind - 1] += 1
            a_data = {'stat': a_stat, 'ireq': a_ireq, 'chck': a_chck}
            a_frame = pd.DataFrame(a_data)
            b_data = {'stat': b_stat, 'ireq': b_ireq, 'chck': b_chck}
            b_frame = pd.DataFrame(b_data)
            example_users_dict = {'a': a_frame, 'b': b_frame}
        return example_users_dict

    # NLTK stats: reference values computed directly from the raw triples.
    nltk_stats = AnnotationTask(data=[x.split() for x in open(file_path)])
    print("nltk:")
    print("multi-Pi - " + str(nltk_stats.pi()))
    print("multi-kappa - " + str(nltk_stats.multi_kappa()))
    print("alpha - " + str(nltk_stats.alpha()))

    # Stats from my functions (multi_pi, multi_kappa, alpha, alpha_prime,
    # beta, bias are defined elsewhere in this module).
    example_users = get_user_labels(file_path)
    print("Mine:")
    print("Multi-Pi - {0:.4f}".format(multi_pi(example_users)))
    print("multi-kappa - {0:.4f}".format(multi_kappa(example_users)))
    print("alpha - {0:.4f}".format(alpha(example_users, test_distance_func)))
    print("alpha prime - {0:.4f}".format(
        alpha_prime(example_users, test_distance_func)))
    print("beta - {0:.4f}".format(beta(example_users, test_distance_func)))

    # Expected values from Artstein and Poesio
    print("Expected:")
    print("mulit-Pi - " + str(0.7995))
    print("mulit-kappa - " + str(0.8013))
    print("alpha - " + str(0.8156))
    print("alpha prime - " + str(0.8146))
    print("beta - " + str(0.8163))

    # Test bias: compare bias-sensitive statistics on a uniform and an
    # unequal label distribution.
    uniform_path = os.path.join("label_data", "bias_uniform.txt")
    unequal_path = os.path.join("label_data", "bias_unequal.txt")
    b_uniform = get_user_labels(uniform_path)
    b_unequal = get_user_labels(unequal_path)

    print("Bias with example_users:")
    print("alpha - {0:.4f}".format(alpha(example_users, test_distance_func)))
    print("beta - {0:.4f}".format(beta(example_users, test_distance_func)))
    print("Bias - {0:.4f}".format(bias(example_users, test_distance_func)))

    # Test uniform first
    print("Bias with uniform:")
    print("alpha - {0:.4f}".format(alpha(b_uniform, test_distance_func)))
    print("beta - {0:.4f}".format(beta(b_uniform, test_distance_func)))
    print("Bias - {0:.4f}".format(bias(b_uniform, test_distance_func)))

    print("Bias with unequal:")
    print("alpha - {0:.4f}".format(alpha(b_unequal, test_distance_func)))
    print("beta - {0:.4f}".format(beta(b_unequal, test_distance_func)))
    print("Bias - {0:.4f}".format(bias(b_unequal, test_distance_func)))
# NOTE(review): this snippet opens mid-expression — the closing of
# song_row_to_annotation_entries' return comprehension; the def and the
# start of the list comprehension are outside this view.
            for (coder, category) in enumerate(results)]

def balify(annotation_entry):
    """Recast a (song, coder, category) triple as a (coder, song, bool) triple.

    The boolean is True when category is 'b' or '2' — presumably "Bal" or
    "both styles"; confirm against the annotation guidelines.
    """
    song, coder, category = annotation_entry
    can_bal = category in ('b', '2')
    return (coder, song, can_bal)

def lindify(annotation_entry):
    """Recast a (song, coder, category) triple as a (coder, song, bool) triple.

    The boolean is True when category is 'l' or '2' — presumably "Lindy" or
    "both styles"; confirm against the annotation guidelines.
    """
    song, coder, category = annotation_entry
    can_lindy = category in ('l', '2')
    return (coder, song, can_lindy)

if __name__ == "__main__":
    with open('results.csv', 'r') as csvfile:
        results = []
        csv_rows = list(csv.reader(csvfile))
        for row in csv_rows:
            results += song_row_to_annotation_entries(row)
    # One AnnotationTask per binary question; multi_kappa is Fleiss-style
    # agreement across all coders.
    bal_annotation = AnnotationTask(data=[balify(r) for r in results])
    lindy_annotation = AnnotationTask(data=[lindify(r) for r in results])
    print("Bal agreement: " + str(bal_annotation.multi_kappa()))
    print("Lindy agreement: " + str(lindy_annotation.multi_kappa()))