def compute_agreement(df):
    """
    Compute the agreement (Fleiss' Kappa) for each answer field
    :param df: Data frame grouped by HITId; must contain the
        ``Answer.{update_type}_rationale{N}_{prop}`` columns, each cell
        holding one answer per annotator
    """
    update_types = ["weakener", "strengthener"]
    props = [
        "gibberish_understandable_grammatical", "relevant", "correct",
        "explains"
    ]
    for prop in props:
        data = []
        task_id = 0
        # Maps each distinct answer string to a stable small integer id
        # (first unseen answer -> 0, next -> 1, ...).
        value_id = defaultdict(count().__next__)
        for update_type in update_types:
            # 6 rationales per update type, columns are 1-based.
            for rat_idx in range(6):
                curr = df[[
                    f"Answer.{update_type}_rationale{rat_idx+1}_{prop}"
                ]]
                # [(annotator, task_id, ans)]
                data += [(str(worker_id + 1), str(task_id + idx),
                          str(value_id[ans]))
                         for idx, row in curr.iterrows()
                         for worker_id, ans in enumerate(row[0])]
                # NOTE(review): this advances by the number of TRIPLES, not
                # tasks, so task ids jump; they stay unique (assuming a
                # 0-based integer index on df — TODO confirm), which is all
                # AnnotationTask requires.
                task_id = len(data)
        curr_agreement = AnnotationTask(data=data)
        fleiss = curr_agreement.multi_kappa()
        print(
            f"Property: {prop}, Fleiss' Kappa: {fleiss:.3f} ({get_kappa_interpretation(fleiss)})"
        )
def calculate_kappa(filename):
    """Compute Cohen's kappa between two synthetic annotators.

    Loads labels from ``data/<filename>_data_result.json``, derives two
    perturbed label lists, saves them to CSV, and prints the kappa.

    :param filename: basename of the result JSON (without suffix)
    """
    # save labels
    label_list = []
    with open('data/' + filename + '_data_result.json') as json_file:
        tweets = json.load(json_file)
    for row in tweets:
        label_list.append(row['label'])

    # Generate two fake labels to calculate kappa
    man_1_label = change_some_values(label_list)
    man_2_label = change_some_values(label_list)

    # save the labels to a csv file
    save_to_csv('data/label_1.csv', man_1_label)
    save_to_csv('data/label_2.csv', man_2_label)

    # Build (coder, item, label) triples. The original Python 2 code did
    # `zip(...) + zip(...)`, which fails on Python 3 where zip() is lazy;
    # materialize lists explicitly instead.
    item_num_list = range(len(man_1_label))
    civ_1 = list(zip(['c1'] * len(man_1_label), item_num_list, man_1_label))
    civ_2 = list(zip(['c2'] * len(man_2_label), item_num_list, man_2_label))
    task_data = civ_1 + civ_2

    # calculate inter annotator agreement
    task = AnnotationTask(data=task_data)
    # Python 3 print function (was a Python 2 print statement).
    print('kappa: ' + str(task.kappa()))
def calc_agreements(nr_of_abstracts=150):
    """Compute kappa and alpha per abstract and print summary statistics.

    :param nr_of_abstracts: number of abstracts to evaluate
    """
    results = []
    for idx in range(nr_of_abstracts):
        coders = round_robin(idx)
        ann_a = flatten(get_annotations(idx, coders[0]))
        ann_b = flatten(get_annotations(idx, coders[1]))
        combined = __str_combine_annotations(ann_a, ann_b)
        task = AnnotationTask(combined, agreement_fn)
        results.append({
            "kappa": task.kappa(),
            "alpha": task.alpha(),
            "annotator_A": coders[0],
            "annotator_B": coders[1],
        })

    # Summary statistics (describe -> (nobs, minmax, mean, variance, ...)).
    kappa_stats = describe([r['kappa'] for r in results])
    print("number of abstracts %i" % kappa_stats[0])
    print("[kappa] mean: " + str(kappa_stats[2]))
    print("[kappa] variance: " + str(kappa_stats[3]))
    alpha_stats = describe([r['alpha'] for r in results])
    print("[alpha] mean: " + str(alpha_stats[2]))
    print("[alpha] variance: " + str(alpha_stats[3]))
def alpha(self, ids=None, staff="upper", common_id=None, lib='nltk',
          label='bigram', distance=None):
    """Return Krippendorff's alpha for one staff's annotations.

    :param ids: score ids to include; None means all (empty list sentinel)
    :param staff: 'upper' or 'lower' — one staff at a time
    :param common_id: forwarded to _staff_annotation_data
    :param lib: 'nltk' -> nltk AnnotationTask; anything else -> the
        krippendorff package's alpha()
    :param label: label type; 'bigram' picks a bigram-specific distance
    :param distance: distance function (nltk) or level of measurement
        string (krippendorff); defaults chosen when None
    :raises Exception: if staff is not 'upper' or 'lower'
    """
    if ids is None:
        ids = []
    if staff not in ('upper', 'lower'):
        raise Exception(
            "Alpha measure only applicable one staff at a time.")
    data = self._staff_annotation_data(ids=ids, staff=staff, lib=lib,
                                       label=label, common_id=common_id)
    # Bigram default applies to both backends, so it is resolved first.
    if distance is None and label == "bigram":
        distance = DScore.bigram_label_distance
    if lib == 'nltk':
        if distance is None:
            distance = binary_distance
        annot_task = AnnotationTask(data=data, distance=distance)
        krip = annot_task.alpha()
    else:
        if distance is None:
            distance = 'nominal'
        # NOTE(review): the bare `alpha` below must resolve to a
        # module-level import (e.g. krippendorff.alpha), not this method —
        # confirm the module's imports.
        krip = alpha(reliability_data=data, level_of_measurement=distance)
    return krip
def __main__(argv):
    """Print agreement coefficients for the image or Turker reliability data.

    :param argv: command-line vector; argv[1] selects 'img' vs. Turker data
    """
    if len(argv) != 2:
        print("Specify cmd arg")
        sys.exit(2)

    # Guard clause above means we only get here with exactly one argument.
    arg = argv[1]
    if arg == 'img':
        reliability_mat = getReliabilityMatImg(
            "../data/imageGID_job_map_expt2_corrected.csv")
    else:
        reliability_mat = getReliabilityMatTurker()

    task = AnnotationTask(data=reliability_mat)
    print("Calculating the agreement scores")
    alpha_val = task.alpha()
    print("Alpha = %f" % alpha_val)
    s_val = task.S()
    print("S = %f" % s_val)
    pi_val = task.pi()
    print("Pi = %f" % pi_val)
    kappa_val = task.kappa()
    print("kappa = %f" % kappa_val)
def compute_krippendorff(sce_path, output_path='', wo_attention_check=False,
                         bad_annotators_path='', dataset=''):
    """ Compute Krippendorff's alpha with krippendorff library (https://github.com/pln-fing-udelar/fast-krippendorff/blob/master/sample.py)
    :param sce_path: csv file with columns UID, ANSWER, ANNOTATOR
    :param output_path: path of the output file where the results will be printed (if empty string the results are printed in the standard output)
    :param wo_attention_check: if True remove the attention check when computing alpha
    :param bad_annotators_path: path of the pkl file containing for each threshold the list of 'bad' annotators. For each threshold the annotations of the listed annotators are removed when computing alpha. If empty string no annotator's annotations are removed.
    :param dataset: alphanumeric characters identifying the corpus to compute the alpha (if empty string the alpha is computed with annotations from all corpora and from attention checks)
    """
    if output_path:
        # NOTE(review): redirects ALL subsequent prints; the file handle is
        # never closed and stdout is never restored — acceptable for a
        # one-shot script, not for library use.
        sys.stdout = open(output_path, "w")
    rows = read_csv(sce_path, dataset=dataset)
    bad_annotators_per_th = get_bad_annotators(bad_annotators_path)
    # One report per bad-annotator threshold.
    for th, bad_annotators in bad_annotators_per_th.items():
        print(f'--- Threshold {th}---')
        annotations = get_annotations_per_annotators(
            rows,
            wo_attention_check=wo_attention_check,
            wo_annotator=bad_annotators)
        print('- After filtering: -')
        print_annotation_statistics(annotations)
        ratings_per_annotator = get_annotator_tab(annotations)
        # Empty cells become NaN so fast-krippendorff treats them as
        # missing ratings; everything else is cast to int.
        data = [[np.nan if not r else int(r) for r in ratings]
                for ratings in ratings_per_annotator]
        print(
            "Krippendorff's alpha for nominal metric: ",
            krippendorff.alpha(reliability_data=data,
                               level_of_measurement='nominal'))
        # Interval is the library default level of measurement.
        print("Krippendorff's alpha for interval metric: ",
              krippendorff.alpha(reliability_data=data))
        print(
            "Krippendorff's alpha for ordinal metric: ",
            krippendorff.alpha(reliability_data=data,
                               level_of_measurement='ordinal'))
        # with nltk library (cross-check of the ordinal alpha)
        task_data = annotations2task_data(annotations)
        rating_task = AnnotationTask(data=task_data, distance=ordinal)
        print("Krippendorff's alpha for ordinal metric (nltk): ",
              rating_task.alpha())
def nltk_with_kippendorff_data():
    """Compute Krippendorff's alpha via NLTK on the evaluation CSV.

    Reads ``../data/krippendorff-evaluation-dataset.csv`` (columns
    obs1..obs5, one row per item), reshapes it into (coder, item, label)
    triples, and prints NLTK's alpha.
    """
    # needs data to be shaped in triples: (coder,item,label)
    input_eval_dp = "../data/krippendorff-evaluation-dataset.csv"
    eval_df = pd.read_table(input_eval_dp, delimiter=',', index_col=0)
    print("\ninput data:\n", eval_df.head())

    # Reshape the data. DataFrame.append() was removed in pandas 2.0 (and
    # was O(n^2) anyway), so collect plain dicts and build the frame once.
    records = []
    for index, row in eval_df.iterrows():
        for k in range(1, 6):
            records.append({
                'coder': f'obs_{k}',
                'item': index,
                'label': row[f'obs{k}']
            })
    eval_nltk_df = pd.DataFrame(records, columns=['coder', 'item', 'label'])

    print("\nreshaped data:\n\n", eval_nltk_df.head())
    print(eval_nltk_df.tail())
    annotation_triples = eval_nltk_df.values.tolist()
    t = AnnotationTask(annotation_triples)
    print("\nKrippendorff alpha as per NLTK:\t", t.alpha(),
          "\n===========================================\n")
def calculate_iaa(data_dict):
    """Print the MASI-distance average observed agreement for two annotators.

    Each value in data_dict must provide label1/label1_2 (annotator 1) and
    label2/label2_2 (annotator 2); the label pair is treated as a set.
    """
    data = []
    # Items are numbered from 1 in dict iteration order; keys are unused.
    for item_no, value in enumerate(data_dict.values(), start=1):
        data.append(('Annotator1', item_no,
                     frozenset((value['label1'], value['label1_2']))))
        data.append(('Annotator2', item_no,
                     frozenset((value['label2'], value['label2_2']))))
    print(data)
    t = AnnotationTask(data=data, distance=masi_distance)
    print(t.avg_Ao())
def test_easy(self):
    '''
    Simple test, based on
    https://github.com/foolswood/krippendorffs_alpha/raw/master/krippendorff.pdf.
    '''
    # One row per item: labels in coder1..coder3 order, None = no rating.
    votes = [('dress1', ('YES', 'NO', 'NO')),
             ('dress2', ('YES', 'NO', None)),
             ('dress3', (None, None, 'NO'))]
    data = [('coder%d' % (pos + 1), item, lab)
            for item, labs in votes
            for pos, lab in enumerate(labs)
            if lab is not None]
    annotation_task = AnnotationTask(data)
    self.assertAlmostEqual(annotation_task.alpha(), -0.3333333)
def test_advanced(self):
    '''
    More advanced test, based on
    http://www.agreestat.com/research_papers/onkrippendorffalpha.pdf
    '''
    # Each token is coder|item|label.
    raw = ("A|1|1 B|1|1 D|1|1 A|2|2 B|2|2 C|2|3 D|2|2 "
           "A|3|3 B|3|3 C|3|3 D|3|3 A|4|3 B|4|3 C|4|3 D|4|3 "
           "A|5|2 B|5|2 C|5|2 D|5|2 A|6|1 B|6|2 C|6|3 D|6|4 "
           "A|7|4 B|7|4 C|7|4 D|7|4 A|8|1 B|8|1 C|8|2 D|8|1 "
           "A|9|2 B|9|2 C|9|2 D|9|2 B|10|5 C|10|5 D|10|5 "
           "C|11|1 D|11|1 C|12|3")
    data = [tuple(tok.split('|')) for tok in raw.split()]
    annotation_task = AnnotationTask(data)
    self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632)
def test_easy2(self):
    '''
    Same simple test with 1 rating removed.
    Removal of that rating should not matter: K-Alpha ignores items with
    only 1 rating.
    '''
    # One row per item: labels in coder1..coder3 order, None = no rating.
    votes = [('dress1', ('YES', 'NO', 'NO')),
             ('dress2', ('YES', 'NO', None))]
    data = [('coder%d' % (pos + 1), item, lab)
            for item, labs in votes
            for pos, lab in enumerate(labs)
            if lab is not None]
    annotation_task = AnnotationTask(data)
    self.assertAlmostEqual(annotation_task.alpha(), -0.3333333)
def test_advanced2(self):
    '''
    Same more advanced example, but with 1 rating removed.
    Again, removal of that 1 rating should not matter.
    '''
    # Each token is coder|item|label.
    raw = ("A|1|1 B|1|1 D|1|1 A|2|2 B|2|2 C|2|3 D|2|2 "
           "A|3|3 B|3|3 C|3|3 D|3|3 A|4|3 B|4|3 C|4|3 D|4|3 "
           "A|5|2 B|5|2 C|5|2 D|5|2 A|6|1 B|6|2 C|6|3 D|6|4 "
           "A|7|4 B|7|4 C|7|4 D|7|4 A|8|1 B|8|1 C|8|2 D|8|1 "
           "A|9|2 B|9|2 C|9|2 D|9|2 B|10|5 C|10|5 D|10|5 "
           "C|11|1 D|11|1 C|12|3")
    data = [tuple(tok.split('|')) for tok in raw.split()]
    annotation_task = AnnotationTask(data)
    self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632)
def test_advanced(self): """ More advanced test, based on http://www.agreestat.com/research_papers/onkrippendorffalpha.pdf """ data = [ ("A", "1", "1"), ("B", "1", "1"), ("D", "1", "1"), ("A", "2", "2"), ("B", "2", "2"), ("C", "2", "3"), ("D", "2", "2"), ("A", "3", "3"), ("B", "3", "3"), ("C", "3", "3"), ("D", "3", "3"), ("A", "4", "3"), ("B", "4", "3"), ("C", "4", "3"), ("D", "4", "3"), ("A", "5", "2"), ("B", "5", "2"), ("C", "5", "2"), ("D", "5", "2"), ("A", "6", "1"), ("B", "6", "2"), ("C", "6", "3"), ("D", "6", "4"), ("A", "7", "4"), ("B", "7", "4"), ("C", "7", "4"), ("D", "7", "4"), ("A", "8", "1"), ("B", "8", "1"), ("C", "8", "2"), ("D", "8", "1"), ("A", "9", "2"), ("B", "9", "2"), ("C", "9", "2"), ("D", "9", "2"), ("B", "10", "5"), ("C", "10", "5"), ("D", "10", "5"), ("C", "11", "1"), ("D", "11", "1"), ("C", "12", "3"), ] annotation_task = AnnotationTask(data) self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632)
def test_advanced2(self): """ Same more advanced example, but with 1 rating removed. Again, removal of that 1 rating should not matter. """ data = [ ("A", "1", "1"), ("B", "1", "1"), ("D", "1", "1"), ("A", "2", "2"), ("B", "2", "2"), ("C", "2", "3"), ("D", "2", "2"), ("A", "3", "3"), ("B", "3", "3"), ("C", "3", "3"), ("D", "3", "3"), ("A", "4", "3"), ("B", "4", "3"), ("C", "4", "3"), ("D", "4", "3"), ("A", "5", "2"), ("B", "5", "2"), ("C", "5", "2"), ("D", "5", "2"), ("A", "6", "1"), ("B", "6", "2"), ("C", "6", "3"), ("D", "6", "4"), ("A", "7", "4"), ("B", "7", "4"), ("C", "7", "4"), ("D", "7", "4"), ("A", "8", "1"), ("B", "8", "1"), ("C", "8", "2"), ("D", "8", "1"), ("A", "9", "2"), ("B", "9", "2"), ("C", "9", "2"), ("D", "9", "2"), ("B", "10", "5"), ("C", "10", "5"), ("D", "10", "5"), ("C", "11", "1"), ("D", "11", "1"), ("C", "12", "3"), ] annotation_task = AnnotationTask(data) self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632)
def test_advanced(self):
    '''
    More advanced test, based on
    http://www.agreestat.com/research_papers/onkrippendorffalpha.pdf
    '''
    # Ratings keyed as "coder,item,label" tokens for compactness.
    tokens = ("A,1,1 B,1,1 D,1,1 A,2,2 B,2,2 C,2,3 D,2,2 "
              "A,3,3 B,3,3 C,3,3 D,3,3 A,4,3 B,4,3 C,4,3 D,4,3 "
              "A,5,2 B,5,2 C,5,2 D,5,2 A,6,1 B,6,2 C,6,3 D,6,4 "
              "A,7,4 B,7,4 C,7,4 D,7,4 A,8,1 B,8,1 C,8,2 D,8,1 "
              "A,9,2 B,9,2 C,9,2 D,9,2 B,10,5 C,10,5 D,10,5 "
              "C,11,1 D,11,1 C,12,3")
    data = [tuple(tok.split(',')) for tok in tokens.split()]
    annotation_task = AnnotationTask(data)
    self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632)
def test_advanced2(self):
    '''
    Same more advanced example, but with 1 rating removed.
    Again, removal of that 1 rating should not matter.
    '''
    # Ratings keyed as "coder,item,label" tokens for compactness.
    tokens = ("A,1,1 B,1,1 D,1,1 A,2,2 B,2,2 C,2,3 D,2,2 "
              "A,3,3 B,3,3 C,3,3 D,3,3 A,4,3 B,4,3 C,4,3 D,4,3 "
              "A,5,2 B,5,2 C,5,2 D,5,2 A,6,1 B,6,2 C,6,3 D,6,4 "
              "A,7,4 B,7,4 C,7,4 D,7,4 A,8,1 B,8,1 C,8,2 D,8,1 "
              "A,9,2 B,9,2 C,9,2 D,9,2 B,10,5 C,10,5 D,10,5 "
              "C,11,1 D,11,1 C,12,3")
    data = [tuple(tok.split(',')) for tok in tokens.split()]
    annotation_task = AnnotationTask(data)
    self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632)
def test_easy2(self): """ Same simple test with 1 rating removed. Removal of that rating should not matter: K-Apha ignores items with only 1 rating. """ data = [ ("coder1", "dress1", "YES"), ("coder2", "dress1", "NO"), ("coder3", "dress1", "NO"), ("coder1", "dress2", "YES"), ("coder2", "dress2", "NO"), ] annotation_task = AnnotationTask(data) self.assertAlmostEqual(annotation_task.alpha(), -0.3333333)
def test_easy(self): """ Simple test, based on https://github.com/foolswood/krippendorffs_alpha/raw/master/krippendorff.pdf. """ data = [ ("coder1", "dress1", "YES"), ("coder2", "dress1", "NO"), ("coder3", "dress1", "NO"), ("coder1", "dress2", "YES"), ("coder2", "dress2", "NO"), ("coder3", "dress3", "NO"), ] annotation_task = AnnotationTask(data) self.assertAlmostEqual(annotation_task.alpha(), -0.3333333)
def get_kappa(input):
    """Compute pairwise and overall Cohen's kappa for a 3-coder matrix.

    :param input: path to a matrix file readable by fio.ReadMatrix; the
        header row names the coders, each body row is one item
    :return: overall kappa over all coders
    """
    head, body = fio.ReadMatrix(input, True)

    data = []
    for i, row in enumerate(body):
        for coder, label in enumerate(row):
            if label == 'a':  # normalize the legacy 'a' marker to '0'
                label = '0'
            data.append((head[coder], i, label))

    task = AnnotationTask(data)
    # Python 3 print() calls (the original used Python 2 print statements).
    print(head[0], head[1], task.kappa_pairwise(head[0], head[1]))
    print(head[0], head[2], task.kappa_pairwise(head[0], head[2]))
    print(head[1], head[2], task.kappa_pairwise(head[1], head[2]))

    return task.kappa()
def main():
    """Load the L1 DIP ELAN annotations, build a MASI-distance agreement
    task, and print label statistics.

    Side effects: prints the task data and the label counts.
    """
    l1_dip = readjson(r'E:\elan projects\L1\L1v1_DIP.eaf.319.json')
    l1_dip = create_task_data(l1_dip, task_type='grouped',
                              allowed='define', annotator='DIP')
    # The task object is constructed for its validation side effect; its
    # agreement scores are not printed here.
    l1_task = AnnotationTask(data=l1_dip,
                             distance=nltk.metrics.distance.masi_distance_mod)

    l1_labels = create_labels_list(l1_dip)
    # Every task entry must have produced exactly one label entry.
    assert len(l1_labels) == len(l1_dip)

    indivcounts = count_occurrances(str(l1_labels))
    counts = count_labels(l1_dip)
    # Python 3 print() calls (the original used Python 2 print statements).
    print(l1_dip)
    print(counts)
    print(indivcounts)
class KappaRater(object):
    """Computes Cohen's kappa between two annotation groups.

    Reads the .txt files under ``<S>/G1`` and ``<S>/G2`` (relative to this
    module), converts them to CoNLL lines, and builds an AnnotationTask.
    """

    def __init__(self, S):
        this_dir = os.path.dirname(os.path.realpath(__file__))
        dir1 = os.path.join(this_dir, S, "G1")
        dir2 = os.path.join(this_dir, S, "G2")
        self.annotation_task = AnnotationTask(data=self.__readfile(dir1, dir2))

    def __readfile(self, *args):
        """Build (coder, item, label) triples; directory i maps to coder c<i+1>."""
        data = []
        # range() instead of Python 2's xrange().
        for i in range(len(args)):
            lines = self.__get_lines(args[i])
            coder = "c" + str(i + 1)
            for ind, line in enumerate(lines):
                item, label = line
                # Prefix the index so repeated tokens stay distinct items.
                data.append((coder, str(ind) + "_" + item, label))
        return data

    def __get_lines(self, dir):
        """Collect CoNLL lines from every .txt file under dir (recursive)."""
        lines = []
        for root, dirs, files in os.walk(dir):
            for file in files:
                if file.endswith(".txt"):
                    # keep \r\n for .ann positioning
                    with io.open(os.path.join(root, file), 'r', newline='',
                                 encoding="utf-8") as f:
                        print(f)  # Python 3 print() (was a Py2 statement)
                        lines += anntoconll.text_to_conll_lines(f)
                    # no explicit close needed: the with-block handles it
        return lines

    def kappa(self):
        """Return Cohen's kappa for the two groups."""
        return self.annotation_task.kappa()
class KappaRater(object):
    """Computes Cohen's kappa between two annotation groups.

    Reads the .txt files under ``<S>/G1`` and ``<S>/G2`` (relative to this
    module), converts them to CoNLL lines, and builds an AnnotationTask.
    """

    def __init__(self, S):
        this_dir = os.path.dirname(os.path.realpath(__file__))
        dir1 = os.path.join(this_dir, S, "G1")
        dir2 = os.path.join(this_dir, S, "G2")
        self.annotation_task = AnnotationTask(data=self.__readfile(dir1, dir2))

    def __readfile(self, *args):
        """Build (coder, item, label) triples; directory i maps to coder c<i+1>."""
        data = []
        # range() instead of Python 2's xrange().
        for i in range(len(args)):
            lines = self.__get_lines(args[i])
            coder = "c" + str(i + 1)
            for ind, line in enumerate(lines):
                item, label = line
                # Prefix the index so repeated tokens stay distinct items.
                data.append((coder, str(ind) + "_" + item, label))
        return data

    def __get_lines(self, dir):
        """Collect CoNLL lines from every .txt file under dir (recursive)."""
        lines = []
        for root, dirs, files in os.walk(dir):
            for file in files:
                if file.endswith(".txt"):
                    # keep \r\n for .ann positioning
                    with io.open(os.path.join(root, file), 'r', newline='',
                                 encoding="utf-8") as f:
                        print(f)  # Python 3 print() (was a Py2 statement)
                        lines += anntoconll.text_to_conll_lines(f)
                    # no explicit close needed: the with-block handles it
        return lines

    def kappa(self):
        """Return Cohen's kappa for the two groups."""
        return self.annotation_task.kappa()
def get_alpha_overall(annos):
    """Krippendorff's alpha over the pos/neg/nonsense columns combined.

    Each (row, column) pair becomes its own item keyed by
    participant/block/sentence/column.
    """
    triples = [
        (row.WorkerId,
         '{participant_id}-{block}-{sent_idx}-{col}'.format(**row, col=col),
         row[col])
        for idx, row in annos.iterrows()
        for col in ('pos', 'neg', 'nonsense')
    ]
    return AnnotationTask(triples).alpha()
def get_alpha_single_col(annos, col):
    """Krippendorff's alpha for one rating column, skipping NaN ratings.

    Uses binary distance for the 'nonsense' column and interval distance
    for every other column.
    """
    distance = binary_distance if col == 'nonsense' else interval_distance

    data = []
    for _, row in annos.iterrows():
        if np.isnan(row[col]):
            continue  # unrated cell — leave it out of the task
        item = '{participant_id}-{block}-{sent_idx}'.format(**row)
        data.append((row.WorkerId, item, row[col]))

    return AnnotationTask(data, distance=distance).alpha()
def run_closed_class_jaccard_and_masi(cls, df: pd.DataFrame) -> Dict:
    """Compute Krippendorff's alpha (Jaccard and MASI distances) for every
    closed-class column.

    :param df: raw annotation frame
    :return: mapping column -> {'df', 'alpha_jaccard', 'alpha_masi'}
    """
    iaa_by_column = {}
    for column in cls.CLOSED_CLASS_COLUMNS:
        column_df = extract_iaa_df_by_column_name(df, column)
        results = {"df": column_df}
        # One task per distance metric; records are re-extracted per task,
        # mirroring the original call pattern.
        for result_key, dist_fn in (("alpha_jaccard", jaccard_distance),
                                    ("alpha_masi", masi_distance)):
            task = AnnotationTask(distance=dist_fn)
            task.load_array(extract_records_for_nltk(column_df))
            results[result_key] = task.alpha()
        iaa_by_column[column] = results
    return iaa_by_column
def kappa(self):
    """Data is a list of list. Each element is a list :
    [annotator, element, label]

    Skips item pairs where either annotation is missing; compares the two
    annotators on the currently selected feature (self.last).
    """
    if not KAPPA:
        return 'Not installed'

    data = []
    item_no = 1
    for elem in self.ano:
        first, second = elem[1], elem[2]
        if first is None or second is None:
            continue  # incomplete pair — not comparable
        data.append([self.ano1.get_code(), item_no, first.get(self.last)])
        data.append([self.ano2.get_code(), item_no, second.get(self.last)])
        item_no += 1

    return AnnotationTask(data).kappa()
def calculate_round_kappa(round_estimates=None):
    """Fleiss' multi-kappa for one round of effort estimates.

    Each estimate is treated as a rating of the same single item by a
    distinct coder; category distance is the normalized rank difference.

    :param round_estimates: iterable of estimate values (1.0, 4.0, 8.0,
        20.0, 40.0, 80.0 hours, or -1.0 for "more than two weeks")
    :return: the multi-kappa agreement level
    """
    from nltk.metrics.agreement import AnnotationTask

    # None sentinel instead of a mutable [] default (shared across calls).
    if round_estimates is None:
        round_estimates = []

    # Calculating the distance between two different estimate categories,
    # returning the difference ratio.
    def distance_cal(v1, v2):
        # all estimate categories: 1 hour, half a day, one day, half a week,
        # one week, two weeks, and more than two weeks (-1)
        labels = ['1.0', '4.0', '8.0', '20.0', '40.0', '80.0', '-1.0']
        i1 = labels.index(v1)
        i2 = labels.index(v2)
        return abs(i1 - i2) / 6

    # One coder per estimate, all rating item 1.
    data = [["c" + str(i), 1, str(estimate)]
            for i, estimate in enumerate(round_estimates, start=1)]
    task = AnnotationTask(data=data, distance=distance_cal)
    agreement_level = task.multi_kappa()
    return agreement_level
def getagreement(tpl,datadir,task_type='all'):
    """Get agreement values for annotators in the :data:'tpl' list

    Args:
       tpl (list): combination group of annotators (json file names)
       datadir (str): Cache data directory used by joblib; per-annotator
           label-count YAML files are also written here

    Returns:
       dict mapping the frozenset of annotator names to a namedtuple
       defined as ``Agree = collections.namedtuple('Agree', ['kappa', 'alpha','avg_ao'], verbose=True)``
    """
    # joblib disk cache so repeated json parsing/counting is memoized.
    mem = Memory(cachedir=datadir)
    readjson=mem.cache(json2taskdata.readjson,mmap_mode='r')
    create_task_data= mem.cache(json2taskdata.create_task_data)
    count_occurrances=mem.cache(json2taskdata.count_occurrances)
    count_labels=mem.cache(json2taskdata.count_labels)

    annotators=set()
    lectask=[]
    #-------------------------------------------------------------------------------
    # for each annotator in group tpl
    #-------------------------------------------------------------------------------
    for stditem in tpl:
        # Annotator name: drop the 3-char prefix of the filename stem and
        # keep its last two characters — assumes the file naming scheme
        # used elsewhere in this project (TODO confirm).
        aname=stditem.split('.')[0][3:][-2:]
        annotators.add(aname)
        lecdict=readjson(stditem)
        newlectask= create_task_data(lecdict,task_type=task_type,annotator=aname)
        label_data=json2taskdata.create_labels_list(newlectask)
        abscount=count_occurrances(str(label_data))
        # NOTE(review): these yaml.dump calls never close their file
        # handles explicitly.
        yaml.dump(abscount,open(os.path.join( datadir,'abscount-'+aname+'.yaml'),'w'))
        setcount=count_labels(newlectask)
        yaml.dump(setcount,open(os.path.join( datadir,'setcount-'+aname+'.yaml'),'w'))
        # Accumulate every annotator's triples into one shared task.
        lectask=lectask+newlectask
    task=AnnotationTask(data=lectask,distance=nltk.metrics.distance.masi_distance_mod)
    return {frozenset(annotators): Agree(task.kappa(),task.alpha(),task.avg_Ao())}
def get_alpha(annotations, annotator, item_format, result, distance_type='interval'):
    """Krippendorff's alpha for one result column, skipping NaN ratings.

    :param annotations: frame of ratings, one row per (annotator, item)
    :param annotator: column holding the coder id
    :param item_format: format string building the item id from each row
    :param result: column holding the rating
    :param distance_type: 'binary' or 'interval'
    :raises KeyError: on any other distance_type
    """
    if distance_type == 'interval':
        distance = interval_distance
    elif distance_type == 'binary':
        distance = binary_distance
    else:
        raise KeyError(f"Unknown type of distance: {distance_type}")

    data = []
    for _, row in annotations.iterrows():
        if np.isnan(row[result]):
            continue  # unrated — excluded from the task
        data.append((row[annotator], item_format.format(**row), row[result]))

    return AnnotationTask(data, distance=distance).alpha()
def alpha_krippendorff(
    answers: pd.DataFrame,
    distance: Callable[[Hashable, Hashable], float] = binary_distance
) -> float:
    """Inter-annotator agreement coefficient (Krippendorff 1980).

    Measures how much annotators agree on label assignments beyond chance.
    Interpretation: alpha >= 0.8 indicates a reliable annotation,
    alpha >= 0.667 allows only tentative conclusions, and lower values
    suggest an unreliable annotation.

    Args:
        answers: A data frame containing `task`, `worker` and `label` columns.
        distance: Distance metric taking two labels and returning a value in
            [0.0, 1.0]. By default: binary_distance (0.0 for equal labels,
            1.0 otherwise).

    Returns:
        Float value.

    Examples:
        Consistent answers.

        >>> alpha_krippendorff(pd.DataFrame.from_records([
        >>>     {'task': 'X', 'worker': 'A', 'label': 'Yes'},
        >>>     {'task': 'X', 'worker': 'B', 'label': 'Yes'},
        >>>     {'task': 'Y', 'worker': 'A', 'label': 'No'},
        >>>     {'task': 'Y', 'worker': 'B', 'label': 'No'},
        >>> ]))
        1.0

        Partially inconsistent answers.

        >>> alpha_krippendorff(pd.DataFrame.from_records([
        >>>     {'task': 'X', 'worker': 'A', 'label': 'Yes'},
        >>>     {'task': 'X', 'worker': 'B', 'label': 'Yes'},
        >>>     {'task': 'Y', 'worker': 'A', 'label': 'No'},
        >>>     {'task': 'Y', 'worker': 'B', 'label': 'No'},
        >>>     {'task': 'Z', 'worker': 'A', 'label': 'Yes'},
        >>>     {'task': 'Z', 'worker': 'B', 'label': 'No'},
        >>> ]))
        0.4444444444444444
    """
    _check_answers(answers)
    # AnnotationTask expects (coder, item, label) triples.
    triples: List[Tuple[Any, Hashable, Hashable]] = \
        answers[['worker', 'task', 'label']].values.tolist()
    return AnnotationTask(triples, distance).alpha()
def get_iaa(project, ac1_name: str, ac2_name: str, tag_filter: list = None,
            filter_both_ac: bool = False, level: str = 'tag',
            distance: str = 'binary'):
    """Computes Inter Annotator Agreement for 2 Annotation Collections.

    Args:
        project (CatmaProject): CatmaProject object
        ac1_name (str): AnnotationCollection name to be compared
        ac2_name (str): AnnotationCollection name to be compared with
        tag_filter (list, optional): Which Tags should be included. If None
            all are included. Default to None.
        filter_both_ac (bool, optional): Whether the tag filter applies to
            both collections. Default to False.
        level (str, optional): Whether the Annotation Tag or a specified
            Property should be compared.
        distance (str, optional): The IAA distance function. Either 'binary'
            or 'interval'. See https://www.nltk.org/api/nltk.metrics.html.
            Default to 'binary'.
    """
    from nltk.metrics.agreement import AnnotationTask
    from nltk.metrics import interval_distance, binary_distance

    # Bug fix: the comparison used the misspelling 'inteval', so callers
    # passing the documented 'interval' silently got binary distance.
    # The historical typo is still accepted for backward compatibility.
    if distance in ('interval', 'inteval'):
        distance_function = interval_distance
    else:
        distance_function = binary_distance

    ac1 = project.ac_dict[ac1_name]
    ac2 = project.ac_dict[ac2_name]

    annotation_pairs = get_annotation_pairs(ac1, ac2, tag_filter=tag_filter,
                                            filter_both_ac=filter_both_ac)
    data = list(get_iaa_data(annotation_pairs, level=level))

    annotation_task = AnnotationTask(data=data, distance=distance_function)
    print(
        textwrap.dedent(f"""
        Scott's pi: {annotation_task.pi()}
        Cohen's Kappa: {annotation_task.kappa()}
        Krippendorf Alpha: {annotation_task.alpha()}
        """))
    return get_confusion_matrix(pair_list=annotation_pairs)
def agree_tags(delta, column):
    """
    Computes agreement coefficients for single-token labelling tasks.

    :param delta: the merged/compared annotation data
    :param column: the column to compute agreement on
    :return: tuple (observed agreement, S, pi, kappa, weighted kappa, alpha)
    """
    task = AnnotationTask(data=reverse_tags(delta, column))
    return (
        task.avg_Ao(),          # observed agreement
        task.S(),               # Bennett, Albert and Goldstein S (1954)
        task.pi(),              # Scott's pi (1955), single distribution
        task.kappa(),           # Cohen's kappa (1960), per-coder distribution
        task.weighted_kappa(),  # weighted kappa (Cohen 1968)
        task.alpha(),           # Krippendorff's alpha (1980)
    )
def compute_agreement(sce_path, nb_turns_per_hit, nb_annotators=None, wo_attention_check=False):
    """Compute Kappa coefficients and Krippendorff's alpha with the nltk
    library (https://www.nltk.org/api/nltk.metrics.html).

    :param sce_path: csv file of annotations
    :param nb_turns_per_hit: number of turns per HIT (used when
        nb_annotators is given)
    :param nb_annotators: if set, group annotations per HIT; otherwise
        group them per annotator
    :param wo_attention_check: if True remove the attention checks
    """
    rows = read_csv(sce_path)
    if nb_annotators:
        annotations = get_annotations(rows, nb_turns_per_hit, wo_attention_check)
    else:
        annotations = get_annotations_per_annotators(rows, wo_attention_check)
    print_annotation_statistics(annotations)
    task_data = annotations2task_data(annotations, nb_annotators)
    rating_task = AnnotationTask(data=task_data, distance=ordinal)
    print(f"Cohen's Kappa: {rating_task.kappa()}")
    print(f"Fleiss' Kappa: {rating_task.multi_kappa()}")
    # Fixed the "ordial" typo in the report label.
    print(f"Krippendorff's alpha with ordinal metric: {rating_task.alpha()}")
def compute_annotator_agreement_nltkmetrics(data_array):
    '''Print the full battery of NLTK agreement coefficients.

    See http://nltk.org/api/nltk.metrics.html#nltk.metrics.agreement

    :param data_array: list of (coder, item, label) triples
    '''
    # Python 3 print() calls (the original used Python 2 print statements).
    print("####### Agreement coefficients according to NLTK metrics.agreement #######")

    t = AnnotationTask(data=data_array)
    print("Average observed agreement across all coders and items: " + str(t.avg_Ao()))
    print("Cohen's Kappa (Cohen 1960): " + str(t.kappa()))
    print("Weighted kappa (Cohen 1968): " + str(t.weighted_kappa()))
    print("Scott's pi (Scott 1955): " + str(t.pi()))
    print("alpha (Krippendorff 1980): " + str(t.alpha()))
    print("Observed disagreement for the alpha coefficient: " + str(t.Do_alpha()))
    print("S (Bennett, Albert and Goldstein 1954): " + str(t.S()))
    print("Observed disagreement for the weighted kappa coefficient averaged over all labelers: " + str(t.Do_Kw()))
''' if ('0' in line): print((annotator, word, '0')) data.append((annotator, '0', word)) if ('1' in line): data.append((annotator, '1', word)) if ('2' in line): data.append((annotator, '2', word)) with open("annotd") as d: content = d.readlines() for line in content: appender('d', line) with open("annots") as d: content = d.readlines() for line in content: appender('s', line) with open("annots") as d: content = d.readlines() for line in content: appender('k', line) task = AnnotationTask(data) #print(task.avg_Ao()) print(task.kappa())
def __init__(self, S):
    """Build the annotation task from the G1 and G2 subdirectories of S
    (resolved relative to this module's directory)."""
    base_dir = os.path.dirname(os.path.realpath(__file__))
    group_dirs = (os.path.join(base_dir, S, "G1"),
                  os.path.join(base_dir, S, "G2"))
    self.annotation_task = AnnotationTask(data=self.__readfile(*group_dirs))
def annotation(output):
    """Print average observed agreement and Cohen's kappa for a file of
    whitespace-separated (coder, item, label) lines.

    :param output: path to the annotation file
    """
    # `with` closes the file handle (the original leaked it), and the
    # prints are Python 3 function calls (were Python 2 statements).
    with open(output) as fh:
        t = AnnotationTask(data=[line.split() for line in fh])
    print("\nAverage observed agreement: " + str(t.avg_Ao()))
    print("\nKappa: " + str(t.kappa()))
def status_view(request, task_id=None):
    """
    Renders the evaluation tasks status page for staff users.

    With a task_id, renders the per-task status page including raw result
    counts and (when more than one coder exists) inter-annotator agreement
    scores; without one, renders the overview of all evaluation tasks.
    """
    LOGGER.info('Rendering evaluation task overview for user "{0}".'.format(
      request.user.username))

    # Check if user is member in WMT13 group. If so, redirect to wmt13 app.
    if request.user.groups.filter(name="WMT13").exists():
        LOGGER.info('Redirecting user "{0}" to WMT13 overview.'.format(
          request.user.username))
        return redirect('appraise.wmt13.views.overview')

    if task_id:
        task = get_object_or_404(EvaluationTask, task_id=task_id)

        headers = task.get_status_header()
        status = []
        for user in task.users.all():
            status.append((user.username, task.get_status_for_user(user)))

        scores = None
        result_data = []
        raw_result_data = Counter()
        users = list(task.users.all())

        # Collect one (user_id, item_id, category) triple per coder for every
        # item; only items answered by ALL coders enter the agreement data.
        for item in EvaluationItem.objects.filter(task=task):
            results = []
            for user in users:
                qset = EvaluationResult.objects.filter(user=user, item=item)
                if qset.exists():
                    category = str(qset[0].results)
                    results.append((user.id, item.id, category))
                    raw_result_data[qset[0].raw_result] += 1
            if len(results) == len(users):
                result_data.extend(results)

        # todo for gisting, calculate - somehow - the percentage of answers
        # against the number of different answers -> in that same gap, and
        # also regroup them for readability
        # Turn raw counts into (key, count, percentage) rows for the template.
        # NOTE(review): _total_results is 0.0 when there are no results, and
        # this division sits OUTSIDE the try/except ZeroDivisionError below --
        # confirm whether that case can occur.
        _raw_results = []
        _keys = raw_result_data.keys()
        _total_results = float(sum(raw_result_data.values()))
        for key in sorted(_keys):
            value = raw_result_data[key]
            _raw_results.append((key, value, 100 * value / _total_results))

        try:
            # Computing inter-annotator agreement only makes sense for more
            # than one coder -- otherwise, we only display result_data...
            if len(users) > 1:
                # Check if we can safely use NLTK's AnnotationTask class.
                # NOTE(review): `chk == 1.0` compares the AnnotationTask
                # OBJECT to a float, which is False on current NLTK, so the
                # AssertionError branch (appraise.utils fallback) is
                # effectively always taken -- confirm whether a method call
                # (e.g. chk.kappa()) was intended.
                try:
                    from nltk.metrics.agreement import AnnotationTask
                    chk = AnnotationTask(data=[('b', '1', 'k'), ('a', '1', 'k')])
                    assert(chk == 1.0)

                except AssertionError:
                    LOGGER.debug('Fixing outdated version of AnnotationTask.')
                    from appraise.utils import AnnotationTask

                # We have to sort annotation data to prevent StopIterator
                # errors.
                result_data.sort()
                annotation_task = AnnotationTask(result_data)

                scores = (
                  annotation_task.alpha(),
                  annotation_task.kappa(),
                  annotation_task.S(),
                  annotation_task.pi()
                )

        except ZeroDivisionError:
            scores = None

        except ImportError:
            scores = None

        dictionary = {
          'combined': task.get_status_for_users(),
          'commit_tag': COMMIT_TAG,
          'headers': headers,
          'scores': scores,
          'raw_results': _raw_results,
          'status': status,
          'task_id': task.task_id,
          'task_name': task.task_name,
          'title': 'Evaluation Task Status',
        }
        return render(request, 'evaluation/status_task.html', dictionary)

    else:
        evaluation_tasks = {}
        for task_type_id, task_type in APPRAISE_TASK_TYPE_CHOICES:
            # We collect a list of task descriptions for this task_type.
            evaluation_tasks[task_type] = []

            # Super users see all EvaluationTask items, even non-active ones.
            if request.user.is_superuser:
                _tasks = EvaluationTask.objects.filter(task_type=task_type_id)

            else:
                _tasks = EvaluationTask.objects.filter(task_type=task_type_id,
                  active=True)

            # Loop over the QuerySet and compute task description data.
            # NOTE(review): dict.has_key() is Python 2 only; this view cannot
            # run unchanged on Python 3.
            for _task in _tasks:
                if not APPRAISE_TASK_CACHE.has_key(_task.task_id):
                    APPRAISE_TASK_CACHE[_task.task_id] = {}

                _cache = APPRAISE_TASK_CACHE[_task.task_id]
                if not _cache.has_key(request.user.username):
                    _update_task_cache(_task, request.user)

                _task_data = _cache[request.user.username]

                # Append new task description to current task_type list.
                evaluation_tasks[task_type].append(_task_data)

            # If there are no tasks descriptions for this task_type, we skip it.
            if len(evaluation_tasks[task_type]) == 0:
                evaluation_tasks.pop(task_type)

        dictionary = {
          'active_page': "STATUS",
          'commit_tag': COMMIT_TAG,
          'evaluation_tasks': evaluation_tasks,
          'title': 'Evaluation Task Status',
        }
        return render(request, 'evaluation/status.html', dictionary)
""" Compute the inter-annotator agreement """ import nltk from nltk.metrics.agreement import AnnotationTask t1 = AnnotationTask(data=[x.split() for x in open("1.txt")]) print t1.kappa() t2 = AnnotationTask(data=[x.split() for x in open("2.txt")]) print t2.kappa()
    # Tail of a function whose definition starts above this chunk: collect
    # removed spans, then drop any tuple whose second field (the span id)
    # was removed, and return the filtered tag list.
    spans_list.append(span)
    print('Number of spans removed from task: ' + str(len(spans_list)))
    tags_list = [tup for tup in res if tup[1] not in spans_list]
    return tags_list

#week1
# Parse the four annotators' XML files and build one combined NLTK task,
# then report Fleiss-style multi_kappa for the week.
a, a_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week1\\a.xml')
b, b_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week1\\b.xml')
c, c_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week1\\c.xml')
d, d_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week1\\d.xml')
tags_list = [tags_to_task(a_tags, 'a'), tags_to_task(b_tags, 'b'),
             tags_to_task(c_tags, 'c'), tags_to_task(d_tags, 'd')]
t_l = combine_data(tags_list)
week1 = AnnotationTask(data=t_l)
print('Week 1 cross tags agreement:')
print(week1.multi_kappa())

#week2
# Same pipeline for the second week's files.
a, a_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week2\\a.xml')
b, b_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week2\\b.xml')
c, c_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week2\\c.xml')
d, d_tags = process_xml('C:\\Brandeis\\Annotations\\annotated_tweets\\week2\\d.xml')
tags_list = [tags_to_task(a_tags, 'a'), tags_to_task(b_tags, 'b'),
             tags_to_task(c_tags, 'c'), tags_to_task(d_tags, 'd')]
t_l = combine_data(tags_list)
week2 = AnnotationTask(data=t_l)
print('Week 2 cross tags agreement:')
print(week2.multi_kappa())

#week3_A
# Load the coder-by-item label matrix; the first CSV column is the item index.
data = pd.read_csv('../input_data/labels-C.csv', sep=';', index_col=0)
allcoders = data.columns
experts = ['KEY', 'MG', 'MS', 'TM']
novices = ['KEY', 'CK', 'GK', 'RM']
# NOTE(review): only `novices` is analysed in this visible portion;
# `allcoders` and `experts` are unused here -- they may be used further
# down the file.
cols = novices

# Total values
# Build one (coder, item, label) triple per cell for the selected coders
# and report four agreement statistics over the whole group.
taskdata = []
for coder in cols:
    for i in data[coder].index:
        taskdata.append([coder, i, data[coder][i]])
ratingtask = AnnotationTask(data=taskdata)
print("kappa " + str(ratingtask.kappa()))
print("fleiss " + str(ratingtask.multi_kappa()))
print("alpha " + str(ratingtask.alpha()))
print("scotts " + str(ratingtask.pi()))

# Pairwise values
# Agreement for every ordered coder pair; identical pairs score 1 by
# definition. (This chunk ends mid-else; the pairwise statistic itself is
# computed below the visible portion.)
similarities = []
for coders in itertools.product(cols, repeat=2):
    if coders[0] == coders[1]:
        similarities.append(1)
    else:
        taskdata = []
        for coder in coders:
            for i in data[coder].index:
                taskdata.append([coder, i, data[coder][i]])
    # Tail of song_row_to_annotation_entries(row) -- its def line is above
    # this chunk. Expands one CSV row into (song, coder, category) triples,
    # one per coder column; the coder id is the column position.
    song = row[0]
    results = row[1:]
    return [(song, coder, category) for (coder, category) in enumerate(results)]


def balify(annotation_entry):
    # Reduce a (song, coder, category) triple to a boolean "can bal" label,
    # reordered into the (coder, item, label) shape AnnotationTask consumes.
    # Category 'b' means bal; '2' apparently means both bal and lindy.
    song, coder, category = annotation_entry
    can_bal = category in ('b', '2')
    return (coder, song, can_bal)


def lindify(annotation_entry):
    # Same as balify but for the lindy label ('l' or '2' -> True).
    song, coder, category = annotation_entry
    can_lindy = category in ('l', '2')
    return (coder, song, can_lindy)


if __name__ == "__main__":
    with open('results.csv', 'r') as csvfile:
        results = []
        csv_rows = list(csv.reader(csvfile))
        for row in csv_rows:
            results += song_row_to_annotation_entries(row)
        # Multi-kappa over the boolean bal/lindy relabelings of the same data.
        bal_annotation = AnnotationTask(data=[balify(r) for r in results])
        lindy_annotation = AnnotationTask(data=[lindify(r) for r in results])
        print("Bal agreement: " + str(bal_annotation.multi_kappa()))
        print("Lindy agreement: " + str(lindy_annotation.multi_kappa()))