def test_fleiss_randolph():
    # reference numbers from the online calculator
    # http://justusrandolph.net/kappa/#dInfo
    table = [[7, 0], [7, 0]]
    assert_equal(fleiss_kappa(table, method='unif'), 1)

    table = [[6.99, 0.01], [6.99, 0.01]]
    # % Overall Agreement 0.996671
    # Fixed Marginal Kappa: -0.166667
    # Free Marginal Kappa: 0.993343
    assert_allclose(fleiss_kappa(table), -0.166667, atol=6e-6)
    assert_allclose(fleiss_kappa(table, method='unif'), 0.993343, atol=6e-6)

    table = [[7, 1], [3, 5]]
    # % Overall Agreement 0.607143
    # Fixed Marginal Kappa: 0.161905
    # Free Marginal Kappa: 0.214286
    assert_allclose(fleiss_kappa(table, method='fleiss'), 0.161905, atol=6e-6)
    assert_allclose(fleiss_kappa(table, method='randolph'), 0.214286, atol=6e-6)

    table = [[7, 0], [0, 7]]
    # % Overall Agreement 1.000000
    # Fixed Marginal Kappa: 1.000000
    # Free Marginal Kappa: 1.000000
    assert_allclose(fleiss_kappa(table), 1)
    assert_allclose(fleiss_kappa(table, method='uniform'), 1)

    table = [[6, 1, 0], [0, 7, 0]]
    # % Overall Agreement 0.857143
    # Fixed Marginal Kappa: 0.708333
    # Free Marginal Kappa: 0.785714
    assert_allclose(fleiss_kappa(table), 0.708333, atol=6e-6)
    assert_allclose(fleiss_kappa(table, method='rand'), 0.785714, atol=6e-6)
def test_fleiss_kappa_irr():
    fleiss = Holder()
    #> r = kappam.fleiss(diagnoses)
    #> cat_items(r, pref="fleiss.")
    fleiss.method = "Fleiss' Kappa for m Raters"
    fleiss.irr_name = 'Kappa'
    fleiss.value = 0.4302445
    fleiss.stat_name = 'z'
    fleiss.statistic = 17.65183
    fleiss.p_value = 0

    data_ = aggregate_raters(diagnoses)[0]
    res1_kappa = fleiss_kappa(data_)
    assert_almost_equal(res1_kappa, fleiss.value, decimal=7)
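# For context, a minimal sketch of the aggregate_raters -> fleiss_kappa workflow
# used above, with made-up ratings (4 subjects, 3 raters, categories 0/1/2):
import numpy as np
from statsmodels.stats.inter_rater import aggregate_raters, fleiss_kappa

ratings = np.array([[0, 0, 1],
                    [1, 1, 1],
                    [2, 2, 0],
                    [0, 1, 2]])
# aggregate_raters turns the (subjects x raters) label matrix into the
# (subjects x categories) count table that fleiss_kappa expects.
table, categories = aggregate_raters(ratings)
print(categories)           # distinct category codes found in the data
print(fleiss_kappa(table))  # Fleiss' kappa with fixed marginals (default method)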
def kappa(y_true, y_pred, type='cohens'):
    import statsmodels.stats.inter_rater as irater

    yy = (y_true & y_pred).sum()
    yn = (y_true & (~y_pred)).sum()
    nn = ((~y_true) & (~y_pred)).sum()
    ny = ((~y_true) & (y_pred)).sum()
    result = np.array([[yy, yn],
                       [ny, nn]])

    if type == 'cohens':
        stat = irater.cohens_kappa(result)
        score = stat['kappa']
    elif type == 'fleiss':
        score = irater.fleiss_kappa(result)
    return score, result
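# Example call of the wrapper above with boolean label vectors; the data here
# is illustrative only, not from the original code.
import numpy as np

y_true = np.array([True, True, False, False, True, False])
y_pred = np.array([True, False, False, False, True, True])
score, confusion = kappa(y_true, y_pred, type='cohens')
print(confusion)  # 2x2 agreement table [[yy, yn], [ny, nn]]
print(score)      # Cohen's kappa treating y_true and y_pred as two raters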
END as verdict, annotation.user, verdict_line.page, verdict_line.line_number,
annotation.id as aid, testing, isOracle, isReval, isTestMode, isOracleMaster, isDiscounted
from annotation
inner join claim on annotation.claim_id = claim.id
left join annotation_verdict on annotation.id = annotation_verdict.annotation_id
left join verdict_line on annotation_verdict.id = verdict_line.verdict_id
where isForReportingOnly = 0 and isTestMode = 0 and testing = 0 and isReval = 1) as a
group by id, user
""")


def row_ct(row):
    # Count how many of a claim's annotations fall into each verdict code (0, 1, 2).
    rowct = []
    for i in range(3):
        rowct.append(row.count(i))
    return rowct


claims = cursor.fetchall()
for claim in claims:
    if claim['verifiable'] == "NOT ENOUGH INFO":
        claims_dict[claim['id']].append(0)
    elif claim['verifiable'] == "VERIFIABLE":
        claims_dict[claim['id']].append(1 if claim["verdict"] == "SUPPORTS" else 2)

# Keep only claims annotated exactly 5 times and build the subjects x categories table.
fkt1 = [row_ct(claims_dict[key]) for key in claims_dict if len(claims_dict[key]) == 5]
print(fkt1)
print(len(fkt1))
print(fleiss_kappa(fkt1))
def reducer_(key, values):
    # key : string of intermediary key
    # load return dict corresponding to mapper output. they need to be loaded.
    # DEBUG
    import glob, mapreduce
    BASE = "/neurospin/brainomics/2013_adni/ADAS11-MCIc-CTL/rndperm"
    INPUT = BASE + "/%i/%s"
    OUTPUT = BASE + "/../results/rndperm"
    keys = ["0.001_0.3335_0.3335_0.333_-1", "0.001_0.5_0_0.5_-1",
            "0.001_0.5_0.5_0_-1", "0.001_1_0_0_-1"]
    for key in keys:
        #key = keys[0]
        paths_5cv_all = [INPUT % (perm, key)
                         for perm in xrange(NFOLDS * NRNDPERMS)]
        idx_5cv_blocks = range(0, (NFOLDS * NRNDPERMS) + NFOLDS, NFOLDS)
        cpt = 0
        qc = dict()
        r2_perms = np.zeros(NRNDPERMS)
        corr_perms = np.zeros(NRNDPERMS)
        r_bar_perms = np.zeros(NRNDPERMS)
        fleiss_kappa_stat_perms = np.zeros(NRNDPERMS)
        dice_bar_perms = np.zeros(NRNDPERMS)
        for perm_i in xrange(len(idx_5cv_blocks) - 1):
            paths_5cv = paths_5cv_all[idx_5cv_blocks[perm_i]:idx_5cv_blocks[perm_i + 1]]
            for p in paths_5cv:
                if os.path.exists(p) and not (p in qc):
                    if p in qc:
                        qc[p] += 1
                    else:
                        qc[p] = 1
                    cpt += 1
            # values = [mapreduce.OutputCollector(p) for p in paths_5cv]
            values = [item.load() for item in values]
            y_true = [item["y_true"].ravel() for item in values]
            y_pred = [item["y_pred"].ravel() for item in values]
            y_true = np.concatenate(y_true)
            y_pred = np.concatenate(y_pred)
            r2 = r2_score(y_true, y_pred)
            corr = np.corrcoef(y_true.ravel(), y_pred.ravel())[0, 1]
            betas = np.hstack([item["beta"] for item in values]).T
            #
            ## Compute beta similarity measures
            #
            # Correlation
            R = np.corrcoef(betas)
            R = R[np.triu_indices_from(R, 1)]
            # Fisher z-transformation / average
            z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
            # back-transform
            r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)
            #
            # threshold betas to compute fleiss_kappa and DICE
            try:
                betas_t = np.vstack([
                    array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
                    for i in xrange(betas.shape[0])
                ])
                print "--", np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1))
                print np.allclose(np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1)),
                                  [0.99] * 5, rtol=0, atol=1e-02)
                #
                # Compute Fleiss' kappa statistics
                beta_signed = np.sign(betas_t)
                table = np.zeros((beta_signed.shape[1], 3))
                table[:, 0] = np.sum(beta_signed == 0, 0)
                table[:, 1] = np.sum(beta_signed == 1, 0)
                table[:, 2] = np.sum(beta_signed == -1, 0)
                fleiss_kappa_stat = fleiss_kappa(table)
                #
                # Pair-wise Dice coefficient
                beta_n0 = betas_t != 0
                ij = [[i, j] for i in xrange(5) for j in xrange(i + 1, 5)]
                #print [[idx[0], idx[1]] for idx in ij]
                dice_bar = np.mean([float(np.sum(beta_signed[idx[0], :] == beta_signed[idx[1], :])) /
                                    (np.sum(beta_n0[idx[0], :]) + np.sum(beta_n0[idx[1], :]))
                                    for idx in ij])
            except:
                dice_bar = fleiss_kappa_stat = 0.
            #
            r2_perms[perm_i] = r2
            corr_perms[perm_i] = corr
            r_bar_perms[perm_i] = r_bar
            fleiss_kappa_stat_perms[perm_i] = fleiss_kappa_stat
            dice_bar_perms[perm_i] = dice_bar
        # END PERMS
        print "save", key
        np.savez_compressed(OUTPUT + "/perms_" + key + ".npz",
                            r2=r2_perms, corr=corr_perms, r_bar=r_bar_perms,
                            fleiss_kappa=fleiss_kappa_stat_perms,
                            dice_bar=dice_bar_perms)
    #
    perms = dict()
    fig, axis = plt.subplots(len(keys), 4)  #, sharex='col')
    for i, key in enumerate(keys):
        perms[key] = np.load(OUTPUT + "/perms_" + key + ".npz")
        n, bins, patches = axis[i, 0].hist(perms[key]['r2'], 50, normed=1,
                                           histtype='stepfilled')
        axis[i, 0].set_title(key + "_r2")
        n, bins, patches = axis[i, 1].hist(perms[key]['r_bar'], 50, normed=1,
                                           histtype='stepfilled')
        axis[i, 1].set_title(key + "_r_bar")
        n, bins, patches = axis[i, 2].hist(perms[key]['fleiss_kappa'], 50,
                                           histtype='stepfilled')
        axis[i, 2].set_title(key + "_fleiss_kappa")
        n, bins, patches = axis[i, 3].hist(perms[key]['dice_bar'], 50)  #, 50, normed=1, histtype='stepfilled')
        axis[i, 3].set_title(key + "_dice_bar")
    plt.show()

    l1l2tv, l1tv, l1l2, l1 = ["0.001_0.3335_0.3335_0.333_-1", "0.001_0.5_0_0.5_-1",
                              "0.001_0.5_0.5_0_-1", "0.001_1_0_0_-1"]
    # Read true scores
    import pandas as pd
    true = pd.read_csv(os.path.join(BASE, "..", "ADAS11-MCIc-CTL.csv"))
    true = true[true.a == 0.001]
    true_l1l2tv = true[true.l1 == 0.3335].iloc[0]
    true_l1l2 = true[(true.l1 == 0.5) & (true.l2 == 0.5)].iloc[0]
    true_l1tv = true[(true.l1 == 0.5) & (true.tv == 0.5)].iloc[0]
    true_l1 = true[(true.l1 == 1.)].iloc[0]

    # pvals
    nperms = float(len(perms[l1]['r2']))
    from collections import OrderedDict
    pvals = OrderedDict()
    pvals["cond"] = ['l1', 'l1tv', 'l1l2', 'l1l2tv'] * 4 + \
                    ['l1 vs l1tv'] * 4 + ['l1l2 vs l1l2tv'] * 4
    pvals["stat"] = ['r2'] * 4 + ['r_bar'] * 4 + ['fleiss_kappa'] * 4 + ['dice_bar'] * 4 + \
                    ['r2', 'r_bar', 'fleiss_kappa', 'dice_bar'] * 2
    pvals["pval"] = [
        np.sum(perms[l1]['r2'] > true_l1["r2"]),
        np.sum(perms[l1tv]['r2'] > true_l1tv["r2"]),
        np.sum(perms[l1l2]['r2'] > true_l1l2["r2"]),
        np.sum(perms[l1l2tv]['r2'] > true_l1l2tv["r2"]),

        np.sum(perms[l1]['r_bar'] > true_l1["beta_r_bar"]),
        np.sum(perms[l1tv]['r_bar'] > true_l1tv["beta_r_bar"]),
        np.sum(perms[l1l2]['r_bar'] > true_l1l2["beta_r_bar"]),
        np.sum(perms[l1l2tv]['r_bar'] > true_l1l2tv["beta_r_bar"]),

        np.sum(perms[l1]['fleiss_kappa'] > true_l1["beta_fleiss_kappa"]),
        np.sum(perms[l1tv]['fleiss_kappa'] > true_l1tv["beta_fleiss_kappa"]),
        np.sum(perms[l1l2]['fleiss_kappa'] > true_l1l2["beta_fleiss_kappa"]),
        np.sum(perms[l1l2tv]['fleiss_kappa'] > true_l1l2tv["beta_fleiss_kappa"]),

        np.sum(perms[l1]['dice_bar'] > true_l1["beta_dice_bar"]),
        np.sum(perms[l1tv]['dice_bar'] > true_l1tv["beta_dice_bar"]),
        np.sum(perms[l1l2]['dice_bar'] > true_l1l2["beta_dice_bar"]),
        np.sum(perms[l1l2tv]['dice_bar'] > true_l1l2tv["beta_dice_bar"]),

        # l1 vs l1tv
        np.sum((perms[l1tv]['r2'] - perms[l1]['r2']) >
               (true_l1tv["r2"] - true_l1["r2"])),
        np.sum((perms[l1tv]['r_bar'] - perms[l1]['r_bar']) >
               (true_l1tv["beta_r_bar"] - true_l1["beta_r_bar"])),
        np.sum((perms[l1tv]['fleiss_kappa'] - perms[l1]['fleiss_kappa']) >
               (true_l1tv["beta_fleiss_kappa"] - true_l1["beta_fleiss_kappa"])),
        np.sum((perms[l1tv]['dice_bar'] - perms[l1]['dice_bar']) >
               (true_l1tv["beta_dice_bar"] - true_l1["beta_dice_bar"])),

        # l1l2 vs l1l2tv
        np.sum((perms[l1l2]['r2'] - perms[l1l2tv]['r2']) >
               (true_l1l2["r2"] - true_l1l2tv["r2"])),
        np.sum((perms[l1l2tv]['r_bar'] - perms[l1l2]['r_bar']) >
               (true_l1l2tv["beta_r_bar"] - true_l1l2["beta_r_bar"])),
        np.sum((perms[l1l2tv]['fleiss_kappa'] - perms[l1l2]['fleiss_kappa']) >
               (true_l1l2tv["beta_fleiss_kappa"] - true_l1l2["beta_fleiss_kappa"])),
        np.sum((perms[l1l2tv]['dice_bar'] - perms[l1l2]['dice_bar']) >
               (true_l1l2tv["beta_dice_bar"] - true_l1l2["beta_dice_bar"]))]

    pvals = pd.DataFrame(pvals)
    pvals["pval"] /= nperms
    pvals.to_csv(os.path.join(OUTPUT, "pvals_stats_permutations.csv"), index=False)
if __name__ == '__main__':
    arguments = docopt(__doc__)
    batch_result_file1 = arguments['--f']
    workers, results = load_results(batch_result_file1)
    N = len(results)
    k = 7
    n = 3
    label_index = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 4, 7: 5, 8: 6}
    mat = np.zeros((N, k))
    for ind, (_, v) in enumerate(results.items()):
        for a in v.values():
            mat[ind][label_index[a[0]]] += 1
    print(fleiss_kappa(mat))

    full_ag = 0
    part_ag = 0
    for row in mat:
        non_zero = len(np.nonzero(row)[0])
        if non_zero == 1:
            full_ag += 1
        elif non_zero == 2:
            part_ag += 1
    print('full agreement', full_ag)
    print('partial agreement', part_ag)
    print('no agreement', len(mat) - full_ag - part_ag)
def reducer(key, values):
    # key : string of intermediary key
    # load return dict corresponding to mapper output. they need to be loaded.
    # DEBUG
    #import glob, mapreduce
    #values = [mapreduce.OutputCollector(p) for p in glob.glob("/neurospin/brainomics/2013_adni/AD-CTL/results/*/0.1_0.0_0.0_1.0_-1.0/")]
    #values = [mapreduce.OutputCollector(p) for p in glob.glob("/home/ed203246/tmp/MCIc-MCInc_cs/results/*/0.1_0.0_0.0_1.0_-1.0/")]
    # values = [mapreduce.OutputCollector(p) for p in glob.glob("/home/ed203246/tmp/MCIc-CTL_cs/results/*/0.1_0.0_1.0_0.0_-1.0/")]
    # values = [mapreduce.OutputCollector(p) for p in glob.glob("/home/ed203246/tmp/MCIc-CTL_cs/results/*/0.1_0.0_0.5_0.5_-1.0/")]
    # Compute sd; i.e. compute results on each fold
    print key
    values = [item.load() for item in values[1:]]
    recall_mean_std = np.std([
        np.mean(precision_recall_fscore_support(
            item["y_true"].ravel(), item["y_pred"])[1])
        for item in values]) / np.sqrt(len(values))
    y_true = [item["y_true"].ravel() for item in values]
    y_pred = [item["y_pred"].ravel() for item in values]
    prob_pred = [item["proba_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    prob_pred = np.concatenate(prob_pred)
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)  # area under curve score
    n_ite = None
    betas = np.hstack([item["beta"] for item in values]).T

    ## Compute beta similarity measures
    # Correlation
    R = np.corrcoef(betas)
    #print R
    R = R[np.triu_indices_from(R, 1)]
    print R
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)
    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in xrange(betas.shape[0])
        ])
        print "--", np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1))
        print np.allclose(np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1)),
                          [0.99] * 5, rtol=0, atol=1e-02)
        # Compute Fleiss' kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)
        # Pair-wise Dice coefficient
        beta_n0 = betas_t != 0
        ij = [[i, j] for i in xrange(5) for j in xrange(i + 1, 5)]
        #print [[idx[0], idx[1]] for idx in ij]
        dice_bar = np.mean([float(np.sum(beta_signed[idx[0], :] == beta_signed[idx[1], :])) /
                            (np.sum(beta_n0[idx[0], :]) + np.sum(beta_n0[idx[1], :]))
                            for idx in ij])
    except:
        dice_bar = fleiss_kappa_stat = 0.

    a, l1, l2, tv, k = key  #[float(par) for par in key.split("_")]
    scores = OrderedDict()
    scores['a'] = a
    scores['l1'] = l1
    scores['l2'] = l2
    scores['tv'] = tv
    left = float(1 - tv)
    if left == 0:
        left = 1.
    scores['l1l2_ratio'] = float(l1) / left
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['recall_mean'] = r.mean()
    scores['recall_mean_std'] = recall_mean_std
    scores['auc'] = auc
    # scores['beta_cor_mean'] = beta_cor_mean
    scores['precision_0'] = p[0]
    scores['precision_1'] = p[1]
    scores['precision_mean'] = p.mean()
    scores['f1_0'] = f[0]
    scores['f1_1'] = f[1]
    scores['f1_mean'] = f.mean()
    scores['support_0'] = s[0]
    scores['support_1'] = s[1]
    # scores['corr'] = corr
    scores['beta_r'] = str(R)
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar
    scores['n_ite'] = n_ite
    scores['k'] = k
    scores['key'] = key
    return scores
4 0 3 9 2 0 0.440
5 2 2 8 1 1 0.330
6 7 7 0 0 0 0.462
7 3 2 6 3 0 0.242
8 2 5 3 2 2 0.176
9 6 5 2 1 0 0.286
10 0 2 2 3 7 0.286'''.split(), float).reshape(10, -1)

Total = np.asarray("20 28 39 21 32".split(), int)
Pj = np.asarray("0.143 0.200 0.279 0.150 0.229".split(), float)
kappa_wp = 0.210

table1 = table0[:, 1:-1]
print(fleiss_kappa(table1))

table4 = np.array([[20, 5], [10, 15]])
print('res', cohens_kappa(table4), 0.4)  #wikipedia

table5 = np.array([[45, 15], [25, 15]])
print('res', cohens_kappa(table5), 0.1304)  #wikipedia

table6 = np.array([[25, 35], [5, 35]])
print('res', cohens_kappa(table6), 0.2593)  #wikipedia
print('res', cohens_kappa(table6, weights=np.arange(2)), 0.2593)  #wikipedia

t7 = np.array([[16, 18, 28], [10, 27, 13], [28, 20, 24]])
print(cohens_kappa(t7, weights=[0, 1, 2]))

table8 = np.array([[25, 35], [5, 35]])
def scores(key, paths, config):
    key_parts = key.split("_")
    values = [mapreduce.OutputCollector(p) for p in paths]
    try:
        values = [item.load() for item in values]
    except Exception as e:
        print(e)
        return None
    y_true_splits = [item["y_true"].ravel() for item in values]
    y_pred_splits = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true_splits)
    y_pred = np.concatenate(y_pred_splits)
    prob_pred_splits = [item["prob_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred_splits)

    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)
    # balanced accuracy (recall_mean)
    bacc_splits = [
        recall_score(y_true_splits[f], y_pred_splits[f], average=None).mean()
        for f in range(len(y_true_splits))
    ]
    auc_splits = [
        roc_auc_score(y_true_splits[f], prob_pred_splits[f])
        for f in range(len(y_true_splits))
    ]
    print("bacc all - mean(bacc) %.3f" % (r.mean() - np.mean(bacc_splits)))

    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0], s[0], 1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1], s[1], prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknwon_prob = binom_test(success[0], s[0], 0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1], s[1], 0.5,
                                             alternative='greater')
    pvalue_bacc = binom_test(success[0] + success[1], s[0] + s[1], p=0.5,
                             alternative='greater')

    # Beta's measures of similarity
    betas = np.hstack([item["beta"][:, penalty_start:].T for item in values]).T
    # Correlation
    R = np.corrcoef(betas)
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)
    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in range(betas.shape[0])
        ])
        # Compute Fleiss' kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)
        # Pair-wise Dice coefficient
        ij = [[i, j] for i in range(betas.shape[0])
              for j in range(i + 1, betas.shape[0])]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(
                float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except:
        dice_bar = fleiss_kappa_stat = 0

    # Proportion of selection within the support across the CV
    support_count = (betas_t != 0).sum(axis=0)
    support_count = support_count[support_count > 0]
    support_prop = support_count / betas_t.shape[0]

    scores = OrderedDict()
    scores['key'] = key
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['bacc'] = r.mean()
    scores['bacc_se'] = np.std(bacc_splits) / np.sqrt(len(bacc_splits))
    scores["auc"] = auc
    scores['auc_se'] = np.std(auc_splits) / np.sqrt(len(auc_splits))
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores['pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob
    scores['pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_bacc_mean'] = pvalue_bacc
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
        float(np.prod(betas.shape))
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar
    scores['beta_dice'] = str(dices)
    scores['beta_r'] = str(R)
    scores['beta_support_prop_select_mean'] = support_prop.mean()
    scores['beta_support_prop_select_sd'] = support_prop.std()
    return scores
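# The signed-coefficient Fleiss' kappa construction used in the reducers above
# recurs throughout these snippets. Below is a minimal, standalone sketch of
# the same idea with made-up data (not the original pipeline): each CV fold is
# treated as a "rater" that assigns every feature to one of three categories
# (null, positive, negative), and agreement of that sign pattern is measured.
import numpy as np
from statsmodels.stats.inter_rater import fleiss_kappa

rng = np.random.default_rng(0)
betas_t = rng.normal(size=(5, 1000))       # 5 folds x 1000 features (hypothetical)
betas_t[np.abs(betas_t) < 1.0] = 0.0       # sparsify, mimicking the thresholding step

beta_signed = np.sign(betas_t)
table = np.zeros((beta_signed.shape[1], 3))
table[:, 0] = np.sum(beta_signed == 0, axis=0)   # folds rating the feature as null
table[:, 1] = np.sum(beta_signed == 1, axis=0)   # folds rating it positive
table[:, 2] = np.sum(beta_signed == -1, axis=0)  # folds rating it negative

print(fleiss_kappa(table))  # agreement of the sign pattern across folds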
2 0 2 6 4 2 0.253
3 0 0 3 5 6 0.308
4 0 3 9 2 0 0.440
5 2 2 8 1 1 0.330
6 7 7 0 0 0 0.462
7 3 2 6 3 0 0.242
8 2 5 3 2 2 0.176
9 6 5 2 1 0 0.286
10 0 2 2 3 7 0.286'''.split(), float).reshape(10, -1)

Total = np.asarray("20 28 39 21 32".split(), int)
Pj = np.asarray("0.143 0.200 0.279 0.150 0.229".split(), float)
kappa_wp = 0.210

table1 = table0[:, 1:-1]
print(fleiss_kappa(table1))

table4 = np.array([[20, 5], [10, 15]])
print('res', cohens_kappa(table4), 0.4)  #wikipedia

table5 = np.array([[45, 15], [25, 15]])
print('res', cohens_kappa(table5), 0.1304)  #wikipedia

table6 = np.array([[25, 35], [5, 35]])
print('res', cohens_kappa(table6), 0.2593)  #wikipedia
print('res', cohens_kappa(table6, weights=np.arange(2)), 0.2593)  #wikipedia

t7 = np.array([[16, 18, 28], [10, 27, 13], [28, 20, 24]])
print(cohens_kappa(t7, weights=[0, 1, 2]))

table8 = np.array([[25, 35], [5, 35]])
print('res', cohens_kappa(table8))
def scores(key, paths, config, ret_y=False):
    import glob, mapreduce
    print key
    values = [mapreduce.OutputCollector(p) for p in paths]
    values = [item.load() for item in values]
    recall_mean_std = np.std([
        np.mean(precision_recall_fscore_support(
            item["y_true"].ravel(), item["y_pred"])[1])
        for item in values]) / np.sqrt(len(values))
    y_true = [item["y_true"].ravel() for item in values]
    y_pred = [item["y_pred"].ravel() for item in values]
    prob_pred = [item["proba_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    prob_pred = np.concatenate(prob_pred)
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)  # area under curve score
    n_ite = None
    betas = np.hstack(
        [item["beta"][config['penalty_start']:, :] for item in values]).T

    ## Compute beta similarity measures
    # Correlation
    R = np.corrcoef(betas)
    #print R
    R = R[np.triu_indices_from(R, 1)]
    print R
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)
    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in xrange(betas.shape[0])
        ])
        #print "--", np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1))
        print np.allclose(np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1)),
                          [0.99] * 5, rtol=0, atol=1e-02)
        # Compute Fleiss' kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)
        # Pair-wise Dice coefficient
        ij = [[i, j] for i in xrange(5) for j in xrange(i + 1, 5)]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(
                float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except:
        dice_bar = fleiss_kappa_stat = 0.

    scores = OrderedDict()
    try:
        a, l1, l2, tv, k = [float(par) for par in key.split("_")]
        scores['a'] = a
        scores['l1'] = l1
        scores['l2'] = l2
        scores['tv'] = tv
        left = float(1 - tv)
        if left == 0:
            left = 1.
        scores['l1_ratio'] = float(l1) / left
        scores['k'] = k
    except:
        pass
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['recall_mean'] = r.mean()
    scores['recall_mean_std'] = recall_mean_std
    scores['auc'] = auc
    # scores['beta_cor_mean'] = beta_cor_mean
    scores['precision_0'] = p[0]
    scores['precision_1'] = p[1]
    scores['precision_mean'] = p.mean()
    scores['f1_0'] = f[0]
    scores['f1_1'] = f[1]
    scores['f1_mean'] = f.mean()
    scores['support_0'] = s[0]
    scores['support_1'] = s[1]
    # scores['corr'] = corr
    scores['beta_r'] = str(R)
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice'] = str(dices)
    scores['beta_dice_bar'] = dice_bar
    scores['n_ite'] = n_ite
    scores['param_key'] = key
    if ret_y:
        scores["y_true"], scores["y_pred"], scores["prob_pred"] = y_true, y_pred, prob_pred
    return scores
question_arr = []
for question, responses in question_responses.iteritems():
    if question not in ["humanlike_text", "correct_text", "strategic_text",
                        "cooperative_text", "fluent_text"]:
        responses = np.array(responses[:5])
        question_arr.append(bin(responses))
        avg = responses.mean()
        median = np.median(responses)
        std = responses.std()
        dialogue_to_stats[dialogue_id][agent_id][question].append(avg)
        dialogue_to_stats[dialogue_id][agent_id][question].append(median)
        dialogue_to_stats[dialogue_id][agent_id][question].append(std)

question_arr = np.array(question_arr)
kappa = fleiss_kappa(question_arr)
dialogue_to_stats[dialogue_id][agent_id]["kappa"].append(kappa)

dialogue_eval_info = []
dialogue_eval_info.append(dialogue_to_agent_mapping)
dialogue_eval_info.append(dialogue_to_responses)
dialogue_eval_info.append(dialogue_to_stats)

scenario_id_to_mappings = defaultdict(list)

# Name of eval results file
eval_results_file = None

# Dump dialogue to average
def main():
    input_file = sys.argv[1]
    fliess_table = generate_fliess_table(input_file)
    print(len(fliess_table))
    print(fleiss_kappa(fliess_table, method='fleiss'))
def test_fleiss_kappa():
    # currently only the example from the Wikipedia page
    kappa_wp = 0.210
    assert_almost_equal(fleiss_kappa(table1), kappa_wp, decimal=3)
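# For context, a self-contained version of the check above. table1 is the
# worked example from the Wikipedia "Fleiss' kappa" article (10 subjects,
# 14 raters, 5 categories); its column totals match the "20 28 39 21 32" /
# Pj values appearing in the table fragments elsewhere in these snippets.
import numpy as np
from numpy.testing import assert_almost_equal
from statsmodels.stats.inter_rater import fleiss_kappa

table1 = np.array([[0, 0, 0, 0, 14],
                   [0, 2, 6, 4, 2],
                   [0, 0, 3, 5, 6],
                   [0, 3, 9, 2, 0],
                   [2, 2, 8, 1, 1],
                   [7, 7, 0, 0, 0],
                   [3, 2, 6, 3, 0],
                   [2, 5, 3, 2, 2],
                   [6, 5, 2, 1, 0],
                   [0, 2, 2, 3, 7]])
assert_almost_equal(fleiss_kappa(table1), 0.210, decimal=3)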
def compute_fleiss_kappa(data):
    """
    Computes label agreement between crowd workers according to Fleiss' Kappa
    w.r.t. relevance of a tweet to the topic and sentiment separately.
    We have M rows representing tweets and N labels (from which a label was
    selected) as the columns of our matrix.
    Fleiss' Kappa is known to be a conservative metric as it sometimes yields
    low agreement although the agreement is quite high in reality, see
    https://link.springer.com/article/10.1007/s11135-014-0003-1#page-1

    Parameters
    ----------
    data: dict - {tid: [label1, label2, label3...]}.

    Returns
    -------
    float, float.
    Fleiss' Kappa between 0 (no agreement) - 1 (perfect agreement) over all labels.
    Fleiss' Kappa between 0 (no agreement) - 1 (perfect agreement) investigating
    annotator agreement w.r.t. relevance only (irrelevant vs. rest).

    """
    # http://www.tau.ac.il/~tsirel/dump/Static/knowino.org/wiki/Fleiss%27_kappa.html
    # print """Agreement levels:
    # < 0          No agreement
    # 0.0 - 0.19   Poor agreement
    # 0.20 - 0.39  Fair agreement
    # 0.40 - 0.59  Moderate agreement
    # 0.60 - 0.79  Substantial agreement
    # 0.80 - 1.00  Almost perfect agreement"""

    ############################
    # 1. Overall Fleiss' Kappa #
    ############################
    mat = np.zeros((len(data), len(LABEL_MAPPING)))
    # For each tweet
    for idx, tid in enumerate(data):
        labels = Counter(data[tid])
        # Count which labels exist for a tweet
        for label in labels:
            # Get the column of the label (= column to update)
            label_col = NUMBER_MAPPING[label]
            # Update the column with the votes of the crowd workers
            mat[idx, label_col] += labels[label]
    kappa_total = fleiss_kappa(mat)
    print "Overall Fleiss kappa:", kappa_total

    ########################################
    # 2. Fleiss' Kappa for tweet relevance #
    ########################################
    # We compare relevant (i.e. assigning a sentiment label) vs. irrelevant
    mat = np.zeros((len(data), 2))
    # Indices of the columns in the matrix for the two labels
    rel_col = 0
    irrel_col = 1
    # For each tweet
    for idx, tid in enumerate(data):
        labels = Counter(data[tid])
        # Count which labels exist for a tweet
        for label in labels:
            # Update the column with the votes of the crowd workers
            # a) Relevant
            if label != "Irrelevant":
                mat[idx, rel_col] += labels[label]
            # b) Irrelevant
            else:
                mat[idx, irrel_col] += labels[label]
    kappa_relevance = fleiss_kappa(mat)
    print "Relevance Fleiss kappa:", kappa_relevance
    return kappa_total, kappa_relevance
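# A hypothetical call of compute_fleiss_kappa. LABEL_MAPPING and NUMBER_MAPPING
# are module-level constants of the original script that are not shown here;
# they are assumed to map each label string to a column index, e.g.:
# LABEL_MAPPING = NUMBER_MAPPING = {"Positive": 0, "Negative": 1,
#                                   "Neutral": 2, "Irrelevant": 3}
data = {
    "tweet_1": ["Positive", "Positive", "Neutral"],
    "tweet_2": ["Irrelevant", "Irrelevant", "Irrelevant"],
    "tweet_3": ["Negative", "Neutral", "Negative"],
}
kappa_total, kappa_relevance = compute_fleiss_kappa(data)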
def compute_overall_scores(coder_df, document_column, outcome_column, coder_column):
    """
    Computes overall inter-rater reliability scores (Krippendorff's Alpha and Fleiss' Kappa). Allows for more than two \
    coders and code values. The input data must consist of a :py:class:`pandas.DataFrame` with the following columns:

    - A column with values that indicate the coder (like a name)
    - A column with values that indicate the document (like an ID)
    - A column with values that indicate the code value

    :param coder_df: A :py:class:`pandas.DataFrame` of codes
    :type coder_df: :py:class:`pandas.DataFrame`
    :param document_column: The column that contains IDs for the documents
    :type document_column: str
    :param outcome_column: The column that contains the codes
    :type outcome_column: str
    :param coder_column: The column containing values that indicate which coder assigned the code
    :type coder_column: str
    :return: A dictionary containing the scores
    :rtype: dict

    Usage::

        from pewanalytics.stats.irr import compute_overall_scores
        import pandas as pd

        df = pd.DataFrame([
            {"coder": "coder1", "document": 1, "code": "2"},
            {"coder": "coder2", "document": 1, "code": "2"},
            {"coder": "coder1", "document": 2, "code": "1"},
            {"coder": "coder2", "document": 2, "code": "2"},
            {"coder": "coder1", "document": 3, "code": "0"},
            {"coder": "coder2", "document": 3, "code": "0"},
        ])

        >>> compute_overall_scores(df, "document", "code", "coder")
        {'alpha': 0.5454545454545454, 'fleiss_kappa': 0.4545454545454544}

    """
    alpha = AnnotationTask(
        data=coder_df[[coder_column, document_column, outcome_column]].values)
    try:
        alpha = alpha.alpha()
    except (ZeroDivisionError, ValueError):
        alpha = None

    grouped = coder_df.groupby(document_column).count()
    complete_docs = grouped[grouped[coder_column] == len(
        coder_df[coder_column].unique())].index
    dataset = coder_df[coder_df[document_column].isin(complete_docs)]
    df = dataset.groupby([outcome_column, document_column]).count()[[coder_column]]
    df = df.unstack(outcome_column).fillna(0)
    if len(df) > 0:
        kappa = fleiss_kappa(df)
    else:
        kappa = None

    return {"alpha": alpha, "fleiss_kappa": kappa}
def scores(key, paths, config, as_dataframe=False, algo_idx=None):
    # print(key, paths)
    # key = 'enettv_0.1_0.5_0.1'
    # paths = ['5cv/cv00/refit/enetgn_0.1_0.9_0.1', '5cv/cv01/refit/enetgn_0.1_0.9_0.1',
    #          '5cv/cv02/refit/enetgn_0.1_0.9_0.1', '5cv/cv03/refit/enetgn_0.1_0.9_0.1',
    #          '5cv/cv04/refit/enetgn_0.1_0.9_0.1']
    key_parts = key.split("_")
    algo = key_parts[algo_idx] if algo_idx is not None else None
    key_parts.remove(algo)
    if len(key_parts) > 0:
        try:
            params = [float(p) for p in key_parts]
        except:
            params = [None, None, None]
    print(algo, params)
    # Commented out because this is a 4 x 5 cross-validation; the check would
    # flag those runs as failed.
    # if (len(paths) != NFOLDS_INNER) or (len(paths) != NFOLDS_OUTER):
    #     print("Failed for key %s" % key)
    #     return None
    values = [mapreduce.OutputCollector(p) for p in paths]
    try:
        values = [item.load() for item in values]
    except Exception as e:
        print(e)
        return None
    y_true_splits = [item["y_true"].ravel() for item in values]
    y_pred_splits = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true_splits)
    y_pred = np.concatenate(y_pred_splits)
    prob_pred_splits = [item["proba_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred_splits)

    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)
    # balanced accuracy (recall_mean)
    bacc_splits = [
        recall_score(y_true_splits[f], y_pred_splits[f], average=None).mean()
        for f in range(len(y_true_splits))
    ]
    auc_splits = [
        roc_auc_score(y_true_splits[f], prob_pred_splits[f])
        for f in range(len(y_true_splits))
    ]
    print("bacc all - mean(bacc) %.3f" % (r.mean() - np.mean(bacc_splits)))

    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0], s[0], 1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1], s[1], prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknwon_prob = binom_test(success[0], s[0], 0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1], s[1], 0.5,
                                             alternative='greater')
    pvalue_bacc = binom_test(success[0] + success[1], s[0] + s[1], p=0.5,
                             alternative='greater')

    # Beta's measures of similarity
    betas = np.hstack([item["beta"][penalty_start:, :] for item in values]).T
    # Correlation
    R = np.corrcoef(betas)
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)
    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in range(betas.shape[0])
        ])
        # Compute Fleiss' kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)
        # Pair-wise Dice coefficient
        ij = [[i, j] for i in range(betas.shape[0])
              for j in range(i + 1, betas.shape[0])]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(
                float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except:
        dice_bar = fleiss_kappa_stat = 0

    # Proportion of selection within the support across the CV
    support_count = (betas_t != 0).sum(axis=0)
    support_count = support_count[support_count > 0]
    support_prop = support_count / betas_t.shape[0]

    scores = OrderedDict()
    scores['key'] = key
    scores['algo'] = algo
    scores['a'], scores['l1_ratio'], scores['tv_ratio'] = params
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['bacc'] = r.mean()
    scores['bacc_se'] = np.std(bacc_splits) / np.sqrt(len(bacc_splits))
    scores["auc"] = auc
    scores['auc_se'] = np.std(auc_splits) / np.sqrt(len(auc_splits))
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores['pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob
    scores['pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_bacc_mean'] = pvalue_bacc
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
        float(np.prod(betas.shape))
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar
    scores['beta_dice'] = str(dices)
    scores['beta_r'] = str(R)
    scores['beta_support_prop_select_mean'] = support_prop.mean()
    scores['beta_support_prop_select_sd'] = support_prop.std()
    if as_dataframe:
        scores = pd.DataFrame([list(scores.values())],
                              columns=list(scores.keys()))
    return scores
def calculate_inter_annotator_agreement(annotations):
    matrix = get_annotations_matrix(annotations)
    fleiss_kappa_score = fleiss_kappa(matrix)
    return fleiss_kappa_score
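# get_annotations_matrix is not shown in the snippet above. A minimal sketch of
# what it could look like, assuming `annotations` is a list of per-item label
# lists (one label per annotator); the helper name and input format are
# assumptions, not part of the original code.
import numpy as np
from statsmodels.stats.inter_rater import aggregate_raters

def get_annotations_matrix(annotations):
    # rows = items, columns = annotators, values = category labels
    ratings = np.asarray(annotations)
    table, _ = aggregate_raters(ratings)  # -> items x categories count table
    return table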
4 0 3 9 2 0 0.440
5 2 2 8 1 1 0.330
6 7 7 0 0 0 0.462
7 3 2 6 3 0 0.242
8 2 5 3 2 2 0.176
9 6 5 2 1 0 0.286
10 0 2 2 3 7 0.286'''.split(), float).reshape(10, -1)

Total = np.asarray("20 28 39 21 32".split(), int)
Pj = np.asarray("0.143 0.200 0.279 0.150 0.229".split(), float)
kappa_wp = 0.210

table1 = table0[:, 1:-1]
print fleiss_kappa(table1)

table4 = np.array([[20, 5], [10, 15]])
print 'res', cohens_kappa(table4), 0.4  #wikipedia

table5 = np.array([[45, 15], [25, 15]])
print 'res', cohens_kappa(table5), 0.1304  #wikipedia

table6 = np.array([[25, 35], [5, 35]])
print 'res', cohens_kappa(table6), 0.2593  #wikipedia
print 'res', cohens_kappa(table6, weights=np.arange(2)), 0.2593  #wikipedia

t7 = np.array([[16, 18, 28], [10, 27, 13], [28, 20, 24]])
print cohens_kappa(t7, weights=[0, 1, 2])

table8 = np.array([[25, 35], [5, 35]])
def scores(key, paths, config, as_dataframe=False):
    import mapreduce
    print(key)
    if (len(paths) != NFOLDS_INNER) or (len(paths) != NFOLDS_OUTER):
        print("Failed for key %s" % key)
        return None
    values = [mapreduce.OutputCollector(p) for p in paths]
    values = [item.load() for item in values]
    y_true = [item["y_true"].ravel() for item in values]
    y_pred = [item["y_pred"].ravel() for item in values]
    y_true = np.concatenate(y_true)
    y_pred = np.concatenate(y_pred)
    prob_pred = [item["proba_pred"].ravel() for item in values]
    prob_pred = np.concatenate(prob_pred)

    # Prediction performances
    p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None)
    auc = roc_auc_score(y_true, prob_pred)  # area under curve score

    # P-values
    success = r * s
    success = success.astype('int')
    prob_class1 = np.count_nonzero(y_true) / float(len(y_true))
    pvalue_recall0_true_prob = binom_test(success[0], s[0], 1 - prob_class1,
                                          alternative='greater')
    pvalue_recall1_true_prob = binom_test(success[1], s[1], prob_class1,
                                          alternative='greater')
    pvalue_recall0_unknwon_prob = binom_test(success[0], s[0], 0.5,
                                             alternative='greater')
    pvalue_recall1_unknown_prob = binom_test(success[1], s[1], 0.5,
                                             alternative='greater')
    pvalue_recall_mean = binom_test(success[0] + success[1], s[0] + s[1], p=0.5,
                                    alternative='greater')

    # Beta's measures of similarity
    betas = np.hstack([item["beta"][penalty_start:, :] for item in values]).T
    # Correlation
    R = np.corrcoef(betas)
    #print R
    R = R[np.triu_indices_from(R, 1)]
    # Fisher z-transformation / average
    z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R)))
    # back-transform
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)
    # threshold betas to compute fleiss_kappa and DICE
    try:
        betas_t = np.vstack([
            array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0]
            for i in range(betas.shape[0])
        ])
        #print "--", np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1))
        #print(np.allclose(np.sqrt(np.sum(betas_t ** 2, 1)) / np.sqrt(np.sum(betas ** 2, 1)), [0.99]*5,
        #                  rtol=0, atol=1e-02))
        # Compute Fleiss' kappa statistics
        beta_signed = np.sign(betas_t)
        table = np.zeros((beta_signed.shape[1], 3))
        table[:, 0] = np.sum(beta_signed == 0, 0)
        table[:, 1] = np.sum(beta_signed == 1, 0)
        table[:, 2] = np.sum(beta_signed == -1, 0)
        fleiss_kappa_stat = fleiss_kappa(table)
        # Pair-wise Dice coefficient
        ij = [[i, j] for i in range(betas.shape[0])
              for j in range(i + 1, betas.shape[0])]
        dices = list()
        for idx in ij:
            A, B = beta_signed[idx[0], :], beta_signed[idx[1], :]
            dices.append(
                float(np.sum((A == B)[(A != 0) & (B != 0)])) /
                (np.sum(A != 0) + np.sum(B != 0)))
        dice_bar = np.mean(dices)
    except:
        dice_bar = fleiss_kappa_stat = 0

    scores = OrderedDict()
    scores['key'] = key
    try:
        a, l1, l2, tv = [float(par) for par in key.split("_")]
        scores['a'] = a
        scores['l1'] = l1
        scores['l2'] = l2
        scores['tv'] = tv
        left = float(1 - tv)
        if left == 0:
            left = 1.
        scores['l1_ratio'] = float(l1) / left
    except:
        pass
    scores['recall_0'] = r[0]
    scores['recall_1'] = r[1]
    scores['recall_mean'] = r.mean()
    scores["auc"] = auc
    scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob
    scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob
    scores['pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob
    scores['pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob
    scores['pvalue_recall_mean'] = pvalue_recall_mean
    scores['prop_non_zeros_mean'] = float(np.count_nonzero(betas_t)) / \
        float(np.prod(betas.shape))
    scores['beta_r_bar'] = r_bar
    scores['beta_fleiss_kappa'] = fleiss_kappa_stat
    scores['beta_dice_bar'] = dice_bar
    scores['beta_dice'] = str(dices)
    scores['beta_r'] = str(R)
    if as_dataframe:
        scores = pd.DataFrame([list(scores.values())],
                              columns=list(scores.keys()))
    return scores
df_tmp = df_count_agreement[df_count_agreement.task_id == task].iloc[:, 1:3]
df_merged = pd.merge_ordered(df_tmp, df_agreement_frame, fill_method='ffill',
                             right_by="answer_csagreement", how="left")
list_count_agreement.append(df_merged['size'].tolist())

df_kappaTable = pd.DataFrame(list_count_agreement)
df_disagree = df_kappaTable[0] + df_kappaTable[1]
df_agree = df_kappaTable[2] + df_kappaTable[3]
df_kappaTable = pd.concat([df_disagree, df_agree], axis=1)
ir.fleiss_kappa(df_kappaTable, method='fleiss')

#%%
df_kappaTable.columns = ['disagree', 'agree']
df_kappaTable

#%% [markdown]
# ## Evaluation effort

#%%
answer_no_dummy_dt
labels = ['DT', 'No DT']
answerDtMinutes = answer_no_dummy_dt.secondsToAnswer
def analyze_interrater_reliability(phase, labels):
    results_csv = pd.DataFrame()
    labels_list = labels
    metadata_phase = pd.read_csv(
        f'{dataset_location}/metadata_phase_{phase}.csv')
    all_images = metadata_phase['image'].unique()

    # Convert the metadata table to a more compact format to use for calculations.
    full_array = np.full([5, len(all_images), len(labels)], True)
    for i, image in enumerate(all_images):
        j = 0
        for _, row in metadata_phase[metadata_phase['image'] == image].iterrows():
            for k, label in enumerate(labels):
                if label.lower() in ['support devices', 'quality issue']:
                    full_array[j, i, k] = row[label]
                else:
                    # For labels that had a certainty chosen, only consider the
                    # label as present for Possibly or higher.
                    full_array[j, i, k] = row[label] >= 3
            j += 1
        assert (j == 5)

    # Calculate Fleiss' kappa for every label, as shown in part of Table 2.
    for k in range(len(labels)):
        array_to_use = full_array
        # numpy.savetxt(f"rating_{labels[k].replace('/','_').replace(' ','_').lower()}_phase_{phase}.csv", convert_to_stats_table(array_to_use[:,:,k], 5), delimiter=",")
        # numpy.savetxt(f"rating_2_{labels[k].replace('/','_').replace(' ','_').lower()}_phase_{phase}.csv", array_to_use[:,:,k], delimiter=",")
        table_answers = convert_to_stats_table(array_to_use[:, :, k], 5)
        value = inter_rater.fleiss_kappa(table_answers, method='fleiss')
        # Calculate the Fleiss' kappa standard error using the equation from
        # the original paper (Fleiss, 1971).
        se = fleiss_kappa_standard_error(table_answers)
        new_row = {
            'label': labels_list[k],
            'title': 'Fleiss Kappa',
            'value': value
        }
        results_csv = results_csv.append(new_row, ignore_index=True)
        new_row = {
            'label': labels_list[k],
            'title': 'Fleiss Kappa standard error',
            'value': se
        }
        results_csv = results_csv.append(new_row, ignore_index=True)
        # Calculate how high the Fleiss' kappa would be without the answers for
        # each specific chest x-ray, to understand which cases were the worst
        # for that label.
        for trial_index, _ in enumerate(all_images):
            value = inter_rater.fleiss_kappa(convert_to_stats_table(
                np.delete(array_to_use[:, :, k], trial_index, axis=1), 5),
                method='fleiss')
            new_row = {
                'label': labels_list[k],
                'trial': trial_index,
                'title': 'Fleiss Kappa (except trial)',
                'value': value
            }
            results_csv = results_csv.append(new_row, ignore_index=True)

    # Get IoU for chest bounding boxes, used to calculate the numbers presented in
    # Technical Validation > Validation Labels > Chest bounding boxes.
    for image_index, image in enumerate(all_images):
        this_case = metadata_phase[metadata_phase['image'] == image]
        all_chest_boxes = []
        for id in this_case['id'].values:
            chest_box_table = pd.read_csv(
                f'{dataset_location}/{id}/chest_bounding_box.csv')
            chest_box_coordinates = chest_box_table.values[0]
            assert (len(chest_box_coordinates) == 4)
            all_chest_boxes.append(chest_box_coordinates)
        for index_1 in range(len(all_chest_boxes)):
            for index_2 in range(len(all_chest_boxes)):
                if index_1 != index_2:
                    value = get_iou([all_chest_boxes[index_1]],
                                    [all_chest_boxes[index_2]], create_box)
                    new_row = {
                        'trial': image_index,
                        'title': 'Chest Box IoU',
                        'value': value
                    }
                    results_csv = results_csv.append(new_row, ignore_index=True)

    # Get IoU for drawn ellipses, used to calculate part of Table 2.
    for k in range(len(labels_list)):
        print(labels_list[k])
        for image_index, image in enumerate(all_images):
            ellipses_iou_k = []
            this_case = metadata_phase[metadata_phase['image'] == image]
            ellipses = []
            for id in this_case['id'].values:
                ellipse_table = pd.read_csv(
                    f'{dataset_location}/{id}/anomaly_location_ellipses.csv')
                # Only use labels with certainty Possibly or higher.
                ellipse_table = ellipse_table[ellipse_table['certainty'] > 2]
                # Only use the currently selected label.
                ellipse_table = ellipse_table[ellipse_table[labels_list[k]]]
                if len(ellipse_table) > 0:
                    ellipses.append(
                        ellipse_table[['xmin', 'ymin', 'xmax', 'ymax']].values)
                else:
                    ellipses.append([])
            for user_index in range(len(ellipses)):
                # Compute the IoU of the label bounding boxes for every pair of
                # users who drew at least one ellipse for this label.
                for user_index_2 in range(len(ellipses)):
                    if user_index_2 != user_index:
                        if len(ellipses[user_index]) > 0 and len(
                                ellipses[user_index_2]) > 0:
                            value = get_iou(ellipses[user_index],
                                            ellipses[user_index_2],
                                            create_ellipse)
                            ellipses_iou_k.append(value)
            # Calculate the average IoU over all the readings of this specific
            # chest x-ray and label.
            if len(ellipses_iou_k) > 0:
                average_iou = np.mean(ellipses_iou_k)
                new_row = {
                    'label': labels_list[k],
                    'trial': image_index,
                    'title': 'Ellipse IoU',
                    'value': average_iou
                }
                results_csv = results_csv.append(new_row, ignore_index=True)

    results_csv.to_csv(f'interrater_phase_{phase}.csv', index=False)
def reducer(key, values):
    global N_COMP, N_FOLDS
    # N_FOLDS is the number of true folds (not the number of resamplings)
    # key : string of intermediary key
    # load return dict corresponding to mapper output. they need to be loaded.
    # Avoid taking into account the fold 0
    values = [item.load() for item in values[1:]]

    # Load components: each file is a 4096 x N_COMP matrix.
    # We stack them on the third dimension (folds).
    components = np.dstack([item["components"] for item in values])
    # Thresholded components (list of tuples (comp, threshold))
    thresh_components = np.empty(components.shape)
    thresholds = np.empty((N_COMP, N_FOLDS))
    for l in range(N_FOLDS):
        for k in range(N_COMP):
            thresh_comp, t = array_utils.arr_threshold_from_norm2_ratio(
                components[:, k, l], .99)
            thresh_components[:, k, l] = thresh_comp
            thresholds[k, l] = t

    frobenius_train = np.vstack([item["frobenius_train"] for item in values])
    frobenius_test = np.vstack([item["frobenius_test"] for item in values])
    l0 = np.vstack([item["l0"] for item in values])
    l1 = np.vstack([item["l1"] for item in values])
    l2 = np.vstack([item["l2"] for item in values])
    tv = np.vstack([item["tv"] for item in values])
    evr_train = np.vstack([item["evr_train"] for item in values])
    evr_test = np.vstack([item["evr_test"] for item in values])
    times = [item["time"] for item in values]

    # Average precision/recall across folds for each component
    av_frobenius_train = frobenius_train.mean(axis=0)
    av_frobenius_test = frobenius_test.mean(axis=0)
    av_evr_train = evr_train.mean(axis=0)
    av_evr_test = evr_test.mean(axis=0)
    av_l0 = l0.mean(axis=0)
    av_l1 = l1.mean(axis=0)
    av_l2 = l2.mean(axis=0)
    av_tv = tv.mean(axis=0)

    # Compute correlations of components between all folds
    n_corr = N_FOLDS * (N_FOLDS - 1) / 2
    correlations = np.zeros((N_COMP, n_corr))
    for k in range(N_COMP):
        R = np.corrcoef(np.abs(components[:, k, :].T))
        # Extract interesting coefficients (upper-triangle)
        correlations[k] = R[np.triu_indices_from(R, 1)]
    # Transform to z-score
    Z = 1. / 2. * np.log((1 + correlations) / (1 - correlations))
    # Average for each component
    z_bar = np.mean(Z, axis=1)
    # Transform back to average correlation for each component
    r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1)

    # Compute fleiss_kappa and DICE on thresholded components
    fleiss_kappas = np.empty(N_COMP)
    dice_bars = np.empty(N_COMP)
    for k in range(N_COMP):
        # One component across folds
        thresh_comp = thresh_components[:, k, :]
        try:
            # Compute Fleiss' kappa statistics.
            # The "raters" are the folds and we have 3 variables:
            #  - number of null coefficients
            #  - number of > 0 coefficients
            #  - number of < 0 coefficients
            # We build a (N_FOLDS, 3) table
            thresh_comp_signed = np.sign(thresh_comp)
            table = np.zeros((N_FOLDS, 3))
            table[:, 0] = np.sum(thresh_comp_signed == 0, 0)
            table[:, 1] = np.sum(thresh_comp_signed == 1, 0)
            table[:, 2] = np.sum(thresh_comp_signed == -1, 0)
            fleiss_kappa_stat = fleiss_kappa(table)
        except:
            fleiss_kappa_stat = 0.
        fleiss_kappas[k] = fleiss_kappa_stat
        try:
            # Pair-wise DICE coefficient (there is the same number as
            # pair-wise correlations)
            thresh_comp_n0 = thresh_comp != 0
            # Index of lines (folds) to use
            ij = [[i, j] for i in xrange(N_FOLDS)
                  for j in xrange(i + 1, N_FOLDS)]
            num = [
                np.sum(thresh_comp[idx[0], :] == thresh_comp[idx[1], :])
                for idx in ij
            ]
            denom = [(np.sum(thresh_comp_n0[idx[0], :]) +
                      np.sum(thresh_comp_n0[idx[1], :])) for idx in ij]
            dices = np.array([float(num[i]) / denom[i] for i in range(n_corr)])
            dice_bar = dices.mean()
        except:
            dice_bar = 0.
        dice_bars[k] = dice_bar

    scores = OrderedDict((
        ('model', key[0]),
        ('global_pen', key[1]),
        ('tv_ratio', key[2]),
        ('l1_ratio', key[3]),
        ('frobenius_train', av_frobenius_train[0]),
        ('frobenius_test', av_frobenius_test[0]),
        ('correlation_0', r_bar[0]),
        ('correlation_1', r_bar[1]),
        ('correlation_2', r_bar[2]),
        ('correlation_mean', np.mean(r_bar)),
        ('kappa_0', fleiss_kappas[0]),
        ('kappa_1', fleiss_kappas[1]),
        ('kappa_2', fleiss_kappas[2]),
        ('kappa_mean', np.mean(fleiss_kappas)),
        ('dice_bar_0', dice_bars[0]),
        ('dice_bar_1', dice_bars[1]),
        ('dice_bar_2', dice_bars[2]),
        ('dice_bar_mean', np.mean(dice_bars)),
        ('evr_train_0', av_evr_train[0]),
        ('evr_train_1', av_evr_train[1]),
        ('evr_train_2', av_evr_train[2]),
        ('evr_test_0', av_evr_test[0]),
        ('evr_test_1', av_evr_test[1]),
        ('evr_test_2', av_evr_test[2]),
        ('l0_0', av_l0[0]),
        ('l0_1', av_l0[1]),
        ('l0_2', av_l0[2]),
        ('l1_0', av_l1[0]),
        ('l1_1', av_l1[1]),
        ('l1_2', av_l1[2]),
        ('l2_0', av_l2[0]),
        ('l2_1', av_l2[1]),
        ('l2_2', av_l2[2]),
        ('tv_0', av_tv[0]),
        ('tv_1', av_tv[1]),
        ('tv_2', av_tv[2]),
        ('time', np.mean(times))))
    return scores