def f_measure():
    """Report feature pairs whose F-measure correlates with a target column.

    For each target column (positions 1-4 of the module-level ``df``) and each
    pair of distinct feature columns (positions 5-12), the pairwise F-measure
    ``2 * x * x2 / (x + x2)`` is compared with the target via Kendall's tau.
    Every pair whose tau is not below 0.40 is printed, followed by the best
    pair found over all targets.

    Reads the module-level names ``df`` and ``kendalltau``.
    Returns ``None``; the results are only printed.
    """
    import itertools

    min_tau = 0.40
    max_val = 0
    best = None  # (first feature, second feature, target) names of the best pair
    for y_col in [1, 2, 3, 4]:
        y = df.iloc[:, y_col]
        # combinations() yields exactly the index pairs with x_col < x_col_2.
        for x_col, x_col_2 in itertools.combinations(range(5, 13), 2):
            x = df.iloc[:, x_col]
            x2 = df.iloc[:, x_col_2]
            f = 2 * (x * x2) / (x + x2)
            # Evaluate the statistic once and reuse the result below.
            result = kendalltau(f, y)
            val = result[0]
            if val < min_tau:
                continue
            names = (df.columns[x_col], df.columns[x_col_2], df.columns[y_col])
            if val > max_val:
                max_val = val
                best = names
            print("%s %s %s %s" % (names + (result,)))
    print("max")
    if best is None:
        # Without this guard the summary line would fail on undefined names.
        print("no qualifying pair found")
    else:
        print("%s %s %s  :  %s" % (best + (max_val,)))
def getBirthDeadCor(birthAndDeadFileName):
    '''
    Explore the correlation between firm death and firm emergence.

    The file is read line by line after skipping the header. Each record has
    the form ``month###birth###dead``. Reading stops after the first record
    whose month is 201001 or later, so later records are ignored.

    The Spearman, Pearson and Kendall results are printed.

    Parameters:
        birthAndDeadFileName: str
            The file of firm birth and dead in every month.

    Return:
        coefficient: float
            Spearman correlation coefficient between firm emergence and
            firm death.
    '''
    seq1, seq2 = [], []
    # The context manager closes the file even when a record fails to parse.
    with open(birthAndDeadFileName, 'r') as birthAndDeadFile:
        next(birthAndDeadFile, None)  # skip the header line
        for birthDeadRecord in birthAndDeadFile:
            if not birthDeadRecord.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            fields = birthDeadRecord.split('###')
            seq1.append(int(fields[1]))
            seq2.append(int(fields[2]))
            if int(fields[0]) >= 201001:
                break
    spearman = spearmanr(seq1, seq2)
    print(spearman)
    print(pearsonr(seq1, seq2))
    print(kendalltau(seq1, seq2))
    return spearman[0]
def test_nancorr_kendall(self):
    """Check ``nanops.nancorr`` with ``method="kendall"`` against SciPy.

    The reference values are SciPy's Kendall tau for the array pair and for
    its flattened view, first for the 2-D fixtures and then for the 1-D ones.
    The test is skipped when ``scipy.stats`` is unavailable.
    """
    tm.skip_if_no_package("scipy.stats")
    from scipy.stats import kendalltau

    def tau_of(left, right):
        # Only the statistic is compared, not the p-value.
        return kendalltau(left, right)[0]

    expected = tau_of(self.arr_float_2d, self.arr_float1_2d)
    expected_flat = tau_of(self.arr_float_2d.flat, self.arr_float1_2d.flat)
    self.check_nancorr_nancov_2d(
        nanops.nancorr, expected, expected_flat, method="kendall"
    )

    expected = tau_of(self.arr_float_1d, self.arr_float1_1d)
    expected_flat = tau_of(self.arr_float_1d.flat, self.arr_float1_1d.flat)
    self.check_nancorr_nancov_1d(
        nanops.nancorr, expected, expected_flat, method="kendall"
    )
def test_nancorr_kendall(self):
    """Check ``nanops.nancorr`` with ``method='kendall'`` against SciPy.

    The reference values are SciPy's Kendall tau for the array pair and for
    its flattened view, first for the 2-D fixtures and then for the 1-D ones.
    """
    from scipy.stats import kendalltau

    def tau_of(left, right):
        # Only the statistic is compared, not the p-value.
        return kendalltau(left, right)[0]

    expected = tau_of(self.arr_float_2d, self.arr_float1_2d)
    expected_flat = tau_of(self.arr_float_2d.flat, self.arr_float1_2d.flat)
    self.check_nancorr_nancov_2d(
        nanops.nancorr, expected, expected_flat, method='kendall'
    )

    expected = tau_of(self.arr_float_1d, self.arr_float1_1d)
    expected_flat = tau_of(self.arr_float_1d.flat, self.arr_float1_1d.flat)
    self.check_nancorr_nancov_1d(
        nanops.nancorr, expected, expected_flat, method='kendall'
    )
def determineVariance(self,runs):
    """Estimate the variance of the fitted parameters over regenerated data.

    This method determines the variance of parameter values caused by the
    stochastic nature of the data. Data is generated ``runs`` times,
    redetermining only the labels, and the model is fitted each time.
    Each repetition works on a deep copy of this model, so ``self`` is not
    refitted.

    Parameters
    ----------
    runs : int
        Number of regenerate-and-refit repetitions.

    Returns
    -------
    tuple
        ``(variances, ranks, parvar)``. Per parameter group ``i``:
        ``variances[i]`` is the sample variance (``ddof=1``) of every
        parameter over the runs, ``ranks[i]`` is the Kendall tau averaged
        over all pairs of runs, and ``parvar[i]`` is the mean of the values
        returned by ``parameterVariance`` for that group.
    """
    n_groups=len(self.parameters)
    # One (runs x group size) matrix of fitted values per parameter group.
    params=[np.zeros((runs,len(group))) for group in self.parameters]
    parvar=[[] for _ in range(n_groups)]
    ranks2=[]
    for i in range(runs):
        othermodel=copy.deepcopy(self)
        othermodel.clearGenerate()
        for d in self.data.giveData():
            othermodel.generate(d[0],d[1])
        #Testpart is not yet correct?, as it the basekcc needs to be set at this point and the kcc reset once the testdata has been made.
        othermodel.fit()
        # Diagnostic only: this list is replaced below and never returned.
        ranks2.append(self.rankOrder(othermodel))
        for j,p in enumerate(othermodel.giveParams()):
            params[j][i,:]=p
            parvar[j].append(othermodel.parameterVariance(othermodel.paranames[j]))
    ranks=np.zeros((n_groups,runs,runs))
    variances=[]
    for i in range(n_groups):
        parvar[i]=np.mean(parvar[i])
        # Only the upper triangle (j < k) is filled: one tau per pair of runs.
        for j in range(runs):
            for k in range(j+1,runs):
                ranks[i,j,k]=stat.kendalltau(params[i][j,:],params[i][k,:])[0]
        variances.append(np.var(params[i],0,ddof=1))
    # Divide the summed taus by the number of run pairs, runs*(runs-1)/2.
    ranks=np.sum(ranks,(1,2))/((runs**2-runs)/2)
    # Rank agreement of every run with the source model (diagnostic only, not
    # returned). Sized from the parameter groups instead of a fixed four
    # lists, which raised IndexError for models with more groups.
    ranks2=[[] for _ in params]
    for j,p in enumerate(self.giveParams()):
        for i in range(runs):
            ranks2[j].append(stat.kendalltau(params[j][i,:],p)[0])
    return (variances,ranks,parvar)
def plot(name, qualities_mes, costs_mes, qualities_th, costs_th):
    """Plot measured and modelled cost against quality and print rank correlations.

    Two stacked scatter panels are drawn: measured values on top, model values
    below. Every point is annotated with its index and the annotations are
    de-overlapped with ``adjust_text``. Kendall's tau and Spearman's r between
    measured and model values are printed for costs and for qualities.
    When ``args.tikz`` is set the figure is also saved as ``<name>.tex``.
    """
    fig, axes = plt.subplots(2, 1)

    def draw_panel(ax, qualities, costs, ylabel, legend_label, color):
        # Annotate every point with its index, then draw the scatter itself.
        annotations = [
            ax.text(quality, cost, str(index), ha='center', va='center')
            for index, (quality, cost) in enumerate(zip(qualities, costs))
        ]
        ax.set_ylabel(ylabel)
        ax.set_xlabel("quality")
        ax.scatter(qualities, costs, label=legend_label, color=color)
        ax.tick_params(axis='y', labelcolor=color)
        ax.grid(True)
        return annotations

    ax1, ax2 = axes[0], axes[1]
    texts_mes = draw_panel(
        ax1, qualities_mes, costs_mes, "cost per cycle (µs)", "Measured", 'tab:red'
    )
    texts_th = draw_panel(
        ax2, qualities_th, costs_th, "cost", "Model", 'tab:blue'
    )
    adjust_text(texts_mes, ax=ax1)
    adjust_text(texts_th, ax=ax2)

    # nan_policy='raise' makes NaN inputs an error instead of a NaN result.
    tau_results = GraphResults("Kendall's tau")
    tau_results.costs = stats.kendalltau(costs_mes, costs_th, nan_policy='raise')
    tau_results.quality = stats.kendalltau(qualities_mes, qualities_th, nan_policy='raise')
    rho_results = GraphResults("Spearman's R")
    rho_results.costs = stats.spearmanr(costs_mes, costs_th, nan_policy='raise')
    rho_results.quality = stats.spearmanr(qualities_mes, qualities_th, nan_policy='raise')
    print(tau_results.name, " Kendal's tau: cost=", tau_results.costs, " and quality=", tau_results.quality)
    print(rho_results.name, " Spearman's r: cost=", rho_results.costs, " and quality=", rho_results.quality)

    fig.tight_layout()
    fig.legend()
    if args.tikz:
        tikz_save(name+".tex")
    plt.show()
def evaluate_all(self):
    """Compute Kendall's tau over the pooled scores of every key.

    The true and predicted scores of all keys are concatenated and a single
    tau is computed for the fold. With ``self.debug`` set, a per-key tau is
    logged as well. The fold statistic is logged and returned in a
    one-element list.
    """
    logging.info((self.name, self.fold))
    pooled_true, pooled_pred = [], []
    for user in self.keys():
        y_true, y_pred = self.ordered_scores(user)
        if self.debug:
            user_tau, user_p = kendalltau(y_true, y_pred)
            logging.info(KendallTauUser(user=user, tau=user_tau, p=user_p))
        pooled_true += y_true
        pooled_pred += y_pred

    fold_tau, fold_p = kendalltau(pooled_true, pooled_pred)
    fold_stat = KendallTauFold(fold=self.fold, tau=fold_tau, p=fold_p)
    logging.info(fold_stat)
    return [fold_stat]
def calc_kendall_tau(gam_unit, average=False):
    """
    Calculate Kendall tau value for predicted values.

    This tau scales between -1 (perfect negative correlation)
    and 1 (perfect correlation).

    Entries whose actual value is NaN (after optional averaging) are removed
    from the actual values and from every prediction before tau is computed.

    gam_unit : GamUnit
        has `actual` and `pred` attributes; `pred` is iterated along its
        first axis and each item must have the same layout as `actual`
    average : bool
        average across repeats (axis 1, ignoring NaN) before calculating tau

    Returns ``(tau, P)``: two arrays with one tau and one p-value per item
    of ``gam_unit.pred``.
    """
    assert isinstance(average, bool)

    def _flatten(values):
        # np.nanmean replaces scipy.stats.nanmean, which SciPy no longer has.
        if average:
            values = np.nanmean(values, axis=1)
        return values.flatten()

    act_flat = _flatten(gam_unit.actual)
    valid = ~np.isnan(act_flat)
    act_flat = act_flat[valid]

    n_pred = gam_unit.pred.shape[0]
    tau = np.full(n_pred, np.nan)
    P = np.full(n_pred, np.nan)
    for i, pred in enumerate(gam_unit.pred):
        # The mask is applied in both modes; the averaged branch used to skip
        # it, leaving predictions longer than the actual values.
        tau[i], P[i] = stats.kendalltau(act_flat, _flatten(pred)[valid])
    return tau, P
def how_far_intime(paths, moment_of_infection, mode = 'abs'):
    """Compare reconstructed infection times with the ground truth.

    Every step of every path is read as ``(time, node, node)``; the earliest
    time at which a node appears in any step is taken as its reconstructed
    infection moment.

    Parameters
    ----------
    paths : iterable of iterables of steps
        Reconstructed paths.
    moment_of_infection : dict
        Ground-truth infection moment per node.
    mode : str
        With ``'abs'`` (default) the absolute time differences are returned,
        otherwise the signed ones (reconstructed minus ground truth).

    Returns
    -------
    tuple
        ``(res, tau)``: ``res`` lists the time differences in seconds for
        the nodes present in both mappings. ``tau`` is the result of
        ``stats.kendalltau`` on the two orderings of those nodes (by
        ground-truth time and by reconstructed time), or ``0.0`` if that
        computation fails.
    """
    out_moment_of_infection = {}
    for p in paths:
        for step in p:
            when = step[0]
            for node in (step[1], step[2]):
                if node in out_moment_of_infection:
                    out_moment_of_infection[node] = min(out_moment_of_infection[node], when)
                else:
                    out_moment_of_infection[node] = when

    by_time = operator.itemgetter(1)
    sorted_out = [node for node, _ in sorted(out_moment_of_infection.items(), key=by_time)
                  if node in moment_of_infection]
    sorted_gt = [node for node, _ in sorted(moment_of_infection.items(), key=by_time)
                 if node in out_moment_of_infection]

    res = []
    for k, v in out_moment_of_infection.items():
        if k in moment_of_infection:
            # A negative difference already yields negative seconds, so no
            # branch on the sign is needed.
            t = (v - moment_of_infection[k]).total_seconds()
            if mode == 'abs':
                t = np.abs(t)
            res.append(t)

    try:
        tau = stats.kendalltau(sorted_gt, sorted_out)
    except Exception:
        # Best effort: keep the fallback value, but let KeyboardInterrupt and
        # SystemExit propagate (a bare except swallowed those too).
        tau = 0.0
    return res, tau
def kendall_f_three():
    """Correlate three-feature F-measures with every target column.

    For each target column (positions 0-22 of the module-level ``df``) and
    each combination of three distinct feature columns (positions 23-70), the
    value ``3 * x * x2 * x3 / (x + x2 + x3)`` is compared with the target via
    Kendall's tau. Every combination is printed and recorded, and the result
    table is written to ``kendal_f_measure_three.xlsx`` in ``result_dir``.

    Reads the module-level names ``df``, ``result_dir`` and ``kendalltau``.
    Returns ``None``.
    """
    import itertools

    columns = ['first', 'second', 'third', 'y_col', 'kendaltau', 'abs_kendaltau']
    rows = []  # collected here and turned into a frame once, instead of growing it row by row
    # time.time() is used because time.clock() is gone from current Pythons.
    start = time.time()
    feature_cols = range(23, 23 + 48)
    for y_col in range(0, 23):
        y = df.iloc[:, y_col]
        y_name = df.columns[y_col]
        max_val = 0
        best = None  # feature names of the best combination for this target
        # combinations() yields exactly the triples with x_col < x_col_2 < x_col_3.
        for x_col, x_col_2, x_col_3 in itertools.combinations(feature_cols, 3):
            x = df.iloc[:, x_col]
            x2 = df.iloc[:, x_col_2]
            x3 = df.iloc[:, x_col_3]
            f = 3 * (x * x2 * x3) / (x + x2 + x3)
            val = kendalltau(f, y)[0]
            names = (df.columns[x_col], df.columns[x_col_2], df.columns[x_col_3])
            rows.append(names + (y_name, val, abs(val)))
            if val > max_val:
                max_val = val
                best = names
            print("%s %s %s  |  %s  :  %s" % (names + (y_name, val)))
        # NOTE(review): the maximum is reset per target, so it is reported per
        # target here - confirm this matches the intended layout.
        print("max")
        if best is None:
            # Without this guard the summary line would fail on undefined names.
            print("no combination with a positive tau  |  %s" % y_name)
        else:
            print("%s %s %s  |  %s  :  %s" % (best + (y_name, max_val)))
    print('ran for {0} minutes '.format((time.time() - start)/60.0))
    result = pd.DataFrame(rows, columns=columns)
    result.to_excel(os.path.join(result_dir, 'kendal_f_measure_three.xlsx'), encoding = 'utf-8', index = False)
def f_measure_3():
    """Correlate three-feature F-measures with the target columns.

    For each target column (positions 1-4 of the module-level ``df``) and each
    combination of three distinct feature columns (positions 5-12), the value
    ``3 * x * x2 * x3 / (x + x2 + x3)`` is compared with the target via
    Kendall's tau. All results are written to ``kendal_f_measure_3.csv`` in
    ``result_dir`` and the best combination per target is printed.

    Reads the module-level names ``df``, ``result_dir`` and ``kendalltau``.
    Returns ``None``.
    """
    import itertools

    columns = ['first', 'second', 'third', 'y_col', 'kendaltau']
    rows = []  # collected here and turned into a frame once, instead of growing it row by row
    for y_col in [1, 2, 3, 4]:
        y = df.iloc[:, y_col]
        y_name = df.columns[y_col]
        max_val = 0
        best = None  # feature names of the best combination for this target
        # combinations() yields exactly the triples with x_col < x_col_2 < x_col_3.
        for x_col, x_col_2, x_col_3 in itertools.combinations(range(5, 13), 3):
            x = df.iloc[:, x_col]
            x2 = df.iloc[:, x_col_2]
            x3 = df.iloc[:, x_col_3]
            f = 3 * (x * x2 * x3) / (x + x2 + x3)
            val = kendalltau(f, y)[0]
            names = (df.columns[x_col], df.columns[x_col_2], df.columns[x_col_3])
            rows.append(names + (y_name, val))
            if val > max_val:
                max_val = val
                best = names
        # NOTE(review): the maximum is reset per target, so it is reported per
        # target here - confirm this matches the intended layout.
        print("max")
        if best is None:
            # Without this guard the summary line would fail on undefined names.
            print("no combination with a positive tau  |  %s" % y_name)
        else:
            print("%s %s %s  |  %s  :  %s" % (best + (y_name, max_val)))
    result = pd.DataFrame(rows, columns=columns)
    result.to_csv(os.path.join(result_dir, 'kendal_f_measure_3.csv'), encoding = 'utf-8', index = False)
def get_tau(ccode1, ccode2):
    '''
    Return Kendall's tau between the adjacency rows of two actors.

    Uses ``adj_mat`` and ``name_to_pos`` from the enclosing scope.
    '''
    def row_of(ccode):
        # Converting the selected row to an array and taking [0] unwraps it
        # into a flat vector.
        return np.array(adj_mat[name_to_pos[ccode]])[0]

    first_row = row_of(ccode1)
    second_row = row_of(ccode2)
    statistic = kendalltau(first_row, second_row)[0]
    return statistic
def calc_tau_and_chi(sorted_data, exp_key):
    """
    Consumes a dictionary of results that is output from sort_data and
    calculates kendall's tau and the chi square statistic value for each
    key with the specified key as the data representing the expected value

    The chi square test compares histograms over ten equal bins on [0, 1].
    Bins are half-open ``[lo, hi)`` except the last, which includes 1, so a
    value lying exactly on a bin edge is counted (the strict comparisons used
    before dropped such values). Values outside [0, 1] are not counted.

    Returns a dict mapping every key of ``sorted_data`` to a dict with the
    entries ``'tau'``, ``'chi_sq_val'`` and ``'chi_sq_p'``.
    """
    import numpy as np

    bins = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

    def _bin_counts(values):
        # One binning routine shared by the expected and observed data.
        return np.histogram(values, bins=bins)[0]

    t_exp_array = np.asarray(sorted_data[exp_key])
    c_exp_array = _bin_counts(sorted_data[exp_key])
    chitau_dict = {}
    for k in sorted_data:
        chi_test = chisquare(_bin_counts(sorted_data[k]), f_exp=c_exp_array)
        chitau_dict[k] = {
            'tau': kendalltau(t_exp_array, np.asarray(sorted_data[k]))[0],
            'chi_sq_val': chi_test[0],
            'chi_sq_p': chi_test[1],
        }
    return chitau_dict
def _plot_data(figure, datalist, labels, ylabels, xlabels, ncols=3):
    """Draw one correlation panel per data set and print its statistics.

    Each data set is plotted as column 1 (x) against column 0 (y) together
    with the diagonal, on axes that share one integer-rounded range covering
    all data sets. For every data set a tab-separated line with the label,
    Pearson r, its p-value, Kendall tau and its p-value is printed.

    Parameters
    ----------
    figure : figure object providing ``add_subplot`` and ``tight_layout``
    datalist : sequence of 2-D arrays with at least two columns
    labels : sequence of str, one per data set
    ylabels, xlabels : str or sequence of str
        If ``ylabels`` is a single string, both labels are shared by all
        panels and only drawn on the first column / last row; otherwise one
        label per panel is expected and every panel is labelled.
    ncols : int
        Number of panel columns in the grid.
    """
    if isinstance(ylabels,str):
        ylabels = [ylabels]*len(datalist)
        xlabels = [xlabels]*len(datalist)
        ilabels = False
    else:
        ilabels = True
    nrows = int(np.ceil(len(datalist)/float(ncols)))
    minv = np.floor(min([d.min() for d in datalist]))
    maxv = np.ceil(max([d.max() for d in datalist]))
    vrange = maxv - minv
    # vrange is a whole number, so the step is at least 1 unless the range is
    # empty; the floor of 1 keeps np.arange from getting a zero step then.
    delta = max(np.ceil(vrange/8.0), 1.0)
    ticks = np.arange(minv,maxv+1.0,delta)
    for i,(data,label,ylabel,xlabel) in enumerate(zip(datalist,labels,ylabels,xlabels),1):
        a = figure.add_subplot(nrows,ncols,i)
        a.plot(data[:,1],data[:,0],".",color=colors.color(i-1))
        tau,tpval = stats.kendalltau(data[:,1],data[:,0])
        r,rpval = stats.pearsonr(data[:,1],data[:,0])
        # The last field used "%5f" (a width, not a precision), unlike the others.
        print("%s\t%.5f\t%.5f\t%.5f\t%.5f"%(label,r,rpval,tau,tpval))
        a.plot([minv,maxv],[minv,maxv],"--k")
        a.set_xlim(minv,maxv)
        a.set_ylim(minv,maxv)
        a.set_xticks(ticks)
        a.set_yticks(ticks)
        if ilabels or a.is_first_col():
            a.set_ylabel(ylabel)
        if ilabels or a.is_last_row():
            a.set_xlabel(xlabel)
        a.text(0.05,0.92,label,transform=a.transAxes)
        a.set_aspect('equal')
    figure.tight_layout()
    print("")
def test_kendalltau(self):
    """Compare ``stats.kendalltau`` with ``stats.mstats.kendalltau``.

    For every sample size the statistic must agree to 10 decimals and the
    p-value to 7 decimals.
    """
    # (position in the result, required number of decimals)
    tolerances = ((0, 10), (1, 7))
    for size in self.get_n():
        x, y, xm, ym = self.generate_xy_sample(size)
        plain = stats.kendalltau(x, y)
        masked = stats.mstats.kendalltau(xm, ym)
        for position, decimals in tolerances:
            assert_almost_equal(plain[position], masked[position], decimal=decimals)
def calc_kendall_tau(gam_unit, average=False):
    '''
    Calculate Kendall tau value for predicted values.

    This tau scales between -1 (perfect negative correlation)
    and 1 (perfect correlation).

    Entries whose actual value is NaN (after optional averaging) are removed
    from the actual values and from every prediction before tau is computed.

    gam_unit : GamUnit
        has an `actual` attribute and a `fits` mapping whose values each
        have a `pred` attribute with the same layout as `actual`
    average : bool
        average across repeats (axis 1, ignoring NaN) before calculating tau

    Returns ``(tau, P)``: two dicts keyed like ``gam_unit.fits`` holding the
    tau and the p-value of every fit.
    '''
    assert isinstance(average, bool)

    def _flatten(values):
        # np.nanmean replaces scipy.stats.nanmean, which SciPy no longer has.
        if average:
            values = np.nanmean(values, axis=1)
        return values.flatten()

    act_flat = _flatten(gam_unit.actual)
    valid = ~np.isnan(act_flat)
    act_flat = act_flat[valid]

    tau = {}
    P = {}
    for k, v in gam_unit.fits.items():
        # The mask is applied in both modes; the averaged branch used to skip
        # it, leaving predictions longer than the actual values.
        tau[k], P[k] = stats.kendalltau(act_flat, _flatten(v.pred)[valid])
    return tau, P
def rankOrder(self, other, rank="kendall"):
    # Rank-correlate the parameter groups of this model with those of `other`.
    #
    # Only the leading parameter groups are compared: the last group is left
    # out when the second parameter name is "gamma", otherwise the last two.
    # Positions listed in either model's `data.kcmis` are skipped, and the
    # remaining positions are re-aligned by the number of skipped entries
    # seen so far in each model.
    #
    # rank: "spearman" selects Spearman's rho, anything else Kendall's tau.
    # Returns an array with one coefficient per compared group.
    trailing = 1 if self.paranames[1] == "gamma" else 2
    correlate = stat.spearmanr if rank == "spearman" else stat.kendalltau
    position_count = len(self.parameters[0]) + len(self.data.kcmis)

    answerlist = np.zeros(len(self.parameters) - trailing)
    for group in range(len(answerlist)):
        mine, theirs = [], []
        missing_here = missing_there = 0
        for position in range(position_count):
            absent_here = position in self.data.kcmis
            absent_there = position in other.data.kcmis
            if absent_here:
                missing_here += 1
            if absent_there:
                missing_there += 1
            if absent_here or absent_there:
                continue
            mine.append(self.parameters[group][position - missing_here])
            theirs.append(other.parameters[group][position - missing_there])
        answerlist[group] = correlate(mine, theirs)[0]
    return answerlist
def _fill_stats(ws,coloffset,nentries,calcol,expcol,data,tau=None):
    """
    Fill in statistical analysis: MAD, r2, tau, r, slope and intercept

    The values are written into rows 3-8 of the worksheet, with the name in
    column ``6+coloffset``, the value in column ``7+coloffset`` and, for tau
    and r, the p-value in column ``8+coloffset``. MAD, r2, r, slope and
    intercept are written as spreadsheet formulas over rows 3 to
    ``nentries+2``. Tau and the p-value of r are computed here from ``data``.

    Parameters
    ----------
    ws : worksheet object providing ``cell(row=..., column=...)``
    coloffset : int
        Offset added to every column index.
    nentries : int
        Number of data rows; the data is expected in rows 3 to nentries+2.
    calcol, expcol : str
        Column references used in the formulas for the two data series.
    data : 2-D array
        Columns 0 and 1 hold the two series used for tau and the p-value of r.
    tau : optional
        Precomputed ``(tau, p-value)``; computed from ``data`` when omitted.
    """
    lastrow = nentries+2
    calrange = "%s3:%s%d"%(calcol,calcol,lastrow)
    exprange = "%s3:%s%d"%(expcol,expcol,lastrow)

    def put(row, column, value):
        ws.cell(row=row,column=column+coloffset).value = value

    # Assumes `.column` yields a reference usable inside a formula (e.g. a
    # column letter) - TODO confirm for the spreadsheet library in use.
    diffcol = ws.cell(row=3,column=5+coloffset).column
    put(3, 6, "MAD")
    put(3, 7, "=AVERAGE(%s3:%s%d)"%(diffcol,diffcol,lastrow))
    put(4, 6, "r2")
    put(4, 7, "=CORREL(%s,%s)^2"%(calrange,exprange))
    if tau is None:
        tau = stats.kendalltau(data[:,0],data[:,1])
    put(5, 6, "tau")
    put(5, 7, tau[0])
    put(5, 8, tau[1])
    put(6, 6, "r")
    put(6, 7, "=CORREL(%s,%s)"%(calrange,exprange))
    put(6, 8, stats.pearsonr(data[:,0],data[:,1])[1])
    # Slope and intercept are reported as they are; the "^2" carried over
    # from the r2 formula squared both values.
    put(7, 6, "Slope")
    put(7, 7, "=SLOPE(%s,%s)"%(exprange,calrange))
    put(8, 6, "Intercept")
    put(8, 7, "=INTERCEPT(%s,%s)"%(exprange,calrange))
def runTest():
    """Simulate clicks on ten answers and plot how well their ranking is recovered.

    Ten random goodness values (Theta) are drawn with a fixed seed, turned
    into a probability distribution and plotted together with its cumulative
    distribution. Three click simulations are then run for 1..N clicks and
    their Kendall tau curves are plotted. Both figures are saved as PNG files
    and shown.
    """
    sd = 110
    np.random.seed(sd)
    # Relative goodness of the ten answers.
    Theta = np.random.randn(10)
    weights = np.exp(Theta)
    pDens = weights / np.sum(weights)
    cDist = np.cumsum(pDens)
    # Bar positions 0..9 as floats.
    aNum = np.arange(10, dtype=float)

    plt.bar(aNum, pDens)
    plt.plot(aNum, cDist)
    plt.title('PDensityF/CDistributionF')
    print("Close the figure to run simulation")
    plt.savefig('PDF-sd%d.png'%sd)
    plt.show()

    N = 200
    kTau = np.array([
        kendalltau(Theta, simulateClicks(clicks, cDist))[0]
        for clicks in range(1, N + 1)
    ])
    kTau2 = simulateClicks2(N, cDist, Theta)
    kTau3 = simClicksTime(N, cDist, Theta)

    for curve, style in ((kTau, 'b'), (kTau2, 'k'), (kTau3, 'r')):
        plt.plot(range(1, N+1), curve, style)
    plt.title('KendallTau')
    plt.xlabel('Number of clicks(N)')
    plt.savefig('KTau-sd%d.png'%sd)
    plt.show()
def calculate_jaccard_kendall(page1, page2, from_file = True):
    """
    Calculate a link-overlap score and Kendall's tau for two pages.

    Return a tuple (jaccard, kendall) of float numbers.

    The first value is the number of links the two pages share divided by
    the size of the larger link set (0.0 when neither page has links). Note
    that this differs from the textbook Jaccard index, which divides by the
    size of the union. The second value is Kendall's tau between the
    character sequences of the two page strings, the shorter one being
    padded with 'null' entries.

    page1, page2: full path to two files
    from_file: stored in ``Measurements.is_from_file`` before the pages are read
    """
    Measurements.is_from_file = from_file
    alph = Measurements.pages_to_alphabet([page1, page2])
    str1 = Measurements.page_to_string(page1, alph)
    str2 = Measurements.page_to_string(page2, alph)
    str1 = unicode(str1, 'utf-8',errors='ignore')
    str2 = unicode(str2, 'utf-8',errors='ignore')
    l1 = list(str1)
    l2 = list(str2)
    # Pad the shorter sequence; a negative repeat count yields an empty list,
    # so at most one of the two lines adds anything.
    l1.extend(['null'] * (len(l2) - len(l1)))
    l2.extend(['null'] * (len(l1) - len(l2)))
    #-----
    s1 = Measurements.link_to_set(page1)
    s2 = Measurements.link_to_set(page2)
    larger = max(len(s1), len(s2))
    # Two pages without links used to raise ZeroDivisionError.
    j = float(len(s1.intersection(s2)))/larger if larger else 0.0
    #-----
    k = kendalltau(l1, l2)[0]
    return j, k
def _map(self, key, video_group): #Creates a data matrix with the number of views per video (rows) for #each referrer (columns). i = 0 n_rows = len(video_group) n_cols = len(self.ref_group_to_int) + 1 data = np.zeros(shape=(n_rows, n_cols)) for video_data in video_group.values(): referral_views = video_data[0] total_view = video_data[1] for ref_group, ref_views in referral_views: ref_group_id = self.ref_group_to_int[ref_group] data[i][ref_group_id] = ref_views #Last column has total views data[i][-1] = total_view; i += 1 #Generating correlations total_view_array = data[:,-1] return_val = {} for ref_group, ref_group_id in self.ref_group_to_int.iteritems(): ref_group_array = data[:,ref_group_id] k_tau = stats.kendalltau(total_view_array, ref_group_array) s_rho = stats.spearmanr(total_view_array, ref_group_array) return_val[ref_group] = (k_tau, s_rho) return return_val
def correlation():
    '''Compute Pearson, Spearman and Kendall correlations for the Altman data.

    The data (age and %fat, measured by dual-photon absorptiometry, for 18
    normal adults) is read from 'altman_11_1.txt'. All three coefficients are
    printed, and the Pearson coefficient is returned.
    '''

    # Get the data
    inFile = 'altman_11_1.txt'
    data = np.genfromtxt(inFile, delimiter=',')
    x, y = data[:, 0], data[:, 1]

    # --- >>> START stats <<< ---
    # The coefficients are kept in a dictionary keyed by the name of the
    # measure, so it is obvious which value belongs to which coefficient.
    measures = (
        ('pearson', stats.pearsonr),
        ('spearman', stats.spearmanr),
        ('kendall', stats.kendalltau),
    )
    corr = {}
    for label, measure in measures:
        corr[label], _ = measure(x, y)
    # --- >>> STOP stats <<< ---
    print(corr)

    # Spearman's rho must equal the Pearson correlation of the ranked data.
    rank_pearson = stats.pearsonr(stats.rankdata(x), stats.rankdata(y))[0]
    np.testing.assert_almost_equal(corr['spearman'], rank_pearson)

    return corr['pearson']  # should be 0.79208623217849117
def compute_kendalltau_corellations(meth_rankings,printRes=True):
    """Compute Kendall tau correlations between every pair of rankings.

    Parameters
    ----------
    meth_rankings: dictionary
        dictionary {'method_name': ranking}
    printRes: bool, optional
        when True, the correlations are passed to ``show_corr_results``

    Returns
    -------
    corrs: list of tuples
        [('method_name_1', 'method_name_2', correlation, p_value), ]
    """
    def correlate(first, second):
        # Each argument is a (method_name, ranking) item of the dictionary.
        tau, p_value = stats.kendalltau(first[1], second[1])
        return (first[0], second[0], tau, p_value)

    method_pairs = itertools.combinations(meth_rankings.items(), 2)
    corrs = [correlate(first, second) for first, second in method_pairs]
    if printRes == True:
        show_corr_results(corrs)
    return corrs
def kendall_tau(position_scores, position_predictions, topN=TOP_N):
    """
    Kendall's tau between true and predicted scores of the shared top entries.

    Each arg has form [(score, (name, team, id))]. The ids found in the first
    topN entries of both lists form the shared set. For each list, the scores
    of all entries with a shared id are ordered by id and the two score
    sequences are passed to scipy.

    Returns (kendalltau result, fraction of topN that is shared); the result
    is (0, 0) when fewer than two scores are available.
    """
    def top_ids(score_list):
        return {pid for _score, (_name, _team, pid) in score_list[:topN]}

    shared = top_ids(position_scores) & top_ids(position_predictions)
    frac_shared = float(len(shared)) / topN

    def shared_scores(score_list):
        # Ordering by id puts both score lists into the same order.
        by_id = [
            (pid, score)
            for score, (_name, _team, pid) in score_list
            if pid in shared
        ]
        by_id.sort()
        return [score for _pid, score in by_id]

    true_scores = shared_scores(position_scores)
    pred_scores = shared_scores(position_predictions)
    if len(true_scores) >= 2:
        return kendalltau(true_scores, pred_scores), frac_shared
    return (0, 0), frac_shared
def ternary_metrics(polarities, lexicon, eval_words, tau_lexicon=None):
    """Score induced polarities against a ternary (-1 / 0 / 1) lexicon.

    The polarities of ``eval_words`` are rescaled to [-1, 1] and turned into
    labels using thresholds chosen so that the proportions of negative and
    positive labels follow their proportions in ``lexicon``.

    Parameters
    ----------
    polarities : dict
        Polarity score per word; must cover ``eval_words``.
    lexicon : dict
        Gold label per word; -1 and 1 mark negative and positive entries.
    eval_words : iterable
        Words to evaluate.
    tau_lexicon : dict, optional
        Continuous gold scores; when given, Kendall's tau is computed over
        the evaluated words that it contains.

    Returns
    -------
    tuple
        ``(tau, cmn_f1, maj_f1, conf_mat)``: Kendall's tau (``None`` without
        ``tau_lexicon``), macro F1 of the thresholded labels, macro F1 of
        always predicting the most common gold label, and the confusion
        matrix of the thresholded labels.
    """
    use_tau = tau_lexicon is not None
    if use_tau:
        kendall_words = list(set(eval_words).intersection(tau_lexicon))
    polarities = {word:polarities[word] for word in eval_words}
    y_prob = np.array([polarities[w] for w in polarities])
    y_true = np.array([lexicon[w] for w in polarities])
    lowest = np.min(y_prob)
    y_prob = 2*(y_prob - lowest) / (np.max(y_prob) - lowest) - 1
    # list() is required: wrapping a dict view directly gives a 0-d object
    # array on Python 3 and the comparisons below then count nothing.
    lexicon_labels = np.array(list(lexicon.values()))
    neg_prop = np.sum(lexicon_labels == -1) / float(len(lexicon))
    pos_prop = np.sum(lexicon_labels == 1) / float(len(lexicon))
    sorted_probs = sorted(y_prob)
    neg_thresh = sorted_probs[int(np.round(neg_prop*len(sorted_probs)))]
    n_pos = int(np.round(pos_prop*len(sorted_probs)))
    # Index -0 is index 0, which made the smallest score the positive
    # threshold; with no expected positives nothing may reach the threshold.
    pos_thresh = sorted_probs[-n_pos] if n_pos else np.inf
    # Inclusive thresholds for the F1 labels ...
    cmn_labels = [1 if val >= pos_thresh else -1 if val <= neg_thresh else 0 for val in y_prob]
    if use_tau:
        tau = kendalltau(*zip(*[(polarities[word], tau_lexicon[word]) for word in kendall_words]))[0]
    else:
        tau = None
    # atleast_1d copes with both an array and a scalar as the mode result.
    majority = np.atleast_1d(sp.stats.mode(y_true)[0])[0]
    maj_f1 = f1_score(y_true, np.repeat(majority, len(y_true)), average="macro")
    cmn_f1 = f1_score(y_true, cmn_labels, average="macro")
    # ... but strict thresholds for the confusion matrix, as before.
    label_func = lambda entry : 1 if entry > pos_thresh else -1 if entry < neg_thresh else 0
    conf_mat = confusion_matrix(y_true, [label_func(entry) for entry in y_prob])
    return tau, cmn_f1, maj_f1, conf_mat
def evaluation_prediction():
    """Print rank correlations between ground truth and predictions per event type.

    For every event name in ``dict_name2`` the pickled ground truth and the
    pickled predictions are loaded from fixed locations below ``root``. For
    every event id, Spearman's rho, Kendall's tau and Kendall's W are
    computed between the third field of the ground-truth entries and the
    third field of the predicted entries. The per-event-type means are
    printed, followed by the means over all event types weighted by the
    number of events.

    Reads the module-level names ``root``, ``dict_name2`` and ``kendall_w``.
    Returns ``None``.
    """
    def _load_pickle(path):
        # NOTE: unpickling can execute arbitrary code - only load trusted files.
        # The context manager closes the file even if loading fails.
        with open(path, 'r') as f:
            return cPickle.load(f)

    correlation_all_w = []
    correlation_all_tau = []
    correlation_all_rho = []
    len_ = 0
    for event_name in dict_name2:
        ground_truth = _load_pickle(root + 'baseline_all_0509/' + event_name+ '/vgg_test_result_v2.cPickle')
        prediction = _load_pickle(root + 'CNN_all_event_1009/features/' + event_name+ '_test_combined_10_combine_dict.cPickle')
        correlation_rho = []
        correlation_tau = []
        correlation_w = []
        for event_id in ground_truth:
            g = [i[2] for i in ground_truth[event_id]]
            p = [i[2] for i in prediction[event_id]]
            correlation_rho.append(spearmanr(g, p)[0])
            correlation_w.append(kendall_w({1:g,2:p}))
            correlation_tau.append(kendalltau(g, p)[0])
        n_events = len(ground_truth)
        len_ += n_events
        mean_rho = np.mean(correlation_rho)
        mean_tau = np.mean(correlation_tau)
        mean_w = np.mean(correlation_w)
        print("%s , rho: %s , kendall's tau: %s , kendall's W: %s" % (event_name, mean_rho, mean_tau, mean_w))
        # Weight every event type by its number of events.
        correlation_all_rho.append(mean_rho * n_events)
        correlation_all_tau.append(mean_tau * n_events)
        correlation_all_w.append(mean_w * n_events)
    print("rho: %s , kendall's tau: %s , kendall's W: %s" % (
        np.sum(correlation_all_rho) / len_,
        np.sum(correlation_all_tau) / len_,
        np.sum(correlation_all_w) / len_))
def calculate_richness_out_change(g, last_value=None):
    """Return the out-strength richness scores of ``g``.

    When a truthy ``last_value`` is supplied, a tuple of the scores and their
    Kendall tau with ``last_value`` is returned; otherwise the scores alone.
    """
    scores = richclub.richness_scores(g, richness='out_strength')
    if not last_value:
        return scores

    from scipy.stats import kendalltau
    change = kendalltau(scores, last_value)[0]
    return scores, change
def get_kendall_tau(path1, path2):
    """Kendall's tau between the counts of two files over growing prefixes.

    Both files hold one ``<vid> <count>`` pair per line. Items missing from
    one file get a count of 0 for it. The items are ordered by their counts
    (descending), and tau is computed on the first ``i`` items for every
    ``i`` in ``xrange(start, end + 1, jump)``, where ``start``, ``end`` and
    ``jump`` are module-level names.

    Returns the list of tau values.
    """
    counts_by_vid = defaultdict(list)
    with open(path1, 'r') as first:
        for line in first:
            vid, tweetcount = line.rstrip().split()
            counts_by_vid[vid].append(int(tweetcount))
    with open(path2, 'r') as second:
        for line in second:
            vid, tweetcount = line.rstrip().split()
            if vid not in counts_by_vid:
                # unseen in the first file: its first count is zero
                counts_by_vid[vid].append(0)
            counts_by_vid[vid].append(int(tweetcount))
    # items that never appeared in the second file get a zero there
    for counts in counts_by_vid.values():
        if len(counts) == 1:
            counts.append(0)

    # order by the counts, first file's count first, largest on top
    ranked = sorted(counts_by_vid.items(), key=operator.itemgetter(1), reverse=True)
    file1_list = [counts[0] for _vid, counts in ranked]
    file2_list = [counts[1] for _vid, counts in ranked]

    return [
        stats.kendalltau(file1_list[:size], file2_list[:size])[0]
        for size in xrange(start, end + 1, jump)
    ]
def generate_mod_series(reference,series,RealKen):
    """
    Run Kendall's tau for one gene's series against the reference series.

    ``series`` holds the gene ID followed by its values; positions whose
    value is "NA" are dropped from both the values and the reference.
    Tau and p are NaN when fewer than three values remain, when all
    remaining values are zero, or when their sum is below 0.00001.
    Otherwise, when tau is not NaN, the p-value is replaced by
    ``RealKen.pval(tau, n)`` if that call returns something other than None.

    Returns (geneID, tau, p).
    """
    geneID, values = series[0], series[1:]
    # 1 keeps a reference entry, NaN marks it for removal.
    keep = [1 if value!="NA" else np.nan for value in values]
    masked_reference = reference*keep
    mod_reference = [entry for entry in masked_reference if not np.isnan(entry)]
    mod_values = [value for value in values if value!='NA']

    n_values = len(mod_values)
    degenerate = (
        n_values < 3
        or mod_values.count(np.nan) == n_values
        or mod_values.count(0) == n_values
        or sum(mod_values)<0.00001
    )
    if degenerate:
        return geneID,np.nan,np.nan

    tau,p = kendalltau(mod_values,mod_reference)
    if not np.isnan(tau):
        pk = RealKen.pval(tau,n_values)
        if pk!=None:
            p = pk
    return geneID,tau,p
def calculate_betweeness_change_kendall(g, last_value=None):
    """Return the weighted edge betweenness sequence of ``g``.

    When a truthy ``last_value`` is supplied, a tuple of the sequence and its
    Kendall tau with ``last_value`` is returned; otherwise the sequence alone.
    """
    sequence = g.edge_betweenness(weights='weight')
    if not last_value:
        return sequence

    from scipy.stats import kendalltau
    change = kendalltau(sequence, last_value)[0]
    return sequence, change
class JointPlot(FeatureVisualizer): """ Joint plots are useful for machine learning on multi-dimensional data, allowing for the visualization of complex interactions between different data dimensions, their varying distributions, and even their relationships to the target variable for prediction. The Yellowbrick ``JointPlot`` can be used both for pairwise feature analysis and feature-to-target plots. For pairwise feature analysis, the ``columns`` argument can be used to specify the index of the two desired columns in ``X``. If ``y`` is also specified, the plot can be colored with a heatmap or by class. For feature-to-target plots, the user can provide either ``X`` and ``y`` as 1D vectors, or a ``columns`` argument with an index to a single feature in ``X`` to be plotted against ``y``. Histograms can be included by setting the ``hist`` argument to ``True`` for a frequency distribution, or to ``"density"`` for a probability density function. Note that histograms requires matplotlib 2.0.2 or greater. Parameters ---------- ax : matplotlib Axes, default: None The axes to plot the figure on. If None is passed in the current axes will be used (or generated if required). This is considered the base axes where the the primary joint plot is drawn. It will be shifted and two additional axes added above (xhax) and to the right (yhax) if hist=True. columns : int, str, [int, int], [str, str], default: None Determines what data is plotted in the joint plot and acts as a selection index into the data passed to ``fit(X, y)``. This data therefore must be indexable by the column type (e.g. an int for a numpy array or a string for a DataFrame). If None is specified then either both X and y must be 1D vectors and they will be plotted against each other or X must be a 2D array with only 2 columns. If a single index is specified then the data is indexed as ``X[columns]`` and plotted jointly with the target variable, y. 
If two indices are specified then they are both selected from X, additionally in this case, if y is specified, then it is used to plot the color of points. Note that these names are also used as the x and y axes labels if they aren't specified in the joint_kws argument. correlation : str, default: 'pearson' The algorithm used to compute the relationship between the variables in the joint plot, one of: 'pearson', 'covariance', 'spearman', 'kendalltau'. kind : str in {'scatter', 'hex'}, default: 'scatter' The type of plot to render in the joint axes. Note that when kind='hex' the target cannot be plotted by color. hist : {True, False, None, 'density', 'frequency'}, default: True Draw histograms showing the distribution of the variables plotted jointly. If set to 'density', the probability density function will be plotted. If set to True or 'frequency' then the frequency will be plotted. Requires Matplotlib >= 2.0.2. alpha : float, default: 0.65 Specify a transparency where 1 is completely opaque and 0 is completely transparent. This property makes densely clustered points more visible. {joint, hist}_kws : dict, default: None Additional keyword arguments for the plot components. kwargs : dict Keyword arguments that are passed to the base class and may influence the visualization as defined in other Visualizers. Attributes ---------- corr_ : float The correlation or relationship of the data in the joint plot, specified by the correlation algorithm. Examples -------- >>> viz = JointPlot(columns=["temp", "humidity"]) >>> viz.fit(X, y) >>> viz.show() """ # TODO: should we couple more closely with Rank2D? 
# Maps each name accepted by the ``correlation`` argument to a callable that
# returns the scalar statistic for two 1D arrays.
correlation_methods = {
    "pearson": lambda x, y: pearsonr(x, y)[0],
    "spearman": lambda x, y: spearmanr(x, y)[0],
    "covariance": lambda x, y: np.cov(x, y)[0, 1],
    "kendalltau": lambda x, y: kendalltau(x, y)[0],
}


def __init__(self, ax=None, columns=None, correlation="pearson",
             kind="scatter", hist=True, alpha=0.65, joint_kws=None,
             hist_kws=None, **kwargs):
    """
    Validate and store the joint plot configuration.

    Parameters
    ----------
    ax : axes, default: None
        Forwarded to the parent visualizer.
    columns : int, str or iterable of at most two of them, default: None
        Column selection used by ``fit``.
    correlation : str, default: "pearson"
        One of the keys of ``correlation_methods``.
    kind : str, default: "scatter"
        One of "scatter", "hex" or "hexbin".
    hist : {True, False, None, "density", "frequency"}, default: True
        Whether (and how) to draw the marginal histograms.
    alpha : float, default: 0.65
        Default opacity of the joint plot.
    joint_kws, hist_kws : dict, default: None
        Extra keyword arguments for the joint plot / histogram calls.

    Raises
    ------
    YellowbrickValueError
        If columns, correlation, kind or hist is invalid.
    """
    # Initialize the visualizer
    super(JointPlot, self).__init__(ax=ax, **kwargs)
    self._xhax, self._yhax = None, None

    # Set and validate the columns
    self.columns = columns
    if self.columns is not None and not isinstance(self.columns, (int, str)):
        self.columns = tuple(self.columns)
        if len(self.columns) > 2:
            raise YellowbrickValueError((
                "'{}' contains too many indices or is invalid for joint plot - "
                "specify either a single int or str index or two columns as a list"
            ).format(columns))

    # Set and validate the correlation
    self.correlation = correlation
    if self.correlation not in self.correlation_methods:
        raise YellowbrickValueError(
            "'{}' is an invalid correlation method, use one of {}".format(
                self.correlation, ", ".join(self.correlation_methods.keys())))

    # Set and validate the kind of plot
    self.kind = kind
    if self.kind not in {"scatter", "hex", "hexbin"}:
        raise YellowbrickValueError(
            ("'{}' is invalid joint plot kind, use 'scatter' or 'hex'"
             ).format(self.kind))

    # Set and validate the histogram if specified
    self.hist = hist
    if self.hist not in {True, "density", "frequency", None, False}:
        raise YellowbrickValueError(
            ("'{}' is an invalid argument for hist, use None, True, "
             "False, 'density', or 'frequency'").format(hist))

    # If histograms are requested, build the layout now; _layout also checks
    # that the required matplotlib helper is available.
    if self.hist in {True, "density", "frequency"}:
        self._layout()

    # Set the additional visual parameters
    self.alpha = alpha
    self.joint_kws = joint_kws
    self.hist_kws = hist_kws


@property
def xhax(self):
    """
    The axes of the histogram for the top of the JointPlot (X-axis)
    """
    if self._xhax is None:
        raise AttributeError(
            "this visualizer does not have a histogram for the X axis")
    return self._xhax


@property
def yhax(self):
    """
    The axes of the histogram for the right of the JointPlot (Y-axis)
    """
    if self._yhax is None:
        raise AttributeError(
            "this visualizer does not have a histogram for the Y axis")
    return self._yhax


def _layout(self):
    """
    Creates the grid layout for the joint plot, adding new axes for the
    histograms if necessary. Does not add axes if self.hist is False or None.

    Raises
    ------
    YellowbrickValueError
        If histograms are requested but make_axes_locatable is None.
    """
    # Ensure the axes are created if not hist, then return.
    if not self.hist:
        self.ax
        return

    # Ensure matplotlib version compatibility
    if make_axes_locatable is None:
        raise YellowbrickValueError((
            "joint plot histograms requires matplotlib 2.0.2 or greater "
            "please upgrade matplotlib or set hist=False on the visualizer"
        ))

    # Create the new axes for the histograms
    divider = make_axes_locatable(self.ax)
    self._xhax = divider.append_axes("top", size=1, pad=0.1, sharex=self.ax)
    self._yhax = divider.append_axes("right", size=1, pad=0.1, sharey=self.ax)

    # Modify the display of the axes
    self._xhax.xaxis.tick_top()
    self._yhax.yaxis.tick_right()
    self._xhax.grid(False, axis="y")
    self._yhax.grid(False, axis="x")


def fit(self, X, y=None):
    """
    Fits the JointPlot, creating a correlative visualization between the
    columns specified during initialization and the data and target passed
    into fit:

        - If self.columns is None then X and y must both be specified as 1D
          arrays or X must be a 2D array with only 2 columns.
        - If self.columns is a single int or str, that column is selected to
          be visualized against the target y.
        - If self.columns is two ints or strs, those columns are visualized
          against each other; y is currently ignored in that case.

    This is the main entry point into the joint plot visualization.

    Parameters
    ----------
    X : array-like
        An array-like object of either 1 or 2 dimensions depending on
        self.columns. Usually this is a 2D table with shape (n, m)

    y : array-like, default: None
        A vector or 1D array that has the same length as X.

    Returns
    -------
    self : the visualizer itself.

    Raises
    ------
    YellowbrickValueError
        If the combination of columns, X and y is not one of the cases above.
    """
    # Convert python objects to numpy arrays
    if isinstance(X, (list, tuple)):
        X = np.array(X)

    if y is not None and isinstance(y, (list, tuple)):
        y = np.array(y)

    # Case where no columns are specified
    if self.columns is None:
        if (y is None and (X.ndim != 2 or X.shape[1] != 2)) or (
                y is not None and (X.ndim != 1 or y.ndim != 1)):
            raise YellowbrickValueError((
                "when self.columns is None specify either X and y as 1D arrays "
                "or X as a matrix with 2 columns"))

        if y is None:
            # Draw the first column as x and the second column as y
            self.draw(X[:, 0], X[:, 1], xlabel="0", ylabel="1")
            return self

        # Draw x against y
        self.draw(X, y, xlabel="x", ylabel="y")
        return self

    # Case where a single string or int index is specified
    if isinstance(self.columns, (int, str)):
        if y is None:
            raise YellowbrickValueError(
                "when self.columns is a single index, y must be specified")

        # fetch the index from X -- raising index error if not possible
        x = self._index_into(self.columns, X)
        self.draw(x, y, xlabel=str(self.columns), ylabel="target")
        return self

    # Case where there is a double index for both columns
    columns = tuple(self.columns)
    if len(columns) != 2:
        raise YellowbrickValueError(
            ("'{}' contains too many indices or is invalid for joint plot"
             ).format(columns))

    # TODO: color the points based on the target if it is given
    x = self._index_into(columns[0], X)
    y = self._index_into(columns[1], X)
    self.draw(x, y, xlabel=str(columns[0]), ylabel=str(columns[1]))
    return self


def draw(self, x, y, xlabel=None, ylabel=None):
    """
    Draw the joint plot for the data in x and y.

    Parameters
    ----------
    x, y : 1D array-like
        The data to plot for the x axis and the y axis

    xlabel, ylabel : str
        The labels for the x and y axes.

    Returns
    -------
    ax : the main joint plot axes.
    """
    # This is a little weird to be here, but it is the best place to perform
    # this computation given how fit calls draw and returns.
    self.corr_ = self.correlation_methods[self.correlation](x, y)

    # Work on a copy: filling defaults into the caller's dict would leak
    # them back to the caller and keep a stale correlation label on re-fit.
    joint_kws = dict(self.joint_kws or {})
    joint_kws.setdefault("alpha", self.alpha)
    joint_kws.setdefault(
        "label", "{}={:0.3f}".format(self.correlation, self.corr_))

    # Draw scatter joint plot
    if self.kind == "scatter":
        self.ax.scatter(x, y, **joint_kws)
        # TODO: Draw best fit line (or should this be kind='reg'?)

    # Draw hexbin joint plot
    elif self.kind in ("hex", "hexbin"):
        joint_kws.setdefault("mincnt", 1)
        joint_kws.setdefault("gridsize", 50)
        joint_kws.setdefault("cmap", "Blues")
        self.ax.hexbin(x, y, **joint_kws)

    # Something bad happened
    else:
        raise ValueError("unknown joint plot kind '{}'".format(self.kind))

    # Set the X and Y axis labels on the plot
    self.ax.set_xlabel(xlabel)
    self.ax.set_ylabel(ylabel)

    # If we're not going to draw histograms, stop here
    if not self.hist:
        # Ensure the current axes is always the main joint plot axes
        plt.sca(self.ax)
        return self.ax

    # Draw the histograms (again on a copy of the user supplied options)
    hist_kws = dict(self.hist_kws or {})
    hist_kws.setdefault("bins", 50)
    if self.hist == "density":
        hist_kws.setdefault("density", True)

    self.xhax.hist(x, **hist_kws)
    self.yhax.hist(y, orientation="horizontal", **hist_kws)

    # Ensure the current axes is always the main joint plot axes
    plt.sca(self.ax)
    return self.ax


def finalize(self, **kwargs):
    """
    Finalize executes any remaining image modifications making it ready to
    show.
    """
    # Set the aspect ratio to make the visualization square
    # TODO: still unable to make plot square using make_axes_locatable
    # x0,x1 = self.ax.get_xlim()
    # y0,y1 = self.ax.get_ylim()
    # self.ax.set_aspect(abs(x1-x0)/abs(y1-y0))

    # Add the title to the plot if the user has set one.
    self.set_title("")

    # TODO: use manual legend so legend works with both scatter and hexbin
    # Set the legend with full opacity patches using manual legend.
    # Or Add the colorbar if this is a continuous plot.
    if self.kind == "scatter":
        self.ax.legend(loc="best", frameon=True)

    # Finalize the histograms
    if self.hist:
        plt.setp(self.xhax.get_xticklabels(), visible=False)
        plt.setp(self.yhax.get_yticklabels(), visible=False)
        plt.sca(self.ax)

    # Call tight layout to maximize readability
    plt.tight_layout()


def _index_into(self, idx, data):
    """
    Attempts to get the column from the data using the specified index,
    raises an IndexError if this is not possible from this point in the
    stack.
    """
    try:
        if is_dataframe(data):
            # Assume column indexing
            return data[idx]
        # Otherwise assume numpy array-like indexing
        return data[:, idx]
    except Exception as e:
        # Report the index that actually failed rather than the whole
        # self.columns selection.
        raise IndexError(
            "could not index column '{}' into type {}: {}".format(
                idx, data.__class__.__name__, e))
#extras_x = extras_x.fillna(0) y = y.fillna(0) #extras_y = extras_y.fillna(0) for train, test in k_fold.split(x): train_x, test_x = x.iloc[train], x.iloc[test] train_y, test_y = y.iloc[train], y.iloc[test] #rand_for.fit(pd.concat([train_x, extras_x]), pd.concat([train_y, extras_y])) rand_for.fit(train_x, train_y) #svr.fit(train_x, train_y) predictions = rand_for.predict(test_x) # i=0 # for row in test: # dataframe["predicted_label"].iat[row] = predictions[i] # i += 1 kappa_avg += cohen_kappa_score(test_y, predictions, weights="linear") tau_avg += stats.kendalltau(test_y, predictions)[0] acc_avg += accuracy_score(test_y, predictions) #score += rand_for.score(test_x, test_y) # cm = confusion_matrix(test_y, predictions) # for i in range(5): # for j in range(5): # try: # confusion[i][j] += cm[i][j] # except: # pass for item in range(len(paras)): i = paras[item] d_x = test_x[dataframe["marked_par"] == i] if len(d_x) > 0: d_y = test_y[d_x.index.values] p = rand_for.predict(d_x)
def compute_correlations(test_path, predictions, human_metric, mode):
    """
    Computes the correlations between BERT output and the other human metrics.

    Predictions and human scores are first averaged per system (peer id); every
    empty summary of a system contributes an additional 0 to both averages.
    Spearman, Kendall and Pearson coefficients are then computed between the
    two per-system tables, logged through LOGGER and returned.

    :param test_path: Path to the test data.
    :param predictions: The predictions of the model.
    :param human_metric: The metric for which the model is trained. It is needed only on 'Single Task' mode.
    :param mode: Depending on your choice : ['Single Task', 'Multi Task-1', 'Multi Task-5'].
    :return: dict mapping each metric name ('Q1'... in multi task modes, human_metric otherwise)
             to a dict with the keys 'Spearman', 'Kendall' and 'Pearson'.
    """
    # NOTE(security): allow_pickle=True executes pickled code on load; only
    # point this at trusted files.
    test_data = dict(np.load(test_path, allow_pickle=True).item())

    ordered_ids = test_data['peer_ids']
    empty_ids = test_data['empty_ids']

    test_human_metrics = {
        'Q1': test_data['test_Q1'],
        'Q2': test_data['test_Q2'],
        'Q3': test_data['test_Q3'],
        'Q4': test_data['test_Q4'],
        'Q5': test_data['test_Q5']
    }

    multi_task = mode in ('Multi Task-1', 'Multi Task-5')
    statistic_names = ['Spearman', 'Kendall', 'Pearson']

    # Group the row positions of every system once instead of rescanning
    # ordered_ids for each system and each metric.
    rows_by_system = {}
    for row, o_id in enumerate(ordered_ids):
        rows_by_system.setdefault(o_id, []).append(row)

    # empty_ids lists a peer id once per empty summary it sent; count them so
    # the matching number of zeros can be appended below.
    empties_by_system = {}
    for e_id in empty_ids:
        empties_by_system[e_id] = empties_by_system.get(e_id, 0) + 1

    system_ids = list(rows_by_system)
    correlations = {}  # Here will be stored the correlations

    for k in range(predictions.shape[1]):
        # Choose only Q_k to compute the correlation.
        # At single task, we have only one dimension on predictions
        if multi_task:
            metric_name = 'Q' + str(k + 1)
            predictions_of_metric = predictions[:, k]
            metric_real = test_human_metrics[metric_name]
        else:
            metric_name = human_metric
            predictions_of_metric = predictions
            metric_real = test_human_metrics[human_metric]

        output_aggregation_table = np.zeros([len(system_ids)])
        human_aggregation_table = np.zeros([len(system_ids)])

        for i, s_id in enumerate(system_ids):
            rows = rows_by_system[s_id]
            padding = [0] * empties_by_system.get(s_id, 0)
            id_predictions = [predictions_of_metric[j] for j in rows] + padding
            id_human_scores = [metric_real[j] for j in rows] + padding

            output_aggregation_table[i] = np.mean(np.array(id_predictions))
            human_aggregation_table[i] = np.mean(np.array(id_human_scores))

        correlations[metric_name] = {
            'Spearman': spearmanr(human_aggregation_table, output_aggregation_table)[0],
            'Kendall': kendalltau(human_aggregation_table, output_aggregation_table)[0],
            'Pearson': pearsonr(human_aggregation_table, output_aggregation_table)[0]
        }

    # Build the message from the metrics that were actually computed; the
    # previous hard-coded Q1..Q5 template raised KeyError when predictions had
    # fewer than five columns. The text for the five-column case is unchanged.
    log_msg = ''.join(
        '{} -> {} \n'.format(
            name,
            ''.join('{}={:.3f} '.format(metric, scores[metric])
                    for metric in statistic_names))
        for name, scores in correlations.items())

    LOGGER.info(log_msg)
    return correlations
def score(self, X, y):
    """Return Kendall's tau between the targets y and self.predict(X)."""
    tau, _p_value = kendalltau(y, self.predict(X))
    return tau
def kendalltau_eval(preds_targs, preds_col="preds", targs_col="targs"):
    """Return Kendall's tau between a prediction and a target column.

    Rows in which either of the two columns is null are dropped, both
    columns are converted to average ranks, and the tau statistic of the
    ranked columns is returned (the p-value is discarded).

    Parameters
    ----------
    preds_targs : pandas.DataFrame
        Frame containing at least ``preds_col`` and ``targs_col``.
    preds_col : str, default "preds"
        Name of the column holding the predictions.
    targs_col : str, default "targs"
        Name of the column holding the targets.

    Returns
    -------
    float
        Kendall's tau of the two columns.
    """
    # Select by the requested names; the column names used to be hard-coded
    # here, so any non-default preds_col/targs_col raised a KeyError.
    selected = preds_targs[[preds_col, targs_col]]
    complete = selected[~selected.isnull().any(axis=1)]
    ranked = complete.rank(method="average")
    return kendalltau(ranked[preds_col].values,
                      ranked[targs_col].values)[0]
def clean_data_and_find_correlations(*, df, row_Filter=None, colName_main_variable, verbose=False):
    """
    Correlate every column of df with one main column.

    Each column (the main column included, under the name "<main>_") is paired
    with the main column, rows with a missing value in either column are
    dropped for that pair only, and Pearson (on log(x + 16) values), Spearman,
    Kendall and two least-squares regressions (raw and log values) are
    computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data with unique column names.
    row_Filter : list of bool, optional
        Row mask with len(row_Filter) == df.shape[0]; all rows are used when
        omitted.
    colName_main_variable : column label
        Column every other column is compared with.
    verbose : bool, default False
        If true, prints progress and a caution for pairs with too few rows.

    Returns
    -------
    dict
        Keyed by column name. Each entry holds "X_main_group_sliced_with",
        "X_main_group", "Y_compared_with", "X_total", "X_cleaned",
        "X_cleaned_log", "Y_total", "Y_cleaned", "Y_cleaned_log",
        "pearson_results", "sperman_results", "kendalltau_results",
        "linregress_results" and "linregress_results_log". For pairs with
        fewer than 3 complete rows every data/result entry is [None].

    Notes
    -----
    RuntimeWarning is silenced process-wide as a side effect.
    """
    import warnings

    # Columns with many repeats or NaN trigger RuntimeWarnings in the
    # statistics below; they are deliberately silenced.
    warnings.filterwarnings("ignore", category=RuntimeWarning)

    # `is None` instead of `== None`: an array-like mask would otherwise be
    # compared element-wise.
    if row_Filter is None:
        row_Filter = [True] * df.shape[0]

    filtered_df_main_col = pd.Series(df.loc[row_Filter, colName_main_variable])

    # Rename the main column so that pairing it with itself does not produce
    # duplicate column names. rename() returns a new frame, which avoids
    # modifying a slice of the caller's frame in place.
    filtered_df = df.loc[row_Filter, :].rename(
        columns={
            str(colName_main_variable): "".join([str(colName_main_variable), "_"])
        })

    comparisons_dct = dict()
    for i, colName in enumerate(filtered_df.columns):

        if verbose:
            print(i, colName_main_variable, " - with - ", colName)

        one_comparison_dct = dict()
        one_comparison_dct["X_main_group_sliced_with"] = row_Filter
        one_comparison_dct["X_main_group"] = colName_main_variable
        one_comparison_dct["Y_compared_with"] = colName

        # Pair the main column with the current one.
        filtered_df_one_col = filtered_df.loc[:, colName]
        data_for_comparison_df_full = pd.concat(
            [filtered_df_main_col, filtered_df_one_col], axis=1)  # for histograms

        # Complete rows only, used for X/Y_cleaned.
        data_for_comparison_df = data_for_comparison_df_full.dropna(how="any", axis=0)

        sample_number = int(data_for_comparison_df.shape[0])
        atribute_names = list(data_for_comparison_df.columns)

        # Too few complete rows: store placeholders and move on.
        if sample_number <= 2:
            for key in ("X_total", "X_cleaned", "X_cleaned_log",
                        "Y_total", "Y_cleaned", "Y_cleaned_log",
                        "pearson_results", "sperman_results", "kendalltau_results",
                        "linregress_results", "linregress_results_log"):
                one_comparison_dct[key] = [None]

            if verbose:
                print(
                    f"Caution, (column cobination nr {i}) -ie.- {atribute_names[0]},vs{atribute_names[1]}, has less then 3 items to compare !"
                )

            comparisons_dct[colName] = one_comparison_dct
            continue

        # All available values of each column separately, for histograms.
        X_total = data_for_comparison_df_full.iloc[:, 0].dropna(how="any").values.flatten()
        Y_total = data_for_comparison_df_full.iloc[:, 1].dropna(how="any").values.flatten()

        # Values from complete rows only.
        X_cleaned = data_for_comparison_df.iloc[:, 0].values.flatten()
        Y_cleaned = data_for_comparison_df.iloc[:, 1].values.flatten()

        # log(x + 16); the offset keeps the argument away from zero.
        X_cleaned_log = np.log(X_cleaned + 16)
        Y_cleaned_log = np.log(Y_cleaned + 16)

        one_comparison_dct["X_total"] = X_total
        one_comparison_dct["X_cleaned"] = X_cleaned
        one_comparison_dct["X_cleaned_log"] = X_cleaned_log
        one_comparison_dct["Y_total"] = Y_total
        one_comparison_dct["Y_cleaned"] = Y_cleaned
        one_comparison_dct["Y_cleaned_log"] = Y_cleaned_log

        # Correlations: Pearson on the log values, rank based ones on raw values.
        one_comparison_dct["pearson_results"] = stats.pearsonr(X_cleaned_log, Y_cleaned_log)
        one_comparison_dct["sperman_results"] = stats.spearmanr(X_cleaned, Y_cleaned)
        one_comparison_dct["kendalltau_results"] = stats.kendalltau(X_cleaned, Y_cleaned)

        # Least-squares regression of Y on X. This used to regress X on
        # itself, which always gave slope 1 and intercept 0.
        one_comparison_dct["linregress_results"] = _linregress_as_dict(
            X_cleaned, Y_cleaned)
        one_comparison_dct["linregress_results_log"] = _linregress_as_dict(
            X_cleaned_log, Y_cleaned_log)

        comparisons_dct[colName] = one_comparison_dct

    return comparisons_dct


def _linregress_as_dict(x, y):
    """Run stats.linregress(x, y) and return its five results as a dict."""
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    return {
        "slope": slope,
        "intercept": intercept,
        "r_value": r_value,
        "p_value": p_value,
        "std_err": std_err
    }
# Load the saved cluster file and fetch the data selected by its 'times' and
# 'spaces' entries. load_data_clust_av is defined elsewhere; it returns a list
# of frames (dfs) and a list of arrays (data).
clust = np.load("erp_cluster.npz")
dfs, data = load_data_clust_av(clust['times'], clust['spaces'])
# Stack everything into a single frame with the concatenated values stored in
# a 'target' column.
all_data = np.concatenate(data)
df = pd.concat(dfs)
df['target'] = all_data
df.is_correct = df.is_correct.astype(bool)
# NOTE(review): the 1e12 factor is presumably a unit rescaling -- confirm.
df.target *= 1e12
# Keep only rows that have a confidence value.
df = df[df.confidence.notna()]
# df.confidence /= 100
# df.confidence -= df.confidence.mean()
# df.target = st.boxcox(df.target - df.target.min() * 1.01)[0]
# df.target=st.boxcox(df.target - df.target.min() * 1.01, lmbda=1)
# The three results below are not stored, so they are only visible when the
# statements are run interactively.
st.spearmanr(df.confidence, df.target)
st.kendalltau(df.is_correct, df.target)
st.pearsonr(df.confidence, df.target)
# Mixed linear model of target on correctness, confidence and their
# interaction, grouped by subject.
# NOTE(review): smf is presumably statsmodels.formula.api -- confirm.
md = smf.mixedlm(
    "target ~ is_correct*confidence",
    data=df,
    groups=df.subject,
    # re_formula="~confidence",
)
mdf = md.fit(method="powell")
mdf.summary()
# Split into low (< 40) and high (> 50) confidence rows; rows with confidence
# from 40 to 50 inclusive end up in neither subset.
df_sep = df.copy()
df_low = df_sep[df_sep.confidence < 40]
df_high = df_sep[df_sep.confidence > 50]
print("std", np.std(y1)) # # This is needed as per statsmodel documentation # print('x before : ' , x) x = sm.add_constant(x) # print('x after : ' , x) # ##################################### regression model = sm.OLS(y, x) results = model.fit() print("summary : ", results.summary()) # # print('Parameters: ', results.params) print('results.params : ', results.params) # print(x1, y) pc = stats.pearsonr(x1, y) print('pc : ', pc) tau = stats.kendalltau(x1, y) # print(tau) rho = stats.spearmanr(x1, y) # print(rho) # # creating regression line xx = x1 # print(type(results.params[1]), results.params[1]) print('x1 ', type(x1), ' results.params[1] ', type(results.params[1])) # print('calculations : ', pc*np.std(y1)/np.std(x1)) yy = results.params[0] + x1 * results.params[1] plt.scatter(x1, y, s=None, marker='o', color='g',
def ctsimilarities_cal(data1, data2, sub_opt=1, chl_opt=1, time_win=10, time_step=5, method='spearman'):
    """
    Calculate the Cross-Temporal Similarities between neural data under two conditions

    Parameters
    ----------
    data1 : array
        EEG/MEG data under condition1, with shape [n_subs, n_channels, n_ts]
        (number of subjects, channels and time-points).
    data2 : array
        EEG/MEG data under condition2, with the same shape as data1.
    sub_opt : int 0 or 1. Default is 1.
        If sub_opt=1, return the result for each subject.
        If sub_opt=0, return the average over all subjects.
    chl_opt : int 0 or 1. Default is 1.
        If chl_opt=1, calculate the result for each channel.
        If chl_opt=0, the windows of all channels are concatenated into one
        vector per time-window before comparing.
    time_win : int. Default is 10.
        Number of time-points in each time-window.
    time_step : int. Default is 5.
        Offset between the starts of two consecutive time-windows.
    method : string 'spearman' or 'pearson' or 'kendall' or 'similarity' or 'distance'. Default is 'spearman'.
        'spearman', 'pearson' and 'kendall' compute the respective correlation,
        'similarity' computes 0.5 + 0.5 * cosine similarity and 'distance'
        the Euclidean distance.

    Returns
    -------
    CTSimilarities : array or None
        With nts = int((n_ts-time_win)/time_step)+1 the base shape is
        [n_subs, n_channels, nts, nts]; the n_subs axis is dropped when
        sub_opt=0 and the n_channels axis when chl_opt=0. For the three
        correlation methods a last axis of size 2 holds (coefficient, p-value).
        None is returned when sub_opt, chl_opt or method is not one of the
        supported values.
    """

    # The data are 3-D; unpacking a fourth (condition) dimension here used to
    # fail for the documented input shape.
    n_subs, n_chls, n_ts = np.shape(data1)
    nts = int((n_ts - time_win) / time_step) + 1

    if chl_opt not in (0, 1):
        return None

    # [n_subs, n_chls, nts, time_win]
    windows1 = _ct_windows(data1, nts, time_win, time_step)
    windows2 = _ct_windows(data2, nts, time_win, time_step)

    if chl_opt == 0:
        # One vector per time-window made of all channels:
        # [n_subs, 1, nts, n_chls * time_win]
        windows1 = windows1.transpose(0, 2, 1, 3).reshape(
            n_subs, nts, n_chls * time_win)[:, np.newaxis]
        windows2 = windows2.transpose(0, 2, 1, 3).reshape(
            n_subs, nts, n_chls * time_win)[:, np.newaxis]

    n_units = windows1.shape[1]
    # np.float64 replaces the np.float alias removed from recent NumPy.
    scores = np.zeros([n_subs, n_units, nts, nts, 2], dtype=np.float64)

    for sub in range(n_subs):
        for unit in range(n_units):
            for t1 in range(nts):
                for t2 in range(nts):
                    # Both operands are indexed with the channel/unit; the
                    # per-channel branch used to drop that index.
                    scores[sub, unit, t1, t2] = _ct_pair_score(
                        windows1[sub, unit, t1], windows2[sub, unit, t2], method)

    if chl_opt == 0:
        scores = scores[:, 0]

    if sub_opt == 0:
        scores = np.average(scores, axis=0)
    elif sub_opt != 1:
        return None

    if method in ('spearman', 'pearson', 'kendall'):
        return scores
    if method in ('similarity', 'distance'):
        # Only the first slot carries a value for these two methods.
        return scores[..., 0]
    return None


def _ct_windows(data, nts, time_win, time_step):
    """Cut [n_subs, n_chls, n_ts] data into nts windows of time_win points."""
    data = np.asarray(data, dtype=np.float64)
    n_subs, n_chls = data.shape[:2]
    windows = np.zeros([n_subs, n_chls, nts, time_win], dtype=np.float64)
    for t in range(nts):
        start = t * time_step
        windows[:, :, t] = data[:, :, start:start + time_win]
    return windows


def _ct_pair_score(v1, v2, method):
    """Return the (value, p-value) pair of one method for two 1-D vectors."""
    if method == 'spearman':
        coefficient, p_value = spearmanr(v1, v2)
        return coefficient, p_value
    if method == 'pearson':
        coefficient, p_value = pearsonr(v1, v2)
        return coefficient, p_value
    if method == 'kendall':
        coefficient, p_value = kendalltau(v1, v2)
        return coefficient, p_value
    if method == 'similarity':
        cos = float(np.dot(v1, v2)) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        return 0.5 + 0.5 * cos, 0.0
    if method == 'distance':
        return np.linalg.norm(v1 - v2), 0.0
    # Unsupported method: leave the entry at zero, the caller returns None.
    return 0.0, 0.0
# (v_degrees == k) es una lista por nodo: 1 si el nodo tiene grado k, 0 si no # essential_vertex es una lista de escencialidad de nodo: 1 si el nodo es escencial, 0 si no # el producto actua como operador "y"-logico: 1 si es escencial y de grado k essential_hist[i] = np.sum( ( v_degrees == k )*essential_vertex ) data = pd.DataFrame({'degrees':degrees,'nodes':hist,'essentials':essential_hist}) # histogramas como funcion de k percent = np.linspace(0,1,100) # x,y = essential_fraction_array(data,percent,v_degrees) # subplot.plot(x,y,'-',label=name) # # Medicion de correlacion x,y ,kcut= essential_fraction(data,args.percent_cut,v_degrees,return_data_cut=True) tau, tp_value = kendalltau(x,y) rho, rp_value = spearmanr(x,y) print("%25s: %.2f(%g)\t%.2f(%.2g)\t%3i"%(name,tau,tp_value,rho,rp_value,kcut)) subplot.set_xlabel('Fraccion de hubs en la red',fontsize=20) subplot.set_ylabel('Fraccion de hubs esenciales',fontsize=20) subplot.tick_params(labelsize=20) subplot.legend(loc='best') plt.savefig('ess_hub.pdf') plt.show()
def calculate_average_kendall_tau(self, rankings, values, weights, ranks):
    """
    Average, per query and model, how much rankings change between epochs.

    For every model the epochs are visited in sorted order; from the second
    epoch seen for a query onwards, the ranking of that epoch is compared with
    the ranking of the previously seen epoch of the same query.

    Parameters
    ----------
    rankings : dict
        rankings[model][epoch][query] is a list; it must contain the value 5
        (NOTE(review): presumably the id of a tracked item -- confirm).
    values : unused.
    weights : dict
        Indexed as weights[epoch][query][item].
    ranks : dict
        Indexed as ranks[model][epoch][query]; epoch - 1 must exist for every
        epoch that is compared.

    Returns
    -------
    (kendall, change_rate, rbo_min_models)
        kendall[kind][query][model] with kind in "reg", "max", "mean",
        change_rate[query][model] with the keys "reg" and "winner", and
        rbo_min_models[query][model]; every leaf is the np.mean of the values
        collected over the epochs (NaN when nothing was collected).
    """
    kendall = {i: {} for i in ["reg", "max", "mean"]}
    change_rate = {}
    rbo_min_models = {}
    for model in rankings:
        rankings_list_lm = rankings[model]
        last_list_index_lm = {}
        epochs = sorted(list(rankings_list_lm.keys()))
        for epoch in epochs:
            for query in rankings_list_lm[epoch]:
                # Membership tests instead of .get(key, False): an empty
                # accumulator list is falsy, so the old check re-initialised
                # all accumulators of a (query, model) pair and threw away the
                # entries already collected whenever the "reg" list was still
                # empty (for example after a NaN tau).
                if query not in kendall["reg"]:
                    kendall["reg"][query] = {}
                    kendall["max"][query] = {}
                    kendall["mean"][query] = {}
                    change_rate[query] = {}
                    rbo_min_models[query] = {}
                if model not in kendall["reg"][query]:
                    kendall["reg"][query][model] = []
                    kendall["mean"][query][model] = []
                    kendall["max"][query][model] = []
                    change_rate[query][model] = {"reg": [], "winner": []}
                    rbo_min_models[query][model] = []

                current_list_svm = rankings_list_lm[epoch][query]
                # First epoch seen for this query: nothing to compare with yet.
                if query not in last_list_index_lm:
                    last_list_index_lm[query] = current_list_svm
                    continue

                # Did the position of item 5 change since the previous epoch?
                if current_list_svm.index(5) != last_list_index_lm[query].index(5):
                    change_rate[query][model]["reg"].append(1)
                    change_rate[query][model]["winner"].append(
                        float(1) / (weights[epoch][query][
                            ranks[model][epoch][query][0]] + 1))
                else:
                    change_rate[query][model]["reg"].append(0)
                    change_rate[query][model]["winner"].append(0)

                kt = kendalltau(current_list_svm, last_list_index_lm[query])[0]
                kt_max = weighted_kendall_tau(
                    ranks[model][epoch][query],
                    ranks[model][epoch - 1][query],
                    weights[epoch][query], "max")
                kt_mean = weighted_kendall_tau(
                    ranks[model][epoch][query],
                    ranks[model][epoch - 1][query],
                    weights[epoch][query], "mean")

                # NOTE(review): the source had lost its indentation; only the
                # plain tau is assumed to be guarded by the NaN check --
                # confirm against the original layout.
                if not np.isnan(kt):
                    kendall["reg"][query][model].append(kt)
                kendall["max"][query][model].append(kt_max)
                kendall["mean"][query][model].append(kt_mean)

                rbo = r.rbo_dict(
                    {x: j for x, j in enumerate(last_list_index_lm[query])},
                    {x: j for x, j in enumerate(current_list_svm)},
                    0.7)["min"]
                rbo_min_models[query][model].append(rbo)
                last_list_index_lm[query] = current_list_svm

    # Collapse every collected list into its mean.
    for query in kendall["reg"]:
        for model in kendall["reg"][query]:
            kendall["reg"][query][model] = np.mean(
                kendall["reg"][query][model])
            kendall["max"][query][model] = np.mean(
                kendall["max"][query][model])
            kendall["mean"][query][model] = np.mean(
                kendall["mean"][query][model])
            rbo_min_models[query][model] = np.mean(
                rbo_min_models[query][model])
            change_rate[query][model]["reg"] = np.mean(
                change_rate[query][model]["reg"])
            change_rate[query][model]["winner"] = np.mean(
                change_rate[query][model]["winner"])
    return kendall, change_rate, rbo_min_models
def time_kendalltau(self, nan_policy, method, variant):
    """Run stats.kendalltau on self.a and self.b with the given options.

    The result is unpacked and discarded; nothing is returned.
    """
    options = dict(nan_policy=nan_policy, method=method, variant=variant)
    _tau, _p_value = stats.kendalltau(self.a, self.b, **options)
save_best_only=True), cb.EarlyStopping(patience=args.num_epochs // 8, restore_best_weights=True), cb.CSVLogger(os.path.join(test_dir, 'train_log.csv')), cb.TerminateOnNaN() ]) # Run on the validation set and assess statistics y_true = np.hstack([np.squeeze(x[1].numpy()) for x in iter(test_loader)]) test_time = perf_counter() y_pred = np.squeeze(model.predict(test_loader)) test_time = perf_counter() - test_time pd.DataFrame({ 'true': y_true, 'pred': y_pred }).to_csv(os.path.join(test_dir, 'test_results.csv'), index=False) with open(os.path.join(test_dir, 'test_summary.json'), 'w') as fp: json.dump( { 'r2_score': float(np.corrcoef(y_true, y_pred)[1, 0]** 2), # float() converts from np.float32 'spearmanr': float(spearmanr(y_true, y_pred)[0]), 'kendall_tau': float(kendalltau(y_true, y_pred)[0]), 'mae': float(np.mean(np.abs(y_pred - y_true))), 'rmse': float(np.sqrt(np.mean(np.square(y_pred - y_true)))) }, fp, indent=2)
def func(a, b):
    """Return Kendall's tau between a and b, discarding the p-value."""
    tau, _p_value = kendalltau(a, b)
    return tau
def kendall_correlation(gt_video_rank, test_video_rank):
    """Return (tau, p_value) of test_video_rank against the order 0..n-1.

    Only the length of gt_video_rank is used: it must equal the length of
    test_video_rank (checked with an assert), and the reference sequence
    passed to Kendall's tau is range(n).
    """
    n_items = len(gt_video_rank)
    assert n_items == len(test_video_rank)
    reference_order = range(n_items)
    result = stats.kendalltau(reference_order, test_video_rank)
    return result[0], result[1]
def eval_corr_single(self, alpha, beta):
    """Return Kendall's tau between alpha and beta, without the p-value."""
    return stats.kendalltau(alpha, beta)[0]
requeues, overlimits ] for i in range(0, len(x)): x[i] = float(x[i]) data.append(x) # retrans = int(retrans) y.append(ratio) x = ("ratio", "drop_count", "busy_time", "ext_busy_time", "rx_time", "tx_time", "scan_time", "freq", "noise", "bytes", "packets", "qlen", "backlog", "drops", "requeues", "overlimits") IG = info_gain(data, y) # print IG print "iGGGGGGGGGGGGGGGGG" for k, v in IG: try: print x[k], v except Exception: print k, len(x) data = np.matrix(data).T for i in range(0, len(x)): try: print x[i], 'a', stats.kendalltau(data[i, ], data[0, ]) except Exception: print i, 'abc' if csvfile: csvfile.close() # del data, y gc.collect()
}))) #initialize an empty dataframe to append to final_win_loss = [] #loop through all the teams and have the rows append for j in range(len(team_ids)): row = team_win_loss(margins_long, j) final_win_loss.append(row) final_win_loss = (pd.concat(final_win_loss).sort_values( by=["Wins", "Ties", "Points"], ascending=False).assign(Standing=np.arange(1, 11)).sort_values( by=["Team"]).reset_index(drop=True)) comparison = (final_win_loss.replace({ "Team": mapping }).set_index("Team").merge( how="right", right=predicted_ranks, left_index=True, right_index=True).loc[:, ["Rank", "Standing"]].rename(columns={ "Rank": "Predicted", "Standing": "Observed" })) tau, p_value = stats.kendalltau(comparison.Predicted, comparison.Observed) #heatmap of the percentages counts2 = counts.reindex(predicted_ranks.index) / 10 cmap = sns.diverging_palette(10, 150, as_cmap=True) sns.heatmap(counts2, cmap=cmap, cbar=False, annot=counts2, linewidth=0.5) plt.title("1000 Simulations - Percentage of Each Final Standing")
def correlation_test(self, data1, data2, normal_dist=True, corr_algo="spearman"):
    """
    Check whether two samples are related and print the verdict.

    One of three correlation tests is applied:

    1. Pearson's correlation coefficient (used when normal_dist is true)
        Tests whether two samples have a linear relationship.
        Assumptions
            Observations in each sample are independent and identically distributed (iid).
            Observations in each sample are normally distributed.
            Observations in each sample have the same variance.

    2. Spearman's rank correlation (corr_algo="spearman")
        Tests whether two samples have a monotonic relationship.
        Assumptions
            Observations in each sample are independent and identically distributed (iid).
            Observations in each sample can be ranked.

    3. Kendall's rank correlation (corr_algo="kendall")
        Tests whether two samples have a monotonic relationship.
        Assumptions
            Observations in each sample are independent and identically distributed (iid).
            Observations in each sample can be ranked.

    Interpretation (all three tests)
        H0: the two samples are independent.
        H1: there is a dependency between the samples.
    H0 is rejected when the p-value is not greater than 0.05.

    Args:
        data1: input data1
        data2: input data2
        normal_dist: truthy if the samples have a Normal Distribution.
        corr_algo: rank correlation algorithm name, only used when
            normal_dist is falsy.

    Returns:
        The correlation coefficient.

    Raises:
        ValueError: if a rank correlation is requested with an unsupported
            corr_algo.
    """
    # Truthiness instead of `is True`: a numpy.bool_ (e.g. the outcome of
    # `p > alpha` from a normality test) is not the `True` singleton and used
    # to fall through to the rank correlation silently.
    if normal_dist:
        corr, p = pearsonr(data1, data2)
    elif corr_algo == "spearman":
        corr, p = spearmanr(data1, data2)
    elif corr_algo == "kendall":
        corr, p = kendalltau(data1, data2)
    else:
        raise ValueError("not supported rank correlation!")

    # interpret the significance
    alpha = 0.05
    if p > alpha:
        print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p)
    else:
        print('Samples are correlated (reject H0) p=%.3f' % p)

    return corr
def _taxco_report_path(directory, f, tag, coefficient, pvalue):
    """Return the report path for input *f* with ``taxcoIn`` replaced by *tag*."""
    parts = re.sub("taxcoIn", tag, f).split(".")
    return (directory + "/" + parts[0] + "_" + coefficient + "_" + str(pvalue)
            + "." + parts[1])


def _taxco_correlate(coefficient, scores1, scores2):
    """Run the requested scipy correlation test on two numeric score lists."""
    if coefficient == "pearson":
        return scistats.pearsonr(scores1, scores2)
    if coefficient == "spearman":
        return scistats.spearmanr(scores1, scores2, axis=0)
    return scistats.kendalltau(scores1, scores2)


def correlateFile(f, directory, coefficient, pvalue, cutoff):
    """Correlate every pair of taxa in a score table and write two reports.

    ``directory/f`` is read as a tab-separated table: the first line is a
    header and is skipped; every other non-empty line holds a taxon name
    followed by its scores.  Scores are converted to ``float`` for all three
    coefficients, so rank-based tests rank numbers rather than strings.

    Two files are written into ``directory``.  Their names are derived from
    ``f`` by replacing ``taxcoIn`` with ``taxcoList`` / ``taxcoCor`` and
    inserting ``_<coefficient>_<pvalue>`` before the extension (``f`` must
    therefore contain a ``.``):

    * ``taxcoList...``: a header plus one line per unordered taxon pair whose
      p-value is below ``pvalue`` and whose absolute coefficient, rounded to
      two decimals, is greater than ``cutoff``.
    * ``taxcoCor...``: the full taxon-by-taxon matrix; cells whose p-value is
      not below ``pvalue`` are written as ``0.0``.

    Args:
        f: name of the input file inside ``directory``.
        directory: directory holding the input and receiving the reports.
        coefficient: ``"pearson"``, ``"spearman"`` or ``"kendall"``.
        pvalue: significance threshold applied to each test's p-value.
        cutoff: minimum (exclusive) absolute coefficient for the pair list.

    Raises:
        SystemExit: with code -1, after printing a message, when
            ``coefficient`` is not supported.  This is checked before any
            file is created.
    """
    if coefficient not in ("pearson", "spearman", "kendall"):
        print("This correlation coefficient is not supported. "
              "Please use spearman, kendall or pearson.")
        raise SystemExit(-1)

    # Parse the table once; converting here avoids re-parsing per pair.
    scores_by_taxon = c.OrderedDict()
    with open(directory + "/" + f, "r") as infile:
        next(infile, None)  # skip the header line
        for line in infile:
            line = line.rstrip("\n")
            if not line:
                continue  # tolerate blank / trailing empty lines
            fields = line.split("\t")
            scores_by_taxon[fields[0]] = [float(value) for value in fields[1:]]

    matrix = c.OrderedDict()
    reported_pairs = set()  # unordered pairs already written to the list
    list_path = _taxco_report_path(directory, f, "taxcoList", coefficient, pvalue)
    with open(list_path, "w") as pair_file:
        pair_file.write("Taxon1\tTaxon2\t" + coefficient + "\tp-value")
        for taxon, scores in scores_by_taxon.items():
            row = []
            for other, other_scores in scores_by_taxon.items():
                if taxon == other:
                    # The diagonal is reported as a zero coefficient.
                    correlation = (0.0, 0.0)
                else:
                    correlation = _taxco_correlate(coefficient, scores, other_scores)
                strength = round(abs(correlation[0]), 2)
                pair = frozenset((taxon, other))
                if (correlation[1] < pvalue and strength > cutoff
                        and pair not in reported_pairs):
                    pair_file.write("\n" + taxon + "\t" + other + "\t"
                                    + str(correlation[0]) + "\t" + str(correlation[1]))
                    reported_pairs.add(pair)
                row.append(correlation)
            matrix[taxon] = row

    matrix_path = _taxco_report_path(directory, f, "taxcoCor", coefficient, pvalue)
    with open(matrix_path, "w") as matrix_file:
        matrix_file.write("\t".join(["Taxon"] + list(matrix)) + "\n")
        for taxon, row in matrix.items():
            cells = [str(correlation[0]) if correlation[1] < pvalue else "0.0"
                     for correlation in row]
            matrix_file.write("\t".join([taxon] + cells) + "\n")
def custom(a, b):
    """Return the p-value of Kendall's tau test on *a* and *b*, rounded to 4 places."""
    # kendalltau yields (statistic, p-value); only the p-value is reported.
    _statistic, p_value = stats.kendalltau(a, b)
    rounded_p = round(p_value, 4)
    return rounded_p
def _correlation(table, vars, method='pearson', display_plt=True, height=2.5, corr_prec=2):
    """Compute pairwise correlations between columns and build a report.

    For every pair ``(vars[i], vars[j])`` with ``j < i`` the coefficient and
    p-value are computed with scipy and collected in a DataFrame with the
    columns ``x``, ``y``, ``corr`` and ``p_value``.

    Args:
        table: data indexed by the names in ``vars`` (``table[name]`` is
            passed to scipy and ``table`` itself to ``sns.PairGrid``).
        vars: column names to correlate.
        method: ``'pearson'`` or ``'spearman'``; any other value falls
            through to Kendall's tau.
        display_plt: when true, a seaborn pair grid is drawn and embedded in
            the report in addition to the table.
        height: passed to ``sns.PairGrid`` and used to scale marker and
            font sizes.
        corr_prec: number of decimals shown for the coefficient in the
            upper-triangle annotations; also divides the font size, so it
            must not be 0.

    Returns:
        ``{'result': res}`` where ``res`` holds ``'params'``,
        ``'corr_table'`` (the DataFrame) and ``'_repr_brtc_'`` (the value of
        ``rb.get()``).
    """
    size = len(vars)
    result_arr = []
    # Lower triangle only: each unordered pair is tested once.
    for i in range(size):
        for j in range(i):
            if method == 'pearson':
                r, p = stats.pearsonr(table[vars[i]], table[vars[j]])
            elif method == 'spearman':
                r, p = stats.spearmanr(table[vars[i]], table[vars[j]])
            else:
                r, p = stats.kendalltau(table[vars[i]], table[vars[j]])
            result_arr.append([vars[i], vars[j], r, p])
    df_result = pd.DataFrame(result_arr, columns=['x', 'y', 'corr', 'p_value'])
    # BrtcReprBuilder is defined elsewhere; presumably it accumulates the
    # markdown report -- verify against its definition.
    rb = BrtcReprBuilder()
    if display_plt:
        # Scale the default scatter marker area with the requested facet height.
        s_default = plt.rcParams['lines.markersize']**2.
        scatter_kws = {"s": s_default * height / 6.4}

        def corr(x, y, **kwargs):
            # Upper-triangle callback: recomputes the statistic for the facet
            # and annotates the current axes instead of plotting data.
            if kwargs['method'] == 'pearson':
                r, p = stats.pearsonr(x, y)
            elif kwargs['method'] == 'spearman':
                r, p = stats.spearmanr(x, y)
            else:
                r, p = stats.kendalltau(x, y)
            # Significance stars: * (p <= 0.05), ** (p <= 0.01), *** (p <= 0.001).
            p_stars = ''
            if p <= 0.05:
                p_stars = '*'
            if p <= 0.01:
                p_stars = '**'
            if p <= 0.001:
                p_stars = '***'
            corr_text = '{:.{prec}f}'.format(r, prec=corr_prec)
            # Stronger correlations are drawn in a larger font.
            font_size = abs(r) * 15 * 2 / corr_prec + 5
            ax = plt.gca()
            ax.annotate(corr_text, [.5, .5, ], xycoords="axes fraction", ha='center', va='center', fontsize=font_size * height)
            ax.annotate(p_stars, xy=(0.65, 0.6), xycoords=ax.transAxes, color='red', fontsize=17 * height)

        g = sns.PairGrid(table, vars=vars, height=height)
        g.map_diag(sns.distplot)
        # Pearson gets a linear fit; the rank-based methods get a lowess fit.
        if method == 'pearson':
            g.map_lower(sns.regplot, scatter_kws=scatter_kws)
        else:
            g.map_lower(sns.regplot, lowess=True, scatter_kws=scatter_kws)
        g.map_upper(corr, method=method)
        # plt2MD is defined elsewhere; presumably it renders the current
        # figure as embeddable markdown -- TODO confirm.
        fig_corr = plt2MD(plt)
        plt.clf()
        # NOTE(review): the first line of the template carries no '|' margin
        # marker -- confirm strip_margin treats it as intended.
        rb.addMD(
            strip_margin("""
        ## Correlation Results
        | ### Correlation Matrix
        | {fig_corr}
        |
        | ### Correlation Table
        | {table}
        """.format(fig_corr=fig_corr, table=pandasDF2MD(df_result))))
        params = {'vars': vars, 'method': method, 'height': height}
    else:
        # Table-only report; 'height' is not recorded in params here.
        rb.addMD(
            strip_margin("""
        ## Correlation Results
        | ### Correlation Table
        | {table}
        """.format(table=pandasDF2MD(df_result))))
        params = {'vars': vars, 'method': method}
    res = dict()
    res['params'] = params
    res['corr_table'] = df_result
    res['_repr_brtc_'] = rb.get()
    return {'result': res}
# Directed edges of the red graph, listed per source node.  Sources and their
# targets are kept in insertion order so nodes enter the graph in the same
# sequence as with a flat edge list.
redSuccessors = [
    ('0', ['6', '4']),
    ('1', ['2', '5', '6', '7']),
    ('2', ['1', '3', '7']),
    ('3', ['2', '7', '4']),
    ('4', ['3', '7', '6', '0', '5']),
    ('5', ['4', '6', '1']),
    ('6', ['0', '1', '7', '4', '5']),
]
graphRed.add_edges_from(
    [(source, target) for source, targets in redSuccessors for target in targets])

# Compare the networkx pagerank of the red graph with the reference ranking.
prRed = nx.pagerank(graphRed, dampFactor)
print("Approximate pagerank of red Graph:", prRed)
truePrRed, kendalLisRed = realPageRank(pr.get('RED'), prRed, 0, 7)
print("Real pagerank of red Graph:", truePrRed)
listRed = fromListToVector(prRed, 0, 7)
corrRed, _ = kendalltau(listRed, kendalLisRed)
print('Kendall Rank correlation between reds: %.5f' % corrRed)
print("\n\n")

# green graph
greenSuccessors = [
    ('8', ['9', '14']),
    ('9', ['8', '10', '11', '13']),
    ('10', ['9', '14', '13', '11']),
    ('11', ['10', '9', '12']),
    ('12', ['11', '10', '13']),
    ('13', ['12', '10', '9', '14']),
    ('14', ['13', '10', '8']),
]
graphGreen = nx.DiGraph()
graphGreen.add_edges_from(
    [(source, target) for source, targets in greenSuccessors for target in targets])
prGreen = nx.pagerank(graphGreen, dampFactor)
def _kendall(a, b):
    """Return Kendall's tau statistic for *a* and *b*, dropping the p-value."""
    # kendalltau yields (tau statistic, p-value); index 0 is the statistic.
    return kendalltau(a, b)[0]
# Favorite counts are only analysed for original tweets (no retweets) whose
# hashtag count is not 4; select those rows once and reuse the two columns.
favorite_rows = df[(df['retweet'] == 'No') & (df['hashtag_count'] != 4)]
favorite_detrended = favorite_rows['favorite_count_detrend_weekdayhour']
favorite_hashtags = favorite_rows['hashtag_count']

coefficient, p_val = stats.pearsonr(favorite_detrended, favorite_hashtags)
pearson_favorite = ["Favorite", 'Pearson', coefficient, p_val]

coefficient, p_val = stats.spearmanr(df['retweet_count_detrend_weekdayhour'],
                                     df['hashtag_count'])
spearman_retweet = ["Retweet", 'Spearman', coefficient, p_val]

coefficient, p_val = stats.spearmanr(favorite_detrended, favorite_hashtags)
spearman_favorite = ["Favorite", 'Spearman', coefficient, p_val]

coefficient, p_val = stats.kendalltau(df['retweet_count_detrend_weekdayhour'],
                                      df['hashtag_count'])
kendall_retweet = ["Retweet", 'Kendall', coefficient, p_val]

coefficient, p_val = stats.kendalltau(favorite_detrended, favorite_hashtags)
kendall_favorite = ["Favorite", 'Kendall', coefficient, p_val]

# Summary table: one row per (dependent variable, test) combination.
summary_rows = [
    pearson_retweet,
    spearman_retweet,
    kendall_retweet,
    pearson_favorite,
    spearman_favorite,
    kendall_favorite,
]
summary_columns = ['Dependent Variable', 'Correlation Test', 'Coefficient', 'p-value']
pd.DataFrame(summary_rows, columns=summary_columns)
def plot_p(ds_name):
    """Correlate the parameter ``p`` of each run with its recall over time.

    The runs stored under ``./result_p/<ds_name>`` are loaded with
    ``read_stream_csv``.  For every run, recall (column 0 divided by
    column 3) is linearly interpolated over the timeline (column 2) on an
    integer grid starting at 255.  At ten evenly spaced positions along that
    grid, Kendall's tau between the runs' ``p`` values and their recall at
    that position is computed.

    Despite its name this function does not draw anything; it only returns
    the correlations.

    Args:
        ds_name: name of the result sub-directory under ``./result_p/``.

    Returns:
        A list of ten Kendall tau statistics, one per sampled position, in
        timeline order.
    """
    from scipy.interpolate import interp1d
    from scipy.stats import kendalltau

    eif_dir = './result_p/' + ds_name
    results_eif, _unused_full, ps = read_stream_csv(eif_dir, 3)
    ps = np.array(ps, dtype=float)

    # First timeline position that is sampled; presumably the warm-up
    # window of the stream -- TODO confirm.
    grid_start = 255

    intp_recall = []
    for res in results_eif:
        # Prepend an all-zero row so interpolation is anchored at time 0.
        # NOTE(review): that row's ratio is 0/0 (NaN), which propagates to
        # grid points before the first real record -- confirm this is intended.
        res = np.insert(res, 0, np.array([0, 0, 0, 0]), axis=0)
        f_rec = interp1d(res[:, 2], res[:, 0] / res[:, 3])
        grid = np.arange(res[-1, 2] - grid_start) + grid_start
        # Evaluate the recall interpolator (not the precision one).
        intp_recall.append(f_rec(grid))
    intp_recall = np.array(intp_recall)

    corrs_recall = []
    # Ten evenly spaced sample positions; the leading 0 of linspace is dropped.
    for idx in np.linspace(0, len(intp_recall[0]), num=11)[1:]:
        rec = intp_recall[:, int(idx) - 1]
        corr, _pvalue = kendalltau(ps, rec)
        corrs_recall.append(corr)
    return corrs_recall
def _kendall(a, b):
    """Return Kendall's tau for *a* and *b*.

    When ``kendalltau`` hands back a tuple, its first element (the
    statistic) is returned; any other kind of result is passed through
    unchanged.
    """
    outcome = kendalltau(a, b)
    return outcome[0] if isinstance(outcome, tuple) else outcome
n_top_k = len(top_K_protocols_list)
# Protocol indices ordered from highest to lowest mean; the first n entries
# are the top-n protocols.
descending_by_mean = np.argsort(-mean)

mean_per_change_topk = np.zeros((n_top_k, n_batches - 1))
for k, n in enumerate(top_K_protocols_list):
    top_protocol_idx = descending_by_mean[0:n]
    abs_change = np.abs(change[top_protocol_idx])
    abs_per_change = np.abs(per_change[top_protocol_idx])
    mean_change_topk[k, :] = np.mean(abs_change, axis=0)
    mean_per_change_topk[k, :] = np.mean(abs_per_change, axis=0)

## find change in rankings per kendall tau
tau = np.zeros((n_top_k, n_batches - 2))
for k1, n in enumerate(top_K_protocols_list):
    top_protocol_idx = descending_by_mean[0:n]
    # don't start with 0 - all are tied
    for k2 in range(1, 4):
        ranks1 = len(param_space) - rankdata(data[k2])[top_protocol_idx]
        ranks2 = len(param_space) - rankdata(data[k2 + 1])[top_protocol_idx]
        tau[k1, k2 - 1] = kendalltau(ranks1, ranks2)[0]

## plot
batches = np.arange(n_batches - 1) + 1
plt.subplots(3, 3, figsize=figsize)
# Six single-cell axes fill the first two rows (row-major order); the last
# axis spans the whole third row.
ax0, ax1, ax2, ax3, ax4, ax5 = [
    plt.subplot2grid((3, 3), (row, col)) for row in range(2) for col in range(3)
]
ax6 = plt.subplot2grid((3, 3), (2, 0), colspan=3)

panel_title_style = dict(loc='left', weight='bold', fontsize=8)
ax0.set_title('a', **panel_title_style)
ax1.set_title('b', **panel_title_style)
'Selected fuel consumption(L)', 'Selected kilometer(L)', 'Selected speed(km/h)', 'Service stop status', 'Odometer speed', 'Wheel speed', 'Engine torque mode', 'Percentage of torque on driving instructions', 'Actual percentage of engine torque', 'RPM', 'Coolant temperature', 'Oil pressure', 'ECU fuel consumption', 'Accelerator pedal position', 'Parking brake switch', 'Clutch switch', 'Brake switch', 'Urea tank level', 'Urea tank temperature', 'Engine input voltage', 'Ignition switch voltage', 'Cumulative engine running time', 'Cumulative engine revolutions', 'Engine fuel rate', 'Instantaneous engine fuel rate', 'Average fuel consumption', 'Particle catcher inlet pressure', 'Relative boost pressure', 'Intake manifold temperature', 'Absolute boost pressure', 'Discharge temperature', 'Atmospheric pressure', 'Cabin temperature', 'Atmospheric temperature', 'Cold start light', 'Kilometers of this driving cycle', 'Total kilometers', 'Fuel contains water', 'Target gear', 'Actual speed ratio', 'Current gear', 'Gauge fuel level', 'Odometer subcounts kilometer', 'Total odometer kilometer', 'Integral kilometer', 'Integral fuel consumption', 'Interval brake times', 'Merger marks', 'Compensation transmission'] df.set_index('ID',inplace=True) a, b = df['Interval brake times'], df['Accelerator pedal position'] df1 = pd.concat([a,b],1) df1 = df1.dropna() print(a.corr(b, method='spearman')) # sns.regplot(x='Brakes', y='Accelerator', data=df1) # plt.show() # x1 = stats.pearsonr(a, b) x2 = stats.spearmanr(a, b) # x3 = stats.kendalltau(a, b) # print(x2,x3) if __name__ == '__main__': pass
if not get_is_mol(player): appearance = AppearanceExtractor.get_relative_occurrence(player, parsed_video, [True]) appearances.append(appearance) for feat1, feat2 in itertools.permutations(appearances, 2): input.append(feat1) output.append(feat2) r, p_value = pearsonr(input, output) print("Pearson Test (Between):") print("R value: " + str(r)) print("R-squared value: " + str(r ** 2)) print("p-value: " + str(p_value)) print() t, p_value = kendalltau(input, output) print("Kendall Test (Between):") print("Tau value: " + str(t)) print("p-value: " + str(p_value)) print() input = [] output = [] for season in TEST_SEASONS: for episode in itertools.count(1): appearances = [] parsed_video = VideoParser.load_parsed_video(season, episode) if parsed_video is None: break for player in parsed_video.alive_players: