def f_measure():
    max_val = 0
    max_x = max_x_2 = max_y = None   # guard against no pair clearing the threshold
    for y_col in [1, 2, 3, 4]:
        for x_col in [5, 6, 7, 8, 9, 10, 11, 12]:
            for x_col_2 in [6, 7, 8, 9, 10, 11, 12]:
                # keep only ordered pairs; this also rules out x_col == x_col_2
                if not x_col < x_col_2:
                    continue

                x = df.iloc[:, x_col]
                x2 = df.iloc[:, x_col_2]
                y = df.iloc[:, y_col]
                f = 2 * (x * x2) / (x + x2)  # harmonic mean of the two features
                val = kendalltau(f, y)[0]    # compute once and reuse
                if val < 0.40:
                    continue
                if val > max_val:
                    max_val = val
                    max_x, max_x_2, max_y = df.columns[x_col], df.columns[x_col_2], df.columns[y_col]

                print(df.columns[x_col], df.columns[x_col_2], df.columns[y_col], kendalltau(f, y))

    print('max')
    print(max_x, max_x_2, max_y, " : ", max_val)
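# The snippet above assumes a module-level DataFrame `df` and `kendalltau`
# imported from scipy.stats; a minimal, made-up setup for a smoke test:
import numpy as np
import pandas as pd
from scipy.stats import kendalltau

rng = np.random.default_rng(42)
df = pd.DataFrame(rng.random((100, 13)), columns=[f'c{i}' for i in range(13)])
f_measure()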
Example #2
def getBirthDeadCor(birthAndDeadFileName):
	'''
	Explore the correlation between firm deaths and firm emergence via the
	Spearman coefficient.
	Note: records later than 01/2010 are discarded.

	Parameters:
		birthAndDeadFileName: str
			The file of firm births and deaths in every month.

	Return:
		coefficient: float
			Correlation coefficient between firm death and emergence.
	'''

	with open(birthAndDeadFileName, 'r') as f:
		records = f.readlines()
	seq1, seq2 = [], []
	for birthDeadRecord in records[1:]:  # skip the header line
		month, birth, dead = birthDeadRecord.split('###')[:3]
		if int(month) > 201001:          # drop everything after 01/2010
			break
		seq1.append(int(birth))
		seq2.append(int(dead))
	print(spearmanr(seq1, seq2))
	print(pearsonr(seq1, seq2))
	print(kendalltau(seq1, seq2))
	return spearmanr(seq1, seq2)[0]
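# A minimal usage sketch with made-up data: the file is assumed to hold a
# header line followed by '###'-separated "month###births###deaths" records.
from scipy.stats import spearmanr, pearsonr, kendalltau

with open('birth_dead_sample.txt', 'w') as f:
    f.write('month###birth###dead\n')
    f.write('200901###120###80\n')
    f.write('200902###95###88\n')
    f.write('200903###130###75\n')
print(getBirthDeadCor('birth_dead_sample.txt'))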
    def test_nancorr_kendall(self):
        tm.skip_if_no_package("scipy.stats")
        from scipy.stats import kendalltau

        targ0 = kendalltau(self.arr_float_2d, self.arr_float1_2d)[0]
        targ1 = kendalltau(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0]
        self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="kendall")
        targ0 = kendalltau(self.arr_float_1d, self.arr_float1_1d)[0]
        targ1 = kendalltau(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0]
        self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="kendall")
Example #4
def test_nancorr_kendall(self):
    from scipy.stats import kendalltau
    targ0 = kendalltau(self.arr_float_2d, self.arr_float1_2d)[0]
    targ1 = kendalltau(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0]
    self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1,
                                 method='kendall')
    targ0 = kendalltau(self.arr_float_1d, self.arr_float1_1d)[0]
    targ1 = kendalltau(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0]
    self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1,
                                 method='kendall')
Example #5
    def determineVariance(self, runs):
        # This method determines the variance of parameter values caused by
        # the stochastic nature of the data. Data is generated `runs` times,
        # redetermining only the labels, and the model is fitted each time.
        # The variance of each parameter over all runs is then determined.
        parvar = []
        params = []
        ranks2 = []
        for i in range(len(self.parameters)):
            params.append(np.zeros((runs, len(self.parameters[i]))))
            parvar.append([])

        for i in range(runs):
            othermodel = copy.deepcopy(self)
            othermodel.clearGenerate()
            for d in self.data.giveData():
                othermodel.generate(d[0], d[1])
            # The test part is not yet correct, as the basekcc needs to be set
            # at this point and the kcc reset once the test data has been made.
#            for d in self.data.giveTestData():
#                othermodel.generateTest(d[0],d[1])

            othermodel.fit()

            ranks2.append(self.rankOrder(othermodel))
#            othermodel.aPrime()
            for j, p in enumerate(othermodel.giveParams()):
                params[j][i, :] = p
                parvar[j].append(othermodel.parameterVariance(othermodel.paranames[j]))
        ranks = np.zeros((len(self.parameters), runs, runs))

        variances = []
        for i in range(len(self.parameters)):
            parvar[i] = np.mean(parvar[i])
            for j in range(runs):
                for k in range(j + 1, runs):
                    ranks[i, j, k] = stat.kendalltau(params[i][j, :], params[i][k, :])[0]
            avg = np.mean(params[i], 0)
            var = np.var(params[i], 0, ddof=1)
            variances.append(var)
#            print ""
#            print self.paranames[i]
#            for j in range(params[i].shape[1]):
#                print j,avg[j],var[j]
        #variances=np.concatenate(variances)
        # average tau over the runs*(runs-1)/2 upper-triangular run pairs
        ranks = np.sum(ranks, (1, 2)) / ((runs ** 2 - runs) / 2)
        # note: this overwrites the rankOrder results collected above
        ranks2 = [[] for _ in range(len(self.parameters))]
        for j, p in enumerate(self.giveParams()):
            for i in range(runs):
                ranks2[j].append(stat.kendalltau(params[j][i, :], p)[0])
#        print "\nValues for the inherent ranks vs ranks against sourcemodel\n",ranks,np.mean(np.array(ranks2),1),"\n"

        return (variances, ranks, parvar)
def plot(name, qualities_mes, costs_mes, qualities_th, costs_th):
    fig, axes = plt.subplots(2,1)

    ax1 = axes[0]

    texts_mes = []
    for (i, (quality, cost)) in enumerate(zip(qualities_mes, costs_mes)):
        texts_mes.append(ax1.text(quality, cost, str(i), ha='center', va='center'))

    #print("Measured: ", q, c_cycle)

    color='tab:red'

    ax1.set_ylabel("cost per cycle (µs)")
    ax1.set_xlabel("quality")
    ax1.scatter(qualities_mes, costs_mes,  label="Measured", color=color)
    ax1.tick_params(axis='y', labelcolor=color)
    ax1.grid(True)

    ax2 = axes[1]

    texts_th = []
    for (i, (quality, cost)) in enumerate(zip(qualities_th, costs_th)):
        texts_th.append(ax2.text(quality, cost, str(i), ha='center', va='center'))

    color = 'tab:blue'
    ax2.set_ylabel("cost")
    ax2.set_xlabel("quality")

    ax2.scatter(qualities_th, costs_th,  label="Model", color=color)
    ax2.tick_params(axis='y', labelcolor=color)
    ax2.grid(True)


    adjust_text(texts_mes, ax=ax1)
    adjust_text(texts_th, ax=ax2)

    kendalltau = GraphResults("Kendall's tau")
    kendalltau.costs = stats.kendalltau(costs_mes, costs_th, nan_policy='raise')
    kendalltau.quality = stats.kendalltau(qualities_mes, qualities_th, nan_policy='raise')

    spearmanr = GraphResults("Spearman's R")
    spearmanr.costs = stats.spearmanr(costs_mes, costs_th, nan_policy='raise')
    spearmanr.quality = stats.spearmanr(qualities_mes, qualities_th, nan_policy='raise')

    print(kendalltau.name, ": cost=", kendalltau.costs, " and quality=", kendalltau.quality)
    print(spearmanr.name, ": cost=", spearmanr.costs, " and quality=", spearmanr.quality)


    fig.tight_layout()
    fig.legend()
    if args.tikz:
        tikz_save(name+".tex")
    plt.show()
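# plot() above relies on several externals: adjust_text (adjustText package),
# tikz_save, an argparse namespace `args`, and a small GraphResults holder.
# A minimal stand-in for the holder (hypothetical) might be:
class GraphResults:
    """Named container for per-graph correlation results."""
    def __init__(self, name):
        self.name = name
        self.costs = None
        self.quality = None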
Example #7
def evaluate_all(self):
    all_true = []
    all_pred = []
    logging.info((self.name, self.fold))
    for key in self.keys():
        y_true, y_pred = self.ordered_scores(key)
        if self.debug:
            tau, p_value = kendalltau(y_true, y_pred)
            logging.info(KendallTauUser(user=key, tau=tau, p=p_value))
        all_true.extend(y_true)
        all_pred.extend(y_pred)
    tau, p_value = kendalltau(all_true, all_pred)
    stat = KendallTauFold(fold=self.fold, tau=tau, p=p_value)
    logging.info(stat)
    return [stat]
Example #8
def calc_kendall_tau(gam_unit, average=False):
    """
    Calculate the Kendall tau value for predicted values. Tau scales between
    -1 (perfect negative correlation) and 1 (perfect correlation).

    gam_unit : GamUnit
      has `actual` and `pred` attributes
    average : bool
      average across repeats before calculating tau
    """
    assert isinstance(average, bool)

    if not average:
        act_flat = gam_unit.actual.flatten()
    else:
        # scipy.stats.nanmean was removed in SciPy 1.0; np.nanmean is the
        # drop-in replacement
        act_flat = np.nanmean(gam_unit.actual, axis=1).flatten()
    nans = np.isnan(act_flat)
    act_flat = act_flat[~nans]

    tau = np.zeros((gam_unit.pred.shape[0])) + np.nan
    P = np.zeros_like(tau) + np.nan
    for i, pred in enumerate(gam_unit.pred):
        if not average:
            pred_flat = pred.flatten()[~nans]
        else:
            pred_flat = np.nanmean(pred, axis=1).flatten()
        tau[i], P[i] = stats.kendalltau(act_flat, pred_flat)
    return tau, P
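# Hypothetical smoke test: GamUnit is assumed to expose `actual` with shape
# (trials, repeats) and `pred` with shape (n_models, trials, repeats); numpy
# and scipy.stats are assumed available as `np` / `stats`, as in the function.
import numpy as np
from types import SimpleNamespace

rng = np.random.default_rng(0)
actual = rng.normal(size=(20, 5))
pred = actual[None] + rng.normal(scale=0.1, size=(3, 20, 5))
tau, P = calc_kendall_tau(SimpleNamespace(actual=actual, pred=pred))
print(tau, P)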
def how_far_intime(paths, moment_of_infection, mode='abs'):
    res = []
    out_moment_of_infection = {}
    for p in paths:
        for step in p:
            if step[1] in out_moment_of_infection:
                out_moment_of_infection[step[1]] = min(out_moment_of_infection[step[1]], step[0])
            else:
                out_moment_of_infection[step[1]] = step[0]

            if step[2] in out_moment_of_infection:
                out_moment_of_infection[step[2]] = min(out_moment_of_infection[step[2]], step[0])
            else:
                out_moment_of_infection[step[2]] = step[0]
    sorted_out = [i[0] for i in sorted(out_moment_of_infection.items(), key=operator.itemgetter(1)) if i[0] in moment_of_infection]
    sorted_gt = [i[0] for i in sorted(moment_of_infection.items(), key=operator.itemgetter(1)) if i[0] in out_moment_of_infection]

    for k, v in out_moment_of_infection.items():  # iteritems() is Python 2 only
        if k in moment_of_infection:
            if v > moment_of_infection[k]:
                t = (v - moment_of_infection[k]).total_seconds()
            else:
                t = -(moment_of_infection[k] - v).total_seconds()
                if mode == 'abs':
                    t = np.abs(t)
            res.append(t)
    #return res, stats.kendalltau(sorted_gt, sorted_out), stats.pearsonr(sorted_gt, sorted_out)
    try:
        tau = stats.kendalltau(sorted_gt, sorted_out)
    except Exception:  # a bare except would also swallow KeyboardInterrupt
        # note: this fallback is a plain float, while the normal path returns
        # a (tau, p-value) result
        tau = 0.0
    return res, tau
def kendall_f_three():
    result = pd.DataFrame(columns=['first', 'second', 'third', 'y_col', 'kendalltau', 'abs_kendalltau'])
    pos = 0
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    for y_col in np.arange(0, 23):
        max_val = 0
        max_x = max_x_2 = max_x_3 = max_y = None
        for x_col in np.arange(23, 23 + 48):
            for x_col_2 in np.arange(23, 23 + 48):
                for x_col_3 in np.arange(23, 23 + 48):
                    # keep only strictly ordered triples (also rules out duplicates)
                    if not (x_col < x_col_2 < x_col_3):
                        continue

                    x = df.iloc[:, x_col]
                    x2 = df.iloc[:, x_col_2]
                    x3 = df.iloc[:, x_col_3]
                    y = df.iloc[:, y_col]
                    # mirrors the two-variable F-measure form; note this is not
                    # the harmonic mean of three values, which would be
                    # 3 / (1/x + 1/x2 + 1/x3)
                    f = 3 * (x * x2 * x3) / (x + x2 + x3)
                    val = kendalltau(f, y)[0]
                    result.loc[pos, 'first'], result.loc[pos, 'second'], result.loc[pos, 'third'] = \
                        df.columns[x_col], df.columns[x_col_2], df.columns[x_col_3]
                    result.loc[pos, 'y_col'], result.loc[pos, 'kendalltau'], result.loc[pos, 'abs_kendalltau'] = \
                        df.columns[y_col], val, abs(val)

                    if val > max_val:
                        max_val = val
                        max_x, max_x_2, max_x_3, max_y = df.columns[x_col], df.columns[x_col_2], df.columns[x_col_3], df.columns[y_col]
                    print(df.columns[x_col], df.columns[x_col_2], df.columns[x_col_3], " | ", df.columns[y_col], " : ", val)
                    pos = pos + 1
        print('max')
        print(max_x, max_x_2, max_x_3, " | ", max_y, " : ", max_val)
    print('ran for {0} minutes'.format((time.perf_counter() - start) / 60.0))
    result.to_excel(os.path.join(result_dir, 'kendal_f_measure_three.xlsx'), index=False)
def f_measure_3():
    result = pd.DataFrame(columns=['first', 'second', 'third', 'y_col', 'kendalltau'])
    pos = 0
    for y_col in [1, 2, 3, 4]:
        max_val = 0
        max_x = max_x_2 = max_x_3 = max_y = None
        for x_col in [5, 6, 7, 8, 9, 10, 11, 12]:
            for x_col_2 in [6, 7, 8, 9, 10, 11, 12]:
                for x_col_3 in [7, 8, 9, 10, 11, 12]:
                    # keep only strictly ordered triples (also rules out duplicates)
                    if not (x_col < x_col_2 < x_col_3):
                        continue

                    x = df.iloc[:, x_col]
                    x2 = df.iloc[:, x_col_2]
                    x3 = df.iloc[:, x_col_3]
                    y = df.iloc[:, y_col]
                    f = 3 * (x * x2 * x3) / (x + x2 + x3)  # same caveat as in kendall_f_three
                    val = kendalltau(f, y)[0]
                    result.loc[pos, 'first'], result.loc[pos, 'second'], result.loc[pos, 'third'] = \
                        df.columns[x_col], df.columns[x_col_2], df.columns[x_col_3]
                    result.loc[pos, 'y_col'], result.loc[pos, 'kendalltau'] = df.columns[y_col], val

                    if val > max_val:
                        max_val = val
                        max_x, max_x_2, max_x_3, max_y = df.columns[x_col], df.columns[x_col_2], df.columns[x_col_3], df.columns[y_col]
#                    print(df.columns[x_col], df.columns[x_col_2], df.columns[x_col_3], " | ", df.columns[y_col], " : ", val)
                    pos = pos + 1
        print('max')
        print(max_x, max_x_2, max_x_3, " | ", max_y, " : ", max_val)
    result.to_csv(os.path.join(result_dir, 'kendal_f_measure_3.csv'), encoding='utf-8', index=False)
def get_tau(ccode1, ccode2):
    '''
    Closure to find K-Tau between two actors.
    '''
    x = np.array(adj_mat[name_to_pos[ccode1]])[0]
    y = np.array(adj_mat[name_to_pos[ccode2]])[0]
    return kendalltau(x, y)[0]
def calc_tau_and_chi(sorted_data, exp_key):
    """
    Consumes a dictionary of results that is output from sort_data
    and calculates Kendall's tau and the chi-square statistic for each
    key, with the specified key providing the expected values.
    """
    import numpy as np  # scipy.array was removed from SciPy; use numpy instead

    chitau_dict = {}
    bins = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    t_exp_array = np.array(sorted_data[exp_key])
    # np.histogram also counts values that fall exactly on a bin edge, which
    # the original open-interval comparison silently dropped
    c_exp_array, _ = np.histogram(sorted_data[exp_key], bins=bins)
    for k in sorted_data.keys():
        k_dict = {}
        t_obs_array = np.array(sorted_data[k])
        c_obs_array, _ = np.histogram(sorted_data[k], bins=bins)
        chi_test = chisquare(c_obs_array, f_exp=c_exp_array)
        k_dict['tau'] = kendalltau(t_exp_array, t_obs_array)[0]
        k_dict['chi_sq_val'] = chi_test[0]
        k_dict['chi_sq_p'] = chi_test[1]
        chitau_dict[k] = k_dict
    return chitau_dict
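# Hypothetical usage: each key maps to equally long lists of values in [0, 1],
# with 'expected' playing the role of exp_key; chisquare and kendalltau are
# assumed imported from scipy.stats, as in the function above.
import numpy as np

rng = np.random.default_rng(1)
sorted_data = {'expected': list(rng.random(200)), 'model_a': list(rng.random(200))}
print(calc_tau_and_chi(sorted_data, 'expected'))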
Example #14
def _plot_data(figure, datalist, labels, ylabels, xlabels, ncols=3):
    if isinstance(ylabels,str):
        ylabels = [ylabels]*len(datalist)
        xlabels = [xlabels]*len(datalist)
        ilabels = False
    else:
        ilabels = True

    nrows = int(np.ceil(len(datalist)/float(ncols)))
    minv = np.floor(min([d.min() for d in datalist]))
    maxv = np.ceil(max([d.max() for d in datalist]))
    vrange = maxv - minv
    delta = np.ceil(vrange/8.0)
    ticks = np.arange(minv,maxv+1.0,delta)

    for i,(data,label,ylabel,xlabel) in enumerate(zip(datalist,labels,ylabels,xlabels),1):
        a = figure.add_subplot(nrows,ncols,i)
        a.plot(data[:,1],data[:,0],".",color=colors.color(i-1))
        tau,tpval = stats.kendalltau(data[:,1],data[:,0])
        r,rpval = stats.pearsonr(data[:,1],data[:,0])
        print "%s\t%.5f\t%.5f\t%.5f\t%5f"%(label,r,rpval,tau,tpval)
        a.plot([minv,maxv],[minv,maxv],"--k")
        a.set_xlim(minv,maxv)
        a.set_ylim(minv,maxv)
        a.set_xticks(ticks)
        a.set_yticks(ticks)
        # Axes.is_first_col/is_last_row were removed in matplotlib 3.6;
        # query the subplot spec instead
        if ilabels or a.get_subplotspec().is_first_col():
            a.set_ylabel(ylabel)
        if ilabels or a.get_subplotspec().is_last_row():
            a.set_xlabel(xlabel)
        a.text(0.05,0.92,label,transform=a.transAxes)
        a.set_aspect('equal')

    figure.tight_layout()
    print ""
Example #15
def test_kendalltau(self):
    for n in self.get_n():
        x, y, xm, ym = self.generate_xy_sample(n)
        r = stats.kendalltau(x, y)
        rm = stats.mstats.kendalltau(xm, ym)
        assert_almost_equal(r[0], rm[0], decimal=10)
        assert_almost_equal(r[1], rm[1], decimal=7)
Example #16
def calc_kendall_tau(gam_unit, average=False):
    '''
    Calculate the Kendall tau value for predicted values. Tau scales between
    -1 (perfect negative correlation) and 1 (perfect correlation).

    gam_unit : GamUnit
      has `actual` and `pred` attributes
    average : bool
      average across repeats before calculating tau
    '''
    assert isinstance(average, bool)

    if not average:
        act_flat = gam_unit.actual.flatten()
    else:
        # np.nanmean replaces the removed scipy.stats.nanmean
        act_flat = np.nanmean(gam_unit.actual, axis=1).flatten()
    nans = np.isnan(act_flat)
    act_flat = act_flat[~nans]

    tau = {}
    P = {}
    for k, v in gam_unit.fits.items():  # iteritems() is Python 2 only
        if not average:
            pred_flat = v.pred.flatten()[~nans]
        else:
            pred_flat = np.nanmean(v.pred, axis=1).flatten()
        tau[k], P[k] = stats.kendalltau(act_flat, pred_flat)
    return tau, P
Example #17
def rankOrder(self, other, rank="kendall"):
    # A bit of a hack at the moment: works for AFM and PFA and only looks at
    # KC parameters. Changed to work for eirt ONLY by switching from -1 to -2
    # in the offset below.
    if self.paranames[1] == "gamma":
        off = 1
    else:
        off = 2
    answerlist = np.zeros(len(self.parameters) - off)
    for i in range(len(answerlist)):
        pars1 = []
        pars2 = []
        skip1 = skip2 = 0
        for j in range(len(self.parameters[0]) + len(self.data.kcmis)):
            if j in self.data.kcmis:
                skip1 += 1
            if j in other.data.kcmis:
                skip2 += 1
            if not (j in self.data.kcmis or j in other.data.kcmis):
                pars1.append(self.parameters[i][j - skip1])
                pars2.append(other.parameters[i][j - skip2])
        if rank == "spearman":
            answerlist[i] = stat.spearmanr(pars1, pars2)[0]
        else:
            answerlist[i] = stat.kendalltau(pars1, pars2)[0]
    return answerlist
Example #18
def _fill_stats(ws, coloffset, nentries, calcol, expcol, data, tau=None):
    """
    Fill in statistical analysis: MAD, r2, tau, r, slope and intercept
    """
    # note: in openpyxl >= 2.6 cell.column is an int, so get_column_letter()
    # may be needed to build the range strings below
    diffcol = ws.cell(row=3, column=5 + coloffset).column
    ws.cell(row=3, column=6 + coloffset).value = "MAD"
    ws.cell(row=3, column=7 + coloffset).value = "=AVERAGE(%s3:%s%d)" % (diffcol, diffcol, nentries + 2)

    ws.cell(row=4, column=6 + coloffset).value = "r2"
    ws.cell(row=4, column=7 + coloffset).value = "=CORREL(%s3:%s%d,%s3:%s%d)^2" % \
        (calcol, calcol, nentries + 2, expcol, expcol, nentries + 2)

    if tau is None:
        tau = stats.kendalltau(data[:, 0], data[:, 1])
    ws.cell(row=5, column=6 + coloffset).value = "tau"
    ws.cell(row=5, column=7 + coloffset).value = tau[0]
    ws.cell(row=5, column=8 + coloffset).value = tau[1]

    ws.cell(row=6, column=6 + coloffset).value = "r"
    ws.cell(row=6, column=7 + coloffset).value = "=CORREL(%s3:%s%d,%s3:%s%d)" % \
        (calcol, calcol, nentries + 2, expcol, expcol, nentries + 2)
    ws.cell(row=6, column=8 + coloffset).value = stats.pearsonr(data[:, 0], data[:, 1])[1]

    # the labels say slope/intercept, so the stray ^2 copied from the r2
    # formula is dropped here
    ws.cell(row=7, column=6 + coloffset).value = "Slope"
    ws.cell(row=7, column=7 + coloffset).value = "=SLOPE(%s3:%s%d,%s3:%s%d)" % \
        (expcol, expcol, nentries + 2, calcol, calcol, nentries + 2)

    ws.cell(row=8, column=6 + coloffset).value = "Intercept"
    ws.cell(row=8, column=7 + coloffset).value = "=INTERCEPT(%s3:%s%d,%s3:%s%d)" % \
        (expcol, expcol, nentries + 2, calcol, calcol, nentries + 2)
Example #19
def runTest():
    sd = 110
    np.random.seed(sd)
    # Generate sample Theta's which tells the relative goodness of answers
    # Suppose there are 10 answers
    Theta = np.random.randn(10)

    # pDensity
    pDens = np.exp(Theta)/np.sum(np.exp(Theta))
    cDist = np.cumsum(pDens)
    aNum = np.ones(10)
    aNum[0] = 0
    aNum = np.cumsum(aNum)
    plt.bar(aNum, pDens)
    plt.plot(aNum, cDist)
    plt.title('PDensityF/CDistributionF')
    print "Close the figure to run simulation"
    plt.savefig('PDF-sd%d.png'%sd)
    plt.show()

    N = 200
    kTau = np.zeros(N)
    for i in range(N):
        ThetaEst = simulateClicks(i+1, cDist)
        kTau[i] = kendalltau(Theta, ThetaEst)[0]

    kTau2 = simulateClicks2(N, cDist, Theta)
    kTau3 = simClicksTime(N, cDist, Theta)
    plt.plot(range(1, N+1),kTau, 'b')
    plt.plot(range(1, N+1),kTau2,'k')
    plt.plot(range(1, N+1),kTau3,'r')
    plt.title('KendallTau')
    plt.xlabel('Number of clicks(N)')
    plt.savefig('KTau-sd%d.png'%sd)
    plt.show()
def calculate_jaccard_kendall(page1, page2, from_file=True):
    """ Calculate Jaccard and Kendall measures from two files.
    Return a tuple (jaccard, kendall) of floats.
    page1, page2: full path to two files
    """
    Measurements.is_from_file = from_file

    alph = Measurements.pages_to_alphabet([page1, page2])
    str1 = Measurements.page_to_string(page1, alph)
    str2 = Measurements.page_to_string(page2, alph)
    if isinstance(str1, bytes):  # Python 2 used unicode(); decode only if needed
        str1 = str1.decode('utf-8', errors='ignore')
    if isinstance(str2, bytes):
        str2 = str2.decode('utf-8', errors='ignore')
    l1 = list(str1)
    l2 = list(str2)
    while len(l1) < len(l2):
        l1.append('null')
    while len(l2) < len(l1):
        l2.append('null')
    #-----
    s1 = Measurements.link_to_set(page1)
    s2 = Measurements.link_to_set(page2)
    # note: this divides the intersection by the larger set (an overlap
    # coefficient variant) rather than by the union as in classical Jaccard
    if len(s1) > len(s2):
        j = float(len(s1.intersection(s2))) / len(s1)
    else:
        j = float(len(s2.intersection(s1))) / len(s2)
    #-----
    k = kendalltau(l1, l2)[0]
    return j, k
def _map(self, key, video_group):

    # Creates a data matrix with the number of views per video (rows) for
    # each referrer (columns).
    i = 0
    n_rows = len(video_group)
    n_cols = len(self.ref_group_to_int) + 1
    data = np.zeros(shape=(n_rows, n_cols))
    for video_data in video_group.values():
        referral_views = video_data[0]
        total_view = video_data[1]

        for ref_group, ref_views in referral_views:
            ref_group_id = self.ref_group_to_int[ref_group]
            data[i][ref_group_id] = ref_views

        # Last column has total views
        data[i][-1] = total_view
        i += 1

    # Generating correlations
    total_view_array = data[:, -1]
    return_val = {}
    for ref_group, ref_group_id in self.ref_group_to_int.items():  # Python 3
        ref_group_array = data[:, ref_group_id]
        k_tau = stats.kendalltau(total_view_array, ref_group_array)
        s_rho = stats.spearmanr(total_view_array, ref_group_array)
        return_val[ref_group] = (k_tau, s_rho)

    return return_val
def correlation():
    '''Pearson correlation, and two types of rank correlation (Spearman, Kendall)
    comparing age and %fat (measured by dual-photon absorptiometry) for 18 normal adults.
    '''
    
    # Get the data
    inFile = 'altman_11_1.txt'
    data = np.genfromtxt(inFile, delimiter=',')
    x = data[:,0]
    y = data[:,1]
    
    # --- >>> START stats <<< ---
    # Calculate correlations
    # Resulting correlation values are stored in a dictionary, so that it is
    # obvious which value belongs to which correlation coefficient.
    corr = {}
    corr['pearson'], _ = stats.pearsonr(x,y)
    corr['spearman'], _ = stats.spearmanr(x,y)
    corr['kendall'], _ = stats.kendalltau(x,y)
    # --- >>> STOP stats <<< ---
    
    print(corr)    
    
    # Assert that Spearman's rho is just the Pearson correlation of the rank-transformed data
    np.testing.assert_almost_equal(corr['spearman'], stats.pearsonr(stats.rankdata(x), stats.rankdata(y))[0])
    
    return corr['pearson']  # should be 0.79208623217849117
def compute_kendalltau_corellations(meth_rankings, printRes=True):
    """Compute Kendall tau correlations between different rankings.

    Parameters
    ----------
    meth_rankings: dictionary
        dictionary {'method_name': ranking}
    printRes: bool, optional
        print correlations

    Returns
    -------
    corrs: list of tuples
        [('method_name_1', 'method_name_2', correlation, p_value), ]
    """
    corrs = []
    for p in itertools.combinations(meth_rankings.items(), 2):
        z, pv = stats.kendalltau(p[0][1], p[1][1])
        corrs.append((p[0][0], p[1][0], z, pv))

    if printRes:
        show_corr_results(corrs)

    return corrs
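# Minimal usage sketch with made-up rankings (itertools and scipy.stats are
# assumed imported as in the function above; show_corr_results is skipped by
# passing printRes=False):
rankings = {
    'method_a': [1, 2, 3, 4, 5],
    'method_b': [2, 1, 3, 5, 4],
    'method_c': [5, 4, 3, 2, 1],
}
print(compute_kendalltau_corellations(rankings, printRes=False))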
def kendall_tau(position_scores, position_predictions, topN=TOP_N):
    """
    Each arg has form [(score, (name, team, id))].

    Extract IDs from each, find intersection, remap to unique IDs in [0,N), and
    use scipy.
    """

    def get_ids(score_list):
        return [id for score, (name, team, id) in score_list[:topN]]

    true_ids = get_ids(position_scores)
    pred_ids = get_ids(position_predictions)
    shared = set(true_ids) & set(pred_ids)
    frac_shared = float(len(shared)) / topN

    def get_scores(score_list):
        # Sort to ensure same order among lists
        idscore = sorted([(id, score) for score, (name, team, id)
                          in score_list if id in shared])
        return [score for id, score in idscore]
    true_scores = get_scores(position_scores)
    pred_scores = get_scores(position_predictions)

    if len(true_scores) < 2:
        return (0, 0), frac_shared

    return kendalltau(true_scores, pred_scores), frac_shared
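# Hypothetical usage: each list holds (score, (name, team, id)) tuples already
# sorted by score; the names and ids below are invented, and topN is passed
# explicitly so the TOP_N module global is not needed.
truth = [(30.0, ('A', 'X', 1)), (25.0, ('B', 'X', 2)), (20.0, ('C', 'Y', 3))]
preds = [(28.0, ('B', 'X', 2)), (27.0, ('A', 'X', 1)), (18.0, ('C', 'Y', 3))]
(tau, p), frac_shared = kendall_tau(truth, preds, topN=3)
print(tau, p, frac_shared)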
def ternary_metrics(polarities, lexicon, eval_words, tau_lexicon=None):
    if tau_lexicon is not None:
        kendall_words = list(set(eval_words).intersection(tau_lexicon))
    y_prob, y_true = [], []
    polarities = {word: polarities[word] for word in eval_words}
    for w in polarities:
        y_prob.append(polarities[w])
        y_true.append(lexicon[w])
    y_prob = np.array(y_prob)
    y_true = np.array(y_true)
    y_prob = 2 * (y_prob - np.min(y_prob)) / (np.max(y_prob) - np.min(y_prob)) - 1
    # list(...) is required in Python 3, where dict.values() is a view
    neg_prop = np.sum(np.array(list(lexicon.values())) == -1) / float(len(lexicon))
    pos_prop = np.sum(np.array(list(lexicon.values())) == 1) / float(len(lexicon))
    sorted_probs = sorted(y_prob)
    neg_thresh = sorted_probs[int(np.round(neg_prop * len(sorted_probs)))]
    pos_thresh = sorted_probs[-int(np.round(pos_prop * len(sorted_probs)))]
    cmn_labels = [1 if val >= pos_thresh else -1 if val <= neg_thresh else 0 for val in y_prob]
    if tau_lexicon is not None:
        tau = kendalltau(*zip(*[(polarities[word], tau_lexicon[word]) for word in kendall_words]))[0]
    else:
        tau = None
    maj_f1 = f1_score(y_true, np.repeat(sp.stats.mode(y_true)[0][0], len(y_true)), average="macro")
    cmn_f1 = f1_score(y_true, cmn_labels, average="macro")
    # note: strict inequalities here, unlike the >=/<= used for cmn_labels above
    label_func = lambda entry: 1 if entry > pos_thresh else -1 if entry < neg_thresh else 0
    conf_mat = confusion_matrix(y_true, [label_func(entry) for entry in y_prob])
    return tau, cmn_f1, maj_f1, conf_mat
Example #26
def evaluation_prediction():
    correlation_all_w = []
    correlation_all_tau = []
    correlation_all_rho = []
    len_ = 0
    for event_name in dict_name2:
        # pickled files are opened in binary mode
        f = open(root + 'baseline_all_0509/' + event_name + '/vgg_test_result_v2.cPickle', 'rb')
        # f = open(root + 'baseline_all_noblock/' + event_name + '/vgg_test_result_v2.cPickle', 'rb')
        ground_truth = cPickle.load(f)
        f.close()
        f = open(root + 'CNN_all_event_1009/features/' + event_name + '_test_combined_10_combine_dict.cPickle', 'rb')
        prediction = cPickle.load(f)
        f.close()
        correlation_rho = []
        correlation_tau = []
        correlation_w = []
        for event_id in ground_truth:
            g = [i[2] for i in ground_truth[event_id]]
            p = [i[2] for i in prediction[event_id]]
            temp_rho, temp1 = spearmanr(g, p)
            temp_w = kendall_w({1: g, 2: p})
            temp_tau, temp1 = kendalltau(g, p)
            correlation_rho.append(temp_rho)
            correlation_w.append(temp_w)
            correlation_tau.append(temp_tau)
        len_ += len(ground_truth)
        print(event_name, ', rho:', np.mean(correlation_rho), ", kendall's tau:", np.mean(correlation_tau), ", kendall's W:", np.mean(correlation_w))
        correlation_all_rho.append(np.mean(correlation_rho) * len(ground_truth))
        correlation_all_tau.append(np.mean(correlation_tau) * len(ground_truth))
        correlation_all_w.append(np.mean(correlation_w) * len(ground_truth))
    print('rho:', np.sum(correlation_all_rho) / len_, ", kendall's tau:", np.sum(correlation_all_tau) / len_, ", kendall's W:", np.sum(correlation_all_w) / len_)
Example #27
def calculate_richness_out_change(g, last_value=None):
    richness_scores = richclub.richness_scores(g, richness='out_strength')
    if last_value is not None:  # truthiness of a sequence/array is ambiguous here
        from scipy.stats import kendalltau
        return richness_scores, kendalltau(richness_scores, last_value)[0]
    else:
        return richness_scores
def get_kendall_tau(path1, path2):
    item_ranking = defaultdict(list)
    with open(path1, 'r') as f1:
        for line in f1:
            vid, tweetcount = line.rstrip().split()
            item_ranking[vid].append(int(tweetcount))

    with open(path2, 'r') as f2:
        for line in f2:
            vid, tweetcount = line.rstrip().split()
            if vid not in item_ranking:
                item_ranking[vid].append(0)
            item_ranking[vid].append(int(tweetcount))

    # fill zero for vids that do not appear in file2
    for tweetcounts in item_ranking.values():
        if len(tweetcounts) == 1:
            tweetcounts.append(0)

    # sort by value of file1
    sorted_item_ranking = sorted(item_ranking.items(), key=operator.itemgetter(1), reverse=True)
    file1_list = []
    file2_list = []
    for item in sorted_item_ranking:
        file1_list.append(item[1][0])
        file2_list.append(item[1][1])

    taus = []
    # start, end and jump are assumed to be module-level globals
    for i in range(start, end + 1, jump):  # xrange is Python 2 only
        tau, p_value = stats.kendalltau(file1_list[:i], file2_list[:i])
        taus.append(tau)

    return taus
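# Hypothetical smoke test: two whitespace-separated "vid tweetcount" files,
# with the assumed start/end/jump globals defined first.
start, end, jump = 2, 4, 1
with open('rank1.txt', 'w') as f:
    f.write('v1 50\nv2 40\nv3 30\nv4 20\n')
with open('rank2.txt', 'w') as f:
    f.write('v1 45\nv3 35\nv4 25\n')
print(get_kendall_tau('rank1.txt', 'rank2.txt'))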
Example #29
def generate_mod_series(reference,series,RealKen):
    """
    Takes the series from generate_base_null, takes the list from data, and makes a null
    for each gene in data or uses the one previously calculated.
    Then it runs Kendall's Tau on the exp. series against the null
    """
    geneID = series[0]
    values = series[1:]
    binary = [1 if value!="NA" else np.nan for value in values]
    temp = reference*binary
    mod_reference = [value for value in temp if not np.isnan(value)]
    mod_values = [value for value in values if value!='NA']
#    print reference
#    print temp
#    print mod_reference
#    print mod_values

    if len(mod_values) < 3:
        tau,p = np.nan,np.nan
    elif mod_values.count(np.nan) == len(mod_values):
        tau,p = np.nan,np.nan
    elif mod_values.count(0) == len(mod_values):
        tau,p = np.nan,np.nan
    elif sum(mod_values)<0.00001:
        tau,p = np.nan,np.nan        
    else:
        tau, p = kendalltau(mod_values, mod_reference)
        if not np.isnan(tau):
            pk = RealKen.pval(tau, len(mod_values))
            if pk is not None:
                p = pk
    
    #print tau,p
    return geneID,tau,p
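# Hypothetical driver for generate_mod_series: `reference` is a numeric numpy
# array and RealKen is assumed to expose pval(tau, n); a trivial stub is
# enough for a smoke test (returning None falls back to the scipy p-value).
import numpy as np

class _FakeKen:
    def pval(self, tau, n):
        return None

reference = np.array([0.0, 0.5, 1.0, 0.5, 0.0, -0.5])
series = ['gene1', 0.1, 0.4, 0.9, 0.5, 'NA', -0.2]
print(generate_mod_series(reference, series, _FakeKen()))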
Example #30
def calculate_betweeness_change_kendall(g, last_value=None):
    betweeness_sequence = g.edge_betweenness(weights='weight')
    if last_value is not None:  # avoid ambiguous truthiness of a sequence
        from scipy.stats import kendalltau
        return betweeness_sequence, kendalltau(betweeness_sequence, last_value)[0]
    else:
        return betweeness_sequence
Example #31
class JointPlot(FeatureVisualizer):
    """
    Joint plots are useful for machine learning on multi-dimensional data, allowing for
    the visualization of complex interactions between different data dimensions, their
    varying distributions, and even their relationships to the target variable for
    prediction.

    The Yellowbrick ``JointPlot`` can be used both for pairwise feature analysis and
    feature-to-target plots. For pairwise feature analysis, the ``columns`` argument can
    be used to specify the index of the two desired columns in ``X``. If ``y`` is also
    specified, the plot can be colored with a heatmap or by class. For feature-to-target
    plots, the user can provide either ``X`` and ``y`` as 1D vectors, or a ``columns``
    argument with an index to a single feature in ``X`` to be plotted against ``y``.

    Histograms can be included by setting the ``hist`` argument to ``True`` for a
    frequency distribution, or to ``"density"`` for a probability density function. Note
    that histograms require matplotlib 2.0.2 or greater.

    Parameters
    ----------
    ax : matplotlib Axes, default: None
        The axes to plot the figure on. If None is passed in the current axes will be
        used (or generated if required). This is considered the base axes where the
        the primary joint plot is drawn. It will be shifted and two additional axes
        added above (xhax) and to the right (yhax) if hist=True.

    columns : int, str, [int, int], [str, str], default: None
        Determines what data is plotted in the joint plot and acts as a selection index
        into the data passed to ``fit(X, y)``. This data therefore must be indexable by
        the column type (e.g. an int for a numpy array or a string for a DataFrame).

        If None is specified then either both X and y must be 1D vectors and they will
        be plotted against each other or X must be a 2D array with only 2 columns. If a
        single index is specified then the data is indexed as ``X[columns]`` and plotted
        jointly with the target variable, y. If two indices are specified then they are
        both selected from X, additionally in this case, if y is specified, then it is
        used to plot the color of points.

        Note that these names are also used as the x and y axes labels if they aren't
        specified in the joint_kws argument.

    correlation : str, default: 'pearson'
        The algorithm used to compute the relationship between the variables in the
        joint plot, one of: 'pearson', 'covariance', 'spearman', 'kendalltau'.

    kind : str in {'scatter', 'hex'}, default: 'scatter'
        The type of plot to render in the joint axes. Note that when kind='hex' the
        target cannot be plotted by color.

    hist : {True, False, None, 'density', 'frequency'}, default: True
        Draw histograms showing the distribution of the variables plotted jointly.
        If set to 'density', the probability density function will be plotted.
        If set to True or 'frequency' then the frequency will be plotted.
        Requires Matplotlib >= 2.0.2.

    alpha : float, default: 0.65
        Specify a transparency where 1 is completely opaque and 0 is completely
        transparent. This property makes densely clustered points more visible.

    {joint, hist}_kws : dict, default: None
        Additional keyword arguments for the plot components.

    kwargs : dict
        Keyword arguments that are passed to the base class and may influence
        the visualization as defined in other Visualizers.

    Attributes
    ----------
    corr_ : float
        The correlation or relationship of the data in the joint plot, specified by the
        correlation algorithm.

    Examples
    --------

    >>> viz = JointPlot(columns=["temp", "humidity"])
    >>> viz.fit(X, y)
    >>> viz.show()
    """

    # TODO: should we couple more closely with Rank2D?
    correlation_methods = {
        "pearson": lambda x, y: pearsonr(x, y)[0],
        "spearman": lambda x, y: spearmanr(x, y)[0],
        "covariance": lambda x, y: np.cov(x, y)[0, 1],
        "kendalltau": lambda x, y: kendalltau(x, y)[0],
    }

    def __init__(self,
                 ax=None,
                 columns=None,
                 correlation="pearson",
                 kind="scatter",
                 hist=True,
                 alpha=0.65,
                 joint_kws=None,
                 hist_kws=None,
                 **kwargs):
        # Initialize the visualizer
        super(JointPlot, self).__init__(ax=ax, **kwargs)
        self._xhax, self._yhax = None, None

        # Set and validate the columns
        self.columns = columns
        if self.columns is not None and not isinstance(self.columns,
                                                       (int, str)):
            self.columns = tuple(self.columns)
            if len(self.columns) > 2:
                raise YellowbrickValueError((
                    "'{}' contains too many indices or is invalid for joint plot - "
                    "specify either a single int or str index or two columns as a list"
                ).format(columns))

        # Set and validate the correlation
        self.correlation = correlation
        if self.correlation not in self.correlation_methods:
            raise YellowbrickValueError(
                "'{}' is an invalid correlation method, use one of {}".format(
                    self.correlation,
                    ", ".join(self.correlation_methods.keys())))

        # Set and validate the kind of plot
        self.kind = kind
        if self.kind not in {"scatter", "hex", "hexbin"}:
            raise YellowbrickValueError(
                ("'{}' is invalid joint plot kind, use 'scatter' or 'hex'"
                 ).format(self.kind))

        # Set and validate the histogram if specified
        self.hist = hist
        if self.hist not in {True, "density", "frequency", None, False}:
            raise YellowbrickValueError(
                ("'{}' is an invalid argument for hist, use None, True, "
                 "False, 'density', or 'frequency'").format(hist))

        # If hist is True, test the version availability
        if self.hist in {True, "density", "frequency"}:
            self._layout()

        # Set the additional visual parameters
        self.alpha = alpha
        self.joint_kws = joint_kws
        self.hist_kws = hist_kws

    @property
    def xhax(self):
        """
        The axes of the histogram for the top of the JointPlot (X-axis)
        """
        if self._xhax is None:
            raise AttributeError(
                "this visualizer does not have a histogram for the X axis")
        return self._xhax

    @property
    def yhax(self):
        """
        The axes of the histogram for the right of the JointPlot (Y-axis)
        """
        if self._yhax is None:
            raise AttributeError(
                "this visualizer does not have a histogram for the Y axis")
        return self._yhax

    def _layout(self):
        """
        Creates the grid layout for the joint plot, adding new axes for the histograms
        if necessary and modifying the aspect ratio. Does not modify the axes or the
        layout if self.hist is False or None.
        """
        # Ensure the axes are created if not hist, then return.
        if not self.hist:
            self.ax
            return

        # Ensure matplotlib version compatibility
        if make_axes_locatable is None:
            raise YellowbrickValueError((
                "joint plot histograms requires matplotlib 2.0.2 or greater "
                "please upgrade matplotlib or set hist=False on the visualizer"
            ))

        # Create the new axes for the histograms
        divider = make_axes_locatable(self.ax)
        self._xhax = divider.append_axes("top",
                                         size=1,
                                         pad=0.1,
                                         sharex=self.ax)
        self._yhax = divider.append_axes("right",
                                         size=1,
                                         pad=0.1,
                                         sharey=self.ax)

        # Modify the display of the axes
        self._xhax.xaxis.tick_top()
        self._yhax.yaxis.tick_right()
        self._xhax.grid(False, axis="y")
        self._yhax.grid(False, axis="x")

    def fit(self, X, y=None):
        """
        Fits the JointPlot, creating a correlative visualization between the columns
        specified during initialization and the data and target passed into fit:

            - If self.columns is None then X and y must both be specified as 1D arrays
              or X must be a 2D array with only 2 columns.
            - If self.columns is a single int or str, that column is selected to be
              visualized against the target y.
            - If self.columns is two ints or strs, those columns are visualized against
              each other. If y is specified then it is used to color the points.

        This is the main entry point into the joint plot visualization.

        Parameters
        ----------
        X : array-like
            An array-like object of either 1 or 2 dimensions depending on self.columns.
            Usually this is a 2D table with shape (n, m)

        y : array-like, default: None
            A vector or 1D array that has the same length as X. May be used to either
            directly plot data or to color data points.
        """
        # Convert python objects to numpy arrays
        if isinstance(X, (list, tuple)):
            X = np.array(X)

        if y is not None and isinstance(y, (list, tuple)):
            y = np.array(y)

        # Case where no columns are specified
        if self.columns is None:
            if (y is None and (X.ndim != 2 or X.shape[1] != 2)) or (
                    y is not None and (X.ndim != 1 or y.ndim != 1)):
                raise YellowbrickValueError((
                    "when self.columns is None specify either X and y as 1D arrays "
                    "or X as a matrix with 2 columns"))

            if y is None:
                # Draw the first column as x and the second column as y
                self.draw(X[:, 0], X[:, 1], xlabel="0", ylabel="1")
                return self

            # Draw x against y
            self.draw(X, y, xlabel="x", ylabel="y")
            return self

        # Case where a single string or int index is specified
        if isinstance(self.columns, (int, str)):
            if y is None:
                raise YellowbrickValueError(
                    "when self.columns is a single index, y must be specified")

            # fetch the index from X -- raising index error if not possible
            x = self._index_into(self.columns, X)
            self.draw(x, y, xlabel=str(self.columns), ylabel="target")
            return self

        # Case where there is a double index for both columns
        columns = tuple(self.columns)
        if len(columns) != 2:
            raise YellowbrickValueError(
                ("'{}' contains too many indices or is invalid for joint plot"
                 ).format(columns))

        # TODO: color the points based on the target if it is given
        x = self._index_into(columns[0], X)
        y = self._index_into(columns[1], X)
        self.draw(x, y, xlabel=str(columns[0]), ylabel=str(columns[1]))
        return self

    def draw(self, x, y, xlabel=None, ylabel=None):
        """
        Draw the joint plot for the data in x and y.

        Parameters
        ----------
        x, y : 1D array-like
            The data to plot for the x axis and the y axis

        xlabel, ylabel : str
            The labels for the x and y axes.
        """
        # This is a little weird to be here, but it is the best place to perform
        # this computation given how fit calls draw and returns.
        self.corr_ = self.correlation_methods[self.correlation](x, y)

        # First draw the joint plot
        joint_kws = self.joint_kws or {}
        joint_kws.setdefault("alpha", self.alpha)
        joint_kws.setdefault("label",
                             "{}={:0.3f}".format(self.correlation, self.corr_))

        # Draw scatter joint plot
        if self.kind == "scatter":
            self.ax.scatter(x, y, **joint_kws)

            # TODO: Draw best fit line (or should this be kind='reg'?)

        # Draw hexbin joint plot
        elif self.kind in ("hex", "hexbin"):
            joint_kws.setdefault("mincnt", 1)
            joint_kws.setdefault("gridsize", 50)
            joint_kws.setdefault("cmap", "Blues")
            self.ax.hexbin(x, y, **joint_kws)

        # Something bad happened
        else:
            raise ValueError("unknown joint plot kind '{}'".format(self.kind))

        # Set the X and Y axis labels on the plot
        self.ax.set_xlabel(xlabel)
        self.ax.set_ylabel(ylabel)

        # If we're not going to draw histograms, stop here
        if not self.hist:
            # Ensure the current axes is always the main joint plot axes
            plt.sca(self.ax)
            return self.ax

        # Draw the histograms
        hist_kws = self.hist_kws or {}
        hist_kws.setdefault("bins", 50)
        if self.hist == "density":
            hist_kws.setdefault("density", True)

        self.xhax.hist(x, **hist_kws)
        self.yhax.hist(y, orientation="horizontal", **hist_kws)

        # Ensure the current axes is always the main joint plot axes
        plt.sca(self.ax)
        return self.ax

    def finalize(self, **kwargs):
        """
        Finalize executes any remaining image modifications making it ready to show.
        """
        # Set the aspect ratio to make the visualization square
        # TODO: still unable to make plot square using make_axes_locatable
        # x0,x1 = self.ax.get_xlim()
        # y0,y1 = self.ax.get_ylim()
        # self.ax.set_aspect(abs(x1-x0)/abs(y1-y0))

        # Add the title to the plot if the user has set one.
        self.set_title("")

        # TODO: use manual legend so legend works with both scatter and hexbin
        # Set the legend with full opacity patches using manual legend.
        # Or Add the colorbar if this is a continuous plot.
        if self.kind == "scatter":
            self.ax.legend(loc="best", frameon=True)

        # Finalize the histograms
        if self.hist:
            plt.setp(self.xhax.get_xticklabels(), visible=False)
            plt.setp(self.yhax.get_yticklabels(), visible=False)
            plt.sca(self.ax)

        # Call tight layout to maximize readability
        plt.tight_layout()

    def _index_into(self, idx, data):
        """
        Attempts to get the column from the data using the specified index, raises an
        exception if this is not possible from this point in the stack.
        """
        try:
            if is_dataframe(data):
                # Assume column indexing
                return data[idx]
            # Otherwise assume numpy array-like indexing
            return data[:, idx]
        except Exception as e:
            raise IndexError(
                "could not index column '{}' into type {}: {}".format(
                    self.columns, data.__class__.__name__, e))
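# A quick sketch (synthetic data) pairing two feature columns with Kendall's
# tau, following the pattern shown in the class docstring:
import numpy as np

X = np.random.rand(100, 2)
viz = JointPlot(columns=[0, 1], correlation="kendalltau")
viz.fit(X)
viz.show()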
Example #32
# Fragment: x, y, k_fold, rand_for, the running totals kappa_avg / tau_avg /
# acc_avg, and paras / dataframe are assumed to be defined earlier.
#extras_x = extras_x.fillna(0)
y = y.fillna(0)
#extras_y = extras_y.fillna(0)
for train, test in k_fold.split(x):
    train_x, test_x = x.iloc[train], x.iloc[test]
    train_y, test_y = y.iloc[train], y.iloc[test]
    #rand_for.fit(pd.concat([train_x, extras_x]), pd.concat([train_y, extras_y]))
    rand_for.fit(train_x, train_y)
    #svr.fit(train_x, train_y)
    predictions = rand_for.predict(test_x)
    # i=0
    # for row in test:
    #     dataframe["predicted_label"].iat[row] = predictions[i]
    #     i += 1
    kappa_avg += cohen_kappa_score(test_y, predictions, weights="linear")
    tau_avg += stats.kendalltau(test_y, predictions)[0]
    acc_avg += accuracy_score(test_y, predictions)
    #score += rand_for.score(test_x, test_y)
    # cm = confusion_matrix(test_y, predictions)
    # for i in range(5):
    #   for j in range(5):
    #         try:
    #             confusion[i][j] += cm[i][j]
    #         except:
    #             pass
    for item in range(len(paras)):
        i = paras[item]
        d_x = test_x[dataframe["marked_par"] == i]
        if len(d_x) > 0:
            d_y = test_y[d_x.index.values]
            p = rand_for.predict(d_x)
Example #33
def compute_correlations(test_path, predictions, human_metric, mode):
    """
    Computes the correlations between BERT output and the other human metrics.
    :param test_path: Path to the test data.
    :param predictions: The predictions of the model.
    :param human_metric: The metric for which the model is trained. It is needed only on 'Single Task' mode.
    :param mode: Depending on your choice : ['Single Task', 'Multi Task-1', 'Multi Task-5'].
    """

    test_data = dict(np.load(test_path, allow_pickle=True).item())

    ordered_ids = test_data['peer_ids']
    system_ids = set(ordered_ids)
    empty_ids = test_data['empty_ids']

    correlations = {}  # the correlations will be stored here

    test_human_metrics = {
        'Q1': test_data['test_Q1'],
        'Q2': test_data['test_Q2'],
        'Q3': test_data['test_Q3'],
        'Q4': test_data['test_Q4'],
        'Q5': test_data['test_Q5']
    }

    for k in range(predictions.shape[1]):
        output_aggregation_table = np.zeros([len(system_ids)])
        human_aggregation_table = np.zeros([len(system_ids)])

        # Choose only Q_k to compute the correlation.
        # At single task, we have only one dimension on predictions
        if mode == 'Multi Task-1' or mode == 'Multi Task-5':
            predictions_of_metric = predictions[:, k]
            metric_real = test_human_metrics['Q' + str(k + 1)]
        else:
            predictions_of_metric = predictions
            metric_real = test_human_metrics[human_metric]

        for i, s_id in enumerate(system_ids):
            id_predictions = []
            id_human_scores = []

            for j, o_id in enumerate(ordered_ids):
                if s_id == o_id:

                    id_predictions.append(predictions_of_metric[j])
                    id_human_scores.append(metric_real[j])

            # empty_ids lists the peer_ids whose submitted summary was empty.
            # Each position corresponds to a doc_id-peer_id pair; if a system sent
            # more than one empty summary it appears in the list multiple times, so
            # for each s_id we append 0 once per empty summary it sent.
            for e_id in empty_ids:
                if e_id == s_id:
                    id_predictions.append(0)
                    id_human_scores.append(0)

            output_aggregation_table[i] = np.mean(np.array(id_predictions))
            human_aggregation_table[i] = np.mean(np.array(id_human_scores))

        if mode == 'Multi Task-1' or mode == 'Multi Task-5':
            correlations['Q{}'.format(k + 1)] = {
                'Spearman': spearmanr(human_aggregation_table, output_aggregation_table)[0],
                'Kendall': kendalltau(human_aggregation_table, output_aggregation_table)[0],
                'Pearson': pearsonr(human_aggregation_table, output_aggregation_table)[0]
            }

        else:
            correlations[human_metric] = {
                'Spearman': spearmanr(human_aggregation_table, output_aggregation_table)[0],
                'Kendall': kendalltau(human_aggregation_table, output_aggregation_table)[0],
                'Pearson': pearsonr(human_aggregation_table, output_aggregation_table)[0]
            }

    if mode == 'Multi Task-1' or mode == 'Multi Task-5':
        log_msg = ''.join(
            'Q{} -> {} \n'.format(
                k,
                ''.join('{}={:.3f}  '.format(metric, correlations['Q{}'.format(k)][metric])
                        for metric in ['Spearman', 'Kendall', 'Pearson']))
            for k in range(1, 6))
    else:
        log_msg = '{} -> {} \n'.format(human_metric, ''.join(
            '{}={:.3f}  '.format(metric, correlations[human_metric][metric])
            for metric in ['Spearman', 'Kendall', 'Pearson']))

    LOGGER.info(log_msg)
def score(self, X, y):
    yp = self.predict(X)
    return kendalltau(y, yp)[0]
Example #35
def kendalltau_eval(preds_targs, preds_col="preds", targs_col="targs"):
    preds_targs = preds_targs[[preds_col, targs_col]]  # use the parameters, not hard-coded names
    preds_targs = preds_targs[~preds_targs.isnull().any(axis=1)]
    # kendalltau is rank-based, so pre-ranking does not change the result;
    # it only makes the intent explicit
    preds_targs_rank = preds_targs.rank(method="average")
    return kendalltau(preds_targs_rank[preds_col].values,
                      preds_targs_rank[targs_col].values)[0]
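# Usage sketch with a small hypothetical predictions/targets frame; kendalltau
# is assumed imported from scipy.stats as in the function above.
import pandas as pd

preds_targs = pd.DataFrame({'preds': [0.1, 0.4, 0.35, 0.8],
                            'targs': [0, 1, 1, 2]})
print(kendalltau_eval(preds_targs))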
def clean_data_and_find_correlations(*,
                                     df,
                                     row_Filter=None,
                                     colName_main_variable,
                                     verbose=False):
    """
        =================   ===============================================================================
        Property            Description
        =================   ===============================================================================
        
        * Function          This function, takes df, with numeric data in each column, 
                            computes all column to one of them, called colName_main_variable
                            the data are formatted and cleaned in each pari of column individually, (by removing na)
        
        Parameters/Input              
        _________________   _______________________________________________________________________________ 
        
        . Input .
        * df                DataFrame, with unique column names
        * row_filter        list[bool], with len(row_filter )==df.shape[0]
        * verbose           bool, if True, shows info
        
        Returns             
        _________________   _______________________________________________________________________________
        
        * comparisons_dct   dictionary, where key=column name in df, 
                            inside each entry, there is another dict wiht condition names, raw and filtered data
                            and results of correlation made with 3 different methods (pearson, spearman and kendal) 

    """

    #### filter the data:
    if row_Filter is None:
        row_Filter = [True] * df.shape[0]
    # ........
    filtered_df_main_col = pd.Series(
        df.loc[row_Filter, colName_main_variable])  # pd.Series
    filtered_df = df.loc[row_Filter, :]  # pd Series or pd.DataFrame

    #### rename column with main group to avoid having duplicates,
    filtered_df.rename(
        columns={
            str(colName_main_variable):
            "".join([str(colName_main_variable), "_"])
        },
        inplace=True)  # to ensure that we have no columns with the same name

    #### Loop over each columns and compare it with main group,
    comparisons_dct = dict()
    # .......
    for i, colName in enumerate(list(filtered_df.columns)):

        if verbose:
            print(i, colName_main_variable, " - with - ", colName)

        # some columns had too many repeats or NaNs; they are removed later,
        # but the RuntimeWarnings were noisy, so they are suppressed here
        import warnings
        warnings.filterwarnings("ignore", category=RuntimeWarning)

        # --------------------------
        # Prepare dct for the data,
        # --------------------------

        # new dict,
        one_comparison_dct = dict()
        one_comparison_dct["X_main_group_sliced_with"] = row_Filter
        one_comparison_dct["X_main_group"] = colName_main_variable
        one_comparison_dct["Y_compared_with"] = colName

        # --------------------------
        # PREPARE THE DATA
        # --------------------------

        # ................... raw data, .........................

        # prepare the data,
        filtered_df_one_col = filtered_df.loc[:, colName]
        data_for_comparison_df = pd.concat(
            [filtered_df_main_col, filtered_df_one_col], axis=1)
        data_for_comparison_df_full = data_for_comparison_df.copy()  # for histograms

        # remove missing data,
        data_for_comparison_df = data_for_comparison_df.dropna(
            how="any", axis=0)  # to create X/Y_cleaned

        # names, and basic data to display,
        sample_number = int(data_for_comparison_df.shape[0])  # int
        attribute_names = list(data_for_comparison_df.columns)  # list

        # ................... control, ............................

        # check if you can continue,
        if sample_number <= 2:
            # add to dict,
            one_comparison_dct["X_total"] = [None]
            one_comparison_dct["X_cleaned"] = [None]
            one_comparison_dct["X_cleaned_log"] = [None]
            # .......
            one_comparison_dct["Y_total"] = [None]
            one_comparison_dct["Y_cleaned"] = [None]
            one_comparison_dct["Y_cleaned_log"] = [None]
            # .......
            one_comparison_dct["pearson_results"] = [None]
            one_comparison_dct["sperman_results"] = [None]
            one_comparison_dct["kendalltau_results"] = [None]
            # ..........
            one_comparison_dct["linregress_results"] = [None]
            one_comparison_dct["linregress_results_log"] = [None]

            if verbose == True:
                print(
                    f"Caution, (column combination nr {i}) - i.e. {attribute_names[0]} vs {attribute_names[1]} - has fewer than 3 items to compare!"
                )
            ############################################################################
            comparisons_dct[colName] = one_comparison_dct
            ############################################################################

        # else,
        if sample_number > 2:

            # ... X,Y data for plots and correlation, ................

            # all data, without removing NaN in each row, - for hist,
            X_total = data_for_comparison_df_full.iloc[:, 0]
            X_total = X_total.dropna(how="any").values.flatten()
            Y_total = data_for_comparison_df_full.iloc[:, 1]
            Y_total = Y_total.dropna(how="any").values.flatten()

            # data,
            X_cleaned = data_for_comparison_df.iloc[:, 0].values.flatten()
            Y_cleaned = data_for_comparison_df.iloc[:, 1].values.flatten()

            # transform values into log(x+16); the +16 offset avoids taking
            # the log of zero or of a negative number
            X_cleaned_log = np.log(X_cleaned + 16)
            Y_cleaned_log = np.log(Y_cleaned + 16)

            # .....

            # add to dict,
            one_comparison_dct["X_total"] = X_total
            one_comparison_dct["X_cleaned"] = X_cleaned
            one_comparison_dct["X_cleaned_log"] = X_cleaned_log
            # .......
            one_comparison_dct["Y_total"] = Y_total
            one_comparison_dct["Y_cleaned"] = Y_cleaned
            one_comparison_dct["Y_cleaned_log"] = Y_cleaned_log

            # --------------------------
            # FIND CORR.
            # --------------------------

            # ... Correlation, ....................................

            # correlations,
            pearson_results = stats.pearsonr(X_cleaned_log,
                                             Y_cleaned_log)  # linear
            spearman_results = stats.spearmanr(
                X_cleaned, Y_cleaned)  # rank, with rho value,
            kendalltau_results = stats.kendalltau(
                X_cleaned,
                Y_cleaned)  # rank,based on orientation of pairs of ranks
            # ............
            one_comparison_dct["pearson_results"] = pearson_results
            one_comparison_dct["sperman_results"] = sperman_results
            one_comparison_dct["kendalltau_results"] = kendalltau_results

            # Compute a least-squares regression for two sets of measurements.
            LR_slope, LR_intercept, LR_r_value, LR_p_value, LR_std_err = stats.linregress(
                X_cleaned, Y_cleaned)
            # ..........
            linregress_results = {
                "slope": LR_slope,
                "intercept": LR_intercept,
                "r_value": LR_r_value,
                "p_value": LR_p_value,
                "std_err": LR_std_err
            }
            # ..........
            one_comparison_dct["linregress_results"] = linregress_results

            # Compute a least-squares regression for two sets of measurements.
            LR_slope, LR_intercept, LR_r_value, LR_p_value, LR_std_err = stats.linregress(
                X_cleaned_log, Y_cleaned_log)
            # ..........
            linregress_results_log = {
                "slope": LR_slope,
                "intercept": LR_intercept,
                "r_value": LR_r_value,
                "p_value": LR_p_value,
                "std_err": LR_std_err
            }
            # ..........
            one_comparison_dct[
                "linregress_results_log"] = linregress_results_log

            ############################################################################
            comparisons_dct[colName] = one_comparison_dct
            ############################################################################

    return comparisons_dct
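# --- hedged usage sketch (synthetic data; the frame and column names are
# hypothetical, and the numpy/pandas/scipy.stats imports used inside the
# function are assumed from the snippet's elided header) ---
import numpy as np
import pandas as pd

demo_df = pd.DataFrame(np.random.rand(30, 3), columns=["main", "a", "b"])
out = clean_data_and_find_correlations(df=demo_df, colName_main_variable="main")
print(out["a"]["spearman_results"])    # (rho, p-value) of "main" vs "a"
print(out["a"]["kendalltau_results"])  # (tau, p-value) of the same pair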
Beispiel #37
0
clust = np.load("erp_cluster.npz")
dfs, data = load_data_clust_av(clust['times'], clust['spaces'])

all_data = np.concatenate(data)
df = pd.concat(dfs)
df['target'] = all_data
df.is_correct = df.is_correct.astype(bool)
df.target *= 1e12
df = df[df.confidence.notna()]
# df.confidence /= 100
# df.confidence -= df.confidence.mean()
# df.target = st.boxcox(df.target - df.target.min() * 1.01)[0]
# df.target=st.boxcox(df.target - df.target.min() * 1.01, lmbda=1)

st.spearmanr(df.confidence, df.target)
st.kendalltau(df.is_correct, df.target)
st.pearsonr(df.confidence, df.target)

md = smf.mixedlm(
    "target ~ is_correct*confidence",
    data=df,
    groups=df.subject,
    # re_formula="~confidence",
)
mdf = md.fit(method="powell")
mdf.summary()

df_sep = df.copy()
df_low = df_sep[df_sep.confidence < 40]
df_high = df_sep[df_sep.confidence > 50]
print("std", np.std(y1))
# # This is needed as per statsmodel documentation
# print('x before : ' , x)
x = sm.add_constant(x)
# print('x after  : ' , x)
# ##################################### regression
model = sm.OLS(y, x)
results = model.fit()

print("summary : ", results.summary())
# # print('Parameters: ', results.params)
print('results.params : ', results.params)
# print(x1, y)
pc = stats.pearsonr(x1, y)
print('pc : ', pc)
tau = stats.kendalltau(x1, y)
# print(tau)
rho = stats.spearmanr(x1, y)
# print(rho)

# # creating regression line
xx = x1
# print(type(results.params[1]), results.params[1])
print('x1 ', type(x1), ' results.params[1] ', type(results.params[1]))
# print('calculations : ', pc*np.std(y1)/np.std(x1))
yy = results.params[0] + x1 * results.params[1]
plt.scatter(x1,
            y,
            s=None,
            marker='o',
            color='g',
Beispiel #39
0
def ctsimilarities_cal(data1,
                       data2,
                       sub_opt=1,
                       chl_opt=1,
                       time_win=10,
                       time_step=5,
                       method='spearman'):
    """
    Calculate the Cross-Temporal Similarities between neural data under two conditions

    Parameters
    ----------
    data1 : array
        EEG/MEG data from a time-window under condition1.
        The shape of data should be [n_subs, n_channels, n_ts]. n_subs, n_channels and n_ts represent the
        number of subjects, the number of channels and the number of time-points, respectively.
    data2 : array
        EEG/MEG data from a time-window under condition2.
        The shape of data should be [n_subs, n_channels, n_ts]. n_subs, n_channels and n_ts represent the
        number of subjects, the number of channels and the number of time-points, respectively.
    sub_opt : int 0 or 1. Default is 1.
        Calculate the CTRDMs for each subject or not.
        If sub_opt=1, return the CTRDMs for each subject.
        If sub_opt=0, return the average CTRDMs across all subjects.
    chl_opt : int 0 or 1. Default is 1.
        Calculate the CTRDMs for each channel or not.
        If chl_opt=1, calculate the CTRDMs for each channel.
        If chl_opt=0, calculate the CTRDMs after averaging the channels.
    time_win : int. Default is 10.
        Set a time-window for calculating the CTRDM for different time-points.
        If time_win=10, that means each calculation process based on 10 time-points.
    time_step : int. Default is 5.
        The time step size for each time of calculating.
    method : string 'spearman' or 'pearson' or 'kendall' or 'similarity' or 'distance'. Default is 'spearman'.
        The method to calculate the similarities.
        If method='spearman', calculate the Spearman Correlations. If method='pearson', calculate the Pearson
        Correlations. If method='kendall', calculate the Kendall tau Correlations. If method='similarity', calculate the
        Cosine Similarities. If method='distance', calculate the Euclidean Distances.

    Returns
    -------
    CTSimilarities : array
        Cross-temporal similarities.
        If method='spearman' or 'pearson' or 'kendall':
            If sub_opt=1 and chl_opt=1, the shape of CTSimilarities will be [n_subs, n_channels,
            int((n_ts-time_win)/time_step)+1, int((n_ts-time_win)/time_step)+1, 2]
            If sub_opt=1 and chl_opt=0, the shape of CTSimilarities will be [n_subs, int((n_ts-time_win)/time_step)+1,
            int((n_ts-time_win)/time_step)+1, 2]
            If sub_opt=0 and chl_opt=1, the shape of CTSimilarities will be [n_channels, int((n_ts-time_win)/time_step)
            +1, int((n_ts-time_win)/time_step)+1, 2]
            If sub_opt=0 and chl_opt=0, the shape of CTSimilarities will be [int((n_ts-time_win)/time_step)+1,
            int((n_ts-time_win)/time_step)+1, 2]
        If method='similarity' or 'distance':
            If sub_opt=1 and chl_opt=1, the shape of CTSimilarities will be [n_subs, n_channels,
            int((n_ts-time_win)/time_step)+1, int((n_ts-time_win)/time_step)+1]
            If sub_opt=1 and chl_opt=0, the shape of CTSimilarities will be [n_subs, int((n_ts-time_win)/time_step)+1,
            int((n_ts-time_win)/time_step)+1]
            If sub_opt=0 and chl_opt=1, the shape of CTSimilarities will be [n_channels, int((n_ts-time_win)/time_step)
            +1, int((n_ts-time_win)/time_step)+1]
            If sub_opt=0 and chl_opt=0, the shape of CTSimilarities will be [int((n_ts-time_win)/time_step)+1,
            int((n_ts-time_win)/time_step)+1]
    """

    n_subs, n_chls, n_ts = np.shape(data1)

    nts = int((n_ts - time_win) / time_step) + 1

    # chl_opt=0
    if chl_opt == 0:

        newdata1 = np.zeros([n_subs, nts, n_chls, time_win], dtype=float)
        newdata2 = np.zeros([n_subs, nts, n_chls, time_win], dtype=float)

        for sub in range(n_subs):
            for t in range(nts):
                for chl in range(n_chls):
                    newdata1[sub, t,
                             chl] = data1[sub, chl, t *
                                          time_step:t * time_step + time_win]
                    newdata2[sub, t,
                             chl] = data2[sub, chl, t *
                                          time_step:t * time_step + time_win]

        newdata1 = np.reshape(newdata1, [n_subs, nts, n_chls * time_win])
        newdata2 = np.reshape(newdata2, [n_subs, nts, n_chls * time_win])

        CTSimilarities = np.zeros([n_subs, nts, nts, 2], dtype=float)

        for sub in range(n_subs):
            for t1 in range(nts):
                for t2 in range(nts):

                    if method == 'spearman':
                        CTSimilarities[sub, t1,
                                       t2] = spearmanr(newdata1[sub, t1],
                                                       newdata2[sub, t2])
                    if method == 'pearson':
                        CTSimilarities[sub, t1,
                                       t2] = pearsonr(newdata1[sub, t1],
                                                      newdata2[sub, t2])
                    if method == 'kendall':
                        CTSimilarities[sub, t1, t2] = kendalltau(
                            newdata1[sub, t1], newdata2[sub, t2])
                    if method == 'similarity':
                        V1 = np.mat(newdata1[sub, t1])
                        V2 = np.mat(newdata2[sub, t2])
                        num = float(V1 * V2.T)
                        denom = np.linalg.norm(V1) * np.linalg.norm(V2)
                        cos = num / denom
                        CTSimilarities[sub, t1, t2, 0] = 0.5 + 0.5 * cos
                    if method == 'distance':
                        CTSimilarities[sub, t1, t2,
                                       0] = np.linalg.norm(newdata1[sub, t1] -
                                                           newdata2[sub, t2])

        if sub_opt == 0:

            CTSimilarities = np.average(CTSimilarities, axis=0)

            if method == 'spearman' or method == 'pearson' or method == 'kendall':
                return CTSimilarities

            if method == 'similarity' or method == 'distance':
                return CTSimilarities[:, :, 0]

        if sub_opt == 1:

            if method == 'spearman' or method == 'pearson' or method == 'kendall':
                return CTSimilarities

            if method == 'similarity' or method == 'distance':
                return CTSimilarities[:, :, :, 0]

    if chl_opt == 1:

        newdata1 = np.zeros([n_subs, n_chls, nts, time_win], dtype=float)
        newdata2 = np.zeros([n_subs, n_chls, nts, time_win], dtype=float)

        for sub in range(n_subs):
            for chl in range(n_chls):
                for t in range(nts):
                    newdata1[sub, chl,
                             t] = data1[sub, chl,
                                        t * time_step:t * time_step + time_win]
                    newdata2[sub, chl,
                             t] = data2[sub, chl,
                                        t * time_step:t * time_step + time_win]

        CTSimilarities = np.zeros([n_subs, n_chls, nts, nts, 2], dtype=float)

        for sub in range(n_subs):
            for chl in range(n_chls):
                for t1 in range(nts):
                    for t2 in range(nts):

                        if method == 'spearman':
                            CTSimilarities[sub, chl, t1, t2] = spearmanr(
                                newdata1[sub, chl, t1], newdata2[sub, chl, t2])
                        if method == 'pearson':
                            CTSimilarities[sub, chl, t1, t2] = pearsonr(
                                newdata1[sub, chl, t1], newdata2[sub, chl, t2])
                        if method == 'kendall':
                            CTSimilarities[sub, chl, t1, t2] = kendalltau(
                                newdata1[sub, chl, t1], newdata2[sub, chl, t2])
                        if method == 'similarity':
                            V1 = np.mat(newdata1[sub, chl, t1])
                            V2 = np.mat(newdata2[sub, chl, t2])
                            num = float(V1 * V2.T)
                            denom = np.linalg.norm(V1) * np.linalg.norm(V2)
                            cos = num / denom
                            CTSimilarities[sub, chl, t1, t2, 0] = 0.5 + 0.5 * cos
                        if method == 'distance':
                            CTSimilarities[sub, chl, t1, t2, 0] = np.linalg.norm(
                                newdata1[sub, chl, t1] -
                                newdata2[sub, chl, t2])

        if sub_opt == 0:

            CTSimilarities = np.average(CTSimilarities, axis=0)

            if method == 'spearman' or method == 'pearson' or method == 'kendall':
                return CTSimilarities

            if method == 'similarity' or method == 'distance':
                return CTSimilarities[:, :, :, 0]

        if sub_opt == 1:

            if method == 'spearman' or method == 'pearson' or method == 'kendall':
                return CTSimilarities

            if method == 'similarity' or method == 'distance':
                return CTSimilarities[:, :, :, :, 0]
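# --- hedged usage sketch (random data; sizes are illustrative, and the
# scipy.stats imports used inside the function are assumed from the elided
# header) ---
import numpy as np

demo1 = np.random.rand(5, 4, 50)   # [n_subs, n_channels, n_ts]
demo2 = np.random.rand(5, 4, 50)
# with time_win=10 and time_step=5: int((50 - 10) / 5) + 1 = 9 windows
cts = ctsimilarities_cal(demo1, demo2, sub_opt=1, chl_opt=0, method='spearman')
print(cts.shape)  # expected: (5, 9, 9, 2) -> (corr, p) per window pair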
Beispiel #40
0
        # (v_degrees == k) is a per-node list: 1 if the node has degree k, 0 otherwise
        # essential_vertex is a per-node essentiality list: 1 if the node is essential, 0 otherwise
        # the product acts as a logical "and": 1 if the node is essential and has degree k
        essential_hist[i] = np.sum(  ( v_degrees == k )*essential_vertex )

    data = pd.DataFrame({'degrees':degrees,'nodes':hist,'essentials':essential_hist}) # histograms as a function of k


    percent = np.linspace(0,1,100)
    x,y = essential_fraction_array(data,percent,v_degrees)
    subplot.plot(x,y,'-',label=name)



    # Correlation measurement
    x,y ,kcut= essential_fraction(data,args.percent_cut,v_degrees,return_data_cut=True)

    tau, tp_value = kendalltau(x,y) 
    rho, rp_value = spearmanr(x,y)

    print("%25s: %.2f(%g)\t%.2f(%.2g)\t%3i"%(name,tau,tp_value,rho,rp_value,kcut))

subplot.set_xlabel('Fraction of hubs in the network',fontsize=20)
subplot.set_ylabel('Fraction of essential hubs',fontsize=20)
subplot.tick_params(labelsize=20)
subplot.legend(loc='best')
plt.savefig('ess_hub.pdf')
plt.show()


    def calculate_average_kendall_tau(self, rankings, values, weights, ranks):
        kendall = {i: {} for i in ["reg", "max", "mean"]}
        change_rate = {}
        rbo_min_models = {}
        for model in rankings:
            rankings_list_lm = rankings[model]
            last_list_index_lm = {}

            epochs = sorted(list(rankings_list_lm.keys()))
            for epoch in epochs:
                for query in rankings_list_lm[epoch]:
                    if not kendall["reg"].get(query, False):
                        kendall["reg"][query] = {}
                        kendall["max"][query] = {}
                        kendall["mean"][query] = {}
                        change_rate[query] = {}
                        rbo_min_models[query] = {}
                    if not kendall["reg"][query].get(model, False):
                        kendall["reg"][query][model] = []
                        kendall["mean"][query][model] = []
                        kendall["max"][query][model] = []
                        change_rate[query][model] = {"reg": [], "winner": []}
                        rbo_min_models[query][model] = []
                    current_list_svm = rankings_list_lm[epoch][query]
                    if not last_list_index_lm.get(query, False):
                        last_list_index_lm[query] = current_list_svm
                        continue
                    if current_list_svm.index(
                            5) != last_list_index_lm[query].index(5):
                        change_rate[query][model]["reg"].append(1)
                        change_rate[query][model]["winner"].append(
                            float(1) / (weights[epoch][query][
                                ranks[model][epoch][query][0]] + 1))
                    else:
                        change_rate[query][model]["reg"].append(0)
                        change_rate[query][model]["winner"].append(0)
                    kt = kendalltau(current_list_svm,
                                    last_list_index_lm[query])[0]
                    kt_max = weighted_kendall_tau(
                        ranks[model][epoch][query],
                        ranks[model][epoch - 1][query], weights[epoch][query],
                        "max")
                    kt_mean = weighted_kendall_tau(
                        ranks[model][epoch][query],
                        ranks[model][epoch - 1][query], weights[epoch][query],
                        "mean")
                    if not np.isnan(kt):
                        kendall["reg"][query][model].append(kt)
                    kendall["max"][query][model].append(kt_max)
                    kendall["mean"][query][model].append(kt_mean)
                    rbo = r.rbo_dict(
                        {
                            x: j
                            for x, j in enumerate(last_list_index_lm[query])
                        }, {x: j
                            for x, j in enumerate(current_list_svm)},
                        0.7)["min"]
                    rbo_min_models[query][model].append(rbo)
                    last_list_index_lm[query] = current_list_svm
        for query in kendall["reg"]:
            for model in kendall["reg"][query]:
                kendall["reg"][query][model] = np.mean(
                    kendall["reg"][query][model])
                kendall["max"][query][model] = np.mean(
                    kendall["max"][query][model])
                kendall["mean"][query][model] = np.mean(
                    kendall["mean"][query][model])
                rbo_min_models[query][model] = np.mean(
                    rbo_min_models[query][model])
                change_rate[query][model]["reg"] = np.mean(
                    change_rate[query][model]["reg"])
                change_rate[query][model]["winner"] = np.mean(
                    change_rate[query][model]["winner"])
        return kendall, change_rate, rbo_min_models
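# --- hedged illustration (weighted_kendall_tau and the rbo helper used above
# are project-specific and not shown; this only sketches the plain per-epoch
# Kendall tau that kendall["reg"] accumulates) ---
from scipy.stats import kendalltau

prev_ranking = [3, 1, 2, 5, 4]   # hypothetical ranked list at epoch e-1
curr_ranking = [3, 2, 1, 5, 4]   # hypothetical ranked list at epoch e
kt = kendalltau(prev_ranking, curr_ranking)[0]
print(round(kt, 3))              # 0.8 here: one discordant pair out of 10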
Beispiel #42
0
 def time_kendalltau(self, nan_policy, method, variant):
     tau, p_value = stats.kendalltau(self.a,
                                     self.b,
                                     nan_policy=nan_policy,
                                     method=method,
                                     variant=variant)
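# --- hedged sketch of the setup such an asv-style benchmark presumably relies
# on; the class name, parameter grid and array sizes below are assumptions ---
import numpy as np

class TimeKendallTau:
    params = [['propagate', 'omit'], ['auto', 'asymptotic'], ['b', 'c']]
    param_names = ['nan_policy', 'method', 'variant']

    def setup(self, nan_policy, method, variant):
        rng = np.random.default_rng(0)
        self.a = rng.random(500)   # the arrays the timed method compares
        self.b = rng.random(500)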
                               save_best_only=True),
            cb.EarlyStopping(patience=args.num_epochs // 8,
                             restore_best_weights=True),
            cb.CSVLogger(os.path.join(test_dir, 'train_log.csv')),
            cb.TerminateOnNaN()
        ])

    # Run on the validation set and assess statistics
    y_true = np.hstack([np.squeeze(x[1].numpy()) for x in iter(test_loader)])
    test_time = perf_counter()
    y_pred = np.squeeze(model.predict(test_loader))
    test_time = perf_counter() - test_time

    pd.DataFrame({
        'true': y_true,
        'pred': y_pred
    }).to_csv(os.path.join(test_dir, 'test_results.csv'), index=False)

    with open(os.path.join(test_dir, 'test_summary.json'), 'w') as fp:
        json.dump(
            {
                'r2_score': float(np.corrcoef(y_true, y_pred)[1, 0]**
                                  2),  # float() converts from np.float32
                'spearmanr': float(spearmanr(y_true, y_pred)[0]),
                'kendall_tau': float(kendalltau(y_true, y_pred)[0]),
                'mae': float(np.mean(np.abs(y_pred - y_true))),
                'rmse': float(np.sqrt(np.mean(np.square(y_pred - y_true))))
            },
            fp,
            indent=2)
Beispiel #44
0
 def func(a, b):
     return kendalltau(a, b)[0]
Beispiel #45
0
def kendall_correlation(gt_video_rank, test_video_rank):

    assert (len(gt_video_rank) == len(test_video_rank))
    tau, p_value = stats.kendalltau(range(len(gt_video_rank)), test_video_rank)

    return tau, p_value
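# --- hedged usage sketch (values are illustrative; `stats` is assumed to be
# scipy.stats from the snippet's elided imports). The ground-truth order is
# implicitly 0..n-1, so only the test ranks are supplied. ---
gt_rank = ['vid_a', 'vid_b', 'vid_c', 'vid_d']
test_rank = [0, 2, 1, 3]    # positions assigned by the method under test
tau, p = kendall_correlation(gt_rank, test_rank)
print(tau)                  # 1.0 only when the two orderings fully agree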
Beispiel #46
0
 def eval_corr_single(self, alpha, beta):
     corr, _ = stats.kendalltau(alpha, beta)
     return corr
        requeues, overlimits
    ]
    for i in range(0, len(x)):
        x[i] = float(x[i])
    data.append(x)
    # retrans = int(retrans)
    y.append(ratio)

x = ("ratio", "drop_count", "busy_time", "ext_busy_time", "rx_time", "tx_time",
     "scan_time", "freq", "noise", "bytes", "packets", "qlen", "backlog",
     "drops", "requeues", "overlimits")

IG = info_gain(data, y)
# print IG
print "iGGGGGGGGGGGGGGGGG"
for k, v in IG:
    try:
        print x[k], v
    except Exception:
        print k, len(x)
data = np.matrix(data).T
for i in range(0, len(x)):
    try:
        print x[i], 'a', stats.kendalltau(data[i, ], data[0, ])
    except Exception:
        print i, 'abc'
if csvfile:
    csvfile.close()
# del data, y
gc.collect()
                                })))

#initialize an empty list to collect the per-team rows
final_win_loss = []

#loop through all the teams and have the rows append
for j in range(len(team_ids)):
    row = team_win_loss(margins_long, j)
    final_win_loss.append(row)

final_win_loss = (pd.concat(final_win_loss).sort_values(
    by=["Wins", "Ties", "Points"],
    ascending=False).assign(Standing=np.arange(1, 11)).sort_values(
        by=["Team"]).reset_index(drop=True))

comparison = (final_win_loss.replace({
    "Team": mapping
}).set_index("Team").merge(
    how="right", right=predicted_ranks, left_index=True,
    right_index=True).loc[:,
                          ["Rank", "Standing"]].rename(columns={
                              "Rank": "Predicted",
                              "Standing": "Observed"
                          }))
tau, p_value = stats.kendalltau(comparison.Predicted, comparison.Observed)

#heatmap of the percentages
counts2 = counts.reindex(predicted_ranks.index) / 10
cmap = sns.diverging_palette(10, 150, as_cmap=True)
sns.heatmap(counts2, cmap=cmap, cbar=False, annot=counts2, linewidth=0.5)
plt.title("1000 Simulations - Percentage of Each Final Standing")
Beispiel #49
0
    def correlation_test(self,
                         data1,
                         data2,
                         normal_dist=True,
                         corr_algo="spearman"):
        """
        Checking if two samples are related. The following 3 correlation tests are provided.

        1. Pearson’s Correlation Coefficient
            Tests whether two samples have a linear relationship.

            Assumptions
                Observations in each sample are independent and identically distributed (iid).
                Observations in each sample are normally distributed.
                Observations in each sample have the same variance.

            Interpretation
                H0: the two samples are independent.
                H1: there is a dependency between the samples.

        2. Spearman’s Rank Correlation
            Tests whether two samples have a monotonic relationship.

            Assumptions
                Observations in each sample are independent and identically distributed (iid).
                Observations in each sample can be ranked.

            Interpretation
                H0: the two samples are independent.
                H1: there is a dependency between the samples.

        3. Kendall’s Rank Correlation
            Tests whether two samples have a monotonic relationship.

            Assumptions
                Observations in each sample are independent and identically distributed (iid).
                Observations in each sample can be ranked.

            Interpretation
                H0: the two samples are independent.
                H1: there is a dependency between the samples.

        Args:
            data1: input data1
            data2: input data2
            normal_dist: if samples have Normal Distribution.
            corr_algo: rank correlation algorithm name.

        Returns:
            correlations
        """
        algo_name_spearman = "spearman"
        algo_name_kendall = "kendall"

        if normal_dist is True:
            corr, p = pearsonr(data1, data2)
        else:
            if corr_algo == algo_name_spearman:
                corr, p = spearmanr(data1, data2)
            elif corr_algo == algo_name_kendall:
                corr, p = kendalltau(data1, data2)
            else:
                raise ValueError("not supported rank correlation!")

        # interpret the significance
        alpha = 0.05
        if p > alpha:
            print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p)
        else:
            print('Samples are correlated (reject H0) p=%.3f' % p)

        return corr
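# --- hedged, standalone illustration of the same alpha=0.05 decision rule the
# method applies (synthetic data, not from the original source) ---
import numpy as np
from scipy.stats import kendalltau

rng = np.random.default_rng(1)
data1 = rng.normal(size=200)
data2 = 0.5 * data1 + rng.normal(scale=0.5, size=200)
corr, p = kendalltau(data1, data2)
alpha = 0.05
if p > alpha:
    print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p)
else:
    print('Samples are correlated (reject H0) p=%.3f' % p)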
Beispiel #50
0
def correlateFile(f,directory,coefficient,pvalue, cutoff):

    develdict = c.OrderedDict()
    
    infile = open(directory+"/"+f, 'r')
    line = infile.readline()
    
    for line in infile:
        line = re.sub("\n", "", line)
        split = re.split("\t", line)
        score = split[1:]
        develdict[split[0]] = score
        
    taxondict = c.OrderedDict()
    taxonlist = list()
    
    name1 = re.sub("taxcoIn","taxcoList",f)
    name1 = re.split("\.",name1)
    name1 = name1[0]+"_"+coefficient+"_"+str(pvalue)+"."+name1[1]
    outfile1 = open(directory+"/"+name1,'w')
    outfile1.write("Taxon1\tTaxon2\t"+coefficient+"\tp-value")
    
    usedDict = dict()
        
    for taxon in develdict:
        if not(taxon in taxondict):
            thelist = list()
            taxonlist.append(taxon)
        else:
            thelist = taxondict[taxon]
            
        for taxon2 in develdict:
            if taxon == taxon2:
                correlation = (0.0, 0.0)
            else:
                # convert the scores once so every coefficient gets numeric input
                scores1 = list(map(float, develdict[taxon]))
                scores2 = list(map(float, develdict[taxon2]))
                if coefficient == "pearson":
                    correlation = scistats.pearsonr(scores1, scores2)
                elif coefficient == "spearman":
                    correlation = scistats.spearmanr(scores1, scores2, 0)
                elif coefficient == "kendall":
                    correlation = scistats.kendalltau(scores1, scores2)
                else:
                    print("This correlation coefficient is not supported. Please use spearman, kendall or pearson.")
                    exit(-1)
            if correlation[0] == 1:
                correlation = (numpy.nan, numpy.nan)
            rounded = correlation[0]
            rounded = round(abs(rounded),2)
            if((correlation[1]<pvalue) and (rounded > cutoff)):
                if(not ((taxon in usedDict)and(taxon2 in usedDict[taxon]))):
                    outfile1.write("\n"+taxon+"\t"+taxon2+"\t"+str(correlation[0])+"\t"+str(correlation[1]))
                    if(taxon in usedDict):
                        liste = usedDict[taxon]
                    else:
                        liste = list()
                    liste.append(taxon2)
                    usedDict[taxon] = liste 
                    if(taxon2 in usedDict):
                        liste = usedDict[taxon2]
                    else:
                        liste = list()
                    liste.append(taxon)
                    usedDict[taxon2] = liste    
            thelist.append(correlation)
            taxondict[taxon] = thelist
            
    outfile1.close()
    infile.close()
    
    name = re.sub("taxcoIn","taxcoCor",f)
    name = re.split("\.",name)
    name = name[0]+"_"+coefficient+"_"+str(pvalue)+"."+name[1]
    outfile = open(directory+"/"+name,'w')
    line1 = "Taxon"
    for taxon in taxondict:
        line1 = line1+"\t"+taxon
    outfile.write(line1+"\n")
    for taxon in taxondict:
        outfile.write(taxon)
        for correlation in taxondict[taxon]:
            if correlation[1] < pvalue:
                outfile.write("\t"+str(correlation[0]))
            else:
                outfile.write("\t0.0")
        outfile.write("\n")
        
    outfile.close()
def custom(a, b):
    v, p = stats.kendalltau(a, b)
    return round(p, 4)
Beispiel #52
0
def _correlation(table,
                 vars,
                 method='pearson',
                 display_plt=True,
                 height=2.5,
                 corr_prec=2):

    size = len(vars)
    result_arr = []

    for i in range(size):
        for j in range(i):
            if method == 'pearson':
                r, p = stats.pearsonr(table[vars[i]], table[vars[j]])
            elif method == 'spearman':
                r, p = stats.spearmanr(table[vars[i]], table[vars[j]])
            else:
                r, p = stats.kendalltau(table[vars[i]], table[vars[j]])

            result_arr.append([vars[i], vars[j], r, p])

    df_result = pd.DataFrame(result_arr, columns=['x', 'y', 'corr', 'p_value'])

    rb = BrtcReprBuilder()
    if display_plt:
        s_default = plt.rcParams['lines.markersize']**2.
        scatter_kws = {"s": s_default * height / 6.4}

        def corr(x, y, **kwargs):
            if kwargs['method'] == 'pearson':
                r, p = stats.pearsonr(x, y)
            elif kwargs['method'] == 'spearman':
                r, p = stats.spearmanr(x, y)
            else:
                r, p = stats.kendalltau(x, y)

            p_stars = ''
            if p <= 0.05:
                p_stars = '*'
            if p <= 0.01:
                p_stars = '**'
            if p <= 0.001:
                p_stars = '***'

            corr_text = '{:.{prec}f}'.format(r, prec=corr_prec)
            font_size = abs(r) * 15 * 2 / corr_prec + 5
            ax = plt.gca()
            ax.annotate(corr_text, [
                .5,
                .5,
            ],
                        xycoords="axes fraction",
                        ha='center',
                        va='center',
                        fontsize=font_size * height)
            ax.annotate(p_stars,
                        xy=(0.65, 0.6),
                        xycoords=ax.transAxes,
                        color='red',
                        fontsize=17 * height)

        g = sns.PairGrid(table, vars=vars, height=height)
        g.map_diag(sns.distplot)
        if method == 'pearson':
            g.map_lower(sns.regplot, scatter_kws=scatter_kws)
        else:
            g.map_lower(sns.regplot, lowess=True, scatter_kws=scatter_kws)
        g.map_upper(corr, method=method)

        fig_corr = plt2MD(plt)
        plt.clf()

        rb.addMD(
            strip_margin(""" ## Correlation Results
            | ### Correlation Matrix
            | {fig_corr}
            |
            | ### Correlation Table
            | {table}
            """.format(fig_corr=fig_corr, table=pandasDF2MD(df_result))))

        params = {'vars': vars, 'method': method, 'height': height}

    else:
        rb.addMD(
            strip_margin(""" ## Correlation Results
            | ### Correlation Table
            | {table}
            """.format(table=pandasDF2MD(df_result))))

        params = {'vars': vars, 'method': method}

    res = dict()
    res['params'] = params
    res['corr_table'] = df_result
    res['_repr_brtc_'] = rb.get()

    return {'result': res}
graphRed.add_edges_from([('0', '6'), ('0', '4'), ('1', '2'), ('1', '5'),
                         ('1', '6'), ('1', '7'), ('2', '1'), ('2', '3'),
                         ('2', '7'), ('3', '2'), ('3', '7'), ('3', '4'),
                         ('4', '3'), ('4', '7'), ('4', '6'), ('4', '0'),
                         ('4', '5'), ('5', '4'), ('5', '6'), ('5', '1'),
                         ('6', '0'), ('6', '1'), ('6', '7'), ('6', '4'),
                         ('6', '5')])

prRed = nx.pagerank(graphRed, dampFactor)
print("Approximate pagerank of red Graph:", prRed)

truePrRed, kendalLisRed = realPageRank(pr.get('RED'), prRed, 0, 7)
print("Real pagerank of red Graph:", truePrRed)

listRed = fromListToVector(prRed, 0, 7)
corrRed, _ = kendalltau(listRed, kendalLisRed)
print('Kendall Rank correlation between reds: %.5f' % corrRed)

print("\n\n")

# green graph
graphGreen = nx.DiGraph()
graphGreen.add_edges_from([
    ('8', '9'), ('8', '14'), ('9', '8'), ('9', '10'), ('9', '11'), ('9', '13'),
    ('10', '9'), ('10', '14'), ('10', '13'), ('10', '11'), ('11', '10'),
    ('11', '9'), ('11', '12'), ('12', '11'), ('12', '10'), ('12', '13'),
    ('13', '12'), ('13', '10'), ('13', '9'), ('13', '14'), ('14', '13'),
    ('14', '10'), ('14', '8')
])

prGreen = nx.pagerank(graphGreen, dampFactor)
Beispiel #54
0
 def _kendall(a, b):
     # kendalltau returns a tuple of the tau statistic and the p-value
     rs = kendalltau(a, b)
     return rs[0]
Beispiel #55
0
coefficient, p_val = stats.pearsonr(
    df[(df['retweet'] == 'No')
       & (df['hashtag_count'] != 4)]['favorite_count_detrend_weekdayhour'],
    df[(df['retweet'] == 'No') & (df['hashtag_count'] != 4)]['hashtag_count'])
pearson_favorite = ["Favorite", 'Pearson', coefficient, p_val]

coefficient, p_val = stats.spearmanr(df['retweet_count_detrend_weekdayhour'],
                                     df['hashtag_count'])
spearman_retweet = ["Retweet", 'Spearman', coefficient, p_val]
coefficient, p_val = stats.spearmanr(
    df[(df['retweet'] == 'No')
       & (df['hashtag_count'] != 4)]['favorite_count_detrend_weekdayhour'],
    df[(df['retweet'] == 'No') & (df['hashtag_count'] != 4)]['hashtag_count'])
spearman_favorite = ["Favorite", 'Spearman', coefficient, p_val]

coefficient, p_val = stats.kendalltau(df['retweet_count_detrend_weekdayhour'],
                                      df['hashtag_count'])
kendall_retweet = ["Retweet", 'Kendall', coefficient, p_val]
coefficient, p_val = stats.kendalltau(
    df[(df['retweet'] == 'No')
       & (df['hashtag_count'] != 4)]['favorite_count_detrend_weekdayhour'],
    df[(df['retweet'] == 'No') & (df['hashtag_count'] != 4)]['hashtag_count'])
kendall_favorite = ["Favorite", 'Kendall', coefficient, p_val]

pd.DataFrame([
    pearson_retweet, spearman_retweet, kendall_retweet, pearson_favorite,
    spearman_favorite, kendall_favorite
],
             columns=[
                 'Dependent Variable', 'Correlation Test', 'Coefficient',
                 'p-value'
             ])
def plot_p(ds_name):

    eif_dir = './result_p/' + ds_name
    results_eif, res_eif_full, ps = read_stream_csv(eif_dir, 3)

    # results_eif_end = np.array([res[10] for res in results_eif])

    eif_percision = [
        np.array(res)[:, 0] / np.array(res)[:, 1] for res in results_eif
    ]
    eif_recall = [
        np.array(res)[:, 0] / np.array(res)[:, 3] for res in results_eif
    ]
    timeline = [np.array(res)[:, 2] for res in results_eif]
    # eif_recall = results_eif_end[:,0]/results_eif_end[:,3]
    ps = np.array(ps, dtype=float)

    from scipy.interpolate import interp1d
    intp_precision, intp_recall = [], []
    for res in results_eif:
        res = np.insert(res, 0, np.array([0, 0, 0, 0]), axis=0)
        f_per = interp1d(
            np.array(res)[:, 2],
            np.array(res)[:, 0] / np.array(res)[:, 1])
        intp_pre = f_per(np.arange(res[-1, 2] - 255) + 255)

        f_rec = interp1d(
            np.array(res)[:, 2],
            np.array(res)[:, 0] / np.array(res)[:, 3])
        intp_rec = f_rec(np.arange(res[-1, 2] - 255) + 255)

        intp_precision.append(intp_pre)
        intp_recall.append(intp_rec)

    intp_precision, intp_recall = np.array(intp_precision), np.array(
        intp_recall)

    from scipy.stats import pearsonr, kendalltau

    corrs_recall = []
    corrs_precision = []
    for idx in np.linspace(0, len(intp_precision[0]), num=11)[1:]:
        idx = int(idx)
        pre = intp_precision[:, idx - 1]
        corr, _ = kendalltau(ps, pre)
        corrs_precision.append(corr)

        rec = intp_recall[:, idx - 1]
        corr, _ = kendalltau(ps, rec)
        corrs_recall.append(corr)

    return corrs_recall

    # print(corrs)
    # from matplotlib import rcParams, cycler
    # cmap = plt.cm.coolwarm
    # rcParams['axes.prop_cycle'] = cycler(color=cmap(np.linspace(0, 1, 11)))
    # from matplotlib.lines import Line2D
    # custom_lines = [Line2D([0], [0], color=cmap(0.), lw=4),
    #                 Line2D([0], [0], color=cmap(.5), lw=4),
    #                 Line2D([0], [0], color=cmap(1.), lw=4)]

    # fig, ax = plt.subplots()
    # for i in np.argsort(ps):
    #     lines = ax.plot(timeline[i], eif_recall[i])
    # ax.legend(custom_lines, ['Cold', 'Medium', 'Hot'])

    # rgba_colors = np.zeros((len(ps),4))
    # rgba_colors[:,0] = 1.0
    # rgba_colors[:, 3] = ps

    # # print(eif_recall.shape, eif_percision.shape, ps.shape)
    # fig, ax = plt.subplots()
    # im = ax.plot(timeline, eif_percision, c=ps, cmap=plt.cm.jet)
    # fig.colorbar(im, ax=ax)

    # plt.xlabel('Recall')
    # plt.ylabel('Precision')
    # plt.title(ds_name)
    # plt.show()  # unreachable: the function returns above
Beispiel #57
0
 def _kendall(a, b):
     rs = kendalltau(a, b)
     if isinstance(rs, tuple):
         return rs[0]
     return rs
mean_per_change_topk = np.zeros((len(top_K_protocols_list),n_batches-1))

for k, n in enumerate(top_K_protocols_list):
    top_protocol_idx = np.argsort(-mean)[0:n]
    mean_change_topk[k,:] = np.mean(np.abs(change[top_protocol_idx]),axis=0)
    mean_per_change_topk[k,:] = np.mean(np.abs(per_change[top_protocol_idx]),axis=0)

## find change in rankings per kendall tau
tau = np.zeros((len(top_K_protocols_list),n_batches-2))
for k1, n in enumerate(top_K_protocols_list):
    top_protocol_idx = np.argsort(-mean)[0:n]
    for k2 in [1,2,3]: # don't start with 0 - all are tied
        ranks1 = len(param_space) - rankdata(data[k2])[top_protocol_idx]
        ranks2 = len(param_space) - rankdata(data[k2+1])[top_protocol_idx]
        #print(k1,k2,ranks1,ranks2)
        tau[k1,k2-1] = kendalltau(ranks1,ranks2)[0]

## plot
batches = np.arange(n_batches-1)+1
plt.subplots(3,3,figsize=figsize)

ax0 = plt.subplot2grid((3, 3), (0, 0))  
ax1 = plt.subplot2grid((3, 3), (0, 1))  
ax2 = plt.subplot2grid((3, 3), (0, 2))
ax3 = plt.subplot2grid((3, 3), (1, 0))
ax4 = plt.subplot2grid((3, 3), (1, 1))
ax5 = plt.subplot2grid((3, 3), (1, 2))
ax6 = plt.subplot2grid((3, 3), (2, 0), colspan=3)

ax0.set_title('a', loc='left', weight='bold', fontsize=8)
ax1.set_title('b', loc='left', weight='bold', fontsize=8)
                  'Selected fuel consumption(L)', 'Selected kilometer(L)', 'Selected speed(km/h)',
                  'Service stop status', 'Odometer speed', 'Wheel speed', 'Engine torque mode',
                  'Percentage of torque on driving instructions', 'Actual percentage of engine torque', 'RPM',
                  'Coolant temperature', 'Oil pressure', 'ECU fuel consumption', 'Accelerator pedal position',
                  'Parking brake switch', 'Clutch switch', 'Brake switch', 'Urea tank level',
                  'Urea tank temperature', 'Engine input voltage', 'Ignition switch voltage',
                  'Cumulative engine running time', 'Cumulative engine revolutions', 'Engine fuel rate',
                  'Instantaneous engine fuel rate', 'Average fuel consumption', 'Particle catcher inlet pressure',
                  'Relative boost pressure', 'Intake manifold temperature', 'Absolute boost pressure',
                  'Discharge temperature', 'Atmospheric pressure', 'Cabin temperature', 'Atmospheric temperature',
                  'Cold start light', 'Kilometers of this driving cycle', 'Total kilometers', 'Fuel contains water',
                  'Target gear', 'Actual speed ratio', 'Current gear', 'Gauge fuel level',
                  'Odometer subcounts kilometer', 'Total odometer kilometer', 'Integral kilometer',
                  'Integral fuel consumption', 'Interval brake times', 'Merger marks', 'Compensation transmission']
df.set_index('ID',inplace=True)

a, b = df['Interval brake times'], df['Accelerator pedal position']
df1 = pd.concat([a, b], axis=1)
df1 = df1.dropna()
print(a.corr(b, method='spearman'))
# sns.regplot(x='Brakes', y='Accelerator', data=df1)
# plt.show()
# x1 = stats.pearsonr(a, b)

x2 = stats.spearmanr(a, b)
#
x3 = stats.kendalltau(a, b)
#
print(x2,x3)
if __name__ == '__main__':
    pass
Beispiel #60
0
            if not get_is_mol(player):
                appearance = AppearanceExtractor.get_relative_occurrence(player, parsed_video, [True])
                appearances.append(appearance)

        for feat1, feat2 in itertools.permutations(appearances, 2):
            input.append(feat1)
            output.append(feat2)

r, p_value = pearsonr(input, output)
print("Pearson Test (Between):")
print("R value: " + str(r))
print("R-squared value: " + str(r ** 2))
print("p-value: " + str(p_value))
print()

t, p_value = kendalltau(input, output)
print("Kendall Test (Between):")
print("Tau value: " + str(t))
print("p-value: " + str(p_value))
print()

input = []
output = []
for season in TEST_SEASONS:
    for episode in itertools.count(1):
        appearances = []
        parsed_video = VideoParser.load_parsed_video(season, episode)
        if parsed_video is None:
            break

        for player in parsed_video.alive_players: