Example #1
File: all.py Project: KWMalik/tau
    def _extract(self, img):
        cA, (cH, cV, cD) = dwt2(img, self.mother)
        cH2 = cH.reshape(cH.size)
        cV2 = cV.reshape(cV.size)
        cD2 = cD.reshape(cD.size)
        assert cH2.shape == cV2.shape == cD2.shape
        buffers = (cH2, cV2, cD2)
        chunk_size = (cH2.size * 3) // (self.total_bits)
        seq0 = self.seq0[:chunk_size]
        seq1 = self.seq1[:chunk_size]

        byte = 0
        output = bytearray()
        for i in range(self.total_bits):
            target = buffers[i % 3]
            offset = (i//3) * chunk_size
            chunk = target[offset : offset + seq0.size]
            corr0, _ = pearsonr(chunk, seq0)
            corr1, _ = pearsonr(chunk, seq1)
            bit = int(corr1 > corr0)
            byte = (byte << 1) | bit
            if i % 8 == 7:
                output.append(byte)
                byte = 0
        #print repr(output)
        return output
def norm_distri(DF):

## Creating a Normalised data set of filtered products: based on code from http://chrisalbon.com/python/pandas_normalize_column.html
	#calculating the normalised values of the SP mutant products

	DF["norm_fcp"] = (DF['fcp'] - DF['fcp'].min()) / (DF['fcp'].max() - DF['fcp'].min())
	
	DF.plot(kind='scatter', x='norm_fcp', y='fc');
	plt.ylim((-0.01,1.01))
	plt.xlim((-0.01,1.01))
	plt.ylabel('Double Mutant FC Values')
	plt.xlabel('Normalised Product of corresponding Single Mutant FC Values')
	#plot the linear correlation (default = red)
	#assign x, y because x and y above are in plot()'s namespace 
	x=DF['norm_fcp']
	y=DF['fc']
	z = np.polyfit(x, y, 1)
	p = np.poly1d(z)
	pylab.plot(x,p(x),"r-")
	# the line equation:
	#print "y=%.6fx+(%.6f)"%(z[0],z[1])

	from scipy.stats import pearsonr
	pearsonr(x, y)
	#print pearsonr(x, y)
	plt.suptitle(pearsonr(x, y))
	pylab.show()
def compare_images(indexpairs,imagematrix,pca_list,Target,Best):
    imagebag = []
    corrarray = []
    #calculate correlations
    for image in imagematrix:
        corr = pearsonr(image,Target)[0]
        corrarray.append(corr)
    corrarray = np.array(corrarray)
    #calculate best correlation in this generation
    maxcorr = np.amax(corrarray)
    #calculate average correlation in this generation
    avgcorr = np.mean(corrarray)
    #check if best correlation this gen is better than overall best
    if maxcorr > pearsonr(Best,Target)[0]:
        maxindex = np.argmax(corrarray)
        Best = np.array(imagematrix[maxindex])
    #Let the Hunger Games begin
    for i in indexpairs:
        corr1 = corrarray[i[0]]
        corr2 = corrarray[i[1]]
        if corr1 > corr2:
            imagebag.append(i[0])
        else:
            imagebag.append(i[1])
    #survivors
    newindices = np.array(random.sample(imagebag,100))
    pca_list_nextgen = pca_list[newindices]
    return pca_list_nextgen,maxcorr,avgcorr,Best
Example #4
def xcorr(x, y, method='pearsonr', shift=5):
    from numpy import correlate, array, argmax, arange, linspace
    from scipy.stats import spearmanr, pearsonr
    method = method.lower()
    vCorr = []
    tCorr = []
    if shift == 0:
        # no shifting: correlate the overlapping parts of the two series directly
        if len(x) >= len(y):
            if method == 'pearsonr':
                v, t = pearsonr(x[0:len(y)], y)
            elif method == 'spearmanr':
                v, t = spearmanr(x[0:len(y)], y)
        elif len(x) < len(y):
            if method == 'pearsonr':
                v, t = pearsonr(x, y[0:len(x)])
            elif method == 'spearmanr':
                v, t = spearmanr(x, y[0:len(x)])
        vCorr.append(v)
    elif 0 < shift < len(y):
        # slide windows of length `shift` along y and correlate each one with
        # every equally long window of x
        iCounter = 0
        for j in range(0, len(y) - shift, shift):
            vCorr.append([])
            tCorr.append([])
            for i in range(0, len(x)):
                if len(x[i:i + len(y[j:j + shift])]) == len(y[j:j + shift]):
                    if method == 'pearsonr':
                        v, t = pearsonr(x[i:i + len(y[j:j + shift])], y[j:j + shift])
                    elif method == 'spearmanr':
                        v, t = spearmanr(x[i:i + len(y[j:j + shift])], y[j:j + shift])
                    vCorr[iCounter].extend([v])
                    tCorr[iCounter].extend([i])
            iCounter = iCounter + 1

    return array(vCorr), array(tCorr)
Example #5
    def predict(self, X):

        X = np.array(X)

        predictions = [self.classes_[int(pearsonr(self.B1, x)[0] > pearsonr(self.A1, x)[0])] for x in X]

        return predictions
Example #6
def train_model(lrmodel, opt, cost, X, Y, devX, devY, devscores):
    """
    Train model, using pearsonr on dev for early stopping
    """
    done = False
    best = -1.0
    r = np.arange(1, 6)

    train_set = ArrayIterator(X=X, y=Y, make_onehot=False)
    valid_set = ArrayIterator(X=devX, y=devY, make_onehot=False)

    eval_epoch = 10

    while not done:
        callbacks = Callbacks(lrmodel, eval_set=valid_set)

        lrmodel.fit(train_set, optimizer=opt, num_epochs=eval_epoch,
                    cost=cost, callbacks=callbacks)

        # Every 10 epochs, check Pearson on development set
        yhat = np.dot(lrmodel.get_outputs(valid_set), r)
        score = pearsonr(yhat, devscores)[0]
        if score > best:
            neon_logger.display('Dev Pearson: {}'.format(score))
            best = score
            bestlrmodel = copy.copy(lrmodel)
        else:
            done = True

        eval_epoch += 10

    yhat = np.dot(bestlrmodel.get_outputs(valid_set), r)
    score = pearsonr(yhat, devscores)[0]
    neon_logger.display('Dev Pearson: {}'.format(score))
    return bestlrmodel
def bootstrap_test(sample1, sample2, k = 1000, p_value = 0.05, two_tailed = True):
    """
    Test the null hypothesis that the two samples are independent of each other,
    using the Pearson coefficient.
    Note that nan values are kept during the resampling (and eliminated when the
    Pearson coefficient is computed).

    Remember: if the return value is True, the coefficient is NOT significant.
    """
    # eliminate all entries which have a nan in one of the sample. 
    
    sample1_bis, sample2_bis = zip(*[zz for zz in zip(sample1, sample2) if not np.isnan(zz[0]) and not np.isnan(zz[1])])
    r_sample = pearsonr(sample1_bis, sample2_bis)[0]

    sample1_bis = np.array(sample1_bis)
    sample2_bis = np.array(sample2_bis)
    
    n = len(sample1_bis)
    if n != len(sample2_bis):
        raise ValueError("Samples must have the same size.")

    r_resample = np.zeros(k)
    for i in xrange(k):
        s1_rand = sample1_bis[randint(0, n, n)] # Resampling with the same size
        s2_rand = sample2_bis[randint(0, n, n)] 
        s1_rand_bis, s2_rand_bis = zip(*[zz for zz in zip(s1_rand, s2_rand) if not np.isnan(zz[0]) and not np.isnan(zz[1])])
        r_resample[i] = pearsonr(s1_rand_bis, s2_rand_bis)[0]
        
    ci = np.percentile(r_resample, [100*p_value/2., 100*(1.-p_value/2.)])
    
    #print "Percentiles:", ci
    
    return  ci[0]<r_sample<ci[1]
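A self-contained sketch of the same idea with synthetic data (the variable names, seed, and sample sizes below are illustrative, not from the original project): the two samples are resampled independently to approximate the distribution of r under the null hypothesis, and the observed r is compared against its central percentiles.

import numpy as np
from scipy.stats import pearsonr

rng = np.random.default_rng(0)
n = 200
x = rng.normal(size=n)
y = 0.5 * x + rng.normal(size=n)        # hypothetical paired samples

r_obs = pearsonr(x, y)[0]

# break the pairing: resample x and y independently to approximate r under the null
r_null = np.empty(1000)
for i in range(1000):
    r_null[i] = pearsonr(x[rng.integers(0, n, n)], y[rng.integers(0, n, n)])[0]

lo, hi = np.percentile(r_null, [2.5, 97.5])   # central 95% of the null distribution
print(r_obs, (lo, hi), lo < r_obs < hi)       # False here => correlation is significant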
Example #8
def plot_gpa_vs_grade(filename=None):
    plt.scatter(GPA,grade)
    plt.xlabel("Prior GPA")
    plt.ylabel("Class Grade (on GPA scale)")
    plt.title("GPA vs. Grade in BIOL 141 FA12")
    print pearsonr(GPA,grade)
    maybesave(filename)
def on_off_experiment1():
    """compare MI vs Gini on synthetic motifs"""
    n, L, des_ic, num_motifs = 50,10,10,1000
    sigma = 1
    copies = 10*n
    def f(Ne):
        return motif_ic(sample_on_off_motif(sigma, Ne, L, copies, n)) - des_ic
    Ne = log_regress_spec2(f,[1,10],tol=10**-5)
    motifs = [sample_on_off_motif(sigma, Ne, L, copies, n) for i in trange(num_motifs)]
    ics = map(motif_ic, motifs)
    ginis = map(motif_gini, motifs)
    mis = map(total_motif_mi, motifs)
    plt.subplot(1,3,1)
    plt.scatter(ics, ginis)
    plt.xlabel("IC (bits)")
    plt.ylabel("Gini")
    print "ic vs gini:",pearsonr(ics,ginis)
    plt.subplot(1,3,2)
    plt.scatter(ics, mis)
    plt.xlabel("IC (bits)")
    plt.ylabel("MI (bits)")
    print "ic vs mi:",pearsonr(ics,mis)
    plt.subplot(1,3,3)
    plt.scatter(ginis, mis)
    plt.xlabel("Gini")
    plt.ylabel("Mi (bits)")
    print "gini vs mi:",pearsonr(ginis,mis)
    plt.tight_layout()
    param_template = ", ".join("{0}=%({0})s".format(v) for v in "n L des_ic sigma copies num_motifs".split())
    param_string = param_template % vars()
    plt.title("Gini vs MI in On-Off Simulations")
    print "Pearson correlation:",pearsonr(ginis,mis)
    print "parameters:", param_string
Example #10
def user_analysis():
    for t in t_test_set:
        print "Analyzing ", t
        u_data = pd.read_csv(location + "par_normed_exp_data_repu_norm_users_" + t + ".csv")
        act = u_data["act"]
        exp = u_data["exp"]
        par = u_data["par"]
        # par = par-act
        act, exp, par = normalize_par(act, exp, par)
        ######### analyzing user side
        plot_distribution(act, "act", t)
        plot_distribution(exp, "exp", t)
        plot_distribution(par, "par", t)
        print ".data description"
        # print '..', u_data.describe()
        print "..act", mean(act), median(act), std(act)
        print "..exp", mean(exp), median(exp), std(exp)
        print "..par", mean(par), median(par), std(par)

        all_data = []
        all_data.append(act)
        all_data.append(exp)
        all_data.append(par)
        plot_correlation(all_data, t)
        print ".correlation between features"
        print "..act-exp", stats.pearsonr(act, exp)
        print "..act-par", stats.pearsonr(act, par)
        print "..par-exp", stats.pearsonr(par, exp)
    return 0
Example #11
def plot_si_vs_grade(filename=None):
    plt.scatter(si,grade)
    plt.xlabel("SI attendance count")
    plt.ylabel("Class Grade (on GPA scale)")
    plt.title("SI attendance vs. Grade in BIOL 141 FA12")
    print pearsonr(si,grade)
    maybesave(filename)
 def add_pearson_cor(self, vector1, vector2, rownames=False):
     '''
     Given two vectors, calculate the Pearson correlation and
     append the (correlation, p-value) pair to
     self.pearson_dic
     '''
     if rownames == False:
         pearson_cor, pearson_pval = stats.pearsonr(vector1, vector2)
         self.pearson_dic.append((pearson_cor, pearson_pval))
     elif rownames == True:
         if vector1[0] == vector2[0]:
             row_name = vector1[0]
             try:
                 vector1 = [float(i) for i in vector1[1:len(vector1)]]
                 vector2 = [float(i) for i in vector2[1:len(vector2)]]
             except ValueError:
                 print('Rowname %s has non-numeric values. Skipping pearson calculation.' % row_name)
                 return None
                 '''
                 vector1 = [0] * (len(vector1) - 1)
                 vector2 = [0] * (len(vector2) - 1)
                 self.nacount += 1
                 self.nagenes.append(row_name)
                 row_name = 'NAgene'
                 '''
             pearson_cor, pearson_pval = stats.pearsonr(vector1, vector2)
             self.pearson_dic[row_name].append((pearson_cor, pearson_pval))
         else:
             print('Row name of two vectors not equal, %s and %s' %(vector1[0], vector2[0]))
             sys.exit('Exiting...')
     else:
         print('Rownames must be either True or False.')
         sys.exit('Exiting...')
Example #13
def train_model(lrmodel, X, Y, devX, devY, devscores):
    """
    Train model, using pearsonr on dev for early stopping
    """
    done = False
    best = -1.0
    r = np.arange(1, 6)

    while not done:
        # Every 100 epochs, check Pearson on development set
        lrmodel.fit(X, Y, verbose=2, shuffle=False,
                    validation_data=(devX, devY))
        yhat = np.dot(lrmodel.predict_proba(devX, verbose=2), r)
        score = pearsonr(yhat, devscores)[0]
        if score > best:
            print score
            best = score
            bestlrmodel = copy.deepcopy(lrmodel)
        else:
            done = True

    yhat = np.dot(bestlrmodel.predict_proba(devX, verbose=2), r)
    score = pearsonr(yhat, devscores)[0]
    print 'Dev Pearson: ' + str(score)
    return bestlrmodel
def pearson_r():
    tab_strings, tab_values, ground_truth = read_in_sim_data_as_table()
    tab_values = np.array(tab_values).transpose()
    print tab_values.shape
    tab_values = list(tab_values)
    for i in range(0, len(tab_values)):
        print pearsonr(tab_values[i], ground_truth)
def task3():
	iris = datasets.load_iris()
	data = iris.data
	target = iris.target
	target_names = iris.target_names
	feature_names = iris.feature_names
	model = PCA(n_components=data.shape[1])
	model.fit(data, target)
	data_new = model.transform(data)
	plot_iris(data_new, target, target_names)
	corrs1 = []
	corrs2 = []
	x1_new = data_new[:, 0] - data_new[:, 0].mean()
	x2_new = data_new[:, 1] - data_new[:, 1].mean()
	for i in xrange(data.shape[1]):
		x_i = data[:, i] - data[:, i].mean()
		corrs1.append(np.abs(pearsonr(x_i, x1_new)[0]))
		corrs2.append(np.abs(pearsonr(x_i, x2_new)[0]))

	list_pc1 = []
	list_pc2 = []
	for i in xrange(len(corrs1)):
		if corrs1[i] > corrs2[i]:
			list_pc1.append(i+1)
		else:
			list_pc2.append(i+1)
	write_answer_3(list_pc1, list_pc2)
def correlation():
    '''Pearson correlation, and two types of rank correlation (Spearman, Kendall)
    comparing age and %fat (measured by dual-photon absorptiometry) for 18 normal adults.
    '''
    
    # Get the data
    inFile = 'altman_11_1.txt'
    data = np.genfromtxt(inFile, delimiter=',')
    x = data[:,0]
    y = data[:,1]
    
    # --- >>> START stats <<< ---
    # Calculate correlations
    # Resulting correlation values are stored in a dictionary, so that it is
    # obvious which value belongs to which correlation coefficient.
    corr = {}
    corr['pearson'], _ = stats.pearsonr(x,y)
    corr['spearman'], _ = stats.spearmanr(x,y)
    corr['kendall'], _ = stats.kendalltau(x,y)
    # --- >>> STOP stats <<< ---
    
    print(corr)    
    
    # Assert that Spearman's rho is just the correlation of the ranksorted data
    np.testing.assert_almost_equal(corr['spearman'], stats.pearsonr(stats.rankdata(x), stats.rankdata(y))[0])
    
    return corr['pearson']  # should be 0.79208623217849117
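An illustrative aside (synthetic data, not part of the Altman example): on a monotone but nonlinear relation the rank correlations stay at 1 while Pearson drops below 1, which is the practical difference between the coefficients the docstring compares.

import numpy as np
from scipy import stats

x = np.linspace(1, 10, 50)
y = np.exp(x)                        # strictly increasing, but far from linear

print(stats.pearsonr(x, y)[0])       # noticeably below 1
print(stats.spearmanr(x, y)[0])      # exactly 1.0
print(stats.kendalltau(x, y)[0])     # exactly 1.0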
def GroupRegression(GroupDF,goodsubj,feedback,numFolds=10,addMotion=True,verbose=False):

    numberOfICs=10
    columnNames=[]
    for rsnNumber in range(numberOfICs):
            columnNames.append('RSN%d' % rsnNumber)
    dmnIdeal=pd.read_csv('/home/jmuraskin/Projects/NFB/analysis/DMN_ideal_2.csv')

    SubjectDF = GroupDF[GroupDF.Subject_ID.isin(goodsubj)].groupby(['Subject_ID','FB','TR']).mean()
    clf = linear_model.LinearRegression()

    for indx,subj in enumerate(unique(GroupDF['Subject_ID'])):
        if verbose:
            print "Running Subject- %s" % subj
        if addMotion:
            X=np.column_stack((np.array(SubjectDF.loc[subj,feedback][columnNames]),zscore(SubjectDF.loc[subj,feedback]['fd'])))
        else:
            X=np.array(SubjectDF.loc[subj,feedback][columnNames])
        if verbose:
            print X.shape
        predicted,intercepts,coef = leaveOneOutCV(clf,X,dmnIdeal['Wander']-dmnIdeal['Focus'],numFolds=numFolds)
        if indx==0:
            groupGLM=pd.DataFrame({'TR':range(408),'predicted':predicted,'subj':[subj]*408})
            coefs=pd.DataFrame({'Coef':coef,'pe':range(X.shape[1]),'subj':[subj]*X.shape[1]})
            performance=pd.DataFrame({'R':[pearsonr(dmnIdeal['Wander']-dmnIdeal['Focus'],predicted)[0]],'subj':[subj]})
        else:
            df=pd.DataFrame({'TR':range(408),'predicted':predicted,'subj':[subj]*408})
            groupGLM=pd.concat((groupGLM,df),ignore_index=True)
            coefs=pd.concat((coefs,pd.DataFrame({'Coef':coef,'pe':range(X.shape[1]),'subj':[subj]*X.shape[1]})),ignore_index=True)
            performance=pd.concat((performance,pd.DataFrame({'R':[pearsonr(dmnIdeal['Wander']-dmnIdeal['Focus'],predicted)[0]],'subj':[subj]})),ignore_index=True)

    return groupGLM,coefs,performance
Example #18
def allDirectionalityRatios(ratioFunction):
    """
    A simple plot which calculates all directionality ratios, plots them,
    and puts lines at the 20 most highly expressed genes (supplementary figure from our paper).
    This is mostly matplotlib code.
    """
    if not os.path.exists("savedHeatmaps"):
        os.mkdir("savedHeatmaps")
    wildRatio = np.log(ratioFunction("Wildtype_0min_BglII_rep1"))
    for j, dataset in enumerate(datasets):
        ax = plt.subplot(len(datasets), 1, j + 1)
        curRatio = (ratioFunction(dataset))
        plt.title("{1},  r = {0:.2f}, p={2:.2e}".format(pearsonr(curRatio, wildRatio)[0], names[dataset],
                                                      pearsonr(curRatio, wildRatio)[1]), fontsize=10)
        plt.tick_params(axis='both', which='major', labelsize=10)
        plt.tick_params(axis='both', which='minor', labelsize=8)
        plt.plot(curRatio)
        plt.ylim((0.25, 0.75))
        plt.xlim((0, len(curRatio)))
        #plt.ylim((0, 1))
        plt.yticks((0.25, 0.5, 0.75))
        geneCoor = [1162773, 3509071, 1180887, 543099, 1953250, 2522439, 3328524, 1503879, 900483, 242693, 3677144, 3931680, 3677704, 3762707, 3480870, 3829656, 1424678, 901855, 1439056, 3678537]
        genePos = [i / 10000. for i in geneCoor]
        #genePos = []
        for lpos in genePos:
            plt.vlines(lpos , -.8, .8, alpha=0.2, linewidth=1, color="black")
        plt.xticks([0, 50, 100, 150, 200, 250, 300, 350, 400], ["" for i in xrange(9)], fontsize=98)
        removeAxes(ax=ax)
        plt.subplots_adjust(0.07, 0.05, 0.94, 0.95, 0.2, 0.5)



    plt.show()
    exit()
def dualPlot(age, meanWithin, meanBetween, title):

    fig, (within, between) = plt.subplots(1, 2, sharex=True, sharey=False)

    # linear fits for the scatter plots
    wP = np.polyfit(age, meanWithin, 1)
    bP = np.polyfit(age, meanBetween, 1)
    xnew = np.arange(age.min() - 1, age.max() + 1, 0.1)
    wFit = np.polyval(wP, xnew)
    bFit = np.polyval(bP, xnew)

    within.set_title("within network")
    between.set_title("between network")

    withinCorr, withinP = st.pearsonr(age, meanWithin)
    within.plot(age, meanWithin, "k.")
    within.plot(xnew, wFit, "r", label=(str(np.round(withinCorr, 2)) + " " + str(np.round(withinP, 4))))
    within.set_xlabel("mean connectivity")
    within.set_ylabel("age")
    within.legend()

    betweenCorr, betweenP = st.pearsonr(age, meanBetween)
    between.plot(age, meanBetween, "k.")
    between.plot(xnew, bFit, "b", label=(str(np.round(betweenCorr, 2)) + " " + str(np.round(betweenP, 4))))
    between.set_xlabel("mean connectivity")
    between.set_ylabel("age")
    between.legend()

    fig.suptitle(title)
    plt.show()
    raw_input("Press Enter to continue...")
    plt.close()
Example #20
def main(kernel, stats_dir, outputfile=None):
    stats = get_results(stats_dir, kernel)
    variations = get_vars()

    x = np.array([variations[s] for s in SIVAL])
    y = np.array([difference(stats, s) for s in SIVAL])
    r = pearsonr(x, y)[0]
    p = pearsonr(x, y)[1]
    print 'Correlation: %f' % r
    print 'P-value: %f' % p

    X = np.column_stack([x, np.ones(x.size)])
    bestfit = np.linalg.lstsq(X, y)[0]
    xx = np.linspace(np.min(x), np.max(x), 3)
    XX = np.column_stack([xx, np.ones(xx.size)])
    yy = np.dot(XX, bestfit)

    fig = pl.figure(figsize=(16,8))
    ax = fig.add_subplot(111)
    ax.scatter(x, y, s=25, edgecolor='none', color='k')
    ax.plot(xx, yy, '-', lw=3, color='k')
    
    if outputfile is None:
        pl.show()
    else:
        pdf = PdfPages(outputfile)
        pdf.savefig(fig, bbox_inches='tight')
        pdf.close()
Example #21
def correlation_bw_sinks(ref_sink,other_sinks, cut=True):
	'''ref sink -- CombinedSinkData() object
	other_sinks -- dictionary of CombinedSinkData() objects
	cut -- removes the first 1 tBH from the mdot data b/c this is the transient phase and always increases; we want to know whether the equilibrium rates are correlated
	correlation between ref sink and sinks in N different models, either at same location as ref sink or diff location
	returns pearson r correlation coeff for same location and different location
	'''
	r_same_loc, r_diff_loc= [],[]
	for key in other_sinks.keys():
		size= min(other_sinks[key].mdot.shape[1],ref_sink.mdot.shape[1])
		#if cut=False, then size is all we need, if cut=True then if ref starts at 10 and other at 2 then cannot just grab 10:size and 2:size, would be unequal array lengths
		cut_ref= index_before_tBH(ref_sink,1.)
		cut_other= index_before_tBH(other_sinks[key],1.)
		end_ref= size-other_sinks[key].mdot[:,:cut_other].shape[1] #if other starts at index 10, and ref at 5, then ref should end sooner than other (size - 10)
		end_other= size-ref_sink.mdot[:,:cut_ref].shape[1]
		#number of indices to grab has been figured out, finally let's do the pearson r calculation
		for i in range(64):
			if i == 0: #same location 
				if cut: r_same_loc.append( pearsonr(other_sinks[key].mdot[i,cut_other:end_other],ref_sink.mdot[0,cut_ref:end_ref])[0] )
				else: r_same_loc.append( pearsonr(other_sinks[key].mdot[i,:size],ref_sink.mdot[0,:size])[0] )
			else: #different location 
				if cut: r_diff_loc.append( pearsonr(other_sinks[key].mdot[i,cut_other:end_other],ref_sink.mdot[0,cut_ref:end_ref])[0] )
				else: r_diff_loc.append( pearsonr(other_sinks[key].mdot[i,:size],ref_sink.mdot[0,:size])[0] )
	assert(len(r_same_loc) == len(other_sinks.keys()) )
	assert(len(r_diff_loc) == (len(other_sinks.keys())*(64-1) ) )
	return r_same_loc,r_diff_loc
Example #22
def new_abbas():

	jurkatA = 0.25; jurkatB = 0.05; jurkatC = 0.01; jurkatD = 0.002;
	im9A = 0.125; im9B = 0.317; im9C = 0.495; im9D = 0.333;
	rajiA = 0.25; rajiB = 0.475; rajiC = 0.165; rajiD = 0.333;
	thp1A = 0.375; thp1B = 0.158; thp1C = 0.33; thp1D = 0.333;

	i = 0
	result = []
	while i <= 95:
		liste = []
		j = 0
		while j <= 95:
			cells = []
			for k in range(3, 7):
				line = linecache.getline('../../../Master_files/abbas/Abbas_log_' + str(i) + '_' + str(j), k)
				cells.append([float(line.split('\t')[1]), float(line.split('\t')[2]), float(line.split('\t')[3]), float(line.split('\t')[4])])
			
			cells = [list(x) for x in zip(*cells)]
			
			for k in range(len(cells)):
				cells[k] = [float(l)/sum(cells[k]) for l in cells[k]]

			corrA = stats.pearsonr(cells[0], [jurkatA, im9A, rajiA, thp1A])
			corrB = stats.pearsonr(cells[1], [jurkatB, im9B, rajiB, thp1B])
			corrC = stats.pearsonr(cells[2], [jurkatC, im9C, rajiC, thp1C])
			corrD = stats.pearsonr(cells[3], [jurkatD, im9D, rajiD, thp1D])

			liste.append((corrA[0] + corrB[0] + corrC[0] + corrD[0]) / 4.0)
			j += 5
		result.append(liste)
		i += 5
	result = [list(x) for x in zip(*result)]

	return result
Example #23
	def corr(self):
		index=['RMSE','TRMSE','area','CLength','CEdges']
		index_Rank = [e+'_Rank' for e in index]
		for i in index:
			setattr(self,i+'_corr',pearsonr([route[i] for route in self.routes],[route['possibility'] for route in self.routes]))
		for i in index_Rank:
			setattr(self,i+'_corr',pearsonr([route[i] for route in self.routes],[route['possibility_Rank'] for route in self.routes]))
Example #24
def ks_2d_2samp(data1, data2):
    """Computes the 2-dimensional Kolmogorov-Smirnof statistic on 2 samples.
       This is a two-sided test for the null hypothesis that 2 independent samples are drawn from the same continuous distribution.
        Parameters :
        a, b : sequence of 1-D ndarrays
        two arrays of sample observations assumed to be drawn from a continuous distribution, sample sizes can be different

        Returns :
        D : float   KS statistic
        p-value : float  two-tailed p-value.
        High value means we cannot reject the hypothesis that they are from the same distribution.
        low D => high p
    """
    npt1 = np.shape(data1)[0]
    #Compute D using data1 as the origins
    D1 = np.max([max_diff_for_orig(data1[i,:], data1, data2) for i in xrange(npt1)])
    npt2 = np.shape(data2)[0]
    #Compute D using data2 as the origins
    D2 = np.max([max_diff_for_orig(data2[i,:], data1, data2) for i in xrange(npt2)])
    #Their mean
    D = (D1+D2)/2.
    #The approximate p-value: this is detailed in NR 14.8
    neff = npt1*npt2/(1.*npt1+npt2)
    (rr1,p) = st.pearsonr(data1[:,0], data1[:,1])
    (rr2,p) = st.pearsonr(data2[:,0], data2[:,1])
    reff = (rr1**2+rr2**2)/2.
    ksarg = neff*D/(1+np.sqrt(1-reff)*(0.25-0.75/np.sqrt(neff)))
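The snippet above stops before producing the p-value promised in its docstring. A hedged sketch of that last step (an assumption about how it might continue, not the original author's code), using the Numerical Recipes 14.8 form of the statistic and SciPy's Kolmogorov survival function:

import numpy as np
from scipy.special import kolmogorov

def ks_2d_pvalue(D, neff, reff):
    # NR 14.8 approximation: scale D by sqrt(neff) and the average squared
    # correlation of the samples, then evaluate the Kolmogorov survival function.
    ksarg = np.sqrt(neff) * D / (1 + np.sqrt(1 - reff) * (0.25 - 0.75 / np.sqrt(neff)))
    return kolmogorov(ksarg)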
Example #25
def test_recon(pAnom, scaled, start1, end1, start2, end2):
  """Compute reconstruction statistics."""

  pAnom1 = pAnom.ix[start1:end1].values.flatten()
  pAnom2 = pAnom.ix[start2:end2].values.flatten()

  scaled1 = scaled.ix[start1:end1].values.flatten()
  scaled2 = scaled.ix[start2:end2].values.flatten()

  mpAnom1 = pAnom1.mean()
  mpAnom2 = pAnom2.mean()

  pMinusScaled = pAnom2 - scaled2
  pMinusCal    = pAnom2 - mpAnom1
  pMinusVer    = pAnom2 - mpAnom2

  SSD  = (pMinusScaled*pMinusScaled).sum()
  SSMa = (pMinusCal*pMinusCal).sum()
  SSMb = (pMinusVer*pMinusVer).sum()

  MSE = SSD
  RE  = 1 - SSD/SSMa
  CE  = 1 - SSD/SSMb

  glk = GLK(pAnom2, scaled2)

  rcal, p = pearsonr(pAnom1, scaled1)
  rver, p = pearsonr(pAnom2, scaled2)

  return { 'mse': MSE, 're': RE, 'ce': CE, 'rcal': rcal, 'rver': rver, 'glk': glk }
Example #26
        def train(df, target='ki'):
            y = df['ki']
            columns = df.columns.tolist()
            columns.remove('ki')
            X = df[columns]

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.5)

            pipe_line = Pipeline([
                ('tfidf', TfidfVectorizer(max_df=1.0,
                                          min_df=0.0,
                                          lowercase=False,
                                          token_pattern=r'(?u)\b\S+\b',
                                          analyzer='word')
                 ), ('model', RandomForestRegressor(n_estimators=50,
                                                    n_jobs=1))
            ])

            pipe_line.fit(X_train['tokens'], y_train)
            ofn = "/work/jaydy/working/pdbbind/PDBBind_refined_15/rf.{}.pkl".format(
                target)
            joblib.dump(pipe_line, ofn)

            prediction = pipe_line.predict(X_test['tokens'])
            corr = pearsonr(y_test, prediction)

            dfire_corr = pearsonr(y_test, X_test['dfire'])
            uncorr_dfire_corr = pearsonr(y_test, X_test['uncor_dfire'])

            print("rf correlation: {}".format(corr))
            print("dfire correlation: {}".format(dfire_corr))
            print("uncorrelated-dfire correlation: {}".format(
                uncorr_dfire_corr))
Example #27
 def wordsim(self, path = "wordsim/wordsim353/combined.tab"):
     (pairs, scores) = self.loadCorpus(path)
     #m = self.loadSenna("../senna/embeddings/embeddings.txt","../senna/hash/words.lst") #dict{str: np.array()}
     m = Word2Vec.load_word2vec_format("../google_data/GoogleNews-vectors-negative300.bin.gz", binary=True)
     print "--- Original Pairs: ---"
     for pair in pairs:
         print pair
     words = set(m.index2word)
     (pairs,nums) = self.checkWords(m, pairs)
     print "--- After Matching: ---"
     ### For WS dataset.
     #nums = [0, 1, 2, 3, 5, 7, 8, 9, 11, 12, 13, 16, 17, 19, 23, 24, 25, 27, 28, 29, 30, 31, 32, 36, 37, 40, 43, 44, 49, 54, 55, 56, 57, 58, 59, 60, 61, 62, 65, 70, 74, 75, 83, 84, 85, 86, 88, 90, 94, 96, 97, 98, 99, 100, 102, 107, 109, 110, 111, 112, 113, 114, 115, 116, 117, 119, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 135, 136, 137, 141, 142, 146, 147, 148, 150, 151, 152, 153, 154, 155, 156, 161, 162, 163, 164, 165, 169, 171, 173, 174, 177, 178, 183, 184, 188, 190, 191, 194, 197, 198, 206, 210, 213, 214, 218, 219, 220, 221, 224, 225, 226, 227, 228, 230, 235, 238, 242, 247, 255, 256, 257, 259, 260, 267, 269, 273, 275, 277, 278, 279, 280, 282, 285, 286, 287, 288, 289, 291, 296, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 314, 317, 318, 320, 321, 324, 325, 332, 334, 335, 336, 340, 343, 344, 347, 348, 350, 351, 352]
     print nums
     print "Original Number of Words",len(pairs)
     for pair in pairs:
         print pair
     matched_pairs = [pairs[num] for num in nums]
     matched_scores = [scores[num] for num in nums]
     print "--- After deleting unmatched: ---"
     print "Number of remaining words", len(matched_pairs)
     print matched_pairs
     print matched_scores
     cosine_scores = []
     for tmp in matched_pairs:
         cosine = 1 - spatial.distance.cosine(m[tmp[0]], m[tmp[1]])
         cosine_scores.append(cosine)
     print "--- After calculating cosine scores:--- "
     print cosine_scores
     print "--- Spearman Corelation ---"
     print stats.spearmanr(matched_scores, cosine_scores)
     print stats.pearsonr(matched_scores, cosine_scores)
Example #28
def new_cibersort():

	jurkatA = 0.25; jurkatB = 0.05; jurkatC = 0.01; jurkatD = 0.002;
	im9A = 0.125; im9B = 0.317; im9C = 0.495; im9D = 0.333;
	rajiA = 0.25; rajiB = 0.475; rajiC = 0.165; rajiD = 0.333;
	thp1A = 0.375; thp1B = 0.158; thp1C = 0.33; thp1D = 0.333;

	i = 0
	result = []
	while i <= 95:
		liste = []
		j = 0
		while j <= 95:
			cells = []
			for k in range(2, 6):
				line = linecache.getline('../../../Master_files/output/CIBERSORT_R_log_' + str(i) + '_' + str(j), k)
				cells.append([float(line.split('\t')[1]), float(line.split('\t')[2]), float(line.split('\t')[3]), float(line.split('\t')[4])])

			corrA = stats.pearsonr(cells[0], [jurkatA, im9A, rajiA, thp1A])
			corrB = stats.pearsonr(cells[1], [jurkatB, im9B, rajiB, thp1B])
			corrC = stats.pearsonr(cells[2], [jurkatC, im9C, rajiC, thp1C])
			corrD = stats.pearsonr(cells[3], [jurkatD, im9D, rajiD, thp1D])

			liste.append((corrA[0] + corrB[0] + corrC[0] + corrD[0]) / 4.0)
			j += 5
		result.append(liste)
		i += 5
	#result = reversed(result)
	result = [list(x) for x in zip(*result)]

	return result
Example #29
    def _call(self, dataset):
        """Computes the aslmap_dcm = sl_dcm(group_data)verage correlation in similarity structure across chunks."""
        
        chunks_attr = self.chunks_attr
        nchunks = len(np.unique(dataset.sa[chunks_attr]))
        if nchunks < 2:
            raise StandardError("This measure calculates similarity consistency across "
                                "chunks and is not meaningful for datasets with only "
                                "one chunk:")

        #calc neur sim b/w targ_comp targets per subject
        neur_sim={}
        for s in np.unique(dataset.sa[chunks_attr]):
            ds_s = dataset[dataset.sa.chunks == s]
            neur_sim[s+'1'] = 1 - np.corrcoef(ds_s[ds_s.sa.targets == self.targ_comp1[0]],ds_s[ds_s.sa.targets == self.targ_comp1[1]])[0][1]            
            neur_sim[s+'2'] = 1 - np.corrcoef(ds_s[ds_s.sa.targets == self.targ_comp2[0]],ds_s[ds_s.sa.targets == self.targ_comp2[1]])[0][1]            

        #combine xSs_behavs
        xSs_behav = {}
        for s in self.xSs_behav1:
            xSs_behav[s+'1'] = self.xSs_behav1[s]
        for s in self.xSs_behav2:
            xSs_behav[s+'2'] = self.xSs_behav2[s]

        #create dsets where cols are neural sim and mt sim for correlations
        behav_neur = np.array([[xSs_behav[s],neur_sim[s]] for s in neur_sim])
        #correlate behav with neur sim b/w subjects
        if self.comparison_metric == 'spearman':
            xSs_corr = pearsonr(rankdata(behav_neur[:,0]),rankdata(behav_neur[:,1]))
        else:
            xSs_corr = pearsonr(behav_neur[:,0],behav_neur[:,1])
        
        #returns Fisher z-transformed r coeff; could change to be p value if wanted...
        return Dataset(np.array([np.arctanh(xSs_corr[0])])) 
Example #30
 def extractExpFit(self,sample):
     samplingRange = sample.shape[1]
     sampleLen = sample.shape[0]
     
     sampleFl = sample.astype(np.float32)
     totals = np.array([0.0,0.0,0.0])
     
     for iPerpProfile in range(0,sample.shape[0]):
         profile = sampleFl[iPerpProfile,:]
         if(len(profile.shape) == 2):
             for iChannel in range(0,3):
                 y = np.log(profile[:,iChannel])
                 cor = -st.pearsonr(np.arange(0,samplingRange),y)[0]
                 if(math.isnan(cor)):
                     cor = 0.0
                 totals[iChannel] += cor
         else:
             y = np.log(profile)
             cor = -st.pearsonr(np.arange(0,samplingRange),y)[0]
             if(math.isnan(cor)):
                 cor = 0.0 
             totals[0] += cor
     totals /= sampleLen    
     
     if(len(sample.shape) == 3):
         return np.array(totals,dtype=ExtractTool.outputDType)
     else:
         return ExtractTool.outputDType(totals[0])
Example #31
 def r2(x, y):
     return stats.pearsonr(x, y)[0] ** 2
Example #32
def PCA1():
       
       print (rcsetup.all_backends)
       
       data = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data", header=None)
       
       data.columns
       # rename column names to be similar to R naming convention
       data.columns = ["V"+str(i) for i in range(1, len(data.columns)+1)]  
       data.V1 = data.V1.astype(float)
       # independent variables data
       X = data.loc[:, "V1":]  
       # dependent variable data
       Y = data.V1  
       #data
       #print (X)
       
       
       #if you want them stacked vertically 
       #f, (ax1, ax2, ax3) = plt.subplots(1, 3)
       
#==============================================================================
# Scatter plot
#==============================================================================
       pd.tools.plotting.scatter_matrix(data.loc[:, "V2":"V6"], diagonal="hist")
       plt.tight_layout()
       plt.show()
       sns.lmplot("V4", "V5", data, hue="V1", fit_reg=True)
       #ax.xaxis.tick_top()

#==============================================================================
# Profile plot
#==============================================================================
       ax = data[["V2","V3","V4","V5","V6"]].plot()
       plt.figure()
       ax.legend(loc='center left', bbox_to_anchor=(1, 0.5));

#==============================================================================
# Summary statistics
#==============================================================================      
      
       '''
       print (X.apply(np.mean))
       print (X.apply(np.std))
       '''

#==============================================================================
 #Extract out just cultivar 2 - for example (same can be done for cultivar 1 and 3)
#==============================================================================       

       '''
       class2data = data[Y==2] 
       print (class2data.loc[:, "V2":].apply(np.mean))
       print (class2data.loc[:, "V2":].apply(np.std))
       '''
       
#==============================================================================
# Within and Between Groups Variance 
#==============================================================================       
       #printMeanAndSdByGroup(X, Y)
       
       '''
       print (calcWithinGroupsVariance(X.V2, Y))
       print (calcBetweenGroupsVariance(X.V2, Y))
       calcSeparations(X, Y)
       print ("Within Group Co-Variance = ", calcWithinGroupsCovariance(X.V8, X.V11, Y))
       print ("Between Group Co-Variance = ", calcBetweenGroupsCovariance(X.V8, X.V11, Y))
       '''

#==============================================================================
# Correlation matrix (text) and the heatmap
#==============================================================================      
       
       corrmat = X.corr()
       print ("\n *****FIRST DATA OUTPUT: Co-orelation matrix*****::\n\n", corrmat)
       plt.figure()
       sns.heatmap(corrmat, vmax=1., square=True)
       ax.xaxis.tick_top()

#==============================================================================
# Most highly correlated
#==============================================================================       
       
       cor = stat.pearsonr (X.V2, X.V3)
       print ("\n ***** SECOND DATA OUTPUT *****::\n\n")
       print ("Cor:", cor[0], "\t p-value:", cor[1], "\n")
       print ("\n ***** THIRD DATA OUTPUT *****::\n\n")       
       print (mosthighlycorrelated(X, 10))
          
#==============================================================================
# Standardize before running PCA
#==============================================================================
       
       standardisedX = scale(X)
       standardisedX = pd.DataFrame(standardisedX, index=X.index, columns=X.columns)
       standardisedX.apply(np.mean)
       standardisedX.apply(np.std)
       
#==============================================================================
# Run the PCA process
#==============================================================================
       '''
       PCA Process
       '''
       pca = PCA().fit(standardisedX)
       summary = pca_summary(pca, standardisedX)
       plt.figure()
       screeplot(pca, standardisedX)

#==============================================================================
# First Principal Component
#==============================================================================                    
       print ("\n ***** FIRST PRINCIPAL COMPONENT *****::\n\n")
       print (pca.components_[0])
       print ("Sum of Variances:", np.sum(pca.components_[0]**2))

       #Calculate the values of the first principal component
       print (calcpc(standardisedX, pca.components_[0]))
       #Another way - Calculate the values of the first principal component
       #print (pca.transform(standardisedX)[:, 0])
       
#==============================================================================
# Second Principal Component
#==============================================================================
       print ("\n ***** SECOND PRINCIPAL COMPONENT *****::\n\n")
       print (pca.components_[1])
       print ("Sum of Variances: ", np.sum(pca.components_[1]**2))
       
       #Calculate the values of the second principal component
       print (calcpc(standardisedX, pca.components_[1]))
       #Another way - Calculate the values of the second principal component
       #print (pca.transform(standardisedX)[:, 1])

#==============================================================================
# Scatter Plot for the principal components
#==============================================================================       

       pca_scatter(pca, standardisedX, Y)
       
     
       return
Example #33
import pandas as pd
from scipy.stats import pearsonr

df = pd.read_csv('wdbc.csv')
df.set_index('ID_number', inplace=True)

corr, _ = pearsonr(df.area, df.perimeter)
print("Pearson's correlation coefficient of Area and Perimeter: %.3f" % corr)
print(
    'The pearsonr() SciPy function is used to calculate the Pearson correlation coefficient between two data samples, \n'
    "Area and Perimeter. The Pearson's correlation coefficient of Area and Perimeter is 0.987, which indicates a very \n"
    'strong positive correlation between the area and perimeter of the cell nucleus. Therefore, we can infer that the \n'
    'area of a cell nucleus increases with its perimeter.')
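A minimal, hedged aside (synthetic data, not from wdbc.csv) showing the full (coefficient, p-value) pair that the snippet above discards with `_`:

import numpy as np
from scipy.stats import pearsonr

rng = np.random.default_rng(1)
a = rng.normal(size=100)
b = 2.0 * a + rng.normal(scale=0.1, size=100)   # nearly proportional pair, like area vs. perimeter
r, p = pearsonr(a, b)
print('r = %.3f, p = %.3g' % (r, p))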
def validate():
    """
    Run k-fold cross-validation for the regression model.
    """
    #defining directories    
    dir_in = "/lustre/fs0/home/mtadesse/eraFiveLag"
    dir_out = "/lustre/fs0/home/mtadesse/mlrValidation"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    
    #cd to the lagged predictors directory
    os.chdir(dir_in)
    
    x = 52
    y = 53
    
    #empty dataframe for model validation
    df = pd.DataFrame(columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse'])
    
    #looping through 
    for tg in range(x,y):
        
        os.chdir(dir_in)

        tg_name = os.listdir()[tg]
        print(tg, tg_name)
        
        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis = 1)

        #standardize predictor data
        dat = pred.iloc[:,1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis = 1)
        
    
        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index, axis = 0, inplace = True)
        surge.reset_index(inplace = True)
        surge.drop('index', axis = 1, inplace = True)
        
        
        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns = ['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis = 1)
    
        print("it's all good!", '\n')
        print(pred_standardized.shape)
    
        # #merge predictors and surge to find common time frame
        # pred_surge = pd.merge(pred_standardized, surge_new.iloc[:,:2], on='date', how='right')
        # pred_surge.sort_values(by = 'date', inplace = True)
        predSubset = pred_standardized.iloc[:,:2]
        dateSubset = pd.merge(predSubset, surge_new.iloc[:,:2], on ='date', how = 'left')
        pred_standardized['surge'] = dateSubset['surge']
        pred_surge = pred_standardized.copy()
        del pred_standardized
        
        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis =1)]
        pred_surge.drop(row_nan.index, axis = 0, inplace = True)
        pred_surge.reset_index(inplace = True)
        pred_surge.drop('index', axis = 1, inplace = True)
        
        
        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-'*80)
            print("Predictors and Surge don't overlap")
            print('-'*80)
            continue
        
     
        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])
        
        #prepare data for training/testing
        X = pred_surge.iloc[:,1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis = 1, inplace = True)
        
        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)
        
        #apply 10 fold cross validation
        kf = KFold(n_splits=10, random_state=29)
        
        metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs'])
        for train_index, test_index in kf.split(X):
            X_train, X_test = X_pca[train_index], X_pca[test_index]
            y_train, y_test = y['surge'][train_index], y['surge'][test_index]
            
            #train regression model
            lm = LinearRegression()
            lm.fit(X_train, y_train)
            
            #predictions
            predictions = lm.predict(X_test)
            # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #                       pd.DataFrame(np.array(y_test))], \
            #                      axis = 1)
            # pred_obs.columns = ['pred', 'obs']
            # combo = pd.concat([combo, pred_obs], axis = 0)    
            
            #evaluation matrix - check p value
            if stats.pearsonr(y_test, predictions)[1] >= 0.05:
                print("insignificant correlation!")
                continue
            else:
                print(stats.pearsonr(y_test, predictions))
                metric_corr.append(stats.pearsonr(y_test, predictions)[0])
                print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
                metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
            
        
        #number of years used to train/test model
        num_years = (pred_surge['date'][pred_surge.shape[0]-1] -\
                             pred_surge['date'][0]).days/365
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1] #number of principal components
        corr = np.mean(metric_corr)
        rmse = np.mean(metric_rmse)
        
        print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',np.mean(metric_corr), ' -  avg_rmse (m) = ', \
              np.mean(metric_rmse), '\n')
        
        #original size and pca size of matrix added
        new_df = pd.DataFrame([tg_name, longitude, latitude, num_years, num_pc, corr, rmse]).T
        new_df.columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse']
        df = pd.concat([df, new_df], axis = 0)
        
        
        #save df as cs - in case of interruption
        os.chdir(dir_out)
        # deltaName = dir_out.split('\\')[-1]
        # saveName = 'era5_lrreg_kfold_'+deltaName+'.csv'
        df.to_csv(tg_name)
        
        #cd to dir_in
        os.chdir(dir_in)
Example #35
 def score(self, preds, targets):
     #preds['target'][0, 0] += random.random()*1e-6
     score, _ = pearsonr(preds['target'][:, 0], targets['target'])
     base_score, _ = pearsonr(preds['target'][:, 0],
                              shuffle(targets['target']))
     return "{0:.4f}/{1:.4f}".format(score, base_score)
Example #36
                savefig('figures/draw_graph_fa_{}_cluster_trajectory_age_k{}.png'
                        .format(NAMESPACE, knn), ax)

    if VIZ_CORR_PSEUDOTIME:
        sc.pp.neighbors(adata, n_neighbors=20)

        draw_graph(adata, layout='fa')

        tprint('Diffusion pseudotime analysis...')

        tprint('pseudotime')
        sc.tl.diffmap(adata)
        adata.uns['iroot'] = np.flatnonzero(adata.obs['age'] < 14.6)[0]
        sc.tl.dpt(adata)
        finite_idx = np.isfinite(adata.obs['dpt_pseudotime'])
        tprint(pearsonr(adata.obs['dpt_pseudotime'][finite_idx],
                        adata.obs['age'][finite_idx]))
        tprint(spearmanr(adata.obs['dpt_pseudotime'][finite_idx],
                         adata.obs['age'][finite_idx]))

        ax = sc.pl.draw_graph(
            adata, color='dpt_pseudotime', edges=True, edges_color='#CCCCCC',
            color_map='inferno', show=False,
        )
        savefig('figures/draw_graph_fa_{}_cluster_trajectory_dpt.png'
                .format(NAMESPACE), ax)

        pair2corr = {}
        assert(len(gene_pairs) == X_dimred.shape[1])
        for pair_idx, pair in enumerate(gene_pairs):
            pair2corr[pair] = pearsonr(
                X_dimred[finite_idx, pair_idx],
    return val1


def p_val_mi(x, y):
    count = 0.0
    iterations = 10000
    score = metrics.adjusted_mutual_info_score(x, y)
    for i in range(iterations):
        shuffle(x)
        shuffle(y)
        if metrics.adjusted_mutual_info_score(x, y) >= score:
            count += 1.0
    return count / iterations
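For comparison with the lambdas defined just below, a hedged sketch (not from the original notebook) of the same permutation idea applied to Pearson r instead of adjusted mutual information:

import numpy as np
from scipy import stats

def p_val_pearson(x, y, iterations=10000, seed=0):
    # permutation analogue of p_val_mi: fraction of shuffles of y whose |r|
    # is at least as large as the observed |r|
    rng = np.random.default_rng(seed)
    x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
    r_obs = abs(stats.pearsonr(x, y)[0])
    hits = sum(abs(stats.pearsonr(x, rng.permutation(y))[0]) >= r_obs
               for _ in range(iterations))
    return hits / iterations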


pearsonc = lambda x, y: stats.pearsonr(np.array(x), np.array(y))[0]
p_val_test1 = lambda x, y: stats.pearsonr(np.array(x), np.array(y))[1]

ajd_mi_bin = lambda x, y: metrics.adjusted_mutual_info_score(
    bin_variable(x), bin_variable(y))
p_val_test2 = lambda x, y: p_val_mi(bin_variable(x), bin_variable(y))

spearmanc = lambda x, y: stats.spearmanr(np.array(x), np.array(y))[0]

# In[ ]:

## Evaluating p-distribution for p-value computation
num_samples_estimation = 100000
sig_to_noise_rate = [j * 0.1 for j in range(21)]  #0, 0.1 ,...,0.9, 1,...2
sig_to_noise_rate[0] = 0.0001
#num_points=[k*10 for k in range(1,16)]#10,20,30,...150
Example #38

df = pd.read_csv('ANDR1602_clean.csv', sep=',')
# df = df.drop(columns=["id", "time step"])
features = ["wind direction", "temperature", "humidity", "pressure",
            "dewpoint", "wind speed at 2 meters", "solar radiation"]
target = ["wind speed"]

X = [i for i in range(samples)]
features_df = df[features]
y = df[target]

# plt.scatter(X_test, y_test_true, color='red')
# plt.plot(idx_test, y_test_true, color='blue')
# plt.plot(X[10:100], y[10:100], color='pink')
# plt.show()

#### correlation calculation ###
for f_index, f in enumerate(features):
    my_feature = features_df.values[:, f_index]
    my_target = y.values[:, 0]
    coeff, pvalue = pearsonr(my_feature, my_target)
    print("feature:", f, ", yield:", target[0], "; coeff:", coeff, ";p-value:", pvalue)

# yield_year_count = len(labels.columns)
# for f in range(features):
#     for y in range(yield_year_count):
#         f_label = df.columns[8+f]
#         y_label = df.columns[3 + y]
#         coeff, pvalue = pearsonr(dat[:, f], labels.values[:, y])
#         print("feature:", f_label, ", yield:", y_label, "; coeff:", coeff, ";p-value:", pvalue)
Example #39
def bounds(input_bigWig, peaks_df, peak_width, smoothing_params=[7, 81]):
    """
        Function to compute lower & upper bounds, and average profile
        performance for cross entropy and jsd metrics
        
        Args:
            input_bigWig (str): path to bigWig file
            
            peaks_df (str): pandas dataframe containing peaks 
                information. 
                
                The dataframe should have 'chrom', 'start', and 'end'
                as first 3 columns. Each peak should have the same
                width (equal to peak_width) i.e 'end' - 'start' is the
                same for all rows in the dataframe.
                
            peak_width (int): width of each peak.
            
            smoothing_params (list): list of length 2, containing sigma
                and window_size values for 1D gaussian smoothing of 
                profiles
        
        Returns:
            tuple: (numpy array of average profile, pandas dataframe
                with bounds values in columns)
                
    """
    
    # compute the average profile
    print("Computing average profile ...")
    avg_profile = get_average_profile(input_bigWig, peaks_df, peak_width)
    
    # get average profile as probabilities
    avg_profile_prob = avg_profile / np.sum(avg_profile)
    
    # open the bigWig file for reading
    bw = pyBigWig.open(input_bigWig)
        
    # arrays to hold metrics values for mnll, cross entropy, jsd, 
    # pearson and spearman correlation of the peak profile computed 
    # against uniform, average and self(observed peak) profile

    # mnll
    mnll_uniform = np.zeros(peaks_df.shape[0])
    mnll_average = np.zeros(peaks_df.shape[0])
    mnll_self = np.zeros(peaks_df.shape[0])    
    
    # cross entropy
    ce_uniform = np.zeros(peaks_df.shape[0])
    ce_average = np.zeros(peaks_df.shape[0])
    ce_self = np.zeros(peaks_df.shape[0])
    
    # jsd
    jsd_uniform = np.zeros(peaks_df.shape[0])
    jsd_average = np.zeros(peaks_df.shape[0])
    jsd_self = np.zeros(peaks_df.shape[0])
    
    # pearson
    pearson_uniform = np.zeros(peaks_df.shape[0])
    pearson_average = np.zeros(peaks_df.shape[0])
    pearson_self = np.zeros(peaks_df.shape[0])
    
    # spearman
    spearman_uniform = np.zeros(peaks_df.shape[0])
    spearman_average = np.zeros(peaks_df.shape[0])
    spearman_self = np.zeros(peaks_df.shape[0])

    print("Computing bounds ...")

    # iterate through all peaks
    for idx, row in tqdm(peaks_df.iterrows(), desc='peak', 
                         total=peaks_df.shape[0]):

        # raise exception if 'end' - 'start' is not equal to peak_width
        if (row['end'] - row['start']) != peak_width:

            raise quietexception.QuietException(
                "Inconsistent peak width found at: {}:{}-{}".format(
                    row['chrom'], row['start'], row['end']))

        # get bigWig profile
        profile = np.nan_to_num(
            bw.values(row['chrom'], row['start'], row['end']))

        # if we find that the profile at this peak is all zeros
        if sum(profile) == 0:

            print("Found 'zero' profile at {}: ({}, {})".format(
                row['chrom'], row['start'], row['end']))

            # assign nans to all 
            mnll_uniform[idx] = np.nan
            mnll_average[idx] = np.nan
            mnll_self[idx] = np.nan

            ce_uniform[idx] = np.nan
            ce_average[idx] = np.nan
            ce_self[idx] = np.nan

            jsd_uniform[idx] = np.nan
            jsd_average[idx] = np.nan
            jsd_self[idx] = np.nan

            pearson_uniform[idx] = np.nan
            pearson_average[idx] = np.nan
            pearson_self[idx] = np.nan

            spearman_uniform[idx] = np.nan
            spearman_average[idx] = np.nan
            spearman_self[idx] = np.nan

            continue

        # uniform distribution profile
        uniform_profile = np.ones(peak_width) * (1.0 / peak_width)

        # smoothed profile 
        profile_smooth = gaussian1D_smoothing(profile, smoothing_params[0], 
                                              smoothing_params[1])

        # smoothed profile as probabilities 
        profile_smooth_prob = profile_smooth / np.sum(profile_smooth)

        # profile as probabilities
        profile_prob = profile / np.sum(profile)

        # mnll of profile with uniform profile
        mnll_uniform[idx] = mnll(profile, probs=uniform_profile)

        # mnll of profile with average profile
        mnll_average[idx] = mnll(profile, probs=avg_profile_prob)

        # mnll of profile with itself
        mnll_self[idx] = mnll(profile, probs=profile_prob)

        # cross entropy of profile with uniform profile
        ce_uniform[idx] = profile_cross_entropy(profile, 
                                                probs=uniform_profile)

        # cross entropy of profile with average profile
        ce_average[idx] = profile_cross_entropy(profile, 
                                                probs=avg_profile_prob)

        # cross entropy of profile with itself
        ce_self[idx] = profile_cross_entropy(profile, probs=profile_prob)

        # jsd of profile with uniform profile
        jsd_uniform[idx] = jensenshannon(profile_prob, uniform_profile)

        # jsd of profile with average profile
        jsd_average[idx] = jensenshannon(profile_prob, avg_profile_prob)

        # jsd of profile with itself (upper bound)
        jsd_self[idx] = 0.0

        # pearson of profile with uniform profile
        ### nothing to do ... leave it as zeros

        # pearson of profile with average profile
        pearson_average[idx] = pearsonr(profile, avg_profile_prob)[0]
        
        # pearson of profile with itself
        pearson_self[idx] = pearsonr(profile, profile)[0]
        
        # spearman of profile with uniform profile
        ### nothing to do ... leave it as zeros

        # spearman of profile with average profile
        spearman_average[idx] = spearmanr(profile, avg_profile_prob)[0]

        spearman_self[idx] = spearmanr(profile, profile)[0]

    # create a pandas dataframe to hold the upper & lower bound, 
    # and avg profile performance values 
    column_names = ['mnll_uniform', 'mnll_average', 'mnll_self',
                    'ce_uniform', 'ce_average', 'ce_self',
                    'jsd_uniform', 'jsd_average', 'jsd_self',
                    'pearson_uniform', 'pearson_average', 'pearson_self', 
                    'spearman_uniform', 'spearman_average', 'spearman_self']
    
    # create a pandas dataframe to store all the bounds values
    bounds_df = pd.DataFrame(columns = column_names)
        
    # assign values to the dataframe columns
    bounds_df['mnll_uniform'] = np.nan_to_num(mnll_uniform)
    bounds_df['mnll_average'] = np.nan_to_num(mnll_average)
    bounds_df['mnll_self'] = np.nan_to_num(mnll_self)
    bounds_df['ce_uniform'] = np.nan_to_num(ce_uniform)
    bounds_df['ce_average'] = np.nan_to_num(ce_average)
    bounds_df['ce_self'] = np.nan_to_num(ce_self)
    bounds_df['jsd_uniform'] = np.nan_to_num(jsd_uniform)
    bounds_df['jsd_average'] = np.nan_to_num(jsd_average)
    bounds_df['jsd_self'] = np.nan_to_num(jsd_self)
    bounds_df['pearson_uniform'] = np.nan_to_num(pearson_uniform)
    bounds_df['pearson_average'] = np.nan_to_num(pearson_average)
    bounds_df['pearson_self'] = np.nan_to_num(pearson_self)
    bounds_df['spearman_uniform'] = np.nan_to_num(spearman_uniform)
    bounds_df['spearman_average'] = np.nan_to_num(spearman_average)
    bounds_df['spearman_self'] = np.nan_to_num(spearman_self)

    return avg_profile, bounds_df
    100.11, 104.53, 106.46, 92.33, 101.0, 99.53, 116.2, 97.9, 102.54, 111.68,
    85.02, 109.92, 99.53, 80.96, 71.91, 99.73, 92.6, 75.75, 98.29, 104.49,
    112.13
]

dexterity1 = np.array(dexterity1)
dexterity2 = np.array(dexterity2)

kwargs = dict()
for key in ['mse', 'corr', 'r2', 'betas']:
    data = apply_function(dataframes[0],
                          keys=['nodes_1', 'nodes_2', 'y_attr', 'band'],
                          attr=key,
                          fx=lambda x: np.mean(x))

    r = np.array([pearsonr(zscore(dexterity1), v) for v in data[key].values])
    data['r'] = r[:, 0]
    data['p'] = r[:, 1]
    #kwargs['p'] = [pearsonr(dexterity1, v)[1] for v in data[key].values]

    grid = sns.FacetGrid(data, col="y_attr", row="band")
    grid.map(plot_matrix, "nodes_1", "nodes_2", 'r', 'p', **kwargs)


def plot_matrix(nodes1, nodes2, acc, p, **kwargs):

    df = dict(n1=nodes1, n2=nodes2, a=acc)
    df = pd.DataFrame(df)
    pdf = df.pivot(index="n1", columns="n2", values="a")

    nz = np.nonzero(np.isnan(pdf.values))
def custom(a, b):
    v,_ = stats.pearsonr(a, b)
    return round(v, 4)
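
# A hedged usage sketch (not from the original source): a two-argument helper like this
# matches the callable signature pandas accepts for DataFrame.corr(method=...), so it can
# produce a rounded correlation matrix. The demo frame below is purely illustrative.
import numpy as np
import pandas as pd
from scipy import stats

demo = pd.DataFrame({"a": np.arange(10.0), "b": np.arange(10.0) ** 2})
print(demo.corr(method=custom))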
Example #42
0
File: ptdt.py Project: edg1983/ptdt
def ptdt_analysis(PRS, PRS_iid, PRS_prs, structured):
    '''
    Takes a PRS score file and structure matrix and returns
    pTDT summary statistics.
    
    '''

    PRS_values = {}
    for line in PRS:
        try:
            PRS_values[line.split()[int(PRS_iid)]] = float(line.split()[int(PRS_prs)]) 
            #creating dict for each ID to call its PRS
        except ValueError:
            continue
            
    print('Creating pTDT matrix.', end="") #progress
    data = [] #initialize list to store each row of matched matrix

    for list_ in structured:
        row = [] #create empty list per row
        values = list_
        row.append(values[0])
        n = 4
        if quad == 'True': #Include sibling PRS if quad flag called
            n += 1
        for i in range(1,n): #fill each place in list with values from dict
            try:
                row.append(str(PRS_values[values[i]]))

            except KeyError: 
                row.append('NA') #return NA for missing PRS values
        data.append(row)

    print('.', end="")
    if quad == 'True': #Create a sibling column if quad flag called
        matrix = pd.DataFrame(data, columns=['ID','Proband','Father','Mother','Sibling'])
    else: 
        matrix = pd.DataFrame(data, columns=['ID','Proband','Father','Mother'])
    # convert dict values to numeric and force NaNs from missing
    IDs = matrix['ID']
    matrix = matrix.apply(pd.to_numeric, errors='coerce')
    matrix['ID'] = IDs #rescue non numeric IDs

    print('.', end="")

    # create output matrix
    output = pd.DataFrame(index = range(0, matrix.shape[0]), columns = ["FID","mp_PRS","pro_PRS","pro_pTDT"])

    output['FID'] = matrix['ID']
    output['mp_PRS'] = (matrix.iloc[:,2]+ matrix.iloc[:,3])/2
    output['pro_PRS'] = matrix['Proband']
    sd = np.std(output['mp_PRS'], ddof=1)
    output['pro_pTDT'] = (output.iloc[:,2] - output.iloc[:,1])/sd
    if quad == 'True':
        output['sib_PRS'] = matrix['Sibling']
        output['sib_pTDT'] = (output.iloc[:,4] - output.iloc[:,1])/sd
    print(' done.')

    # Quality control
    output1 = output.dropna(subset=['pro_PRS','mp_PRS'])
    corr1 = ss.pearsonr(output1['pro_PRS'],output1['mp_PRS'])[0]
    if quad == 'True':
        output2 = output.dropna(subset=['sib_PRS','mp_PRS'])
        corr2 = ss.pearsonr(output2['sib_PRS'],output2['mp_PRS'])[0]
    else:
        corr2 = 1
    if corr1 >= .2 and corr2 >= .2:
        print('QC pass.')
        log.write('QC pass.\n')
    else:
        print('WARNING: QC fail - Low correlation between mid-parent PRS and proband/sibling PRS.')
        log.write('WARNING: QC fail - Low correlation between mid-parent PRS and proband/sibling PRS.\n')
    
    # t-test
    x = output['pro_pTDT']
    x_totlength = len(x)
    x = x[~np.isnan(x)] # remove NaNs 
    x_usedlength = len(x)
    print('{0} probands used in pTDT analysis ({1} skipped due to missingness).' .format(x_usedlength,x_totlength-x_usedlength))
    log.write('{0} probands used in pTDT analysis ({1} skipped due to missingness).\n' .format(x_usedlength,x_totlength-x_usedlength))
    if x_usedlength == 0:
        raise RuntimeError('Column order in PRS file is incorrect')
    pTDT_mean = np.mean(x)
    pTDT_std = np.std(x,ddof=1)/np.sqrt(len(x))
    pTDT_pvalue = ss.ttest_1samp(x, popmean=0).pvalue

    values = [pTDT_mean,pTDT_std,pTDT_pvalue]

    if quad == 'True':
        y = output['sib_pTDT']
        y_totlength = len(y)
        y = y[~np.isnan(y)]
        y_usedlength = len(y)
        print('{0} siblings used in pTDT analysis ({1} skipped due to missingness).' .format(y_usedlength,y_totlength-y_usedlength))
        log.write('{0} siblings used in pTDT analysis ({1} skipped due to missingness).\n' .format(y_usedlength,y_totlength-y_usedlength))
        unaffected_mean = np.mean(y)
        unaffected_std = np.std(y,ddof=1)/np.sqrt(len(y))
        unaffected_pvalue = ss.ttest_1samp(y, popmean=0).pvalue
        values2 = [unaffected_mean,unaffected_std,unaffected_pvalue]
    
        values = values + values2
    
    if table == 'True':
        output['dad_PRS'] = matrix['Father']
        output['mom_PRS'] = matrix['Mother']
        if quad == 'True':
            cols = ['FID','pro_PRS','sib_PRS','dad_PRS','mom_PRS','mp_PRS','pro_pTDT','sib_pTDT']
        else:
            cols = ['FID','pro_PRS','dad_PRS','mom_PRS','mp_PRS','pro_pTDT']
        output = output[cols]
        output = output.round(4)
        headers = list(output.columns.values)
        # truncate any existing table file before appending the formatted output
        open(outname+'.ptdt.table', 'w+').close()
        output.to_csv(outname+'.ptdt.table', na_rep = 'NA', header=headers, index=None, sep='\t', mode='a+')

    return values
Example #43
0
#nmb_son=100.*((np.nanmean(sites_ammonium_son))- np.nanmean(gc_data_ammonium_son))/np.nanmean(gc_data_ammonium_son)
#nmb_djf=100.*((np.nanmean(sites_ammonium_djf))- np.nanmean(gc_data_ammonium_djf))/np.nanmean(gc_data_ammonium_djf)

nmb_Annual=100.*((np.nanmean(gc_data_ammonium_annual))- np.nanmean(sites_ammonium_AM))/np.nanmean(sites_ammonium_AM)
nmb_mam=100.*((np.nanmean(gc_data_ammonium_mam))- np.nanmean(sites_ammonium_mam))/np.nanmean(sites_ammonium_mam)
nmb_jja=100.*((np.nanmean(gc_data_ammonium_jja))- np.nanmean(sites_ammonium_jja))/np.nanmean(sites_ammonium_jja)
nmb_son=100.*((np.nanmean(gc_data_ammonium_son))- np.nanmean(sites_ammonium_son))/np.nanmean(sites_ammonium_son)
nmb_djf=100.*((np.nanmean(gc_data_ammonium_djf))- np.nanmean(sites_ammonium_djf))/np.nanmean(sites_ammonium_djf)

print(' DEFRA NMB_Annual= ', nmb_Annual)
print(' DEFRA NMB_mam = ', nmb_mam)
print(' DEFRA NMB_jja = ', nmb_jja)
print(' DEFRA NMB_son = ', nmb_son)
print(' DEFRA NMB_djf = ', nmb_djf)
#correlation (drop nan values first, consistent with the seasonal correlations below)
nas_annual = np.logical_or(np.isnan(gc_data_ammonium_annual), np.isnan(sites_ammonium_AM))
correlate_Annual=stats.pearsonr(gc_data_ammonium_annual[~nas_annual],sites_ammonium_AM[~nas_annual])

# dropping nan values and compute correlation
nas_mam = np.logical_or(np.isnan(gc_data_ammonium_mam), np.isnan(sites_ammonium_mam))
correlate_mam = stats.pearsonr(gc_data_ammonium_mam[~nas_mam],sites_ammonium_mam[~nas_mam])

nas_jja = np.logical_or(np.isnan(gc_data_ammonium_jja), np.isnan(sites_ammonium_jja))
correlate_jja = stats.pearsonr(gc_data_ammonium_jja[~nas_jja],sites_ammonium_jja[~nas_jja])

nas_son = np.logical_or(np.isnan(gc_data_ammonium_son), np.isnan(sites_ammonium_son))
correlate_son = stats.pearsonr(gc_data_ammonium_son[~nas_son],sites_ammonium_son[~nas_son])

nas_djf = np.logical_or(np.isnan(gc_data_ammonium_djf), np.isnan(sites_ammonium_djf))
correlate_djf = stats.pearsonr(gc_data_ammonium_djf[~nas_djf],sites_ammonium_djf[~nas_djf])

print('Correlation = ',correlate_Annual)
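
# A refactoring sketch (not in the original script): the drop-NaNs-then-correlate pattern
# above is repeated for every season, so it could be collected into a single helper.
import numpy as np
from scipy import stats

def nan_safe_pearsonr(model_vals, site_vals):
    """Drop positions where either array is NaN, then return (r, p)."""
    keep = ~(np.isnan(model_vals) | np.isnan(site_vals))
    return stats.pearsonr(model_vals[keep], site_vals[keep])

# e.g. correlate_mam = nan_safe_pearsonr(gc_data_ammonium_mam, sites_ammonium_mam)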
Example #44
0
###  The decision to stay or switch is likely more deliberate than
###  the decision to go UP or DN.
###  Including UP and DN separately in the entropy might introduce variability
###  that reflects noise rather than cognitive processing.

fig = _plt.figure(figsize=(10, 10))
if1 = -1
for sfeat1 in ["entropyS", "entropyD", "entropyU"]:
    feat1 = eval(sfeat1)
    if1 += 1
    if2 = -1
    for sfeat2 in ["entropyW2", "entropyT2", "entropyL2"]:
        feat2 = eval(sfeat2)
        if2 += 1
        fig.add_subplot(3, 3, if1*3 + if2 + 1)
        pc, pv = _ss.pearsonr(feat1, feat2)
        _plt.title("%(pc).2f  %(pv).1e" % {"pc" : pc, "pv" : pv}) 
        _plt.scatter(feat1, feat2, color="black", s=5)
        if if2 == 0:
            _plt.ylabel(sfeat1, fontsize=18)
        if if1 == 2:
            _plt.xlabel(sfeat2, fontsize=18)
        _plt.xticks(fontsize=13)
        _plt.yticks(fontsize=13)        
fig.subplots_adjust(wspace=0.25, hspace=0.25)
_plt.savefig("corr_btwn_ent_comps_sim2")

fig = _plt.figure(figsize=(10, 10))
if1 = -1
for sfeat1 in ["entropyS", "entropyD", "entropyU"]:
    feat1 = eval(sfeat1)
Example #45
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr  9 14:59:34 2020

@author: dustan
"""

import numpy as np
import scipy.stats as stats

n = 10
x = np.random.normal(size=n)
y = 2 * x + np.random.normal(size=n)
# Compute with scipy
cor, pval = stats.pearsonr(x, y)
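
# A cross-check added here (not part of the original snippet): the same coefficient
# follows directly from the definition r = cov(x, y) / (std(x) * std(y)).
cor_manual = np.cov(x, y, ddof=1)[0, 1] / (np.std(x, ddof=1) * np.std(y, ddof=1))
assert np.allclose(cor, cor_manual)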
Example #46
0
from scipy.stats import pearsonr
from scipy.stats import linregress
from matplotlib import pyplot as plt
import numpy as np

sat = np.array([595, 520, 715, 405, 680, 490, 565])
gpa = np.array([3.4, 3.2, 3.9, 2.3, 3.9, 2.5, 3.5])

fig1 = plt.figure(1)
ax = plt.subplot(1, 1, 1)

pearson = pearsonr(sat, gpa)

plt.scatter(sat, gpa, label="data")

# Get linear regression parameters
slope, intercept, r_value, p_value, std_err = linregress(sat, gpa)

# Format the chart
plt.xlabel("SAT Scores")
plt.ylabel("GPA")
plt.title(
    "Scatter Plot with Linear Regression Fit\nY=a*X + b\na=%0.4f, b=%0.4f" %
    (slope, intercept))
plt.grid()

# Create linear regression x values
x_lr = sat

# Create linear regression y values: Y = slope*X + intercept
y_lr = slope * x_lr + intercept
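
# A plausible completion (the original excerpt stops here): draw the fitted line,
# label it with the Pearson r computed above, and render the figure.
plt.plot(x_lr, y_lr, color="red", label="fit, r = %0.3f" % pearson[0])
plt.legend()
plt.show()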
Example #47
0
def normalization_and_graphing():
    ### probably useful data structures for this function:
    # vader_polarity_list
    # textblob_polarity_list
    # final_idx_list #Not used within function, but could be useful
    ##################

    global everything_included_list_vader
    everything_included_list_vader = np.log1p(vader_polarity_list)
    global everything_included_list_textblob_polarity
    everything_included_list_textblob_polarity = np.log1p(
        textblob_polarity_list)
    global everything_included_list_textblob_subjectivity
    everything_included_list_textblob_subjectivity = np.log1p(
        textblob_subjectivity_list)

    skew_textblob = skew(textblob_polarity_list)
    skew_vader = skew(vader_polarity_list)
    skew_textblob_subjectivity = skew(textblob_subjectivity_list)
    print("length of final score list: ", len(vader_polarity_list))
    print("length of textblob final score list: ", len(textblob_polarity_list))
    print("length of textblob subjectivity final score list: ",
          len(textblob_subjectivity_list))

    print("skew vader: ", skew_vader)
    print("skew textblo_polarityb: ", skew_textblob)
    print("skew textblob_subjectivity: ", skew_textblob_subjectivity)

    plt_1 = plt.figure(1)
    plt.hist(vader_polarity_list, bins=80, range=[-1, 1], align='mid')
    plt.ylabel("Frequency")
    plt.title('vader_polarity - original')
    plt.show()
    input("Press Enter to continue...")

    plt_2 = plt.figure(2)
    plt.hist(textblob_polarity_list, bins=80, range=[-1, 1], align='mid')
    plt.ylabel("Frequency")
    plt.title('textblob_polarity - original')
    plt.show()
    input("Press Enter to continue...")

    plt_3 = plt.figure(3)
    plt.hist(textblob_subjectivity_list, bins=80, range=[0, 1], align='mid')
    plt.ylabel("Frequency")
    plt.title('textblob_subjectivity - original')
    plt.show()
    input("Press Enter to continue...")

    print('removing outliers vader')

    elements = np.array(vader_polarity_list)

    mean = np.mean(elements)
    sd = np.std(elements)

    temp_id = []
    current_id = 0
    for x in vader_polarity_list:
        if (x > mean - 3 * sd and x < mean + 3 * sd):
            pass  # value is within 3 SD of the mean: keep it
        else:
            temp_id.append(current_id)
        current_id = current_id + 1

    temp_id_1 = temp_id.copy()

    print('removing outliers textblob_polarity')

    elements = np.array(textblob_polarity_list)

    mean = np.mean(elements)
    sd = np.std(elements)

    temp_id = []
    current_id = 0
    for x in textblob_polarity_list:
        if (x > mean - 3 * sd and x < mean + 3 * sd):
            pass  # value is within 3 SD of the mean: keep it
        else:
            temp_id.append(current_id)
        current_id = current_id + 1

    temp_id_2 = temp_id.copy()

    print('removing outliers textblob_subjectivity')

    elements = np.array(textblob_subjectivity_list)

    mean = np.mean(elements)
    sd = np.std(elements)

    temp_id = []
    current_id = 0
    for x in textblob_subjectivity_list:
        if (x > mean - 3 * sd and x < mean + 3 * sd):
            pass  # value is within 3 SD of the mean: keep it
        else:
            temp_id.append(current_id)
        current_id = current_id + 1

    temp_id_3 = temp_id.copy()

    temp_id_4 = temp_id_1 + temp_id_2 + temp_id_3  #temp_id_4 now contains all outlier ids

    current_id = 0
    flag = 0
    for placeholder7 in vader_polarity_list:
        for placeholder8 in temp_id_4:
            if (current_id == placeholder8):
                flag = 1
                break
        if (flag == 0):
            no_outliers_list_vader.append(vader_polarity_list[current_id])
            no_outliers_list_textblob_polarity.append(
                textblob_polarity_list[current_id])
            no_outliers_list_textblob_subjectivity.append(
                textblob_subjectivity_list[current_id])

            no_errors_no_outliers_list_vader.append(
                vader_polarity_list[current_id])
            no_errors_no_outliers_list_textblob_polarity.append(
                textblob_polarity_list[current_id])
            no_errors_no_outliers_list_textblob_subjectivity.append(
                textblob_subjectivity_list[current_id])

            everything_included_outlier_list.append("not outlier")

        if (flag == 1):
            no_outliers_list_vader.append(
                "Outlier ERROR- YOU SHOULD NEVER SEE THIS")
            no_outliers_list_textblob_polarity.append(
                "Outlier ERROR- YOU SHOULD NEVER SEE THIS")
            no_outliers_list_textblob_subjectivity.append(
                "Outlier ERROR- YOU SHOULD NEVER SEE THIS")

            everything_included_outlier_list.append("outlier")

        flag = 0
        current_id = current_id + 1

    print("length of vader_polarity - outliers removed: ",
          len(no_errors_no_outliers_list_vader))
    print("length of textblob_polarity - outliers removed: ",
          len(no_errors_no_outliers_list_textblob_polarity))
    print("length of textblob_subjectivity - outliers removed: ",
          len(no_errors_no_outliers_list_textblob_subjectivity))

    print("outliers removed vader_polarity skew: ",
          skew(no_errors_no_outliers_list_vader))
    print("outliers removed textblob_polarity skew: ",
          skew(no_errors_no_outliers_list_textblob_polarity))
    print("outliers removed textblob_subjectivity skew: ",
          skew(no_errors_no_outliers_list_textblob_subjectivity))

    plt_4 = plt.figure(4)
    plt.hist(no_errors_no_outliers_list_vader,
             bins=80,
             range=[-1, 1],
             align='mid')
    plt.ylabel("Frequency")
    plt.title('vader_polarity - outliers removed')
    plt.show()
    input("Press Enter to continue...")

    plt_5 = plt.figure(5)
    plt.hist(no_errors_no_outliers_list_textblob_polarity,
             bins=80,
             range=[-1, 1],
             align='mid')
    plt.ylabel("Frequency")
    plt.title('textblob_polarity - outliers removed')
    plt.show()
    input("Press Enter to continue...")

    plt_6 = plt.figure(6)
    plt.hist(no_errors_no_outliers_list_textblob_subjectivity,
             bins=80,
             range=[0, 1],
             align='mid')
    plt.ylabel("Frequency")
    plt.title('textblob_subjectivity - outliers removed')
    plt.show()
    input("Press Enter to continue...")

    print("MAKE SURE THESE ARE SAME VALUE: ")
    print(len(no_outliers_list_vader))
    print(len(no_outliers_list_textblob_polarity))
    print(len(no_outliers_list_textblob_subjectivity))

    print("and these: ")

    print("Finding log of both score lists...")

    no_errors_logged_list_vader = np.log1p(no_errors_no_outliers_list_vader)
    no_errors_logged_list_textblob_polarity = np.log1p(
        no_errors_no_outliers_list_textblob_polarity)
    no_errors_logged_list_textblob_subjectivity = np.log1p(
        no_errors_no_outliers_list_textblob_subjectivity)

    i = 0
    for placeholder10 in no_outliers_list_vader:
        if (isinstance(placeholder10, str)):
            logged_list_vader.append("outlier")
            logged_list_textblob_polarity.append("outlier")
            logged_list_textblob_subjectivity.append("outlier")

        else:
            logged_list_vader.append(math.log1p(no_outliers_list_vader[i]))
            logged_list_textblob_polarity.append(
                math.log1p(no_outliers_list_textblob_polarity[i]))
            logged_list_textblob_subjectivity.append(
                math.log1p(no_outliers_list_textblob_subjectivity[i]))

        i = i + 1

    plt_7 = plt.figure(7)
    plt.hist(no_errors_logged_list_vader, bins=80, range=[-1, 1], align='mid')
    plt.ylabel("Frequency")
    plt.title('vader_polarity - normalized')
    plt.show()
    input("Press Enter to continue...")

    plt_8 = plt.figure(8)
    plt.hist(no_errors_logged_list_textblob_polarity,
             bins=80,
             range=[-1, 1],
             align='mid')
    plt.ylabel("Frequency")
    plt.title('textblob_polarity - normalized')
    plt.show()
    input("Press Enter to continue...")

    plt_9 = plt.figure(9)
    plt.hist(no_errors_logged_list_textblob_subjectivity,
             bins=80,
             range=[0, 1],
             align='mid')
    plt.ylabel("Frequency")
    plt.title('textblob_subjectivity - normalized')
    plt.show()
    input("Press Enter to continue...")

    print(
        "Finding The Pearson Correlation Coefficient between vader_polarity scores and textblob_polarity scores..."
    )
    print(
        "Pearson Correlation Coefficient: ",
        pearsonr(no_errors_logged_list_textblob_polarity,
                 no_errors_logged_list_vader))
Example #48
0
def main():
    st.title("Statistical Testing Web App")
    st.sidebar.title("Statistical Testing Web App")
    st.markdown("What would you like to do today?")
    st.sidebar.markdown("Select what would you like to do today")

    data = st.file_uploader("Upload a Dataset", type=["csv", "txt"])
    if data is not None:
        df = pd.read_csv(data)
        st.dataframe(df.head())
        activities = ['Parametric Test', 'Non-Parametric Test','Normality Tests','Correlation']
        choice = st.sidebar.selectbox("Select a Statistical Testing",activities)

        if choice == 'Parametric Test':
             Parametric= ["student's t-test","Pairwise t-test","F-test","chi-square"]
             tests = st.sidebar.selectbox("Select the tests you want to conduct",Parametric)
             if tests == "student's t-test":
                 data1 = st.sidebar.selectbox("Select the 1st variable you want to perform the test on", df.columns.tolist())
                 data2 = st.sidebar.selectbox("Select the 2nd variable you want to perform the test on", df.columns.tolist())
                 
                 if st.sidebar.button("Show results"):
                     st.subheader("student's t-test Results")
                     stat, p = stats.ttest_ind(df[data1],df[data2])
                     st.write("t-statistics: ", stat,"p-value: ", p)
            
                     if p > 0.05:
                         st.write('Do not reject the hypothesis')
                     else:
                         st.write('Reject the hypothesis')
             if tests == "Pairwise t-test":
                 data1 = st.sidebar.selectbox("Select the 1st variable you want to perform the test on", df.columns.tolist())
                 data2 = st.sidebar.selectbox("Select the 2nd variable you want to perform the test on", df.columns.tolist())
                 
                 if st.sidebar.button("Show results"):
                     st.subheader("Pairwise t-test Results")
                     stat, p = stats.ttest_rel(df[data1],df[data2])
                     st.write("t-statistics: ", stat,"p-value: ", p)
            
                     if p > 0.05:
                         st.write('Do not reject the hypothesis')
                     else:
                         st.write('Reject the Hypothesis')
             
             if tests == "F-test":
                 data1 = st.sidebar.selectbox("Select the 1st variable you want to perform the test on", df.columns.tolist())
                 data2 = st.sidebar.selectbox("Select the 2nd variable you want to perform the test on", df.columns.tolist())

                 if st.sidebar.button("Show results"):
                     st.subheader("F-test Results")
                     stat, p = stats.f_oneway(df[data1],df[data2])
                     st.write("F-statistics: ", stat,"p-value: ", p)
        
                     if p > 0.05:
                         st.write('Do not reject the hypothesis')
                     else:
                         st.write('Reject the Hypothesis')
             
             if tests == "chi-square" :
                 data1 = st.sidebar.selectbox("Select the observed frequency", df.columns.tolist())
                 data2 = st.sidebar.selectbox("Select the expected frequency", df.columns.tolist())
                 
                 if st.sidebar.button("Show results"):
                     st.subheader("Chi Square test Results")
                     stat, p = stats.chisquare(df[data1],df[data2],axis=0)
                     st.write("chi square-statistics: ", stat,"p-value: ", p)
        
                     if p > 0.05:
                         st.write('Do not reject the hypothesis')
                     else:
                         st.write('Reject the Hypothesis')

            
            

        elif choice == 'Non-Parametric Test':
            Non_parametric = ['Mann-Whitney','Wilcoxon','Kruskal-Wallis','Friedman']
            tests = st.sidebar.selectbox("Select the tests you want to conduct",Non_parametric)
            if tests == "Mann-Whitney" :
                 data1 = st.sidebar.selectbox("Select the 1st variable to perform the test on", df.columns.tolist())
                 data2 = st.sidebar.selectbox("Select the 2nd variable to perform the test on", df.columns.tolist())
                 
                 if st.sidebar.button("Show results"):
                     st.subheader("Mann Whitney test Results")
                     stat, p = stats.mannwhitneyu(df[data1], df[data2])
                     st.write("Mann Whitney statistics: ", stat,"p-value: ", p)
        
                     if p > 0.05:
                         st.write('Do not reject the hypothesis')
                     else:
                         st.write('Reject the hypothesis')
            
            if tests == "Wilcoxon" :
                 data1 = st.sidebar.selectbox("Select the 1st variable to perform the test on", df.columns.tolist())
                 data2 = st.sidebar.selectbox("Select the 2nd variable to perform the test on", df.columns.tolist())
                 
                 if st.sidebar.button("Show results"):
                     st.subheader("Wilcoxon test Results")
                     stat, p = stats.wilcoxon(df[data1], df[data2])
                     st.write("Wilcoxon statistics: ", stat,"p-value: ", p)
        
                     if p > 0.05:
                         st.write('Do not reject the hypothesis')
                     else:
                         st.write('Reject the hypothesis')
            
            if tests == "Kruskal-Wallis" :
                 data1 = st.sidebar.selectbox("Select the 1st variable to perform the test on", df.columns.tolist())
                 data2 = st.sidebar.selectbox("Select the 2nd variable to perform the test on", df.columns.tolist())
                 
                 if st.sidebar.button("Show results"):
                     st.subheader("Kruskal-Wallis test Results")
                     stat, p = stats.kruskal(df[data1], df[data2])
                     st.write("Kruskal-Wallis statistics: ", stat,"p-value: ", p)
        
                     if p > 0.05:
                         st.write('Do not reject the hypothesis')
                     else:
                         st.write('Reject the hypothesis')
            
            if tests == "Friedman" :
                 data1 = st.sidebar.selectbox("Select the 1st variable to perform the test on", df.columns.tolist())
                 data2 = st.sidebar.selectbox("Select the 2nd variable to perform the test on", df.columns.tolist())
                 data3 = st.sidebar.selectbox("Select the 3rd variable to perform the test on", df.columns.tolist())

                 if st.sidebar.button("Show results"):
                     st.subheader("Friedman Results")
                     stat, p = stats.friedmanchisquare(df[data1], df[data2],df[data3])
                     st.write("Friedman statistics: ", stat,"p-value: ", p)
        
                     if p > 0.05:
                         st.write('Do not reject the hypothesis')
                     else:
                         st.write('Reject the hypothesis')
                
   
        elif choice == 'Normality Tests':
            Normal= ["Shapiro–Wilk","Anderson–Darling","Kolmogorov–Smirnov","Normal-Test"]
            tests = st.sidebar.selectbox("Select the test you want to conduct",Normal)
            if tests == "Shapiro–Wilk" :
                user_input = str(st.text_input('Do you need specific columns? Y or N'))
                if user_input == "Y":
                     selected_columns = st.sidebar.multiselect("Select the variables you want to perform the test on", df.columns.tolist())
                     new_df = df[selected_columns]
                         
                     if st.sidebar.button("Show results"):
                         st.subheader("Shapiro Wilk test Results")
                         stat, p = stats.shapiro(new_df)
                         st.write("Shapiro Wilk statistics: ", stat,"p-value: ", p)
        
                         if p > 0.05:
                             st.write('Probably Gaussian')
                         else:
                             st.write('Probably not Gaussian')
            
                else:
                    if st.sidebar.button("Show results"):
                        stat, p = stats.shapiro(df)
                        st.write("Shapiro Wilk statistics: ", stat,"p-value: ", p)
                        if p > 0.05:
                            st.write('Probably Gaussian')
                        else:
                            st.write('Probably not Gaussian')
        
            if tests == "Anderson–Darling" :
                selected_columns = st.sidebar.multiselect("Select the variables you want to perform the test on", df.columns.tolist())
                new_df = df[selected_columns]
                    
                if st.sidebar.button("Show results"):
                    st.subheader("Anderson–Darling test Results")
                    result = stats.anderson(new_df,dist='norm')
                    st.write("Anderson–Darling statistics: ", result.statistics)
                    for i in range(len(result.critical_values)):
                         sl , cv = result.significance_level[i], result.critical_values[i]
                         if result.statistic < cv:
                                st.write('Probably Gaussian at the %.1f%% level' % (sl))
                         else:
                                st.write('Probably not Gaussian at the %.1f%% level' % (sl))
   
    
            if tests == "Kolmogorov–Smirnov":
                data1 = st.sidebar.selectbox("Select the 1st variable to perform the test on", df.columns.tolist())
                data2 = st.sidebar.selectbox("Select the 2nd variable to perform the test on", df.columns.tolist())
                 
                if st.sidebar.button("Show results"):
                     st.subheader("Kolmogorov–Smirnov test Results")
                     p,stat = stats.ks_2samp(df[data1], df[data2])
                     st.write("Kolmogorov–Smirnov statistics: ", stat,"p-value: ", p)
        
                     if p > 0.05:
                         st.write('Probably the same distribution')
                     else:
                         st.write('Probably different distributions')
                         
                         
            if tests == "Normal-Test" :
                user_input = st.text_input('Do you need specific columns? Y or N')
                if user_input == "Y":
                     selected_columns = st.sidebar.multiselect("Select the variables you want to perform the test on", df.columns.tolist())
                     new_df = df[selected_columns]
                         
                     if st.sidebar.button("Show results"):
                         st.subheader("Normal-Test test Results")
                         p, stat = stats.normaltest(new_df)
                         st.write("Normal-Test statistics: ", stat,"p-value: ", p)
        
                         if p > 0.05:
                             st.write('Probably Gaussian')
                         else:
                             st.write('Probably not Gaussian')
            
                else:
                    stat, p = stats.normaltest(df)
                    st.write("Normal-Test statistics: ", stat,"p-value: ", p)
                    if p > 0.05:
                        st.write('Probably Gaussian')
                    else:
                        st.write('Probably not Gaussian')         
            
        elif  choice == 'Correlation':
            corr= ["Spearman's Rank","Pearson","Kendall"]
            tests = st.sidebar.selectbox("Select the tests you want to conduct",corr)
            
            if tests == "Spearman's Rank":
                data1 = st.sidebar.selectbox("Select the 1st variable to perform the test on", df.columns.tolist())
                data2 = st.sidebar.selectbox("Select the 2nd variable to perform the test on", df.columns.tolist())
                 
                if st.sidebar.button("Show results"):
                    st.subheader(" Spearman's Rank Results")
                    p, stat = stats.spearmanr(df[data1], df[data2])
                    st.write("Spearman's Rank statistics: ", stat,"p-value: ", p)
        
                    if p > 0.05:
                        st.write('Probably dependent')
                    else:
                        st.write('Probably independent')
            
            if tests == "Pearson":
                data1 = st.sidebar.selectbox("Select the 1st variable to perform the test on", df.columns.tolist())
                data2 = st.sidebar.selectbox("Select the 2nd variable to perform the test on", df.columns.tolist())
                 
                if st.sidebar.button("Show results"):
                    st.subheader("Pearson Results")
                    p, stat = stats.pearsonr(df[data1], df[data2])
                    st.write("Pearson statistics: ", stat,"p-value: ", p)
        
                    if p > 0.05:
                        st.write('Probably dependent')
                    else:
                        st.write('Probably independent')
            
            if tests == "Kendall":
                data1 = st.sidebar.selectbox("Select the 1st variable to perform the test on", df.columns.tolist())
                data2 = st.sidebar.selectbox("Select the 2nd variable to perform the test on", df.columns.tolist())
                 
                if st.sidebar.button("Show results"):
                    st.subheader("Kendall Results")
                    p,stat = stats.kendalltau(df[data1], df[data2])
                    st.write("Kendall statistics: ", stat,"p-value: ", p)
        
                    if p > 0.05:
                        st.write('Probably dependent')
                    else:
                        st.write('Probably independent')
                        
    
            if st.sidebar.checkbox('Show Correlation plot'):
                 st.write('To see the correlation plot of all variables')
                 st.write(sns.heatmap(df.corr(),annot=True))
                 st.pyplot(height=800)
                 
        if st.sidebar.checkbox('EDA'):
        
            if st.sidebar.checkbox('Show data summary'):
                st.write(df.describe())
        
            if st.sidebar.checkbox('Show Value Counts'):
                st.write(df.iloc[:,-1].value_counts())
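
# A usage sketch (the entry point is not shown in the excerpt above): Streamlit apps
# conventionally call main() at module level and are launched from a terminal with
# `streamlit run app.py`, where "app.py" is an assumed file name.
if __name__ == '__main__':
    main()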
Example #49
0
    ax[0].set_ylim((-0.02, 0.12))
ax[0].axhline(0, linestyle='--', color='k')
ax[0].set_xticks([0, 1])
ax[0].set_xticklabels(['Active', 'Passive'], rotation=45)
ax[0].set_xlabel('Behavior State')
ax[0].set_ylabel(r'Noise Correlation ($r_{sc}$)')

# correlation with overall behavior
resg = res.groupby(by=['snr', 'f', 'site']).mean()
#ax[1].scatter(resg[di_metric], resg['diff'], s=50, edgecolor='white', color='tab:orange')
sns.regplot(x=di_metric, y='diff', data=resg, ax=ax[1], color='tab:orange')
ax[1].set_xlabel('Behavior performance (DI)')
ax[1].set_ylabel(r"$\Delta r_{sc}$" + "\n(Active - Passive)")
ax[1].axhline(0, linestyle='--', color='k')
ax[1].axvline(0.5, linestyle='--', color='k')
r, p = ss.pearsonr(resg[di_metric], resg['diff'])
ax[1].set_title(r"$r$: %s, $p$: %s" % (round(r, 3), round(p, 3)))

f.tight_layout()

f.savefig(DIR + 'pyfigures/rsc_behavior.svg')

# ===================================== FIGURE 2 ====================================
# break down correlation vs. behavior into different time windows
tbins1 = ['0.25_0.35', '0.35_0.45', '0.45_0.55', '0.55_0.65']
tbins2 = ['0_0.1', '0.1_0.2', '0.2_0.3', '0.3_0.4']
titles = ['-0.1 - 0.0 sec', '0.0 - 0.1 sec', '0.1 - 0.2 sec', '0.2 - 0.3 sec']
f, ax = plt.subplots(1, 4, figsize=(12, 3), sharey=True)
for t1, t2, title, a in zip(tbins1, tbins2, titles, ax.flatten()):
    m1 = (df.batch == 307) & (df.tbin == t1) & ((df.pa < alpha) |
                                                (df.pp < alpha))
def eval(gold, predectvalues):
    pr = pearsonr(gold, predectvalues)[0]
    print('Test Pearson: ' + str(pr))
    return pr
def check_correlation(x, y):
    pearson_coef, p_value = stats.pearsonr(x, y)
    print("Pearson Correlation Coefficient: ", pearson_coef,
          "and a P-value of:", p_value)
Example #52
0
            np.mean(accuracies_twin.item()['n' + str(i) + '_DZ']))
        results['DZ_acc_std'].append(
            np.std(accuracies_twin.item()['n' + str(i) + '_DZ']))
        results['Title'].append(
            eval('labels_dict_' + parcellation[j])['n' + str(i)][0])
        results['Falconers formula'].append(
            falconers_h2.item()[parcellation[j]][i])
        results['ACE'].append(eval('ACE_h2_' + parcellation[j])[i])

    # Excel file
    # pd.DataFrame.from_dict(results).to_excel('./../outputs/identification_results_' + parcellation[j] + '.xlsx')
    df = pd.DataFrame(results)
    df = df.sort_values(by=['SI_acc_mean'], ascending=False)

    print(
        stat.pearsonr(results['MZ_acc_mean'],
                      eval('ACE_h2_' + parcellation[j])))
    # print(stat.pearsonr(results['MZ_acc_mean'],
    #                     falconers_h2.item()[parcellation[j]]))
    # print(stat.pearsonr(falconers_h2.item()[parcellation[j]],
    #                     eval('nodes_'+parcellation[j])))
    labels = df['Title']

    # Figure
    if nets[j] == 9:
        fig, ax = plt.subplots(figsize=(10, 5))
    else:
        fig, ax = plt.subplots(figsize=(14, 5))
    plt.xlabel('Functional networks, n of nodes', fontsize=20)
    plt.ylabel('Identification accuracy', fontsize=20)
    bar_width = 0.25
    plt.xticks(range(nets[j]), labels, rotation=45, fontsize=15)
Example #53
0
es = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0,patience=3,verbose=0, mode='auto')

cp = keras.callbacks.ModelCheckpoint(filepath="/Dedicated/jmichaelson-wdata/rotating_students/bhoskins/RetinaDL/ModelAge.h5",
        verbose=1, save_best_only=True)

model.fit(images[ind[0]], y.iloc[ind[0]], batch_size=16, callbacks=[es,cp],epochs=30, validation_data=[images[ind[1]], y.iloc[ind[1]]])

#model = keras.models.load_model("/Dedicated/jmichaelson-wdata/rotating_students/bhoskins/RetinaDL/Model2.h5")

pred = model.predict(images[ind[2]])

true = y.iloc[ind[2]]

#Age stats
pearsonr(true, pred[:,0])
statistics.mean(abs(true-pred[:,0]))

#plot true vs predicted with a seaborn regplot
sns.regplot(x=true, y=pred[:,0])

plt.show()

plt.savefig("/Dedicated/jmichaelson-wdata/rotating_students/bhoskins/RetinaDL/data_prep/" + "regplot.png")

from scipy.stats import ttest_ind
#Depression stats
dep = diagnosis.Depression == 1
depDiagnosed = diagnosis[dep]
depAge = depDiagnosed["age"] #use as true
pred2 = model.predict(images[dep])
districts_to_label_xy_df = districts_private_and_income.loc[
    districts_to_label_mask,
    ["district_eng", "private_or_public", "yearly_average_household_income"]]

for idx, row in districts_to_label_xy_df.iterrows():
    x = row["yearly_average_household_income"] + 2
    y = row["private_or_public"]  #To align it properly
    ax_1.annotate(s=row["district_eng"],
                  xy=(x, y),
                  horizontalalignment='left',
                  verticalalignment="center")

#Annotate pearson's r

pearson_r = st.pearsonr(
    districts_private_and_income.loc[:, "private_or_public"],
    districts_private_and_income.loc[:, "yearly_average_household_income"])[0]

ax_1.annotate(s="r = {:.2f}".format(pearson_r),
              xy=(.9, .9),
              xycoords=ax_1.transAxes,
              color="black",
              weight="bold",
              fontsize=15)

# --- Scatterplots Bottom : Income - Num. of Inst for all sub-categories
institutions = [
    "Hospital", "Dental Health Center", "Dialysis Center",
    "Physical Therapy Center", "Gynecology and Obstetrics Clinic",
    "Medical Center", "Polyclinic", "Planned Parenthood Center"
]
# print('Network parameters for anger: ', model_anger.summary())

# print('Network parameters for fear: ', model_fear.summary())

# print('Network parameters for joy: ', model_joy.summary())

# print('Network parameters for sadness: ', model_sadness.summary())

y_pred_anger = model_anger.predict(x_test_anger)
y_pred_fear = model_fear.predict(x_test_fear)
y_pred_joy = model_joy.predict(x_test_joy)
y_pred_sadness = model_sadness.predict(x_test_sadness)

# In[ ]:

pearson_correlation_score_anger = pearsonr(
    y_pred_anger.reshape((y_pred_anger.shape[0], )), y_test_anger)[0]

print('Pearson Correlation for LE_PC_DMTL model on Test set for anger')
print(pearson_correlation_score_anger)

# In[ ]:

pearson_correlation_score_fear = pearsonr(
    y_pred_fear.reshape((y_pred_fear.shape[0], )), y_test_fear)[0]

print('Pearson Correlation for LE_PC_DMTL model on Test set for fear')
print(pearson_correlation_score_fear)

# In[ ]:

pearson_correlation_score_joy = pearsonr(
    y_pred_joy.reshape((y_pred_joy.shape[0], )), y_test_joy)[0]
        x2 = (data2-data1)/2

    return x1,x2

#parameters
a_bound=5
M=400
N=1000

results = None
for rho in np.arange(0.9,1.0,0.1):
    for i in range(M):
        delta_t = 0.1
        coupling = 2*np.abs(rho)/(1-np.abs(rho))*np.sign(rho)
        x1,x2 = correlated_ts(coupling,N=N)
        prho = pearsonr(x1,x2)[0]
        print("OU cross correlation", OUcross(x1,x2,delta_t))
        print("pearson: ",prho)

        para = calc_fundstats(x1+x2) + calc_fundstats(x1-x2) +(delta_t,N)
        guessa1 = (x1+x2).std()**2
        guessa2 = (x1-x2).std()**2
        guessd = 0.5
        c_guess = (guessa1-guessa2)/guessa2
        print(guessa1,guessa2,guessd,c_guess/(2+c_guess))
        result = root(phi_deriv, [guessa1,guessa2,guessd],args=para)

        a1 = result.x[0]
        a2 = result.x[1]
        d = result.x[2]
Example #57
0
def pr2_spatial(tslsreg):
    """
    Calculates the pseudo r^2 for the spatial two stage least squares 
    regression.
    
    Parameters
    ----------
    stslsreg            : spatial two stage least squares regression object
                          output instance from a spatial two stage least 
                          squares regression model

        
    Returns
    -------    
    pr2_result          : float
                          value of the squared pearson correlation between
                          the y and stsls-predicted y vectors

    
    Examples
    --------

    We first need to import the needed modules. Numpy is needed to convert the
    data we read into arrays that ``spreg`` understands and ``pysal`` to
    perform all the analysis. The GM_Lag is required to run the model on
    which we will perform the tests and the ``pysal.spreg.diagnostics`` module
    contains the function with the test.

    >>> import numpy as np
    >>> import pysal
    >>> import pysal.spreg.diagnostics as D
    >>> from twosls_sp import GM_Lag

    Open data on Columbus neighborhood crime (49 areas) using pysal.open().
    This is the DBF associated with the Columbus shapefile.  Note that
    pysal.open() also reads data in CSV format; since the actual class
    requires data to be passed in as numpy arrays, the user can read their
    data in using any method.  

    >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),'r')
    
    Extract the HOVAL column (home value) from the DBF file and make it the
    dependent variable for the regression. Note that PySAL requires this to be
    a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
    that other packages accept.

    >>> y = np.array(db.by_col("HOVAL"))
    >>> y = np.reshape(y, (49,1))

    Extract INC (income) vectors from the DBF to be used as
    independent variables in the regression.  Note that PySAL requires this to
    be an nxj numpy array, where j is the number of independent variables (not
    including a constant). By default this model adds a vector of ones to the
    independent variables passed in, but this can be overridden by passing
    constant=False.

    >>> X = np.array(db.by_col("INC"))
    >>> X = np.reshape(X, (49,1))

    In this case, we consider CRIME (crime rates) as an endogenous regressor,
    so we acknowledge that by reading it in a different category.

    >>> yd = np.array(db.by_col("CRIME"))
    >>> yd = np.reshape(yd, (49,1))

    In order to properly account for the endogeneity, we have to pass in the
    instruments. Let us consider DISCBD (distance to the CBD) is a good one:

    >>> q = np.array(db.by_col("DISCBD"))
    >>> q = np.reshape(q, (49,1))

    Since this test has a spatial component, we need to specify the spatial
    weights matrix that includes the spatial configuration of the observations
    into the error component of the model. To do that, we can open an already
    existing gal file or create a new one. In this case, we will create one
    from ``columbus.shp``.

    >>> w = pysal.rook_from_shapefile(pysal.examples.get_path("columbus.shp")) 

    Unless there is a good reason not to do it, the weights have to be
    row-standardized so every row of the matrix sums to one. Among other
    things, this allows to interpret the spatial lag of a variable as the
    average value of the neighboring observations. In PySAL, this can be
    easily performed in the following way:

    >>> w.transform = 'r'

    Now we are good to run the spatial lag model. Make sure you pass all the
    parameters correctly and, if desired, pass the names of the variables as
    well so when you print the summary (reg.summary) they are included:

    >>> reg = GM_Lag(y, X, w=w, yend=yd, q=q, w_lags=2, name_x=['inc'], name_y='hoval', name_yend=['crime'], name_q=['discbd'], name_ds='columbus')

    Once we have a regression object, we can perform the spatial version of
    the pseudo R^2. It is as simple as one line!

    >>> result = pr2_spatial(reg)
    >>> print("%1.6f"%result)
    0.299649

    """

    y = tslsreg.y
    predy_e = tslsreg.predy_e
    pr = pearsonr(y, predy_e)[0]
    pr2_result = float(pr**2)
    return pr2_result
Example #58
0
    def __init__(self,
                 expect,
                 got,
                 name=None,
                 data=None,
                 P_LARGER=0.9,
                 regression=True,
                 ax=None,
                 alphabet=None,
                 expect_label=None,
                 got_label=None,
                 verbose=1):
        """Compare vectors.

        Arguments:

          - Specifying data for comparison two methods:

            1) `expect`, `got`: two numeric one-dimensional arrays which we'd like
               to compare (the argument names come from software testing). This
               method requires argument `data=None`.

            2) `data`: instance of `DataFrame`, expects arguments `expect` and `got`
               to be column labels.

          - `name`: name of this comparison.

        Note:

         - when plotting `expect` is the y-axis, `got` is the x-axis. This is by
           convention that `expect` is the dependent variable (regression target).

        TODO:

         - Add an option to drop NaNs and continue comparison.

         - Indicate which dimensions have the largest errors.

        """

        if isinstance(expect, dict) and isinstance(got, dict):
            _alphabet = expect.keys() if alphabet is None else alphabet
            assert set(got.keys()) == set(_alphabet)
            expect = [expect[k] for k in _alphabet]
            got = [got[k] for k in _alphabet]

        if data is not None:
            assert isinstance(expect, (int, str)), \
                'expected a column name, got %s' % type(expect)
            assert isinstance(got, (int, str)), \
                'expected a column name, got %s' % type(got)

            if expect_label is None:
                expect_label = expect
            if got_label is None:
                got_label = got

            expect = data[expect]
            got = data[got]

        else:
            if expect_label is None:
                expect_label = 'expect'
            if got_label is None:
                got_label = 'got'

            expect = np.asarray(expect)
            got = np.asarray(got)

            data = pd.DataFrame({expect_label: expect, got_label: got})

        assert expect.shape == got.shape
        [n] = expect.shape

        self.expect = expect
        self.got = got
        self.alphabet = alphabet
        self.ax = ax
        self.name = name
        self.got_label = got_label
        self.expect_label = expect_label
        self.n = n
        self.coeff = None

        self.tests = tests = []

        # Check that vectors are finite.
        if not np.isfinite(expect).all():
            tests.append([
                'expect finite',
                progress(np.isfinite(expect).sum(), n), False
            ])
        if not np.isfinite(got).all():
            tests.append(
                ['got finite',
                 progress(np.isfinite(got).sum(), n), False])

        ne = norm(expect)
        ng = norm(got)
        ok = abs(ne - ng) / ne < 0.01 if ne != 0 else True

        if n > 1:
            tests.append(['norms', '[%g, %g]' % (ne, ng), ok])
            F = zero_retrieval(expect, got)
            tests.append(['zero F1', F, F > 0.99])

        if n > 1:
            c = cosine(expect, got)
            self.cosine = c
            tests.append(['cosine-sim', c, (c > 0.99999)
                          ])  # cosine similarities must be really high.

            self.pearsonr = 1.0 if ne == ng == 0 else pearsonr(expect, got)[0]
            tests.append(['pearson', self.pearsonr, (self.pearsonr > 0.99999)])

            p = spearmanr(expect, got)[0]
            tests.append(['spearman', p, (p > 0.99999)])

        # TODO: this check should probably take into account the scale of the data.
        d = linf(expect, got)
        self.max_err = d
        tests.append(['Linf', d, d < 1e-8])

        # same sign check (weak agreement, but useful sanity check -- especially
        # for gradients)
        x = expect
        y = got
        s = np.asarray(~((x >= 0) ^ (y >= 0)), dtype=int)
        p = s.sum() * 100.0 / len(s)
        tests.append(
            ['same-sign',
             '%s%% (%s/%s)' % (p, s.sum(), len(s)), p == 100.0])

        # relative error
        r = relative_difference(expect, got)
        r = np.mean(r[np.isfinite(r)])
        tests.append(['mean relative error', r, r <= 0.01])
        self.mean_relative_error = r

        # TODO: suggest that if relative error is high and rescaled error is low (or
        # something to do wtih regression residuals) that maybe there is a
        # (hopefully) simple fix via scale/offset.

        # TODO: can provide descriptive statistics for each vector
        #tests.append(['range (expect)', [expect.min(), expect.max()], 2])
        #tests.append(['range (got)   ', [got.min(), got.max()], 2])

        # regression and rescaled error only valid for n >= 2
        if n >= 2:
            es = abs(expect).max()
            gs = abs(got).max()
            if es == 0:
                es = 1
            if gs == 0:
                gs = 1
            if 0:
                # rescaled error
                E = expect / es
                G = got / gs
                R = abs(E - G)
                r = np.mean(R)
                tests.append(['mean rescaled error', r, r <= 1e-5])

        if regression:
            self.regression()

        if n >= 2:
            # These tests check if one of the datasets is consistently larger than the
            # other. The threshold for error is based on `P_LARGER` ("percent larger").
            L = ((expect - got) > 0).sum()
            if L >= P_LARGER * n:
                tests.append(['expect is larger', progress(L, n), 0])
            L = ((got - expect) > 0).sum()
            if L >= P_LARGER * n:
                tests.append(['got is larger', progress(L, n), 0])

        self.tests = tests
        if verbose:
            self.message()

        if alphabet is not None:
            self.show_largest_rel_errors()
Example #59
0
def pr2_aspatial(tslsreg):
    """
    Calculates the pseudo r^2 for the two stage least squares regression.
    
    Parameters
    ----------
    tslsreg             : two stage least squares regression object
                          output instance from a two stage least squares
                          regression model

        
    Returns
    -------    
    pr2_result          : float
                          value of the squared pearson correlation between
                          the y and tsls-predicted y vectors

    
    Examples
    --------

    We first need to import the needed modules. Numpy is needed to convert the
    data we read into arrays that ``spreg`` understands and ``pysal`` to
    perform all the analysis. The TSLS is required to run the model on
    which we will perform the tests.

    >>> import numpy as np
    >>> import pysal
    >>> from twosls import TSLS

    Open data on Columbus neighborhood crime (49 areas) using pysal.open().
    This is the DBF associated with the Columbus shapefile.  Note that
    pysal.open() also reads data in CSV format; since the actual class
    requires data to be passed in as numpy arrays, the user can read their
    data in using any method.  

    >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),'r')
    
    Before being able to apply the diagnostics, we have to run a model and,
    for that, we need the input variables. Extract the CRIME column (crime
    rates) from the DBF file and make it the dependent variable for the
    regression. Note that PySAL requires this to be a numpy array of shape
    (n, 1) as opposed to the also common shape of (n, ) that other packages
    accept.

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Extract INC (income) vector from the DBF to be used as
    independent variables in the regression.  Note that PySAL requires this to
    be an nxj numpy array, where j is the number of independent variables (not
    including a constant). By default this model adds a vector of ones to the
    independent variables passed in, but this can be overridden by passing
    constant=False.

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X = np.array(X).T

    In this case, we consider HOVAL (home value) as an endogenous regressor,
    so we acknowledge that by reading it in a different category.

    >>> yd = []
    >>> yd.append(db.by_col("HOVAL"))
    >>> yd = np.array(yd).T

    In order to properly account for the endogeneity, we have to pass in the
    instruments. Let us consider DISCBD (distance to the CBD) is a good one:

    >>> q = []
    >>> q.append(db.by_col("DISCBD"))
    >>> q = np.array(q).T

    Now we are good to run the model. It is an easy one line task.

    >>> reg = TSLS(y, X, yd, q=q)

    In order to perform the pseudo R^2, we pass the regression object to the
    function and we are done!

    >>> result = pr2_aspatial(reg)
    >>> print("%1.6f"%result)    
    0.279361

    """

    y = tslsreg.y
    predy = tslsreg.predy
    pr = pearsonr(y, predy)[0]
    pr2_result = float(pr**2)
    return pr2_result
def display_selected_data(selectedArea, choiceNB):

    if choiceNB == 'boroughs':
        df_selected = df_trees_properties_boro
        title_part = ' boroughs'
        key = 'borough'

    else:
        title_part = ' neighborhoods'
        df_selected = df_trees_properties
        key = 'ntaname'

    font_ann = dict(
        size=10,
        color=colors['text']
    )

    if selectedArea is not None:
        points = selectedArea["points"]
        area_names = [str(point["text"].split("<br")[0])
                      for point in points]
        df_selected = df_selected[df_selected[key].isin(area_names)]

    index_vals = df_selected['borough'].astype('category').cat.codes
    coef_list = []

    # find pearson coeff and p_value for each pair of attributes
    pairs = [['trees/sq.mile', 'avg.landprice_thous$/acre'],
             ['trees/sq.mile', 'properties/sq.mile'],
             ['avg.landprice_thous$/acre', 'properties/sq.mile']]
    flag = True
    for pair in pairs:
        if len(df_selected[pair[0]]) >= 2 and len(df_selected[pair[1]]) >= 2:
            coef_list.append(
                pearsonr(df_selected[pair[0]], df_selected[pair[1]]))
        else:
            flag = False
    if flag:
        ann = [
            dict(
                x=5000,
                y=6000,
                xref="x2",
                yref="y1",
                font=font_ann,
                text="PCC: " +
                str(round(coef_list[0][0], 2)) + "<br>p: " +
                ('{:0.1e}'.format(coef_list[0][1])),
                showarrow=False,

            ),
            dict(
                x=6000,
                y=5000,
                xref="x1",
                yref="y2",
                font=font_ann,
                text="PCC: " +
                str(round(coef_list[0][0], 2)) + "<br>p: " +
                ('{:0.1e}'.format(coef_list[0][1])),
                showarrow=False,
            ),
            dict(
                x=14000,
                y=6000,
                xref="x3",
                yref="y1",
                font=font_ann,
                text="PCC: " +
                str(round(coef_list[1][0], 2)) + "<br>p: " +
                ('{:0.1e}'.format(coef_list[1][1])),
                showarrow=False,
            ),
            dict(
                x=6000,
                y=14000,
                xref="x1",
                yref="y3",
                font=font_ann,
                text="PCC: " +
                str(round(coef_list[1][0], 2)) + "<br>p: " +
                ('{:0.1e}'.format(coef_list[1][1])),
                showarrow=False,
            ),
            dict(
                x=14000,
                y=6000,
                xref="x3",
                yref="y2",
                font=font_ann,
                text="PCC: " +
                str(round(coef_list[2][0], 2)) + "<br>p: " +
                ('{:0.1e}'.format(coef_list[2][1])),
                showarrow=False,
            ),
            dict(
                x=6000,
                y=14000,
                xref="x2",
                yref="y3",
                font=font_ann,
                text="PCC: " +
                str(round(coef_list[2][0], 2)) + "<br>p: " +
                ('{:0.1e}'.format(coef_list[2][1])),
                showarrow=False,
            ),

        ]
    else:
        ann = []

    axisd = dict(showline=True,
                 zeroline=False,
                 gridcolor='#104752',
                 showticklabels=True)

    # here we build a scatter matrix, and add annotations for each subgraph
    layout = go.Layout(
        dragmode='select',

        margin=dict(l=0, r=0, b=0, t=0, pad=0),
        autosize=False,
        hovermode='closest',
        font=dict(color=colors['text'], size=12),
        plot_bgcolor=colors['background'],
        paper_bgcolor=colors['background'],
        xaxis1=dict(axisd),
        xaxis2=dict(axisd),
        xaxis3=dict(axisd),
        xaxis4=dict(axisd),
        yaxis1=dict(axisd),
        yaxis2=dict(axisd),
        yaxis3=dict(axisd),
        yaxis4=dict(axisd),
        annotations=ann)

    fig = go.Figure(data=go.Splom(
        dimensions=[dict(label='trees/sq.mile',
                         values=df_selected['trees/sq.mile']),
                    dict(label='avg.landprice($K/A)',
                         values=df_selected['avg.landprice_thous$/acre']),
                    dict(label='properties/sq.mile',
                         values=df_selected['properties/sq.mile']),
                    ],
        text=(df_selected[key]+': '+df_selected['borough']
              if key == 'ntaname' else df_selected[key]),
        hoverinfo="x+y+text",
        # showlegend=True,
        marker=dict(color=index_vals,
                    showscale=False,  # colors encode categorical variables
                    line_color='white', line_width=0.4),
        diagonal=dict(visible=True)
    ), layout=layout
    )

    return fig