def _extract(self, img):
    """Recover the embedded payload bytes from ``img``.

    ``img`` is decomposed with ``dwt2`` using ``self.mother``.  The three
    arrays in the second element of the result are flattened and read
    round-robin, one chunk per payload bit.  A bit is 1 when its chunk has a
    larger Pearson correlation with ``self.seq1`` than with ``self.seq0``.

    Bits are packed most-significant first.  Returns a ``bytearray``; bits
    left over after the last complete group of eight are discarded.
    """
    _approx, details = dwt2(img, self.mother)
    flat_h, flat_v, flat_d = [band.reshape(band.size) for band in details]
    assert flat_h.shape == flat_v.shape == flat_d.shape
    bands = (flat_h, flat_v, flat_d)

    bits_total = self.total_bits
    chunk_size = (flat_h.size * 3) // bits_total
    ref0 = self.seq0[:chunk_size]
    ref1 = self.seq1[:chunk_size]
    # The chunk length follows the (possibly shorter) reference sequence.
    window = ref0.size

    output = bytearray()
    acc = 0
    for bit_index in range(bits_total):
        round_no, band_no = divmod(bit_index, 3)
        start = round_no * chunk_size
        chunk = bands[band_no][start:start + window]
        r0 = pearsonr(chunk, ref0)[0]
        r1 = pearsonr(chunk, ref1)[0]
        acc = (acc << 1) | int(r1 > r0)
        if bit_index % 8 == 7:
            # A full byte has been assembled.
            output.append(acc)
            acc = 0
    return output
def norm_distri(DF):
    """Plot ``DF['fc']`` against min-max normalised ``DF['fcp']`` with a linear fit.

    Adds a ``norm_fcp`` column to ``DF`` in place, draws the scatter plot and a
    red first-degree least-squares line, puts the ``pearsonr`` result in the
    figure's suptitle and shows the figure.  Returns nothing.

    Normalisation follows
    http://chrisalbon.com/python/pandas_normalize_column.html
    """
    from scipy.stats import pearsonr

    # Min-max scale the single-mutant product column.
    raw = DF['fcp']
    lowest = raw.min()
    highest = raw.max()
    DF["norm_fcp"] = (raw - lowest) / (highest - lowest)

    DF.plot(kind='scatter', x='norm_fcp', y='fc')
    plt.ylim((-0.01, 1.01))
    plt.xlim((-0.01, 1.01))
    plt.ylabel('Double Mutant FC Values')
    plt.xlabel('Normalised Product of corresponding Single Mutant FC Values')

    # Linear trend line, drawn in red.
    x = DF['norm_fcp']
    y = DF['fc']
    trend = np.poly1d(np.polyfit(x, y, 1))
    pylab.plot(x, trend(x), "r-")

    correlation = pearsonr(x, y)
    plt.suptitle(correlation)
    pylab.show()
def compare_images(indexpairs, imagematrix, pca_list, Target, Best, n_survivors=100):
    """Run one tournament-selection generation against ``Target``.

    Parameters
    ----------
    indexpairs : iterable of pairs of int
        Indices into ``imagematrix``; each pair is one tournament.
    imagematrix : sequence of 1-D sequences
        Candidate images, each comparable to ``Target`` with ``pearsonr``.
    pca_list : numpy.ndarray
        Indexed with the array of surviving indices to build the next generation.
    Target : 1-D sequence
        Image every candidate is correlated with.
    Best : 1-D sequence
        Best image found so far.
    n_survivors : int, optional
        How many tournament winners are drawn (without replacement) for the
        next generation.  Defaults to 100, the value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(pca_list_nextgen, maxcorr, avgcorr, Best)`` where ``Best`` is replaced
        by this generation's top image only if it correlates strictly better
        with ``Target`` than the incoming ``Best``.

    Raises
    ------
    ValueError
        From ``random.sample`` when there are fewer winners than ``n_survivors``.
    """
    # Correlation of every candidate with the target.
    corrarray = np.array([pearsonr(image, Target)[0] for image in imagematrix])
    maxcorr = np.amax(corrarray)
    avgcorr = np.mean(corrarray)

    # Keep the overall best image up to date.
    if maxcorr > pearsonr(Best, Target)[0]:
        Best = np.array(imagematrix[np.argmax(corrarray)])

    # Tournament: the first index wins only on a strictly larger correlation.
    winners = [
        pair[0] if corrarray[pair[0]] > corrarray[pair[1]] else pair[1]
        for pair in indexpairs
    ]

    newindices = np.array(random.sample(winners, n_survivors))
    pca_list_nextgen = pca_list[newindices]
    return pca_list_nextgen, maxcorr, avgcorr, Best
def xcorr(x, y, method='pearsonr', shift=5):
    """Correlate ``x`` with ``y``, either globally or window by window.

    Parameters
    ----------
    x, y : sliceable sequences of numbers
    method : {'pearsonr', 'spearmanr'}
        Case-insensitive name of the ``scipy.stats`` correlation to use.
    shift : int
        * ``0``: a single correlation over the common prefix of ``x`` and ``y``.
        * ``0 < shift < len(y)``: ``y`` is cut into windows of ``shift`` samples
          starting at ``0, shift, 2*shift, ...`` (start < ``len(y) - shift``);
          every window is correlated with every full-length window of ``x``.
        * anything else: nothing is computed.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Correlation values and the matching start offsets into ``x``.  For
        ``shift == 0`` the values array has one element and the offsets array
        is empty; in windowed mode both have one row per ``y`` window.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.  Previously an
        unknown name failed later with a NameError on an unbound variable.
    """
    from numpy import array
    from scipy.stats import spearmanr, pearsonr

    correlators = {'pearsonr': pearsonr, 'spearmanr': spearmanr}
    method = method.lower()
    if method not in correlators:
        raise ValueError(
            "method must be 'pearsonr' or 'spearmanr', got %r" % (method,))
    correlate_pair = correlators[method]

    vCorr = []
    tCorr = []
    if shift == 0:
        # Truncate the longer input so both have the same length.
        common = min(len(x), len(y))
        v, _ = correlate_pair(x[0:common], y[0:common])
        vCorr.append(v)
    elif 0 < shift < len(y):
        for j in range(0, len(y) - shift, shift):
            window = y[j:j + shift]
            width = len(window)
            row_values = []
            row_offsets = []
            # Only offsets where x still provides a full-width window.
            for i in range(len(x) - width + 1):
                v, _ = correlate_pair(x[i:i + width], window)
                row_values.append(v)
                row_offsets.append(i)
            vCorr.append(row_values)
            tCorr.append(row_offsets)
    return array(vCorr), array(tCorr)
def predict(self, X):
    """Label every row of ``X`` by the reference profile it correlates with best.

    A row gets ``self.classes_[1]`` when its Pearson correlation with
    ``self.B1`` is strictly larger than with ``self.A1``, otherwise
    ``self.classes_[0]``.  Returns a list with one label per row.
    """
    labels = []
    for sample in np.array(X):
        r_b = pearsonr(self.B1, sample)[0]
        r_a = pearsonr(self.A1, sample)[0]
        labels.append(self.classes_[int(r_b > r_a)])
    return labels
def train_model(lrmodel, opt, cost, X, Y, devX, devY, devscores):
    """
    Train model, using pearsonr on dev for early stopping.

    ``lrmodel`` is fitted repeatedly; after each fit the outputs on the dev
    set are turned into expected scores (dot product with 1..5) and correlated
    with ``devscores``.  Training stops at the first evaluation that does not
    strictly improve on the best score so far.  Returns the shallow copy
    (``copy.copy``) of the model taken at the best evaluation.
    """
    r = np.arange(1, 6)
    train_set = ArrayIterator(X=X, y=Y, make_onehot=False)
    valid_set = ArrayIterator(X=devX, y=devY, make_onehot=False)

    best = -1.0
    eval_epoch = 10
    while True:
        callbacks = Callbacks(lrmodel, eval_set=valid_set)
        lrmodel.fit(train_set, optimizer=opt, num_epochs=eval_epoch,
                    cost=cost, callbacks=callbacks)

        # Check Pearson on the development set after every fit call.
        yhat = np.dot(lrmodel.get_outputs(valid_set), r)
        score = pearsonr(yhat, devscores)[0]
        if not score > best:
            break
        neon_logger.display('Dev Pearson: {}'.format(score))
        best = score
        bestlrmodel = copy.copy(lrmodel)
        eval_epoch += 10

    yhat = np.dot(bestlrmodel.get_outputs(valid_set), r)
    score = pearsonr(yhat, devscores)[0]
    neon_logger.display('Dev Pearson: {}'.format(score))
    return bestlrmodel
def bootstrap_test(sample1, sample2, k=1000, p_value=0.05, two_tailed=True):
    """
    Test the null hypothesis that the two samples are independent of each
    other, using Pearson coefficients.

    Pairs in which either value is NaN are removed first.  The two cleaned
    samples are then resampled independently (with replacement, same size)
    ``k`` times to build a null distribution of the Pearson coefficient, and
    the observed coefficient is compared with the central ``1 - p_value``
    interval of that distribution.

    Parameters
    ----------
    sample1, sample2 : array-like of numbers, same length
    k : int
        Number of bootstrap resamples.
    p_value : float
        Significance level; the interval spans the ``100 * p_value / 2`` and
        ``100 * (1 - p_value / 2)`` percentiles.
    two_tailed : bool
        Accepted for backward compatibility; not used by the implementation.

    Returns
    -------
    bool
        Remember: if True, the coefficient is NOT significant (it lies
        strictly inside the null interval).

    Raises
    ------
    ValueError
        If the samples do not have the same size.
    """
    sample1 = np.asarray(sample1, dtype=float)
    sample2 = np.asarray(sample2, dtype=float)
    if sample1.shape != sample2.shape:
        raise ValueError("Samples must have same sizes.")

    # Eliminate all entries which have a NaN in one of the samples.
    valid = ~(np.isnan(sample1) | np.isnan(sample2))
    sample1_bis = sample1[valid]
    sample2_bis = sample2[valid]
    r_sample = pearsonr(sample1_bis, sample2_bis)[0]

    n = len(sample1_bis)
    r_resample = np.zeros(k)
    for i in range(k):
        # Independent draws break any pairing between the two samples.
        s1_rand = sample1_bis[randint(0, n, n)]
        s2_rand = sample2_bis[randint(0, n, n)]
        r_resample[i] = pearsonr(s1_rand, s2_rand)[0]

    # np.percentile expects percents in [0, 100], not fractions.
    ci = np.percentile(
        r_resample, [100. * p_value / 2., 100. * (1. - p_value / 2.)])
    return ci[0] < r_sample < ci[1]
def plot_gpa_vs_grade(filename=None):
    """Scatter the module-level ``GPA`` against ``grade`` and print their Pearson result.

    ``filename`` is passed straight to ``maybesave`` once the plot is labelled.
    """
    plt.scatter(GPA, grade)
    plt.xlabel("Prior GPA")
    plt.ylabel("Class Grade (on GPA scale)")
    plt.title("GPA vs. Grade in BIOL 141 FA12")
    correlation = pearsonr(GPA, grade)
    # Single parenthesised value: prints the same under the print statement.
    print(correlation)
    maybesave(filename)
def on_off_experiment1():
    """Compare MI vs Gini on synthetic on-off motifs.

    Draws ``num_motifs`` motifs with ``sample_on_off_motif``, computes IC, Gini
    and total MI for each, scatter-plots the three pairings in a 1x3 grid and
    prints the Pearson result for every pairing, followed by the parameters
    used.  Takes no arguments and returns nothing.
    """
    # NOTE: n, L, des_ic, sigma, copies and num_motifs are read back by name
    # through vars() when the parameter string is built below -- do not rename.
    n, L, des_ic, num_motifs = 50,10,10,1000
    sigma = 1
    copies = 10*n
    def f(Ne):
        # Difference between the IC of one sampled motif and the desired IC.
        return motif_ic(sample_on_off_motif(sigma, Ne, L, copies, n)) - des_ic
    # presumably searches [1, 10] for the Ne at which f is ~0 -- confirm
    # against log_regress_spec2.
    Ne = log_regress_spec2(f,[1,10],tol=10**-5)
    motifs = [sample_on_off_motif(sigma, Ne, L, copies, n) for i in trange(num_motifs)]
    # Each of these is plotted and correlated more than once, so map() must
    # return a reusable sequence (true on Python 2, where this code runs).
    ics = map(motif_ic, motifs)
    ginis = map(motif_gini, motifs)
    mis = map(total_motif_mi, motifs)
    # Panel 1: IC vs Gini.
    plt.subplot(1,3,1)
    plt.scatter(ics, ginis)
    plt.xlabel("IC (bits)")
    plt.ylabel("Gini")
    print "ic vs gini:",pearsonr(ics,ginis)
    # Panel 2: IC vs MI.
    plt.subplot(1,3,2)
    plt.scatter(ics, mis)
    plt.xlabel("IC (bits)")
    plt.ylabel("MI (bits)")
    print "ic vs mi:",pearsonr(ics,mis)
    # Panel 3: Gini vs MI.
    plt.subplot(1,3,3)
    plt.scatter(ginis, mis)
    plt.xlabel("Gini")
    plt.ylabel("Mi (bits)")
    print "gini vs mi:",pearsonr(ginis,mis)
    plt.tight_layout()
    # Builds "n=%(n)s, L=%(L)s, ..." and fills it from the local namespace.
    param_template = ", ".join("{0}=%({0})s".format(v) for v in "n L des_ic sigma copies num_motifs".split())
    param_string = param_template % vars()
    # plt.title applies to the current axes, i.e. the third panel.
    plt.title("Gini vs MI in On-Off Simulations")
    # Same quantity as the "gini vs mi" line above, printed again.
    print "Pearson correlation:",pearsonr(ginis,mis)
    print "parameters:", param_string
def user_analysis():
    """Analyse the per-user ``act``/``exp``/``par`` columns for every entry of ``t_test_set``.

    For each ``t`` the matching CSV under ``location`` is read, the three
    columns are passed through ``normalize_par``, their distributions and
    mutual correlation are plotted, and summary statistics plus pairwise
    Pearson results are printed.  Returns 0.
    """
    for t in t_test_set:
        print "Analyzing ", t
        u_data = pd.read_csv(location + "par_normed_exp_data_repu_norm_users_" + t + ".csv")
        act = u_data["act"]
        # NOTE: this local shadows any module-level ``exp`` function inside
        # this function body.
        exp = u_data["exp"]
        par = u_data["par"]
        # par = par-act
        act, exp, par = normalize_par(act, exp, par)
        ######### analyzing user side
        plot_distribution(act, "act", t)
        plot_distribution(exp, "exp", t)
        plot_distribution(par, "par", t)
        print ".data description"
        # print '..', u_data.describe()
        # mean/median/std are bare names resolved at module level.
        print "..act", mean(act), median(act), std(act)
        print "..exp", mean(exp), median(exp), std(exp)
        print "..par", mean(par), median(par), std(par)
        # Order matters to plot_correlation's caller-visible output: act, exp, par.
        all_data = []
        all_data.append(act)
        all_data.append(exp)
        all_data.append(par)
        plot_correlation(all_data, t)
        print ".correlation between features"
        print "..act-exp", stats.pearsonr(act, exp)
        print "..act-par", stats.pearsonr(act, par)
        print "..par-exp", stats.pearsonr(par, exp)
    # NOTE(review): placed after the loop so that every entry is processed;
    # confirm against the original indentation.
    return 0
def plot_si_vs_grade(filename=None):
    """Scatter the module-level ``si`` against ``grade`` and print their Pearson result.

    ``filename`` is passed straight to ``maybesave`` once the plot is labelled.
    """
    plt.scatter(si, grade)
    plt.xlabel("SI attendance count")
    plt.ylabel("Class Grade (on GPA scale)")
    plt.title("SI attendance vs. Grade in BIOL 141 FA12")
    correlation = pearsonr(si, grade)
    # Single parenthesised value: prints the same under the print statement.
    print(correlation)
    maybesave(filename)
def add_pearson_cor(self, vector1, vector2, rownames=False):
    '''
    Calculate the Pearson correlation of two vectors and record
    (correlation, p-value) in self.pearson_dic.

    rownames == False: both vectors are purely numeric; the result is appended
    to self.pearson_dic directly.
    rownames == True: element 0 of each vector is a row name and the rest are
    values convertible to float; the result is appended to
    self.pearson_dic[row_name].  Rows whose values cannot be converted are
    reported and skipped.

    Exits the interpreter (sys.exit) when the two row names differ or when
    rownames is neither True nor False.  Always returns None.
    '''
    if rownames == False:
        r_value, p_value = stats.pearsonr(vector1, vector2)
        self.pearson_dic.append((r_value, p_value))
        return None

    if not rownames == True:
        print('Rownames must be either True or False.')
        sys.exit('Exiting...')

    if not vector1[0] == vector2[0]:
        print('Row name of two vectors not equal, %s and %s' %(vector1[0], vector2[0]))
        sys.exit('Exiting...')

    row_name = vector1[0]
    try:
        values1 = [float(item) for item in vector1[1:]]
        values2 = [float(item) for item in vector2[1:]]
    except ValueError:
        print('Rowname: %s strange. Skipping for pearson calculation.' %row_name)
        return None

    r_value, p_value = stats.pearsonr(values1, values2)
    self.pearson_dic[row_name].append((r_value, p_value))
    return None
def train_model(lrmodel, X, Y, devX, devY, devscores):
    """
    Train model, using pearsonr on dev for early stopping.

    ``lrmodel.fit`` is called repeatedly.  After each call the class
    probabilities predicted for ``devX`` are turned into expected scores (dot
    product with 1..5) and correlated with ``devscores``.  Training stops at
    the first evaluation that does not strictly improve on the best score.

    Returns a deep copy of the model taken at the best evaluation.  If no
    evaluation ever beats the initial -1.0 sentinel (for example when the
    correlation is NaN because the predictions are constant), the model
    itself is returned instead of failing on an unbound variable.
    """
    done = False
    best = -1.0
    bestlrmodel = None
    r = np.arange(1, 6)

    while not done:
        # After every fit call, check Pearson on the development set.
        lrmodel.fit(X, Y, verbose=2, shuffle=False, validation_data=(devX, devY))
        yhat = np.dot(lrmodel.predict_proba(devX, verbose=2), r)
        score = pearsonr(yhat, devscores)[0]
        if score > best:
            # print(x) with one value behaves the same on Python 2 and 3.
            print(score)
            best = score
            bestlrmodel = copy.deepcopy(lrmodel)
        else:
            done = True

    if bestlrmodel is None:
        # Nothing improved on the sentinel; fall back to the trained model.
        bestlrmodel = lrmodel

    yhat = np.dot(bestlrmodel.predict_proba(devX, verbose=2), r)
    score = pearsonr(yhat, devscores)[0]
    print('Dev Pearson: ' + str(score))
    return bestlrmodel
def pearson_r():
    """Print the Pearson result of every value column against the ground truth.

    The table comes from ``read_in_sim_data_as_table``; its values are
    transposed so that each row of the transposed array is correlated with
    ``ground_truth``.  The transposed shape is printed first.
    """
    _tab_strings, tab_values, ground_truth = read_in_sim_data_as_table()
    columns = np.array(tab_values).transpose()
    print(columns.shape)
    for column in list(columns):
        print(pearsonr(column, ground_truth))
def task3():
    """Assign every iris feature to whichever of the first two principal components it tracks best.

    Fits a full-rank PCA on the iris data, plots the projection with
    ``plot_iris``, then compares the absolute Pearson correlation of each
    centred feature with the centred first and second components.  1-based
    feature numbers go to the first list when the first component wins
    strictly, otherwise to the second; both lists are passed to
    ``write_answer_3``.
    """
    iris = datasets.load_iris()
    data = iris.data
    target = iris.target
    target_names = iris.target_names

    model = PCA(n_components=data.shape[1])
    model.fit(data, target)
    data_new = model.transform(data)
    plot_iris(data_new, target, target_names)

    pc1 = data_new[:, 0] - data_new[:, 0].mean()
    pc2 = data_new[:, 1] - data_new[:, 1].mean()

    corrs1 = []
    corrs2 = []
    for column in data.T:
        centred = column - column.mean()
        corrs1.append(np.abs(pearsonr(centred, pc1)[0]))
        corrs2.append(np.abs(pearsonr(centred, pc2)[0]))

    list_pc1 = []
    list_pc2 = []
    for number, (r1, r2) in enumerate(zip(corrs1, corrs2), 1):
        if r1 > r2:
            list_pc1.append(number)
        else:
            list_pc2.append(number)
    write_answer_3(list_pc1, list_pc2)
def correlation():
    '''Pearson correlation, and two types of rank correlation (Spearman,
    Kendall), comparing age and %fat (measured by dual-photon absorptiometry)
    for 18 normal adults.

    Reads the two comma-separated columns of 'altman_11_1.txt', prints the
    three coefficients and returns the Pearson one.
    '''
    # Get the data
    data = np.genfromtxt('altman_11_1.txt', delimiter=',')
    x, y = data[:, 0], data[:, 1]

    # --- >>> START stats <<< ---
    # Calculate correlations
    # The coefficients live in a dictionary so that it is obvious which value
    # belongs to which correlation coefficient.
    corr = {
        'pearson': stats.pearsonr(x, y)[0],
        'spearman': stats.spearmanr(x, y)[0],
        'kendall': stats.kendalltau(x, y)[0],
    }
    # --- >>> STOP stats <<< ---
    print(corr)

    # Assert that Spearman's rho is just the correlation of the rank-sorted data
    rank_pearson = stats.pearsonr(stats.rankdata(x), stats.rankdata(y))[0]
    np.testing.assert_almost_equal(corr['spearman'], rank_pearson)

    return corr['pearson']  # should be 0.79208623217849117
def GroupRegression(GroupDF,goodsubj,feedback,numFolds=10,addMotion=True,verbose=False):
    """Fit a per-subject linear regression of the ideal time course on the RSN columns.

    For every subject the columns ``RSN0``..``RSN9`` (plus z-scored ``fd`` when
    ``addMotion`` is true) are passed to ``leaveOneOutCV`` together with the
    ``Wander - Focus`` column difference read from the DMN-ideal CSV.

    Returns three DataFrames accumulated over subjects:
    ``groupGLM`` (TR, predicted, subj), ``coefs`` (Coef, pe, subj) and
    ``performance`` (R = Pearson r of ideal vs predicted, subj).
    """
    numberOfICs=10
    columnNames=[]
    for rsnNumber in range(numberOfICs):
        columnNames.append('RSN%d' % rsnNumber)
    # Hard-coded absolute path to the ideal time-course file.
    dmnIdeal=pd.read_csv('/home/jmuraskin/Projects/NFB/analysis/DMN_ideal_2.csv')
    # Only rows of subjects in goodsubj survive; averaged per (subject, FB, TR).
    SubjectDF = GroupDF[GroupDF.Subject_ID.isin(goodsubj)].groupby(['Subject_ID','FB','TR']).mean()
    clf = linear_model.LinearRegression()
    # NOTE(review): this iterates over every subject in GroupDF, while
    # SubjectDF was filtered to goodsubj -- a subject outside goodsubj would
    # not be found by the .loc lookups below.  Confirm callers pre-filter.
    for indx,subj in enumerate(unique(GroupDF['Subject_ID'])):
        if verbose:
            print "Running Subject- %s" % subj
        if addMotion:
            X=np.column_stack((np.array(SubjectDF.loc[subj,feedback][columnNames]),zscore(SubjectDF.loc[subj,feedback]['fd'])))
        else:
            X=np.array(SubjectDF.loc[subj,feedback][columnNames])
        if verbose:
            print X.shape
        predicted,intercepts,coef = leaveOneOutCV(clf,X,dmnIdeal['Wander']-dmnIdeal['Focus'],numFolds=numFolds)
        # 408 is hard-coded as the number of TRs per subject; assumes
        # ``predicted`` has that length -- TODO confirm.
        if indx==0:
            # First subject: create the three result frames.
            groupGLM=pd.DataFrame({'TR':range(408),'predicted':predicted,'subj':[subj]*408})
            coefs=pd.DataFrame({'Coef':coef,'pe':range(X.shape[1]),'subj':[subj]*X.shape[1]})
            performance=pd.DataFrame({'R':[pearsonr(dmnIdeal['Wander']-dmnIdeal['Focus'],predicted)[0]],'subj':[subj]})
        else:
            # Later subjects: append to the existing frames.
            df=pd.DataFrame({'TR':range(408),'predicted':predicted,'subj':[subj]*408})
            groupGLM=pd.concat((groupGLM,df),ignore_index=True)
            coefs=pd.concat((coefs,pd.DataFrame({'Coef':coef,'pe':range(X.shape[1]),'subj':[subj]*X.shape[1]})),ignore_index=True)
            performance=pd.concat((performance,pd.DataFrame({'R':[pearsonr(dmnIdeal['Wander']-dmnIdeal['Focus'],predicted)[0]],'subj':[subj]})),ignore_index=True)
    return groupGLM,coefs,performance
def allDirectionalityRatios(ratioFunction):
    """
    A simple plot which calculates all directionality ratios, plots them
    and puts lines at 20 top highly expressed genes
    (Supp figure from our paper)
    This is mostly matplotlib code.

    ``ratioFunction`` is called with a dataset name and must return a
    sequence of ratios.  One subplot is drawn per entry of the module-level
    ``datasets``; the process exits once the figure has been shown.
    """
    if not os.path.exists("savedHeatmaps"):
        os.mkdir("savedHeatmaps")
    # NOTE(review): the reference ratio is log-transformed while the
    # per-dataset ratios below are not -- confirm this is intended.
    wildRatio = np.log(ratioFunction("Wildtype_0min_BglII_rep1"))
    for j, dataset in enumerate(datasets):
        ax = plt.subplot(len(datasets), 1, j + 1)
        curRatio = (ratioFunction(dataset))
        # Title carries the Pearson r and p-value against the reference.
        plt.title("{1}, r = {0:.2f}, p={2:.2e}".format(pearsonr(curRatio, wildRatio)[0], names[dataset], pearsonr(curRatio, wildRatio)[1]), fontsize=10)
        plt.tick_params(axis='both', which='major', labelsize=10)
        plt.tick_params(axis='both', which='minor', labelsize=8)
        plt.plot(curRatio)
        plt.ylim((0.25, 0.75))
        plt.xlim((0, len(curRatio)))
        #plt.ylim((0, 1))
        plt.yticks((0.25, 0.5, 0.75))
        # Coordinates of the genes to mark; divided by 10000 to get the
        # x position of each vertical line.
        geneCoor = [1162773, 3509071, 1180887, 543099, 1953250, 2522439, 3328524, 1503879, 900483, 242693, 3677144, 3931680, 3677704, 3762707, 3480870, 3829656, 1424678, 901855, 1439056, 3678537]
        genePos = [i / 10000. for i in geneCoor]
        #genePos = []
        for lpos in genePos:
            plt.vlines(lpos , -.8, .8, alpha=0.2, linewidth=1, color="black")
        # Tick positions are kept but their labels are blank.
        plt.xticks([0, 50, 100, 150, 200, 250, 300, 350, 400], ["" for i in xrange(9)], fontsize=98)
        removeAxes(ax=ax)
    plt.subplots_adjust(0.07, 0.05, 0.94, 0.95, 0.2, 0.5)
    plt.show()
    # Terminates the interpreter after the figure window is closed.
    exit()
def dualPlot(age, meanWithin, meanBetween, title):
    """Plot within- and between-network connectivity against age, side by side.

    Each panel shows the raw points, a first-degree least-squares fit and a
    legend entry with the Pearson r (2 decimals) and p-value (4 decimals).
    The left panel's fit is red, the right panel's blue.  The figure is shown,
    the function waits for Enter on stdin, then closes the figure.

    ``age`` must provide ``.min()`` and ``.max()`` (e.g. a numpy array);
    ``meanWithin`` and ``meanBetween`` are sequences of the same length.
    """
    fig, (within, between) = plt.subplots(1, 2, sharex=True, sharey=False)

    # Common x grid for both fitted lines.
    xnew = np.arange(age.min() - 1, age.max() + 1, 0.1)

    panels = (
        (within, meanWithin, "within network", "r"),
        (between, meanBetween, "between network", "b"),
    )
    for axis, values, heading, colour in panels:
        coefficients = np.polyfit(age, values, 1)
        fitted = np.polyval(coefficients, xnew)
        corr, p_value = st.pearsonr(age, values)

        axis.set_title(heading)
        axis.plot(age, values, "k.")
        axis.plot(xnew, fitted, colour,
                  label=(str(np.round(corr, 2)) + " " + str(np.round(p_value, 4))))
        # Age is plotted on x and connectivity on y; the labels used to be
        # the other way round.
        axis.set_xlabel("age")
        axis.set_ylabel("mean connectivity")
        axis.legend()

    fig.suptitle(title)
    plt.show()

    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        wait_for_enter = raw_input
    except NameError:
        wait_for_enter = input
    wait_for_enter("Press Enter to continue...")
    plt.close()
def main(kernel, stats_dir, outputfile=None):
    """Correlate per-dataset variation with the per-dataset difference and plot the fit.

    For every name in ``SIVAL`` the value from ``get_vars()`` is paired with
    ``difference(stats, name)``, where ``stats`` comes from
    ``get_results(stats_dir, kernel)``.  The Pearson correlation and p-value
    are printed, and a scatter plot with the least-squares line is either
    shown (``outputfile`` is None) or saved to ``outputfile`` as a PDF.
    """
    stats = get_results(stats_dir, kernel)
    variations = get_vars()
    x = np.array([variations[s] for s in SIVAL])
    y = np.array([difference(stats, s) for s in SIVAL])

    correlation = pearsonr(x, y)
    r = correlation[0]
    p = correlation[1]
    print('Correlation: %f' % r)
    print('P-value: %f' % p)

    # Least-squares line: columns are [x, 1].
    design = np.column_stack([x, np.ones(x.size)])
    bestfit = np.linalg.lstsq(design, y)[0]
    xx = np.linspace(np.min(x), np.max(x), 3)
    yy = np.dot(np.column_stack([xx, np.ones(xx.size)]), bestfit)

    fig = pl.figure(figsize=(16,8))
    ax = fig.add_subplot(111)
    ax.scatter(x, y, s=25, edgecolor='none', color='k')
    ax.plot(xx, yy, '-', lw=3, color='k')

    if outputfile is not None:
        pdf = PdfPages(outputfile)
        pdf.savefig(fig, bbox_inches='tight')
        pdf.close()
    else:
        pl.show()
def correlation_bw_sinks(ref_sink,other_sinks, cut=True):
    '''Correlate the reference sink's mdot with the sinks of other models.

    ref_sink -- CombinedSinkData() object
    other_sinks -- dictionary of CombinedSinkData() objects
    cut -- removes the first 1 tBH from the mdot data, because this is a
        transient phase that always increases and we want to know whether the
        equilibrium rates are correlated

    Row 0 of ref_sink.mdot is compared with rows 0..63 of every other model's
    mdot: row 0 counts as "same location", rows 1..63 as "different location".

    Returns (r_same_loc, r_diff_loc): lists of Pearson r correlation
    coefficients.  Earlier versions collected element [1] of the pearsonr
    result, which is the p-value, not r.
    '''
    n_locations = 64  # rows of mdot compared per model
    r_same_loc, r_diff_loc = [], []
    for key in other_sinks.keys():
        other = other_sinks[key]
        size = min(other.mdot.shape[1], ref_sink.mdot.shape[1])

        # With cut=True the two series may start at different indices, so the
        # end indices are shifted by the *other* series' start to keep both
        # slices the same length.
        cut_ref = index_before_tBH(ref_sink, 1.)
        cut_other = index_before_tBH(other, 1.)
        end_ref = size - other.mdot[:, :cut_other].shape[1]
        end_other = size - ref_sink.mdot[:, :cut_ref].shape[1]

        if cut:
            ref_window = slice(cut_ref, end_ref)
            other_window = slice(cut_other, end_other)
        else:
            ref_window = other_window = slice(None, size)

        ref_series = ref_sink.mdot[0, ref_window]
        for i in range(n_locations):
            r = pearsonr(other.mdot[i, other_window], ref_series)[0]
            if i == 0:
                r_same_loc.append(r)  # same location
            else:
                r_diff_loc.append(r)  # different location

    assert(len(r_same_loc) == len(other_sinks.keys()))
    assert(len(r_diff_loc) == (len(other_sinks.keys()) * (n_locations - 1)))
    return r_same_loc, r_diff_loc
def new_abbas():
    """Build the grid of mean Pearson correlations for the Abbas log files.

    For i and j in 0, 5, ..., 95 the file
    '../../../Master_files/abbas/Abbas_log_<i>_<j>' is read: lines 3-6,
    tab-separated fields 1-4.  The 4x4 table is transposed, every row is
    normalised to sum to 1 and correlated with the matching reference row
    below; the four coefficients are averaged.

    Returns a list of lists indexed [j][i] (the i-major grid, transposed).
    """
    # Reference rows A-D; columns follow the original jurkat, im9, raji,
    # thp1 variable order.
    references = (
        [0.25, 0.125, 0.25, 0.375],
        [0.05, 0.317, 0.475, 0.158],
        [0.01, 0.495, 0.165, 0.33],
        [0.002, 0.333, 0.333, 0.333],
    )

    result = []
    for i in range(0, 96, 5):
        liste = []
        for j in range(0, 96, 5):
            path = '../../../Master_files/abbas/Abbas_log_' + str(i) + '_' + str(j)
            table = []
            for line_no in range(3, 7):
                fields = linecache.getline(path, line_no).split('\t')
                table.append([float(fields[1]), float(fields[2]),
                              float(fields[3]), float(fields[4])])

            # Transpose, then normalise each row to fractions.
            cells = [list(column) for column in zip(*table)]
            for k, row in enumerate(cells):
                total = sum(row)
                cells[k] = [float(value) / total for value in row]

            r = [stats.pearsonr(cells[k], references[k])[0] for k in range(4)]
            liste.append((r[0] + r[1] + r[2] + r[3]) / 4.0)
        result.append(liste)

    return [list(column) for column in zip(*result)]
def corr(self):
    """Store the Pearson result of every route metric against route possibility.

    For each of RMSE, TRMSE, area, CLength and CEdges the values over
    ``self.routes`` are correlated with ``possibility`` and the result is set
    as ``self.<metric>_corr``.  The same is then done for the ``_Rank``
    variants against ``possibility_Rank``, stored as ``self.<metric>_Rank_corr``.
    """
    metrics = ['RMSE', 'TRMSE', 'area', 'CLength', 'CEdges']
    for suffix in ('', '_Rank'):
        reference = [route['possibility' + suffix] for route in self.routes]
        for metric in metrics:
            column = metric + suffix
            values = [route[column] for route in self.routes]
            setattr(self, column + '_corr', pearsonr(values, reference))
def ks_2d_2samp(data1, data2):
    """Computes the 2-dimensional Kolmogorov-Smirnov statistic on 2 samples.

    This is a two-sided test for the null hypothesis that 2 independent
    samples are drawn from the same continuous distribution.

    Parameters :
        data1, data2 : 2-D ndarrays, one row per point
            Column 0 and column 1 are the two coordinates.  The samples are
            assumed to be drawn from a continuous distribution; sample sizes
            can be different.

    Returns (per the original documentation) :
        D : float
            KS statistic
        p-value : float
            two-tailed p-value.  High value means we cannot reject the
            hypothesis that they are from the same distribution.
            low D => high p

    NOTE(review): the code visible here stops after computing ``ksarg`` and
    returns nothing; the part that turns ``ksarg`` into a p-value and returns
    ``(D, p)`` is not present and has not been guessed at.
    """
    npt1 = np.shape(data1)[0]
    # Compute D using data1 as the origins
    D1 = np.max([max_diff_for_orig(data1[i, :], data1, data2) for i in range(npt1)])
    npt2 = np.shape(data2)[0]
    # Compute D using data2 as the origins
    D2 = np.max([max_diff_for_orig(data2[i, :], data1, data2) for i in range(npt2)])
    # Their mean
    D = (D1 + D2) / 2.

    # The approximate p-value: this is detailed in NR 14.8
    neff = npt1 * npt2 / (1. * npt1 + npt2)
    (rr1, p) = st.pearsonr(data1[:, 0], data1[:, 1])
    (rr2, p) = st.pearsonr(data2[:, 0], data2[:, 1])
    reff = (rr1 ** 2 + rr2 ** 2) / 2.
    # 0.25 was written as ``025``: an octal literal (21) on Python 2 and a
    # SyntaxError on Python 3.
    # NOTE(review): NR multiplies D by sqrt(neff) in the numerator, whereas
    # this uses neff itself -- confirm against the reference before relying
    # on the value.
    ksarg = neff * D / (1 + np.sqrt(1 - reff) * (0.25 - 0.75 / np.sqrt(neff)))
def test_recon(pAnom, scaled, start1, end1, start2, end2):
    """Compute reconstruction statistics.

    Period 1 (``start1:end1``) and period 2 (``start2:end2``) are sliced from
    both inputs with ``.ix`` and flattened.  Returns a dict with:

    * ``mse``  -- sum of squared differences between ``pAnom`` and ``scaled``
      over period 2 (a sum, not divided by the number of points)
    * ``re``   -- 1 - SSD / sum of squares about the period-1 mean of ``pAnom``
    * ``ce``   -- 1 - SSD / sum of squares about the period-2 mean of ``pAnom``
    * ``rcal`` / ``rver`` -- Pearson r over period 1 / period 2
    * ``glk``  -- ``GLK(pAnom, scaled)`` over period 2
    """
    def window(frame, start, end):
        return frame.ix[start:end].values.flatten()

    def sum_of_squares(values):
        return (values * values).sum()

    obs1 = window(pAnom, start1, end1)
    obs2 = window(pAnom, start2, end2)
    rec1 = window(scaled, start1, end1)
    rec2 = window(scaled, start2, end2)

    mean1 = obs1.mean()
    mean2 = obs2.mean()

    SSD = sum_of_squares(obs2 - rec2)
    SSMa = sum_of_squares(obs2 - mean1)
    SSMb = sum_of_squares(obs2 - mean2)

    stats_out = {
        'mse': SSD,
        're': 1 - SSD/SSMa,
        'ce': 1 - SSD/SSMb,
    }
    stats_out['glk'] = GLK(obs2, rec2)
    stats_out['rcal'] = pearsonr(obs1, rec1)[0]
    stats_out['rver'] = pearsonr(obs2, rec2)[0]
    return stats_out
def train(df, target='ki'):
    """Fit a TF-IDF + random-forest regressor on ``df['tokens']`` to predict ``df['ki']``.

    Half of the rows are held out.  The fitted pipeline is dumped with joblib
    to a fixed directory under the name ``rf.<target>.pkl``, and the Pearson
    results of the held-out ``ki`` values against the prediction, the
    ``dfire`` column and the ``uncor_dfire`` column are printed.

    ``target`` only affects the output file name; the label column is always
    ``'ki'``.
    """
    feature_names = df.columns.tolist()
    feature_names.remove('ki')
    labels = df['ki']
    features = df[feature_names]

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.5)

    vectorizer = TfidfVectorizer(max_df=1.0,
                                 min_df=0.0,
                                 lowercase=False,
                                 token_pattern=r'(?u)\b\S+\b',
                                 analyzer='word')
    forest = RandomForestRegressor(n_estimators=50, n_jobs=1)
    pipe_line = Pipeline([('tfidf', vectorizer), ('model', forest)])
    pipe_line.fit(X_train['tokens'], y_train)

    ofn = "/work/jaydy/working/pdbbind/PDBBind_refined_15/rf.{}.pkl".format(
        target)
    joblib.dump(pipe_line, ofn)

    prediction = pipe_line.predict(X_test['tokens'])

    # Compute all three results before printing any of them.
    reports = [
        ("rf correlation: {}", pearsonr(y_test, prediction)),
        ("dfire correlation: {}", pearsonr(y_test, X_test['dfire'])),
        ("uncorrelated-dfire correlation: {}",
         pearsonr(y_test, X_test['uncor_dfire'])),
    ]
    for template, value in reports:
        print(template.format(value))
def wordsim(self, path = "wordsim/wordsim353/combined.tab"): (pairs, scores) = self.loadCorpus(path) #m = self.loadSenna("../senna/embeddings/embeddings.txt","../senna/hash/words.lst") #dict{str: np.array()} m = Word2Vec.load_word2vec_format("../google_data/GoogleNews-vectors-negative300.bin.gz", binary=True) print "--- Original Pairs: ---" for pair in pairs: print pair words = set(m.index2word) (pairs,nums) = self.checkWords(m, pairs) print "--- After Matching: ---" ### For WS dataset. #nums = [0, 1, 2, 3, 5, 7, 8, 9, 11, 12, 13, 16, 17, 19, 23, 24, 25, 27, 28, 29, 30, 31, 32, 36, 37, 40, 43, 44, 49, 54, 55, 56, 57, 58, 59, 60, 61, 62, 65, 70, 74, 75, 83, 84, 85, 86, 88, 90, 94, 96, 97, 98, 99, 100, 102, 107, 109, 110, 111, 112, 113, 114, 115, 116, 117, 119, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 135, 136, 137, 141, 142, 146, 147, 148, 150, 151, 152, 153, 154, 155, 156, 161, 162, 163, 164, 165, 169, 171, 173, 174, 177, 178, 183, 184, 188, 190, 191, 194, 197, 198, 206, 210, 213, 214, 218, 219, 220, 221, 224, 225, 226, 227, 228, 230, 235, 238, 242, 247, 255, 256, 257, 259, 260, 267, 269, 273, 275, 277, 278, 279, 280, 282, 285, 286, 287, 288, 289, 291, 296, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 314, 317, 318, 320, 321, 324, 325, 332, 334, 335, 336, 340, 343, 344, 347, 348, 350, 351, 352] print nums print "Original Number of Words",len(pairs) for pair in pairs: print pair matched_pairs = [pairs[num] for num in nums] matched_scores = [scores[num] for num in nums] print "--- After deleting unmatched: ---" print "Number of remaining words", len(matched_pairs) print matched_pairs print matched_scores cosine_scores = [] for tmp in matched_pairs: cosine = 1 - spatial.distance.cosine(m[tmp[0]], m[tmp[1]]) cosine_scores.append(cosine) print "--- After calculating cosine scores:--- " print cosine_scores print "--- Spearman Corelation ---" print stats.spearmanr(matched_scores, cosine_scores) print 
stats.pearsonr(matched_scores, cosine_scores)
def new_cibersort():
    """Build the grid of mean Pearson correlations for the CIBERSORT log files.

    For i and j in 0, 5, ..., 95 the file
    '../../../Master_files/output/CIBERSORT_R_log_<i>_<j>' is read: lines 2-5,
    tab-separated fields 1-4.  Each of the four rows is correlated with the
    matching reference row below and the four coefficients are averaged.

    Returns a list of lists indexed [j][i] (the i-major grid, transposed).
    """
    # Reference rows A-D; columns follow the original jurkat, im9, raji,
    # thp1 variable order.
    references = (
        [0.25, 0.125, 0.25, 0.375],
        [0.05, 0.317, 0.475, 0.158],
        [0.01, 0.495, 0.165, 0.33],
        [0.002, 0.333, 0.333, 0.333],
    )

    result = []
    for i in range(0, 96, 5):
        liste = []
        for j in range(0, 96, 5):
            path = '../../../Master_files/output/CIBERSORT_R_log_' + str(i) + '_' + str(j)
            cells = []
            for line_no in range(2, 6):
                fields = linecache.getline(path, line_no).split('\t')
                cells.append([float(fields[1]), float(fields[2]),
                              float(fields[3]), float(fields[4])])

            r = [stats.pearsonr(cells[k], references[k])[0] for k in range(4)]
            liste.append((r[0] + r[1] + r[2] + r[3]) / 4.0)
        result.append(liste)

    #result = reversed(result)
    return [list(column) for column in zip(*result)]
def _call(self, dataset):
    """Computes the average correlation in similarity structure across chunks.

    For every chunk the neural dissimilarity (1 - correlation) is computed
    between the two targets of ``self.targ_comp1`` and between the two
    targets of ``self.targ_comp2``.  These are paired, by chunk label plus a
    '1'/'2' suffix, with the behavioural values in ``self.xSs_behav1`` and
    ``self.xSs_behav2`` and correlated across chunks.

    ``self.comparison_metric == 'spearman'`` selects a rank correlation
    (Pearson on the ranks); anything else gives plain Pearson.  Previously
    the Spearman result was always overwritten by the Pearson one.

    Returns a ``Dataset`` holding the Fisher z-transformed coefficient.

    Raises ValueError when the dataset has fewer than two chunks (ValueError
    is still caught by ``except StandardError`` on Python 2).
    """
    chunks_attr = self.chunks_attr
    chunk_labels = np.unique(dataset.sa[chunks_attr])
    if len(chunk_labels) < 2:
        raise ValueError("This measure calculates similarity consistency across "
                         "chunks and is not meaningful for datasets with only "
                         "one chunk:")

    # Neural dissimilarity between the compared targets, per subject.
    neur_sim = {}
    for s in chunk_labels:
        ds_s = dataset[dataset.sa.chunks == s]
        neur_sim[s+'1'] = 1 - np.corrcoef(ds_s[ds_s.sa.targets == self.targ_comp1[0]],
                                          ds_s[ds_s.sa.targets == self.targ_comp1[1]])[0][1]
        neur_sim[s+'2'] = 1 - np.corrcoef(ds_s[ds_s.sa.targets == self.targ_comp2[0]],
                                          ds_s[ds_s.sa.targets == self.targ_comp2[1]])[0][1]

    # Combine the two behavioural dictionaries under suffixed keys.
    xSs_behav = {}
    for s in self.xSs_behav1:
        xSs_behav[s+'1'] = self.xSs_behav1[s]
    for s in self.xSs_behav2:
        xSs_behav[s+'2'] = self.xSs_behav2[s]

    # Columns: behavioural value, neural dissimilarity.
    behav_neur = np.array([[xSs_behav[s], neur_sim[s]] for s in neur_sim])

    # Correlate behaviour with neural dissimilarity between subjects.
    if self.comparison_metric == 'spearman':
        xSs_corr = pearsonr(rankdata(behav_neur[:,0]), rankdata(behav_neur[:,1]))
    else:
        xSs_corr = pearsonr(behav_neur[:,0], behav_neur[:,1])

    # Returns the Fisher z-transformed r; could be changed to the p-value if wanted.
    return Dataset(np.array([np.arctanh(xSs_corr[0])]))
def extractExpFit(self, sample):
    """Score how closely each profile of ``sample`` follows an exponential decay.

    For every profile along axis 0 the natural log of the values is
    correlated (Pearson) with the position index; the negated coefficient is
    the score, with NaN counted as 0.  Scores are averaged over profiles.

    A 3-D ``sample`` is scored separately for its first three channels (last
    axis) and yields an array of three values of
    ``ExtractTool.outputDType``; otherwise a single ``ExtractTool.outputDType``
    scalar is returned.
    """
    samplingRange = sample.shape[1]
    profile_count = sample.shape[0]
    as_float = sample.astype(np.float32)
    positions = np.arange(0, samplingRange)

    def decay_score(values):
        score = -st.pearsonr(positions, np.log(values))[0]
        if math.isnan(score):
            return 0.0
        return score

    totals = np.array([0.0, 0.0, 0.0])
    for row in range(profile_count):
        profile = as_float[row, :]
        if profile.ndim == 2:
            for channel in range(3):
                totals[channel] += decay_score(profile[:, channel])
        else:
            totals[0] += decay_score(profile)
    totals /= profile_count

    if sample.ndim == 3:
        return np.array(totals, dtype=ExtractTool.outputDType)
    return ExtractTool.outputDType(totals[0])
def r2(x, y):
    """Return the squared Pearson correlation coefficient of ``x`` and ``y``."""
    coefficient = stats.pearsonr(x, y)[0]
    return coefficient ** 2
def PCA1():
    """Exploratory analysis and PCA walkthrough on the UCI wine data set.

    Downloads the data, renames the columns V1..Vn, draws several plots,
    prints the correlation matrix and selected correlations, standardises the
    variables, fits a PCA and prints/plots the first two components.  Takes
    no arguments and returns None.
    """
    print (rcsetup.all_backends)
    data = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data", header=None)
    data.columns
    # rename column names to be similar to R naming convention
    data.columns = ["V"+str(i) for i in range(1, len(data.columns)+1)]
    data.V1 = data.V1.astype(float)
    # independent variables data
    # NOTE(review): the slice starts at "V1", so X also contains the column
    # used as Y below -- confirm whether "V2": was intended.
    X = data.loc[:, "V1":]
    # dependent variable data
    Y = data.V1
    #data
    #print (X)
    #if you want them stacked vertically
    #f, (ax1, ax2, ax3) = plt.subplots(1, 3)
    #==============================================================================
    # Scatter plot
    #==============================================================================
    pd.tools.plotting.scatter_matrix(data.loc[:, "V2":"V6"], diagonal="hist")
    plt.tight_layout()
    plt.show()
    sns.lmplot("V4", "V5", data, hue="V1", fit_reg=True)
    #ax.xaxis.tick_top()
    #==============================================================================
    # Profile plot
    #==============================================================================
    ax = data[["V2","V3","V4","V5","V6"]].plot()
    plt.figure()
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5));
    #==============================================================================
    # Summary statistics
    #==============================================================================
    # (disabled code kept as a bare string literal)
    ''' print (X.apply(np.mean)) print (X.apply(np.std)) '''
    #==============================================================================
    #Extract out just cultivar 2 - for example (same can be done for cultivar 1 and 3)
    #==============================================================================
    # (disabled code kept as a bare string literal)
    ''' class2data = data[Y==2] print (class2data.loc[:, "V2":].apply(np.mean)) print (class2data.loc[:, "V2":].apply(np.std)) '''
    #==============================================================================
    # Within and Between Groups Variance
    #==============================================================================
    #printMeanAndSdByGroup(X, Y)
    # (disabled code kept as a bare string literal)
    ''' print (calcWithinGroupsVariance(X.V2, Y)) print (calcBetweenGroupsVariance(X.V2, Y)) calcSeparations(X, Y) print ("Within Group Co-Variance = ", calcWithinGroupsCovariance(X.V8, X.V11, Y)) print ("Between Group Co-Variance = ", calcBetweenGroupsCovariance(X.V8, X.V11, Y)) '''
    #==============================================================================
    # Correlation text matrix and the heatmap
    #==============================================================================
    corrmat = X.corr()
    print ("\n *****FIRST DATA OUTPUT: Co-orelation matrix*****::\n\n", corrmat)
    plt.figure()
    sns.heatmap(corrmat, vmax=1., square=True)
    # NOTE: ``ax`` is still the axes of the profile plot created above, not
    # the heatmap's axes.
    ax.xaxis.tick_top()
    #==============================================================================
    # Most highly correlated
    #==============================================================================
    cor = stat.pearsonr (X.V2, X.V3)
    print ("\n ***** SECOND DATA OUTPUT *****::\n\n")
    print ("Cor:", cor[0], "\t p-value:", cor[1], "\n")
    print ("\n ***** THIRD DATA OUTPUT *****::\n\n")
    print (mosthighlycorrelated(X, 10))
    #==============================================================================
    # Standardize before running PCA
    #==============================================================================
    standardisedX = scale(X)
    standardisedX = pd.DataFrame(standardisedX, index=X.index, columns=X.columns)
    # The next two results are computed and discarded.
    standardisedX.apply(np.mean)
    standardisedX.apply(np.std)
    #==============================================================================
    # Run the PCA process
    #==============================================================================
    ''' PCA Process '''
    pca = PCA().fit(standardisedX)
    summary = pca_summary(pca, standardisedX)
    plt.figure()
    screeplot(pca, standardisedX)
    #==============================================================================
    # First Principal Component
    #==============================================================================
    print ("\n ***** FIRST PRINCIPAL COMPONENT *****::\n\n")
    print (pca.components_[0])
    print ("Sum of Variances:", np.sum(pca.components_[0]**2))
    #Calculate the values of the first principal component
    print (calcpc(standardisedX, pca.components_[0]))
    #Another way - Calculate the values of the first principal component
    #print (pca.transform(standardisedX)[:, 0])
    #==============================================================================
    # Second Principal Component
    #==============================================================================
    print ("\n ***** SECOND PRINCIPAL COMPONENT *****::\n\n")
    print (pca.components_[1])
    print ("Sum of Variances: ", np.sum(pca.components_[1]**2))
    #Calculate the values of the second principal component
    print (calcpc(standardisedX, pca.components_[1]))
    #Another way - Calculate the values of the second principal component
    #print (pca.transform(standardisedX)[:, 1])
    #==============================================================================
    # Scatter Plot for the principal components
    #==============================================================================
    pca_scatter(pca, standardisedX, Y)
    return
import pandas as pd
from scipy.stats import pearsonr

# Load the data set and index it by the sample identifier column.
df = pd.read_csv('wdbc.csv')
df = df.set_index('ID_number')

# pearsonr returns (coefficient, p-value); only the coefficient is reported.
corr = pearsonr(df.area, df.perimeter)[0]
print('Pearsons Correlation Coefficient of Area and Perimeter: %.3f' % corr)

# Fixed explanatory text printed after the coefficient (the 0.987 quoted in
# it is hard-coded, not taken from ``corr``).
interpretation = (
    'The pearsonr() SciPy function is used to calculate the Pearson correlation coefficient between two data samples \n'
    'Area and Perimeter. The Pearsons Correlation Coefficient of Area and Perimeter is 0.987, which indicates a notable \n'
    'correlation between the Area and Perimeter of cell nucleus. Therefore, we can interpret the area of the cell nucleus is \n'
    'proportional to its perimeter.'
)
print(interpretation)
def validate():
    """
    run KFOLD method for regression

    For every tide-gauge file selected from ``dir_in`` this:

    1. loads the lagged predictors, appends squared and cubed copies of the
       columns whose name starts with ``'wnd'`` and standardizes every
       column except the first one;
    2. loads the surge series of the same file name from ``surge_path``,
       drops rows with a duplicated ``'ymd'`` and aligns it with the
       predictors on ``'date'``;
    3. reduces the predictors with a PCA keeping 95% of the variance and
       fits a linear regression with 10-fold (unshuffled) cross validation;
    4. averages the Pearson correlation and RMSE over the folds whose
       correlation is significant (p < 0.05) and writes the accumulated
       results table to ``dir_out`` under the tide gauge's file name.

    The function changes the process working directory as it goes and
    returns nothing.
    """
    # defining directories
    dir_in = "/lustre/fs0/home/mtadesse/eraFiveLag"
    dir_out = "/lustre/fs0/home/mtadesse/mlrValidation"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    # cd to the lagged predictors directory
    os.chdir(dir_in)

    # half-open range of positions in os.listdir() to process
    # NOTE(review): os.listdir() order is arbitrary, so these positions are
    # not guaranteed to select the same files on every run.
    tg_start = 52
    tg_stop = 53

    result_columns = ['tg', 'lon', 'lat', 'num_year',
                      'num_95pcs', 'corrn', 'rmse']

    # empty dataframe for model validation
    df = pd.DataFrame(columns=result_columns)

    # looping through
    for tg in range(tg_start, tg_stop):

        os.chdir(dir_in)

        tg_name = os.listdir()[tg]
        print(tg, tg_name)

        # load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        # add squared and cubed wind terms (as in WPI model)
        wndTerms = [col for col in pred.columns if col.startswith('wnd')]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        # standardize predictor data (everything but the first column,
        # which is presumably 'date' - it is re-attached right below)
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat),
                                        columns=dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized],
                                      axis=1)

        # load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        # remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index,
                   axis=0, inplace=True)
        surge.reset_index(drop=True, inplace=True)

        # adjust surge time format to match that of pred
        # ('YYYY-MM-DD' -> 'YYYY-MM-DD 00:00:00')
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])),
                                  columns=['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]],
                              axis=1)

        print("it's all good!", '\n')
        print(pred_standardized.shape)

        # attach the surge value of each predictor date; the left merge
        # keeps one row per predictor row, in the predictors' order
        predSubset = pred_standardized.iloc[:, :2]
        dateSubset = pd.merge(predSubset, surge_new.iloc[:, :2],
                              on='date', how='left')
        pred_standardized['surge'] = dateSubset['surge']
        pred_surge = pred_standardized.copy()
        del pred_standardized

        # find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis=1)]
        pred_surge.drop(row_nan.index, axis=0, inplace=True)
        pred_surge.reset_index(drop=True, inplace=True)

        # in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-'*80)
            print("Predictors and Surge don't overlap")
            print('-'*80)
            continue

        pred_surge['date'] = pd.DataFrame(
            list(map(time_stamp, pred_surge['date'])), columns=['date'])

        # prepare data for training/testing
        X = pred_surge.iloc[:, 1:-1]
        y = pd.DataFrame(pred_surge['surge']).reset_index(drop=True)

        # apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        # apply 10 fold cross validation
        # random_state is only meaningful together with shuffle=True and
        # recent scikit-learn raises ValueError otherwise; the folds were
        # never shuffled, so it is simply left out.
        kf = KFold(n_splits=10)

        metric_corr = []
        metric_rmse = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X_pca[train_index], X_pca[test_index]
            y_train = y['surge'][train_index]
            y_test = y['surge'][test_index]

            # train regression model
            lm = LinearRegression()
            lm.fit(X_train, y_train)

            # predictions
            predictions = lm.predict(X_test)

            # evaluation matrix - check p value
            fold_corr = stats.pearsonr(y_test, predictions)
            if fold_corr[1] >= 0.05:
                print("insignificant correlation!")
                continue

            print(fold_corr)
            metric_corr.append(fold_corr[0])

            fold_rmse = np.sqrt(
                metrics.mean_squared_error(y_test, predictions))
            print(fold_rmse)
            metric_rmse.append(fold_rmse)

        # number of years used to train/test model
        num_years = (pred_surge['date'][pred_surge.shape[0]-1] -
                     pred_surge['date'][0]).days/365
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1]  # number of principal components
        corr = np.mean(metric_corr)
        rmse = np.mean(metric_rmse)

        print('num_year = ', num_years, ' num_pc = ', num_pc,
              'avg_corr = ', np.mean(metric_corr), ' -  avg_rmse (m) = ',
              np.mean(metric_rmse), '\n')

        # original size and pca size of matrix added
        new_df = pd.DataFrame([tg_name, longitude, latitude, num_years,
                               num_pc, corr, rmse]).T
        new_df.columns = result_columns
        df = pd.concat([df, new_df], axis=0)

        # save df as cs - in case of interruption
        os.chdir(dir_out)
        df.to_csv(tg_name)

        # cd to dir_in
        os.chdir(dir_in)
def score(self, preds, targets):
    """Format the Pearson correlation of the predictions and a baseline.

    The first column of ``preds['target']`` is correlated with
    ``targets['target']`` and then with ``shuffle(targets['target'])``
    (``shuffle`` comes from the enclosing module; presumably it returns a
    permuted copy - confirm).

    Returns:
        str: ``"<score>/<baseline>"``, both rendered with four decimals.
    """
    first_column = preds['target'][:, 0]
    observed_r = pearsonr(first_column, targets['target'])[0]
    shuffled_r = pearsonr(first_column, shuffle(targets['target']))[0]
    return "{0:.4f}/{1:.4f}".format(observed_r, shuffled_r)
savefig('figures/draw_graph_fa_{}_cluster_trajectory_age_k{}.png' .format(NAMESPACE, knn), ax) if VIZ_CORR_PSEUDOTIME: sc.pp.neighbors(adata, n_neighbors=20) draw_graph(adata, layout='fa') tprint('Diffusion pseudotime analysis...') tprint('pseudotime') sc.tl.diffmap(adata) adata.uns['iroot'] = np.flatnonzero(adata.obs['age'] < 14.6)[0] sc.tl.dpt(adata) finite_idx = np.isfinite(adata.obs['dpt_pseudotime']) tprint(pearsonr(adata.obs['dpt_pseudotime'][finite_idx], adata.obs['age'][finite_idx])) tprint(spearmanr(adata.obs['dpt_pseudotime'][finite_idx], adata.obs['age'][finite_idx])) ax = sc.pl.draw_graph( adata, color='dpt_pseudotime', edges=True, edges_color='#CCCCCC', color_map='inferno', show=False, ) savefig('figures/draw_graph_fa_{}_cluster_trajectory_dpt.png' .format(NAMESPACE), ax) pair2corr = {} assert(len(gene_pairs) == X_dimred.shape[1]) for pair_idx, pair in enumerate(gene_pairs): pair2corr[pair] = pearsonr( X_dimred[finite_idx, pair_idx],
# NOTE(review): the statement below is the tail of a definition that starts
# before this section; it is kept untouched.
return val1


def p_val_mi(x, y):
    """Estimate a permutation p-value for the adjusted mutual information.

    The adjusted mutual information of ``x`` and ``y`` is computed once,
    then both sequences are shuffled 10000 times and the score recomputed.

    NOTE(review): the return value of ``shuffle`` is discarded, so it is
    presumably an in-place shuffle (e.g. ``random.shuffle``), which means
    the caller's ``x`` and ``y`` are left permuted - confirm.

    Returns:
        float: fraction of shuffles whose score is >= the initial score.
    """
    count = 0.0
    iterations = 10000
    # score of the data as passed in, used as the reference value
    score = metrics.adjusted_mutual_info_score(x, y)
    for i in range(iterations):
        shuffle(x)
        shuffle(y)
        if metrics.adjusted_mutual_info_score(x, y) >= score:
            count += 1.0
    return count / iterations


# Pearson correlation coefficient of x and y.
pearsonc = lambda x, y: stats.pearsonr(np.array(x), np.array(y))[0]
# p-value reported by the Pearson correlation test.
p_val_test1 = lambda x, y: stats.pearsonr(np.array(x), np.array(y))[1]
# Adjusted mutual information of the binned variables.
ajd_mi_bin = lambda x, y: metrics.adjusted_mutual_info_score(
    bin_variable(x), bin_variable(y))
# Permutation p-value (see p_val_mi) of the binned variables.
p_val_test2 = lambda x, y: p_val_mi(bin_variable(x), bin_variable(y))
# Spearman rank correlation coefficient of x and y.
spearmanc = lambda x, y: stats.spearmanr(np.array(x), np.array(y))[0]

# In[ ]:

## Evaluating p-distribution for p-value computation
num_samples_estimation = 100000
sig_to_noise_rate = [j * 0.1 for j in range(21)]  #0, 0.1 ,...,0.9, 1,...2
# the first rate (0) is replaced by a small positive value
sig_to_noise_rate[0] = 0.0001
#num_points=[k*10 for k in range(1,16)]#10,20,30,...150
df = pd.read_csv('ANDR1602_clean.csv', sep=',')

# Columns treated as predictors, and the single column treated as target.
features = [
    "wind direction",
    "temperature",
    "humidity",
    "pressure",
    "dewpoint",
    "wind speed at 2 meters",
    "solar radiation",
]
target = ["wind speed"]

# 0 .. samples-1 (``samples`` is defined earlier in the file)
X = list(range(samples))
features_df = df[features]
y = df[target]

#### correlation calculation ###
# Pearson correlation (and its p-value) of every feature column with the
# target column, printed one line per feature.
for f_index, f in enumerate(features):
    my_target = y.values[:, 0]
    my_feature = features_df.values[:, f_index]
    coeff, pvalue = pearsonr(my_feature, my_target)
    print("feature:", f, ", yield:", target[0], "; coeff:", coeff, ";p-value:", pvalue)
def bounds(input_bigWig, peaks_df, peak_width, smoothing_params=(7, 81)):
    """
        Function to compute lower & upper bounds, and average profile
        performance for mnll, cross entropy, jsd, pearson and spearman
        metrics

        Every peak profile is compared against three references: the
        uniform profile, the average profile over all peaks, and the
        observed profile itself.

        Args:
            input_bigWig (str): path to bigWig file
            peaks_df (pandas.DataFrame): peaks information with 'chrom',
                'start' and 'end' columns. Each peak should have the same
                width (equal to peak_width) i.e 'end' - 'start' is the
                same for all rows in the dataframe. Rows are processed
                positionally, so any index is accepted.
            peak_width (int): width of each peak.
            smoothing_params (sequence): two values, sigma and window_size,
                for 1D gaussian smoothing of profiles

        Returns:
            tuple: (numpy array of average profile, pandas dataframe
                with bounds values in columns). Peaks whose profile sums
                to zero are reported with 0.0 in every column, as are the
                'pearson_uniform' and 'spearman_uniform' columns.

        Raises:
            quietexception.QuietException: if a peak's 'end' - 'start'
                differs from peak_width.
    """

    # compute the average profile
    print("Computing average profile ...")
    avg_profile = get_average_profile(input_bigWig, peaks_df, peak_width)

    # get average profile as probabilities
    avg_profile_prob = avg_profile / np.sum(avg_profile)

    num_peaks = peaks_df.shape[0]

    # output columns: <metric>_<reference profile>
    column_names = ['mnll_uniform', 'mnll_average', 'mnll_self',
                    'ce_uniform', 'ce_average', 'ce_self',
                    'jsd_uniform', 'jsd_average', 'jsd_self',
                    'pearson_uniform', 'pearson_average', 'pearson_self',
                    'spearman_uniform', 'spearman_average', 'spearman_self']

    # one zero-initialised array per column; entries that are never
    # assigned (e.g. pearson/spearman against the uniform profile) stay 0
    values = {name: np.zeros(num_peaks) for name in column_names}

    # uniform distribution profile (identical for every peak)
    uniform_profile = np.ones(peak_width) * (1.0 / peak_width)

    # open the bigWig file for reading
    bw = pyBigWig.open(input_bigWig)

    print("Computing bounds ...")
    try:
        # iterate through all peaks; `idx` is the row POSITION, which is
        # what the result arrays are indexed by (the dataframe's own index
        # labels need not be 0..n-1)
        peak_rows = tqdm(peaks_df.iterrows(), desc='peak', total=num_peaks)
        for idx, (_, row) in enumerate(peak_rows):

            # raise exception if 'end' - 'start' is not equal to peak_width
            if (row['end'] - row['start']) != peak_width:
                raise quietexception.QuietException(
                    "Inconsistent peak width found at: {}:{}-{}".format(
                        row['chrom'], row['start'], row['end']))

            # get bigWig profile
            profile = np.nan_to_num(
                bw.values(row['chrom'], row['start'], row['end']))

            # if we find that the profile at this peak is all zeros
            if sum(profile) == 0:
                print("Found 'zero' profile at {}: ({}, {})".format(
                    row['chrom'], row['start'], row['end']))

                # assign nans to all
                for column in values.values():
                    column[idx] = np.nan

                continue

            # smoothed profile
            profile_smooth = gaussian1D_smoothing(profile,
                                                  smoothing_params[0],
                                                  smoothing_params[1])

            # smoothed profile as probabilities
            # NOTE: not used by any of the metrics below
            profile_smooth_prob = profile_smooth / np.sum(profile_smooth)

            # profile as probabilities
            profile_prob = profile / np.sum(profile)

            # mnll of profile with uniform, average and itself
            values['mnll_uniform'][idx] = mnll(profile,
                                               probs=uniform_profile)
            values['mnll_average'][idx] = mnll(profile,
                                               probs=avg_profile_prob)
            values['mnll_self'][idx] = mnll(profile, probs=profile_prob)

            # cross entropy of profile with uniform, average and itself
            values['ce_uniform'][idx] = profile_cross_entropy(
                profile, probs=uniform_profile)
            values['ce_average'][idx] = profile_cross_entropy(
                profile, probs=avg_profile_prob)
            values['ce_self'][idx] = profile_cross_entropy(
                profile, probs=profile_prob)

            # jsd of profile with uniform and average profile
            values['jsd_uniform'][idx] = jensenshannon(profile_prob,
                                                       uniform_profile)
            values['jsd_average'][idx] = jensenshannon(profile_prob,
                                                       avg_profile_prob)

            # jsd of profile with itself (upper bound)
            values['jsd_self'][idx] = 0.0

            # pearson of profile with uniform profile
            ### nothing to do ... leave it as zeros

            # pearson of profile with average profile and with itself
            values['pearson_average'][idx] = pearsonr(
                profile, avg_profile_prob)[0]
            values['pearson_self'][idx] = pearsonr(profile, profile)[0]

            # spearman of profile with uniform profile
            ### nothing to do ... leave it as zeros

            # spearman of profile with average profile and with itself
            values['spearman_average'][idx] = spearmanr(
                profile, avg_profile_prob)[0]
            values['spearman_self'][idx] = spearmanr(profile, profile)[0]
    finally:
        # release the bigWig handle even if a peak fails validation
        bw.close()

    # pandas dataframe holding the upper & lower bound and avg profile
    # performance values; nan_to_num turns the nans assigned to 'zero'
    # profiles into 0.0
    bounds_df = pd.DataFrame(
        {name: np.nan_to_num(values[name]) for name in column_names},
        columns=column_names)

    return avg_profile, bounds_df
100.11, 104.53, 106.46, 92.33, 101.0, 99.53, 116.2, 97.9, 102.54, 111.68, 85.02, 109.92, 99.53, 80.96, 71.91, 99.73, 92.6, 75.75, 98.29, 104.49, 112.13 ] dexterity1 = np.array(dexterity1) dexterity2 = np.array(dexterity2) kwargs = dict() for key in ['mse', 'corr', 'r2', 'betas']: data = apply_function(dataframes[0], keys=['nodes_1', 'nodes_2', 'y_attr', 'band'], attr=key, fx=lambda x: np.mean(x)) r = np.array([pearsonr(zscore(dexterity1), v) for v in data[key].values]) data['r'] = r[:, 0] data['p'] = r[:, 1] #kwargs['p'] = [pearsonr(dexterity1, v)[1] for v in data[key].values] grid = sns.FacetGrid(data, col="y_attr", row="band") grid.map(plot_matrix, "nodes_1", "nodes_2", 'r', 'p', **kwargs) def plot_matrix(nodes1, nodes2, acc, p, **kwargs): df = dict(n1=nodes1, n2=nodes2, a=acc) df = pd.DataFrame(df) pdf = df.pivot("n1", "n2", "a") nz = np.nonzero(np.isnan(pdf.values))
def custom(a, b):
    """Return the Pearson correlation coefficient of a and b, rounded to 4 decimals."""
    coefficient = stats.pearsonr(a, b)[0]
    return round(coefficient, 4)
def ptdt_analysis(PRS, PRS_iid, PRS_prs, structured):
    '''
    Takes PRS score file and structure matrix and returns pTDT summary
    statistics.

    Args:
        PRS: iterable of whitespace-delimited text lines of the PRS file.
        PRS_iid: position (int or numeric string) of the individual-ID
            column in each PRS line.
        PRS_prs: position (int or numeric string) of the PRS column.
        structured: iterable of rows ``[family ID, proband ID, father ID,
            mother ID]`` (plus a sibling ID when the module-level ``quad``
            flag is ``'True'``).

    Returns:
        list: ``[mean, standard error, p-value]`` of the proband pTDT
        deviation, followed by the same three values for siblings when
        ``quad`` is ``'True'``.

    Raises:
        RuntimeError: if no proband has a usable pTDT value.

    Reads the module-level ``quad``, ``table``, ``log`` and ``outname``;
    writes progress to stdout and ``log`` and, when ``table`` is
    ``'True'``, writes ``<outname>.ptdt.table``.
    '''
    # creating dict for each ID to call its PRS
    PRS_values = {}
    for line in PRS:
        fields = line.split()
        try:
            PRS_values[fields[int(PRS_iid)]] = float(fields[int(PRS_prs)])
        except (ValueError, IndexError):
            # header / non-numeric line, or blank / too-short line
            continue

    include_sibling = quad == 'True'

    print('Creating pTDT matrix.', end="")  # progress

    # number of entries read per family row: the ID plus 3 or 4 members
    n = 5 if include_sibling else 4

    data = []  # rows of the matched matrix
    for values in structured:
        row = [values[0]]
        for i in range(1, n):
            member = values[i]
            if member in PRS_values:
                row.append(str(PRS_values[member]))
            else:
                row.append('NA')  # missing PRS value
        data.append(row)
    print('.', end="")

    members = ['ID', 'Proband', 'Father', 'Mother']
    if include_sibling:
        members.append('Sibling')
    matrix = pd.DataFrame(data, columns=members)

    # convert dict values to numeric and force NaNs from missing
    IDs = matrix['ID']
    matrix = matrix.apply(pd.to_numeric, errors='coerce')
    matrix['ID'] = IDs  # rescue non numeric IDs
    print('.', end="")

    # create output matrix
    output = pd.DataFrame(index=range(0, matrix.shape[0]),
                          columns=["FID", "mp_PRS", "pro_PRS", "pro_pTDT"])
    output['FID'] = matrix['ID']
    output['mp_PRS'] = (matrix['Father'] + matrix['Mother'])/2
    output['pro_PRS'] = matrix['Proband']
    # deviations are expressed in units of the mid-parent PRS sample sd
    sd = np.std(output['mp_PRS'], ddof=1)
    output['pro_pTDT'] = (output['pro_PRS'] - output['mp_PRS'])/sd
    if include_sibling:
        output['sib_PRS'] = matrix['Sibling']
        output['sib_pTDT'] = (output['sib_PRS'] - output['mp_PRS'])/sd
    print(' done.')

    # Quality control
    output1 = output.dropna(subset=['pro_PRS', 'mp_PRS'])
    corr1 = ss.pearsonr(output1['pro_PRS'], output1['mp_PRS'])[0]
    if include_sibling:
        output2 = output.dropna(subset=['sib_PRS', 'mp_PRS'])
        corr2 = ss.pearsonr(output2['sib_PRS'], output2['mp_PRS'])[0]
    else:
        corr2 = 1  # no sibling data: only the proband check applies
    if corr1 >= .2 and corr2 >= .2:
        print('QC pass.')
        log.write('QC pass.\n')
    else:
        print('WARNING: QC fail - Low correlation between mid-parent PRS and proband/sibling PRS.')
        log.write('WARNING: QC fail - Low correlation between mid-parent PRS and proband/sibling PRS.\n')

    # t-test
    x = output['pro_pTDT']
    x_totlength = len(x)
    x = x[~np.isnan(x)]  # remove NaNs
    x_usedlength = len(x)
    print('{0} probands used in pTDT analysis ({1} skipped due to missingness).'
          .format(x_usedlength, x_totlength-x_usedlength))
    log.write('{0} probands used in pTDT analysis ({1} skipped due to missingness).\n'
              .format(x_usedlength, x_totlength-x_usedlength))
    if x_usedlength == 0:
        raise RuntimeError('Column order in PRS file is incorrect')
    pTDT_mean = np.mean(x)
    pTDT_std = np.std(x, ddof=1)/np.sqrt(len(x))  # standard error of mean
    pTDT_pvalue = ss.ttest_1samp(x, popmean=0).pvalue
    values = [pTDT_mean, pTDT_std, pTDT_pvalue]

    if include_sibling:
        y = output['sib_pTDT']
        y_totlength = len(y)
        y = y[~np.isnan(y)]
        y_usedlength = len(y)
        print('{0} siblings used in pTDT analysis ({1} skipped due to missingness).'
              .format(y_usedlength, y_totlength-y_usedlength))
        log.write('{0} siblings used in pTDT analysis ({1} skipped due to missingness).\n'
                  .format(y_usedlength, y_totlength-y_usedlength))
        unaffected_mean = np.mean(y)
        unaffected_std = np.std(y, ddof=1)/np.sqrt(len(y))
        unaffected_pvalue = ss.ttest_1samp(y, popmean=0).pvalue
        values2 = [unaffected_mean, unaffected_std, unaffected_pvalue]
        values = values + values2

    if table == 'True':
        output['dad_PRS'] = matrix['Father']
        output['mom_PRS'] = matrix['Mother']
        if include_sibling:
            cols = ['FID', 'pro_PRS', 'sib_PRS', 'dad_PRS', 'mom_PRS',
                    'mp_PRS', 'pro_pTDT', 'sib_pTDT']
        else:
            cols = ['FID', 'pro_PRS', 'dad_PRS', 'mom_PRS', 'mp_PRS',
                    'pro_pTDT']
        output = output[cols]
        output = output.round(4)
        headers = list(output.columns.values)
        # mode='w' creates/truncates the table in a single step, so no
        # separately opened (and never closed) file handle is needed
        output.to_csv(outname+'.ptdt.table', na_rep='NA', header=headers,
                      index=False, sep='\t', mode='w')

    return values
#nmb_son=100.*((np.nanmean(sites_ammonium_son))- np.nanmean(gc_data_ammonium_son))/np.nanmean(gc_data_ammonium_son)
#nmb_djf=100.*((np.nanmean(sites_ammonium_djf))- np.nanmean(gc_data_ammonium_djf))/np.nanmean(gc_data_ammonium_djf)


def _normalised_mean_bias(model, observed):
    """Return 100 * (nanmean(model) - nanmean(observed)) / nanmean(observed)."""
    observed_mean = np.nanmean(observed)
    return 100.*((np.nanmean(model)) - observed_mean)/observed_mean


# normalised mean bias (percent) of the model relative to the site data
nmb_Annual = _normalised_mean_bias(gc_data_ammonium_annual, sites_ammonium_AM)
nmb_mam = _normalised_mean_bias(gc_data_ammonium_mam, sites_ammonium_mam)
nmb_jja = _normalised_mean_bias(gc_data_ammonium_jja, sites_ammonium_jja)
nmb_son = _normalised_mean_bias(gc_data_ammonium_son, sites_ammonium_son)
nmb_djf = _normalised_mean_bias(gc_data_ammonium_djf, sites_ammonium_djf)
print(' DEFRA NMB_Annual= ', nmb_Annual)
print(' DEFRA NMB_mam = ', nmb_mam)
print(' DEFRA NMB_jja = ', nmb_jja)
print(' DEFRA NMB_son = ', nmb_son)
print(' DEFRA NMB_djf = ', nmb_djf)

#correlation
# dropping nan values and compute correlation
# The annual pair is masked like the seasonal ones: the NMB above already
# uses nanmean on it, and a single NaN would turn the coefficient into nan.
annual_model = np.asarray(gc_data_ammonium_annual)
annual_sites = np.asarray(sites_ammonium_AM)
nas_Annual = np.logical_or(np.isnan(annual_model), np.isnan(annual_sites))
correlate_Annual = stats.pearsonr(annual_model[~nas_Annual], annual_sites[~nas_Annual])

nas_mam = np.logical_or(np.isnan(gc_data_ammonium_mam), np.isnan(sites_ammonium_mam))
correlate_mam = stats.pearsonr(gc_data_ammonium_mam[~nas_mam], sites_ammonium_mam[~nas_mam])

nas_jja = np.logical_or(np.isnan(gc_data_ammonium_jja), np.isnan(sites_ammonium_jja))
correlate_jja = stats.pearsonr(gc_data_ammonium_jja[~nas_jja], sites_ammonium_jja[~nas_jja])

nas_son = np.logical_or(np.isnan(gc_data_ammonium_son), np.isnan(sites_ammonium_son))
correlate_son = stats.pearsonr(gc_data_ammonium_son[~nas_son], sites_ammonium_son[~nas_son])

nas_djf = np.logical_or(np.isnan(gc_data_ammonium_djf), np.isnan(sites_ammonium_djf))
correlate_djf = stats.pearsonr(gc_data_ammonium_djf[~nas_djf], sites_ammonium_djf[~nas_djf])

print('Correlation = ', correlate_Annual)
### the decision to stay or switch might be a lot more calculated than ### the decision to UP or DN. ### Including UP and DN separately in entropy might introduce variability ### that isn't reflective of cognitive thought, but noise fig = _plt.figure(figsize=(10, 10)) if1 = -1 for sfeat1 in ["entropyS", "entropyD", "entropyU"]: feat1 = eval(sfeat1) if1 += 1 if2 = -1 for sfeat2 in ["entropyW2", "entropyT2", "entropyL2"]: feat2 = eval(sfeat2) if2 += 1 fig.add_subplot(3, 3, if1*3 + if2 + 1) pc, pv = _ss.pearsonr(feat1, feat2) _plt.title("%(pc).2f %(pv).1e" % {"pc" : pc, "pv" : pv}) _plt.scatter(feat1, feat2, color="black", s=5) if if2 == 0: _plt.ylabel(sfeat1, fontsize=18) if if1 == 2: _plt.xlabel(sfeat2, fontsize=18) _plt.xticks(fontsize=13) _plt.yticks(fontsize=13) fig.subplots_adjust(wspace=0.25, hspace=0.25) _plt.savefig("corr_btwn_ent_comps_sim2") fig = _plt.figure(figsize=(10, 10)) if1 = -1 for sfeat1 in ["entropyS", "entropyD", "entropyU"]: feat1 = eval(sfeat1)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr  9 14:59:34 2020

@author: dustan
"""

import numpy as np
import scipy.stats as stats

# Sample size of the synthetic data set.
n = 10

# y is a noisy linear function of x: y = 2x + standard normal noise.
x = np.random.normal(size=n)
noise = np.random.normal(size=n)
y = 2 * x + noise

# Compute with scipy: Pearson coefficient and its p-value
result = stats.pearsonr(x, y)
cor, pval = result
from scipy.stats import pearsonr
from scipy.stats import linregress
from matplotlib import pyplot as plt
import numpy as np

# Paired observations: SAT score and GPA.
sat = np.array([595, 520, 715, 405, 680, 490, 565])
gpa = np.array([3.4, 3.2, 3.9, 2.3, 3.9, 2.5, 3.5])

# Single-axes figure holding the scatter plot of the raw data.
fig1 = plt.figure(1)
ax = plt.subplot(1, 1, 1)
pearson = pearsonr(sat, gpa)
plt.scatter(sat, gpa, label="data")

# Get linear regression parameters
regression = linregress(sat, gpa)
slope, intercept, r_value, p_value, std_err = regression

# Format the chart
chart_title = (
    "Scatter Plot with Linear Regression Fit\nY=a*X + b\na=%0.4f, b=%0.4f"
    % (slope, intercept))
plt.xlabel("SAT Scores")
plt.ylabel("GPA")
plt.title(chart_title)
plt.grid()

# Points of the fitted line, evaluated at the observed SAT scores:
# Y = slope*X + intercept
x_lr = sat
y_lr = slope * x_lr + intercept
def _outlier_indices(values):
    """Return the positions in *values* not strictly inside mean +/- 3 sd."""
    elements = np.array(values)
    mean = np.mean(elements)
    sd = np.std(elements)
    lower = mean - 3 * sd
    upper = mean + 3 * sd
    return [idx for idx, x in enumerate(values)
            if not (x > lower and x < upper)]


def _show_histogram(figure_number, values, value_range, title):
    """Show an 80-bin histogram of *values* and wait for the user to press Enter."""
    plt.figure(figure_number)
    plt.hist(values, bins=80, range=value_range, align='mid')
    plt.ylabel("Frequency")
    plt.title(title)
    plt.show()
    input("Press Enter to continue...")


def normalization_and_graphing():
    """Remove outliers from the sentiment scores, log-transform and plot them.

    Works entirely on module-level state. It reads ``vader_polarity_list``,
    ``textblob_polarity_list`` and ``textblob_subjectivity_list`` and

    * rebinds the three ``everything_included_list_*`` globals to
      ``np.log1p`` of the raw lists;
    * marks every position that lies outside mean +/- 3 standard deviations
      in ANY of the three lists as an outlier and appends, per position,
      either the raw values or a placeholder string to the
      ``no_outliers_list_*`` lists, the raw values of non-outliers to the
      ``no_errors_no_outliers_list_*`` lists and "outlier"/"not outlier" to
      ``everything_included_outlier_list``;
    * appends ``log1p`` of each kept value (or "outlier") to the
      ``logged_list_*`` lists;
    * shows nine histograms (original / outliers removed / normalized),
      pausing for Enter after each, and prints lengths, skews and the
      Pearson correlation of the logged textblob and vader polarities.

    Returns nothing.
    """
    ###probably useful data strucutures for this function:
    # vader_polarity_list
    # textblob_polarity_list
    # final_idx_list  #Not used within function, but could be useful
    ##################
    global everything_included_list_vader
    global everything_included_list_textblob_polarity
    global everything_included_list_textblob_subjectivity
    everything_included_list_vader = np.log1p(vader_polarity_list)
    everything_included_list_textblob_polarity = np.log1p(
        textblob_polarity_list)
    everything_included_list_textblob_subjectivity = np.log1p(
        textblob_subjectivity_list)

    skew_textblob = skew(textblob_polarity_list)
    skew_vader = skew(vader_polarity_list)
    skew_textblob_subjectivity = skew(textblob_subjectivity_list)

    print("length of final score list: ", len(vader_polarity_list))
    print("length of textblob final score list: ",
          len(textblob_polarity_list))
    print("length of textblob subjectivity final score list: ",
          len(textblob_subjectivity_list))
    print("skew vader: ", skew_vader)
    print("skew textblo_polarityb: ", skew_textblob)
    print("skew textblob_subjectivity: ", skew_textblob_subjectivity)

    _show_histogram(1, vader_polarity_list, [-1, 1],
                    'vader_polarity - original')
    _show_histogram(2, textblob_polarity_list, [-1, 1],
                    'textblob_polarity - original')
    _show_histogram(3, textblob_subjectivity_list, [0, 1],
                    'textblob_subjectivity - original')

    print('removing outliers vader')
    temp_id_1 = _outlier_indices(vader_polarity_list)

    print('removing outliers textblob_polarity')
    temp_id_2 = _outlier_indices(textblob_polarity_list)

    print('removing outliers textblob_subjectivity')
    temp_id_3 = _outlier_indices(textblob_subjectivity_list)

    # all outlier ids, as a set so the membership test below is O(1)
    outlier_ids = set(temp_id_1 + temp_id_2 + temp_id_3)

    for current_id in range(len(vader_polarity_list)):
        if current_id in outlier_ids:
            # placeholders keep the no_outliers lists aligned with the
            # original positions
            no_outliers_list_vader.append(
                "Outlier ERROR- YOU SHOULD NEVER SEE THIS")
            no_outliers_list_textblob_polarity.append(
                "Outlier ERROR- YOU SHOULD NEVER SEE THIS")
            no_outliers_list_textblob_subjectivity.append(
                "Outlier ERROR- YOU SHOULD NEVER SEE THIS")
            everything_included_outlier_list.append("outlier")
        else:
            vader_value = vader_polarity_list[current_id]
            polarity_value = textblob_polarity_list[current_id]
            subjectivity_value = textblob_subjectivity_list[current_id]
            no_outliers_list_vader.append(vader_value)
            no_outliers_list_textblob_polarity.append(polarity_value)
            no_outliers_list_textblob_subjectivity.append(subjectivity_value)
            no_errors_no_outliers_list_vader.append(vader_value)
            no_errors_no_outliers_list_textblob_polarity.append(
                polarity_value)
            no_errors_no_outliers_list_textblob_subjectivity.append(
                subjectivity_value)
            everything_included_outlier_list.append("not outlier")

    print("length of vader_polarity - outliers removed: ",
          len(no_errors_no_outliers_list_vader))
    print("length of textblob_polarity - outliers removed: ",
          len(no_errors_no_outliers_list_textblob_polarity))
    print("length of textblob_subjectivity - outliers removed: ",
          len(no_errors_no_outliers_list_textblob_subjectivity))
    print("outliers removed vader_polarity skew: ",
          skew(no_errors_no_outliers_list_vader))
    print("outliers removed textblob_polarity skew: ",
          skew(no_errors_no_outliers_list_textblob_polarity))
    print("outliers removed textblob_subjectivity skew: ",
          skew(no_errors_no_outliers_list_textblob_subjectivity))

    _show_histogram(4, no_errors_no_outliers_list_vader, [-1, 1],
                    'vader_polarity - outliers removed')
    _show_histogram(5, no_errors_no_outliers_list_textblob_polarity, [-1, 1],
                    'textblob_polarity - outliers removed')
    _show_histogram(6, no_errors_no_outliers_list_textblob_subjectivity,
                    [0, 1], 'textblob_subjectivity - outliers removed')

    print("MAKE SURE THESE ARE SAME VALUE: ")
    print(len(no_outliers_list_vader))
    print(len(no_outliers_list_textblob_polarity))
    print(len(no_outliers_list_textblob_subjectivity))
    print("and these: ")

    print("Finding log of both score lists...")
    no_errors_logged_list_vader = np.log1p(no_errors_no_outliers_list_vader)
    no_errors_logged_list_textblob_polarity = np.log1p(
        no_errors_no_outliers_list_textblob_polarity)
    no_errors_logged_list_textblob_subjectivity = np.log1p(
        no_errors_no_outliers_list_textblob_subjectivity)

    # position-aligned logged lists: "outlier" where a placeholder string
    # was stored, log1p of the value otherwise
    # NOTE(review): math.log1p raises ValueError for values <= -1, whereas
    # np.log1p above returns -inf/nan - a kept score of exactly -1 would
    # stop the run here.
    for i, entry in enumerate(no_outliers_list_vader):
        if isinstance(entry, str):
            logged_list_vader.append("outlier")
            logged_list_textblob_polarity.append("outlier")
            logged_list_textblob_subjectivity.append("outlier")
        else:
            logged_list_vader.append(math.log1p(no_outliers_list_vader[i]))
            logged_list_textblob_polarity.append(
                math.log1p(no_outliers_list_textblob_polarity[i]))
            logged_list_textblob_subjectivity.append(
                math.log1p(no_outliers_list_textblob_subjectivity[i]))

    _show_histogram(7, no_errors_logged_list_vader, [-1, 1],
                    'vader_polarity - normalized')
    _show_histogram(8, no_errors_logged_list_textblob_polarity, [-1, 1],
                    'textblob_polarity - normalized')
    _show_histogram(9, no_errors_logged_list_textblob_subjectivity, [0, 1],
                    'textblob_subjectivity - normalized')

    print(
        "Finding The Pearson Correlation Coefficient between vader_polarity scores and textblob_polarity scores..."
    )
    print(
        "Pearson Correlation Coefficient: ",
        pearsonr(no_errors_logged_list_textblob_polarity,
                 no_errors_logged_list_vader))
def _report_two_column_test(df, run_test, heading, stat_label, prompts,
                            verdicts, alpha=0.05):
    """Let the user pick two columns, run a test on them and show the verdict.

    Parameters
    ----------
    df : DataFrame whose columns are offered in the two sidebar pickers.
    run_test : callable invoked as ``run_test(df[first], df[second])``; it
        must return a ``(statistic, p_value)`` pair (the SciPy convention).
    heading : text of the sub-header shown above the result.
    stat_label : label written in front of the statistic.
    prompts : ``(first_prompt, second_prompt)`` labels of the two pickers.
    verdicts : ``(message_if_p_above_alpha, message_otherwise)``.
    alpha : significance threshold the p-value is compared with.
    """
    columns = df.columns.tolist()
    first = st.sidebar.selectbox(prompts[0], columns)
    second = st.sidebar.selectbox(prompts[1], columns)
    if st.sidebar.button("Show results"):
        st.subheader(heading)
        stat, p = run_test(df[first], df[second])
        st.write(stat_label, stat, "p-value: ", p)
        st.write(verdicts[0] if p > alpha else verdicts[1])


def _write_normality_result(stat_label, stat, p):
    """Write a normality-test statistic, its p-value and the Gaussian verdict."""
    st.write(stat_label, stat, "p-value: ", p)
    if p > 0.05:
        st.write('Probably Gaussian')
    else:
        st.write('Probably not Gaussian')


def main():
    """Render the Streamlit statistical-testing app.

    The user uploads a CSV/TXT file, chooses a family of tests in the sidebar
    (parametric, non-parametric, normality, correlation), selects the columns
    to test and presses "Show results" to see the statistic, the p-value and
    a verdict at the 5% level. Optional checkboxes show a correlation heatmap
    and basic summaries.
    """
    st.title("Statistical Testing Web App")
    st.sidebar.title("Statistical Testing Web App")
    st.markdown("What would you like to do today?")
    st.sidebar.markdown("Select what would you like to do today")

    data = st.file_uploader("Upload a Dataset", type=["csv", "txt"])
    if data is None:
        # Every control below reads the uploaded DataFrame; without this
        # guard the first reference to `df` would raise NameError.
        return
    df = pd.read_csv(data)
    st.dataframe(df.head())

    activities = ['Parametric Test', 'Non-Parametric Test', 'Normality Tests', 'Correlation']
    choice = st.sidebar.selectbox("Select a Statistical Testing", activities)

    long_prompts = ("Select the 1st variable you want to perform the test on",
                    "Select the 2nd variable you want to perform the test on")
    short_prompts = ("Select the 1st variable to perform the test on",
                     "Select the 2nd variable to perform the test on")
    keep = 'Do not reject the hypothesis'

    if choice == 'Parametric Test':
        # name -> (test callable, sub-header, statistic label, prompts, verdicts)
        parametric = {
            "student's t-test": (
                stats.ttest_ind, "student's t-test Results", "t-statistics: ",
                long_prompts, (keep, 'Reject the hypothesis')),
            "Pairwise t-test": (
                stats.ttest_rel, "Pairwise t-test Results", "t-statistics: ",
                long_prompts, (keep, 'Reject the Hypothesis')),
            "F-test": (
                stats.f_oneway, "F-test Results", "F-statistics: ",
                long_prompts, (keep, 'Reject the Hypothesis')),
            "chi-square": (
                lambda observed, expected: stats.chisquare(observed, expected, axis=0),
                "Chi Square test Results", "chi square-statistics: ",
                ("Select the observed frequency", "Select the expected frequency"),
                (keep, 'Reject the Hypothesis')),
        }
        tests = st.sidebar.selectbox("Select the tests you want to conduct", list(parametric))
        _report_two_column_test(df, *parametric[tests])

    elif choice == 'Non-Parametric Test':
        non_parametric = {
            "Mann-Whitney": (
                stats.mannwhitneyu, "Mann Whitney test Results", "Mann Whitney statistics: ",
                short_prompts, (keep, 'Reject the hypothesis')),
            "Wilcoxon": (
                stats.wilcoxon, "Wilcoxon test Results", "Wilcoxon statistics: ",
                short_prompts, (keep, 'Reject the hypothesis')),
            "Kruskal-Wallis": (
                stats.kruskal, "Kruskal-Wallis test Results", "Kruskal-Wallis statistics: ",
                short_prompts, (keep, 'Reject the hypothesis')),
        }
        Non_parametric = ['Mann-Whitney', 'Wilcoxon', 'Kruskal-Wallis', 'Friedman']
        tests = st.sidebar.selectbox("Select the tests you want to conduct", Non_parametric)
        if tests in non_parametric:
            _report_two_column_test(df, *non_parametric[tests])
        elif tests == "Friedman":
            # Friedman needs three related samples, so it is handled separately.
            data1 = st.sidebar.selectbox("Select the 1st variable to perform the test on", df.columns.tolist())
            data2 = st.sidebar.selectbox("Select the 2nd variable to perform the test on", df.columns.tolist())
            data3 = st.sidebar.selectbox("Select the 3rd variable to perform the test on", df.columns.tolist())
            if st.sidebar.button("Show results"):
                st.subheader("Friedman Results")
                stat, p = stats.friedmanchisquare(df[data1], df[data2], df[data3])
                st.write("Friedman statistics: ", stat, "p-value: ", p)
                if p > 0.05:
                    st.write('Do not reject the hypothesis')
                else:
                    st.write('Reject the hypothesis')

    elif choice == 'Normality Tests':
        Normal = ["Shapiro–Wilk", "Anderson–Darling", "Kolmogorov–Smirnov", "Normal-Test"]
        tests = st.sidebar.selectbox("Select the test you want to conduct", Normal)

        if tests == "Shapiro–Wilk":
            user_input = str(st.text_input('Do you need specific columns? Y or N'))
            if user_input == "Y":
                selected_columns = st.sidebar.multiselect(
                    "Select the variables you want to perform the test on", df.columns.tolist())
                new_df = df[selected_columns]
                if st.sidebar.button("Show results"):
                    st.subheader("Shapiro Wilk test Results")
                    stat, p = stats.shapiro(new_df)
                    _write_normality_result("Shapiro Wilk statistics: ", stat, p)
            else:
                if st.sidebar.button("Show results"):
                    stat, p = stats.shapiro(df)
                    _write_normality_result("Shapiro Wilk statistics: ", stat, p)

        if tests == "Anderson–Darling":
            selected_columns = st.sidebar.multiselect(
                "Select the variables you want to perform the test on", df.columns.tolist())
            new_df = df[selected_columns]
            if st.sidebar.button("Show results"):
                st.subheader("Anderson–Darling test Results")
                # NOTE(review): the whole selection is passed as-is; confirm
                # the intended behaviour when several columns are selected.
                result = stats.anderson(new_df, dist='norm')
                # The result attribute is `statistic`; `statistics` raised
                # AttributeError.
                st.write("Anderson–Darling statistics: ", result.statistic)
                for sl, cv in zip(result.significance_level, result.critical_values):
                    if result.statistic < cv:
                        st.write('Probably Gaussian at the %.1f%% level' % (sl))
                    else:
                        st.write('Probably not Gaussian at the %.1f%% level' % (sl))

        if tests == "Kolmogorov–Smirnov":
            # ks_2samp returns (statistic, pvalue); the helper unpacks in that order.
            _report_two_column_test(
                df, stats.ks_2samp, "Kolmogorov–Smirnov test Results",
                "Kolmogorov–Smirnov statistics: ", short_prompts,
                ('Probably the same distribution', 'Probably different distributions'))

        if tests == "Normal-Test":
            user_input = st.text_input('Do you need specific columns?Y or N')
            if user_input == "Y":
                selected_columns = st.sidebar.multiselect(
                    "Select the variables you want to perform the test on", df.columns.tolist())
                new_df = df[selected_columns]
                if st.sidebar.button("Show results"):
                    st.subheader("Normal-Test test Results")
                    # normaltest returns (statistic, pvalue), in that order.
                    stat, p = stats.normaltest(new_df)
                    _write_normality_result("Normal-Test statistics: ", stat, p)
            else:
                # NOTE(review): this path runs without the "Show results"
                # button and on the whole frame; with several columns `p` is
                # an array - confirm the intended behaviour.
                stat, p = stats.normaltest(df)
                _write_normality_result("Normal-Test statistics: ", stat, p)

    elif choice == 'Correlation':
        # The null hypothesis of each test is "no association", so a large
        # p-value means the samples are probably independent.
        association = ('Probably independent', 'Probably dependent')
        correlation = {
            "Spearman's Rank": (
                stats.spearmanr, " Spearman's Rank Results", "Spearman's Rank statistics: ",
                short_prompts, association),
            "Pearson": (
                stats.pearsonr, "Pearson Results", "Pearson statistics: ",
                short_prompts, association),
            "Kendall": (
                stats.kendalltau, "Kendall Results", "Kendall statistics: ",
                short_prompts, association),
        }
        tests = st.sidebar.selectbox("Select the tests you want to conduct", list(correlation))
        _report_two_column_test(df, *correlation[tests])

    if st.sidebar.checkbox('Show Correlation plot'):
        st.write('To see the correlation plot of all variables')
        st.write(sns.heatmap(df.corr(), annot=True))
        st.pyplot(height=800)

    if st.sidebar.checkbox('EDA'):
        if st.sidebar.checkbox('Show data summary'):
            st.write(df.describe())
        if st.sidebar.checkbox('Show Value Counts'):
            st.write(df.iloc[:, -1].value_counts())
# Finish styling the first panel: fixed y-range, a dashed zero line and
# Active / Passive tick labels on the behavior-state axis.
ax[0].set_ylim((-0.02, 0.12))
ax[0].axhline(0, linestyle='--', color='k')
ax[0].set_xticks([0, 1])
ax[0].set_xticklabels(['Active', 'Passive'], rotation=45)
ax[0].set_xlabel('Behavior State')
ax[0].set_ylabel(r'Noise Correlation ($r_{sc}$)')

# correlation with overall behavior
# Average `res` within each (snr, f, site) group before regressing the
# 'diff' column on the behavior metric named by `di_metric`.
resg = res.groupby(by=['snr', 'f', 'site']).mean()
#ax[1].scatter(resg[di_metric], resg['diff'], s=50, edgecolor='white', color='tab:orange')
sns.regplot(x=di_metric, y='diff', data=resg, ax=ax[1], color='tab:orange')
ax[1].set_xlabel('Behavior performance (DI)')
ax[1].set_ylabel(r"$\Delta r_{sc}$" + "\n(Active - Passive)")
# Dashed reference lines at diff == 0 and DI == 0.5.
ax[1].axhline(0, linestyle='--', color='k')
ax[1].axvline(0.5, linestyle='--', color='k')
# Report the Pearson r and p-value of the same relationship in the title.
r, p = ss.pearsonr(resg[di_metric], resg['diff'])
ax[1].set_title(r"$r$: %s, $p$: %s" % (round(r, 3), round(p, 3)))
f.tight_layout()
f.savefig(DIR + 'pyfigures/rsc_behavior.svg')

# ===================================== FIGURE 2 ====================================
# break down correlation vs. behavior into different time windows
# tbins1 / tbins2 are two sets of time-bin labels paired element-wise with
# `titles`; NOTE(review): presumably one labelling per batch - confirm.
tbins1 = ['0.25_0.35', '0.35_0.45', '0.45_0.55', '0.55_0.65']
tbins2 = ['0_0.1', '0.1_0.2', '0.2_0.3', '0.3_0.4']
titles = ['-0.1 - 0.0 sec', '0.0 - 0.1 sec', '0.1 - 0.2 sec', '0.2 - 0.3 sec']
f, ax = plt.subplots(1, 4, figsize=(12, 3), sharey=True)
for t1, t2, title, a in zip(tbins1, tbins2, titles, ax.flatten()):
    # Rows of batch 307 in time bin t1 where either `pa` or `pp` is below `alpha`.
    m1 = (df.batch == 307) & (df.tbin == t1) & ((df.pa < alpha) | (df.pp < alpha))
def eval(gold, predectvalues):
    """Print and return the Pearson correlation between two sequences.

    Parameters
    ----------
    gold : sequence of reference values.
    predectvalues : sequence of predicted values, same length as ``gold``.

    Returns
    -------
    The Pearson correlation coefficient (first element of the
    ``pearsonr`` result).

    Note: this function shadows the built-in ``eval`` wherever it is
    imported; the name is kept so that existing callers keep working.
    """
    pr = pearsonr(gold, predectvalues)[0]
    # A single parenthesised argument prints identically on Python 2 and 3;
    # the former `print ...` statement is a SyntaxError on Python 3.
    print('Test Pearson: ' + str(pr))
    return pr
def check_correlation(x, y):
    """Print the Pearson correlation coefficient of ``x`` and ``y`` with its p-value.

    Nothing is returned; the two numbers are only written to standard output.
    """
    outcome = stats.pearsonr(x, y)
    coefficient = outcome[0]
    significance = outcome[1]
    print("Pearson Correlation Coefficient: ", coefficient,
          "and a P-value of:", significance)
np.mean(accuracies_twin.item()['n' + str(i) + '_DZ'])) results['DZ_acc_std'].append( np.std(accuracies_twin.item()['n' + str(i) + '_DZ'])) results['Title'].append( eval('labels_dict_' + parcellation[j])['n' + str(i)][0]) results['Falconers formula'].append( falconers_h2.item()[parcellation[j]][i]) results['ACE'].append(eval('ACE_h2_' + parcellation[j])[i]) # Excel file # pd.DataFrame.from_dict(results).to_excel('./../outputs/identification_results_' + parcellation[j] + '.xlsx') df = pd.DataFrame(results) df = df.sort_values(by=['SI_acc_mean'], ascending=False) print( stat.pearsonr(results['MZ_acc_mean'], eval('ACE_h2_' + parcellation[j]))) # print(stat.pearsonr(results['MZ_acc_mean'], # falconers_h2.item()[parcellation[j]])) # print(stat.pearsonr(falconers_h2.item()[parcellation[j]], # eval('nodes_'+parcellation[j]))) labels = df['Title'] # Figure if nets[j] == 9: fig, ax = plt.subplots(figsize=(10, 5)) else: fig, ax = plt.subplots(figsize=(14, 5)) plt.xlabel('Functional networks, n of nodes', fontsize=20) plt.ylabel('Identification accuracy', fontsize=20) bar_width = 0.25 plt.xticks(range(nets[j]), labels, rotation=45, fontsize=15)
# Stop training once the validation loss has failed to improve for 3 epochs.
es = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
# Write the model to disk only when it is the best seen so far.
cp = keras.callbacks.ModelCheckpoint(filepath="/Dedicated/jmichaelson-wdata/rotating_students/bhoskins/RetinaDL/ModelAge.h5", verbose=1, save_best_only=True)
# ind[0] / ind[1] / ind[2] index the train / validation / test rows.
model.fit(images[ind[0]], y.iloc[ind[0]], batch_size=16, callbacks=[es, cp], epochs=30,
          validation_data=[images[ind[1]], y.iloc[ind[1]]])
#model = keras.models.load_model("/Dedicated/jmichaelson-wdata/rotating_students/bhoskins/RetinaDL/Model2.h5")
pred = model.predict(images[ind[2]])
true = y.iloc[ind[2]]

#Age stats
# NOTE(review): both results are discarded unless this runs in an
# interactive session that echoes expression values.
pearsonr(true, pred[:,0])
statistics.mean(abs(true-pred[:,0]))

#plot true vs predicted cbornreg plot
sns.regplot(x=true, y=pred[:,0])
# Save before showing: once plt.show() returns the figure is usually closed,
# so a later savefig would write an empty image.
plt.savefig("/Dedicated/jmichaelson-wdata/rotating_students/bhoskins/RetinaDL/data_prep/" + "regplot.png")
plt.show()

from scipy.stats import ttest_ind

#Depression stats
dep = diagnosis.Depression == 1
depDiagnosed = diagnosis[dep]
depAge = depDiagnosed["age"]  #use as true
pred2 = model.predict(images[dep])
# Coordinates and names of the districts that should get a text label.
districts_to_label_xy_df = districts_private_and_income.loc[
    districts_to_label_mask,
    ["district_eng", "private_or_public", "yearly_average_household_income"]]

for idx, row in districts_to_label_xy_df.iterrows():
    # Shift the label slightly to the right of its marker.
    x = row["yearly_average_household_income"] + 2
    y = row["private_or_public"]
    #To align it properly
    # The label is passed positionally: the `s=` keyword was deprecated in
    # Matplotlib 3.3 (renamed to `text`) and later removed, while the
    # positional form works on every version.
    ax_1.annotate(row["district_eng"],
                  xy=(x, y),
                  horizontalalignment='left',
                  verticalalignment="center")

#Annotate pearson's r
pearson_r = st.pearsonr(
    districts_private_and_income.loc[:, "private_or_public"],
    districts_private_and_income.loc[:, "yearly_average_household_income"])[0]
# xy is given in axes-fraction coordinates, i.e. near the top-right corner.
ax_1.annotate("r = {:.2f}".format(pearson_r),
              xy=(.9, .9),
              xycoords=ax_1.transAxes,
              color="black",
              weight="bold",
              fontsize=15)

# --- Scatterplots Bottom : Income - Num. of Inst for all sub-categories
institutions = [
    "Hospital", "Dental Health Center", "Dialysis Center",
    "Physical Therapy Center", "Gynecology and Obstetrics Clinic",
    "Medical Center", "Polyclinic", "Planned Parenthood Center"
]
# print('Network parameters for anger: ', model_anger.summary()) # print('Network parameters for fear: ', model_fear.summary()) # print('Network parameters for joy: ', model_joy.summary()) # print('Network parameters for sadness: ', model_sadness.summary()) y_pred_anger = model_anger.predict(x_test_anger) y_pred_fear = model_fear.predict(x_test_fear) y_pred_joy = model_joy.predict(x_test_joy) y_pred_sadness = model_sadness.predict(x_test_sadness) # In[ ]: pearson_correlation_score_anger = pearsonr( y_pred_anger.reshape((y_pred_anger.shape[0], )), y_test_anger)[0] print('Pearson Correlation for LE_PC_DMTL model on Test set for anger') print(pearson_correlation_score_anger) # In[ ]: pearson_correlation_score_fear = pearsonr( y_pred_fear.reshape((y_pred_fear.shape[0], )), y_test_fear)[0] print('Pearson Correlation for LE_PC_DMTL model on Test set for fear') print(pearson_correlation_score_fear) # In[ ]: pearson_correlation_score_joy = pearsonr(
x2 = (data2-data1)/2 return x1,x2 #parameters a_bound=5 M=400 N=1000 results = None for rho in np.arange(0.9,1.0,0.1): for i in range(M): delta_t = 0.1 coupling = 2*np.abs(rho)/(1-np.abs(rho))*np.sign(rho) x1,x2 = correlated_ts(coupling,N=N) prho = pearsonr(x1,x2)[0] print("OU cross correlation", OUcross(x1,x2,delta_t)) print("pearson: ",prho) para = calc_fundstats(x1+x2) + calc_fundstats(x1-x2) +(delta_t,N) guessa1 = (x1+x2).std()**2 guessa2 = (x1-x2).std()**2 guessd = 0.5 c_guess = (guessa1-guessa2)/guessa2 print(guessa1,guessa2,guessd,c_guess/(2+c_guess)) result = root(phi_deriv, [guessa1,guessa2,guessd],args=para) a1 = result.x[0] a2 = result.x[1] d = result.x[2]
def pr2_spatial(tslsreg):
    """
    Calculates the pseudo r^2 for the spatial two stage least squares
    regression.

    Parameters
    ----------
    tslsreg             : spatial two stage least squares regression object
                          output instance from a spatial two stage least
                          squares regression model; its ``y`` and ``predy_e``
                          array attributes are read

    Returns
    -------
    pr2_result          : float
                          value of the squared pearson correlation between
                          the y and stsls-predicted y vectors

    Examples
    --------

    We first need to import the needed modules. Numpy is needed to convert the
    data we read into arrays that ``spreg`` understands and ``pysal`` to
    perform all the analysis. The GM_Lag is required to run the model on
    which we will perform the tests and the ``pysal.spreg.diagnostics`` module
    contains the function with the test.

    >>> import numpy as np
    >>> import pysal
    >>> import pysal.spreg.diagnostics as D
    >>> from twosls_sp import GM_Lag

    Open data on Columbus neighborhood crime (49 areas) using pysal.open().
    This is the DBF associated with the Columbus shapefile. Note that
    pysal.open() also reads data in CSV format; since the actual class
    requires data to be passed in as numpy arrays, the user can read their
    data in using any method.

    >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),'r')

    Extract the HOVAL column (home value) from the DBF file and make it the
    dependent variable for the regression. Note that PySAL requires this to be
    an numpy array of shape (n, 1) as opposed to the also common shape of (n, )
    that other packages accept.

    >>> y = np.array(db.by_col("HOVAL"))
    >>> y = np.reshape(y, (49,1))

    Extract INC (income) vectors from the DBF to be used as
    independent variables in the regression. Note that PySAL requires this to
    be an nxj numpy array, where j is the number of independent variables (not
    including a constant). By default this model adds a vector of ones to the
    independent variables passed in, but this can be overridden by passing
    constant=False.

    >>> X = np.array(db.by_col("INC"))
    >>> X = np.reshape(X, (49,1))

    In this case, we consider CRIME (crime rates) as an endogenous regressor,
    so we acknowledge that by reading it in a different category.

    >>> yd = np.array(db.by_col("CRIME"))
    >>> yd = np.reshape(yd, (49,1))

    In order to properly account for the endogeneity, we have to pass in the
    instruments. Let us consider DISCBD (distance to the CBD) is a good one:

    >>> q = np.array(db.by_col("DISCBD"))
    >>> q = np.reshape(q, (49,1))

    Since this test has a spatial component, we need to specify the spatial
    weights matrix that includes the spatial configuration of the observations
    into the error component of the model. To do that, we can open an already
    existing gal file or create a new one. In this case, we will create one
    from ``columbus.shp``.

    >>> w = pysal.rook_from_shapefile(pysal.examples.get_path("columbus.shp"))

    Unless there is a good reason not to do it, the weights have to be
    row-standardized so every row of the matrix sums to one. Among other
    things, this allows to interpret the spatial lag of a variable as the
    average value of the neighboring observations. In PySAL, this can be
    easily performed in the following way:

    >>> w.transform = 'r'

    Now we are good to run the spatial lag model. Make sure you pass all the
    parameters correctly and, if desired, pass the names of the variables as
    well so when you print the summary (reg.summary) they are included:

    >>> reg = GM_Lag(y, X, w=w, yend=yd, q=q, w_lags=2, name_x=['inc'], name_y='hoval', name_yend=['crime'], name_q=['discbd'], name_ds='columbus')

    Once we have a regression object, we can perform the spatial version of
    the pseudo R^2. It is as simple as one line!

    >>> result = pr2_spatial(reg)
    >>> print("%1.6f"%result)
    0.299649

    """
    # The regression stores (n, 1) column vectors; pearsonr is defined for
    # 1-D samples, so flatten both before correlating.
    y = tslsreg.y.flatten()
    predy_e = tslsreg.predy_e.flatten()
    pr = pearsonr(y, predy_e)[0]
    pr2_result = float(pr**2)
    return pr2_result
def __init__(self, expect, got, name=None, data=None, P_LARGER=0.9,
             regression=True, ax=None, alphabet=None, expect_label=None,
             got_label=None, verbose=1):
    """Compare vectors.

    Runs a battery of agreement checks between `expect` and `got` and stores
    each one in `self.tests` as a `[name, value, passed]` entry.

    Arguments:

     - Specifying data for comparison two methods:

       1) `expect`, `got`: two numeric one-dimensional arrays which we'd like
          to compare (the argument names come for software testing). This
          method requires argument `data=None`. Two dicts are also accepted;
          they must have the same key set and are aligned key by key.

       2) `data`: instance of `DataFrame`, expects arguments `expect` and
          `got` to be column labels.

     - `name`: name of this comparison.

     - `P_LARGER`: fraction of positions at which one vector must exceed the
       other before an 'expect is larger' / 'got is larger' entry is added.

     - `regression`: when truthy and there are at least two elements,
       `self.regression()` is called.

     - `ax`: stored on the instance as `self.ax`.

     - `alphabet`: key order used for dict inputs; when not None,
       `self.show_largest_rel_errors()` is called at the end.

     - `expect_label`, `got_label`: labels for the two vectors; default to
       the column names (with `data`) or to 'expect' / 'got'.

     - `verbose`: when truthy, `self.message()` is called.

    Note:

     - when plotting `expect` is the y-axis, `got` is the x-axis. This is by
       convention that `expect` is the dependent variable (regression target).

    TODO:

     - Add an option to drop NaNs and continue comparison.

     - Indicate which dimensions have the largest errors.

    """
    # Dict inputs: align both on a common key order and turn them into lists.
    if isinstance(expect, dict) and isinstance(got, dict):
        _alphabet = expect.keys() if alphabet is None else alphabet
        assert set(got.keys()) == set(_alphabet)
        expect = [expect[k] for k in _alphabet]
        got = [got[k] for k in _alphabet]

    if data is not None:
        # NOTE(review): `basestring` is a Python 2 builtin; on Python 3 it
        # must be defined elsewhere in the module - confirm.
        assert isinstance(expect, (int, basestring)), \
            'expected a column name got %s' % type(expect)
        assert isinstance(got, (int, basestring)), \
            'expected a column name got %s' % type(got)
        if expect_label is None: expect_label = expect
        if got_label is None: got_label = got
        expect = data[expect]
        got = data[got]
    else:
        if expect_label is None: expect_label = 'expect'
        if got_label is None: got_label = 'got'
        expect = np.asarray(expect)
        got = np.asarray(got)
        data = pd.DataFrame({expect_label: expect, got_label: got})

    assert expect.shape == got.shape
    # Single-element unpacking: raises unless the inputs are one-dimensional.
    [n] = expect.shape

    self.expect = expect
    self.got = got
    self.alphabet = alphabet
    self.ax = ax
    self.name = name
    self.got_label = got_label
    self.expect_label = expect_label
    self.n = n
    self.coeff = None
    self.tests = tests = []

    # Check that vectors are finite.
    if not np.isfinite(expect).all():
        tests.append([
            'expect finite',
            progress(np.isfinite(expect).sum(), n), False
        ])
    if not np.isfinite(got).all():
        tests.append(
            ['got finite',
             progress(np.isfinite(got).sum(), n), False])

    # Norms agree when their relative difference is below 1%; a zero
    # `expect` norm is treated as a pass to avoid dividing by zero.
    ne = norm(expect)
    ng = norm(got)
    ok = abs(ne - ng) / ne < 0.01 if ne != 0 else True

    if n > 1:
        tests.append(['norms', '[%g, %g]' % (ne, ng), ok])

    F = zero_retrieval(expect, got)
    tests.append(['zero F1', F, F > 0.99])

    if n > 1:
        c = cosine(expect, got)
        self.cosine = c
        tests.append(['cosine-sim', c,
                      (c > 0.99999)
                      ])  # cosine similarities must be really high.

        # Two all-zero vectors count as perfectly correlated.
        self.pearsonr = 1.0 if ne == ng == 0 else pearsonr(expect, got)[0]
        tests.append(['pearson', self.pearsonr, (self.pearsonr > 0.99999)])

        p = spearmanr(expect, got)[0]
        tests.append(['spearman', p, (p > 0.99999)])

    # TODO: this check should probably take into account the scale of the data.
    d = linf(expect, got)
    self.max_err = d
    tests.append(['Linf', d, d < 1e-8])

    # same sign check (weak agreement, but useful sanity check -- especially
    # for gradients)
    x = expect
    y = got
    # 1 where both entries fall on the same side of the `>= 0` test, else 0.
    s = np.asarray(~((x >= 0) ^ (y >= 0)), dtype=int)
    p = s.sum() * 100.0 / len(s)
    tests.append(
        ['same-sign',
         '%s%% (%s/%s)' % (p, s.sum(), len(s)), p == 100.0])

    # relative error
    r = relative_difference(expect, got)
    # Average only over the finite entries.
    r = np.mean(r[np.isfinite(r)])
    tests.append(['mean relative error', r, r <= 0.01])
    self.mean_relative_error = r

    # TODO: suggest that if relative error is high and rescaled error is low (or
    # something to do with regression residuals) that maybe there is a
    # (hopefully) simple fix via scale/offset.

    # TODO: can provide descriptive statistics for each vector
    #tests.append(['range (expect)', [expect.min(), expect.max()], 2])
    #tests.append(['range (got)   ', [got.min(), got.max()], 2])

    # regression and rescaled error only valid for n >= 2
    if n >= 2:
        es = abs(expect).max()
        gs = abs(got).max()
        if es == 0: es = 1
        if gs == 0: gs = 1

        # Disabled on purpose via the constant-false condition.
        if 0:
            # rescaled error
            E = expect / es
            G = got / gs
            R = abs(E - G)
            r = np.mean(R)
            tests.append(['mean rescaled error', r, r <= 1e-5])

        if regression:
            self.regression()

    if n >= 2:
        # These tests check if one of the datasets is consistently larger than the
        # other. The threshold for error is based on `P_LARGER` ("percent larger").
        L = ((expect - got) > 0).sum()
        if L >= P_LARGER * n:
            tests.append(['expect is larger', progress(L, n), 0])
        L = ((got - expect) > 0).sum()
        if L >= P_LARGER * n:
            tests.append(['got is larger', progress(L, n), 0])

    self.tests = tests
    if verbose:
        self.message()

    if alphabet is not None:
        self.show_largest_rel_errors()
def pr2_aspatial(tslsreg):
    """
    Calculates the pseudo r^2 for the two stage least squares regression.

    Parameters
    ----------
    tslsreg             : two stage least squares regression object
                          output instance from a two stage least squares
                          regression model; its ``y`` and ``predy`` array
                          attributes are read

    Returns
    -------
    pr2_result          : float
                          value of the squared pearson correlation between
                          the y and tsls-predicted y vectors

    Examples
    --------

    We first need to import the needed modules. Numpy is needed to convert the
    data we read into arrays that ``spreg`` understands and ``pysal`` to
    perform all the analysis. The TSLS is required to run the model on
    which we will perform the tests.

    >>> import numpy as np
    >>> import pysal
    >>> from twosls import TSLS

    Open data on Columbus neighborhood crime (49 areas) using pysal.open().
    This is the DBF associated with the Columbus shapefile. Note that
    pysal.open() also reads data in CSV format; since the actual class
    requires data to be passed in as numpy arrays, the user can read their
    data in using any method.

    >>> db = pysal.open(pysal.examples.get_path("columbus.dbf"),'r')

    Before being able to apply the diagnostics, we have to run a model and,
    for that, we need the input variables. Extract the CRIME column (crime
    rates) from the DBF file and make it the dependent variable for the
    regression. Note that PySAL requires this to be an numpy array of shape
    (n, 1) as opposed to the also common shape of (n, ) that other packages
    accept.

    >>> y = np.array(db.by_col("CRIME"))
    >>> y = np.reshape(y, (49,1))

    Extract INC (income) vector from the DBF to be used as
    independent variables in the regression. Note that PySAL requires this to
    be an nxj numpy array, where j is the number of independent variables (not
    including a constant). By default this model adds a vector of ones to the
    independent variables passed in, but this can be overridden by passing
    constant=False.

    >>> X = []
    >>> X.append(db.by_col("INC"))
    >>> X = np.array(X).T

    In this case, we consider HOVAL (home value) as an endogenous regressor,
    so we acknowledge that by reading it in a different category.

    >>> yd = []
    >>> yd.append(db.by_col("HOVAL"))
    >>> yd = np.array(yd).T

    In order to properly account for the endogeneity, we have to pass in the
    instruments. Let us consider DISCBD (distance to the CBD) is a good one:

    >>> q = []
    >>> q.append(db.by_col("DISCBD"))
    >>> q = np.array(q).T

    Now we are good to run the model. It is an easy one line task.

    >>> reg = TSLS(y, X, yd, q=q)

    In order to perform the pseudo R^2, we pass the regression object to the
    function and we are done!

    >>> result = pr2_aspatial(reg)
    >>> print("%1.6f"%result)
    0.279361

    """
    # The regression stores (n, 1) column vectors; pearsonr is defined for
    # 1-D samples, so flatten both before correlating.
    y = tslsreg.y.flatten()
    predy = tslsreg.predy.flatten()
    pr = pearsonr(y, predy)[0]
    pr2_result = float(pr**2)
    return pr2_result
def display_selected_data(selectedArea, choiceNB):
    """Build the scatter-plot matrix of trees, land price and property density.

    ``choiceNB`` selects the granularity: ``'boroughs'`` uses the borough
    table, anything else the neighborhood table. When ``selectedArea`` is not
    None, only the areas named in its ``"points"`` entries are kept. Each
    off-diagonal panel is annotated with the Pearson coefficient and p-value
    of its pair of attributes, provided at least two rows remain.

    Returns the assembled ``go.Figure``.
    """
    if choiceNB == 'boroughs':
        key, frame, title_part = 'borough', df_trees_properties_boro, ' boroughs'
    else:
        key, frame, title_part = 'ntaname', df_trees_properties, ' neighborhoods'

    font_ann = dict(size=10, color=colors['text'])

    # Restrict to the user's selection; the area name is the part of the
    # hover text that precedes the first "<br".
    if selectedArea is not None:
        wanted = [str(pt["text"].split("<br")[0]) for pt in selectedArea["points"]]
        frame = frame[frame[key].isin(wanted)]

    # Integer code per borough, used to colour the markers.
    borough_codes = frame['borough'].astype('category').cat.codes

    # find pearson coeff and p_value for each pair of attributes
    attribute_pairs = [['trees/sq.mile', 'avg.landprice_thous$/acre'],
                       ['trees/sq.mile', 'properties/sq.mile'],
                       ['avg.landprice_thous$/acre', 'properties/sq.mile']]
    stats_per_pair = []
    enough_rows = True
    for left, right in attribute_pairs:
        if len(frame[left]) < 2 or len(frame[right]) < 2:
            enough_rows = False
            continue
        stats_per_pair.append(pearsonr(frame[left], frame[right]))

    # (x, y, xref, yref, index into stats_per_pair) for every annotated panel:
    # each pair is labelled in both of its mirrored panels.
    placements = (
        (5000, 6000, "x2", "y1", 0),
        (6000, 5000, "x1", "y2", 0),
        (14000, 6000, "x3", "y1", 1),
        (6000, 14000, "x1", "y3", 1),
        (14000, 6000, "x3", "y2", 2),
        (6000, 14000, "x2", "y3", 2),
    )
    ann = []
    if enough_rows:
        for pos_x, pos_y, x_axis, y_axis, which in placements:
            coefficient, p_value = stats_per_pair[which][0], stats_per_pair[which][1]
            label = "PCC: " + str(round(coefficient, 2)) + "<br>p: " + ('{:0.1e}'.format(p_value))
            ann.append(dict(
                x=pos_x,
                y=pos_y,
                xref=x_axis,
                yref=y_axis,
                font=font_ann,
                text=label,
                showarrow=False,
            ))

    axis_style = dict(showline=True, zeroline=False, gridcolor='#104752', showticklabels=True)

    # here we build a scatter matrix, and add annotations for each subgraph
    layout = go.Layout(
        dragmode='select',
        margin=dict(l=0, r=0, b=0, t=0, pad=0),
        autosize=False,
        hovermode='closest',
        font=dict(color=colors['text'], size=12),
        plot_bgcolor=colors['background'],
        paper_bgcolor=colors['background'],
        xaxis1=dict(axis_style),
        xaxis2=dict(axis_style),
        xaxis3=dict(axis_style),
        xaxis4=dict(axis_style),
        yaxis1=dict(axis_style),
        yaxis2=dict(axis_style),
        yaxis3=dict(axis_style),
        yaxis4=dict(axis_style),
        annotations=ann)

    dimensions = [
        dict(label='trees/sq.mile', values=frame['trees/sq.mile']),
        dict(label='avg.landprice($K/A)', values=frame['avg.landprice_thous$/acre']),
        dict(label='properties/sq.mile', values=frame['properties/sq.mile']),
    ]
    # Neighborhood points also show their borough in the hover text.
    if key == 'ntaname':
        hover_text = frame[key] + ': ' + frame['borough']
    else:
        hover_text = frame[key]
    # colors encode categorical variables
    marker = dict(color=borough_codes,
                  showscale=False,
                  line_color='white',
                  line_width=0.4)
    splom = go.Splom(
        dimensions=dimensions,
        text=hover_text,
        hoverinfo="x+y+text",
        marker=marker,
        diagonal=dict(visible=True),
    )
    fig = go.Figure(data=splom, layout=layout)
    return fig