def test_mogsm(self):
    mcgsm = MCGSM(
        dim_in=0,
        dim_out=3,
        num_components=2,
        num_scales=2,
        num_features=0)

    p0 = 0.3
    p1 = 0.7
    N = 20000
    m0 = array([[2], [0], [0]])
    m1 = array([[0], [2], [1]])
    C0 = cov(randn(mcgsm.dim_out, mcgsm.dim_out**2))
    C1 = cov(randn(mcgsm.dim_out, mcgsm.dim_out**2))

    input = zeros([0, N])
    output = hstack([
        dot(cholesky(C0), randn(mcgsm.dim_out, round(p0 * N))) + m0,
        dot(cholesky(C1), randn(mcgsm.dim_out, round(p1 * N))) + m1]) * (rand(1, N) + 0.5)

    mcgsm.train(input, output, parameters={
        'verbosity': 0,
        'max_iter': 10,
        'train_means': True})

    mogsm = MoGSM(3, 2, 2)

    # translate parameters from MCGSM to MoGSM
    mogsm.priors = sum(exp(mcgsm.priors), 1) / sum(exp(mcgsm.priors))

    for k in range(mogsm.num_components):
        mogsm[k].mean = mcgsm.means[:, k]
        mogsm[k].covariance = inv(dot(mcgsm.cholesky_factors[k], mcgsm.cholesky_factors[k].T))
        mogsm[k].scales = exp(mcgsm.scales[k, :])
        mogsm[k].priors = exp(mcgsm.priors[k, :]) / sum(exp(mcgsm.priors[k, :]))

    self.assertAlmostEqual(mcgsm.evaluate(input, output), mogsm.evaluate(output), 5)

    mogsm_samples = mogsm.sample(N)
    mcgsm_samples = mcgsm.sample(input)

    # generated samples should have the same distribution;
    # ks_2samp returns (statistic, p-value), so compare the p-value
    for i in range(mogsm.dim):
        self.assertTrue(ks_2samp(mogsm_samples[i], mcgsm_samples[0])[1] > 0.0001)
        self.assertTrue(ks_2samp(mogsm_samples[i], mcgsm_samples[1])[1] > 0.0001)
        self.assertTrue(ks_2samp(mogsm_samples[i], mcgsm_samples[2])[1] > 0.0001)

    posterior = mcgsm.posterior(input, mcgsm_samples)

    # average posterior should correspond to prior
    for k in range(mogsm.num_components):
        self.assertLess(abs(1 - mean(posterior[k]) / mogsm.priors[k]), 0.1)
def tailStats(tail1, tail2, gene):
    threeLoc1 = []
    threeLoc2 = []
    tailLen1 = []
    tailLen2 = []
    for tail in tail1:
        if gene in tail[2]:
            repeater(int(tail[3]), threeLoc1, int(tail[1]))
            repeater(int(tail[4]), tailLen1, int(tail[1]))
            #threeLoc1.append(int(tail[3]))
            #tailLen1.append(int(tail[4]))
    for tail in tail2:
        if gene in tail[2]:
            repeater(int(tail[3]), threeLoc2, int(tail[1]))
            repeater(int(tail[4]), tailLen2, int(tail[1]))
            #threeLoc2.append(int(tail[3]))
            #tailLen2.append(int(tail[4]))
    if not threeLoc1 or not threeLoc2:
        pLoc = "nan"
        pTail = "nan"
    else:
        #pLoc = stats.ttest_ind(threeLoc1, threeLoc2)[1]
        #pTail = stats.ttest_ind(tailLen1, tailLen2)[1]
        pLoc = stats.ks_2samp(threeLoc1, threeLoc2)[1]
        pTail = stats.ks_2samp(tailLen1, tailLen2)[1]
    return gene, len(threeLoc1), np.average(threeLoc1), np.average(tailLen1), len(threeLoc2), np.average(threeLoc2), np.average(tailLen2), pLoc, pTail
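# tailStats above relies on a repeater() helper that is not shown here. The sketch
# below is an assumption based only on how it is called: it appends a value to a
# list `count` times, i.e. it weights each tail record by the read count stored in
# column 1. The argument order follows the calls above; the body is a guess.
def repeater(value, target_list, count):
    """Append `value` to `target_list` `count` times (assumed behaviour)."""
    target_list.extend([value] * count)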
def test_weibull(dist, p1, p2, report_file = None, round = False):
    """
    -----------------------------------------------------
    kstest for weibull distribution
    :param p1: scale, lambda > 0
    :param p2: shape, kappa > 0
    :param dist: The distribution to be tested
    :return: True, False
    -----------------------------------------------------
    """
    size = len(dist)
    # s = np.random.weibull(p2, size)
    # dist_weibull_np = map(lambda x : x * p1, s)
    dist_weibull_scipy = stats.weibull_min.rvs(c = p2, loc = 0, scale = p1, size = size)
    if round:
        dist_weibull_scipy2 = []
        for n in dist_weibull_scipy:
            dist_weibull_scipy2.append(round_to_n_digit(n, 7))
        result = stats.ks_2samp(dist_weibull_scipy2, dist)
    else:
        result = stats.ks_2samp(dist_weibull_scipy, dist)
    p = get_p_s_from_ksresult(result)['p']
    s = get_p_s_from_ksresult(result)['s']
    critical_value_s = calc_ks_critical_value(size)
    # return p >= 5e-2 or s <= critical_value_s
    if p >= 5e-2 or s <= critical_value_s:
        return True
    else:
        if report_file is not None:
            report_file.write("BAD: ({0},{1})failed with statistic={2}, pvalue={3}, expected s less than {4} and p larger than 0.05.\n".format(p1, p2, s, p, critical_value_s))
        return False
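# test_weibull above calls two helpers that are defined elsewhere in its project.
# The sketches below are assumptions inferred from how they are used: one unpacks
# the (statistic, p-value) pair returned by stats.ks_2samp into a dict, the other
# returns the approximate large-sample two-sample KS critical value at alpha=0.05,
# c(alpha)*sqrt((n+m)/(n*m)) with c(0.05) ~ 1.36 and equal sample sizes n = m = size.
import numpy as np

def get_p_s_from_ksresult(result):
    # result is the (statistic, pvalue) tuple returned by stats.ks_2samp
    return {'s': result[0], 'p': result[1]}

def calc_ks_critical_value(size, alpha_coefficient=1.36):
    # approximate critical D for two equal-size samples at alpha = 0.05 (assumption)
    return alpha_coefficient * np.sqrt(2.0 / size)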
def motifStats(data,motifSize,degree, usetotal=False): for corr in ('corr','lcorr','lacorr'): motifsNL = findMotifs(data,('NL',corr), motifSize, degree, usetotal) motifsMCI = findMotifs(data,('MCI',corr), motifSize, degree, usetotal) motifsAD = findMotifs(data,('AD',corr), motifSize, degree, usetotal) allMotifs = list(set(motifsNL.keys()) | set(motifsAD.keys()) | set(motifsMCI.keys())) datatype = "Total" if usetotal else "Percent" filename = "result2/{}_ks-stats_size-{}_deg-{}.txt".format(corr+datatype,motifSize,degree) with open(filename,'w') as f: f.write("{0:>10}{1:>15}{2:>15}{3:>15}{4:>15}{5:>15}\n".format('ID','MCI','AD','NORM NL','NORM MCI','NORM AD')) for key in allMotifs: NLdata = motifsNL.get(key,np.zeros(88)) MCIdata = motifsMCI.get(key,np.zeros(88)) ADdata = motifsAD.get(key,np.zeros(88)) KSstatistic, MCIpvalue = stats.ks_2samp(MCIdata,NLdata) KSstatistic, ADpvalue = stats.ks_2samp(ADdata,NLdata) k2,NLnorm = stats.normaltest(NLdata) k2,MCInorm = stats.normaltest(MCIdata) k2,ADnorm = stats.normaltest(ADdata) if MCIpvalue<0.01 or ADpvalue<0.01: line = "*{0:>9}{1:15.3}{2:15.3}{3:15.3}{4:15.3}{5:15.3}\n" else: line = "{0:>10}{1:15.3}{2:15.3}{3:15.3}{4:15.3}{5:15.3}\n" f.write(line.format(str(int(key)),MCIpvalue,ADpvalue,NLnorm,MCInorm,ADnorm))
def main(): pt = '/home/sdhawan/bol_ni_ej/' #sn =sys.argv[1] sbv = np.loadtxt(pt+'sbv_all_b14.txt', dtype='string') snlis = np.loadtxt('all91bg.txt', dtype='string') #p, f=ir_frac(sn) arr =[] for i in snlis: try: s = sbv[sbv[:,0] == i][0] irt = ir_at_max(i) print irt arr.append([irt, float(s[1]), float(s[2])]) except: i print arr arr = np.array(arr) #print "the NIR fraction at bolometric maximum is:", round(ir_at_max(sn)*100, 2), "%" plt.errorbar(arr[:,0], arr[:,1] , arr[:,2], fmt='go') plt.show() return 0 ir = np.loadtxt('../ejecmass.txt', usecols=(-2, -1)) noir = np.loadtxt('../ejecmass_noir.txt', usecols=(-2, -1)) print ks_2samp(ir[:,0], noir[:,0]) plt.hist(ir[:,0], histtype='step') plt.hist(noir[:,0], histtype='step')
def findKSstat(self):
    """ """
    # Load baseline files for comparison:
    baseline = self.loadPickle(condition='baseline')
    # KS stats:
    for syll in self.syllables:
        AED, AEp = stats.ks_2samp(self.syllables[syll]['dstFreq'], baseline[syll]['dstFreq'])
        self.syllables[syll]['EntKS'] = AED
        self.syllables[syll]['EntPvalKS'] = AEp
        print 'syll ', syll, 'entropy : ', AED

        AFD, AFp = stats.ks_2samp(self.syllables[syll]['dstEnt'], baseline[syll]['dstEnt'])
        self.syllables[syll]['FreqKS'] = AFD
        self.syllables[syll]['FreqPvalKS'] = AFp
        print 'syll ', syll, 'freq : ', AFD

        EXPdur = []
        for song in self.syllables[syll]['duration']:
            if song is not None:
                EXPdur.append(self.syllables[syll]['duration'][song])
        BASEdur = []
        for song in baseline[syll]['duration']:
            if song is not None:
                BASEdur.append(baseline[syll]['duration'][song])
        ADT, ADp = stats.ks_2samp(EXPdur, BASEdur)
        self.syllables[syll]['DurKS'] = ADT
        self.syllables[syll]['DurPvalKS'] = ADp
        print 'syll ', syll, 'duration : ', ADT
def KS_test(groups, outfile): jdelim = args.delimiter if args.delimiter != None else ' ' for i,u in enumerate(groups): for j,v in enumerate(groups): if j > i or (j == i and len(args.columns) == 1): break for x,us in enumerate(u.samples): for y,vs in enumerate(v.samples): if len(vs) < args.ignore or len(us) < args.ignore: continue if j == i and y >= x: break if args.random != None: verdict = False for k in range(args.random): res = ks_2samp(random.sample(us, args.subsample), random.sample(vs, args.subsample)) if res[0] < res[1]: verdict = True outfile.write(jdelim.join(u.tup + v.tup + map(str, res)) + '\n') outfile.write('Verdict:' + str(verdict) + '\n') else: res = ks_2samp(us, vs) verdict = False if res[0] < res[1]: verdict = True outfile.write(jdelim.join(u.tup + v.tup + map(str, res)) + '\n') outfile.write('Verdict:' + str(verdict) + '\n')
def test_points(xs, ys):
    print xs[0][0], " steps"
    #print ks_2samp(xs[2:],ys[2:])
    print ks_2samp(xs[2], ys[2])
    print ks_2samp(xs[3], ys[3])
    print ks_2samp(xs[4], ys[4])
    print ks_2samp(xs[5], ys[5])
    print ks_2samp(xs[6], ys[6])
    print "======"
def kstest(x, y, alpha, beta):
    """Find the K-S test probability that the fit and the data were from
    the same distribution"""
    # Vector of expected y from fit
    fity = beta*x + alpha
    # Vector of expected x from fit
    fitx = (y - alpha) / beta
    (D1, p1) = st.ks_2samp(y, fity)
    (D2, p2) = st.ks_2samp(x, fitx)
    return (np.sqrt(D1*D2), np.sqrt(p1*p2))
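# A minimal, hypothetical usage sketch for kstest() above: fit a line to noisy
# synthetic data with scipy.stats.linregress and ask how compatible the fitted
# values are with the observations. All variable names here are illustrative only.
import numpy as np
import scipy.stats as st

x = np.linspace(0, 10, 200)
y = 2.0 * x + 1.0 + np.random.normal(scale=0.5, size=x.size)
beta, alpha = st.linregress(x, y)[:2]          # slope, intercept
D_combined, p_combined = kstest(x, y, alpha, beta)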
def kstest():
    n1 = 200
    n2 = 300
    a = stats.norm.rvs(size=n1, loc=0, scale=1)
    b = stats.norm.rvs(size=n2, loc=0.5, scale=1.5)
    c = stats.norm.rvs(size=n2, loc=0.01, scale=1)
    print(stats.ks_2samp(a, b))
    print(stats.ks_2samp(a, c))
def samples_from_same_distribution(self, *args):
    # Test if flattened samples distributions match (marginals match)
    _, p_marginal = st.ks_2samp(*[s.flatten() for s in args])
    # Test if correlations within non independent draws match
    _, p_correlation = st.ks_2samp(
        *[np.array([np.corrcoef(ss) for ss in s]).flatten() for s in args]
    )
    assert p_marginal >= 0.05 and p_correlation >= 0.05
def stats(binding_data, proximity_data):
    n_bins = 50
    hist_b, bins_b = np.histogram(binding_data, bins=np.linspace(np.min(binding_data), np.max(binding_data), n_bins))
    hist_p, bins_p = np.histogram(proximity_data, bins=np.linspace(np.min(binding_data), np.max(binding_data), n_bins))
    #print hist_b, hist_p
    #print "Binding data> mean:%s, median:%s, std:%s" %(np.mean(binding_data), np.median(binding_data), np.std(binding_data))
    #print "Proximity data> mean:%s, median:%s, std:%s" %(np.mean(proximity_data), np.median(proximity_data), np.std(proximity_data))
    #print scipy.stats.spearmanr(hist_b, hist_p)
    #print bins_b, bins_p
    print ks_2samp(hist_b, hist_b)
    print ks_2samp(hist_b, hist_p)
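# Note on the function above: scipy.stats.ks_2samp expects raw observations, so
# feeding it histogram bin counts compares the distributions of the counts rather
# than of binding_data and proximity_data themselves. A minimal alternative sketch,
# assuming the unbinned arrays are what should be compared:
from scipy.stats import ks_2samp

def stats_raw(binding_data, proximity_data):
    # two-sample KS test directly on the unbinned measurements
    return ks_2samp(binding_data, proximity_data)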
def plot_posteriors_iter(eli,file_names): from scipy import stats import matplotlib as mpl mpl.use('Agg') mpl.rcParams.update({'font.size': 20}) import matplotlib.pyplot as plt no_files=np.array(file_names).shape[0] samples=list() for i in np.arange(no_files): read=np.genfromtxt(file_names[i]) samples.append(read[read.shape[0]/5:read.shape[0],:]) down=eli.lower_bounds up=eli.upper_bounds no_of_pars=up.shape[0] plt.clf() f,axes=plt.subplots(2,int(np.ceil(no_of_pars/2)),figsize=(24,12)) row=0 col=0 colors='bgcymbkrgrcmykw' linetypes=['--','-.','-.','-.','-.',':','-','-'] kolmog_stats=np.zeros((2*no_files-1,up.shape[0])) for i in np.arange(up.shape[0]): # patches=[] lines=[] x=np.arange(down[i],up[i],0.01) for j in np.arange(no_files): Ds,ps=stats.ks_2samp(samples[j][:,i],samples[-1][:,i]) kolmog_stats[j*2,i]=Ds kolmog_stats[j*2,i]=1 if j>0: D,p=stats.ks_2samp(samples[j][:,i],samples[j-1][:,i]) kolmog_stats[j*2-1,i]=D # print(D) kde=stats.gaussian_kde(samples[j][:,i]) kde.covariance_factor = lambda : .3 kde._compute_covariance() line,=axes[row,col].plot(x,kde.evaluate(x),linestyle=linetypes[j],color=colors[j],lw=3) lines.append(line) axes[row,col].set_title(eli.names[i]) print("_________") if i>up.shape[0]-3: axes[row,col].set_ylim([0,15]) row+=1 if (row==2): row=0 col+=1 f.tight_layout(rect=[0, 0.13, 1, 1]) np.savetxt("kolmog.dat",kolmog_stats,delimiter=" & ", fmt="%.2f") # f.legend([leg1,patches[2],patches[1],patches[0]],["Prior distribution","SWMM posterior","Emulator (improved) posterior","Emulator (standard) posterior",], bbox_to_anchor=[0.5, 0.05],loc='center',ncol=2) # f.legend([lines[0],lines[1],lines[2],lines[3],lines[4],lines[5],lines[6]],["noniterative (128)","0$^{th}$ iteration (64)","1$^{st}$ iteration (80)","2$^{nd}$ iteration (96)","3$^{rd}$ iteration (112)","4$^{th}$ iteration (128)","4$^{th}$ iteration (144)","SWMM",], bbox_to_anchor=[0.5, 0.08],loc='center',ncol=3) # f.legend([lines[0],lines[1],lines[2],lines[3],lines[4],lines[5],lines[6],lines[7]],["noniterative (72)","0$^{th}$ iteration (32)","1$^{st}$ iteration (40)","2$^{nd}$ iteration (48)","3$^{rd}$ iteration (56)","4$^{th}$ iteration (64)","5$^{th}$ iteration (72)","SWMM",], bbox_to_anchor=[0.5, 0.08],loc='center',ncol=3) f.legend([lines[0],lines[1],lines[2],lines[3],lines[4],lines[5],lines[6],lines[7]],["noniterative (108)","0$^{th}$ iteration (48)","1$^{st}$ iteration (60)","2$^{nd}$ iteration (72)","3$^{rd}$ iteration (84)","4$^{th}$ iteration (96)","5$^{th}$ iteration (108)","SWMM",], bbox_to_anchor=[0.5, 0.08],loc='center',ncol=3) f.savefig("posteriors.pdf",dpi=500) plt.close()
def omniDataCorr(srefDate, erefDate, startDate, endDate, epochs, SWP, binStride, CorrTime = 'Day', CorrType = 'kstest'): import numpy import bisect import datetime from scipy.stats import ks_2samp, pearsonr from getswdata import getOMNIfiles, dataClean, dateShift, dateList CorrTime = CorrTime.lower() CorrType = CorrType.lower() if endDate < startDate: print('(swdatanal.omniDataCorr).Error: Dates are not applicable') SWPDatRng=0; cepochs=0; KSVals=0; KSDist=0; aepochs=0 return SWPDatRng, cepochs, KSVals, KSDist, aepochs sEpochID = bisect.bisect_left(epochs, startDate) eEpochID = bisect.bisect_left(epochs, endDate) cepochs = epochs[sEpochID:eEpochID] SWPDatRng = SWP[sEpochID:eEpochID] if SWP[sEpochID:eEpochID] == []: print('(swdatanal.omniDataCorr).Error: No data avaliable for the designated date(s) and/or time(s).') SWPDatRng=0; cepochs=0; KSVals=0; KSDist=0; aepochs=0 return SWPDatRng, cepochs, KSVals, KSDist, aepochs _, bins = getDistrib(filter(lambda v: v==v, SWPDatRng), stride = binStride, norm = False) sEpochID = bisect.bisect_left(epochs, srefDate) eEpochID = bisect.bisect_left(epochs, erefDate) SWPV01 = SWP[sEpochID:eEpochID] SWPD01 = getDistrib(filter(lambda v: v==v, SWPV01), bins=bins, norm=True) if CorrTime == 'day': aepochs = []; KSVals = []; KSDist = [] sEpoch = datetime.datetime(startDate.year,startDate.month,startDate.day, 0, 0, 0) eEpoch = dateShift(sEpoch, hours = 23, minutes = 59, seconds = 59) for i in range((endDate-startDate).days+1): aepochs = aepochs + [dateShift(sEpoch,0,0,i,0,0,0)] sEpochID = bisect.bisect_left(epochs, dateShift(sEpoch,0,0,i,0,0,0)) eEpochID = bisect.bisect_left(epochs, dateShift(eEpoch,0,0,i,0,0,0)) SWPV02 = SWP[sEpochID:eEpochID] SWPD02 = getDistrib(filter(lambda v: v==v, SWPV02), bins=bins, norm=True) if CorrType == 'kstest': KSVals = KSVals + [ks_2samp(SWPV01, SWPV02)] KSDist = KSDist + [ks_2samp(SWPD01, SWPD02)] elif CorrType == 'pearson': KSVals = KSVals + [pearsonr(SWPV01, SWPV02)] KSDist = KSDist + [pearsonr(SWPD01, SWPD02)] KSVals = numpy.array(KSVals) KSDist = numpy.array(KSDist) return SWPDatRng, cepochs, KSVals, KSDist, aepochs
def simple_example():
    np.random.seed(12345678)
    n1 = 200000
    n2 = 300000

    '''
    The distributions of rvs2, rvs3 and rvs4 become progressively more similar
    to that of rvs1, which shows up as progressively larger p-values.
    '''
    rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1)
    rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5)
    rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0)
    rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0)
    print stats.ks_2samp(rvs1, rvs2)
    print stats.ks_2samp(rvs1, rvs3)
    print stats.ks_2samp(rvs1, rvs4)
    print stats.ks_2samp(rvs1, rvs1)

    bins_cnt = 100
    alpha = 0.5
    plt.hist(rvs1, label='rvs1', bins=bins_cnt, alpha=1.0, density=True, histtype='stepfilled')
    plt.hist(rvs2, label='rvs2', bins=bins_cnt, alpha=alpha, density=True, histtype='stepfilled')
    plt.hist(rvs3, label='rvs3', bins=bins_cnt, alpha=alpha, density=True, histtype='stepfilled')
    plt.hist(rvs4, label='rvs4', bins=bins_cnt, alpha=alpha, density=True, histtype='stepfilled')
    plt.legend(loc='best', frameon=False)
    plt.show()
def plot_posteriors(eli,file_names): from scipy import stats import matplotlib as mpl mpl.use('Agg') mpl.rcParams.update({'font.size': 20}) import matplotlib.pyplot as plt no_files=np.array(file_names).shape[0] samples=list() for i in np.arange(no_files): samples.append(np.genfromtxt(file_names[i])) down=eli.lower_bounds up=eli.upper_bounds no_of_pars=up.shape[0] plt.clf() f,axes=plt.subplots(2,int(np.ceil(no_of_pars/2)),figsize=(24,12)) row=0 col=0 colors='bgrcmykwbgrcmykw' linetypes=['--','-','-','-','-','-','-'] for i in np.arange(up.shape[0]): # patches=[] lines=[] x=np.arange(down[i],up[i],0.01) pri=np.zeros(x.shape[0]) for j in np.arange(x.shape[0]): pri[j]=eli.prior_dist(x[j],i) leg1,=axes[row,col].plot(x,pri,'k.-') D1,p1=stats.ks_2samp(samples[no_files-1][:,i],samples[no_files-3][:,i]) D2,p2=stats.ks_2samp(samples[no_files-1][:,i],samples[no_files-2][:,i]) print(D1) print(D2) print("_________") for j in np.arange(no_files): kde=stats.gaussian_kde(samples[j][:,i]) kde.covariance_factor = lambda : .3 kde._compute_covariance() line,=axes[row,col].plot(x,kde.evaluate(x),linestyle=linetypes[j],color=colors[j],lw=2) lines.append(line) axes[row,col].set_title(eli.names[i]) if i>up.shape[0]-3: axes[row,col].set_ylim([0,15]) row+=1 if (row==2): row=0 col+=1 f.tight_layout(rect=[0, 0.08, 1, 1]) # f.legend([leg1,patches[2],patches[1],patches[0]],["Prior distribution","SWMM posterior","Emulator (improved) posterior","Emulator (standard) posterior",], bbox_to_anchor=[0.5, 0.05],loc='center',ncol=2) f.legend([leg1,lines[2],lines[1],lines[0]],["Prior distribution","SWMM posterior","Emulator (improved) posterior","Emulator (standard) posterior",], bbox_to_anchor=[0.5, 0.05],loc='center',ncol=2) f.savefig("posteriors.pdf",dpi=500) plt.close()
def model_v_model_cdfs_pdfs(arr, bins, cdf, pdf, args, nsinks=64): done = [] for ref_key in arr.keys(): for key in arr.keys(): if ref_key == key: continue # if (ref_key != 'bInf' and ref_key != 'bInfsd3') and (key != 'bInf' and key != 'bInfsd3'): continue # if ref_key != 'hydro_both' and key != 'hydro_both': continue if "%s_v_%s" % (ref_key, key) in done or "%s_v_%s" % (key, ref_key) in done: continue done.append("%s_v_%s" % (ref_key, key)) t = ttest_ind(arr[ref_key], arr[key], equal_var=False) ks = ks_2samp(arr[ref_key], arr[key]) # plot histograms and show Welch t, KS p-values fig, axes = plt.subplots(2, 1, sharex=True) ax = axes.flatten() plt.subplots_adjust(hspace=0.1) ax[0].plot((bins[ref_key][1:] + bins[ref_key][:-1]) / 2, cdf[ref_key], c="b", label="%s" % ref_key) ax[0].plot((bins[key][1:] + bins[key][:-1]) / 2, cdf[key], c="k", label="%s" % key) ax[1].plot((bins[ref_key][1:] + bins[ref_key][:-1]) / 2, pdf[ref_key], c="b", label="%s" % ref_key) ax[1].plot((bins[key][1:] + bins[key][:-1]) / 2, pdf[key], c="k", label="%s" % key) ax[1].legend(loc=0, fontsize="medium") ax[0].set_ylabel("CDF") ax[1].set_ylabel("PDF") ax[1].set_xlabel(r"$\dot{M}$") for i in range(2): ax[i].set_xlim(-7, -2) ax[i].set_ylim(-0.1, 1.1) plt.suptitle("%s_v_%s: Welch P(t)=%.2g, KS P(t)=%.2g" % (ref_key, key, t[1], ks[1])) plt.savefig(os.path.join(args.outdir, "%s-v-%s-nsinks-%d.png" % (ref_key, key, nsinks))) plt.close() # repeat for hydro1, hydro2, hydro 1+2 t = ttest_ind(arr[ref_key], arr[key], equal_var=False) ks = ks_2samp(arr[ref_key], arr[key]) # plot histograms and show Welch t, KS p-values fig, axes = plt.subplots(2, 1, sharex=True) ax = axes.flatten() plt.subplots_adjust(hspace=0) for key, c in zip(["hydro_both", "bInf", "bInfsd3"], ["b", "y", "k"]): ax[0].plot((bins[key][1:] + bins[key][:-1]) / 2, cdf[key], c=c, label="%s" % key) ax[1].plot((bins[key][1:] + bins[key][:-1]) / 2, pdf[key], c=c, label="%s" % key) ax[1].legend(loc=0, fontsize="medium") ax[0].set_ylabel("CDF") ax[1].set_ylabel("PDF") ax[1].set_xlabel(r"$\dot{M}$") for i in range(2): ax[i].set_xlim(-6, -2.5) plt.savefig(os.path.join(args.outdir, "hydro1+2-v-hydro1-v-hydro2-nsinks-%d.png" % nsinks)) plt.close()
def draw_overtraining(bdt_name, test, train): test_bg = test[test.classID==1] test_sig = test[test.classID==0] train_bg = train[train.classID==1] train_sig = train[train.classID==0] fig = plt.figure() ax = fig.add_subplot(111) low = min(test[bdt_name].min(), train[bdt_name].min()) high = max(test[bdt_name].max(), train[bdt_name].max()) print bdt_name, "signal", print ks_2samp(test_sig[bdt_name], train_sig[bdt_name])[1] print bdt_name, "background", print ks_2samp(test_bg[bdt_name], train_bg[bdt_name])[1] ax.hist(train_bg[bdt_name], bins=50, normed=True, range=(low,high), label="training background", color="blue", alpha=0.75) ax.hist(train_sig[bdt_name], bins=50, normed=True, range=(low,high), label="training signal", color="red", alpha=0.75) y,binEdges = np.histogram(test_bg[bdt_name], bins=50, normed=True, range=(low,high)) bincenters = 0.5*(binEdges[1:]+binEdges[:-1]) ax.plot(bincenters, y, 'o', color="blue", label="test background") y,binEdges = np.histogram(test_sig[bdt_name], bins=50, normed=True, range=(low,high)) bincenters = 0.5*(binEdges[1:]+binEdges[:-1]) ax.plot(bincenters, y, 'o', color="red", label="test signal") ax.legend(loc=2) ax.set_xlabel("BDT output") ax.set_ylabel("Arbitrary units") fig.savefig("/tmp/overtraining_%s.pdf"%(bdt_name))
def compute_ks_by_contained(contigs_by_lib_name, sinks, sources): # compute median of maxmin as well as ks p-value of contained maxmin for lib_snk in contigs_by_lib_name: # for a fixed lib_snk; do all source libs together # contained_ctg: contig names of all source libraries stored by source library names contained_ctg=collections.defaultdict(set) for snkCtg in contigs_by_lib_name[lib_snk].itervalues(): for srcCtg in snkCtg.contained_in: contained_ctg[srcCtg.lib].add(srcCtg.name) for lib_src in contigs_by_lib_name: if lib_src in contained_ctg: contained=[] not_contained=[] for ctg in contigs_by_lib_name[lib_src]: if ctg in contained_ctg[lib_src]: contained.append(contigs_by_lib_name[lib_src][ctg].maxmin) else: not_contained.append(contigs_by_lib_name[lib_src][ctg].maxmin) # contained=[contigs_by_lib_name[lib_src][ctg].maxmin for ctg in contigs_by_lib_name[lib_src] if ctg in contained_ctg[lib_src]] # not_contained=[contigs_by_lib_name[lib_src][ctg].maxmin for ctg in contigs_by_lib_name[lib_src] if ctg not in contained_ctg[lib_src]] ks_pvalue = stats.ks_2samp(contained, not_contained)[1] print lib_src, lib_snk, ks_pvalue, sum(contained)/len(contained), sum(not_contained)/len(not_contained) if ks_pvalue < 0.05 and np.median(contained) > np.median(not_contained): sources[lib_snk] |= {lib_src} sinks[lib_src] |= {lib_snk}
def p_value_scoring_object_test(clf, X, y): """ p_value_getter is a scoring callable that returns the negative p value from the KS test on the prediction probabilities for the particle and antiparticle samples. """ print("Greeting : ", greeting) #Finding out the prediction probabilities prob_pred=clf.predict_proba(X)[:,1] #print(prob_pred) #This can be deleted if not using Keras #For Keras turn cathegorical y back to normal y if y.ndim==2: if y.shape[0]!=1 and y.shape[1]!=1: #Then we have a cathegorical vector y = y[:,1] #making sure the inputs are row vectors y = np.reshape(y,(1,y.shape[0])) prob_pred = np.reshape(prob_pred,(1,prob_pred.shape[0])) #Separate prob into particle and antiparticle samples prob_0 = prob_pred[np.logical_or.reduce([y==0])] prob_1 = prob_pred[np.logical_or.reduce([y==1])] #if __debug__: #print("Plot") p_KS_stat=stats.ks_2samp(prob_0,prob_1) print(p_KS_stat) p_KS=-p_KS_stat[1] return p_KS
def calc_ks_test(self, true_distribution):
    if len(self.isize_list) >= 5:
        KS_statistic, self.pval = ks_2samp(self.isize_list, true_distribution)
        return KS_statistic, self.pval
    else:
        self.pval = -1
        return -1, -1
def compare_fixlens(samp_fixlen, fixlendist, eps=.000000001):
    nonan_samp_fixlen = samp_fixlen[np.logical_not(np.isnan(samp_fixlen))]
    nonan_fixlendist = fixlendist[np.logical_not(np.isnan(fixlendist))]
    print nonan_samp_fixlen, nonan_fixlendist
    ks, p = sts.ks_2samp(nonan_samp_fixlen, nonan_fixlendist)
    print ks, p
    return np.log(p + eps)
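# Hypothetical usage of compare_fixlens() above: compare a small sample of fixation
# durations against a reference distribution, both possibly containing NaNs. The
# data below are made up purely for illustration.
import numpy as np

samp_fixlen = np.array([0.21, 0.35, np.nan, 0.28, 0.40])
fixlendist = np.random.gamma(shape=2.0, scale=0.15, size=1000)
log_p = compare_fixlens(samp_fixlen, fixlendist)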
def _get_xy_dataset_statistics(x_values, y_values, fcorrect_x_cutoff = 1.0, fcorrect_y_cutoff = 1.0, x_fuzzy_range = 0.1, y_scalar = 1.0): ''' A function which takes two lists of values of equal length with corresponding entries and returns a dict containing a variety of metrics. :param x_values: A list of values for the X-axis (experimental values). :param y_values: A list of values for the X-axis (predicted values). :param fcorrect_x_cutoff: See get_xy_dataset_statistics. :param fcorrect_y_cutoff: See get_xy_dataset_statistics. :param x_fuzzy_range: See get_xy_dataset_statistics. :param y_scalar: See get_xy_dataset_statistics. :return: A table of statistics. ''' from scipy.stats import pearsonr, spearmanr, normaltest, ks_2samp, kstest, norm assert(len(x_values) == len(y_values)) return dict( pearsonr = pearsonr(x_values, y_values), spearmanr = spearmanr(x_values, y_values), gamma_CC = gamma_CC(x_values, y_values), MAE = mae(x_values, y_values), normaltestx = normaltest(x_values), normaltesty = normaltest(y_values), kstestx = kstest(x_values, 'norm'), kstesty = kstest(y_values, 'norm'), ks_2samp = ks_2samp(x_values, y_values), fraction_correct = fraction_correct(x_values, y_values, x_cutoff = fcorrect_x_cutoff, y_cutoff = fcorrect_y_cutoff), fraction_correct_fuzzy_linear = fraction_correct_fuzzy_linear(x_values, y_values, x_cutoff = fcorrect_x_cutoff, x_fuzzy_range = x_fuzzy_range, y_scalar = y_scalar), )
def calc_ks_stats(scores, exp_scores=None):
    from scipy import stats
    if exp_scores:
        (D, p_val) = stats.ks_2samp(scores, exp_scores)
    else:
        (D, p_val) = stats.kstest(scores, stats.uniform.cdf)
    return {'D': D, 'p_val': p_val}
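# Hypothetical usage of calc_ks_stats() above: with exp_scores it runs the
# two-sample test, without it the scores are tested against the standard uniform
# CDF via stats.kstest. The toy data here are an assumption.
import numpy as np

observed = list(np.random.uniform(size=500))
expected = list(np.random.uniform(size=500))
print(calc_ks_stats(observed, expected))   # two-sample KS
print(calc_ks_stats(observed))             # one-sample KS against the uniform CDF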
def get_ks(iso, self):
    for daynr in BUCKETS:
        #print(sorted(Counter(choose_day(iso.days, daynr).rows["duration_of_state"]).items()))
        #print(sorted(Counter(choose_day(self.days, daynr).rows["duration_of_state"]).items()))
        iso_das = choose_day(iso.days, daynr).rows["duration_of_state"]
        self_das = choose_day(self.days, daynr).rows["duration_of_state"]
        yield stats.ks_2samp(iso_das, self_das)
def ks_statistic_calc(fund_ts_past, fund_ts_month):
    seq1 = deepcopy(fund_ts_past.values)
    seq2 = deepcopy(fund_ts_month.values)
    tsu.returnize0(seq1)
    tsu.returnize0(seq2)
    (ks, p) = scst.ks_2samp(seq1, seq2)
    return ks, p
def sort_features(df1, df2): """ Takes two dataframes, and calculates a KS-Test between each two columns that appear in both dataframes. Returns a list of column names, sorted by p-value in ascending order, and a list of corresponding p-values. Args: df1 (pd.DataFrame): Dataframe of feature columns for 'sample 1' df2 (pd.DataFrame): Dataframe of feature columns for 'sample 2' Returns ([str], [float]): Lists of column names and p-values. """ common_cols = set.intersection(*[set(df.columns) for df in [df1, df2]]) if len(common_cols) == 0: raise ValueError("The dataframes have no columns in common.") # calculate a KS-test for each feature column d = [] p_vals = [] for c in common_cols: this_d, this_p = ks_2samp(df1[c], df2[c]) d.append(this_d) p_vals.append(this_p) # sort by p-value pc = list(zip(p_vals, common_cols)) pc.sort() p_vals, common_cols = zip(*pc) # return sorted list of feature names and p-values return list(common_cols), list(p_vals)
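# Hypothetical usage of sort_features() above with two small toy dataframes; only
# the columns the frames share ('a' and 'b') are tested, and the column whose
# distribution shifted the most comes back first (smallest p-value).
import numpy as np
import pandas as pd

df_ref = pd.DataFrame({'a': np.random.normal(0, 1, 500),
                       'b': np.random.normal(0, 1, 500)})
df_new = pd.DataFrame({'a': np.random.normal(0.5, 1, 500),
                       'b': np.random.normal(0, 1, 500),
                       'c': np.random.normal(0, 1, 500)})
cols, p_values = sort_features(df_ref, df_new)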
def ks_test(samples1, samples2, threshold=0.9):
    """Applies a KS test to determine if two sets of samples are the same.

    The ks test is applied parameter-by-parameter. If the two-tailed p-value
    returned by the test is greater than ``threshold``, the samples are
    considered to be the same.

    Parameters
    ----------
    samples1 : dict
        Dictionary mapping parameters to the first set of samples.
    samples2 : dict
        Dictionary mapping parameters to the second set of samples.
    threshold : float
        The threshold to use for the p-value. Default is 0.9.

    Returns
    -------
    dict :
        Dictionary mapping parameter names to booleans indicating whether the
        given parameter passes the KS test.
    """
    is_the_same = {}
    assert set(samples1.keys()) == set(samples2.keys()), (
        "samples1 and 2 must have the same parameters")
    # iterate over the parameters
    for param in samples1:
        s1 = samples1[param]
        s2 = samples2[param]
        _, p_value = ks_2samp(s1, s2)
        is_the_same[param] = p_value > threshold
    return is_the_same
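# Hypothetical usage of ks_test() above: two dictionaries of posterior samples for
# the same parameters, checked parameter by parameter. Parameter names and sample
# values are illustrative assumptions.
import numpy as np

samples_run1 = {'mass1': np.random.normal(30, 2, 5000),
                'mass2': np.random.normal(25, 2, 5000)}
samples_run2 = {'mass1': np.random.normal(30, 2, 5000),
                'mass2': np.random.normal(25, 2, 5000)}
result = ks_test(samples_run1, samples_run2, threshold=0.9)
# each parameter maps to True only if its KS p-value exceeds the 0.9 threshold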
def CDFDistance2(rho1, v1, rho2, v2, rho_min, rho_max):
    """
    For two input 2D signals calculate the "distance" between their CDFs -
    averaged (over density bins) distance between two 1D CDFs of speed
    calculated for a specific density bin.
    input:
      - rho1: density array of size n
      - v1: speed array of size n
      - rho2: density array of size m
      - v2: speed array of size m
      - rho_min: lower boundary of density value considered (used for bins creation)
      - rho_max: upper boundary of density value considered (used for bins creation)
    output:
      - KSD: non-negative number from 0 to 1
    """
    EMPTY = -1
    nBins = 10  # Now it is not obvious which value to get
    bins = np.linspace(rho_min, rho_max, nBins + 1)
    # dist1D = EMPTY*np.ones((1,nBins));
    dist1D = []
    for iBin in range(nBins):
        v1_b = v1[(rho1 >= bins[iBin]) * (rho1 <= bins[iBin + 1])]
        v2_b = v2[(rho2 >= bins[iBin]) * (rho2 <= bins[iBin + 1])]
        if (len(v1_b) > 0) and (len(v2_b) > 0):
            [ks2stat, p] = stats.ks_2samp(v1_b, v2_b)
            # dist1D[0,iBin] = ks2stat;
            dist1D.append(ks2stat)
    #KSD = np.sum(dist1D[dist1D != EMPTY])/len(dist1D[dist1D != EMPTY]);
    KSD = np.sum(dist1D) / len(dist1D)
    return KSD
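# Hypothetical usage of CDFDistance2() above: two synthetic density/speed pairs on
# the same density range; the returned value lies between 0 and 1. The fundamental
# diagram used to generate the speeds is an arbitrary illustrative choice.
import numpy as np

rho1 = np.random.uniform(0.0, 5.0, 2000)
v1 = 1.5 / (1.0 + rho1) + np.random.normal(0, 0.05, rho1.size)
rho2 = np.random.uniform(0.0, 5.0, 2000)
v2 = 1.4 / (1.0 + rho2) + np.random.normal(0, 0.05, rho2.size)
print(CDFDistance2(rho1, v1, rho2, v2, rho_min=0.0, rho_max=5.0))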
def do_ks_analysis(profiles, lens, name='', plot=False): L = np.array(0.446*(lens-np.mean(lens)), dtype='float64') n, bins = np.histogram(L, bins=2) idx_l = np.digitize(L, bins) pos_l = (idx_l == 1) r_list_l = np.where(pos_l)[0] lower_y = np.log(np.array(list(profiles[r_list_l]), dtype=np.float)) # print lower_y.shape, L[r_list_l] pos_u = (idx_l > 1) r_list_u = np.where(pos_u)[0] upper_y = np.log(np.array(list(profiles[r_list_u]), dtype=np.float)) # print upper_y.shape, L[r_list_u] if upper_y.shape[0] < 2 or lower_y.shape[0] < 2: return np.ones(profiles[0].shape[0]) '''ks 2 sample''' pval=[] for k in xrange(lower_y.shape[1]): try: _, p = stats.ks_2samp(lower_y[:,k], upper_y[:,k]) except ValueError: p = 1 pval.append(p) if plot: plot_ks_analysis(lower_y, upper_y, pval, name) pv = np.array(pval) return pv
def ks(list_obs, list_poisson, sample_size, confidence, lamb): #D, p_value = ks_2samp(list_poisson, list_obs) D, p_value = ks_2samp(list_poisson, list_obs) a = "The Kolmogorov-Smirnov Test accept the null hypothesis" b = "The Kolmogorov-Smirnov Test reject the null hypothesis" global teste teste = '' if lamb > 10: teste = b D_critico = 0 if confidence == 0.95: confidenceL = 1 elif confidence == 0.99: confidenceL = 2 else: confidenceL = 0 s = sqrt(sample_size) table = np.matrix([[0.202, 0.214, 0.226, 0.237, 0.254], [0.234, 0.242,0.254,0.265,0.281], [0.290,0.3,0.310,0.324,0.334], [0.152,0.166,0.172,0.179,0.185], [0.180,0.188,0.194,0.199,0.206], [0.223,0.234,0.236,0.243,0.249], [0.120,0.132,0.140,0.144,0.149], [0.141,0.151,0.156,0.160,0.165], [0.176,0.185,0.188,0.195,0.197], [0.100,0.112,0.116,0.120,0.124], [0.116,0.125,0.129,0.134,0.140], [0.149,0.154,0.158,0.160,0.168], [0.087,0.097,0.102,0.106,0.110], [0.101,0.108,0.113,0.118,0.122], [0.130,0.135,0.137,0.143,0.146], [0.55/s,0.61/s,0.65/s,0.67/s,0.7/s], [0.64/s,0.69/s,0.72/s,0.75/s,0.77/s], [0.82/s,0.86/s,0.87/s,0.9/s,0.93/s]]) if sample_size < 12: if confidenceL == 0: if lamb <=1: if D <= table[0,0]: teste = a D_critico = table[0,0] else: teste = b D_critico = table[0,0] elif lamb > 1 and lamb <=2: if D <= table[0,1]: teste = a D_critico= table[0,1] else: teste = b D_critico= table[0,1] elif lamb > 2 and lamb <=3: if D <= table[0,2]: teste = a D_critico= table[0,2] else: teste = b D_critico= table[0,2] elif lamb > 3 and lamb <=5: if D <= table[0,3]: teste = a D_critico= table[0,3] else: teste = b D_critico= table[0,3] elif lamb > 5 and lamb <=10: if D <= table[0,4]: teste = a D_critico= table[0,4] else: teste = b D_critico= table[0,4] #----------------------------------------------------------- elif confidenceL == 1: if lamb <=1: if D <= table[1,0]: teste = a D_critico= table[1,0] else: teste = b D_critico= table[1,0] elif lamb > 1 and lamb <=2: if D <= table[1,1]: teste = a D_critico= table[1,1] else: teste = b D_critico= table[1,1] elif lamb > 2 and lamb <=3: if D <= table[1,2]: teste = a D_critico= table[1,2] else: teste = b D_critico= table[1,2] elif lamb > 3 and lamb <=5: if D <= table[1,3]: teste = a D_critico= table[1,3] else: teste = b D_critico= table[1,3] elif lamb > 5 and lamb <=10: if D <= table[1,4]: teste = a D_critico= table[1,4] else: teste = b D_critico= table[1,4] #----------------------------------------------------------- elif confidenceL == 2: if lamb <=1: if D <= table[2,0]: teste = a D_critico= table[2,0] else: teste = b D_critico= table[2,0] elif lamb > 1 and lamb <=2: if D <= table[2,1]: teste = a D_critico= table[2,1] else: teste = b D_critico= table[2,1] elif lamb > 2 and lamb <=3: if D <= table[2,2]: teste = a D_critico= table[2,2] else: teste = b D_critico= table[2,2] elif lamb > 3 and lamb <=5: if D <= table[2,3]: teste = a D_critico= table[2,3] else: teste = b D_critico= table[2,3] elif lamb > 5 and lamb <=10: if D <= table[2,4]: teste = a D_critico= table[2,4] else: teste = b D_critico= table[2,4] #----------------------------------------------------------- elif sample_size >=12 and sample_size < 20: if confidenceL == 0: if lamb <=1: if D <= table[3,0]: teste = a D_critico= table[3,0] else: teste = b D_critico= table[3,0] elif lamb > 1 and lamb <=2: if D <= table[3,1]: teste = a D_critico= table[3,1] else: teste = b D_critico= table[3,1] elif lamb > 2 and lamb <=3: if D <= table[3,2]: teste = a D_critico= table[3,2] else: teste = b D_critico= table[3,2] elif lamb > 3 and lamb <=5: if D <= table[3,3]: teste = 
a D_critico= table[3,3] else: teste = b D_critico= table[3,3] elif lamb > 5 and lamb <=10: if D <= table[3,4]: teste = a D_critico= table[3,4] else: teste = b D_critico= table[3,4] #----------------------------------------------------------- elif confidenceL == 1: if lamb <=1: if D <= table[4,0]: teste = a D_critico= table[4,0] else: teste = b D_critico= table[4,0] elif lamb > 1 and lamb <=2: if D <= table[4,1]: teste = a D_critico= table[4,1] else: teste = b D_critico= table[4,1] elif lamb > 2 and lamb <=3: if D <= table[4,2]: teste = a D_critico= table[4,2] else: teste = b D_critico= table[4,2] elif lamb > 3 and lamb <=5: if D <= table[4,3]: teste = a D_critico= table[4,3] else: teste = b D_critico= table[4,3] elif lamb > 5 and lamb <=10: if D <= table[4,4]: teste = a D_critico= table[4,4] else: teste = b D_critico= table[4,4] #----------------------------------------------------------- elif confidenceL == 2: if lamb <=1: if D <= table[5,0]: teste = a D_critico= table[5,0] else: teste = b D_critico= table[5,0] elif lamb > 1 and lamb <=2: if D <= table[5,1]: teste = a D_critico= table[5,1] else: teste = b D_critico= table[5,1] elif lamb > 2 and lamb <=3: if D <= table[5,2]: teste = a D_critico= table[5,2] else: teste = b D_critico= table[5,2] elif lamb > 3 and lamb <=5: if D <= table[5,3]: teste = a D_critico= table[5,3] else: teste = b D_critico= table[5,3] elif lamb > 5 and lamb <=10: if D <= table[5,4]: teste = a D_critico= table[5,4] else: teste = b D_critico= table[5,4] #----------------------------------------------------------- elif sample_size >= 20 and sample_size < 30: if confidenceL == 0: if lamb <=1: if D <= table[6,0]: teste = a D_critico= table[6,0] else: teste = b D_critico= table[6,0] elif lamb > 1 and lamb <=2: if D <= table[6,1]: teste = a D_critico= table[6,1] else: teste = b D_critico= table[6,1] elif lamb > 2 and lamb <=3: if D <= table[6,2]: teste = a D_critico= table[6,2] else: teste = b D_critico= table[6,2] elif lamb > 3 and lamb <=5: if D <= table[6,3]: teste = a D_critico= table[6,3] else: teste = b D_critico= table[6,3] elif lamb > 5 and lamb <=10: if D <= table[6,4]: teste = a D_critico= table[6,4] else: teste = b D_critico= table[6,4] #----------------------------------------------------------- elif confidenceL == 1: if lamb <=1: if D <= table[7,0]: teste = a D_critico= table[7,0] else: teste = b D_critico= table[7,0] elif lamb > 1 and lamb <=2: if D <= table[7,1]: teste = a D_critico= table[7,1] else: teste = b D_critico= table[7,1] elif lamb > 2 and lamb <=3: if D <= table[7,2]: teste = a D_critico= table[7,2] else: teste = b D_critico= table[7,2] elif lamb > 3 and lamb <=5: if D <= table[7,3]: teste = a D_critico= table[7,3] else: teste = b D_critico= table[7,3] elif lamb > 5 and lamb <=10: if D <= table[7,4]: teste = a D_critico= table[7,4] else: teste = b D_critico= table[7,4] #----------------------------------------------------------- elif confidenceL == 2: if lamb <=1: if D <= table[8,0]: teste = a D_critico= table[8,0] else: teste = b D_critico= table[8,0] elif lamb > 1 and lamb <=2: if D <= table[8,1]: teste = a D_critico= table[8,1] else: teste = b D_critico= table[8,1] elif lamb > 2 and lamb <=3: if D <= table[8,2]: teste = a D_critico= table[8,2] else: teste = b elif lamb > 3 and lamb <=5: if D <= table[8,3]: teste = a D_critico= table[8,3] else: teste = b elif lamb > 5 and lamb <=10: if D <= table[8,4]: teste = a D_critico= table[8,4] else: teste = b D_critico= table[8,4] #----------------------------------------------------------- elif 
sample_size >= 30 and sample_size < 40: if confidenceL == 0: if lamb <=1: if D <= table[9,0]: teste = a D_critico= table[9,0] else: teste = b D_critico= table[9,0] elif lamb > 1 and lamb <=2: if D <= table[9,1]: teste = a D_critico= table[9,1] else: teste = b D_critico= table[9,1] elif lamb > 2 and lamb <=3: if D <= table[9,2]: teste = a D_critico= table[9,2] else: teste = b D_critico= table[9,2] elif lamb > 3 and lamb <=5: if D <= table[9,3]: teste = a D_critico= table[9,3] else: teste = b D_critico= table[9,3] elif lamb > 5 and lamb <=10: if D <= table[9,4]: teste = a D_critico= table[9,4] else: teste = b D_critico= table[9,4] #----------------------------------------------------------- elif confidenceL == 1: if lamb <=1: if D <= table[10,0]: teste = a D_critico= table[10,0] else: teste = b D_critico= table[10,0] elif lamb > 1 and lamb <=2: if D <= table[10,1]: teste = a D_critico= table[10,1] else: teste = b D_critico= table[10,1] elif lamb > 2 and lamb <=3: if D <= table[10,2]: teste = a D_critico= table[10,2] else: teste = b D_critico= table[10,2] elif lamb > 3 and lamb <=5: if D <= table[10,3]: teste = a D_critico= table[10,3] else: teste = b D_critico= table[10,3] elif lamb > 5 and lamb <=10: if D <= table[10,4]: teste = a D_critico= table[10,4] else: teste = b D_critico= table[10,4] #----------------------------------------------------------- elif confidenceL == 2: if lamb <=1: if D <= table[11,0]: teste = a D_critico= table[11,0] else: teste = b D_critico= table[11,0] elif lamb > 1 and lamb <=2: if D <= table[11,1]: teste = a D_critico= table[11,1] else: teste = b D_critico= table[11,1] elif lamb > 2 and lamb <=3: if D <= table[11,2]: teste = a D_critico= table[11,2] else: teste = b D_critico= table[11,2] elif lamb > 3 and lamb <=5: if D <= table[11,3]: teste = a D_critico= table[11,3] else: teste = b D_critico= table[11,3] elif lamb > 5 and lamb <=10: if D <= table[11,4]: teste = a D_critico= table[11,4] else: teste = b D_critico= table[11,4] #----------------------------------------------------------- elif sample_size == 40: if confidenceL == 0: if lamb <=1: if D <= table[12,0]: teste = a D_critico= table[12,0] else: teste = b D_critico= table[12,0] elif lamb > 1 and lamb <=2: if D <= table[12,1]: teste = a D_critico= table[12,1] else: teste = b D_critico= table[12,1] elif lamb > 2 and lamb <=3: if D <= table[12,2]: teste = a D_critico= table[12,2] else: teste = b D_critico= table[12,2] elif lamb > 3 and lamb <=5: if D <= table[12,3]: teste = a D_critico= table[12,3] else: teste = b D_critico= table[12,3] elif lamb > 5 and lamb <=10: if D <= table[12,4]: teste = a D_critico= table[12,4] else: teste = b D_critico= table[12,4] #----------------------------------------------------------- elif confidenceL == 1: if lamb <=1: if D <= table[13,0]: teste = a D_critico= table[13,0] else: teste = b D_critico= table[13,0] elif lamb > 1 and lamb <=2: if D <= table[13,1]: teste = a D_critico= table[13,1] else: teste = b D_critico= table[13,1] elif lamb > 2 and lamb <=3: if D <= table[13,2]: teste = a D_critico= table[13,2] else: teste = b D_critico= table[13,2] elif lamb > 3 and lamb <=5: if D <= table[13,3]: teste = a D_critico= table[13,3] else: teste = b D_critico= table[13,3] elif lamb > 5 and lamb <=10: if D <= table[13,4]: teste = a D_critico= table[13,4] else: teste = b D_critico= table[13,4] #----------------------------------------------------------- elif confidenceL == 2: if lamb <=1: if D <= table[14,0]: teste = a D_critico= table[14,0] else: teste = b D_critico= table[14,0] elif 
lamb > 1 and lamb <=2: if D <= table[14,1]: teste = a D_critico= table[14,1] else: teste = b D_critico= table[14,1] elif lamb > 2 and lamb <=3: if D <= table[14,2]: teste = a D_critico= table[14,2] else: teste = b D_critico= table[14,2] elif lamb > 3 and lamb <=5: if D <= table[14,3]: teste = a D_critico= table[14,3] else: teste = b D_critico= table[14,3] elif lamb > 5 and lamb <=10: if D <= table[14,4]: teste = a D_critico= table[14,4] else: teste = b D_critico= table[14,4] #----------------------------------------------------------- elif sample_size > 40: if confidenceL == 0: if lamb <=1: if D <= table[15,0]: teste = a D_critico= table[15,0] else: teste = b D_critico= table[15,0] elif lamb > 1 and lamb <=2: if D <= table[15,1]: teste = a D_critico= table[15,1] else: teste = b D_critico= table[15,1] elif lamb > 2 and lamb <=3: if D <= table[15,2]: teste = a D_critico= table[15,2] else: teste = b D_critico= table[15,2] elif lamb > 3 and lamb <=5: if D <= table[15,3]: teste = a D_critico= table[15,3] else: teste = b D_critico= table[15,3] elif lamb > 5 and lamb <=10: if D <= table[15,4]: teste = a D_critico= table[15,4] else: teste = b D_critico= table[15,4] #----------------------------------------------------------- elif confidenceL == 1: if lamb <=1: if D <= table[16,0]: teste = a D_critico= table[16,0] else: teste = b D_critico= table[16,0] elif lamb > 1 and lamb <=2: if D <= table[16,1]: teste = a D_critico= table[16,1] else: teste = b D_critico= table[16,1] elif lamb > 2 and lamb <=3: if D <= table[16,2]: teste = a D_critico= table[16,2] else: teste = b D_critico= table[16,2] elif lamb > 3 and lamb <=5: if D <= table[16,3]: teste = a D_critico= table[16,3] else: teste = b D_critico= table[16,3] elif lamb > 5 and lamb <=10: if D <= table[16,4]: teste = a D_critico= table[16,4] else: teste = b D_critico= table[16,4] #----------------------------------------------------------- elif confidenceL == 2: if lamb <=1: if D <= table[17,0]: teste = a D_critico= table[17,0] else: teste = b D_critico= table[17,0] elif lamb > 1 and lamb <=2: if D <= table[17,1]: teste = a D_critico= table[17,1] else: teste = b D_critico= table[17,1] elif lamb > 2 and lamb <=3: if D <= table[17,2]: teste = a D_critico= table[17,2] else: teste = b D_critico= table[17,2] elif lamb > 3 and lamb <=5: if D <= table[17,3]: teste = a D_critico= table[17,3] else: teste = b D_critico= table[17,3] elif lamb > 5 and lamb <=10: if D <= table[17,4]: teste = a D_critico= table[17,4] else: teste = b D_critico= table[17,4] return D, teste, D_critico
def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): if column_mapping: date_column = column_mapping.get('datetime') id_column = column_mapping.get('id') target_column = column_mapping.get('target') prediction_column = column_mapping.get('prediction') num_feature_names = column_mapping.get('numerical_features') if num_feature_names is None: num_feature_names = [] else: num_feature_names = [name for name in num_feature_names if is_numeric_dtype(reference_data[name])] cat_feature_names = column_mapping.get('categorical_features') if cat_feature_names is None: cat_feature_names = [] else: cat_feature_names = [name for name in cat_feature_names if is_numeric_dtype(reference_data[name])] else: date_column = 'datetime' if 'datetime' in reference_data.columns else None id_column = None target_column = 'target' if 'target' in reference_data.columns else None prediction_column = 'prediction' if 'prediction' in reference_data.columns else None utility_columns = [date_column, id_column, target_column, prediction_column] num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns)) cat_feature_names = list(set(reference_data.select_dtypes([np.object]).columns) - set(utility_columns)) if prediction_column is not None: #calculate output drift pred_p_value = ks_2samp(reference_data[prediction_column], production_data[prediction_column])[1] pred_sim_test = "detected" if pred_p_value < 0.05 else "not detected" #plot output distributions pred_distr = ff.create_distplot( [reference_data[prediction_column], production_data[prediction_column]], ["Reference", "Production"], colors=[grey, red], show_rug=True) pred_distr.update_layout( xaxis_title = "Value", yaxis_title = "Share", legend = dict( orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1 ) ) pred_drift_json = json.loads(pred_distr.to_json()) self.wi = BaseWidgetInfo( title="Prediction Drift: " + pred_sim_test + ", p_value=" + str(round(pred_p_value, 6)), type="big_graph", details="", alertStats=AlertStats(), alerts=[], alertsPosition="row", insights=[], size=2, params={ "data": pred_drift_json['data'], "layout": pred_drift_json['layout'] }, additionalGraphs=[], ) else: self.wi = None
def compare_two_root_files(file1, file2, tolerance=0.02): """Compare two ROOT(.cern.ch) files and return dictionary of comparison.""" comparison = {} # content1 = dict((key, value) for (key, value) in walk(file1)) # content2 = dict((key, value) for (key, value) in walk(file2)) keys1 = set(recursive_keys(file1)) keys2 = set(recursive_keys(file2)) all_keys = sorted(keys1 | keys2) print(f'Testing {len(all_keys)} distributions') # for name in tqdm(all_keys): # tqdm does not work well inside CI for name in all_keys: comparison[name] = {} status = FAILED evaluationValue, ks_statistic, pvalue = 0, 0, 0 diff = np.array([]) reason = '' evaluationFunc = maxRelativeDifference cut = 'value <= {}'.format(tolerance) try: value1 = load_value(name, file1, keys1) value2 = load_value(name, file2, keys2) except TypeError as e: yield name, dict( status=WARNING, reason=str(e), original=None, reference=None, diff=None, ) continue try: v1_size = np.size(value1) v2_size = np.size(value2) except Exception: reason = f'Cannot handle {name} due to issues with value.size' yield name, dict(status=WARNING, reason=reason) continue if value1 is None or value2 is None: status = UNKNOWN reason = 'Cannot convert data to numpy array' elif len(value1) == 0 and len(value2) == 0: status = SUCCESS pvalue = 1 elif (v1_size == 0 and v2_size > 0) or (v1_size > 0 and v2_size == 0): status = FAILED reason = 'original file is empty' if v1_size > 0 else 'reference file is empty' diff = value1 if v1_size > 0 else value2 else: ks_statistic, pvalue = stats.ks_2samp(ak.to_numpy(value2), ak.to_numpy(value1)) try: diff = difference(value2, value1) evaluationValue = evaluationFunc(value1, value2) status = evaluateStatus(value1, value2, evaluationFunc, cut) if status == FAILED: reason = f'evaluationFunc({evaluationValue} > {cut}) failed' except Exception as e: reason = str(e) status = UNKNOWN yield name, dict( status=status, original=value1, reference=value2, diff=diff, evaluationValue=evaluationValue, ks_statistic=ks_statistic, pvalue=pvalue, reason=reason, )
def get_index_date_ad_save_together(): # Creates files streaming file_one = None file_two = None """ # Read the source file with previously separated lines """ file_one = open('separate_lines.csv', 'r') mjd_one = [] yyyymmdd = [] vlr_s1 = [] sig_s1 = [] with file_one as f1: for line in f1: splits = line[:-1].split(',') mjd_one.append(splits[0]) yyyymmdd.append(splits[1]) vlr_s1.append(float(splits[2])) sig_s1.append(float(splits[3])) file_one.close() """ # Read the source file with interpolated data """ file_two = open('file-interpolated-rounded.csv', 'r') mjd_two = [] vlr_s2 = [] sig_s2 = [] file_two.readline() # skip 1st line #file_two.readline() # skip 2nd line with file_two as f2: for line in f2: splits = line[:-1].split(',') mjd_two.append(splits[0]) vlr_s2.append(float(splits[2])) sig_s2.append(float(splits[3])) file_two.close() """ Compare """ mjd = [] dts = [] fd1 = [] er1 = [] fd2 = [] er2 = [] c = 0 for c in range(len(mjd_one)): x = 0 for x in range(len(mjd_two)): if mjd_one[c] in mjd_two[x]: mjd.append(mjd_one[c]) dts.append(yyyymmdd[c]) fd1.append(float(vlr_s1[c])) er1.append(float(sig_s1[c])) fd2.append(float(vlr_s2[x])) er2.append(float(sig_s2[x])) #print('{0},{1},{2} {3:5.2f},{4:5.2f},{5:5.2f},{6:5.2f}'.format(mjd_one[c],mjd_two[x],yyyymmdd[c],vlr_s1[c],sig_s1[c],vlr_s2[x],sig_s2[x])) """ # Write list """ outf = open('final_to_evaluate_with_k-s-test.csv', 'w') outf.write('mjd,date,Soriginal,sigSoriginal,Scalc,sigScalc\n') c = 0 for c in range(len(mjd)): outf.write('{0},{1},{2:5.2f},{3:5.2f},{4:5.2f},{5:5.2f}\n'.format( mjd[c], dts[c], fd1[c], er1[c], fd2[c], er2[c])) outf.close() """ Kolmogorov-Smirnov Test """ result_ks = ks_2samp(fd1, fd2) print(result_ks) fne = open('k-s_test.txt', 'w') fne.write('# Kolmogorov-Smirnov Test\n') fne.write('# statistic={0} pvalue={1}\n'.format(result_ks[0], result_ks[1])) fne.close() return
lr = LogisticRegression(epoch=20, solver='NM', learning_rate=0.001, threshold=1e-4)
lr.fit(trainX, trainY)
#===========================================================================
# test
#===========================================================================
y_pro, y_pre = lr.predict(testX)
#===========================================================================
# evaluation
#===========================================================================
tn, fp, fn, tp = confusion_matrix(y_true=testY, y_pred=y_pre).ravel()
print('accuracy:', (tp+tn)/(tn+fp+fn+tp))
print('recall:', tp/(tp+fn))
print('precision:', tp/(tp+fp))
print('auc:', roc_auc_score(y_true=testY, y_score=y_pro))
get_ks = lambda y_pred, y_true: ks_2samp(y_pred[y_true==1], y_pred[y_true!=1]).statistic
print('ks:', get_ks(y_pre, testY))

# plot ROC
fpr, tpr, thresholds = roc_curve(y_true=testY, y_score=y_pro)
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Somers' D concordance statistics
pr_0 = []
plt.xlabel('Time', fontsize=16, fontweight='bold')
plt.ylabel('Displacement', fontsize=16, fontweight='bold')
plt.legend(loc=0, fontsize=14)
plt.ylim(-42, 75)

ax = plt.gca()
PlotStyle(ax, '')

###############################################################################
# Residuals Statistical test
###############################################################################

ObRes = [signal - model for signal, model in zip(WhiteSignal, FitSolutionA)]
KS = stats.ks_2samp(ObRes, WhiteNoise)

print(KS)

###############################################################################
# ODE system solving
###############################################################################

SolverTime = np.linspace(0, 20, num=120)

#Model B Parameters
k1 = 0.3
k2 = 0.25
k3 = 0.1
# variance, standard deviation, mean, median, etc
from statistics import variance, stdev, mean, median

lst = [1, 2, 3, 4]
my_variance = variance(lst)
my_sd = stdev(lst)
my_mean = mean(lst)
my_median = median(lst)

# normalize a list into [0,1]
## Method 1 - min, max
max_num = max(lst)
min_num = min(lst)
normalized_lst = [(x - min_num) / (max_num - min_num) for x in lst]

## Method 2 - Normalize to Standard Gaussian Distribution
mean_num = mean(lst)
my_sd = stdev(lst)
normalized_lst = [(x - mean_num) / my_sd for x in lst]

# Compare 2 curves
## Method 1 - Kolmogorov–Smirnov test
from scipy.stats import ks_2samp
from numpy import array

statistic, p_value = ks_2samp(num_lst1, num_lst2)  # ks_2samp returns (statistic, p-value)
# if the p-value is smaller than the threshold, reject the null hypothesis,
# which means the 2 curves are not similar
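# A minimal sketch of the threshold check described in the note above; 0.05 is an
# assumed significance level and num_lst1/num_lst2 are the placeholder curves.
alpha = 0.05
curves_differ = p_value < alpha  # True -> reject the null hypothesis that the curves match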
df_list = list() # simulation loop for sample_size in sample_sizes: df = pd.DataFrame(data=means, columns=["mean_data"]) print("Simulating data for sample size {}".format(sample_size)) for i in range(num_iterations): ks_results = list() # calculate the KS test p-value for mean in means: s0 = np.random.normal(loc=0, scale=1, size=sample_size) s_test = np.random.normal(loc=mean, scale=1, size=sample_size) ks_results.append(stats.ks_2samp(s0, s_test)[1]) # add results to dataframe df["iter_"+str(i)] = ks_results # calculate mean values across simulation df["mean_res"] = df[df.columns[1:]].mean(axis=1) df["std_res"] = df[df.columns[1:]].std(axis=1) # append results to dataframe list df_list.append(df) # saving data to files for df, sample_size in zip(df_list, sample_sizes): df.to_csv(str(destination_path) + "/size_"+str(sample_size), header=True, index=False)
stats.scoreatpercentile(generated, 95)
# percentile rank of a given value
stats.percentileofscore(generated, 1)

# distribution histogram
import matplotlib.pyplot as plt
plt.hist(generated)
plt.show()

# test of means
import numpy as np
price = get_price(['000001.XSHE', '601398.XHSG'], start_date='2016-01-01', end_date='2017-01-01', fields='close')
price_001 = np.diff(np.log(np.array(price['000001.XSHE'])))
price_398 = np.diff(np.log(np.array(price['601398.XHSG'])))

# Kolmogorov-Smirnov test
stats.ks_2samp(price_001, price_398)

# Jarque-Bera normality test
stats.jarque_bera(price_001 - price_398)[-1]

# signal processing
# test for a linear trend in the stock price
from datetime import date, datetime, time
from scipy import signal
import pandas as pd
from matplotlib.dates import DateFormatter
from matplotlib.dates import DayLocator
from matplotlib.dates import MonthLocator

price = get_price(['000001.XSHE', '601398.XHSG'], start_date='2016-01-01', end_date='2017-01-01', fields='close')
y = signal.detrend(price)  # Series
plt.fill_between(xpoints, ypoints + errors, ypoints - errors, facecolor='green', alpha=0.4, label='error')
plt.grid()
plt.legend(loc='lower right')
plt.savefig('BDT_roccurve.png')
#print('ROCS', Rocs)

"""
#plot overtraining graph
plt.figure(200)
plt.xlabel('Ratio of data used to train')
plt.ylabel('Accuracy of BDT')
#plt.title('Graph to study overtraining')
plt.plot(np.arange(0.01,1,0.01), accuraciestt[0], label='accuracies for testing')
plt.plot(np.arange(0.01,1,0.01), accuraciestt[1], label='accuracies for training')
plt.legend()
plt.savefig('Accuracy of BDT')"""

from scipy.stats import ks_2samp
a, Gluon_KS = ks_2samp(probs[0][0], probs[1][0])
a, Quark_KS = ks_2samp(probs[0][1], probs[1][1])
print(Gluon_KS, Quark_KS)
print('acc', accuraciestt)
plt.show()
def main(): # assuming 'theFile' contains one name per line, read the file if getpass.getuser() == 'frenchd': # pickleFilename = '/Users/frenchd/Research/inclination/git_inclination/picklePilot_plusSALT_14.p' # gtPickleFilename = '/Users/frenchd/Research/inclination/git_inclination/pickleGT.p' # saveDirectory = '/Users/frenchd/Research/inclination/git_inclination/plotting_code/figs' # gtPickleFilename = '/Users/frenchd/Research/inclination/git_inclination/pickleGT_filteredAll.p' gtPickleFilename = '/Users/frenchd/Research/GT_update2/pickleGT_filteredAll.p' saveDirectory = '/Users/frenchd/Research/inclination/git_inclination/plotting_code/figs/' isolated_filename = '/Users/frenchd/Research/inclination/git_inclination/isolated4.p' L_isolated_filename = '/Users/frenchd/Research/inclination/git_inclination/L_isolated4.p' L_associated_isolated_filename = '/Users/frenchd/Research/inclination/git_inclination/L_associated_isolated4.p' L_associated_filename = '/Users/frenchd/Research/inclination/git_inclination/L_associated4.p' L_nonassociated_filename = '/Users/frenchd/Research/inclination/git_inclination/L_nonassociated4.p' L_two_filename = '/Users/frenchd/Research/inclination/git_inclination/L_two4.p' L_two_plus_filename = '/Users/frenchd/Research/inclination/git_inclination/L_two_plus4.p' L_group_filename = '/Users/frenchd/Research/inclination/git_inclination/L_group4.p' else: print 'Could not determine username. Exiting.' sys.exit() # pickle file for the whole galaxy table: gtPickleFile = open(gtPickleFilename, 'rU') gtDict = pickle.load(gtPickleFile) gtPickleFile.close() # open all the pickle files isolated_file = open(isolated_filename, 'r') L_isolated_file = open(L_isolated_filename, 'r') L_associated_isolated_file = open(L_associated_isolated_filename, 'r') L_associated_file = open(L_associated_filename, 'r') L_nonassociated_file = open(L_nonassociated_filename, 'r') L_two_file = open(L_two_filename, 'r') L_two_plus_file = open(L_two_plus_filename, 'r') L_group_file = open(L_group_filename, 'r') # unload the data from them isolated = pickle.load(isolated_file) L_isolated = pickle.load(L_isolated_file) L_associated_isolated = pickle.load(L_associated_isolated_file) L_associated = pickle.load(L_associated_file) L_nonassociated = pickle.load(L_nonassociated_file) L_two = pickle.load(L_two_file) L_two_plus = pickle.load(L_two_plus_file) L_group = pickle.load(L_group_file) # close the files isolated_file.close() L_isolated_file.close() L_associated_isolated_file.close() L_associated_file.close() L_nonassociated_file.close() L_two_file.close() L_two_plus_file.close() L_group_file.close() # if match, then the includes in the file have to MATCH the includes above. e.g., if # virInclude = False, cusInclude = True, finalInclude = False, then only systems # matching those three would be included. 
Otherwise, all cusInclude = True would be included # regardless of the others dataSet = L_associated_isolated Lya_vs = dataSet['Lya_vs'] e_Lya_vs = dataSet['e_Lya_vs'] Lya_Ws = dataSet['Lya_Ws'] e_Lya_Ws = dataSet['e_Lya_Ws'] Nas = dataSet['Nas'] e_Nas = dataSet['e_Nas'] bs = dataSet['bs'] e_bs = dataSet['e_bs'] Ws = dataSet['Ws'] e_Ws = dataSet['e_Ws'] targets = dataSet['targets'] z_targets = dataSet['z_targets'] RA_targets = dataSet['RA_targets'] Dec_targets = dataSet['Dec_targets'] Names = dataSet['Names'] RA_galaxies = dataSet['RA_galaxies'] Dec_galaxies = dataSet['Dec_galaxies'] impacts = dataSet['impacts'] azimuths = dataSet['azimuths'] PAs = dataSet['PAs'] incs = dataSet['incs'] adjustedIncs = dataSet['adjustedIncs'] ls = dataSet['ls'] l_cuss = dataSet['l_cuss'] R_virs = dataSet['R_virs'] cuss = dataSet['cuss'] MajDiams = dataSet['MajDiams'] MTypes = dataSet['MTypes'] Vhels = dataSet['Vhels'] vcorrs = dataSet['vcorrs'] bestDists = dataSet['bestDists'] e_bestDists = dataSet['e_bestDists'] group_nums = dataSet['group_nums'] group_mems = dataSet['group_mems'] group_dists = dataSet['group_dists'] Lstar_meds = dataSet['Lstar_meds'] e_Lstar_meds = dataSet['e_Lstar_meds'] Bmags = dataSet['Bmags'] majorAxisL = gtDict['majorAxis'] incL = gtDict['inc'] adjustedIncL = gtDict['adjustedInc'] paL = gtDict['PA'] BmagL = gtDict['Bmag'] # Bmag_sdssL = gtDict['Bmag_sdss'] RID_medianL = gtDict['RID_median'] RID_meanL = gtDict['RID_mean'] RID_stdL = gtDict['RID_std'] VhelL = gtDict['Vhel'] RAdegL = gtDict['RAdeg'] DEdegL = gtDict['DEdeg'] NameL = gtDict['Name'] allPA = paL allInclinations = [] allAdjustedIncs = [] allCosInclinations = [] # print 'type: ',type(incL) for i in incL: if i != -99: i = float(i) allInclinations.append(i) i2 = pi / 180. * i cosi2 = cos(i) allCosInclinations.append(cosi2) allCosFancyCosInclinations = [] for i in adjustedIncL: if str(i) != '-99': i = float(i) allAdjustedIncs.append(i) i2 = pi / 180. 
* i cosi2 = cos(i) allCosFancyCosInclinations.append(cosi2) allDiameter = majorAxisL print 'finished with this shit' print 'len(allAdjustedIncs): ', len(allAdjustedIncs) print total = 0 totalNo = 0 totalYes = 0 totalIsolated = 0 totalGroup = 0 ######################################################################################## ######################################################################################### # print all the things # # absorber info lists blues = [] reds = [] blueAbs = [] redAbs = [] blueW = [] redW = [] blueB = [] redB = [] e_blueB = [] e_redB = [] blueErr = [] redErr = [] blueV = [] redV = [] blueImpact = [] redImpact = [] # galaxy info lists blueInc = [] redInc = [] blueFancyInc = [] redFancyInc = [] blueAz = [] redAz = [] bluePA = [] redPA = [] blueVcorr = [] redVcorr = [] blueVir = [] redVir = [] blueLike = [] redLike = [] # for absorbers for Lya_v, w, e_w, Vhel, i, b, e_b in zip(Lya_vs, Lya_Ws, e_Lya_Ws, Vhels, impacts, bs, e_bs): vel_dif = Lya_v - Vhel if vel_dif >= 0: reds.append(float(vel_dif)) redW.append(float(w)) redErr.append(float(e_w)) redV.append(float(Vhel)) redImpact.append(float(i)) redAbs.append(abs(vel_dif)) redB.append(float(b)) e_redB.append(float(e_b)) else: blues.append(float(vel_dif)) blueW.append(float(w)) blueErr.append(float(e_w)) blueV.append(float(Vhel)) blueImpact.append(float(i)) blueAbs.append(abs(vel_dif)) blueB.append(float(b)) e_blueB.append(float(e_b)) ########################################################################################## ########################################################################################## nameDict = {} # for galaxies for Lya_v, Vhel, inc, adjustedInc, az, pa, vcorr, vir, l, name in zip( Lya_vs, Vhels, incs, adjustedIncs, azimuths, PAs, vcorrs, R_virs, ls, Names): vel_dif = Lya_v - Vhel if nameDict.has_key(name): i = nameDict[name] i += 1 nameDict[name] = i else: nameDict[name] = 1 if vel_dif >= 0: if inc != -99: redInc.append(float(inc)) if adjustedInc != -99: redFancyInc.append(float(adjustedInc)) if az != -99: redAz.append(float(az)) if pa != -99: redPA.append(float(pa)) if vcorr != -99: redVcorr.append(float(vcorr)) if vir != -99: redVir.append(float(vir)) if l != -99: redLike.append(float(l)) else: if inc != -99: blueInc.append(float(inc)) if adjustedInc != -99: blueFancyInc.append(float(adjustedInc)) if az != -99: blueAz.append(float(az)) if pa != -99: bluePA.append(float(pa)) if vcorr != -99: blueVcorr.append(float(vcorr)) if vir != -99: blueVir.append(float(vir)) if l != -99: blueLike.append(float(l)) galaxyNames = nameDict.keys() # how many absorbers above vs below vel_cut? 
redVelCount200 = 0 redVelCount100 = 0 blueVelCount200 = 0 blueVelCount100 = 0 for b in blues: if b >= 200: blueVelCount200 += 1 if b >= 100: blueVelCount100 += 1 for r in reds: if abs(r) >= 200: redVelCount200 += 1 if abs(r) >= 100: redVelCount100 += 1 assocFancyInc = adjustedIncs AGNnameDict = {} for i in targets: if AGNnameDict.has_key(i): c = AGNnameDict[i] c += 1 AGNnameDict[i] = c else: AGNnameDict[i] = 1 AGN_list = AGNnameDict.keys() # write out a file breaking down all this shit # out_directory = '/Users/frenchd/Research/inclination/git_inclination/rotation_paper/' # save_name = 'full_stats.txt' # stats_filename = '{0}/{1}'.format(out_directory, save_name) # stats_file = open(stats_filename,'wt') print print '------------------------ Pilot Data -----------------------------' print # print 'total number of lines: ', len(lyaWList) + len(lyaWAmbList) print 'total number of lines: ', len(Lya_vs) print 'total number of unique galaxies matched: ', len(galaxyNames) print 'total number of AGN: ', len(AGN_list) print '# of redshifted lines: ', len(reds) print '# of blueshifted lines: ', len(blues) print print print ' TARGETS ' print print 'final target number: ', len(AGNnameDict.keys()) for i in AGNnameDict.keys(): print i print print print print print '----------------------- Absorber info ----------------------------' print print 'avg blueshifted EW: ', mean(blueW) print 'median blueshifted EW: ', median(blueW) print 'avg blue err: ', mean(blueErr) print 'median blue err: ', median(blueErr) print print 'std(blue EW): ', std(blueW) print 'stats.sem(blue EW): ', stats.sem(blueW) print 'stats.describe(blue EW): ', stats.describe(blueW) print print 'avg blueshifted vel_diff: ', mean(blues) print 'median blueshifted vel_diff: ', median(blues) print 'std(blueshifted vel_diff): ', std(blues) print 'stats.sem(blue vel_dif): ', stats.sem(blues) print 'stats.describe(blue vel_dif: ', stats.describe(blues) print print '% blueshifted which have vel_diff >= 200 km/s: {0}'.format( float(blueVelCount200) / len(blues)) print 'total number with abs(vel_diff) >= 200 km/s: {0}'.format( blueVelCount200) print '% blueshifted which have vel_diff >= 100 km/s: {0}'.format( float(blueVelCount100) / len(blues)) print 'total number with abs(vel_diff) >= 100 km/s: {0}'.format( blueVelCount100) print print print 'avg blue velocity: ', mean(blueV) print 'median blue velocity: ', median(blueV) print 'std(blue Velocity): ', std(blueV) print 'avg blue impact: ', mean(blueImpact) print 'median blue impact: ', median(blueImpact) print 'stats.sem(blue impact): ', stats.sem(blueImpact) print 'stats.describe(blue impact): ', stats.describe(blueImpact) print print print 'avg redshifted EW: ', mean(redW) print 'median redshifted EW: ', median(redW) print 'avg red err: ', mean(redErr) print 'median red err: ', median(redErr) print print 'std(red EW): ', std(redW) print 'stats.sem(red EW): ', stats.sem(redW) print 'stats.describe(red EW): ', stats.describe(redW) print print 'avg redshifted vel_diff: ', mean(reds) print 'median redshifted vel_diff: ', median(reds) print 'std(redshifted vel_dif): ', std(reds) print 'stats.sem(red vel_dif): ', stats.sem(reds) print 'stats.describe(red vel_dif): ', stats.describe(reds) print print '% redshifted which have abs(vel_diff) >= 200 km/s: {0}'.format( float(redVelCount200) / len(reds)) print 'total number with abs(vel_diff) >= 200 km/s: {0}'.format( redVelCount200) print '% redshifted which have abs(vel_diff) >= 100 km/s: {0}'.format( float(redVelCount100) / len(reds)) print 'total number 
with abs(vel_diff) >= 100 km/s: {0}'.format( redVelCount100) print print 'avg red velocity: ', mean(redV) print 'median red velocity: ', median(redV) print print 'avg red impact: ', mean(redImpact) print 'median red impact: ', median(redImpact) print 'stats.sem(red impact): ', stats.sem(redImpact) print 'stats.describe(red impact): ', stats.describe(redImpact) print 'std(red impact): ', std(redImpact) print print '----------------------- Galaxy info ----------------------------' print # regular inclinations incCut = 50 totalBlueInc = len(blueInc) totalRedInc = len(redInc) print print print 'totalBlueInc: ', totalBlueInc print 'totalRedInc: ', totalRedInc print print "blueInc: ", blueInc print blueIncCount = 0 for i in blueInc: if i >= incCut: blueIncCount += 1 redIncCount = 0 for i in redInc: if i >= incCut: redIncCount += 1 totalInc = len(allInclinations) totalCount = 0 for i in allInclinations: if i >= incCut: totalCount += 1 # fancy inclinations totalBlueFancyInc = len(blueFancyInc) totalRedFancyInc = len(redFancyInc) blueFancyIncCount = 0 for i in blueFancyInc: if i >= incCut: blueFancyIncCount += 1 redFancyIncCount = 0 for i in redFancyInc: if i >= incCut: redFancyIncCount += 1 combinedCount = redFancyIncCount + blueFancyIncCount totalCombinedCount = totalRedFancyInc + totalBlueFancyInc totalFancyInc = len(allAdjustedIncs) totalFancyCount = 0 for i in allAdjustedIncs: if i >= incCut: totalFancyCount += 1 print print ' INCLINATIONS: ' print print 'Blue: {0} % of associated galaxies have >={1}% inclination'.format( float(blueIncCount) / float(totalBlueInc), incCut) print 'Red: {0} % of associated galaxies have >={1}% inclination'.format( float(redIncCount) / float(totalRedInc), incCut) print 'All: {0} % of ALL galaxies have >={1}% inclination'.format( float(totalCount) / float(totalInc), incCut) print print ' FANCY INCLINATIONS: ' print print 'Blue: {0} % of associated galaxies have >={1}% fancy inclination'.format( float(blueFancyIncCount) / float(totalBlueFancyInc), incCut) print 'Red: {0} % of associated galaxies have >={1}% fancy inclination'.format( float(redFancyIncCount) / float(totalRedFancyInc), incCut) print 'All: {0} % of ALL galaxies have >={1}% fancy inclination'.format( float(totalFancyCount) / float(totalFancyInc), incCut) print 'Combined: {0} % of associated galaxies have >= {1} fancy inclination'.format( float(combinedCount) / float(totalCombinedCount), incCut) print print 'Average all fancy inclination: ', mean(allAdjustedIncs) print 'stats.sem(all): ', stats.sem(allAdjustedIncs) print print 'avg blue inclination: ', mean(blueInc) print 'median blue inclination: ', median(blueInc) print 'avg blue fancy inclination: ', mean(blueFancyInc) print 'median blue fancy inclination: ', median(blueFancyInc) print print 'avg red inclination: ', mean(redInc) print 'median red inclination: ', median(redInc) print 'avg red fancy inclination: ', mean(redFancyInc) print 'median red fancy inclination: ', median(redFancyInc) print print 'mean associated: ', mean(assocFancyInc) print 'stats.sem(associated): ', stats.sem(assocFancyInc) print 'stats.describe(associated): ', stats.describe(assocFancyInc) print 'stats.sem(blue): ', stats.sem(blueFancyInc) print 'stats.describe(blue): ', stats.describe(blueFancyInc) print print 'stats.sem(red): ', stats.sem(redFancyInc) print 'stats.describe(red): ', stats.describe(redFancyInc) print print print " AZIMUTHS and PA: " print print 'avg blue azimuth: ', mean(blueAz) print 'median blue azimuth: ', median(blueAz) print 'stats.sem(blue az): ', 
stats.sem(blueAz) print 'stats.describe(blue az): ', stats.describe(blueAz) print print 'avg red azimuth: ', mean(redAz) print 'median red azimuth: ', median(redAz) print 'stats.sem(red az): ', stats.sem(redAz) print 'stats.describe(red az): ', stats.describe(redAz) print print 'avg blue PA: ', mean(bluePA) print 'median blue PA: ', median(bluePA) print print 'avg red PA: ', mean(redPA) print 'median red PA: ', median(redPA) print print ' VCORR : ' print print 'avg blue vcorr: ', mean(blueVcorr) print 'median blue vcorr: ', median(blueVcorr) print print 'avg red vcorr: ', mean(redVcorr) print 'median red vcorr: ', median(redVcorr) print print ' R_vir: ' print print 'avg blue R_vir: ', mean(blueVir) print 'median blue R_vir: ', median(blueVir) print 'stats.sem(blue R_vir): ', stats.sem(blueVir) print 'stats.describe(blue R_vir): ', stats.describe(blueVir) print print 'avg red R_vir: ', mean(redVir) print 'median red R_vir: ', median(redVir) print 'stats.sem(red R_vir): ', stats.sem(redVir) print 'stats.describe(red R_vir): ', stats.describe(redVir) print print ' LIKELIHOOD: ' print print 'avg blue likelihood: ', mean(blueLike) print 'median blue likelihood: ', median(blueLike) print print 'avg red likelihood: ', mean(redLike) print 'median red likelihood: ', median(redLike) print print print '-------------------- Distribution analysis ----------------------' print print print ' FANCY INCLINATIONS: ' # perform the K-S and AD tests for inclination ans1 = stats.ks_2samp(blueFancyInc, redFancyInc) ans1a = stats.anderson_ksamp([blueFancyInc, redFancyInc]) print 'KS for blue vs red fancy inclinations: ', ans1 print 'AD for blue vs red fancy inclinations: ', ans1a ans2 = stats.ks_2samp(blueFancyInc, allAdjustedIncs) print 'KS for blue vs all fancy inclinations: ', ans2 ans3 = stats.ks_2samp(redFancyInc, allAdjustedIncs) print 'KS for red vs all fancy inclinations: ', ans3 print z_statrb, p_valrb = stats.ranksums(blueFancyInc, redFancyInc) z_statall, p_valall = stats.ranksums(assocFancyInc, allAdjustedIncs) print 'ranksum red vs blue p-value: ', p_valrb print 'ranksum associated vs all: ', p_valall ans4 = stats.ks_2samp(assocFancyInc, allAdjustedIncs) ans4a = stats.anderson_ksamp([assocFancyInc, allAdjustedIncs]) print 'KS for all associated vs all fancy inclinations: ', ans4 print 'AD for all associated vs all fancy inclinations: ', ans4a print # ans5 = stats.ks_2samp(spiralIncList, allSpiralIncList) # ans5a = stats.anderson_ksamp([spiralIncList,allSpiralIncList]) # # print 'KS for all spiral associated vs all spiral fancy inclinations: ',ans5 # print 'AD for all spiral associated vs all spiral fancy inclinations: ',ans5a print print ' INCLINATIONS: ' print # perform the K-S and AD tests for inclination ans1 = stats.ks_2samp(blueInc, redInc) ans1a = stats.anderson_ksamp([blueInc, redInc]) print 'KS for blue vs red inclinations: ', ans1 print 'AD for blue vs red inclinations: ', ans1a ans2 = stats.ks_2samp(blueInc, allInclinations) print 'KS for blue vs all inclinations: ', ans2 ans3 = stats.ks_2samp(redInc, allInclinations) print 'KS for red vs all inclinations: ', ans3 assocInc = incs ans4 = stats.ks_2samp(assocInc, allInclinations) print 'KS for associated vs all inclinations: ', ans4 print print ' EW Distributions: ' print # perform the K-S and AD tests for EW ans1 = stats.ks_2samp(blueW, redW) ans1a = stats.anderson_ksamp([blueW, redW]) print 'KS for blue vs red EW: ', ans1 print 'AD for blue vs red EW: ', ans1a print print ' Impact parameter Distributions: ' print # perform the K-S and AD 
# tests for impact parameter
ans1 = stats.ks_2samp(blueImpact, redImpact)
ans1a = stats.anderson_ksamp([blueImpact, redImpact])

print 'KS for blue vs red impact parameters: ', ans1
print 'AD for blue vs red impact parameters: ', ans1a
print

print ' \Delta v Distributions: '
print

# perform the K-S and AD tests for \delta v
ans1 = stats.ks_2samp(blueAbs, redAbs)
ans1a = stats.anderson_ksamp([blueAbs, redAbs])

print 'KS for blue vs red \Delta v: ', ans1
print 'AD for blue vs red \Delta v: ', ans1a
print

print ' Azimuth Distributions: '
print

# perform the K-S and AD tests for azimuth
ans1 = stats.ks_2samp(blueAz, redAz)
ans1a = stats.anderson_ksamp([blueAz, redAz])

print 'KS for blue vs red azimuth: ', ans1
print 'AD for blue vs red azimuth: ', ans1a
print

# now against a flat distribution
flatRed = arange(0, 90, 1)
flatBlue = arange(0, 90, 1)

ans1 = stats.ks_2samp(blueAz, flatBlue)
ans1a = stats.anderson_ksamp([blueAz, flatBlue])

print 'KS for blue vs flat azimuth: ', ans1
print 'AD for blue vs flat azimuth: ', ans1a
print

ans1 = stats.ks_2samp(redAz, flatRed)
ans1a = stats.anderson_ksamp([redAz, flatRed])

print 'KS for red vs flat azimuth: ', ans1
print 'AD for red vs flat azimuth: ', ans1a
print
print

print ' R_vir Distributions: '
print

# perform the K-S and AD tests for r_vir
ans1 = stats.ks_2samp(blueVir, redVir)
ans1a = stats.anderson_ksamp([blueVir, redVir])

print 'KS for blue vs red R_vir: ', ans1
print 'AD for blue vs red R_vir: ', ans1a
print

print ' Doppler parameter Distributions: '
print

# perform the K-S and AD tests for doppler parameter
ans1 = stats.ks_2samp(blueB, redB)
ans1a = stats.anderson_ksamp([blueB, redB])

print 'KS for blue vs red doppler parameter: ', ans1
print 'AD for blue vs red doppler parameter: ', ans1a
print

print ' Likelihood Distributions: '
print

# perform the K-S and AD tests for likelihood
ans1 = stats.ks_2samp(blueLike, redLike)
ans1a = stats.anderson_ksamp([blueLike, redLike])

print 'KS for blue vs red likelihood: ', ans1
print 'AD for blue vs red likelihood: ', ans1a
print
print ' COMPLETED. '
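The repeated KS/AD blocks above all follow one pattern, so a small helper could cover every blue-vs-red comparison. A minimal sketch: the `two_sample_report` name and the example call are illustrative and not part of the original script, which is assumed to have `scipy.stats` available as `stats`.

from scipy import stats

def two_sample_report(label, a, b):
    # Hypothetical helper: run both two-sample tests on one pair of arrays
    # and print them in the same style as the blocks above.
    ks_stat, ks_p = stats.ks_2samp(a, b)
    ad_result = stats.anderson_ksamp([a, b])
    print 'KS for {0}: '.format(label), (ks_stat, ks_p)
    print 'AD for {0}: '.format(label), ad_result

# e.g. two_sample_report('blue vs red azimuth', blueAz, redAz)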
def _kernel_leaves_target_invariant(self, initial_draws, independent_chain_ndims):

    def log_gamma_log_prob(x):
        event_dims = tf.range(independent_chain_ndims, tf.rank(x))
        return self._log_gamma_log_prob(x, event_dims)

    def fake_log_prob(x):
        """Cooled version of the target distribution."""
        return 1.1 * log_gamma_log_prob(x)

    hmc = tfp.mcmc.HamiltonianMonteCarlo(
        target_log_prob_fn=log_gamma_log_prob,
        step_size=0.4,
        num_leapfrog_steps=5,
        seed=_set_seed(43))
    sample, kernel_results = hmc.one_step(
        current_state=initial_draws,
        previous_kernel_results=hmc.bootstrap_results(initial_draws))

    bad_hmc = tfp.mcmc.HamiltonianMonteCarlo(
        target_log_prob_fn=fake_log_prob,
        step_size=0.4,
        num_leapfrog_steps=5,
        seed=_set_seed(44))
    bad_sample, bad_kernel_results = bad_hmc.one_step(
        current_state=initial_draws,
        previous_kernel_results=bad_hmc.bootstrap_results(initial_draws))

    [
        log_accept_ratio_,
        bad_log_accept_ratio_,
        initial_draws_,
        updated_draws_,
        fake_draws_,
    ] = self.evaluate([
        kernel_results.log_accept_ratio,
        bad_kernel_results.log_accept_ratio,
        initial_draws,
        sample,
        bad_sample,
    ])

    # Confirm step size is small enough that we usually accept.
    acceptance_probs = np.exp(np.minimum(log_accept_ratio_, 0.))
    bad_acceptance_probs = np.exp(np.minimum(bad_log_accept_ratio_, 0.))
    self.assertGreater(acceptance_probs.mean(), 0.5)
    self.assertGreater(bad_acceptance_probs.mean(), 0.5)

    # Confirm step size is large enough that we sometimes reject.
    self.assertLess(acceptance_probs.mean(), 0.99)
    self.assertLess(bad_acceptance_probs.mean(), 0.99)

    _, ks_p_value_true = stats.ks_2samp(initial_draws_.flatten(), updated_draws_.flatten())
    _, ks_p_value_fake = stats.ks_2samp(initial_draws_.flatten(), fake_draws_.flatten())

    tf.compat.v1.logging.vlog(
        1, 'acceptance rate for true target: {}'.format(acceptance_probs.mean()))
    tf.compat.v1.logging.vlog(
        1, 'acceptance rate for fake target: {}'.format(bad_acceptance_probs.mean()))
    tf.compat.v1.logging.vlog(
        1, 'K-S p-value for true target: {}'.format(ks_p_value_true))
    tf.compat.v1.logging.vlog(
        1, 'K-S p-value for fake target: {}'.format(ks_p_value_fake))

    # Make sure that the MCMC update hasn't changed the empirical CDF much.
    self.assertGreater(ks_p_value_true, 1e-3)
    # Confirm that targeting the wrong distribution does
    # significantly change the empirical CDF.
    self.assertLess(ks_p_value_fake, 1e-6)
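The same invariance idea can be checked without TensorFlow Probability: draw exact samples from a target, apply one MCMC step, and confirm with ks_2samp that the marginal distribution is unchanged. A minimal NumPy/SciPy sketch (random-walk Metropolis rather than HMC; the step size and sample count are arbitrary), not the test above.

import numpy as np
from scipy import stats

def metropolis_step(x, log_prob, step_size=0.5, rng=np.random):
    """One random-walk Metropolis step applied elementwise to a vector of chains."""
    proposal = x + step_size * rng.randn(*x.shape)
    log_accept = log_prob(proposal) - log_prob(x)
    accept = np.log(rng.rand(*x.shape)) < log_accept
    return np.where(accept, proposal, x)

rng = np.random.RandomState(0)
log_prob = lambda x: -0.5 * x ** 2          # standard normal target (up to a constant)
draws = rng.randn(5000)                     # exact draws from the target
updated = metropolis_step(draws, log_prob, rng=rng)

_, p_value = stats.ks_2samp(draws, updated)
print(p_value)                              # should not be tiny if the kernel leaves the target invariant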
def get_property_from_dataset(dataset, property_key):
    result = []
    for row in dataset:
        result.append(row[property_key])
    return result


def getClicksFromFile(file_path):
    with open(file_path, 'rb') as csvfile:
        dataset_sample_a = csv.DictReader(csvfile, delimiter=';', quotechar='|')
        clicks = np.array(get_property_from_dataset(dataset_sample_a, DICT_KEY))
        clicks = map(lambda x: 1 if x == 'yes' else 0, clicks)
        return clicks


clicks_sample_a = getClicksFromFile('amostra_A_click.csv')
clicks_sample_b = getClicksFromFile('amostra_B_click.csv')

ks, p_value = stats.ks_2samp(clicks_sample_a, clicks_sample_b)

print "Ks: " + str(ks)
print "P-value: " + str(p_value)
print "Sum Sample A: " + str(np.sum(clicks_sample_a))
print "Sum Sample B: " + str(np.sum(clicks_sample_b))
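Because the click columns are 0/1, the two-sample KS statistic here reduces to the absolute difference in click rates, and the usual KS p-value assumes continuous data, so it is hard to interpret. A contingency-table test is the more conventional check for this kind of A/B data. A sketch assuming the clicks_sample_a / clicks_sample_b arrays built above; the click_table helper is illustrative.

import numpy as np
from scipy import stats

def click_table(a, b):
    # 2x2 table of (clicked, not clicked) counts for the two samples.
    a = np.asarray(a)
    b = np.asarray(b)
    return np.array([[a.sum(), len(a) - a.sum()],
                     [b.sum(), len(b) - b.sum()]])

chi2, p, dof, expected = stats.chi2_contingency(click_table(clicks_sample_a, clicks_sample_b))
print "Chi-square p-value: " + str(p)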
def getKSStatPVal(regions, region_to_corr, region_to_info):
    corr_stat, corr_p_val = ks_2samp(region_to_corr[regions[0]], region_to_corr[regions[1]])
    info_stat, info_p_val = ks_2samp(region_to_info[regions[0]], region_to_info[regions[1]])
    return regions[0], regions[1], corr_stat, corr_p_val, info_stat, info_p_val
def ksprob(arr1, arr2):
    from scipy.stats import ks_2samp
    return ks_2samp(arr1, arr2)[1]
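A quick usage sketch for the wrapper above, on made-up normal samples: a large p-value is expected when the two distributions match and a small one when they differ.

import numpy as np

rng = np.random.RandomState(0)
p_same = ksprob(rng.normal(0, 1, 500), rng.normal(0, 1, 500))
p_shifted = ksprob(rng.normal(0, 1, 500), rng.normal(0.5, 1, 500))
print(p_same, p_shifted)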
##Histogram of dH
NGC_dH = []
SGC_dH = []

i = 0
while i < len(NGCred):
    NGC_dH.append((fun.D(fun.HubbleIntegrate(NGCred[i])) - NGCdist[i]) / fun.D(fun.HubbleIntegrate(NGCred[i])))
    i += 1

i = 0
while i < len(SGCred):
    SGC_dH.append((fun.D(fun.HubbleIntegrate(SGCred[i])) - SGCdist[i]) / fun.D(fun.HubbleIntegrate(SGCred[i])))
    i += 1

print(ks_2samp(NGC_dH, SGC_dH))

plt.plot(SGCred, SGC_dH, '.')
plt.plot([min(SGCred), max(SGCred)], [0, 0])
plt.show()

NGChist = plt.hist(NGC_dH, bins=binno, alpha=0.5, label='NGC', color='blue')

##Best fit for data
(NGCmu, NGCsigma) = norm.fit(NGC_dH)
NGCSE = NGCsigma / np.sqrt(len(NGC_dH))
emptyNGC = plt.hist([], range=[-0.25, 0.25], alpha=0.5,
                    label='$N=%i, \mu=%.2f \pm %.2f$, $\sigma=%.2f$' % (len(NGC_dH), NGCmu, NGCSE, NGCsigma),
                    color='white')
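If the point of the norm.fit above is to judge how Gaussian the dH residuals are, a one-sample KS test against the fitted parameters is a natural companion to the two-sample NGC-vs-SGC test; note that estimating mu and sigma from the same data makes the nominal p-value optimistic (a Lilliefors-type correction would be needed for a strict test). A sketch assuming NGC_dH, NGCmu and NGCsigma from the code above.

from scipy import stats

# One-sample KS of the NGC residuals against the normal that was fitted to them.
D, p = stats.kstest(NGC_dH, 'norm', args=(NGCmu, NGCsigma))
print(D, p)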
t_in = np.zeros(gal_count)
gal_count_out = len(t) - gal_count
t_out = np.zeros(gal_count_out)
gal_count = 0
gal_count_out = 0

for j in range(len(t)):
    x_gal, y_gal, z_gal = hp.ang2vec(theta[j], phi[j])
    # calculating the separation of the gal from the r_cone
    gal_separation = np.sqrt((cone_vec_x - x_gal)**2 + (cone_vec_y - y_gal)**2 + (cone_vec_z - z_gal)**2)
    if gal_separation < radius_of_separation:
        t_in[gal_count] = t[j]
        gal_count = gal_count + 1
    else:
        t_out[gal_count_out] = t[j]
        gal_count_out = gal_count_out + 1

ksd[i], p_value[i] = stats.ks_2samp(t_in, t_out)
l = phi_cone * 180.0 / math.pi
b = 90 - (theta_cone * 180.0 / math.pi)
f.write(str(l) + "\t" + str(b) + "\t" + str(ksd[i]) + "\t" + str(p_value[i]) + "\t" + str(gal_count) + "\t" + str(gal_count_out) + "\n")

f.close()

max_ksd_index = np.unravel_index(ksd.argmax(), ksd.shape)
theta_max, phi_max = hp.pix2ang(nside, max_ksd_index[0])
l_max = phi_max * 180.0 / math.pi
b_max = 90 - (theta_max * 180.0 / math.pi)

f2 = open("max_KS_hemisphere.txt", "w")
f2.write("direction of the largest KS" + "\n")
f2.write(str(l_max) + "\t" + str(b_max) + "\t" + str(max(ksd)) + "\t" + str(p_value[max_ksd_index[0]]) + "\n")
f2.close()
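The in-cone/out-of-cone split above preallocates arrays and maintains counters by hand; boolean masks give the same two samples in a few lines. A sketch assuming t is a NumPy array and that theta, phi, the cone vector components and radius_of_separation come from the surrounding loop; hp.ang2vec accepts array arguments here, which removes the per-galaxy loop.

import numpy as np
from scipy import stats

vecs = hp.ang2vec(theta, phi)                             # (N, 3) unit vectors, one per galaxy
cone = np.array([cone_vec_x, cone_vec_y, cone_vec_z])
separation = np.sqrt(np.sum((vecs - cone) ** 2, axis=1))  # same chord distance as above
inside = separation < radius_of_separation

ks_stat, ks_p = stats.ks_2samp(t[inside], t[~inside])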
def kstest2samp(samp1, samp2):
    ksval, pval = stats.ks_2samp(samp1, samp2)
    return ksval, pval
axis.plot(bin_centers, bin_vals, color='blue', alpha=0.75, lw=1)

bin_vals, foo = np.histogram(fl_by_time_clpXminus_cut[i], bins=bin_edges, density=True)
axis.plot(bin_centers, bin_vals, color='red', alpha=0.75, lw=1)

### Various statistical tests
# calculate unequal variance t statistic
tstat, ttpval = stats.ttest_ind(fl_by_time_clpXplus_cut[i], fl_by_time_clpXminus_cut[i], equal_var=False)
# print('ttest', i, tstat, ttpval)

# KS test
ksstat, ks_pval = stats.ks_2samp(fl_by_time_clpXminus_cut[i], fl_by_time_clpXplus_cut[i])
# print('KS test', ksstat, ks_pval)

# Mann-Whitney
mwstat, mwpval = stats.mannwhitneyu(fl_by_time_clpXminus_cut[i], fl_by_time_clpXplus_cut[i], alternative='greater')
# print('Mann-Whitney', mwstat, mwpval)

# Mood's median test
median_args = (fl_by_time_clpXminus_cut[i], fl_by_time_clpXplus_cut[i])
mstat, mpval, _, _ = stats.median_test(*median_args)
# print('Median test', mstat, mpval)
# print('\n')

# F test for variance
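The block above applies the same four two-sample tests to one pair of arrays; collecting them in a helper keeps the p-values together and avoids repeating the argument order. A sketch using only the calls that already appear above; the function name and dict layout are illustrative.

from scipy import stats

def compare_distributions(a, b):
    # Same battery of two-sample tests as above, returned as name -> p-value.
    tstat, tpval = stats.ttest_ind(a, b, equal_var=False)
    ksstat, kspval = stats.ks_2samp(a, b)
    mwstat, mwpval = stats.mannwhitneyu(a, b, alternative='greater')
    mstat, mpval, _, _ = stats.median_test(a, b)
    return {'ttest': tpval, 'ks': kspval, 'mannwhitney': mwpval, 'median': mpval}

# e.g. compare_distributions(fl_by_time_clpXminus_cut[i], fl_by_time_clpXplus_cut[i])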
#I'm going to cut out groups who have less than 5 attacks interevent = empdf[empdf.gname == group].idate.diff()[1:].values.tolist() if len(interevent) >= 5: empirical_data[group] = empdf[empdf.gname == group].idate.diff()[1:].values.tolist() #Load the abm data abmdf = pd.read_csv('../../results/abm_runs_v3/%s_20181009.csv' % country, header=None, names = header) #extract the params alpha = abmdf.alpha.unique() beta = abmdf.beta.unique() omega = abmdf.omega.unique() groups = abmdf.group.unique() #Group it up lvl_one_gdf = abmdf.groupby(['alpha', 'beta', 'omega']) for a in alpha: for b in beta: for o in omega: #Create the rundata rundata = [] #Pull the level two groups together by run lvl_two_gdf = lvl_one_gdf.get_group((a, b, o)).groupby('run') for r in lvl_two_gdf.groups.keys(): tdf = lvl_two_gdf.get_group(r) for group in tdf.group.unique(): if group in trans.keys() and trans[group] in empirical_data: diffset = tdf[tdf.group==group].step.diff()[1:].tolist() D, p = stats.ks_2samp(empirical_data[trans[group]], diffset) rundata.append(p) #Now write it out print('%d,%f,%f,%f,%s,%f' % (r, float(a), float(b), float(o), country, len([x for x in rundata if x<0.05])/len(rundata)), file=wfile)
def bias_test(self, df, gbias, sco, display=True): ''' gbias: group bias, e.g. ['CAM_TYPE','SIZE','REGION4'] sco: field where bias is present e.g. SZSCORE' kstest D stat: rate of convergence. at significance 0.05 reject H0 (eq. distr.) if D>0.043 ''' if display == True: print('\n', hlp.color.BOLD, hlp.color.CYAN, sco, hlp.color.END, '\n') sts = [] for gb in gbias: if display == True: print(hlp.color.BOLD, hlp.color.RED, gb, 'BIAS', hlp.color.END, '\n') gbitems = list(df[-df[gb].isnull()][gb].drop_duplicates()) dfstats = pd.DataFrame() full = np.array(df[(df.DIMENSION.isnull()) & (-df[sco].isnull())][sco]) sts.append([ gb, 'ALL', round(full.mean(), 4), round(full.std(), 4), len(full) ]) for gbi in gbitems: if display == True: print(hlp.color.BOLD, gbi, hlp.color.END, '\n') gidx = df[(df.DIMENSION.isnull()) & (-df[sco].isnull()) & (df[gb] == gbi)].index g = np.array(df.loc[gidx][sco]) fidx = df[(df.DIMENSION.isnull()) & (-df[sco].isnull()) & (df[gb] != gbi)].index f = np.array(df.loc[fidx][sco]) sts.append( [gb, gbi, round(g.mean(), 4), round(g.std(), 4), len(g)]) if display == True: ##normality check # print('length:', gbi ,':', len(g), '/ rest:', len(f)) # print('norm test:', stats.kstest(g, 'norm')) # stats.probplot(g, dist="norm", plot=plt) # plt.show() ##qq plot print('two sample test:', stats.ks_2samp(g, f)) # print('', gb, gbi, ' - mean:', round(g.mean(), 4), '; std:', round(g.std(), 4), # '\n Full sample - mean:', round(full.mean(), 4), '; std:', round(full.std(), 4)) q = np.linspace(0, 100, 101) k, ax = plt.subplots() ax.scatter(np.percentile(f, q), np.percentile(g, q), color='b') ax.plot(ax.get_xlim(), ax.get_xlim(), ls="--", c=".3") plt.ylabel(gb + ' ' + gbi + ' ' + sco) plt.xlabel(gb + ' ' + 'rest ' + sco) plt.title('qq plot - ' + gbi + ' ' + gb + ' vs rest' + '', fontsize=14) plt.show() print('') # sts.append([gb, 'ALL', round(full.mean(), 4), round(full.std(), 4), len(full)]) dfstsall = pd.DataFrame( sts, columns=['group', 'item', 'mean', 'std', 'size']) if display == True: for gr in dfstsall.group.drop_duplicates(): dfsts = dfstsall[dfstsall.group == gr] jet = plt.get_cmap('jet') colors = iter(jet(np.linspace(0, 1, 7))) div = dfsts['size'].max() / 1200 for index, row in dfsts.iterrows(): plt.scatter(row['mean'], row['std'], label=row['item'], color=next(colors), s=row['size'] / div) plt.xticks(rotation=45) plt.xlabel('mean', fontsize=14) plt.ylabel('stdev', fontsize=14) plt.title(gr, fontsize=14) plt.xlim(-dfsts['mean'].abs().max() * 1.2, dfsts['mean'].abs().max() * 1.2) lgnd = plt.legend(bbox_to_anchor=(0, 1), loc=2, scatterpoints=1, fontsize=10) for handle in lgnd.legendHandles: handle.set_sizes([10]) plt.show() return dfstsall
# Compare trace segment distributions with KS tests
pp.close()

print(
    "\nIf the KS statistic is small and p value is high we cannot reject that the distributions of the samples are the same"
)

for n, d in enumerate(data):
    res = np.zeros((3, 2))
    if n == 0:
        print('Packet length')
    else:
        print('Time stamp')
    for i in range(len(d) - 1):
        res[i, 0], res[i, 1] = stats.ks_2samp(d[i], d[i + 1])
    res[len(d) - 1, 0], res[len(d) - 1, 1] = stats.ks_2samp(d[0], d[len(d) - 1])

    # Save KS test to CSV
    writer = csv.writer(
        open(
            "/home/francesco/Documents/Thesis_project/Results/trace_samples_compare_kstest.csv",
            'a'))
    writer.writerow(res)

#multi_plot_data(pkt_segs[0],time_segs[0],cst_segs[0])
#multi_plot_data(pkt_segs[1],time_segs[1],cst_segs[1])
#multi_plot_data(pkt_segs[2],time_segs[2],cst_segs[2])
for mi, mr in zip(utr3['mir'], utr3['mrna']): cor.append(spearmanr(np.log2(stad.loc[mr]+1), np.log2(stadmirs.loc[mi]+1))[0]) utr3['stad_corr'] = cor n_permute = 100 stad_random_cor_e = np.zeros(n_permute * len(utr3['mir'])) i = 0 for c in range(n_permute): for mi in utr3['mir']: rand_index = randint(0, len(stad.index) - 1) stad_random_cor_e[i] = spearmanr(np.log2(stad.iloc[rand_index]+1), np.log2(stadmirs.loc[mi]+1))[0] i += 1 print(ks_2samp(utr3['stad_corr'], stad_random_cor_e, alternative="greater"), np.median(utr3['stad_corr']), np.median(stad_random_cor_e)) cor = [] for mi, mr in zip(cds['mir'], cds['mrna']): cor.append(spearmanr(np.log(stad.loc[mr]+1), np.log(stadmirs.loc[mi]+1))[0]) cds['stad_corr'] = cor n_permute = 100 stad_random_cor_e_cds = np.zeros(n_permute * len(cds['mir'])) i = 0 for c in range(n_permute): for mi in cds['mir']: rand_index = randint(0, len(stad.index) - 1) stad_random_cor_e_cds[i] = spearmanr(np.log2(stad.iloc[rand_index]+1), np.log2(stadmirs.loc[mi]+1))[0] i += 1
print(sys.argv, "Q1&Q3", Q1, Q3)
print(sys.argv[0], "Load Data", len(ans))

plt.figure()
head = 10
n, bins, patches = plt.hist(ans, bins=head, range=[-0.5, head + 0.5], density=True)
print(sys.argv[0], "Draw Histogram")

binsMiddles = 0.5 * (bins[1:] + bins[:-1])
params, covMatrix = curve_fit(poissonDist, binsMiddles, n)
xPlot = np.linspace(0, head, 1000)
#plt.plot(xPlot, poissonDist(xPlot, *params), "r-", lw=2)
st = ks_2samp(ans, poissonDist(xPlot, *params))
print(sys.argv[0], "Draw Poisson")

plt.title("CNV with IGSR")
plt.grid(True)
plt.xlabel("CNV")
plt.ylabel("Frequency")
#plt.text(5, 0.125, "lambda = %.2f" % (params))
plt.text(5, 0.150, "n = %d" % (len(ans)))
#plt.text(5, 0.175, "st = %.2f" % st[0])

fig = plt.gcf()
fig.set_size_inches(24, 18)

title = "HistCNA_"
if len(sys.argv) > 1:
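As written, ks_2samp above compares the raw CNV counts in `ans` with the fitted PMF evaluated on a dense grid, which are not two samples from comparable distributions, so the resulting statistic is hard to interpret. A more conventional check is to draw a synthetic sample from the fitted Poisson and compare sample to sample (KS is conservative for discrete data because of ties, but at least both arguments are samples). A sketch under the assumption that params[0] is the fitted Poisson rate.

from scipy.stats import poisson, ks_2samp

lam = params[0]                                 # fitted rate (assumption: first fitted parameter)
simulated = poisson.rvs(lam, size=len(ans))     # a sample drawn from the fitted model
st = ks_2samp(ans, simulated)                   # genuine two-sample comparison
print(sys.argv[0], "KS vs fitted Poisson", st)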
        fmax_dict[treatment][taxon] = np.asarray(f_max_all)

fmax_dict

ks_dict = {}
#treatment_ = []
p_values = []
for treatment in treatments:
    ks_dict[treatment] = {}
    sample_1 = fmax_dict[treatment]['B']
    sample_2 = fmax_dict[treatment]['S']
    D, p_value = stats.ks_2samp(sample_1, sample_2)
    ks_dict[treatment]['D'] = D
    ks_dict[treatment]['p_value'] = p_value
    #treatment_pairs.append((treatment_pair, taxon))
    p_values.append(p_value)

reject, pvals_corrected, alphacSidak, alphacBonf = multitest.multipletests(p_values, alpha=0.05, method='fdr_bh')

for treatment_idx, treatment in enumerate(treatments):
    ks_dict[treatment]['p_value_bh'] = pvals_corrected[treatment_idx]

for treatment in pt.treatments:
    for taxon, f_max_array in fmax_dict[treatment].items():
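For reference, a self-contained version of the pattern above (one KS test per group, then a Benjamini-Hochberg correction across groups), with synthetic samples standing in for fmax_dict; the group names and effect sizes are made up.

import numpy as np
from scipy import stats
from statsmodels.stats import multitest

rng = np.random.RandomState(1)
groups = {
    'g1': (rng.normal(0, 1, 200), rng.normal(0.0, 1, 200)),
    'g2': (rng.normal(0, 1, 200), rng.normal(0.4, 1, 200)),
    'g3': (rng.normal(0, 1, 200), rng.normal(0.8, 1, 200)),
}

p_values = [stats.ks_2samp(a, b)[1] for a, b in groups.values()]
reject, p_bh, _, _ = multitest.multipletests(p_values, alpha=0.05, method='fdr_bh')
print(dict(zip(groups.keys(), p_bh)))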
    cont1[n] += len(p2_silences)
    cont2[n] += len(p1_silences)
    igd1[n] += len(p2_ignores)
    igd2[n] += len(p1_ignores)
    user_hists1.append(hists1)
    user_hists2.append(hists2)

# test each distribution individually
p1_model = p1_stack[2]
for n, p1 in enumerate(p1_stack):
    if n == 2:
        continue
    result = stats.mannwhitneyu(p1, p1_model)
    print "p1 model", types[n], result.pvalue
    w, p = stats.ks_2samp(p1, p1_model)
    print p

p2_model = p2_stack[2]
for n, p2 in enumerate(p2_times_stack):
    if n == 2:
        continue
    result = stats.mannwhitneyu(p2, p2_model)
    print "p2 model", types[n], result.pvalue
    w, p = stats.ks_2samp(p2, p2_model)
    print p

user1 = np.array(user_hists1)
user2 = np.array(user_hists2)
sum1 = np.sum(user1, axis=0)
def main(): #full_network_filename="./network_all_users/full_network_all_users.gml" # i CANT use this network, because the labels dont match the users id from the dB # G_full = nx.read_gml(full_network_filename) # list_A=[] #Testign out how KS works on a random sample # list_B=[] #for i in range (10000): # list_A.append(random.random()) # list_B.append(random.random()) #print "KS test listA against normal distrib:", stats.kstest(list_A, "norm" ) # print "KS test listB against normal distrib:", stats.kstest(list_B, "norm" ) #print "two-sided KS test listA vs listB:", stats.ks_2samp(list_A, list_B) unrealistic_weight_change = 70. database = "calorie_king_social_networking_2010" server = "tarraco.chem-eng.northwestern.edu" user = "******" passwd = "n1ckuDB!" db = Connection(server, database, user, passwd) GC_network_filename = "./network_all_users/GC_full_network_all_users_merged_small_comm_roles_diff_layers1_roles_diff_layers1.5.gml" G = nx.read_gml(GC_network_filename) output_filename = "./network_all_users/Results_comparison_histograms_percent_weight_change.txt" file_output = open(output_filename, 'wt') # print "num. nodes:",len(G.nodes()) list_of_lists = nx.connected_components(G) print "num. of components:", len(list_of_lists), "size GC:", len( list_of_lists[0]) list_weight_changes_GC = [] list_weight_changes_R6friends = [] for node in G.nodes(): label = G.node[node]["label"] percent_weight_change = G.node[node]["percentage_weight_change"] R6_overlap = G.node[node]["R6_overlap"] #print node, label, weight_change, R6_overlap if percent_weight_change > -unrealistic_weight_change and percent_weight_change < unrealistic_weight_change: # filter out unrealistic values list_weight_changes_GC.append(percent_weight_change) if R6_overlap > 0: list_weight_changes_R6friends.append(percent_weight_change) print >> file_output, "num GC users:", len( list_weight_changes_GC), "num users with R6 friends:", len( list_weight_changes_R6friends) histograma_bines_gral.histograma_bins( list_weight_changes_GC, 20, "./network_all_users/histogram_weight_change_GC_users.dat") histograma_bines_gral.histograma_bins( list_weight_changes_R6friends, 20, "./network_all_users/histogram_weight_change_users_with_R6friends.dat") print >> file_output, "KS test GC against normal distrib:", stats.kstest( list_weight_changes_GC, "norm") print >> file_output, "KS test users with R6 friends against normal distrib:", stats.kstest( list_weight_changes_R6friends, "norm") print >> file_output, "two-sided KS test GC vs users with R6 friends:", stats.ks_2samp( list_weight_changes_GC, list_weight_changes_R6friends) list_weight_changes_all = [] query1 = """SELECT * FROM users""" result1 = db.query(query1) # is a list of dicts. for r1 in result1: percent_weight_change = (float(r1['most_recent_weight']) - float( r1['initial_weight'])) / float(r1['initial_weight']) # if percent_weight_change > -unrealistic_weight_change and percent_weight_change < unrealistic_weight_change : # filter out unrealistic values list_weight_changes_all.append(percent_weight_change) histograma_bines_gral.histograma_bins( list_weight_changes_all, 200, "./network_all_users/histogram_weight_change_users_all_200bins.dat") print >> file_output, "tot. 
number users", len(list_weight_changes_all) print >> file_output, "KS test all against normal distrib:", stats.kstest( list_weight_changes_all, "norm") print >> file_output, "two-sided KS test all vs GC:", stats.ks_2samp( list_weight_changes_all, list_weight_changes_GC) print >> file_output, "two-sided KS test all vs users with R6 friends:", stats.ks_2samp( list_weight_changes_GC, list_weight_changes_R6friends) file_output.close() print "written file:", output_filename exit() query1 = """SELECT * FROM friends order by src asc""" result1 = db.query(query1) # is a list of dict. print "number links:", len(result1) list_friends = [] for r1 in result1: label_src = r1['src'] label_dest = r1['dest'] if label_src not in list_friends: list_friends.append(label_src) if label_dest not in list_friends: list_friends.append(label_dest) print "num networked users:", len(list_friends)
def KS_test(y1, y2):
    temp = stats.ks_2samp(y1, y2)
    return temp.pvalue
def test_one_feature_mixture(component_model_type, num_clusters=3, show_plot=False, seed=None): """ """ random.seed(seed) N = 300 separation = .9 get_next_seed = lambda: random.randrange(2147483647) cluster_weights = [[1.0 / float(num_clusters)] * num_clusters] cctype = component_model_type.cctype T, M_c, structure = sdg.gen_data([cctype], N, [0], cluster_weights, [separation], seed=get_next_seed(), distargs=[distargs[cctype]], return_structure=True) T_list = list(T) T = numpy.array(T) # pdb.set_trace() # create a crosscat state M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype]) state = State.p_State(M_c, T_list) # Get support over all component models discrete_support = qtu.get_mixture_support( cctype, component_model_type, structure['component_params'][0], nbins=250) # calculate simple predictive probability for each point Q = [(N, 0, x) for x in discrete_support] # transitions state.transition(n_steps=200) # get the sample X_L = state.get_X_L() X_D = state.get_X_D() # generate samples # kstest has doesn't compute the same answer with row and column vectors # so we flatten this column vector into a row vector. predictive_samples = sdg.predictive_columns( M_c, X_L, X_D, [0], seed=get_next_seed()).flatten(1) probabilities = su.simple_predictive_probability(M_c, X_L, X_D, [] * len(Q), Q) # get histogram. Different behavior for discrete and continuous types. For some reason # the normed property isn't normalizing the multinomial histogram to 1. # T = T[:,0] if is_discrete[component_model_type.model_type]: bins = range(len(discrete_support)) T_hist = numpy.array(qtu.bincount(T, bins=bins)) S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins)) T_hist = T_hist / float(numpy.sum(T_hist)) S_hist = S_hist / float(numpy.sum(S_hist)) edges = numpy.array(discrete_support, dtype=float) else: T_hist, edges = numpy.histogram(T, bins=min(50, len(discrete_support)), normed=True) S_hist, _ = numpy.histogram(predictive_samples, bins=edges, normed=True) edges = edges[0:-1] # Goodness-of-fit-tests if not is_discrete[component_model_type.model_type]: # do a KS tests if the distribution in continuous # cdf = lambda x: component_model_type.cdf(x, model_parameters) # stat, p = stats.kstest(predictive_samples, cdf) # 1-sample test stat, p = stats.ks_2samp(predictive_samples, T[:, 0]) # 2-sample test test_str = "KS" else: # Cressie-Read power divergence statistic and goodness of fit test. # This function gives a lot of flexibility in the method <lambda_> used. 
freq_obs = S_hist * N freq_exp = numpy.exp(probabilities) * N stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson') test_str = "Chi-square" if show_plot: pylab.clf() lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type, structure['component_params'][0], [1.0 / num_clusters] * num_clusters) pylab.axes([0.1, 0.1, .8, .7]) # bin widths width = (numpy.max(edges) - numpy.min(edges)) / len(edges) pylab.bar(edges, T_hist, color='blue', alpha=.5, width=width, label='Original data', zorder=1) pylab.bar(edges, S_hist, color='red', alpha=.5, width=width, label='Predictive samples', zorder=2) # plot actual pdf of support given data params pylab.scatter(discrete_support, numpy.exp(lpdf), c="blue", edgecolor="none", s=100, label="true pdf", alpha=1, zorder=3) # plot predictive probability of support points pylab.scatter(discrete_support, numpy.exp(probabilities), c="red", edgecolor="none", s=100, label="predictive probability", alpha=1, zorder=4) pylab.legend() ylimits = pylab.gca().get_ylim() pylab.ylim([0, ylimits[1]]) title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \ % (N, num_clusters, component_model_type.cctype, test_str, round(p,4)) pylab.title(title_string, fontsize=12) filename = component_model_type.model_type + "_mixtrue.png" pylab.savefig(filename) pylab.close() return p
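The branch above encodes a general rule for goodness-of-fit checks: a two-sample KS test when the modelled variable is continuous, and a Cressie-Read/chi-square test on binned frequencies when it is discrete. A condensed sketch of that decision with made-up argument names; the calls mirror the ones in the test above.

from scipy import stats

def goodness_of_fit(predictive_samples, data, discrete, freq_obs=None, freq_exp=None):
    # Continuous data: compare predictive samples with the observed data directly.
    if not discrete:
        return stats.ks_2samp(predictive_samples, data)
    # Discrete data: compare observed and expected bin frequencies.
    return stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')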
    y = binom.rvs(n, p, size=s_size)
    samples.append([y, i, "binomial"])

for i in range(3 * n_of_samples, 4 * n_of_samples):
    y = geom.rvs(p, size=s_size)
    samples.append([y, i, "geometric"])

for i in range(4 * n_of_samples, 5 * n_of_samples):
    y = poisson.rvs(n, size=s_size)
    samples.append([y, i, "poisson"])

outlier_1 = beta.rvs(1, 10, size=1000)
outlier_2 = chi2.rvs(n, size=1000)
samples.append([outlier_1, 5 * n_of_samples, "beta"])
samples.append([outlier_2, 5 * n_of_samples + 1, "chi_square"])

for i in range(len(samples)):
    for j in range(i, len(samples)):
        ks_test_pvalue = ks_2samp(samples[i][0], samples[j][0])[1]
        epps_singleton_pvalue = epps_singleton_2samp(samples[i][0], samples[j][0])[1]
        if ks_test_pvalue > 0.05:
            G.add_edge(i, j, weight=0.01 / (ks_test_pvalue))  # 0.01 scaling factor here
        if epps_singleton_pvalue > 0.05:
            H.add_edge(i, j, weight=0.01 / (epps_singleton_pvalue))  # 0.01 scaling factor here

# Testing whether two samples are generated by the same underlying distribution is a classical
# question in statistics. A widely used test is the Kolmogorov-Smirnov (KS) test, which relies on
# the empirical distribution function. Epps and Singleton introduce a test based on the empirical
# characteristic function.
#
# One advantage of the ES test compared to the KS test is that it does not assume a continuous
# distribution. The authors conclude that the test also has higher power than the KS test in many
# examples. They recommend the ES test for discrete samples as well as for continuous samples with
# at least 25 observations each.

# In[2]:
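A minimal illustration of the comment above, comparing ks_2samp and epps_singleton_2samp on two discrete samples with different rates; with samples this size both tests should reject, but the ES test is the one recommended for discrete data (epps_singleton_2samp requires scipy >= 1.2). The parameters are arbitrary.

import numpy as np
from scipy.stats import ks_2samp, epps_singleton_2samp, poisson

rng = np.random.RandomState(0)
x = poisson.rvs(3, size=200, random_state=rng)
y = poisson.rvs(4, size=200, random_state=rng)

print(ks_2samp(x, y).pvalue)              # KS p-value is conservative here because of ties
print(epps_singleton_2samp(x, y).pvalue)  # ES does not assume a continuous distribution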