Example #1
	def test_mogsm(self):
		mcgsm = MCGSM(
			dim_in=0,
			dim_out=3,
			num_components=2,
			num_scales=2,
			num_features=0)

		p0 = 0.3
		p1 = 0.7
		N = 20000
		m0 = array([[2], [0], [0]])
		m1 = array([[0], [2], [1]])
		C0 = cov(randn(mcgsm.dim_out, mcgsm.dim_out**2))
		C1 = cov(randn(mcgsm.dim_out, mcgsm.dim_out**2))
		input = zeros([0, N])
		output = hstack([
			dot(cholesky(C0), randn(mcgsm.dim_out, round(p0 * N))) + m0,
			dot(cholesky(C1), randn(mcgsm.dim_out, round(p1 * N))) + m1]) * (rand(1, N) + 0.5)

		mcgsm.train(input, output, parameters={
			'verbosity': 0,
			'max_iter': 10,
			'train_means': True})

		mogsm = MoGSM(3, 2, 2)

		# translate parameters from MCGSM to MoGSM
		mogsm.priors = sum(exp(mcgsm.priors), 1) / sum(exp(mcgsm.priors))

		for k in range(mogsm.num_components):
			mogsm[k].mean = mcgsm.means[:, k]
			mogsm[k].covariance = inv(dot(mcgsm.cholesky_factors[k], mcgsm.cholesky_factors[k].T))
			mogsm[k].scales = exp(mcgsm.scales[k, :])
			mogsm[k].priors = exp(mcgsm.priors[k, :]) / sum(exp(mcgsm.priors[k, :]))

		self.assertAlmostEqual(mcgsm.evaluate(input, output), mogsm.evaluate(output), 5)

		mogsm_samples = mogsm.sample(N)
		mcgsm_samples = mcgsm.sample(input)

		# generated samples should have the same distribution
		for i in range(mogsm.dim):
			self.assertTrue(ks_2samp(mogsm_samples[i], mcgsm_samples[i])[1] > 0.0001)

		posterior = mcgsm.posterior(input, mcgsm_samples)

		# average posterior should correspond to prior
		for k in range(mogsm.num_components):
			self.assertLess(abs(1 - mean(posterior[k]) / mogsm.priors[k]), 0.1)
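
Note that ks_2samp returns a (statistic, pvalue) pair, so a threshold comparison needs the p-value at index [1], as the assertion above does. A minimal standalone sketch of that usage with plain NumPy data:

import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(0)
a = rng.normal(size=1000)
b = rng.normal(size=1000)

statistic, pvalue = ks_2samp(a, b)  # ks_2samp returns a (statistic, pvalue) pair
print(statistic, pvalue)            # same-distribution samples: expect a large p-value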
Example #2
def tailStats(tail1, tail2, gene):
    threeLoc1 = []
    threeLoc2 = []
    tailLen1 = []
    tailLen2 = []
    for tail in tail1:
        if gene in tail[2]:
            repeater(int(tail[3]), threeLoc1, int(tail[1]))
            repeater(int(tail[4]), tailLen1, int(tail[1]))
            #threeLoc1.append(int(tail[3]))
            #tailLen1.append(int(tail[4]))
    for tail in tail2:
        if gene in tail[2]:
            repeater(int(tail[3]), threeLoc2, int(tail[1]))
            repeater(int(tail[4]), tailLen2, int(tail[1]))
            #threeLoc2.append(int(tail[3]))
            #tailLen2.append(int(tail[4]))
    if not threeLoc1 or not threeLoc2:
        pLoc = "nan"
        pTail = "nan"
    else:
        #pLoc = stats.ttest_ind(threeLoc1, threeLoc2)[1]
        #pTail = stats.ttest_ind(tailLen1, tailLen2)[1]
        pLoc = stats.ks_2samp(threeLoc1, threeLoc2)[1]
        pTail = stats.ks_2samp(tailLen1, tailLen2)[1]
    return gene, len(threeLoc1), np.average(threeLoc1), np.average(tailLen1), len(threeLoc2), np.average(threeLoc2), np.average(tailLen2), pLoc, pTail
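
The snippet above relies on a repeater helper that is not shown. From the call sites it appears to append a value to a list a given number of times, expanding count-weighted rows into individual observations before the KS test. A sketch of such a helper, under that assumption:

def repeater(value, target, count):
    # Assumed behavior: append `value` to `target` `count` times so that
    # count-weighted rows become individual observations for ks_2samp.
    target.extend([value] * count)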
Example #3
def test_weibull(dist, p1, p2, report_file = None, round = False):
    """
     -----------------------------------------------------
        kstest for weibull distribution
            :param p1: scale, lambda > 0
            :param p2: shape, kappa > 0
            :param dist: The distribution to be tested
            :return: True, False
    -----------------------------------------------------
    """
    size = len(dist)
    # s = np.random.weibull(p2, size)
    # dist_weibull_np = map(lambda x : x * p1, s)
    dist_weibull_scipy = stats.weibull_min.rvs(c = p2, loc = 0, scale = p1, size = size)
    if round:
        dist_weibull_scipy2 = []
        for n in dist_weibull_scipy:
            dist_weibull_scipy2.append(round_to_n_digit(n, 7))
        result = stats.ks_2samp(dist_weibull_scipy2,dist)
    else:
        result = stats.ks_2samp(dist_weibull_scipy, dist)

    p = get_p_s_from_ksresult(result)['p']
    s = get_p_s_from_ksresult(result)['s']
    critical_value_s = calc_ks_critical_value(size)

    # return p >= 5e-2 or s <= critical_value_s
    if p >= 5e-2 or s <= critical_value_s:
        return True
    else:
        if report_file is not None:
            report_file.write("BAD: ({0},{1})failed with statistic={2}, pvalue={3}, expected s less than {4} and p larger than 0.05.\n".format(p1, p2, s, p, critical_value_s))
        return False
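
The helpers get_p_s_from_ksresult and calc_ks_critical_value are not shown. For the latter, a common convention for an asymptotic two-sided KS critical value at alpha = 0.05 is c(alpha) * sqrt((n1+n2)/(n1*n2)), which reduces to 1.36 * sqrt(2/n) for two samples of equal size n. A sketch consistent with that convention (an assumption, not the original implementation):

import math

def calc_ks_critical_value(size, c_alpha=1.36):
    # Assumed implementation: asymptotic two-sided KS critical value for
    # two equal-size samples at alpha = 0.05, D_crit = c(alpha) * sqrt(2/n).
    return c_alpha * math.sqrt(2.0 / size)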
Example #4
def motifStats(data,motifSize,degree, usetotal=False):
	
	for corr in ('corr','lcorr','lacorr'):
		motifsNL = findMotifs(data,('NL',corr), motifSize, degree, usetotal)
		motifsMCI = findMotifs(data,('MCI',corr), motifSize, degree, usetotal)
		motifsAD = findMotifs(data,('AD',corr), motifSize, degree, usetotal)
		
		allMotifs = list(set(motifsNL.keys()) | set(motifsAD.keys()) | set(motifsMCI.keys()))
		
		datatype = "Total" if usetotal else "Percent"
		filename = "result2/{}_ks-stats_size-{}_deg-{}.txt".format(corr+datatype,motifSize,degree)
		with open(filename,'w') as f:
			f.write("{0:>10}{1:>15}{2:>15}{3:>15}{4:>15}{5:>15}\n".format('ID','MCI','AD','NORM NL','NORM MCI','NORM AD'))
			for key in allMotifs:
				NLdata = motifsNL.get(key,np.zeros(88))
				MCIdata = motifsMCI.get(key,np.zeros(88))
				ADdata = motifsAD.get(key,np.zeros(88))
				KSstatistic, MCIpvalue = stats.ks_2samp(MCIdata,NLdata)
				KSstatistic, ADpvalue = stats.ks_2samp(ADdata,NLdata)
				k2,NLnorm = stats.normaltest(NLdata)
				k2,MCInorm = stats.normaltest(MCIdata)
				k2,ADnorm = stats.normaltest(ADdata)
				if MCIpvalue<0.01 or ADpvalue<0.01:
					line = "*{0:>9}{1:15.3}{2:15.3}{3:15.3}{4:15.3}{5:15.3}\n"
				else:
					line = "{0:>10}{1:15.3}{2:15.3}{3:15.3}{4:15.3}{5:15.3}\n"
				f.write(line.format(str(int(key)),MCIpvalue,ADpvalue,NLnorm,MCInorm,ADnorm))
Example #5
def main():
	pt = '/home/sdhawan/bol_ni_ej/'
	
	#sn =sys.argv[1]
	sbv = np.loadtxt(pt+'sbv_all_b14.txt', dtype=str)
	snlis = np.loadtxt('all91bg.txt', dtype=str)
	#p, f=ir_frac(sn)
	arr =[]
	for i in snlis:
		try:
			s = sbv[sbv[:,0] == i][0]
			
			irt = ir_at_max(i)
			print(irt)
			arr.append([irt, float(s[1]), float(s[2])])
		except Exception:
			pass
	print(arr)
	arr = np.array(arr)
	#print "the NIR fraction at bolometric maximum is:", round(ir_at_max(sn)*100, 2), "%"

	plt.errorbar(arr[:,0], arr[:,1] , arr[:,2], fmt='go')
	plt.show()
	return 0
	ir = np.loadtxt('../ejecmass.txt', usecols=(-2, -1))
	noir = np.loadtxt('../ejecmass_noir.txt', usecols=(-2, -1))
	
	print(ks_2samp(ir[:,0], noir[:,0]))
	plt.hist(ir[:,0], histtype='step')
	plt.hist(noir[:,0], histtype='step')
Example #6
    def findKSstat(self):                       
        """
        """        
        # Load baseline files for comparison:
        baseline = self.loadPickle(condition='baseline')

        # KS stats:
        for syll in self.syllables:

            AED, AEp = stats.ks_2samp(self.syllables[syll]['dstFreq'], 
                                     baseline[syll]['dstFreq'])
            self.syllables[syll]['EntKS'] = AED
            self.syllables[syll]['EntPvalKS'] = AEp
            print('syll ', syll, 'entropy : ', AED)

            AFD, AFp = stats.ks_2samp(self.syllables[syll]['dstEnt'], 
                                     baseline[syll]['dstEnt'])
            self.syllables[syll]['FreqKS'] = AFD
            self.syllables[syll]['FreqPvalKS'] = AFp
            print('syll ', syll, 'freq : ', AFD)
            
            EXPdur = []
            for song in self.syllables[syll]['duration']:
                if song is not None:
                    EXPdur.append(self.syllables[syll]['duration'][song])
                    
            BASEdur = []
            for song in baseline[syll]['duration']:
                if song is not None:
                    BASEdur.append(baseline[syll]['duration'][song])
                    
            ADT, ADp = stats.ks_2samp(EXPdur, BASEdur)
            self.syllables[syll]['DurKS'] = ADT
            self.syllables[syll]['DurPvalKS'] = ADp
            print('syll ', syll, 'duration : ', ADT)
Example #7
def KS_test(groups, outfile):
    jdelim = args.delimiter if args.delimiter is not None else ' '
    for i,u in enumerate(groups):
        for j,v in enumerate(groups):
            if j > i or (j == i and len(args.columns) == 1):
                break
            for x,us in enumerate(u.samples):
                for y,vs in enumerate(v.samples):
                    if len(vs) < args.ignore or len(us) < args.ignore:
                        continue
                    if j == i and y >= x:
                        break
                    if args.random is not None:
                        verdict = False
                        for k in range(args.random):
                            res = ks_2samp(random.sample(us, args.subsample), random.sample(vs, args.subsample))
                            if res[0] < res[1]:
                                verdict = True
                            outfile.write(jdelim.join(u.tup + v.tup + map(str, res)) + '\n')
                        outfile.write('Verdict:' + str(verdict) + '\n')
                    else:
                        res = ks_2samp(us, vs)
                        verdict = False
                        if res[0] < res[1]:
                            verdict = True
                        outfile.write(jdelim.join(u.tup + v.tup + map(str, res)) + '\n')
                        outfile.write('Verdict:' + str(verdict) + '\n')
Example #8
def test_points(xs, ys):
    print(xs[0][0], " steps")
    #print(ks_2samp(xs[2:], ys[2:]))
    for i in range(2, 7):
        print(ks_2samp(xs[i], ys[i]))
    print("======")
Example #9
def kstest(x,y,alpha, beta):
    """Find the K-S test probability that the fit and the data were from the same distribution"""
    #Vector of expected y from fit
    fity = beta*x + alpha
    #Vector of expected x from fit
    fitx = (y - alpha) / beta
    (D1, p1) = st.ks_2samp(y,fity)
    (D2, p2) = st.ks_2samp(x,fitx)
    return (np.sqrt(D1*D2),np.sqrt(p1*p2))
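
A quick usage sketch with synthetic linear data (assuming numpy as np and scipy.stats as st are imported, as the function body expects); a good fit should give a small combined D and a large combined p:

import numpy as np
import scipy.stats as st

rng = np.random.default_rng(1)
x = np.linspace(0.0, 10.0, 200)
y = 2.0 * x + 1.0 + rng.normal(scale=0.5, size=x.size)
D, p = kstest(x, y, alpha=1.0, beta=2.0)  # fit parameters match the generator
print(D, p)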
Example #10
def kstest():
    n1=200
    n2=300
    a = stats.norm.rvs(size=n1, loc=0, scale=1)
    b = stats.norm.rvs(size=n2, loc=0.5, scale=1.5)
    c = stats.norm.rvs(size=n2, loc=0.01, scale=1)

    print (stats.ks_2samp(a, b))
    print (stats.ks_2samp(a, c))
Example #11
    def samples_from_same_distribution(self, *args):
        # Test if flattened samples distributions match (marginals match)
        _, p_marginal = st.ks_2samp(*[s.flatten() for s in args])
        # Test if correlations within non-independent draws match
        _, p_correlation = st.ks_2samp(
            *[np.array([np.corrcoef(ss) for ss in s]).flatten()
              for s in args]
        )
        assert p_marginal >= 0.05 and p_correlation >= 0.05
Example #12
def stats(binding_data, proximity_data):
    n_bins = 50
    hist_b, bins_b = np.histogram(binding_data, bins=np.linspace(np.min(binding_data), np.max(binding_data), n_bins))
    hist_p, bins_p = np.histogram(proximity_data, bins=np.linspace(np.min(binding_data), np.max(binding_data), n_bins))
    #print hist_b, hist_p
    #print "Binding data> mean:%s, median:%s, std:%s" %(np.mean(binding_data), np.median(binding_data), np.std(binding_data))
    #print "Proximity data> mean:%s, median:%s, std:%s" %(np.mean(proximity_data), np.median(proximity_data), np.std(proximity_data))
    #print scipy.stats.spearmanr(hist_b, hist_p)
    #print bins_b, bins_p
    print(ks_2samp(hist_b, hist_b))  # sanity check: identical inputs give D=0, p=1
    print(ks_2samp(hist_b, hist_p))
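
Note that the calls above feed binned histogram counts into ks_2samp, while the two-sample KS test is defined on raw observations; comparing the data arrays directly is the conventional usage. A self-contained sketch with synthetic stand-ins for the two arguments:

import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(0)
binding_data = rng.normal(0.0, 1.0, 500)       # stand-ins for the function arguments
proximity_data = rng.normal(0.2, 1.0, 500)
print(ks_2samp(binding_data, proximity_data))  # test on raw samples, not counts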
Example #13
def plot_posteriors_iter(eli,file_names):
    from scipy import stats
    import matplotlib as mpl
    mpl.use('Agg')
    mpl.rcParams.update({'font.size': 20})
    import matplotlib.pyplot as plt
    no_files=np.array(file_names).shape[0]
    samples=list()
    for i in np.arange(no_files):
        read=np.genfromtxt(file_names[i])
        samples.append(read[read.shape[0]//5:read.shape[0],:])
    down=eli.lower_bounds
    up=eli.upper_bounds
    no_of_pars=up.shape[0]
    plt.clf()
    f,axes=plt.subplots(2,int(np.ceil(no_of_pars/2)),figsize=(24,12))
    row=0
    col=0
    colors='bgcymbkrgrcmykw'
    linetypes=['--','-.','-.','-.','-.',':','-','-']
    kolmog_stats=np.zeros((2*no_files-1,up.shape[0]))
    for i in np.arange(up.shape[0]):
        # patches=[]
        lines=[]
        x=np.arange(down[i],up[i],0.01)
        for j in np.arange(no_files):
            Ds,ps=stats.ks_2samp(samples[j][:,i],samples[-1][:,i])
            kolmog_stats[j*2,i]=Ds
            if j>0:
                D,p=stats.ks_2samp(samples[j][:,i],samples[j-1][:,i])
                kolmog_stats[j*2-1,i]=D
                # print(D)
            kde=stats.gaussian_kde(samples[j][:,i])
            kde.covariance_factor = lambda : .3
            kde._compute_covariance()
            line,=axes[row,col].plot(x,kde.evaluate(x),linestyle=linetypes[j],color=colors[j],lw=3)
            lines.append(line)
        axes[row,col].set_title(eli.names[i])
        print("_________")
        if i>up.shape[0]-3:
            axes[row,col].set_ylim([0,15])
        row+=1
        if (row==2):
            row=0
            col+=1
    f.tight_layout(rect=[0, 0.13, 1, 1])
    np.savetxt("kolmog.dat",kolmog_stats,delimiter=" & ", fmt="%.2f")
    # f.legend([leg1,patches[2],patches[1],patches[0]],["Prior distribution","SWMM posterior","Emulator (improved) posterior","Emulator (standard) posterior",], bbox_to_anchor=[0.5, 0.05],loc='center',ncol=2)
    # f.legend([lines[0],lines[1],lines[2],lines[3],lines[4],lines[5],lines[6]],["noniterative (128)","0$^{th}$ iteration (64)","1$^{st}$ iteration (80)","2$^{nd}$ iteration (96)","3$^{rd}$ iteration (112)","4$^{th}$ iteration (128)","4$^{th}$ iteration (144)","SWMM",], bbox_to_anchor=[0.5, 0.08],loc='center',ncol=3)
    # f.legend([lines[0],lines[1],lines[2],lines[3],lines[4],lines[5],lines[6],lines[7]],["noniterative (72)","0$^{th}$ iteration (32)","1$^{st}$ iteration (40)","2$^{nd}$ iteration (48)","3$^{rd}$ iteration (56)","4$^{th}$ iteration (64)","5$^{th}$ iteration (72)","SWMM",], bbox_to_anchor=[0.5, 0.08],loc='center',ncol=3)
    f.legend([lines[0],lines[1],lines[2],lines[3],lines[4],lines[5],lines[6],lines[7]],["noniterative (108)","0$^{th}$ iteration (48)","1$^{st}$ iteration (60)","2$^{nd}$ iteration (72)","3$^{rd}$ iteration (84)","4$^{th}$ iteration (96)","5$^{th}$ iteration (108)","SWMM",], bbox_to_anchor=[0.5, 0.08],loc='center',ncol=3)
    f.savefig("posteriors.pdf",dpi=500)
    plt.close()
Example #14
def omniDataCorr(srefDate, erefDate, startDate, endDate, epochs, SWP, binStride, CorrTime = 'Day', CorrType = 'kstest'):
    import numpy
    import bisect
    import datetime
    from scipy.stats import ks_2samp, pearsonr
    from getswdata import getOMNIfiles, dataClean, dateShift, dateList

    CorrTime = CorrTime.lower()
    CorrType = CorrType.lower()

    if endDate < startDate:
     print('(swdatanal.omniDataCorr).Error: Dates are not applicable')
     SWPDatRng=0; cepochs=0; KSVals=0; KSDist=0; aepochs=0
     return SWPDatRng, cepochs, KSVals, KSDist, aepochs

    sEpochID  = bisect.bisect_left(epochs, startDate)
    eEpochID  = bisect.bisect_left(epochs, endDate)
    cepochs   = epochs[sEpochID:eEpochID]
    SWPDatRng = SWP[sEpochID:eEpochID]
    if len(SWPDatRng) == 0:
     print('(swdatanal.omniDataCorr).Error: No data available for the designated date(s) and/or time(s).')
     SWPDatRng=0; cepochs=0; KSVals=0; KSDist=0; aepochs=0
     return SWPDatRng, cepochs, KSVals, KSDist, aepochs
    _, bins   = getDistrib(filter(lambda v: v==v, SWPDatRng), stride = binStride, norm = False)

    sEpochID = bisect.bisect_left(epochs, srefDate)
    eEpochID = bisect.bisect_left(epochs, erefDate)
    SWPV01   = SWP[sEpochID:eEpochID]
    SWPD01   = getDistrib(filter(lambda v: v==v, SWPV01), bins=bins, norm=True)

    if CorrTime == 'day':
     aepochs = []; KSVals = []; KSDist = []
     sEpoch = datetime.datetime(startDate.year,startDate.month,startDate.day, 0, 0, 0)
     eEpoch = dateShift(sEpoch, hours = 23, minutes = 59, seconds = 59)
     for i in range((endDate-startDate).days+1):
      aepochs  = aepochs + [dateShift(sEpoch,0,0,i,0,0,0)]
      sEpochID = bisect.bisect_left(epochs, dateShift(sEpoch,0,0,i,0,0,0))
      eEpochID = bisect.bisect_left(epochs, dateShift(eEpoch,0,0,i,0,0,0))

      SWPV02 = SWP[sEpochID:eEpochID]
      SWPD02 = getDistrib(filter(lambda v: v==v, SWPV02), bins=bins, norm=True)

      if CorrType == 'kstest':
       KSVals = KSVals + [ks_2samp(SWPV01, SWPV02)]
       KSDist = KSDist + [ks_2samp(SWPD01, SWPD02)]
      elif CorrType == 'pearson':
       KSVals = KSVals + [pearsonr(SWPV01, SWPV02)]
       KSDist = KSDist + [pearsonr(SWPD01, SWPD02)]

     KSVals = numpy.array(KSVals)
     KSDist = numpy.array(KSDist)

    return SWPDatRng, cepochs, KSVals, KSDist, aepochs
Example #15
def simple_example():
    np.random.seed(12345678)

    n1 = 200000
    n2 = 300000

    '''
    rvs2, rvs3, rvs4 are progressively more similar in distribution to rvs1,
    which shows up as progressively larger p-values.
    '''
    rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1)
    rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5)
    rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0)
    rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0)

    print(stats.ks_2samp(rvs1, rvs2))
    print(stats.ks_2samp(rvs1, rvs3))
    print(stats.ks_2samp(rvs1, rvs4))
    print(stats.ks_2samp(rvs1, rvs1))

    bins_cnt = 100
    alpha = 0.5
    plt.hist(rvs1, label='rvs1', bins=bins_cnt, alpha=1.0, density=True, histtype='stepfilled')
    plt.hist(rvs2, label='rvs2', bins=bins_cnt, alpha=alpha, density=True, histtype='stepfilled')
    plt.hist(rvs3, label='rvs3', bins=bins_cnt, alpha=alpha, density=True, histtype='stepfilled')
    plt.hist(rvs4, label='rvs4', bins=bins_cnt, alpha=alpha, density=True, histtype='stepfilled')
    plt.legend(loc='best', frameon=False)
    plt.show()
Example #16
def plot_posteriors(eli,file_names):
    from scipy import stats
    import matplotlib as mpl
    mpl.use('Agg')
    mpl.rcParams.update({'font.size': 20})
    import matplotlib.pyplot as plt
    no_files=np.array(file_names).shape[0]
    samples=list()
    for i in np.arange(no_files):
        samples.append(np.genfromtxt(file_names[i]))
    down=eli.lower_bounds
    up=eli.upper_bounds
    no_of_pars=up.shape[0]
    plt.clf()
    f,axes=plt.subplots(2,int(np.ceil(no_of_pars/2)),figsize=(24,12))
    row=0
    col=0
    colors='bgrcmykwbgrcmykw'
    linetypes=['--','-','-','-','-','-','-']
    for i in np.arange(up.shape[0]):
        # patches=[]
        lines=[]
        x=np.arange(down[i],up[i],0.01)
        pri=np.zeros(x.shape[0])
        for j in np.arange(x.shape[0]):
            pri[j]=eli.prior_dist(x[j],i)
        leg1,=axes[row,col].plot(x,pri,'k.-')
        D1,p1=stats.ks_2samp(samples[no_files-1][:,i],samples[no_files-3][:,i])
        D2,p2=stats.ks_2samp(samples[no_files-1][:,i],samples[no_files-2][:,i])
        print(D1)
        print(D2)
        print("_________")
        for j in np.arange(no_files):
            kde=stats.gaussian_kde(samples[j][:,i])
            kde.covariance_factor = lambda : .3
            kde._compute_covariance()
            line,=axes[row,col].plot(x,kde.evaluate(x),linestyle=linetypes[j],color=colors[j],lw=2)
            lines.append(line)
        axes[row,col].set_title(eli.names[i])
        if i>up.shape[0]-3:
            axes[row,col].set_ylim([0,15])
        row+=1
        if (row==2):
            row=0
            col+=1
    f.tight_layout(rect=[0, 0.08, 1, 1])
    # f.legend([leg1,patches[2],patches[1],patches[0]],["Prior distribution","SWMM posterior","Emulator (improved) posterior","Emulator (standard) posterior",], bbox_to_anchor=[0.5, 0.05],loc='center',ncol=2)
    f.legend([leg1,lines[2],lines[1],lines[0]],["Prior distribution","SWMM posterior","Emulator (improved) posterior","Emulator (standard) posterior",], bbox_to_anchor=[0.5, 0.05],loc='center',ncol=2)
    f.savefig("posteriors.pdf",dpi=500)
    plt.close()
Example #17
def model_v_model_cdfs_pdfs(arr, bins, cdf, pdf, args, nsinks=64):
    done = []
    for ref_key in arr.keys():
        for key in arr.keys():
            if ref_key == key:
                continue
            # if (ref_key != 'bInf' and ref_key != 'bInfsd3') and (key != 'bInf' and key != 'bInfsd3'): continue
            # if ref_key != 'hydro_both' and key != 'hydro_both': continue
            if "%s_v_%s" % (ref_key, key) in done or "%s_v_%s" % (key, ref_key) in done:
                continue
            done.append("%s_v_%s" % (ref_key, key))
            t = ttest_ind(arr[ref_key], arr[key], equal_var=False)
            ks = ks_2samp(arr[ref_key], arr[key])
            # plot histograms and show Welch t, KS p-values
            fig, axes = plt.subplots(2, 1, sharex=True)
            ax = axes.flatten()
            plt.subplots_adjust(hspace=0.1)
            ax[0].plot((bins[ref_key][1:] + bins[ref_key][:-1]) / 2, cdf[ref_key], c="b", label="%s" % ref_key)
            ax[0].plot((bins[key][1:] + bins[key][:-1]) / 2, cdf[key], c="k", label="%s" % key)
            ax[1].plot((bins[ref_key][1:] + bins[ref_key][:-1]) / 2, pdf[ref_key], c="b", label="%s" % ref_key)
            ax[1].plot((bins[key][1:] + bins[key][:-1]) / 2, pdf[key], c="k", label="%s" % key)
            ax[1].legend(loc=0, fontsize="medium")
            ax[0].set_ylabel("CDF")
            ax[1].set_ylabel("PDF")
            ax[1].set_xlabel(r"$\dot{M}$")
            for i in range(2):
                ax[i].set_xlim(-7, -2)
                ax[i].set_ylim(-0.1, 1.1)
            plt.suptitle("%s_v_%s: Welch P(t)=%.2g, KS P(t)=%.2g" % (ref_key, key, t[1], ks[1]))
            plt.savefig(os.path.join(args.outdir, "%s-v-%s-nsinks-%d.png" % (ref_key, key, nsinks)))
            plt.close()
            # repeat for hydro1, hydro2, hydro 1+2
    t = ttest_ind(arr[ref_key], arr[key], equal_var=False)
    ks = ks_2samp(arr[ref_key], arr[key])
    # plot histograms and show Welch t, KS p-values
    fig, axes = plt.subplots(2, 1, sharex=True)
    ax = axes.flatten()
    plt.subplots_adjust(hspace=0)
    for key, c in zip(["hydro_both", "bInf", "bInfsd3"], ["b", "y", "k"]):
        ax[0].plot((bins[key][1:] + bins[key][:-1]) / 2, cdf[key], c=c, label="%s" % key)
        ax[1].plot((bins[key][1:] + bins[key][:-1]) / 2, pdf[key], c=c, label="%s" % key)
    ax[1].legend(loc=0, fontsize="medium")
    ax[0].set_ylabel("CDF")
    ax[1].set_ylabel("PDF")
    ax[1].set_xlabel(r"$\dot{M}$")
    for i in range(2):
        ax[i].set_xlim(-6, -2.5)
    plt.savefig(os.path.join(args.outdir, "hydro1+2-v-hydro1-v-hydro2-nsinks-%d.png" % nsinks))
    plt.close()
Example #18
def draw_overtraining(bdt_name, test, train):
    test_bg = test[test.classID==1]
    test_sig = test[test.classID==0]
    train_bg = train[train.classID==1]
    train_sig = train[train.classID==0]

    fig = plt.figure()
    ax = fig.add_subplot(111)

    low = min(test[bdt_name].min(), train[bdt_name].min())
    high = max(test[bdt_name].max(), train[bdt_name].max())

    print(bdt_name, "signal",
          ks_2samp(test_sig[bdt_name], train_sig[bdt_name])[1])
    print(bdt_name, "background",
          ks_2samp(test_bg[bdt_name], train_bg[bdt_name])[1])
    
    ax.hist(train_bg[bdt_name],
            bins=50,
            density=True,
            range=(low,high),
            label="training background",
            color="blue",
            alpha=0.75)
    ax.hist(train_sig[bdt_name],
            bins=50,
            density=True,
            range=(low,high),
            label="training signal",
            color="red",
            alpha=0.75)

    y,binEdges = np.histogram(test_bg[bdt_name],
                              bins=50,
                              density=True,
                              range=(low,high))
    bincenters = 0.5*(binEdges[1:]+binEdges[:-1])
    ax.plot(bincenters, y, 'o', color="blue", label="test background")

    y,binEdges = np.histogram(test_sig[bdt_name],
                              bins=50,
                              density=True,
                              range=(low,high))
    bincenters = 0.5*(binEdges[1:]+binEdges[:-1])
    ax.plot(bincenters, y, 'o', color="red", label="test signal")
    ax.legend(loc=2)
    ax.set_xlabel("BDT output")
    ax.set_ylabel("Arbitrary units")
    fig.savefig("/tmp/overtraining_%s.pdf"%(bdt_name))
Example #19
def compute_ks_by_contained(contigs_by_lib_name, sinks, sources):
    # compute median of maxmin as well as ks p-value of contained maxmin
    for lib_snk in contigs_by_lib_name:
        # for a fixed lib_snk; do all source libs together
        # contained_ctg: contig names of all source libraries stored by source library names
        contained_ctg=collections.defaultdict(set)
        for snkCtg in contigs_by_lib_name[lib_snk].values():
            for srcCtg in snkCtg.contained_in:
                contained_ctg[srcCtg.lib].add(srcCtg.name)
        for lib_src in contigs_by_lib_name:
            if lib_src in contained_ctg:
                contained=[]
                not_contained=[]
                for ctg in contigs_by_lib_name[lib_src]:
                    if ctg in contained_ctg[lib_src]:
                        contained.append(contigs_by_lib_name[lib_src][ctg].maxmin)
                    else:
                        not_contained.append(contigs_by_lib_name[lib_src][ctg].maxmin)
 #               contained=[contigs_by_lib_name[lib_src][ctg].maxmin for ctg in contigs_by_lib_name[lib_src] if ctg in contained_ctg[lib_src]]
 #               not_contained=[contigs_by_lib_name[lib_src][ctg].maxmin for ctg in contigs_by_lib_name[lib_src] if ctg not in contained_ctg[lib_src]]
                ks_pvalue = stats.ks_2samp(contained, not_contained)[1]
                print(lib_src, lib_snk, ks_pvalue, sum(contained)/len(contained), sum(not_contained)/len(not_contained))
                if ks_pvalue < 0.05 and np.median(contained) > np.median(not_contained):
                    sources[lib_snk] |= {lib_src}
                    sinks[lib_src] |= {lib_snk}
Example #20
	def p_value_scoring_object_test(clf, X, y):
		"""
		p_value_scoring_object_test is a scoring callable that returns the negative p-value from the KS test on the prediction probabilities for the particle and antiparticle samples.
		"""
		print("Greeting : ", greeting)

		#Finding out the prediction probabilities
		prob_pred=clf.predict_proba(X)[:,1]
		#print(prob_pred)

		#This can be deleted if not using Keras
		#For Keras turn categorical y back to normal y
		if y.ndim==2:
			if y.shape[0]!=1 and y.shape[1]!=1:
				#Then we have a categorical vector
				y = y[:,1]

		#making sure the inputs are row vectors
		y         = np.reshape(y,(1,y.shape[0]))
		prob_pred = np.reshape(prob_pred,(1,prob_pred.shape[0]))

		#Separate prob into particle and antiparticle samples
		prob_0    = prob_pred[np.logical_or.reduce([y==0])]
		prob_1    = prob_pred[np.logical_or.reduce([y==1])]
		#if __debug__:
			#print("Plot")
		p_KS_stat=stats.ks_2samp(prob_0,prob_1)
		print(p_KS_stat)
		p_KS=-p_KS_stat[1]
		return p_KS
Example #21
	def calc_ks_test(self,true_distribution):
		if len(self.isize_list) >= 5:
			KS_statistic, self.pval = ks_2samp(self.isize_list, true_distribution)
			return KS_statistic, self.pval 
		else:
			self.pval = -1
			return -1, -1
Example #22
def compare_fixlens(samp_fixlen, fixlendist, eps=.000000001):
    nonan_samp_fixlen = samp_fixlen[np.logical_not(np.isnan(samp_fixlen))]
    nonan_fixlendist = fixlendist[np.logical_not(np.isnan(fixlendist))]
    print(nonan_samp_fixlen, nonan_fixlendist)
    ks, p = sts.ks_2samp(nonan_samp_fixlen, nonan_fixlendist)
    print(ks, p)
    return np.log(p + eps)
Example #23
def _get_xy_dataset_statistics(x_values, y_values, fcorrect_x_cutoff = 1.0, fcorrect_y_cutoff = 1.0, x_fuzzy_range = 0.1, y_scalar = 1.0):
    '''
    A function which takes two lists of values of equal length with corresponding entries and returns a dict containing
    a variety of metrics.
    :param x_values: A list of values for the X-axis (experimental values).
    :param y_values: A list of values for the Y-axis (predicted values).
    :param fcorrect_x_cutoff: See get_xy_dataset_statistics.
    :param fcorrect_y_cutoff: See get_xy_dataset_statistics.
    :param x_fuzzy_range: See get_xy_dataset_statistics.
    :param y_scalar: See get_xy_dataset_statistics.
    :return: A table of statistics.
    '''
    from scipy.stats import pearsonr, spearmanr, normaltest, ks_2samp, kstest, norm
    assert(len(x_values) == len(y_values))
    return dict(
        pearsonr = pearsonr(x_values, y_values),
        spearmanr = spearmanr(x_values, y_values),
        gamma_CC = gamma_CC(x_values, y_values),
        MAE = mae(x_values, y_values),
        normaltestx = normaltest(x_values),
        normaltesty = normaltest(y_values),
        kstestx = kstest(x_values, 'norm'),
        kstesty = kstest(y_values, 'norm'),
        ks_2samp = ks_2samp(x_values, y_values),
        fraction_correct = fraction_correct(x_values, y_values, x_cutoff = fcorrect_x_cutoff, y_cutoff = fcorrect_y_cutoff),
        fraction_correct_fuzzy_linear = fraction_correct_fuzzy_linear(x_values, y_values, x_cutoff = fcorrect_x_cutoff, x_fuzzy_range = x_fuzzy_range, y_scalar = y_scalar),
    )
Example #24
def calc_ks_stats(scores, exp_scores=None):
    from scipy import stats
    if exp_scores:
        (D, p_val) = stats.ks_2samp(scores, exp_scores)
    else:
        (D, p_val) = stats.kstest(scores, stats.uniform.cdf)
    return {'D':D, 'p_val':p_val}
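
A brief usage sketch with made-up scores (assumes the function above is in scope; it imports scipy's stats module itself):

import numpy as np

rng = np.random.default_rng(0)
scores = rng.uniform(size=500)
exp = rng.uniform(size=500).tolist()  # pass a list: the `if exp_scores:` truth test
                                      # is ambiguous for a NumPy array
print(calc_ks_stats(scores))          # one-sample test against the uniform CDF
print(calc_ks_stats(scores, exp))     # two-sample test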
Example #25
def get_ks(iso, self):
    for daynr in BUCKETS:
        #print(sorted(Counter(choose_day(iso.days, daynr).rows["duration_of_state"]).items()))
        #print(sorted(Counter(choose_day(self.days, daynr).rows["duration_of_state"]).items()))
        iso_das = choose_day(iso.days, daynr).rows["duration_of_state"]
        self_das = choose_day(self.days, daynr).rows["duration_of_state"]
        yield stats.ks_2samp(iso_das, self_das)
Example #26
def ks_statistic_calc(fund_ts_past, fund_ts_month):
    seq1 = deepcopy(fund_ts_past.values)
    seq2 = deepcopy(fund_ts_month.values)
    tsu.returnize0(seq1)
    tsu.returnize0(seq2)
    (ks, p) = scst.ks_2samp(seq1, seq2)
    return ks, p
Example #27
def sort_features(df1, df2):
    """
    Takes two dataframes and, for each column that appears in both,
    calculates a KS test between the two columns. Returns a list of
    column names, sorted by p-value in ascending order, and a list
    of corresponding p-values.

    Args:
        df1 (pd.DataFrame): Dataframe of feature columns for 'sample 1'
        df2 (pd.DataFrame): Dataframe of feature columns for 'sample 2'
    Returns ([str], [float]): Lists of column names and p-values.
    """

    common_cols = set.intersection(*[set(df.columns) for df in [df1, df2]])

    if len(common_cols) == 0:
        raise ValueError("The dataframes have no columns in common.")

    # calculate a KS-test for each feature column
    d = []
    p_vals = []
    for c in common_cols:
        this_d, this_p = ks_2samp(df1[c], df2[c])
        d.append(this_d)
        p_vals.append(this_p)

    # sort by p-value
    pc = list(zip(p_vals, common_cols))
    pc.sort()
    p_vals, common_cols = zip(*pc)

    # return sorted list of feature names and p-values
    return list(common_cols), list(p_vals)
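
For example (a minimal sketch; assumes pandas and numpy, and that ks_2samp is imported where sort_features is defined):

import numpy as np
import pandas as pd
from scipy.stats import ks_2samp

rng = np.random.default_rng(0)
df1 = pd.DataFrame({'x': rng.normal(0, 1, 500), 'y': rng.normal(0, 1, 500)})
df2 = pd.DataFrame({'x': rng.normal(0, 1, 500), 'y': rng.normal(2, 1, 500)})
cols, pvals = sort_features(df1, df2)
print(cols)  # 'y' first: its distributions differ, so its p-value is smallest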
Example #28
def ks_test(samples1, samples2, threshold=0.9):
    """Applies a KS test to determine if two sets of samples are the same.

    The ks test is applied parameter-by-parameter. If the two-tailed p-value
    returned by the test is greater than ``threshold``, the samples are
    considered to be the same.

    Parameters
    ----------
    samples1 : dict
        Dictionary mapping parameter names to the first set of samples.
    samples2 : dict
        Dictionary mapping parameter names to the second set of samples.
    threshold : float
        The threshold to use for the p-value. Default is 0.9.

    Returns
    -------
    dict :
        Dictionary mapping parameter names to booleans indicating whether the
        given parameter passes the KS test.
    """
    is_the_same = {}
    assert set(samples1.keys()) == set(samples2.keys()), (
        "samples1 and 2 must have the same parameters")
    # iterate over the parameters
    for param in samples1:
        s1 = samples1[param]
        s2 = samples2[param]
        _, p_value = ks_2samp(s1, s2)
        is_the_same[param] = p_value > threshold
    return is_the_same
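
Usage sketch (hypothetical parameter dictionaries; assumes ks_2samp is imported where ks_test is defined):

import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(0)
samples1 = {'mass': rng.normal(size=1000), 'spin': rng.uniform(size=1000)}
samples2 = {'mass': rng.normal(size=1000), 'spin': rng.uniform(size=1000)}
result = ks_test(samples1, samples2)
print(result)  # a parameter passes only when its KS p-value exceeds 0.9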
Example #29
def CDFDistance2(rho1, v1, rho2, v2, rho_min, rho_max):
    """
    For two input 2D signals calculate the "distance" between their CDFs - 
    averaged (over density bins) distance between two 1D CDFs of speed 
    calculated for specific density bin. 
    input: 
      - rho1: density array of size n
      - v1: speed array of size n
      - rho2: density array of size m
      - v2: speed array of size m
      - rho_min: lower boundary of density value considered (used for bins creation)
      - rho_max: upper boundary of density value considered (used for bins creation)
    output: 
      - KSD: a number between 0 and 1 (0 means identical speed CDFs in every bin)
    """
    nBins = 10  # it is not obvious which value to pick
    bins = np.linspace(rho_min, rho_max, nBins+1)
    dist1D = []

    for iBin in range(nBins):
        v1_b = v1[(rho1 >= bins[iBin]) * (rho1 <= bins[iBin+1])]
        v2_b = v2[(rho2 >= bins[iBin]) * (rho2 <= bins[iBin+1])]

        if (len(v1_b) > 0) and (len(v2_b) > 0):
            ks2stat, p = stats.ks_2samp(v1_b, v2_b)
            dist1D.append(ks2stat)

    KSD = np.sum(dist1D)/len(dist1D)
    return KSD
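
A usage sketch with synthetic density/speed data (assumes numpy as np and scipy's stats module are imported, as the function body expects):

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
rho1 = rng.uniform(0.0, 1.0, 2000)
rho2 = rng.uniform(0.0, 1.0, 2000)
v1 = 1.0 - rho1 + rng.normal(0.0, 0.05, 2000)  # similar rho-v relations ...
v2 = 1.0 - rho2 + rng.normal(0.0, 0.05, 2000)  # ... so the distance is near 0
print(CDFDistance2(rho1, v1, rho2, v2, 0.0, 1.0))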
Example #30
def do_ks_analysis(profiles, lens, name='', plot=False):
	L = np.array(0.446*(lens-np.mean(lens)), dtype='float64')
	n, bins = np.histogram(L, bins=2)
	idx_l = np.digitize(L, bins)
	pos_l = (idx_l == 1)
	r_list_l = np.where(pos_l)[0]
	lower_y = np.log(np.array(list(profiles[r_list_l]), dtype=float))
	# print lower_y.shape, L[r_list_l]
	pos_u = (idx_l > 1)
	r_list_u = np.where(pos_u)[0]
	upper_y = np.log(np.array(list(profiles[r_list_u]), dtype=float))
	# print upper_y.shape, L[r_list_u]
	if upper_y.shape[0] < 2 or lower_y.shape[0] < 2:
		return np.ones(profiles[0].shape[0])
	# KS two-sample test per position
	pval=[]
	for k in range(lower_y.shape[1]):
		try:
			_, p = stats.ks_2samp(lower_y[:,k], upper_y[:,k])
		except ValueError:
			p = 1
		pval.append(p)
	if plot:
		plot_ks_analysis(lower_y, upper_y, pval, name)
	pv = np.array(pval)
	return pv
Example #31
def ks(list_obs, list_poisson, sample_size, confidence, lamb):
    #D, p_value = ks_2samp(list_poisson, list_obs)
    D, p_value = ks_2samp(list_poisson, list_obs)

    a = "The Kolmogorov-Smirnov Test accept the null hypothesis"
    b = "The Kolmogorov-Smirnov Test reject the null hypothesis"

    global teste
    teste = ''

    if lamb > 10:
        teste = b
  
    D_critico = 0
    if confidence == 0.95:
        confidenceL = 1
    elif confidence == 0.99:
        confidenceL = 2
    else:
        confidenceL = 0
    
    s = sqrt(sample_size)
    
    table = np.matrix([[0.202, 0.214, 0.226, 0.237, 0.254],
                       [0.234, 0.242, 0.254, 0.265, 0.281],
                       [0.290, 0.300, 0.310, 0.324, 0.334],
                       [0.152, 0.166, 0.172, 0.179, 0.185],
                       [0.180, 0.188, 0.194, 0.199, 0.206],
                       [0.223, 0.234, 0.236, 0.243, 0.249],
                       [0.120, 0.132, 0.140, 0.144, 0.149],
                       [0.141, 0.151, 0.156, 0.160, 0.165],
                       [0.176, 0.185, 0.188, 0.195, 0.197],
                       [0.100, 0.112, 0.116, 0.120, 0.124],
                       [0.116, 0.125, 0.129, 0.134, 0.140],
                       [0.149, 0.154, 0.158, 0.160, 0.168],
                       [0.087, 0.097, 0.102, 0.106, 0.110],
                       [0.101, 0.108, 0.113, 0.118, 0.122],
                       [0.130, 0.135, 0.137, 0.143, 0.146],
                       [0.55/s, 0.61/s, 0.65/s, 0.67/s, 0.70/s],
                       [0.64/s, 0.69/s, 0.72/s, 0.75/s, 0.77/s],
                       [0.82/s, 0.86/s, 0.87/s, 0.90/s, 0.93/s]])
    
    if sample_size < 12:
        if confidenceL == 0:
            if lamb <=1:
                if D <= table[0,0]:
                    teste = a
                    D_critico = table[0,0]
                    
                else:
                    teste = b
                    D_critico = table[0,0]
            elif lamb > 1 and lamb <=2:
                if D <= table[0,1]:
                    teste = a
                    D_critico= table[0,1]
                    
                else:
                    teste = b
                    D_critico= table[0,1]
            elif lamb > 2 and lamb <=3:
                if D <= table[0,2]:
                    teste = a
                    D_critico= table[0,2]
                    
                else:
                    teste = b
                    D_critico= table[0,2]
            elif lamb > 3 and lamb <=5:
                if D <= table[0,3]:
                    teste = a
                    D_critico= table[0,3]
                    
                else:
                    teste = b
                    D_critico= table[0,3]
            elif lamb > 5 and lamb <=10:
                if D <= table[0,4]:
                    teste = a
                    D_critico= table[0,4]
                    
                else:
                    teste = b
                    D_critico= table[0,4]
            
#-----------------------------------------------------------   
        
        elif confidenceL == 1:
            if lamb <=1:
                if D <= table[1,0]:
                    teste = a
                    D_critico= table[1,0]
                    
                else:
                    teste = b
                    D_critico= table[1,0]
            elif lamb > 1 and lamb <=2:
                if D <= table[1,1]:
                    teste = a
                    D_critico= table[1,1]
                    
                else:
                    teste = b
                    D_critico= table[1,1]
            elif lamb > 2 and lamb <=3:
                if D <= table[1,2]:
                    teste = a
                    D_critico= table[1,2]
                else:
                    teste = b
                    D_critico= table[1,2]
            elif lamb > 3 and lamb <=5:
                if D <= table[1,3]:
                    teste = a
                    D_critico= table[1,3]
                    
                else:
                    teste = b
                    D_critico= table[1,3]
            elif lamb > 5 and lamb <=10:
                if D <= table[1,4]:
                    teste = a
                    D_critico= table[1,4]
                    
                else:
                    teste = b
                    D_critico= table[1,4]
#-----------------------------------------------------------
        elif confidenceL == 2:
            if lamb <=1:
                if D <= table[2,0]:
                    teste = a
                    D_critico= table[2,0]
                    
                else:
                    teste = b
                    D_critico= table[2,0]
            elif lamb > 1 and lamb <=2:
                if D <= table[2,1]:
                    teste = a
                    D_critico= table[2,1]
                else:
                    teste = b
                    D_critico= table[2,1]
            elif lamb > 2 and lamb <=3:
                if D <= table[2,2]:
                    teste = a
                    D_critico= table[2,2]
                    
                else:
                    teste = b
                    D_critico= table[2,2]
            elif lamb > 3 and lamb <=5:
                if D <= table[2,3]:
                    teste = a
                    D_critico= table[2,3]
                    
                else:
                    teste = b
                    D_critico= table[2,3]
            elif lamb > 5 and lamb <=10:
                if D <= table[2,4]:
                    teste = a
                    D_critico= table[2,4]
                    
                else:
                    teste = b
                    D_critico= table[2,4]
#-----------------------------------------------------------
    elif sample_size >=12 and sample_size  < 20:
        if confidenceL == 0:
            if lamb <=1:
                if D <= table[3,0]:
                    teste = a
                    D_critico= table[3,0]
                    
                else:
                    teste = b
                    D_critico= table[3,0]
            elif lamb > 1 and lamb <=2:
                if D <= table[3,1]:
                    teste = a
                    D_critico= table[3,1]

                else:
                    teste = b
                    D_critico= table[3,1]
            elif lamb > 2 and lamb <=3:
                if D <= table[3,2]:
                    teste = a
                    D_critico= table[3,2]

                else:
                    teste = b
                    D_critico= table[3,2]
            elif lamb > 3 and lamb <=5:
                if D <= table[3,3]:
                    teste = a
                    D_critico= table[3,3]

                else:
                    teste = b
                    D_critico= table[3,3]
            elif lamb > 5 and lamb <=10:
                if D <= table[3,4]:
                    teste = a
                    D_critico= table[3,4]

                else:
                    teste = b
                    D_critico= table[3,4]
            
#-----------------------------------------------------------
        
        elif confidenceL == 1:
            if lamb <=1:
                if D <= table[4,0]:
                    teste = a
                    D_critico= table[4,0]

                else:
                    teste = b
                    D_critico= table[4,0]
            elif lamb > 1 and lamb <=2:
                if D <= table[4,1]:
                    teste = a
                    D_critico= table[4,1]

                else:
                    teste = b
                    D_critico= table[4,1]
            elif lamb > 2 and lamb <=3:
                if D <= table[4,2]:
                    teste = a
                    D_critico= table[4,2]

                else:
                    teste = b
                    D_critico= table[4,2]
            elif lamb > 3 and lamb <=5:
                if D <= table[4,3]:
                    teste = a
                    D_critico= table[4,3]
                    
                else:
                    teste = b
                    D_critico= table[4,3]
            elif lamb > 5 and lamb <=10:
                if D <= table[4,4]:
                    teste = a
                    D_critico= table[4,4]
                    
                else:
                    teste = b
                    D_critico= table[4,4]
#-----------------------------------------------------------           
        elif confidenceL == 2:
            if lamb <=1:
                if D <= table[5,0]:
                    teste = a
                    D_critico= table[5,0]
                    
                else:
                    teste = b
                    D_critico= table[5,0]
            elif lamb > 1 and lamb <=2:
                if D <= table[5,1]:
                    teste = a
                    D_critico= table[5,1]
                    
                else:
                    teste = b
                    D_critico= table[5,1]
            elif lamb > 2 and lamb <=3:
                if D <= table[5,2]:
                    teste = a
                    D_critico= table[5,2]
                    
                else:
                    teste = b
                    D_critico= table[5,2]
            elif lamb > 3 and lamb <=5:
                if D <= table[5,3]:
                    teste = a
                    D_critico= table[5,3]
                    
                else:
                    teste = b
                    D_critico= table[5,3]
            elif lamb > 5 and lamb <=10:
                if D <= table[5,4]:
                    teste = a
                    D_critico= table[5,4]
                    
                else:
                    teste = b
                    D_critico= table[5,4]
#-----------------------------------------------------------
    elif sample_size  >= 20 and sample_size < 30:
    
        if confidenceL == 0:
            if lamb <=1:
                if D <= table[6,0]:
                    teste = a
                    D_critico= table[6,0]
                    
                else:
                    teste = b
                    D_critico= table[6,0]
            elif lamb > 1 and lamb <=2:
                if D <= table[6,1]:
                    teste = a
                    D_critico= table[6,1]
                    
                else:
                    teste = b
                    D_critico= table[6,1]
            elif lamb > 2 and lamb <=3:
                if D <= table[6,2]:
                    teste = a
                    D_critico= table[6,2]

                else:
                    teste = b
                    D_critico= table[6,2]
            elif lamb > 3 and lamb <=5:
                if D <= table[6,3]:
                    teste = a
                    D_critico= table[6,3]

                else:
                    teste = b
                    D_critico= table[6,3]
            elif lamb > 5 and lamb <=10:
                if D <= table[6,4]:
                    teste = a
                    D_critico= table[6,4]

                else:
                    teste = b
                    D_critico= table[6,4]
#-----------------------------------------------------------
        
        
        elif confidenceL == 1:
            if lamb <=1:
                if D <= table[7,0]:
                    teste = a
                    D_critico= table[7,0]

                else:
                    teste = b
                    D_critico= table[7,0]
            elif lamb > 1 and lamb <=2:
                if D <= table[7,1]:
                    teste = a
                    D_critico= table[7,1]

                else:
                    teste = b
                    D_critico= table[7,1]
            elif lamb > 2 and lamb <=3:
                if D <= table[7,2]:
                    teste = a
                    D_critico= table[7,2]

                else:
                    teste = b
                    D_critico= table[7,2]
            elif lamb > 3 and lamb <=5:
                if D <= table[7,3]:
                    teste = a
                    D_critico= table[7,3]

                else:
                    teste = b
                    D_critico= table[7,3]
            elif lamb > 5 and lamb <=10:
                if D <= table[7,4]:
                    teste = a
                    D_critico= table[7,4]

                else:
                    teste = b
                    D_critico= table[7,4]
#-----------------------------------------------------------
        elif confidenceL == 2:
            if lamb <=1:
                if D <= table[8,0]:
                    teste = a
                    D_critico= table[8,0]

                else:
                    teste = b
                    D_critico= table[8,0]
            elif lamb > 1 and lamb <=2:
                if D <= table[8,1]:
                    teste = a
                    D_critico= table[8,1]

                else:
                    teste = b
                    D_critico= table[8,1]
            elif lamb > 2 and lamb <=3:
                if D <= table[8,2]:
                    teste = a
                    D_critico= table[8,2]

                else:
                    teste = b
                    D_critico= table[8,2]
            elif lamb > 3 and lamb <=5:
                if D <= table[8,3]:
                    teste = a
                    D_critico= table[8,3]
                    
                else:
                    teste = b
                    D_critico= table[8,3]
            elif lamb > 5 and lamb <=10:
                if D <= table[8,4]:
                    teste = a
                    D_critico= table[8,4]
                    
                else:
                    teste = b
                    D_critico= table[8,4]
        #-----------------------------------------------------------
    elif sample_size >= 30 and sample_size < 40:
    
        if confidenceL == 0:
            if lamb <=1:
                if D <= table[9,0]:
                    teste = a
                    D_critico= table[9,0]
                else:
                    teste = b
                    D_critico= table[9,0]
            elif lamb > 1 and lamb <=2:
                if D <= table[9,1]:
                    teste = a
                    D_critico= table[9,1]
                    
                else:
                    teste = b
                    D_critico= table[9,1]
            elif lamb > 2 and lamb <=3:
                if D <= table[9,2]:
                    teste = a
                    D_critico= table[9,2]
                    
                else:
                    teste = b
                    D_critico= table[9,2]
            elif lamb > 3 and lamb <=5:
                if D <= table[9,3]:
                    teste = a
                    D_critico= table[9,3]
                    
                else:
                    teste = b
                    D_critico= table[9,3]
            elif lamb > 5 and lamb <=10:
                if D <= table[9,4]:
                    teste = a
                    D_critico= table[9,4]
                else:
                    teste = b
                    D_critico= table[9,4]
            
#-----------------------------------------------------------
        
        elif confidenceL == 1:
            if lamb <=1:
                if D <= table[10,0]:
                    teste = a
                    D_critico= table[10,0]
                    
                else:
                    teste = b
                    D_critico= table[10,0]
            elif lamb > 1 and lamb <=2:
                if D <= table[10,1]:
                    teste = a
                    D_critico= table[10,1]
                    
                else:
                    teste = b
                    D_critico= table[10,1]
            elif lamb > 2 and lamb <=3:
                if D <= table[10,2]:
                    teste = a
                    D_critico= table[10,2]
                    
                else:
                    teste = b
                    D_critico= table[10,2]
            elif lamb > 3 and lamb <=5:
                if D <= table[10,3]:
                    teste = a
                    D_critico= table[10,3]
                    
                else:
                    teste = b
                    D_critico= table[10,3]
            elif lamb > 5 and lamb <=10:
                if D <= table[10,4]:
                    teste = a
                    D_critico= table[10,4]
                    
                else:
                    teste = b
                    D_critico= table[10,4]
#-----------------------------------------------------------
        elif confidenceL == 2:
            if lamb <=1:
                if D <= table[11,0]:
                    teste = a
                    D_critico= table[11,0]
                    
                else:
                    teste = b
                    D_critico= table[11,0]
            elif lamb > 1 and lamb <=2:
                if D <= table[11,1]:
                    teste = a
                    D_critico= table[11,1]
                    
                else:
                    teste = b
                    D_critico= table[11,1]
            elif lamb > 2 and lamb <=3:
                if D <= table[11,2]:
                    teste = a
                    D_critico= table[11,2]
                    
                else:
                    teste = b
                    D_critico= table[11,2]
            elif lamb > 3 and lamb <=5:
                if D <= table[11,3]:
                    teste = a
                    D_critico= table[11,3]
                    
                else:
                    teste = b
                    D_critico= table[11,3]
            elif lamb > 5 and lamb <=10:
                if D <= table[11,4]:
                    teste = a
                    D_critico= table[11,4]
                    
                else:
                    teste = b
                    D_critico= table[11,4]
#-----------------------------------------------------------
    
    elif sample_size == 40:

        if confidenceL == 0:
            if lamb <=1:
                if D <= table[12,0]:
                    teste = a
                    D_critico= table[12,0]

                else:
                    teste = b
                    D_critico= table[12,0]
            elif lamb > 1 and lamb <=2:
                if D <= table[12,1]:
                    teste = a
                    D_critico= table[12,1]

                else:
                    teste = b
                    D_critico= table[12,1]
            elif lamb > 2 and lamb <=3:
                if D <= table[12,2]:
                    teste = a
                    D_critico= table[12,2]

                else:
                    teste = b
                    D_critico= table[12,2]
            elif lamb > 3 and lamb <=5:
                if D <= table[12,3]:
                    teste = a
                    D_critico= table[12,3]

                else:
                    teste = b
                    D_critico= table[12,3]
            elif lamb > 5 and lamb <=10:
                if D <= table[12,4]:
                    teste = a
                    D_critico= table[12,4]

                else:
                    teste = b
                    D_critico= table[12,4]
            
#-----------------------------------------------------------
        
        elif confidenceL == 1:
            if lamb <=1:
                if D <= table[13,0]:
                    teste = a
                    D_critico= table[13,0]

                else:
                    teste = b
                    D_critico= table[13,0]
            elif lamb > 1 and lamb <=2:
                if D <= table[13,1]:
                    teste = a
                    D_critico= table[13,1]

                else:
                    teste = b
                    D_critico= table[13,1]
            elif lamb > 2 and lamb <=3:
                if D <= table[13,2]:
                    teste = a
                    D_critico= table[13,2]

                else:
                    teste = b
                    D_critico= table[13,2]
            elif lamb > 3 and lamb <=5:
                if D <= table[13,3]:
                    teste = a
                    D_critico= table[13,3]

                else:
                    teste = b
                    D_critico= table[13,3]
            elif lamb > 5 and lamb <=10:
                if D <= table[13,4]:
                    teste = a
                    D_critico= table[13,4]

                else:
                    teste = b
                    D_critico= table[13,4]
#-----------------------------------------------------------
        elif confidenceL == 2:
            if lamb <=1:
                if D <= table[14,0]:
                    teste = a
                    D_critico= table[14,0]

                else:
                    teste = b
                    D_critico= table[14,0]
            elif lamb > 1 and lamb <=2:
                if D <= table[14,1]:
                    teste = a
                    D_critico= table[14,1]

                else:
                    teste = b
                    D_critico= table[14,1]
            elif lamb > 2 and lamb <=3:
                if D <= table[14,2]:
                    teste = a
                    D_critico= table[14,2]

                else:
                    teste = b
                    D_critico= table[14,2]
            elif lamb > 3 and lamb <=5:
                if D <= table[14,3]:
                    teste = a
                    D_critico= table[14,3]

                else:
                    teste = b
                    D_critico= table[14,3]
            elif lamb > 5 and lamb <=10:
                if D <= table[14,4]:
                    teste = a
                    D_critico= table[14,4]

                else:
                    teste = b    
                    D_critico= table[14,4]
#-----------------------------------------------------------                    
    elif sample_size > 40:

        if confidenceL == 0:
            if lamb <=1:
                if D <= table[15,0]:
                    teste = a
                    D_critico= table[15,0]
                   
                else:
                    teste = b
                    D_critico= table[15,0]
            elif lamb > 1 and lamb <=2:
                if D <= table[15,1]:
                    teste = a
                    D_critico= table[15,1]
                   
                else:
                    teste = b
                    D_critico= table[15,1]
            elif lamb > 2 and lamb <=3:
                if D <= table[15,2]:
                    teste = a
                    D_critico= table[15,2]

                else:
                    teste = b
                    D_critico= table[15,2]
            elif lamb > 3 and lamb <=5:
                if D <= table[15,3]:
                    teste = a
                    D_critico= table[15,3]

                else:
                    teste = b
                    D_critico= table[15,3]
            elif lamb > 5 and lamb <=10:
                if D <= table[15,4]:
                    teste = a
                    D_critico= table[15,4]

                else:
                    teste = b
                    D_critico= table[15,4]
            
#-----------------------------------------------------------
        
        elif confidenceL == 1:
            if lamb <=1:
                if D <= table[16,0]:
                    teste = a
                    D_critico= table[16,0]

                else:
                    teste = b
                    D_critico= table[16,0]
            elif lamb > 1 and lamb <=2:
                if D <= table[16,1]:
                    teste = a
                    D_critico= table[16,1]

                else:
                    teste = b
                    D_critico= table[16,1]
            elif lamb > 2 and lamb <=3:
                if D <= table[16,2]:
                    teste = a
                    D_critico= table[16,2]

                else:
                    teste = b
                    D_critico= table[16,2]
            elif lamb > 3 and lamb <=5:
                if D <= table[16,3]:
                    teste = a
                    D_critico= table[16,3]

                else:
                    teste = b
                    D_critico= table[16,3]
            elif lamb > 5 and lamb <=10:
                if D <= table[16,4]:
                    teste = a
                    D_critico= table[16,4]

                else:
                    teste = b
                    D_critico= table[16,4]
#-----------------------------------------------------------
        elif confidenceL == 2:
            if lamb <=1:
                if D <= table[17,0]:
                    teste = a
                    D_critico= table[17,0]

                else:
                    teste = b
                    D_critico= table[17,0]
            elif lamb > 1 and lamb <=2:
                if D <= table[17,1]:
                    teste = a
                    D_critico= table[17,1]

                else:
                    teste = b
                    D_critico= table[17,1]
            elif lamb > 2 and lamb <=3:
                if D <= table[17,2]:
                    teste = a
                    D_critico= table[17,2]

                else:
                    teste = b
                    D_critico= table[17,2]
            elif lamb > 3 and lamb <=5:
                if D <= table[17,3]:
                    teste = a
                    D_critico= table[17,3]

                else:
                    teste = b
                    D_critico= table[17,3]
            elif lamb > 5 and lamb <=10:
                if D <= table[17,4]:
                    teste = a
                    D_critico= table[17,4]

                else:
                    teste = b
                    D_critico= table[17,4]    
    return D, teste, D_critico
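
# A hedged refactor sketch (added; not the original author's code): the cascade
# above implements a table lookup, D_critico = table[row, col], where the row is
# a sample-size base plus confidenceL (base 15 for sample_size > 40 in the code
# above; base_row is kept as a parameter because the other bases live in code
# not shown) and the column is the lambda bucket. numpy.searchsorted collapses
# the elif chains; a and b are the same accept/reject labels used above, and
# lamb values above 10 fall outside the table, as in the original.
import numpy as np

def lookup_ks_decision(D, table, base_row, confidenceL, lamb, a, b):
    # lambda buckets: (-inf, 1], (1, 2], (2, 3], (3, 5], (5, 10]
    col = int(np.searchsorted([1, 2, 3, 5], lamb, side='left'))
    D_critico = table[base_row + confidenceL, col]
    teste = a if D <= D_critico else b
    return D, teste, D_critico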
    def calculate(self, reference_data: pd.DataFrame, production_data: pd.DataFrame, column_mapping): 
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [name for name in num_feature_names if is_numeric_dtype(reference_data[name])] 

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [name for name in cat_feature_names if is_numeric_dtype(reference_data[name])] 
        
        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [date_column, id_column, target_column, prediction_column]

            num_feature_names = list(set(reference_data.select_dtypes([np.number]).columns) - set(utility_columns))
            cat_feature_names = list(set(reference_data.select_dtypes([object]).columns) - set(utility_columns))

        if prediction_column is not None:
            #calculate output drift
            pred_p_value = ks_2samp(reference_data[prediction_column], production_data[prediction_column])[1]
            pred_sim_test = "detected" if pred_p_value < 0.05 else "not detected"

            #plot output distributions
            pred_distr = ff.create_distplot(
                [reference_data[prediction_column], production_data[prediction_column]], 
                ["Reference", "Production"],  
                colors=[grey, red],
                show_rug=True)

            pred_distr.update_layout(
                xaxis_title = "Value",
                yaxis_title = "Share",
                legend = dict(
                orientation="h",
                yanchor="bottom",
                y=1.02,
                xanchor="right",
                x=1
                )
            )

            pred_drift_json  = json.loads(pred_distr.to_json())

            self.wi = BaseWidgetInfo(
                title="Prediction Drift: " + pred_sim_test + ", p_value=" + str(round(pred_p_value, 6)),
                type="big_graph",
                details="",
                alertStats=AlertStats(),
                alerts=[],
                alertsPosition="row",
                insights=[],
                size=2,
                params={
                    "data": pred_drift_json['data'],
                    "layout": pred_drift_json['layout']
                },
                additionalGraphs=[],
            )
        else:
            self.wi = None
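
# Minimal self-contained sketch (added) of the KS-based drift decision used in
# calculate() above; the 0.05 threshold mirrors the snippet, while the function
# name and the synthetic arrays are illustrative assumptions.
import numpy as np
from scipy.stats import ks_2samp

def prediction_drift_detected(reference_pred, production_pred, alpha=0.05):
    # drift is declared when the KS test rejects "same distribution" at level alpha
    p_value = ks_2samp(reference_pred, production_pred)[1]
    return p_value < alpha

# hypothetical data: a 0.3 shift in the mean is usually flagged at n = 1000
ref = np.random.normal(0.0, 1.0, 1000)
prod = np.random.normal(0.3, 1.0, 1000)
print(prediction_drift_detected(ref, prod))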
Ejemplo n.º 33
def compare_two_root_files(file1, file2, tolerance=0.02):
    """Compare two ROOT(.cern.ch) files and return dictionary of comparison."""
    comparison = {}
    # content1 = dict((key, value) for (key, value) in walk(file1))
    # content2 = dict((key, value) for (key, value) in walk(file2))
    keys1 = set(recursive_keys(file1))
    keys2 = set(recursive_keys(file2))
    all_keys = sorted(keys1 | keys2)
    print(f'Testing {len(all_keys)} distributions')
    # for name in tqdm(all_keys): # tqdm does not work well inside CI
    for name in all_keys:
        comparison[name] = {}
        status = FAILED
        evaluationValue, ks_statistic, pvalue = 0, 0, 0
        diff = np.array([])
        reason = ''

        evaluationFunc = maxRelativeDifference
        cut = 'value <= {}'.format(tolerance)

        try:
            value1 = load_value(name, file1, keys1)
            value2 = load_value(name, file2, keys2)
        except TypeError as e:
            yield name, dict(
                status=WARNING,
                reason=str(e),
                original=None,
                reference=None,
                diff=None,
            )
            continue
        try:
            v1_size = np.size(value1)
            v2_size = np.size(value2)
        except Exception:
            reason = f'Cannot handle {name} due to issues with value.size'
            yield name, dict(status=WARNING, reason=reason)
            continue

        if value1 is None or value2 is None:
            status = UNKNOWN
            reason = 'Cannot convert data to numpy array'
        elif len(value1) == 0 and len(value2) == 0:
            status = SUCCESS
            pvalue = 1
        elif (v1_size == 0 and v2_size > 0) or (v1_size > 0 and v2_size == 0):
            status = FAILED
            reason = 'original file is empty' if v1_size == 0 else 'reference file is empty'
            diff = value1 if v1_size > 0 else value2
        else:
            ks_statistic, pvalue = stats.ks_2samp(ak.to_numpy(value2),
                                                  ak.to_numpy(value1))

            try:
                diff = difference(value2, value1)
                evaluationValue = evaluationFunc(value1, value2)
                status = evaluateStatus(value1, value2, evaluationFunc, cut)
                if status == FAILED:
                    reason = f'evaluationFunc value {evaluationValue} failed the cut "{cut}"'
            except Exception as e:
                reason = str(e)
                status = UNKNOWN
        yield name, dict(
            status=status,
            original=value1,
            reference=value2,
            diff=diff,
            evaluationValue=evaluationValue,
            ks_statistic=ks_statistic,
            pvalue=pvalue,
            reason=reason,
        )
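
# Usage sketch (added): compare_two_root_files is a generator, so iterate it to
# collect per-distribution verdicts. Opening the files with uproot is an
# assumption consistent with the awkward/numpy calls above; 'a.root' and
# 'b.root' are hypothetical paths, and FAILED is the status constant used above.
import uproot

with uproot.open('a.root') as f1, uproot.open('b.root') as f2:
    results = dict(compare_two_root_files(f1, f2, tolerance=0.02))
failed = [name for name, r in results.items() if r['status'] == FAILED]
print(f'{len(failed)} of {len(results)} distributions failed')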
Ejemplo n.º 34
def get_index_date_ad_save_together():
    # Read the source file with previously separated lines
    file_one = open('separate_lines.csv', 'r')

    mjd_one = []
    yyyymmdd = []
    vlr_s1 = []
    sig_s1 = []

    with file_one as f1:
        for line in f1:
            splits = line[:-1].split(',')
            mjd_one.append(splits[0])
            yyyymmdd.append(splits[1])
            vlr_s1.append(float(splits[2]))
            sig_s1.append(float(splits[3]))
    file_one.close()
    """
    # Read the source file with interpolated data 
    """
    file_two = open('file-interpolated-rounded.csv', 'r')
    mjd_two = []
    vlr_s2 = []
    sig_s2 = []

    file_two.readline()  # skip 1st line
    #file_two.readline() # skip 2nd line
    with file_two as f2:
        for line in f2:
            splits = line[:-1].split(',')
            mjd_two.append(splits[0])
            vlr_s2.append(float(splits[2]))
            sig_s2.append(float(splits[3]))
    file_two.close()
    """
     Compare
    """
    mjd = []
    dts = []
    fd1 = []
    er1 = []
    fd2 = []
    er2 = []

    for c in range(len(mjd_one)):
        for x in range(len(mjd_two)):
            if mjd_one[c] == mjd_two[x]:
                mjd.append(mjd_one[c])
                dts.append(yyyymmdd[c])
                fd1.append(float(vlr_s1[c]))
                er1.append(float(sig_s1[c]))
                fd2.append(float(vlr_s2[x]))
                er2.append(float(sig_s2[x]))
                #print('{0},{1},{2} {3:5.2f},{4:5.2f},{5:5.2f},{6:5.2f}'.format(mjd_one[c],mjd_two[x],yyyymmdd[c],vlr_s1[c],sig_s1[c],vlr_s2[x],sig_s2[x]))
    """
    # Write list
    
    """
    outf = open('final_to_evaluate_with_k-s-test.csv', 'w')
    outf.write('mjd,date,Soriginal,sigSoriginal,Scalc,sigScalc\n')
    for c in range(len(mjd)):
        outf.write('{0},{1},{2:5.2f},{3:5.2f},{4:5.2f},{5:5.2f}\n'.format(
            mjd[c], dts[c], fd1[c], er1[c], fd2[c], er2[c]))

    outf.close()
    """
     Kolmogorov-Smirnov Test
    """
    result_ks = ks_2samp(fd1, fd2)
    print(result_ks)

    fne = open('k-s_test.txt', 'w')
    fne.write('# Kolmogorov-Smirnov Test\n')
    fne.write('# statistic={0} pvalue={1}\n'.format(result_ks[0],
                                                    result_ks[1]))
    fne.close()

    return
lr = LogisticRegression(epoch=20, solver='NM', learning_rate=0.001, threshold=1e-4)
lr.fit(trainX, trainY)
#===========================================================================
# test
#===========================================================================
y_pro, y_pre = lr.predict(testX)

#===========================================================================
# evaluation
#===========================================================================
tn, fp, fn, tp = confusion_matrix(y_true=testY, y_pred=y_pre).ravel()
print('accuracy:', (tp + tn) / (tn + fp + fn + tp))
print('recall:', tp / (tp + fn))
print('precision:', tp / (tp + fp))
print('auc:', roc_auc_score(y_true=testY, y_score=y_pro))
# KS statistic between the score distributions of positives and negatives
get_ks = lambda y_pred, y_true: ks_2samp(y_pred[y_true == 1], y_pred[y_true != 1]).statistic
print('ks:', get_ks(y_pro, testY))

# plot ROC
fpr, tpr, thresholds = roc_curve(y_true=testY, y_score=y_pro)
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# Somers' D concordance statistics
pr_0 = []
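
# Added note: the Somers' D computation is cut off in this snippet; for a binary
# target it reduces to Gini = 2*AUC - 1, so a one-line equivalent using the
# arrays already defined above is:
somers_d = 2 * roc_auc_score(y_true=testY, y_score=y_pro) - 1
print('somers_d:', somers_d)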
plt.xlabel('Time', fontsize=16, fontweight='bold')
plt.ylabel('Displacement', fontsize=16, fontweight='bold')
plt.legend(loc=0, fontsize=14)

plt.ylim(-42, 75)

ax = plt.gca()
PlotStyle(ax, '')

###############################################################################
#                    Residuals Statistical test
###############################################################################

ObRes = [signal - model for signal, model in zip(WhiteSignal, FitSolutionA)]

KS = stats.ks_2samp(ObRes, WhiteNoise)

print(KS)

###############################################################################
#                              ODE system  solving
###############################################################################

SolverTime = np.linspace(0, 20, num=120)

#Model B Parameters
k1 = 0.3
k2 = 0.25
k3 = 0.1

# variance, standard deviation, mean, median, etc
from statistics import variance, stdev, mean, median
lst = [1, 2, 3, 4]
my_variance = variance(lst)
my_sd = stdev(lst)
my_mean = mean(lst)
my_median = median(lst)

# normalize a list into [0,1]
## Method 1 - min, max
max_num = max(lst)
min_num = min(lst)
normalized_lst = [(x - min_num) / (max_num - min_num) for x in lst]

## Method 2 - standardize to zero mean and unit variance (z-scores)
mean_num = mean(lst)
my_sd = stdev(lst)
normalized_lst = [(x - mean_num) / my_sd for x in lst]

# Compare 2 curves
## Method 1 - Kolmogorov–Smirnov test
from scipy.stats import ks_2samp
from numpy import array

# num_lst1/num_lst2 are placeholder samples; substitute the curves to compare
num_lst1 = array([1.0, 2.0, 3.0, 4.0, 5.0])
num_lst2 = array([1.1, 2.1, 2.9, 4.2, 5.3])
ks_stat, p_value = ks_2samp(num_lst1, num_lst2)
# If the p-value is below the chosen threshold, reject the null hypothesis:
# the two curves are unlikely to come from the same distribution.
df_list = list()


# simulation loop
for sample_size in sample_sizes:
    df = pd.DataFrame(data=means, columns=["mean_data"])
    
    print("Simulating data for sample size {}".format(sample_size))

    for i in range(num_iterations):
        ks_results = list()
        
        # calculate the KS test p-value
        for mean in means:
            s0 = np.random.normal(loc=0, scale=1, size=sample_size)
            s_test = np.random.normal(loc=mean, scale=1, size=sample_size)
            ks_results.append(stats.ks_2samp(s0, s_test)[1])
            
        # add results to dataframe
        df["iter_"+str(i)] = ks_results
        
    # calculate mean values across simulation
    df["mean_res"] = df[df.columns[1:]].mean(axis=1)
    df["std_res"] = df[df.columns[1:]].std(axis=1)

    # append results to dataframe list
    df_list.append(df)

# saving data to files
for df, sample_size in zip(df_list, sample_sizes):
    df.to_csv(str(destination_path) + "/size_"+str(sample_size), header=True, index=False)
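
# Compact check (added sketch, hypothetical shift and sizes) of what the loop
# above estimates: the fraction of KS p-values below 0.05 approximates the
# test's power to detect a given mean shift at a given sample size.
power = np.mean([stats.ks_2samp(np.random.normal(0, 1, 100),
                                np.random.normal(0.5, 1, 100))[1] < 0.05
                 for _ in range(200)])
print("approximate power:", power)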
Ejemplo n.º 39
stats.scoreatpercentile(generated, 95)
# percentile rank of a given value
stats.percentileofscore(generated, 1)

# histogram of the distribution
import matplotlib.pyplot as plt
plt.hist(generated)
plt.show()
# test on the means
import numpy as np
price = get_price(['000001.XSHE', '601398.XSHG'], start_date='2016-01-01', end_date='2017-01-01', fields='close')
price_001 = np.diff(np.log(np.array(price['000001.XSHE'])))
price_398 = np.diff(np.log(np.array(price['601398.XSHG'])))

# Kolmogorov-Smirnov test
stats.ks_2samp(price_001, price_398)
# Jarque-Bera normality test
stats.jarque_bera(price_001 - price_398)[-1]

# signal processing

# test for a linear trend in the stock price
from datetime import date, datetime, time
from scipy import signal
import pandas as pd
from matplotlib.dates import DateFormatter
from matplotlib.dates import DayLocator
from matplotlib.dates import MonthLocator
price = get_price(['000001.XSHE', '601398.XSHG'], start_date='2016-01-01', end_date='2017-01-01', fields='close')
y = signal.detrend(price)
Ejemplo n.º 40
plt.fill_between(xpoints,
                 ypoints + errors,
                 ypoints - errors,
                 facecolor='green',
                 alpha=0.4,
                 label='error')
plt.grid()
plt.legend(loc='lower right')
plt.savefig('BDT_roccurve.png')
#print('ROCS', Rocs)
"""
#plot overtraining graph

plt.figure(200)
plt.xlabel('Ratio of data used to train')
plt.ylabel('Accuracy of BDT')
#plt.title('Graph to study overtraining')
plt.plot(np.arange(0.01,1,0.01), accuraciestt[0], label='accuracies for testing')
plt.plot(np.arange(0.01,1,0.01), accuraciestt[1], label='accuracies for training')
plt.legend()
plt.savefig('Accuracy of BDT')"""

from scipy.stats import ks_2samp

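# Added note: this is the standard BDT overtraining check - compare the
# classifier response on what appear to be the training and testing samples
# for each class. ks_2samp returns (statistic, p-value), so Gluon_KS and
# Quark_KS below are p-values; small values indicate the responses differ.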
a, Gluon_KS = ks_2samp(probs[0][0], probs[1][0])
a, Quark_KS = ks_2samp(probs[0][1], probs[1][1])
print(Gluon_KS, Quark_KS)

print('acc', accuraciestt)
plt.show()
Ejemplo n.º 41
def main():
    # assuming 'theFile' contains one name per line, read the file

    if getpass.getuser() == 'frenchd':

        #         pickleFilename = '/Users/frenchd/Research/inclination/git_inclination/picklePilot_plusSALT_14.p'
        #         gtPickleFilename = '/Users/frenchd/Research/inclination/git_inclination/pickleGT.p'
        #         saveDirectory = '/Users/frenchd/Research/inclination/git_inclination/plotting_code/figs'

        #         gtPickleFilename = '/Users/frenchd/Research/inclination/git_inclination/pickleGT_filteredAll.p'
        gtPickleFilename = '/Users/frenchd/Research/GT_update2/pickleGT_filteredAll.p'

        saveDirectory = '/Users/frenchd/Research/inclination/git_inclination/plotting_code/figs/'

        isolated_filename = '/Users/frenchd/Research/inclination/git_inclination/isolated4.p'
        L_isolated_filename = '/Users/frenchd/Research/inclination/git_inclination/L_isolated4.p'
        L_associated_isolated_filename = '/Users/frenchd/Research/inclination/git_inclination/L_associated_isolated4.p'
        L_associated_filename = '/Users/frenchd/Research/inclination/git_inclination/L_associated4.p'
        L_nonassociated_filename = '/Users/frenchd/Research/inclination/git_inclination/L_nonassociated4.p'
        L_two_filename = '/Users/frenchd/Research/inclination/git_inclination/L_two4.p'
        L_two_plus_filename = '/Users/frenchd/Research/inclination/git_inclination/L_two_plus4.p'
        L_group_filename = '/Users/frenchd/Research/inclination/git_inclination/L_group4.p'

    else:
        print 'Could not determine username. Exiting.'
        sys.exit()

    # pickle file for the whole galaxy table:
    gtPickleFile = open(gtPickleFilename, 'rU')
    gtDict = pickle.load(gtPickleFile)
    gtPickleFile.close()

    # open all the pickle files
    isolated_file = open(isolated_filename, 'r')
    L_isolated_file = open(L_isolated_filename, 'r')
    L_associated_isolated_file = open(L_associated_isolated_filename, 'r')
    L_associated_file = open(L_associated_filename, 'r')
    L_nonassociated_file = open(L_nonassociated_filename, 'r')
    L_two_file = open(L_two_filename, 'r')
    L_two_plus_file = open(L_two_plus_filename, 'r')
    L_group_file = open(L_group_filename, 'r')

    # unload the data from them
    isolated = pickle.load(isolated_file)
    L_isolated = pickle.load(L_isolated_file)
    L_associated_isolated = pickle.load(L_associated_isolated_file)
    L_associated = pickle.load(L_associated_file)
    L_nonassociated = pickle.load(L_nonassociated_file)
    L_two = pickle.load(L_two_file)
    L_two_plus = pickle.load(L_two_plus_file)
    L_group = pickle.load(L_group_file)

    # close the files
    isolated_file.close()
    L_isolated_file.close()
    L_associated_isolated_file.close()
    L_associated_file.close()
    L_nonassociated_file.close()
    L_two_file.close()
    L_two_plus_file.close()
    L_group_file.close()

    # if match, then the includes in the file have to MATCH the includes above. e.g., if
    # virInclude = False, cusInclude = True, finalInclude = False, then only systems
    # matching those three would be included. Otherwise, all cusInclude = True would be included
    # regardless of the others

    dataSet = L_associated_isolated

    Lya_vs = dataSet['Lya_vs']
    e_Lya_vs = dataSet['e_Lya_vs']
    Lya_Ws = dataSet['Lya_Ws']
    e_Lya_Ws = dataSet['e_Lya_Ws']
    Nas = dataSet['Nas']
    e_Nas = dataSet['e_Nas']
    bs = dataSet['bs']
    e_bs = dataSet['e_bs']
    Ws = dataSet['Ws']
    e_Ws = dataSet['e_Ws']
    targets = dataSet['targets']
    z_targets = dataSet['z_targets']
    RA_targets = dataSet['RA_targets']
    Dec_targets = dataSet['Dec_targets']
    Names = dataSet['Names']
    RA_galaxies = dataSet['RA_galaxies']
    Dec_galaxies = dataSet['Dec_galaxies']
    impacts = dataSet['impacts']
    azimuths = dataSet['azimuths']
    PAs = dataSet['PAs']
    incs = dataSet['incs']
    adjustedIncs = dataSet['adjustedIncs']
    ls = dataSet['ls']
    l_cuss = dataSet['l_cuss']
    R_virs = dataSet['R_virs']
    cuss = dataSet['cuss']
    MajDiams = dataSet['MajDiams']
    MTypes = dataSet['MTypes']
    Vhels = dataSet['Vhels']
    vcorrs = dataSet['vcorrs']
    bestDists = dataSet['bestDists']
    e_bestDists = dataSet['e_bestDists']
    group_nums = dataSet['group_nums']
    group_mems = dataSet['group_mems']
    group_dists = dataSet['group_dists']
    Lstar_meds = dataSet['Lstar_meds']
    e_Lstar_meds = dataSet['e_Lstar_meds']
    Bmags = dataSet['Bmags']

    majorAxisL = gtDict['majorAxis']
    incL = gtDict['inc']
    adjustedIncL = gtDict['adjustedInc']
    paL = gtDict['PA']
    BmagL = gtDict['Bmag']
    #     Bmag_sdssL = gtDict['Bmag_sdss']
    RID_medianL = gtDict['RID_median']
    RID_meanL = gtDict['RID_mean']
    RID_stdL = gtDict['RID_std']
    VhelL = gtDict['Vhel']
    RAdegL = gtDict['RAdeg']
    DEdegL = gtDict['DEdeg']
    NameL = gtDict['Name']

    allPA = paL
    allInclinations = []
    allAdjustedIncs = []
    allCosInclinations = []

    #     print 'type: ',type(incL)
    for i in incL:
        if i != -99:
            i = float(i)
            allInclinations.append(i)

            i2 = pi / 180. * i
            cosi2 = cos(i2)
            allCosInclinations.append(cosi2)

    allCosFancyCosInclinations = []
    for i in adjustedIncL:
        if str(i) != '-99':
            i = float(i)

            allAdjustedIncs.append(i)

            i2 = pi / 180. * i
            cosi2 = cos(i2)
            allCosFancyCosInclinations.append(cosi2)

    allDiameter = majorAxisL

    print 'finished with galaxy table preprocessing'
    print 'len(allAdjustedIncs): ', len(allAdjustedIncs)
    print

    total = 0
    totalNo = 0
    totalYes = 0
    totalIsolated = 0
    totalGroup = 0

    ########################################################################################
    #########################################################################################

    # print all the things
    #

    # absorber info lists
    blues = []
    reds = []
    blueAbs = []
    redAbs = []
    blueW = []
    redW = []
    blueB = []
    redB = []
    e_blueB = []
    e_redB = []
    blueErr = []
    redErr = []
    blueV = []
    redV = []
    blueImpact = []
    redImpact = []

    # galaxy info lists
    blueInc = []
    redInc = []
    blueFancyInc = []
    redFancyInc = []
    blueAz = []
    redAz = []
    bluePA = []
    redPA = []
    blueVcorr = []
    redVcorr = []
    blueVir = []
    redVir = []
    blueLike = []
    redLike = []

    # for absorbers
    for Lya_v, w, e_w, Vhel, i, b, e_b in zip(Lya_vs, Lya_Ws, e_Lya_Ws, Vhels,
                                              impacts, bs, e_bs):
        vel_dif = Lya_v - Vhel
        if vel_dif >= 0:
            reds.append(float(vel_dif))
            redW.append(float(w))
            redErr.append(float(e_w))
            redV.append(float(Vhel))
            redImpact.append(float(i))
            redAbs.append(abs(vel_dif))
            redB.append(float(b))
            e_redB.append(float(e_b))

        else:

            blues.append(float(vel_dif))
            blueW.append(float(w))
            blueErr.append(float(e_w))
            blueV.append(float(Vhel))
            blueImpact.append(float(i))
            blueAbs.append(abs(vel_dif))
            blueB.append(float(b))
            e_blueB.append(float(e_b))

##########################################################################################
##########################################################################################

    nameDict = {}
    # for galaxies
    for Lya_v, Vhel, inc, adjustedInc, az, pa, vcorr, vir, l, name in zip(
            Lya_vs, Vhels, incs, adjustedIncs, azimuths, PAs, vcorrs, R_virs,
            ls, Names):
        vel_dif = Lya_v - Vhel

        if nameDict.has_key(name):
            i = nameDict[name]
            i += 1
            nameDict[name] = i
        else:
            nameDict[name] = 1

        if vel_dif >= 0:
            if inc != -99:
                redInc.append(float(inc))
            if adjustedInc != -99:
                redFancyInc.append(float(adjustedInc))
            if az != -99:
                redAz.append(float(az))
            if pa != -99:
                redPA.append(float(pa))
            if vcorr != -99:
                redVcorr.append(float(vcorr))
            if vir != -99:
                redVir.append(float(vir))
            if l != -99:
                redLike.append(float(l))
        else:

            if inc != -99:
                blueInc.append(float(inc))
            if adjustedInc != -99:
                blueFancyInc.append(float(adjustedInc))
            if az != -99:
                blueAz.append(float(az))
            if pa != -99:
                bluePA.append(float(pa))
            if vcorr != -99:
                blueVcorr.append(float(vcorr))
            if vir != -99:
                blueVir.append(float(vir))
            if l != -99:
                blueLike.append(float(l))

    galaxyNames = nameDict.keys()

    # how many absorbers above vs below vel_cut?
    redVelCount200 = 0
    redVelCount100 = 0
    blueVelCount200 = 0
    blueVelCount100 = 0

    for b in blues:
        if abs(b) >= 200:
            blueVelCount200 += 1
        if abs(b) >= 100:
            blueVelCount100 += 1

    for r in reds:
        if abs(r) >= 200:
            redVelCount200 += 1
        if abs(r) >= 100:
            redVelCount100 += 1

    assocFancyInc = adjustedIncs

    AGNnameDict = {}
    for i in targets:
        if AGNnameDict.has_key(i):
            c = AGNnameDict[i]
            c += 1
            AGNnameDict[i] = c
        else:
            AGNnameDict[i] = 1

    AGN_list = AGNnameDict.keys()

    # write out a file breaking down all these statistics
    #     out_directory = '/Users/frenchd/Research/inclination/git_inclination/rotation_paper/'
    #     save_name = 'full_stats.txt'
    #     stats_filename = '{0}/{1}'.format(out_directory, save_name)
    #     stats_file = open(stats_filename,'wt')

    print
    print '------------------------ Pilot Data -----------------------------'
    print
    #     print 'total number of lines: ', len(lyaWList) + len(lyaWAmbList)
    print 'total number of lines: ', len(Lya_vs)
    print 'total number of unique galaxies matched: ', len(galaxyNames)
    print 'total number of AGN: ', len(AGN_list)
    print '# of redshifted lines: ', len(reds)
    print '# of blueshifted lines: ', len(blues)
    print
    print
    print ' TARGETS '
    print
    print 'final target number: ', len(AGNnameDict.keys())
    for i in AGNnameDict.keys():
        print i
    print
    print
    print
    print
    print '----------------------- Absorber info ----------------------------'
    print
    print 'avg blueshifted EW: ', mean(blueW)
    print 'median blueshifted EW: ', median(blueW)
    print 'avg blue err: ', mean(blueErr)
    print 'median blue err: ', median(blueErr)
    print
    print 'std(blue EW): ', std(blueW)
    print 'stats.sem(blue EW): ', stats.sem(blueW)
    print 'stats.describe(blue EW): ', stats.describe(blueW)
    print
    print 'avg blueshifted vel_diff: ', mean(blues)
    print 'median blueshifted vel_diff: ', median(blues)
    print 'std(blueshifted vel_diff): ', std(blues)
    print 'stats.sem(blue vel_dif): ', stats.sem(blues)
    print 'stats.describe(blue vel_dif: ', stats.describe(blues)
    print
    print '% blueshifted which have vel_diff >= 200 km/s: {0}'.format(
        float(blueVelCount200) / len(blues))
    print 'total number with abs(vel_diff) >= 200 km/s: {0}'.format(
        blueVelCount200)
    print '% blueshifted which have vel_diff >= 100 km/s: {0}'.format(
        float(blueVelCount100) / len(blues))
    print 'total number with abs(vel_diff) >= 100 km/s: {0}'.format(
        blueVelCount100)
    print
    print
    print 'avg blue velocity: ', mean(blueV)
    print 'median blue velocity: ', median(blueV)
    print 'std(blue Velocity): ', std(blueV)
    print 'avg blue impact: ', mean(blueImpact)
    print 'median blue impact: ', median(blueImpact)
    print 'stats.sem(blue impact): ', stats.sem(blueImpact)
    print 'stats.describe(blue impact): ', stats.describe(blueImpact)

    print
    print

    print 'avg redshifted EW: ', mean(redW)
    print 'median redshifted EW: ', median(redW)
    print 'avg red err: ', mean(redErr)
    print 'median red err: ', median(redErr)
    print
    print 'std(red EW): ', std(redW)
    print 'stats.sem(red EW): ', stats.sem(redW)
    print 'stats.describe(red EW): ', stats.describe(redW)

    print
    print 'avg redshifted vel_diff: ', mean(reds)
    print 'median redshifted vel_diff: ', median(reds)
    print 'std(redshifted vel_dif): ', std(reds)
    print 'stats.sem(red vel_dif): ', stats.sem(reds)
    print 'stats.describe(red vel_dif): ', stats.describe(reds)
    print
    print '% redshifted which have abs(vel_diff) >= 200 km/s: {0}'.format(
        float(redVelCount200) / len(reds))
    print 'total number with abs(vel_diff) >= 200 km/s: {0}'.format(
        redVelCount200)
    print '% redshifted which have abs(vel_diff) >= 100 km/s: {0}'.format(
        float(redVelCount100) / len(reds))
    print 'total number with abs(vel_diff) >= 100 km/s: {0}'.format(
        redVelCount100)
    print

    print 'avg red velocity: ', mean(redV)
    print 'median red velocity: ', median(redV)
    print
    print 'avg red impact: ', mean(redImpact)
    print 'median red impact: ', median(redImpact)
    print 'stats.sem(red impact): ', stats.sem(redImpact)
    print 'stats.describe(red impact): ', stats.describe(redImpact)
    print 'std(red impact): ', std(redImpact)

    print
    print '----------------------- Galaxy info ----------------------------'
    print

    # regular inclinations
    incCut = 50
    totalBlueInc = len(blueInc)
    totalRedInc = len(redInc)

    print
    print
    print 'totalBlueInc: ', totalBlueInc
    print 'totalRedInc: ', totalRedInc
    print
    print "blueInc: ", blueInc
    print

    blueIncCount = 0
    for i in blueInc:
        if i >= incCut:
            blueIncCount += 1

    redIncCount = 0
    for i in redInc:
        if i >= incCut:
            redIncCount += 1

    totalInc = len(allInclinations)
    totalCount = 0
    for i in allInclinations:
        if i >= incCut:
            totalCount += 1

    # fancy inclinations
    totalBlueFancyInc = len(blueFancyInc)
    totalRedFancyInc = len(redFancyInc)

    blueFancyIncCount = 0
    for i in blueFancyInc:
        if i >= incCut:
            blueFancyIncCount += 1

    redFancyIncCount = 0
    for i in redFancyInc:
        if i >= incCut:
            redFancyIncCount += 1

    combinedCount = redFancyIncCount + blueFancyIncCount
    totalCombinedCount = totalRedFancyInc + totalBlueFancyInc

    totalFancyInc = len(allAdjustedIncs)
    totalFancyCount = 0
    for i in allAdjustedIncs:
        if i >= incCut:
            totalFancyCount += 1

    print
    print ' INCLINATIONS: '
    print
    print 'Blue: fraction {0} of associated galaxies have inclination >= {1} deg'.format(
        float(blueIncCount) / float(totalBlueInc), incCut)
    print 'Red: fraction {0} of associated galaxies have inclination >= {1} deg'.format(
        float(redIncCount) / float(totalRedInc), incCut)
    print 'All: fraction {0} of ALL galaxies have inclination >= {1} deg'.format(
        float(totalCount) / float(totalInc), incCut)
    print
    print ' FANCY INCLINATIONS: '
    print
    print 'Blue: fraction {0} of associated galaxies have fancy inclination >= {1} deg'.format(
        float(blueFancyIncCount) / float(totalBlueFancyInc), incCut)
    print 'Red: fraction {0} of associated galaxies have fancy inclination >= {1} deg'.format(
        float(redFancyIncCount) / float(totalRedFancyInc), incCut)
    print 'All: fraction {0} of ALL galaxies have fancy inclination >= {1} deg'.format(
        float(totalFancyCount) / float(totalFancyInc), incCut)
    print 'Combined: fraction {0} of associated galaxies have fancy inclination >= {1} deg'.format(
        float(combinedCount) / float(totalCombinedCount), incCut)
    print
    print 'Average all fancy inclination: ', mean(allAdjustedIncs)
    print 'stats.sem(all): ', stats.sem(allAdjustedIncs)
    print
    print 'avg blue inclination: ', mean(blueInc)
    print 'median blue inclination: ', median(blueInc)
    print 'avg blue fancy inclination: ', mean(blueFancyInc)
    print 'median blue fancy inclination: ', median(blueFancyInc)
    print
    print 'avg red inclination: ', mean(redInc)
    print 'median red inclination: ', median(redInc)
    print 'avg red fancy inclination: ', mean(redFancyInc)
    print 'median red fancy inclination: ', median(redFancyInc)

    print
    print 'mean associated: ', mean(assocFancyInc)
    print 'stats.sem(associated): ', stats.sem(assocFancyInc)
    print 'stats.describe(associated): ', stats.describe(assocFancyInc)
    print 'stats.sem(blue): ', stats.sem(blueFancyInc)
    print 'stats.describe(blue): ', stats.describe(blueFancyInc)
    print
    print 'stats.sem(red): ', stats.sem(redFancyInc)
    print 'stats.describe(red): ', stats.describe(redFancyInc)
    print
    print
    print "  AZIMUTHS and PA:  "
    print
    print 'avg blue azimuth: ', mean(blueAz)
    print 'median blue azimuth: ', median(blueAz)
    print 'stats.sem(blue az): ', stats.sem(blueAz)
    print 'stats.describe(blue az): ', stats.describe(blueAz)
    print
    print 'avg red azimuth: ', mean(redAz)
    print 'median red azimuth: ', median(redAz)
    print 'stats.sem(red az): ', stats.sem(redAz)
    print 'stats.describe(red az): ', stats.describe(redAz)
    print
    print 'avg blue PA: ', mean(bluePA)
    print 'median blue PA: ', median(bluePA)
    print
    print 'avg red PA: ', mean(redPA)
    print 'median red PA: ', median(redPA)

    print
    print ' VCORR : '
    print
    print 'avg blue vcorr: ', mean(blueVcorr)
    print 'median blue vcorr: ', median(blueVcorr)
    print
    print 'avg red vcorr: ', mean(redVcorr)
    print 'median red vcorr: ', median(redVcorr)

    print
    print ' R_vir: '
    print
    print 'avg blue R_vir: ', mean(blueVir)
    print 'median blue R_vir: ', median(blueVir)
    print 'stats.sem(blue R_vir): ', stats.sem(blueVir)
    print 'stats.describe(blue R_vir): ', stats.describe(blueVir)
    print
    print 'avg red R_vir: ', mean(redVir)
    print 'median red R_vir: ', median(redVir)
    print 'stats.sem(red R_vir): ', stats.sem(redVir)
    print 'stats.describe(red R_vir): ', stats.describe(redVir)

    print
    print ' LIKELIHOOD: '
    print
    print 'avg blue likelihood: ', mean(blueLike)
    print 'median blue likelihood: ', median(blueLike)
    print
    print 'avg red likelihood: ', mean(redLike)
    print 'median red likelihood: ', median(redLike)

    print
    print
    print '-------------------- Distribution analysis ----------------------'
    print
    print

    print ' FANCY INCLINATIONS: '

    # perform the K-S and AD tests for inclination
    ans1 = stats.ks_2samp(blueFancyInc, redFancyInc)
    ans1a = stats.anderson_ksamp([blueFancyInc, redFancyInc])

    print 'KS for blue vs red fancy inclinations: ', ans1
    print 'AD for blue vs red fancy inclinations: ', ans1a

    ans2 = stats.ks_2samp(blueFancyInc, allAdjustedIncs)
    print 'KS for blue vs all fancy inclinations: ', ans2

    ans3 = stats.ks_2samp(redFancyInc, allAdjustedIncs)
    print 'KS for red vs all fancy inclinations: ', ans3

    print
    z_statrb, p_valrb = stats.ranksums(blueFancyInc, redFancyInc)
    z_statall, p_valall = stats.ranksums(assocFancyInc, allAdjustedIncs)
    print 'ranksum red vs blue p-value: ', p_valrb
    print 'ranksum associated vs all p-value: ', p_valall

    ans4 = stats.ks_2samp(assocFancyInc, allAdjustedIncs)
    ans4a = stats.anderson_ksamp([assocFancyInc, allAdjustedIncs])

    print 'KS for all associated vs all fancy inclinations: ', ans4
    print 'AD for all associated vs all fancy inclinations: ', ans4a

    print

    #     ans5 = stats.ks_2samp(spiralIncList, allSpiralIncList)
    #     ans5a = stats.anderson_ksamp([spiralIncList,allSpiralIncList])
    #
    #     print 'KS for all spiral associated vs all spiral fancy inclinations: ',ans5
    #     print 'AD for all spiral associated vs all spiral fancy inclinations: ',ans5a

    print
    print ' INCLINATIONS: '
    print

    # perform the K-S and AD tests for inclination
    ans1 = stats.ks_2samp(blueInc, redInc)
    ans1a = stats.anderson_ksamp([blueInc, redInc])

    print 'KS for blue vs red inclinations: ', ans1
    print 'AD for blue vs red inclinations: ', ans1a

    ans2 = stats.ks_2samp(blueInc, allInclinations)
    print 'KS for blue vs all inclinations: ', ans2

    ans3 = stats.ks_2samp(redInc, allInclinations)
    print 'KS for red vs all inclinations: ', ans3

    assocInc = incs
    ans4 = stats.ks_2samp(assocInc, allInclinations)
    print 'KS for associated vs all inclinations: ', ans4

    print
    print ' EW Distributions: '
    print

    # perform the K-S and AD tests for EW
    ans1 = stats.ks_2samp(blueW, redW)
    ans1a = stats.anderson_ksamp([blueW, redW])
    print 'KS for blue vs red EW: ', ans1
    print 'AD for blue vs red EW: ', ans1a

    print
    print ' Impact parameter Distributions: '
    print

    # perform the K-S and AD tests for impact parameter
    ans1 = stats.ks_2samp(blueImpact, redImpact)
    ans1a = stats.anderson_ksamp([blueImpact, redImpact])
    print 'KS for blue vs red impact parameters: ', ans1
    print 'AD for blue vs red impact parameters: ', ans1a

    print
    print ' \Delta v Distributions: '
    print

    # perform the K-S and AD tests for \delta v
    ans1 = stats.ks_2samp(blueAbs, redAbs)
    ans1a = stats.anderson_ksamp([blueAbs, redAbs])
    print 'KS for blue vs red \Delta v: ', ans1
    print 'AD for blue vs red \Delta v: ', ans1a

    print
    print ' Azimuth Distributions: '
    print

    # perform the K-S and AD tests for azimuth
    ans1 = stats.ks_2samp(blueAz, redAz)
    ans1a = stats.anderson_ksamp([blueAz, redAz])
    print 'KS for blue vs red azimuth: ', ans1
    print 'AD for blue vs red azimuth: ', ans1a
    print

    # now against a flat distribution
    flatRed = arange(0, 90, 1)
    flatBlue = arange(0, 90, 1)

    ans1 = stats.ks_2samp(blueAz, flatBlue)
    ans1a = stats.anderson_ksamp([blueAz, flatBlue])
    print 'KS for blue vs flat azimuth: ', ans1
    print 'AD for blue vs flat azimuth: ', ans1a
    print
    ans1 = stats.ks_2samp(redAz, flatRed)
    ans1a = stats.anderson_ksamp([redAz, flatRed])
    print 'KS for red vs flat azimuth: ', ans1
    print 'AD for red vs flat azimuth: ', ans1a
    print

    print
    print ' R_vir Distributions: '
    print

    # perform the K-S and AD tests for r_vir
    ans1 = stats.ks_2samp(blueVir, redVir)
    ans1a = stats.anderson_ksamp([blueVir, redVir])
    print 'KS for blue vs red R_vir: ', ans1
    print 'AD for blue vs red R_vir: ', ans1a

    print
    print ' Doppler parameter Distributions: '
    print

    # perform the K-S and AD tests for doppler parameter
    ans1 = stats.ks_2samp(blueB, redB)
    ans1a = stats.anderson_ksamp([blueB, redB])
    print 'KS for blue vs red doppler parameter: ', ans1
    print 'AD for blue vs red doppler parameter: ', ans1a

    print
    print ' Likelihood Distributions: '
    print

    # perform the K-S and AD tests for doppler parameter
    ans1 = stats.ks_2samp(blueLike, redLike)
    ans1a = stats.anderson_ksamp([blueLike, redLike])
    print 'KS for blue vs red likelihood: ', ans1
    print 'AD for blue vs red likelihood: ', ans1a

    print
    print ' COMPLETED. '
Ejemplo n.º 42
  def _kernel_leaves_target_invariant(self, initial_draws,
                                      independent_chain_ndims):
    def log_gamma_log_prob(x):
      event_dims = tf.range(independent_chain_ndims, tf.rank(x))
      return self._log_gamma_log_prob(x, event_dims)

    def fake_log_prob(x):
      """Cooled version of the target distribution."""
      return 1.1 * log_gamma_log_prob(x)

    hmc = tfp.mcmc.HamiltonianMonteCarlo(
        target_log_prob_fn=log_gamma_log_prob,
        step_size=0.4,
        num_leapfrog_steps=5,
        seed=_set_seed(43))
    sample, kernel_results = hmc.one_step(
        current_state=initial_draws,
        previous_kernel_results=hmc.bootstrap_results(initial_draws))

    bad_hmc = tfp.mcmc.HamiltonianMonteCarlo(
        target_log_prob_fn=fake_log_prob,
        step_size=0.4,
        num_leapfrog_steps=5,
        seed=_set_seed(44))
    bad_sample, bad_kernel_results = bad_hmc.one_step(
        current_state=initial_draws,
        previous_kernel_results=bad_hmc.bootstrap_results(initial_draws))

    [
        log_accept_ratio_,
        bad_log_accept_ratio_,
        initial_draws_,
        updated_draws_,
        fake_draws_,
    ] = self.evaluate([
        kernel_results.log_accept_ratio,
        bad_kernel_results.log_accept_ratio,
        initial_draws,
        sample,
        bad_sample,
    ])

    # Confirm step size is small enough that we usually accept.
    acceptance_probs = np.exp(np.minimum(log_accept_ratio_, 0.))
    bad_acceptance_probs = np.exp(np.minimum(bad_log_accept_ratio_, 0.))
    self.assertGreater(acceptance_probs.mean(), 0.5)
    self.assertGreater(bad_acceptance_probs.mean(), 0.5)

    # Confirm step size is large enough that we sometimes reject.
    self.assertLess(acceptance_probs.mean(), 0.99)
    self.assertLess(bad_acceptance_probs.mean(), 0.99)

    _, ks_p_value_true = stats.ks_2samp(initial_draws_.flatten(),
                                        updated_draws_.flatten())
    _, ks_p_value_fake = stats.ks_2samp(initial_draws_.flatten(),
                                        fake_draws_.flatten())

    tf.compat.v1.logging.vlog(
        1,
        'acceptance rate for true target: {}'.format(acceptance_probs.mean()))
    tf.compat.v1.logging.vlog(
        1, 'acceptance rate for fake target: {}'.format(
            bad_acceptance_probs.mean()))
    tf.compat.v1.logging.vlog(
        1, 'K-S p-value for true target: {}'.format(ks_p_value_true))
    tf.compat.v1.logging.vlog(
        1, 'K-S p-value for fake target: {}'.format(ks_p_value_fake))
    # Make sure that the MCMC update hasn't changed the empirical CDF much.
    self.assertGreater(ks_p_value_true, 1e-3)
    # Confirm that targeting the wrong distribution does
    # significantly change the empirical CDF.
    self.assertLess(ks_p_value_fake, 1e-6)
Ejemplo n.º 43
def get_property_from_dataset(dataset, property_key):
    result = []
    for row in dataset:
        result.append(row[property_key])
    return result


def getClicksFromFile(file_path):
    with open(file_path, 'rb') as csvfile:
        dataset_sample_a = csv.DictReader(csvfile,
                                          delimiter=';',
                                          quotechar='|')

        clicks = np.array(get_property_from_dataset(dataset_sample_a,
                                                    DICT_KEY))

        clicks = map(lambda x: 1 if x == 'yes' else 0, clicks)
        return clicks


clicks_sample_a = getClicksFromFile('amostra_A_click.csv')
clicks_sample_b = getClicksFromFile('amostra_B_click.csv')

ks, p_value = stats.ks_2samp(clicks_sample_a, clicks_sample_b)
print "Ks: " + str(ks)
print "P-value: " + str(p_value)

print "Sum Sample A: " + str(np.sum(clicks_sample_a))
print "Sum Sample B: " + str(np.sum(clicks_sample_b))
def getKSStatPVal(regions, region_to_corr, region_to_info):
    corr_stat, corr_p_val = ks_2samp(region_to_corr[regions[0]],
                                     region_to_corr[regions[1]])
    info_stat, info_p_val = ks_2samp(region_to_info[regions[0]],
                                     region_to_info[regions[1]])
    return regions[0], regions[1], corr_stat, corr_p_val, info_stat, info_p_val
Ejemplo n.º 45
def ksprob(arr1, arr2):
    from scipy.stats import ks_2samp
    return ks_2samp(arr1, arr2)[1]
Ejemplo n.º 46
##Histogram of dH
NGC_dH = []
SGC_dH = []

i = 0
while i < len(NGCred):
    NGC_dH.append((fun.D(fun.HubbleIntegrate(NGCred[i])) - NGCdist[i]) /
                  fun.D(fun.HubbleIntegrate(NGCred[i])))
    i += 1
i = 0
while i < len(SGCred):
    SGC_dH.append((fun.D(fun.HubbleIntegrate(SGCred[i])) - SGCdist[i]) /
                  fun.D(fun.HubbleIntegrate(SGCred[i])))
    i += 1

print(ks_2samp(NGC_dH, SGC_dH))

plt.plot(SGCred, SGC_dH, '.')
plt.plot([min(SGCred), max(SGCred)], [0, 0])
plt.show()

NGChist = plt.hist(NGC_dH, bins=binno, alpha=0.5, label='NGC', color='blue')
##Best fit for data
(NGCmu, NGCsigma) = norm.fit(NGC_dH)
NGCSE = NGCsigma / np.sqrt(len(NGC_dH))
emptyNGC = plt.hist([],
                    range=[-0.25, 0.25],
                    alpha=0.5,
                    label='$N=%i, \mu=%.2f \pm %.2f$, $\sigma=%.2f$' %
                    (len(NGC_dH), NGCmu, NGCSE, NGCsigma),
                    color='white')
	t_in=np.zeros(gal_count)
	gal_count_out=len(t)-gal_count
	t_out=np.zeros(gal_count_out)

	gal_count=0
	gal_count_out=0
	for j in range(len(t)):
		x_gal, y_gal, z_gal=hp.ang2vec(theta[j], phi[j])
		gal_separation=np.sqrt((cone_vec_x-x_gal)**2+(cone_vec_y-y_gal)**2+(cone_vec_z-z_gal)**2) #calculating the separation of the gal from the r_cone
		if gal_separation < radius_of_separation:
			t_in[gal_count]=t[j]
			gal_count=gal_count+1
		else:
			t_out[gal_count_out]=t[j]
			gal_count_out=gal_count_out+1
	ksd[i],p_value[i]=stats.ks_2samp(t_in, t_out)
	l=phi_cone*180.0/math.pi
	b=90-(theta_cone*180.0/math.pi)
	f.write(str(l)+"\t"+str(b)+"\t"+str(ksd[i])+"\t"+str(p_value[i])+"\t"+str(gal_count)+"\t"+str(gal_count_out)+"\n")

f.close()

max_ksd_index=np.unravel_index(ksd.argmax(), ksd.shape)
theta_max, phi_max=hp.pix2ang(nside,max_ksd_index[0])
l_max=phi_max*180.0/math.pi
b_max=90-(theta_max*180.0/math.pi)

f2=open("max_KS_hemisphere.txt","w")
f2.write("direction of the largest KS"+"\n")
f2.write(str(l_max)+"\t"+str(b_max)+"\t"+str(max(ksd))+"\t"+str(p_value[max_ksd_index[0]])+"\n")
f2.close()
Ejemplo n.º 48
def kstest2samp(samp1, samp2):
    ksval, pval = stats.ks_2samp(samp1, samp2)
    return ksval, pval
Ejemplo n.º 49
    axis.plot(bin_centers, bin_vals, color='blue', alpha=0.75, lw=1)

    bin_vals, foo = np.histogram(fl_by_time_clpXminus_cut[i],
                                 bins=bin_edges,
                                 density=True)
    axis.plot(bin_centers, bin_vals, color='red', alpha=0.75, lw=1)

    ### Various statistical tests
    # calculate unequal variance t statistic
    tstat, ttpval = stats.ttest_ind(fl_by_time_clpXplus_cut[i],
                                    fl_by_time_clpXminus_cut[i],
                                    equal_var=False)
    #     print('ttest', i, tstat, ttpval)

    # KS test
    ksstat, ks_pval = stats.ks_2samp(fl_by_time_clpXminus_cut[i],
                                     fl_by_time_clpXplus_cut[i])
    #     print('KS test', ksstat, ks_pval)

    # Mann-Whitney
    mwstat, mwpval = stats.mannwhitneyu(fl_by_time_clpXminus_cut[i],
                                        fl_by_time_clpXplus_cut[i],
                                        alternative='greater')
    #     print('Mann-Whitney', mwstat, mwpval)

    # Mood's median test
    median_args = (fl_by_time_clpXminus_cut[i], fl_by_time_clpXplus_cut[i])
    mstat, mpval, _, _ = stats.median_test(*median_args)
    #     print('Median test', mstat, mpval)
    #     print('\n')

    # F test for variance
Ejemplo n.º 50
        #I'm going to cut out groups who have less than 5 attacks
        interevent = empdf[empdf.gname == group].idate.diff()[1:].values.tolist()
        if len(interevent) >= 5:
            empirical_data[group] = empdf[empdf.gname == group].idate.diff()[1:].values.tolist()
    #Load the abm data
    abmdf = pd.read_csv('../../results/abm_runs_v3/%s_20181009.csv' % country, header=None,
                         names = header)
    #extract the params
    alpha = abmdf.alpha.unique()
    beta = abmdf.beta.unique()
    omega = abmdf.omega.unique()
    groups = abmdf.group.unique()
    #Group it up
    lvl_one_gdf = abmdf.groupby(['alpha', 'beta', 'omega'])
    for a in alpha:
        for b in beta:
            for o in omega:
                #Create the rundata
                rundata = []
                #Pull the level two groups together by run
                lvl_two_gdf = lvl_one_gdf.get_group((a, b, o)).groupby('run')
                for r in lvl_two_gdf.groups.keys():
                    tdf = lvl_two_gdf.get_group(r)
                    for group in tdf.group.unique():
                        if group in trans.keys() and trans[group] in empirical_data:
                            diffset = tdf[tdf.group==group].step.diff()[1:].tolist()
                            D, p = stats.ks_2samp(empirical_data[trans[group]], diffset)
                            rundata.append(p)
                #Now write it out
                print('%d,%f,%f,%f,%s,%f' % (r, float(a), float(b), float(o), country, len([x for x in rundata if x<0.05])/len(rundata)), file=wfile)
Ejemplo n.º 51
    def bias_test(self, df, gbias, sco, display=True):
        '''
        gbias: group bias, e.g. ['CAM_TYPE','SIZE','REGION4']
        sco: field where bias is present e.g. SZSCORE'

        kstest D stat: rate of convergence.
        at significance 0.05 reject H0 (eq. distr.) if D>0.043
        '''
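        # Added note on the 0.043 above: the large-sample two-sample KS critical
        # value is approximately c(alpha) * sqrt((n + m) / (n * m)) with
        # c(0.05) ~= 1.358, so D > 0.043 corresponds to samples of n = m ~= 2000.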
        if display == True:
            print('\n', hlp.color.BOLD, hlp.color.CYAN, sco, hlp.color.END,
                  '\n')

        sts = []
        for gb in gbias:

            if display == True:
                print(hlp.color.BOLD, hlp.color.RED, gb, 'BIAS', hlp.color.END,
                      '\n')
            gbitems = list(df[~df[gb].isnull()][gb].drop_duplicates())

            dfstats = pd.DataFrame()
            full = np.array(df[(df.DIMENSION.isnull())
                               & (~df[sco].isnull())][sco])
            sts.append([
                gb, 'ALL',
                round(full.mean(), 4),
                round(full.std(), 4),
                len(full)
            ])

            for gbi in gbitems:
                if display == True:
                    print(hlp.color.BOLD, gbi, hlp.color.END, '\n')
                gidx = df[(df.DIMENSION.isnull()) & (~df[sco].isnull()) &
                          (df[gb] == gbi)].index
                g = np.array(df.loc[gidx][sco])
                fidx = df[(df.DIMENSION.isnull()) & (~df[sco].isnull()) &
                          (df[gb] != gbi)].index
                f = np.array(df.loc[fidx][sco])
                sts.append(
                    [gb, gbi,
                     round(g.mean(), 4),
                     round(g.std(), 4),
                     len(g)])

                if display == True:
                    ##normality check
                    # print('length:', gbi ,':', len(g), '/ rest:', len(f))
                    # print('norm test:', stats.kstest(g, 'norm'))
                    # stats.probplot(g, dist="norm", plot=plt)
                    # plt.show()

                    ##qq plot
                    print('two sample test:', stats.ks_2samp(g, f))
                    # print('', gb, gbi, ' - mean:', round(g.mean(), 4), '; std:', round(g.std(), 4),
                    #      '\n Full sample - mean:', round(full.mean(), 4), '; std:', round(full.std(), 4))

                    q = np.linspace(0, 100, 101)
                    k, ax = plt.subplots()
                    ax.scatter(np.percentile(f, q),
                               np.percentile(g, q),
                               color='b')
                    ax.plot(ax.get_xlim(), ax.get_xlim(), ls="--", c=".3")
                    plt.ylabel(gb + ' ' + gbi + ' ' + sco)
                    plt.xlabel(gb + ' ' + 'rest ' + sco)
                    plt.title('qq plot - ' + gbi + ' ' + gb + ' vs rest' + '',
                              fontsize=14)
                    plt.show()
                    print('')

            # sts.append([gb, 'ALL', round(full.mean(), 4), round(full.std(), 4), len(full)])

        dfstsall = pd.DataFrame(
            sts, columns=['group', 'item', 'mean', 'std', 'size'])

        if display == True:
            for gr in dfstsall.group.drop_duplicates():

                dfsts = dfstsall[dfstsall.group == gr]

                jet = plt.get_cmap('jet')
                colors = iter(jet(np.linspace(0, 1, 7)))
                div = dfsts['size'].max() / 1200
                for index, row in dfsts.iterrows():
                    plt.scatter(row['mean'],
                                row['std'],
                                label=row['item'],
                                color=next(colors),
                                s=row['size'] / div)

                plt.xticks(rotation=45)
                plt.xlabel('mean', fontsize=14)
                plt.ylabel('stdev', fontsize=14)
                plt.title(gr, fontsize=14)
                plt.xlim(-dfsts['mean'].abs().max() * 1.2,
                         dfsts['mean'].abs().max() * 1.2)
                lgnd = plt.legend(bbox_to_anchor=(0, 1),
                                  loc=2,
                                  scatterpoints=1,
                                  fontsize=10)
                for handle in lgnd.legendHandles:
                    handle.set_sizes([10])

                plt.show()

        return dfstsall
Ejemplo n.º 52
# Compair trace segment distributions with KS tests

pp.close()

print(
    "\nIf the KS statistic is small and p value is high we cannot reject that the distributions of the samples are the same"
)
for n, d in enumerate(data):
    res = np.zeros((3, 2))

    if n == 0:
        print('Packet length')
    else:
        print('Time stamp')

    for i in range(len(d) - 1):
        res[i, 0], res[i, 1] = stats.ks_2samp(d[i], d[i + 1])
    res[len(d) - 1, 0], res[len(d) - 1, 1] = stats.ks_2samp(d[0], d[len(d) - 1])

    #Save KS test to CSV
    writer = csv.writer(
        open(
            "/home/francesco/Documents/Thesis_project/Results/trace_samples_compare_kstest.csv",
            'a'))
    writer.writerows(res)

#multi_plot_data(pkt_segs[0],time_segs[0],cst_segs[0])
#multi_plot_data(pkt_segs[1],time_segs[1],cst_segs[1])
#multi_plot_data(pkt_segs[2],time_segs[2],cst_segs[2])
Ejemplo n.º 53
for mi, mr in zip(utr3['mir'], utr3['mrna']):
    cor.append(spearmanr(np.log2(stad.loc[mr]+1), np.log2(stadmirs.loc[mi]+1))[0])
utr3['stad_corr'] = cor


n_permute = 100
stad_random_cor_e = np.zeros(n_permute * len(utr3['mir']))
i = 0
for c in range(n_permute):
    for mi in utr3['mir']:
        rand_index = randint(0, len(stad.index) - 1)
        stad_random_cor_e[i] = spearmanr(np.log2(stad.iloc[rand_index]+1), np.log2(stadmirs.loc[mi]+1))[0]
        i += 1


print(ks_2samp(utr3['stad_corr'], stad_random_cor_e, alternative="greater"), np.median(utr3['stad_corr']), np.median(stad_random_cor_e))
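
# Added note: with alternative="greater", scipy tests whether the empirical CDF
# of the first sample lies above that of the second, i.e. whether the real-pair
# correlations tend to be smaller (more negative) than the permuted null.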


cor = []
for mi, mr in zip(cds['mir'], cds['mrna']):
    cor.append(spearmanr(np.log2(stad.loc[mr]+1), np.log2(stadmirs.loc[mi]+1))[0])
cds['stad_corr'] = cor

n_permute = 100
stad_random_cor_e_cds = np.zeros(n_permute * len(cds['mir']))
i = 0
for c in range(n_permute):
    for mi in cds['mir']:
        rand_index = randint(0, len(stad.index) - 1)
        stad_random_cor_e_cds[i] = spearmanr(np.log2(stad.iloc[rand_index]+1), np.log2(stadmirs.loc[mi]+1))[0]
        i += 1
Ejemplo n.º 54
0
print(sys.argv, "Q1&Q3", Q1, Q3)
print(sys.argv[0], "Load Data", len(ans))

plt.figure()
head = 10
n, bins, patches = plt.hist(ans,
                            bins=head,
                            range=[-0.5, head + 0.5],
                            density=True)
print(sys.argv[0], "Draw Histogram")

binsMiddles = 0.5 * (bins[1:] + bins[:-1])
params, covMatrix = curve_fit(poissonDist, binsMiddles, n)
xPlot = np.linspace(0, head, 1000)
#plt.plot(xPlot, poissonDist(xPlot, *params), "r-", lw=2)
st = ks_2samp(ans, poissonDist(xPlot, *params))
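# Note: ks_2samp expects two samples, but poissonDist(xPlot, *params) is a
# vector of fitted density values, not a sample, so this statistic is not a
# valid two-sample KS test. A hedged alternative, assuming params[0] is the
# fitted rate and scipy.stats.poisson is imported, would be:
#   st = ks_2samp(ans, poisson.rvs(params[0], size=len(ans)))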
print(sys.argv[0], "Draw Poission")

plt.title("CNV with IGSR")
plt.grid(True)
plt.xlabel("CNV")
plt.ylabel("Frequency")
#plt.text(5, 0.125, "lambda = %.2f" % (params))
plt.text(5, 0.150, "n = %d" % (len(ans)))
#plt.text(5, 0.175, "st = %.2f" % st[0])

fig = plt.gcf()
fig.set_size_inches(24, 18)

title = "HistCNA_"
if len(sys.argv) > 1:
Ejemplo n.º 55
0
        fmax_dict[treatment][taxon] = np.asarray(f_max_all)

fmax_dict

ks_dict = {}
#treatment_ = []
p_values = []
for treatment in treatments:

    ks_dict[treatment] = {}

    sample_1 = fmax_dict[treatment]['B']
    sample_2 = fmax_dict[treatment]['S']

    D, p_value = stats.ks_2samp(sample_1, sample_2)

    ks_dict[treatment]['D'] = D
    ks_dict[treatment]['p_value'] = p_value

    #treatment_pairs.append((treatment_pair, taxon))
    p_values.append(p_value)

reject, pvals_corrected, alphacSidak, alphacBonf = multitest.multipletests(
    p_values, alpha=0.05, method='fdr_bh')
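# Note: method='fdr_bh' is the Benjamini-Hochberg procedure, which controls the
# false discovery rate across the per-treatment KS tests; the corrected values
# are stored as 'p_value_bh' below.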
for treatment_idx, treatment in enumerate(treatments):
    ks_dict[treatment]['p_value_bh'] = pvals_corrected[treatment_idx]

for treatment in pt.treatments:

    for taxon, f_max_array in fmax_dict[treatment].items():
Ejemplo n.º 56
0
        cont1[n] += len(p2_silences)
        cont2[n] += len(p1_silences)
        igd1[n] += len(p2_ignores)
        igd2[n] += len(p1_ignores)

    user_hists1.append(hists1)
    user_hists2.append(hists2)

    # run the tests separately for each type
    p1_model = p1_stack[2]
    for n, p1 in enumerate(p1_stack):
        if n == 2:
            continue
        result = stats.mannwhitneyu(p1, p1_model)
        print "p1 model", types[n], result.pvalue
        w, p = stats.ks_2samp(p1, p1_model)
        print p

    p2_model = p2_stack[2]
    for n, p2 in enumerate(p2_stack):
        if n == 2:
            continue
        result = stats.mannwhitneyu(p2, p2_model)
        print "p2 model", types[n], result.pvalue
        w, p = stats.ks_2samp(p2, p2_model)
        print p

user1 = np.array(user_hists1)
user2 = np.array(user_hists2)

sum1 = np.sum(user1, axis=0)
Ejemplo n.º 57
0
def main():

    #full_network_filename="./network_all_users/full_network_all_users.gml"  # I can't use this network because the labels don't match the user ids from the DB
    # G_full = nx.read_gml(full_network_filename)

    # Testing out how KS behaves on random samples:
    # list_A = []
    # list_B = []
    # for i in range(10000):
    #     list_A.append(random.random())
    #     list_B.append(random.random())
    # print "KS test listA against normal distrib:", stats.kstest(list_A, "norm")
    # print "KS test listB against normal distrib:", stats.kstest(list_B, "norm")
    # print "two-sided KS test listA vs listB:", stats.ks_2samp(list_A, list_B)

    unrealistic_weight_change = 70.

    database = "calorie_king_social_networking_2010"
    server = "tarraco.chem-eng.northwestern.edu"
    user = "******"
    passwd = "n1ckuDB!"
    db = Connection(server, database, user, passwd)

    GC_network_filename = "./network_all_users/GC_full_network_all_users_merged_small_comm_roles_diff_layers1_roles_diff_layers1.5.gml"
    G = nx.read_gml(GC_network_filename)

    output_filename = "./network_all_users/Results_comparison_histograms_percent_weight_change.txt"
    file_output = open(output_filename, 'wt')

    #  print "num. nodes:",len(G.nodes())

    list_of_lists = list(nx.connected_components(G))  # materialize for len() and indexing

    print "num. of components:", len(list_of_lists), "size GC:", len(
        list_of_lists[0])

    list_weight_changes_GC = []
    list_weight_changes_R6friends = []
    for node in G.nodes():
        label = G.node[node]["label"]
        percent_weight_change = G.node[node]["percentage_weight_change"]
        R6_overlap = G.node[node]["R6_overlap"]
        #print node, label, weight_change, R6_overlap
        if percent_weight_change > -unrealistic_weight_change and percent_weight_change < unrealistic_weight_change:  # filter out unrealistic values

            list_weight_changes_GC.append(percent_weight_change)

            if R6_overlap > 0:
                list_weight_changes_R6friends.append(percent_weight_change)

    print >> file_output, "num GC users:", len(
        list_weight_changes_GC), "num users with R6 friends:", len(
            list_weight_changes_R6friends)

    histograma_bines_gral.histograma_bins(
        list_weight_changes_GC, 20,
        "./network_all_users/histogram_weight_change_GC_users.dat")
    histograma_bines_gral.histograma_bins(
        list_weight_changes_R6friends, 20,
        "./network_all_users/histogram_weight_change_users_with_R6friends.dat")

    print >> file_output, "KS test GC against normal distrib:", stats.kstest(
        list_weight_changes_GC, "norm")
    print >> file_output, "KS test users with R6 friends against normal distrib:", stats.kstest(
        list_weight_changes_R6friends, "norm")

    print >> file_output, "two-sided KS test GC vs users with R6 friends:", stats.ks_2samp(
        list_weight_changes_GC, list_weight_changes_R6friends)

    list_weight_changes_all = []
    query1 = """SELECT * FROM users"""
    result1 = db.query(query1)  # is a list of dicts.
    for r1 in result1:
        percent_weight_change = (float(r1['most_recent_weight']) - float(
            r1['initial_weight'])) / float(r1['initial_weight'])

        #    if percent_weight_change > -unrealistic_weight_change and  percent_weight_change < unrealistic_weight_change :   # filter out unrealistic values
        list_weight_changes_all.append(percent_weight_change)

    histograma_bines_gral.histograma_bins(
        list_weight_changes_all, 200,
        "./network_all_users/histogram_weight_change_users_all_200bins.dat")

    print >> file_output, "tot. number users", len(list_weight_changes_all)
    print >> file_output, "KS test all against normal distrib:", stats.kstest(
        list_weight_changes_all, "norm")
    print >> file_output, "two-sided KS test all vs GC:", stats.ks_2samp(
        list_weight_changes_all, list_weight_changes_GC)
    print >> file_output, "two-sided KS test all vs users with R6 friends:", stats.ks_2samp(
        list_weight_changes_GC, list_weight_changes_R6friends)

    file_output.close()
    print "written file:", output_filename

    exit()  # everything below this call is unreachable

    query1 = """SELECT * FROM friends order by src asc"""
    result1 = db.query(query1)  # is a list of dict.

    print "number links:", len(result1)
    list_friends = []

    for r1 in result1:

        label_src = r1['src']

        label_dest = r1['dest']

        if label_src not in list_friends:
            list_friends.append(label_src)
        if label_dest not in list_friends:
            list_friends.append(label_dest)

    print "num networked users:", len(list_friends)
Ejemplo n.º 58
0
def KS_test(y1, y2):
    """Return the p-value of the two-sample KS test between y1 and y2."""
    temp = stats.ks_2samp(y1, y2)
    return temp.pvalue
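# Usage sketch (assuming numpy is imported as np): KS_test returns only the
# p-value, e.g. KS_test(np.random.randn(100), np.random.randn(100)) is usually
# well above 0.05 because both samples come from the same distribution.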
Ejemplo n.º 59
0
def test_one_feature_mixture(component_model_type,
                             num_clusters=3,
                             show_plot=False,
                             seed=None):
    """

    """
    random.seed(seed)

    N = 300
    separation = .9

    get_next_seed = lambda: random.randrange(2147483647)

    cluster_weights = [[1.0 / float(num_clusters)] * num_clusters]

    cctype = component_model_type.cctype
    T, M_c, structure = sdg.gen_data([cctype],
                                     N, [0],
                                     cluster_weights, [separation],
                                     seed=get_next_seed(),
                                     distargs=[distargs[cctype]],
                                     return_structure=True)

    T_list = list(T)
    T = numpy.array(T)

    # pdb.set_trace()
    # create a crosscat state
    M_c = du.gen_M_c_from_T(T_list, cctypes=[cctype])

    state = State.p_State(M_c, T_list)

    # Get support over all component models
    discrete_support = qtu.get_mixture_support(
        cctype,
        component_model_type,
        structure['component_params'][0],
        nbins=250)

    # calculate simple predictive probability for each point
    Q = [(N, 0, x) for x in discrete_support]

    # transitions
    state.transition(n_steps=200)

    # get the sample
    X_L = state.get_X_L()
    X_D = state.get_X_D()

    # generate samples
    # kstest doesn't compute the same answer for row and column vectors,
    # so we flatten this column vector into a row vector.
    predictive_samples = sdg.predictive_columns(
        M_c, X_L, X_D, [0], seed=get_next_seed()).flatten()

    probabilities = su.simple_predictive_probability(M_c, X_L, X_D,
                                                     [] * len(Q), Q)

    # Get the histogram. Behavior differs for discrete and continuous types; for
    # some reason the density option doesn't normalize the multinomial histogram
    # to 1, so the discrete counts are normalized by hand below.
    # T = T[:,0]
    if is_discrete[component_model_type.model_type]:
        bins = range(len(discrete_support))
        T_hist = numpy.array(qtu.bincount(T, bins=bins))
        S_hist = numpy.array(qtu.bincount(predictive_samples, bins=bins))
        T_hist = T_hist / float(numpy.sum(T_hist))
        S_hist = S_hist / float(numpy.sum(S_hist))
        edges = numpy.array(discrete_support, dtype=float)
    else:
        T_hist, edges = numpy.histogram(T,
                                        bins=min(50, len(discrete_support)),
                                        density=True)
        S_hist, _ = numpy.histogram(predictive_samples,
                                    bins=edges,
                                    density=True)
        edges = edges[0:-1]

    # Goodness-of-fit-tests
    if not is_discrete[component_model_type.model_type]:
        # do a KS test if the distribution is continuous
        # cdf = lambda x: component_model_type.cdf(x, model_parameters)
        # stat, p = stats.kstest(predictive_samples, cdf)   # 1-sample test
        stat, p = stats.ks_2samp(predictive_samples, T[:, 0])  # 2-sample test
        test_str = "KS"
    else:
        # Cressie-Read power divergence statistic and goodness of fit test.
        # This function gives a lot of flexibility in the method <lambda_> used.
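        # For example, lambda_='pearson' (used below) gives the classic Pearson
        # chi-square statistic, while lambda_='log-likelihood' gives the G-test.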
        freq_obs = S_hist * N
        freq_exp = numpy.exp(probabilities) * N
        stat, p = stats.power_divergence(freq_obs, freq_exp, lambda_='pearson')
        test_str = "Chi-square"

    if show_plot:
        pylab.clf()
        lpdf = qtu.get_mixture_pdf(discrete_support, component_model_type,
                                   structure['component_params'][0],
                                   [1.0 / num_clusters] * num_clusters)
        pylab.axes([0.1, 0.1, .8, .7])
        # bin widths
        width = (numpy.max(edges) - numpy.min(edges)) / len(edges)
        pylab.bar(edges,
                  T_hist,
                  color='blue',
                  alpha=.5,
                  width=width,
                  label='Original data',
                  zorder=1)
        pylab.bar(edges,
                  S_hist,
                  color='red',
                  alpha=.5,
                  width=width,
                  label='Predictive samples',
                  zorder=2)

        # plot actual pdf of support given data params
        pylab.scatter(discrete_support,
                      numpy.exp(lpdf),
                      c="blue",
                      edgecolor="none",
                      s=100,
                      label="true pdf",
                      alpha=1,
                      zorder=3)

        # plot predictive probability of support points
        pylab.scatter(discrete_support,
                      numpy.exp(probabilities),
                      c="red",
                      edgecolor="none",
                      s=100,
                      label="predictive probability",
                      alpha=1,
                      zorder=4)

        pylab.legend()

        ylimits = pylab.gca().get_ylim()
        pylab.ylim([0, ylimits[1]])

        title_string = "%i samples drawn from %i %s components: \ninference after 200 crosscat transitions\n%s test: p = %f" \
            % (N, num_clusters, component_model_type.cctype, test_str, round(p,4))

        pylab.title(title_string, fontsize=12)

        filename = component_model_type.model_type + "_mixture.png"
        pylab.savefig(filename)
        pylab.close()

    return p
Ejemplo n.º 60
0
    y = binom.rvs(n, p, size=s_size)
    samples.append([y, i, "binomial"])
for i in range(3 * n_of_samples, 4 * n_of_samples):
    y = geom.rvs(p, size=s_size)
    samples.append([y, i, "geometric"])
for i in range(4 * n_of_samples, 5 * n_of_samples):
    y = poisson.rvs(n, size=s_size)
    samples.append([y, i, "poisson"])
outlier_1 = beta.rvs(1, 10, size=1000)
outlier_2 = chi2.rvs(n, size=1000)
samples.append([outlier_1, 5 * n_of_samples, "beta"])
samples.append([outlier_2, 5 * n_of_samples + 1, "chi_square"])

for i in range(len(samples)):
    for j in range(i + 1, len(samples)):  # i + 1 skips comparing a sample with itself
        ks_test_pvalue = ks_2samp(samples[i][0], samples[j][0])[1]
        epps_singleton_pvalue = epps_singleton_2samp(samples[i][0],
                                                     samples[j][0])[1]

        if ks_test_pvalue > 0.05:
            G.add_edge(i, j, weight=0.01 / ks_test_pvalue)  # 0.01 scaling factor
        if epps_singleton_pvalue > 0.05:
            H.add_edge(i, j, weight=0.01 / epps_singleton_pvalue)  # 0.01 scaling factor

# Testing whether two samples are generated by the same underlying distribution is a classical question in statistics. A widely used test is the Kolmogorov-Smirnov (KS) test, which relies on the empirical distribution function. Epps and Singleton introduced a test based on the empirical characteristic function.
#
# One advantage of the ES test over the KS test is that it does not assume a continuous distribution. The authors conclude that the test also has higher power than the KS test in many examples. They recommend the ES test for discrete samples as well as for continuous samples with at least 25 observations each.
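
# A minimal sketch of the comparison above (not from the original notebook):
# both tests applied to two small discrete samples; the ES test remains valid
# even though the data are discrete.

from scipy.stats import binom, ks_2samp, epps_singleton_2samp

a = binom.rvs(10, 0.5, size=50, random_state=1)
b = binom.rvs(10, 0.6, size=50, random_state=2)
print("KS p-value:", ks_2samp(a, b)[1])
print("ES p-value:", epps_singleton_2samp(a, b)[1])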

# In[2]: