Beispiel #1
0
def pdf_plot(fig2, indata, i, xbrs, ybrs, munits):
	'''Plot a kernel density estimate of ``indata`` with pyplot and fit a log-normal.

	The density comes from ``pdf`` in the "Statistics for Python" package
	(imported here as ``statistics``; NOTE(review): this is not the standard
	library module of the same name) and is drawn with ``matplotlib.pyplot``.

	http://bonsai.hgc.jp/~mdehoon/software/python/

	http://matplotlib.org/

	Parameters
	----------
	fig2 : unused; kept so existing callers keep working.
	indata : data passed to ``statistics.pdf`` and ``scipy.stats.lognorm.fit``.
	i : curve index. ``i > 10`` draws a thick black line titled '1961-1990';
		any other value draws with ``colours[i]`` and the title '2061-2090'.
	xbrs, ybrs : two-element sequences holding the x and y axis limits.
	munits : unused; kept so existing callers keep working.

	Returns
	-------
	The ``(shape, loc, scale)`` tuple from
	``scipy.stats.lognorm.fit(indata, floc=0)``. Earlier versions stored these
	values in local arrays that were discarded (and indexed them with ``i``,
	which failed for ``i >= 12``) and returned ``None``.

	NOTE(review): the plotted curve uses the default kernel of
	``statistics.pdf``. The original summary called it a Gaussian pdf, but the
	Gaussian-kernel estimate was computed and never drawn -- confirm which one
	is wanted.
	'''

	import statistics as st
	import matplotlib.pyplot as plt
	from scipy import stats

	colours = ['Teal', 'MediumBlue', 'DodgerBlue', 'DarkTurquoise', 'Chartreuse', 'Green', 'Yellow', 'Red', 'Orange', 'Chocolate', 'DarkRed', 'Black']

	# Density estimate; statistics.pdf returns the ordinates first, then the abscissae.
	y0, x0 = st.pdf(indata)
	if i > 10:
		plt.plot(x0, y0, color='black', linewidth=2.5)
		plt.title('1961-1990', fontsize=14)
	else:
		plt.plot(x0, y0, colours[i])
		plt.title('2061-2090', fontsize=14)

	# Axis limits and tick density.
	plt.ylim((ybrs[0], ybrs[1]))
	plt.xlim((xbrs[0], xbrs[1]))
	plt.locator_params(nbins=7)
	# Default font size (an rc setting, so it applies to text created from here on).
	plt.rc('font', size=12)

	# Log-normal fit with the location pinned at zero; handed back to the
	# caller instead of being written into throw-away arrays.
	return stats.lognorm.fit(indata, floc=0)
Beispiel #2
0
    def kl_divergence_coalescent_waiting_times(allele_waiting_time_dist,
                                               haploid_pop_size):
        r"""
        Return a KL-divergence score between observed coalescent waiting
        times and the Kingman coalescent.

        `allele_waiting_time_dist` is a dictionary with number of alleles as
        keys and a list of waiting times associated with that number of
        alleles as values. `haploid_pop_size` is the population size in terms
        of total numbers of genes.

        D_{\mathrm{KL}}(P\|Q) = \sum_i P(i) \log \frac{P(i)}{Q(i)}.

        For every allele count `k`, `p` is `binomial_coefficient(k, 2) /
        haploid_pop_size` and `q` is a Gaussian-kernel density estimate built
        from the waiting times for `k`. The term `p * log(p / q)` is added
        once per waiting time. A zero `q` is replaced by 1e-100 to avoid a
        division by zero.

        NOTE(review): `math.log` raises ValueError when `p / q` is not
        positive, which presumably happens for `k < 2` -- confirm that callers
        never pass such keys.
        """
        d_kl = 0.0
        for k, wts in allele_waiting_time_dist.items():
            p = float(probability.binomial_coefficient(k,
                                                       2)) / haploid_pop_size
            if len(wts) == 0:
                # Nothing to accumulate, and no density can be estimated.
                continue
            # Kernel names accepted by de_hoon_lib.pdf:
            #   'E'/'Epanechnikov' (default), 'U'/'Uniform', 'T'/'Triangle',
            #   'G'/'Gaussian', 'B'/'Biweight' (quartic), '3'/'Triweight',
            #   'C'/'Cosine'
            #
            # The estimate does not depend on the individual waiting time, so
            # it is computed once per allele count instead of once per sample.
            # NOTE(review): the density is evaluated at x=[k] (the allele
            # count), not at each waiting time; the original loop variable was
            # never used. Confirm whether evaluating at the waiting time was
            # intended.
            q = de_hoon_lib.pdf(wts, [k], kernel='Gaussian')
            if q == 0:
                q = 1e-100
            term = p * math.log(p / q)
            # Added once per waiting time, exactly as the per-sample loop did,
            # so the floating-point accumulation order is unchanged.
            for _ in wts:
                d_kl += term
        return d_kl
Beispiel #3
0
    def kl_divergence_coalescent_waiting_times(allele_waiting_time_dist, haploid_pop_size):
        r"""
        Return a KL-divergence score between observed coalescent waiting
        times and the Kingman coalescent.

        `allele_waiting_time_dist` is a dictionary with number of alleles as
        keys and a list of waiting times associated with that number of
        alleles as values. `haploid_pop_size` is the population size in terms
        of total numbers of genes.

        D_{\mathrm{KL}}(P\|Q) = \sum_i P(i) \log \frac{P(i)}{Q(i)}.

        For every allele count `k`, `p` is `binomial_coefficient(k, 2) /
        haploid_pop_size` and `q` is a Gaussian-kernel density estimate built
        from the waiting times for `k`. The term `p * log(p / q)` is added
        once per waiting time. A zero `q` is replaced by 1e-100 to avoid a
        division by zero.

        NOTE(review): `math.log` raises ValueError when `p / q` is not
        positive, which presumably happens for `k < 2` -- confirm that callers
        never pass such keys.
        """
        d_kl = 0.0
        for k, wts in allele_waiting_time_dist.items():
            p = float(probability.binomial_coefficient(k, 2)) / haploid_pop_size
            if len(wts) == 0:
                # Nothing to accumulate, and no density can be estimated.
                continue
            # Kernel names accepted by de_hoon_lib.pdf:
            #   'E'/'Epanechnikov' (default), 'U'/'Uniform', 'T'/'Triangle',
            #   'G'/'Gaussian', 'B'/'Biweight' (quartic), '3'/'Triweight',
            #   'C'/'Cosine'
            #
            # The estimate does not depend on the individual waiting time, so
            # it is computed once per allele count instead of once per sample.
            # NOTE(review): the density is evaluated at x=[k] (the allele
            # count), not at each waiting time; the original loop variable was
            # never used. Confirm whether evaluating at the waiting time was
            # intended.
            q = de_hoon_lib.pdf(wts, [k], kernel='Gaussian')
            if q == 0:
                q = 1e-100
            term = p * math.log(p / q)
            # Added once per waiting time, exactly as the per-sample loop did,
            # so the floating-point accumulation order is unchanged.
            for _ in wts:
                d_kl += term
        return d_kl
Beispiel #4
0
def plothist(x, bin=None, nbins=None, xrange=None, yrange=None, min=None,
			max=None, overplot=False, color='black', xlog=False, ylog=False,
			nan=False, weights=None, norm=False, kernel=None, retpoints=False,
			adaptive=False, adaptive_thresh=30, adaptive_depth=[2,10],
			weight_norm=False, apply_func=None, **kw):
	"""
	Plot the 1D histogram
	Example:
	>> plothist(dat, bin=0.1, min=0, max=3)

	Keyword parameters:
	------------------
	bin
		the binsize(float)
	nbins
		number of bins(integer)
		It cannot be specified together with the bin= parameter
		(a RuntimeWarning is issued if both are given)
	xrange, yrange
		accepted but not used by this function
	xlog, ylog
		log the appropriate axis
	color
		colour passed on to the plotting routine
	weights
		the 1-D array of weights used in the histogram creation.
		A scalar is applied as a multiplier to the unweighted counts
	nan
		boolean flag to ignore nan's
	norm
		boolean flag to normalize the histogram by the peak value
	min,max
		range of data for which the histogram is constructed
		(default: the minimum and maximum of the data)
	kernel
		if not None, the name of the kernel passed to statistics.pdf;
		a kernel-smoothed curve with half-width bin/2 is drawn instead
		of a stepped histogram
	retpoints
		boolean parameter controlling whether to return or not the
		computed histogram.
		If yes the tuple with two arrays (bin centers, Number of points in bins) 
		is returned. With kernel= set, the tuple is
		(evaluation grid, smoothed counts). In both cases the values are
		those computed before norm= and apply_func= are applied
	overplot
		boolean parameter for overplotting 
	adaptive
		boolean for turning on/off the adaptive regime of
		histogramming (adaptive bin size). 
		If True weights, nbins, bin,kernel parameters are ignored
	adaptive_thresh
		the limiting number of points in the bin for the adaptive 
		histogramming (default 30)
	adaptive_depth
		the list of two integers for the detalisation levels of 
		adaptive histogramming (default [2,10]) 
	weight_norm
		if True the value in each bin is mean weight of points within
		the bin
	apply_func
		optional callable f(x_values, y_values) whose result replaces
		the y values right before plotting
	**kw
		passed on to plot/oplot; 'ps', 'xr' and 'yr' get defaults here
	"""
	if nan:
		# Keep only the entries where the data (and the weights) are finite.
		ind = numpy.isfinite(x)
		if weights is not None:
			ind = numpy.isfinite(weights)&ind
		dat = x[ind]
		if weights is not None:
			weights =weights[ind]
	else:
		dat = x
	maxNone = False
	if min is None:
		min = numpy.min(dat)
	if max is None:
		maxNone = True
		max = numpy.max(dat)
	
	if bin is None and nbins is None:
		nbins = 100
		bin = (max - min) * 1. / nbins
	elif nbins is None:
		nbins = int(math.ceil((max - min) * 1. / bin))
		if maxNone:
			# Stretch an automatically chosen upper edge to a whole number of bins.
			max = min + nbins * bin
	elif bin is None:
		bin = (max - min) * 1. / nbins
	else:
		warnings.warn("both bin= and nbins= keywords were specified in the plothist call",RuntimeWarning)
		# if both nbins and bin are defined I don't do anything 
		# it may be non-intuitive if kernel option is used, because
		# it uses both nbins and bin options
	if kernel is None:
		if not adaptive:
			if not np.isscalar(weights):
				hh, loc = numpy.histogram(dat, range=(min, max), bins=nbins, weights=weights)
			else:
				hh, loc = numpy.histogram(dat, range=(min, max), bins=nbins)
				hh = hh * weights

			if weight_norm:
				# Divide the summed weights by the raw counts to get the mean weight per bin.
				hh1, loc = numpy.histogram(dat, range=(min, max), bins=nbins, weights=None)	
				hh = hh*1./hh1
		else:
			import adabinner
			hh, loc = adabinner.hist(dat, xmin=min, xmax=max, hi=adaptive_depth,
						thresh=adaptive_thresh)
		
		# Duplicate every value/edge so the curve is drawn as steps.
		hh1 = np.repeat(hh,2)
		loc1 = np.concatenate(([loc[0]],np.repeat(loc[1:-1],2),[loc[-1]]))
	else:
		loc1=numpy.linspace(min,max,nbins*5)
		import statistics
		if weights is not None:
			hh1 = statistics.pdf( dat, loc1, h=bin/2.,kernel=kernel,weight=weights)*bin*len(dat)
		else:
			hh1 = statistics.pdf( dat, loc1, h=bin/2.,kernel=kernel)*bin*len(dat)
		# Remember the smoothed counts for retpoints=True; the histogram
		# variables (loc, hh) do not exist on this path.
		kernel_counts = hh1

	if overplot:
		func = oplot 
	else:
		func = plot
	if norm:
		hh1=hh1*1./hh1.max()
	kw['ps'] = kw.get('ps') or 0
	# NOTE(review): the default y-range is taken before apply_func runs, so it
	# describes the untransformed values -- confirm this is intended.
	if 'yr' not in kw:
		kw['yr']=[hh1.min(),hh1.max()]
	if 'xr' not in kw:
		kw['xr']=[min,max]
	if apply_func is not None:
		hh1 = apply_func (loc1,hh1)
	func(loc1, hh1, color=color,
		xlog=xlog, ylog=ylog, **kw)
	if retpoints:
		if kernel is None:
			return 0.5*(loc[1:]+loc[:-1]),hh
		# Kernel mode used to fail here with a NameError on loc/hh.
		return loc1,kernel_counts
Beispiel #5
0
 def restrict(self,x,grid,domain):
     """Evaluate the kernel density estimate of ``x`` between the grid nodes.

     The evaluation points are the midpoints of consecutive ``grid`` entries,
     taken as the interior of the edge array built from ``domain[0]``, those
     midpoints and ``domain[-1]``. The bandwidth is ``self.h``.

     NOTE(review): ``weight`` is not a parameter or a local of this method; it
     must be supplied by an enclosing/module scope -- confirm it exists.
     NOTE(review): the original comment described this as estimating a
     cumulative density, but the call made is ``statistics.pdf``.
     """
     midpoints = (grid[1:]+grid[:-1])/2.
     cell_edges = scipy.r_[domain[0],midpoints,domain[-1]]
     # Only the interior edges are used as evaluation points.
     return statistics.pdf(x,cell_edges[1:-1],weight=weight,h=self.h)
Beispiel #6
0
 def restrict(self, x, grid, domain):
     """Evaluate the kernel density estimate of ``x`` between the grid nodes.

     The evaluation points are the midpoints of consecutive ``grid`` entries,
     taken as the interior of the edge array built from ``domain[0]``, those
     midpoints and ``domain[-1]``. The bandwidth is ``self.h``.

     NOTE(review): ``weight`` is not a parameter or a local of this method; it
     must be supplied by an enclosing/module scope -- confirm it exists.
     NOTE(review): the original comment described this as estimating a
     cumulative density, but the call made is ``statistics.pdf``.
     """
     midpoints = (grid[1:] + grid[:-1]) / 2.
     cell_edges = scipy.r_[domain[0], midpoints, domain[-1]]
     # Only the interior edges are used as evaluation points.
     return statistics.pdf(x, cell_edges[1:-1], weight=weight, h=self.h)
 cb.set_label('log10(N)')

###################################################################################################

# Ask whether to draw the magnitude-distribution figure; only the exact
# answer 'y' enables it.
# NOTE(review): `mpt`, `stc`, `a` and `mag` are defined outside the visible
# part of the script -- presumably matplotlib.pyplot, a statistics module
# providing pdf(), and the semi-major axis / magnitude arrays; confirm.
answer2=raw_input("Plot the magnitude distribution (Default=no)? (y/n) ")

if answer2=='y':
 # Figure 1, upper panel: hexagonal binning of `mag` against `a`.
 mpt.figure(1,figsize=(11,7),dpi=100)
 mpt.suptitle("Magnitude Distribution")
 mpt.subplot(211)
 mpt.hexbin(a,mag,gridsize=200,bins='log',mincnt=1)
 mpt.xlabel("$a'$ (AU)")
 mpt.ylabel("$Magnitudes$")
 
 # Lower panel: density estimate of `mag` (red line) drawn together with a
 # 200-bin histogram of the same data.
 mpt.subplot(212)
 dist, mag_dist = stc.pdf(mag)
 mpt.plot(mag_dist, dist,'r-',linewidth=2)
 mpt.hist(mag,bins=200,normed=True)
 mpt.xlim(3,20)
 mpt.xlabel("$Magnitudes$")
 mpt.ylabel("$f$")

 # Completeness: vertical dashed markers at magnitudes 17 (blue) and 15.55 (red).
 mpt.plot([17,17],[0,0.5],'b--')
 mpt.plot([15.55,15.55],[0,0.5],'r--')

###################################################################################################

answer3=raw_input("Plot the taxonomic distribution (Default=no)? (y/n) ")

if answer3=='y':
Beispiel #8
0
def show_obs_sbr():
    """Plot confidence-score densities per action type and fit a sparse
    Bayesian regression to the implied probability of being correct.

    Steps, as performed below:

    1. Load the ``_correct_confidence_score_class_<c>.model`` and
       ``_incorrect_confidence_score_class_<c>.model`` files for ``c`` in
       0..6 through ``LetsgoSerializer`` and merge them key by key.
    2. For each of the keys 'yes', 'no', 'bn', 'dp', 'ap', 'tt' and
       'multi2'..'multi5', estimate Gaussian-kernel densities of the correct
       and incorrect scores on a 0..1 grid (step 0.001). The incorrect
       density is scaled by ``len(inco) / len(co)``.
    3. Plot both densities (divided by their common maximum) and the ratio
       ``y_co / (y_co + y_inco)``.
    4. Fit ``SparseBayes`` to that ratio with Gaussian basis functions whose
       width is the smaller of the two kernel bandwidths, store the result
       as ``_calibrated_confidence_score_sbr_<key>.model`` and overlay the
       fitted curve.
    5. Save the figure as ``<title>.png``.

    Takes no arguments and returns nothing; diagnostics are printed.

    If the fit raises ``RuntimeError`` the error is printed and the figure is
    saved without the fitted curve. Earlier versions went on to use fit
    results that were undefined, or left over from the previous key. The
    figure is cleared after every key, including on failure, so curves from a
    failed key never end up in the next figure.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from copy import deepcopy
    import statistics
    import LetsgoSerializer as ls
    from SparseBayes import SparseBayes

    def report(label, value):
        # A single-argument print(...) produces the same text under Python 2
        # and Python 3; labels end with a space, so two spaces precede the
        # value, exactly as the two-item print statement did.
        print('%s %s' % (label, value))

    def dist_squared(X, Y):
        # Pairwise squared Euclidean distances between the rows of X and Y,
        # expanded as |x|^2 + |y|^2 - 2 x.y.
        nx = X.shape[0]
        ny = Y.shape[0]
        return np.dot(np.atleast_2d(np.sum((X**2), 1)).T, np.ones((1, ny))) + \
            np.dot(np.ones((nx, 1)), np.atleast_2d(np.sum((Y**2), 1))) - 2*np.dot(X, Y.T)

    def basis_func(X, basisWidth):
        # Gaussian basis matrix with a centre at every row of X.
        C = X.copy()
        return np.exp(-dist_squared(X, C)/(basisWidth**2))

    def basis_vector(X, x, basisWidth):
        # Gaussian basis values of the points x with respect to the centres X.
        return np.exp(-dist_squared(x, X)/(basisWidth**2))

    # Merge the per-class score lists into one dictionary per outcome.
    total_co_cs = None
    total_inco_cs = None
    for c in range(7):
        co_cs = ls.load_model('_correct_confidence_score_class_%d.model'%c)
        inco_cs = ls.load_model('_incorrect_confidence_score_class_%d.model'%c)

        if total_co_cs is None:
            total_co_cs = deepcopy(co_cs)
            total_inco_cs = deepcopy(inco_cs)
        else:
            for k in co_cs.keys():
                total_co_cs[k].extend(co_cs[k])
                total_inco_cs[k].extend(inco_cs[k])

    title = {'multi':'Total of multiple actions',
             'multi2': 'Two actions',
             'multi3': 'Three actions',
             'multi4': 'Four actions',
             'multi5': 'Five actions',
             'total': 'Global',
             'yes': 'Affirm',
             'no': 'Deny',
             'bn': 'Bus number',
             'dp': 'Departure place',
             'ap': 'Arrival place',
             'tt': 'Travel time',
             'single': 'Total of single actions'
             }
    plotted_keys = ['yes','no','bn','dp','ap','tt','multi2','multi3','multi4','multi5']
    for k in total_co_cs.keys():
        if k not in plotted_keys:
            continue
        co = total_co_cs[k]
        inco = total_inco_cs[k]

        report('length of correct: ', len(co))
        report('length of incorrect: ', len(inco))

        try:
            x_co = np.arange(0,1.001,0.001)
            x_inco = np.arange(0,1.001,0.001)
            h_co = statistics.bandwidth(np.array(co),weight=None,kernel='Gaussian')
            report('bandwidth of correct: ', h_co)
            y_co = statistics.pdf(np.array(co),x=x_co,kernel='Gaussian')
            report('length of correct: ', len(x_co))
            h_inco = statistics.bandwidth(np.array(inco),weight=None,kernel='Gaussian')
            report('bandwidth of incorrect: ', h_inco)
            y_inco = statistics.pdf(np.array(inco),x=x_inco,kernel='Gaussian')
            report('length of incorrect: ', len(x_inco))

            # The small offset keeps the ratio below free of 0/0; the
            # incorrect density is scaled by the relative sample size.
            y_co += 1e-10
            y_inco = y_inco*(float(len(inco))/len(co)) + 1e-10

            y_co_max = np.max(y_co)
            report('max of correct: ', y_co_max)
            y_inco_max = np.max(y_inco)
            report('max of incorrect: ', y_inco_max)
            y_max = max([y_co_max,y_inco_max])
            report('max of total: ', y_max)
            plt.plot(x_co,y_co/y_max,'g.-',alpha=0.75)
            plt.plot(x_inco,y_inco/y_max,'r.-',alpha=0.75)
            print(x_co)
            print(x_inco)
            y = y_co/(y_co + y_inco)
            plt.plot(x_co,y,'b--',alpha=0.75)

            m = SparseBayes()
            X = np.atleast_2d(x_co).T
            Y = np.atleast_2d(y).T
            basisWidth = min([h_co,h_inco])
            BASIS = basis_func(X,basisWidth)

            fitted = False
            try:
                Relevant,Mu,Alpha,beta,update_count,add_count,delete_count,full_count = \
                    m.learn(X,Y,lambda x: basis_func(x,basisWidth))
                # The fit itself succeeded; a failure while storing it still
                # leaves a usable curve to draw.
                fitted = True
                ls.store_model({'data_points':X[Relevant],'weights':Mu,'basis_width':basisWidth},
                               '_calibrated_confidence_score_sbr_%s.model'%k)
            except RuntimeError as err:
                print(err)

            if fitted:
                Yh = np.dot(BASIS[:,Relevant],Mu)
                residual = Yh - Y
                ED = np.dot(residual.T,residual)

                print('ED: %f'%ED)

                # Fitted value at a confidence score of 0.5.
                print(np.dot(basis_vector(X[Relevant],np.ones((1,1))/2,basisWidth),Mu))

                plt.plot(X.ravel(),Yh.ravel(),'yo-',alpha=0.75)

            plt.xlabel('Confidence Score')
            plt.ylabel('Count')
            plt.title(title[k])
            plt.grid(True)
            plt.savefig(title[k]+'.png')
        except (ValueError,RuntimeError) as err:
            print(err)
        finally:
            # Always start the next key from an empty figure.
            plt.clf()