def pdf_plot(fig2, indata, i, xbrs, ybrs, munits):
    """Create a Gaussian kernel-density (PDF) plot of any 1-D array.

    If the array has multiple dimensions, flatten it first (see the
    commented-out call below). Relies on de Hoon's "statistics" module and
    matplotlib:
    http://bonsai.hgc.jp/~mdehoon/software/python/
    http://matplotlib.org/
    """
    import statistics as st
    import numpy as np
    import matplotlib.pyplot as plt
    from scipy import stats

    colours = ['Teal', 'MediumBlue', 'DodgerBlue', 'DarkTurquoise',
               'Chartreuse', 'Green', 'Yellow', 'Red', 'Orange',
               'Chocolate', 'DarkRed', 'Black']
    pshape = np.zeros(12)
    ploc = np.zeros(12)
    pscale = np.zeros(12)

    ## flatten array if necessary
    #indata = indata.flatten(1)

    # adjust weight or bandwidth
    #weight = [1] * len(z4)
    #h1 = bandwidth(z4, weight, "Epanechnikov")
    #h2 = bandwidth(z4, weight, "Gaussian")

    # do pdf calculation (de Hoon's pdf returns densities first, then abscissae)
    y0, x0 = st.pdf(indata)
    yg, xg = st.pdf(indata, kernel="Gaussian")  # Gaussian-kernel variant (unused below)
    if i > 10:
        plt.plot(x0, y0, color='black', linewidth=2.5)
        plt.title('1961-1990', fontsize=14)
    else:
        plt.plot(x0, y0, color=colours[i])
        plt.title('2061-2090', fontsize=14)

    # limit axes
    yy1, yy2 = ybrs[0], ybrs[1]
    xx1, xx2 = xbrs[0], xbrs[1]
    plt.ylim((yy1, yy2))
    plt.xlim((xx1, xx2))
    plt.locator_params(nbins=7)

    # set axis label size
    plt.rc('font', size=12)
    #fig2.set_xlabel(munits, fontsize=11)

    x = [5, 25, 50, 75, 95]
    index_pdf = stats.norm.pdf(x)  # standard normal density at these points (unused below)
    # fit a log-normal (location fixed at zero) to the data
    pshape[i], ploc[i], pscale[i] = stats.lognorm.fit(indata, floc=0)
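# Hedged usage sketch for pdf_plot: the synthetic data, axis ranges and units
# below are illustrative only; requires matplotlib and de Hoon's "statistics"
# module on the path. Positive data is used so the lognorm fit (floc=0) is valid.
import numpy as np
import matplotlib.pyplot as plt

fig2 = plt.figure()
indata = np.random.lognormal(mean=2.5, sigma=0.3, size=1000)
for i in [0, 11]:  # one coloured (scenario) curve and one black (baseline) curve
    pdf_plot(fig2, indata + 0.5 * i, i, xbrs=(0, 40), ybrs=(0.0, 0.2), munits='mm/day')
plt.show()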
import math
# `probability` and `de_hoon_lib` (de Hoon's statistics module) are
# project-local imports assumed to be available at module level.

def kl_divergence_coalescent_waiting_times(allele_waiting_time_dist, haploid_pop_size):
    r"""
    `allele_waiting_time_dist` is a dictionary with number of alleles as keys
    and a list of waiting times associated with that number of alleles as
    values. `haploid_pop_size` is the population size in terms of the total
    number of genes. Returns the KL-divergence between the distribution of
    waiting times and the Kingman coalescent distribution:

        D_{\mathrm{KL}}(P\|Q) = \sum_i P(i) \log \frac{P(i)}{Q(i)}.
    """
    d_kl = 0.0
    for k, wts in allele_waiting_time_dist.items():
        p = float(probability.binomial_coefficient(k, 2)) / haploid_pop_size
        for t in wts:
            # de_hoon_lib.pdf kernels: 'E'/'Epanechnikov' (default),
            # 'U'/'Uniform', 'T'/'Triangle', 'G'/'Gaussian',
            # 'B'/'Biweight', '3'/'Triweight', 'C'/'Cosine'.
            # Evaluate the kernel-density estimate of the waiting times at t
            # (assumed: evaluating at the allele count k instead would leave
            # the loop variable t unused).
            q = de_hoon_lib.pdf(wts, [t], kernel='Gaussian')[0]
            if q == 0:
                q = 1e-100
            d_kl += p * math.log(p / q)
    return d_kl
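# Hedged smoke test for the function above: waiting times are simulated
# directly from the Kingman coalescent (exponential with rate C(k,2)/N), so
# the returned divergence should be small. Assumes the project-local
# `probability` and `de_hoon_lib` modules used by the function are importable.
import random

haploid_pop_size = 10000
allele_waiting_time_dist = {}
for k in range(2, 11):
    rate = (k * (k - 1) / 2.0) / haploid_pop_size  # coalescence rate with k alleles
    allele_waiting_time_dist[k] = [random.expovariate(rate) for _ in range(200)]

print(kl_divergence_coalescent_waiting_times(allele_waiting_time_dist, haploid_pop_size))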
import math
import warnings
import numpy
# plot() and oplot() are this plotting module's own backends; the kernel
# branch additionally needs de Hoon's "statistics" module on the path.

def plothist(x, bin=None, nbins=None, xrange=None, yrange=None, min=None,
             max=None, overplot=False, color='black', xlog=False, ylog=False,
             nan=False, weights=None, norm=False, kernel=None, retpoints=False,
             adaptive=False, adaptive_thresh=30, adaptive_depth=[2, 10],
             weight_norm=False, apply_func=None, **kw):
    """ Plot the 1D histogram
        Example:
        >> plothist(dat, bin=0.1, min=0, max=3)

        Keyword parameters:
        ------------------
        bin
            the bin size (float)
        nbins
            number of bins (integer); cannot be specified together with
            the bin= parameter
        xlog, ylog
            log-scale the appropriate axis
        weights
            the 1-D array of weights used in the histogram creation
        nan
            boolean flag to ignore NaNs
        norm
            boolean flag to normalize the histogram by the peak value
        min, max
            range of data for which the histogram is constructed
        retpoints
            boolean parameter controlling whether to return the computed
            histogram; if True, a tuple of two arrays (bin centers,
            number of points in bins) is returned
        overplot
            boolean parameter for overplotting
        adaptive
            boolean for turning on/off the adaptive regime of
            histogramming (adaptive bin size); if True the weights,
            nbins, bin and kernel parameters are ignored
        adaptive_thresh
            the limiting number of points in a bin for adaptive
            histogramming (default 30)
        adaptive_depth
            list of two integers for the refinement levels of adaptive
            histogramming (default [2, 10])
        weight_norm
            if True, the value in each bin is the mean weight of the
            points within the bin
    """
    if nan:
        ind = numpy.isfinite(x)
        if weights is not None:
            ind = numpy.isfinite(weights) & ind
        dat = x[ind]
        if weights is not None:
            weights = weights[ind]
    else:
        dat = x
    maxNone = False
    if min is None:
        min = numpy.min(dat)
    if max is None:
        maxNone = True
        max = numpy.max(dat)

    if bin is None and nbins is None:
        nbins = 100
        bin = (max - min) * 1. / nbins
    elif nbins is None:
        nbins = int(math.ceil((max - min) * 1. / bin))
        if maxNone:
            max = min + nbins * bin
    elif bin is None:
        bin = (max - min) * 1. / nbins
    else:
        warnings.warn("both bin= and nbins= keywords were specified in "
                      "the plothist call", RuntimeWarning)
        pass
        # if both nbins and bin are defined I don't do anything;
        # it may be non-intuitive if the kernel option is used, because
        # that uses both the nbins and bin options
    if kernel is None:
        if not adaptive:
            if not numpy.isscalar(weights):
                hh, loc = numpy.histogram(dat, range=(min, max), bins=nbins,
                                          weights=weights)
            else:
                hh, loc = numpy.histogram(dat, range=(min, max), bins=nbins)
                hh = hh * weights
            if weight_norm:
                hh1, loc = numpy.histogram(dat, range=(min, max), bins=nbins,
                                           weights=None)
                hh = hh * 1. / hh1
        else:
            import adabinner
            hh, loc = adabinner.hist(dat, xmin=min, xmax=max,
                                     hi=adaptive_depth,
                                     thresh=adaptive_thresh)
        # double up the arrays to draw a stepped histogram
        hh1 = numpy.repeat(hh, 2)
        loc1 = numpy.concatenate(([loc[0]], numpy.repeat(loc[1:-1], 2),
                                  [loc[-1]]))
    else:
        loc1 = numpy.linspace(min, max, nbins * 5)
        import statistics
        if weights is not None:
            hh1 = statistics.pdf(dat, loc1, h=bin / 2., kernel=kernel,
                                 weight=weights) * bin * len(dat)
        else:
            hh1 = statistics.pdf(dat, loc1, h=bin / 2.,
                                 kernel=kernel) * bin * len(dat)
    if overplot:
        func = oplot
    else:
        func = plot
    if norm:
        hh1 = hh1 * 1. / hh1.max()
    kw['ps'] = kw.get('ps') or 0
    if 'yr' not in kw:
        kw['yr'] = [hh1.min(), hh1.max()]
    if 'xr' not in kw:
        kw['xr'] = [min, max]
    if apply_func is not None:
        hh1 = apply_func(loc1, hh1)
    func(loc1, hh1, color=color, xlog=xlog, ylog=ylog, **kw)
    if retpoints:
        # note: uses the histogram bins, so this only applies when kernel is None
        return 0.5 * (loc[1:] + loc[:-1]), hh
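# Hedged usage sketch for plothist: plot()/oplot() are this module's own
# plotting backends, and the kernel branch needs de Hoon's "statistics"
# module; the data below is synthetic and purely illustrative.
import numpy
dat = numpy.random.normal(0.0, 1.0, 10000)
plothist(dat, bin=0.1, min=-4, max=4)                     # stepped histogram
plothist(dat, bin=0.1, min=-4, max=4, kernel='Gaussian',  # smoothed (KDE) version,
         overplot=True, color='red')                      # overplotted on the first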
def restrict(self, x, grid, domain, weight=None):
    # weight: optional per-point weights passed to the KDE (assumed
    # parameter; the name is otherwise undefined in this scope)
    edges = scipy.r_[domain[0], (grid[1:] + grid[:-1]) / 2., domain[-1]]
    # estimate the density at the interior cell edges to keep the
    # restriction conservative
    rho = statistics.pdf(x, edges[1:-1], weight=weight, h=self.h)
    return rho
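# Minimal hypothetical harness for restrict(): the class name and the `h`
# attribute are assumptions for illustration only; restrict() itself needs
# `scipy` and de Hoon's "statistics" module importable where it is defined.
import numpy
import scipy
import statistics

class DensityRestrictor(object):
    def __init__(self, h):
        self.h = h  # KDE bandwidth used by restrict()

DensityRestrictor.restrict = restrict  # attach the function above as a method

r = DensityRestrictor(h=0.2)
x = numpy.random.randn(500)        # sample to be density-estimated
grid = numpy.linspace(-3, 3, 25)   # cell centres
rho = r.restrict(x, grid, domain=(-4.0, 4.0))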
cb.set_label('log10(N)')

###################################################################################################

answer2 = raw_input("Plot the magnitude distribution (Default=no)? (y/n) ")
if answer2 == 'y':
    mpt.figure(1, figsize=(11, 7), dpi=100)
    mpt.suptitle("Magnitude Distribution")
    mpt.subplot(211)
    mpt.hexbin(a, mag, gridsize=200, bins='log', mincnt=1)
    mpt.xlabel("$a'$ (AU)")
    mpt.ylabel("$Magnitudes$")

    mpt.subplot(212)
    dist, mag_dist = stc.pdf(mag)  # kernel-density estimate of the magnitudes
    mpt.plot(mag_dist, dist, 'r-', linewidth=2)
    mpt.hist(mag, bins=200, normed=True)
    mpt.xlim(3, 20)
    mpt.xlabel("$Magnitudes$")
    mpt.ylabel("$f$")
    # Completeness limits:
    mpt.plot([17, 17], [0, 0.5], 'b--')
    mpt.plot([15.55, 15.55], [0, 0.5], 'r--')

###################################################################################################

answer3 = raw_input("Plot the taxonomic distribution (Default=no)? (y/n) ")
if answer3 == 'y':
def show_obs_sbr():
    import numpy as np
    import matplotlib.pyplot as plt
    from copy import deepcopy
    import statistics
    import LetsgoSerializer as ls
    from SparseBayes import SparseBayes

    dimension = 1
#    basisWidth = 0.05
#    basisWidth = basisWidth**(1/dimension)

    def dist_squared(X, Y):
        # squared Euclidean distances between the rows of X and Y
        nx = X.shape[0]
        ny = Y.shape[0]
        return np.dot(np.atleast_2d(np.sum((X**2), 1)).T, np.ones((1, ny))) + \
            np.dot(np.ones((nx, 1)), np.atleast_2d(np.sum((Y**2), 1))) - 2 * np.dot(X, Y.T)

    def basis_func(X, basisWidth):
        # Gaussian basis matrix with centres at the data points themselves
        C = X.copy()
        BASIS = np.exp(-dist_squared(X, C) / (basisWidth**2))
        return BASIS

    def basis_vector(X, x, basisWidth):
        # Gaussian basis responses of a new point x against centres X
        BASIS = np.exp(-dist_squared(x, X) / (basisWidth**2))
        return BASIS

    total_co_cs = None
    total_inco_cs = None
    for c in range(7):
        co_cs = ls.load_model('_correct_confidence_score_class_%d.model' % c)
        inco_cs = ls.load_model('_incorrect_confidence_score_class_%d.model' % c)

        if total_co_cs is None:
            total_co_cs = deepcopy(co_cs)
            total_inco_cs = deepcopy(inco_cs)
        else:
            for k in co_cs.keys():
                total_co_cs[k].extend(co_cs[k])
                total_inco_cs[k].extend(inco_cs[k])

#    plt.subplot(121)
    title = {'multi': 'Total of multiple actions',
             'multi2': 'Two actions',
             'multi3': 'Three actions',
             'multi4': 'Four actions',
             'multi5': 'Five actions',
             'total': 'Global',
             'yes': 'Affirm',
             'no': 'Deny',
             'bn': 'Bus number',
             'dp': 'Departure place',
             'ap': 'Arrival place',
             'tt': 'Travel time',
             'single': 'Total of single actions'}

    for k in total_co_cs.keys():
        if k not in ['yes', 'no', 'bn', 'dp', 'ap', 'tt',
                     'multi2', 'multi3', 'multi4', 'multi5']:
            continue

        co = total_co_cs[k]
        inco = total_inco_cs[k]

        print 'length of correct: ', len(co)
        print 'length of incorrect: ', len(inco)

#        n,bins,patches = plt.hist([co,inco],bins=np.arange(0.0,1.1,0.1),\
#                                  normed=0,color=['green','yellow'],\
#                                  label=['Correct','Incorrect'],alpha=0.75)
        try:
            x_co = np.arange(0, 1.001, 0.001)
            x_inco = np.arange(0, 1.001, 0.001)

            h_co = statistics.bandwidth(np.array(co), weight=None, kernel='Gaussian')
            print 'bandwidth of correct: ', h_co
#            y_co,x_co = statistics.pdf(np.array(co),kernel='Gaussian',n=1000)
            y_co = statistics.pdf(np.array(co), x=x_co, kernel='Gaussian')
            print 'length of correct: ', len(x_co)

            h_inco = statistics.bandwidth(np.array(inco), weight=None, kernel='Gaussian')
            print 'bandwidth of incorrect: ', h_inco
#            y_inco,x_inco = statistics.pdf(np.array(inco),kernel='Gaussian',n=1000)
            y_inco = statistics.pdf(np.array(inco), x=x_inco, kernel='Gaussian')
            print 'length of incorrect: ', len(x_inco)

            y_co += 1e-10
            y_inco = y_inco * (float(len(inco)) / len(co)) + 1e-10

            y_co_max = np.max(y_co)
            print 'max of correct: ', y_co_max
            y_inco_max = np.max(y_inco)
            print 'max of incorrect: ', y_inco_max
            y_max = max([y_co_max, y_inco_max])
            print 'max of total: ', y_max

            plt.plot(x_co, y_co / y_max, 'g.-', alpha=0.75)
            plt.plot(x_inco, y_inco / y_max, 'r.-', alpha=0.75)

            print x_co
            print x_inco

            # posterior probability of correctness given the confidence score
            y = y_co / (y_co + y_inco)
            plt.plot(x_co, y, 'b--', alpha=0.75)

            m = SparseBayes()
            X = np.atleast_2d(x_co).T
            Y = np.atleast_2d(y).T
            basisWidth = min([h_co, h_inco])
            BASIS = basis_func(X, basisWidth)

            try:
                Relevant, Mu, Alpha, beta, update_count, add_count, delete_count, full_count = \
                    m.learn(X, Y, lambda x: basis_func(x, basisWidth))
                ls.store_model({'data_points': X[Relevant], 'weights': Mu,
                                'basis_width': basisWidth},
                               '_calibrated_confidence_score_sbr_%s.model' % k)
            except RuntimeError as e:
                print e

            w_infer = np.zeros((BASIS.shape[1], 1))
            w_infer[Relevant] = Mu

            Yh = np.dot(BASIS[:, Relevant], Mu)
            e = Yh - Y
            ED = np.dot(e.T, e)
            print 'ED: %f' % ED
            print np.dot(basis_vector(X[Relevant], np.ones((1, 1)) / 2, basisWidth), Mu)
            plt.plot(X.ravel(), Yh.ravel(), 'yo-', alpha=0.75)

#            plt.legend(loc='upper center')
            plt.xlabel('Confidence Score')
            plt.ylabel('Count')
            plt.title(title[k])
#            if k == 'multi5':
#                plt.axis([0,1,0,1.2])
#            elif k == 'multi4':
#                plt.axis([0,1,0,10])
            plt.grid(True)
            plt.savefig(title[k] + '.png')
#            plt.show()
            plt.clf()
        except (ValueError, RuntimeError) as e:
            print e