def main():
    """Make a filled triangle plot comparing the 'Forecast' chain against the
    'PSF contamination' chain and save it under ``<args.out>/plots/``.

    Relies on module-level helpers defined elsewhere in this file:
    ``parse_args()`` (CLI options) and ``get_nsample(path)`` (number of
    rows to keep from the end of a chain file), plus the ``os`` module.
    """
    from getdist import plots, MCSamples
    import numpy as np

    args = parse_args()
    # Output directory: <out>/plots, created on demand.
    out = os.path.join(os.path.expanduser(args.out), 'plots')
    if not os.path.isdir(out):
        os.makedirs(out)

    allnames = np.array(['Om', 'h0', 'Ob', 'ns', 'a_s', 'Onuh2',
                         'b1', 'b2', 'b3', 'b4', 'b5',
                         'm1', 'm2', 'm3', 'm4', 'ia_a', 'ia_alpha',
                         'wpz_b1', 'wpz_b2', 'wpz_b3', 'wpz_b4',
                         'lpz_b1', 'lpz_bin2', 'lpz_bin3', 'lpz_bin4', 'lpz_bin5',
                         's8', 'like', 'post', 'weight'])
    alllabels = np.array(['\Omega_m', 'h', '\Omega_b', 'n_s', 'a_s', r'\Omega_{\nu}',
                          'b1', 'b2', 'b3', 'b4', 'b5',
                          'm1', 'm2', 'm3', 'm4', 'ia_a', 'ia_alpha',
                          'wpz_b1', 'wpz_b2', 'wpz_b3', 'wpz_b4',
                          'lpz_b1', 'lpz_bin2', 'lpz_bin3', 'lpz_bin4', 'lpz_bin5',
                          r'\sigma_{8}', 'like', 'post', 'weight'])

    # Columns kept for plotting: the six cosmological parameters plus
    # sigma_8 (4th column from the end of the chain).
    useindex = [0, 1, 2, 3, 4, 5, -4]
    usednames = allnames[useindex]
    usedlabels = alllabels[useindex]

    def load_chain(path, label):
        """Load the last get_nsample(path) rows of a chain file into an
        MCSamples object.  Chain layout (assumed from the column handling
        here): weights in the last column, posterior in the second-to-last.
        10% burn-in is removed."""
        nkeep = get_nsample(path)
        table = np.loadtxt(path)[-nkeep:, :]
        s = MCSamples(samples=table[:, useindex],
                      names=usednames,
                      labels=usedlabels,
                      weights=table[:, -1],
                      loglikes=table[:, -2],
                      label=label)
        s.removeBurn(remove=0.1)
        return s

    samples = load_chain(args.samplesfile_forecast, 'Forecast')
    samples_cont = load_chain(args.samplesfile_contaminated, 'PSF contamination')

    g = plots.getSubplotPlotter()
    g.triangle_plot([samples, samples_cont], filled_compare=True,
                    contour_colors=['green', 'darkblue'])
    # BUG FIX: the figure was previously exported to the current working
    # directory even though the <out>/plots directory was created above;
    # write it to the intended output directory instead.
    g.export(os.path.join(out, "getdistplot.png"))
class MCEvidence(object):
    """Marginal likelihood (Bayesian evidence) estimation from MCMC chains
    using the k-nearest-neighbour algorithm of Heavens et al. (2017)."""

    def __init__(self, method, ischain=True, isfunc=None,
                 thinlen=0.0, burnlen=0.0,
                 ndim=None, kmax=5,
                 priorvolume=1, debug=False,
                 nsample=None,
                 nbatch=1,
                 brange=None,
                 bscale='',
                 verbose=1, args={},
                 **gdkwargs):
        """Evidence estimation from MCMC chains

        :param method: chain name (str) or array (np.ndarray) or python class.
                If string or numpy array, it is interpreted as an MCMC chain.
                Otherwise, it is interpreted as a python class with at least
                a single method sampler and will be used to generate a chain.
        :param ischain (bool): True indicates the passed method is to be
                interpreted as a chain.  This is important as a string name
                can be passed to refer to either a class or a chain name.
        :param nbatch (int): the number of batches to divide the chain
                (default=1).  The evidence can be estimated by dividing the
                whole chain into n batches.  When nbatch>1, the batch range
                (brange) and batch scaling (bscale) should also be set.
        :param brange (int or list): the minimum and maximum size of batches
                in linear or log10 scale, e.g. [3,4] with bscale='logpower'
                means batch sizes between 10^3 and 10^4.  The range is
                divided nbatch times.
        :param bscale (str): the scaling of batch size.  Allowed values are
                'logpower', 'linear', 'constant'.
        :param kmax (int): kth-nearest-neighbours, with k between 1 and kmax-1.
        :param args (dict): arguments passed to method (class case only).
        :param gdkwargs (dict): arguments to be passed to getdist.
        :param verbose: chattiness of the run.
        """
        self.verbose = verbose
        if debug or verbose > 1:
            logging.basicConfig(level=logging.DEBUG)
        if verbose == 0:
            logging.basicConfig(level=logging.WARNING)
        self.logger = logging.getLogger(__name__)
        self.info = {}
        #
        self.nbatch = nbatch
        self.brange = brange  # todo: check for [N]
        # An integer brange means a single fixed batch size.
        self.bscale = bscale if not isinstance(self.brange, int) else 'constant'

        # The arrays of powers and nchain record the number of samples
        # that will be analysed at each iteration; idbatch is just an index.
        self.idbatch = np.arange(self.nbatch, dtype=int)
        self.powers = np.zeros(self.nbatch)
        self.bsize = np.zeros(self.nbatch, dtype=int)
        self.nchain = np.zeros(self.nbatch, dtype=int)
        #
        self.kmax = max(2, kmax)
        self.priorvolume = priorvolume
        #
        self.ischain = ischain
        self.fname = None
        #
        if ischain:
            if isinstance(method, str):
                self.fname = method
                # BUG FIX: use a lazy %-style logging argument; passing the
                # value as a second positional "message" breaks formatting.
                self.logger.debug('Using chains: %s', method)
            else:
                self.logger.debug('dictionary of samples and loglike array passed')
        else:  # python class which includes a method called sampler
            self.nsample = 100000 if nsample is None else nsample
            # given a class name, get an instance
            if isinstance(method, str):
                XClass = getattr(sys.modules[__name__], method)
            else:
                XClass = method

            if hasattr(XClass, '__class__'):
                self.logger.debug(__name__ + ': method is an instance of a class')
                self.method = XClass
            else:
                self.logger.debug(__name__ + ': method is class variable .. instantiating class')
                self.method = XClass(*args)
                # if passed class has some info, display it (best effort only)
                try:
                    print()
                    msg = self.method.info()
                    print()
                except Exception:
                    pass
            # Now generate samples.
            # Output should be dict - {'chains':,'logprob':,'weight':}
            # BUG FIX: the attribute is self.nsample; the original referenced
            # the nonexistent self.nsamples and raised AttributeError here.
            method = self.method.Sampler(nsamples=self.nsample)

        # ======== By this line we expect only chains either in file or dict ====
        self.gd = MCSamples(method, debug=verbose > 1, **gdkwargs)

        if burnlen > 0:
            _ = self.gd.removeBurn(remove=burnlen)
        if thinlen > 0:
            if thinlen < 1:
                # a fractional thin length is interpreted as Poisson thinning
                self.logger.info('calling poisson_thin ..')
                _ = self.gd.thin_poisson(thinlen)
            else:
                _ = self.gd.thin(nthin=thinlen)
        if isfunc:
            # importance sample with an externally supplied log-likelihood
            self.gd.importance_sample(isfunc)

        self.info['NparamsMC'] = self.gd.nparamMC
        self.info['Nsamples_read'] = self.gd.get_shape()[0]
        self.info['Nparams_read'] = self.gd.get_shape()[1]

        # sample count after burn-in and thinning
        self.nsample = self.gd.get_shape()[0]
        if ndim is None:
            ndim = self.gd.nparamMC
        self.ndim = ndim
        #
        self.info['NparamsCosmo'] = self.ndim
        self.info['Nsamples'] = self.nsample
        #
        self.logger.info('chain array dimensions: %s x %s =' % (self.nsample, self.ndim))
        #
        self.set_batch()

    def summary(self):
        """Print a short human-readable summary of the analysis set-up."""
        print()
        print('ndim={}'.format(self.ndim))
        print('nsample={}'.format(self.nsample))
        print('kmax={}'.format(self.kmax))
        print('brange={}'.format(self.brange))
        # BUG FIX: the '{}' placeholder was missing, so bsize never printed.
        print('bsize={}'.format(self.bsize))
        print('powers={}'.format(self.powers))
        print('nchain={}'.format(self.nchain))
        print()

    def get_batch_range(self):
        """Return ``(powmin, powmax)`` bounds of the batch range, or
        ``(None, None)`` when no batch range was configured.

        :raises ValueError: if nbatch>1 while the batch range is degenerate.
        """
        if self.brange is None:
            powmin, powmax = None, None
        else:
            powmin = np.array(self.brange).min()
            powmax = np.array(self.brange).max()
            if powmin == powmax and self.nbatch > 1:
                self.logger.error('nbatch>1 but batch range is set to zero.')
                # BUG FIX: a bare `raise` with no active exception is a
                # RuntimeError; raise an explicit, meaningful error instead.
                raise ValueError('nbatch>1 but batch range is set to zero.')
        return powmin, powmax

    def set_batch(self, bscale=None):
        """Populate ``self.bsize``, ``self.powers`` and ``self.nchain``
        according to the batch scaling scheme ('logpower', 'linear', or
        constant)."""
        if bscale is None:
            bscale = self.bscale
        else:
            self.bscale = bscale
        #
        if self.brange is None:
            self.bsize = self.brange  # check
            powmin, powmax = None, None
            self.nchain[0] = self.nsample
            self.powers[0] = np.log10(self.nsample)
        else:
            if bscale == 'logpower':
                powmin, powmax = self.get_batch_range()
                self.powers = np.linspace(powmin, powmax, self.nbatch)
                self.bsize = np.array([int(pow(10.0, x)) for x in self.powers])
                self.nchain = self.bsize
            elif bscale == 'linear':
                powmin, powmax = self.get_batch_range()
                # BUG FIX: np.int was removed in NumPy>=1.24; use builtin int.
                self.bsize = np.linspace(powmin, powmax, self.nbatch, dtype=int)
                # BUG FIX: powers were previously derived from the stale
                # self.nchain (all zeros on first call -> log10(0)) with an
                # unqualified log10; derive them from the new bsize instead.
                self.powers = np.array([int(math.log10(x)) for x in self.bsize])
                self.nchain = self.bsize
            else:  # constant
                self.bsize = self.brange  # check
                self.powers = self.idbatch
                self.nchain = np.array([x for x in self.bsize.cumsum()])

    def get_samples(self, nsamples, istart=0, rand=False):
        """Return ``(samples, loglikes, weights)`` slices of the stored chain.

        :param nsamples: number of rows to return
        :param istart: row index to start from (sequential mode)
        :param rand: draw rows at random (with replacement) instead of
                     sequentially; only effective when batching is active.
        :raises ValueError: if more random samples are requested than exist.
        """
        ntot = self.gd.get_shape()[0]

        if rand and self.brange is not None:
            if nsamples > ntot:
                self.logger.error('nsamples=%s, ntotal_chian=%s' % (nsamples, ntot))
                # BUG FIX: bare `raise` with no active exception.
                raise ValueError('requested more samples than the chain contains')
            idx = np.random.randint(0, high=ntot, size=nsamples)
        else:
            idx = np.arange(istart, nsamples + istart)

        self.logger.info('requested nsamples=%s, ntotal_chian=%s' % (nsamples, ntot))
        s, lnp, w = self.gd.arrays()
        return s[idx, 0:self.ndim], lnp[idx], w[idx]

    def evidence(self, verbose=None, rand=False, info=False,
                 profile=False, pvolume=None, pos_lnp=False,
                 nproc=-1, prewhiten=True):
        '''
        MARGINAL LIKELIHOODS FROM MONTE CARLO MARKOV CHAINS

        algorithm described in Heavens et. al. (2017)

        Parameters
        ---------
        :param verbose - controls the amount of information outputted during run time
        :param rand - randomised sub sampling of the MCMC chains
        :param info - if True information about the analysis will be returd to the caller
        :param pvolume - prior volume
        :param pos_lnp - if input log likelihood is multiplied by negative or not
        :param nproc - determined how many processors the scikit package should use or not
        :param prewhiten - if True chains will be normalised to have unit variance

        Returns
        ---------
        MLE - maximum likelihood estimate of evidence
        self.info (optional) - returned if info=True.  Contains useful
              information about the chain analysed

        Notes
        ---------
        The MCEvidence algorithm is implemented using scikit
        nearest neighbour code.

        Examples
        ---------
        To run the evidence estimation from an ipython terminal or notebook

        >> from MCEvidence import MCEvidence
        >> MLE = MCEvidence('/path/to/chain').evidence()

        To run MCEvidence from shell

        $ python MCEvidence.py </path/to/chain>

        References
        -----------
        .. [1] Heavens etl. al. (2017)
        '''
        if verbose is None:
            verbose = self.verbose

        # get prior volume
        if pvolume is None:
            logPriorVolume = math.log(self.priorvolume)
        else:
            logPriorVolume = math.log(pvolume)

        # BUG FIX: lazy %-style logging argument (the value was passed as a
        # second positional message, which logging cannot format).
        self.logger.debug('log prior volume: %s', logPriorVolume)

        kmax = self.kmax
        ndim = self.ndim

        MLE = np.zeros((self.nbatch, kmax))

        # Loop over different numbers of MCMC samples (=S):
        # NOTE(review): itot is never advanced between batches, so every
        # batch starts at row 0; with the cumulative nchain this yields
        # nested (not disjoint) batches -- confirm this is intended.
        itot = 0
        for ipow, nsample in zip(self.idbatch, self.nchain):
            S = int(nsample)
            volume = np.zeros((S, kmax))

            samples_raw = np.zeros((S, ndim))
            samples_raw_cmc, logL, weight = self.get_samples(S, istart=itot, rand=rand)
            samples_raw[:, 0:ndim] = samples_raw_cmc[:, 0:ndim]

            # We need the logarithm of the likelihood - not the negative log
            if pos_lnp:
                logL = -logL

            # Renormalise loglikelihood (temporarily) to avoid underflows:
            logLmax = np.amax(logL)
            fs = logL - logLmax

            if prewhiten:
                self.logger.info('Prewhitenning chains using sample covariance matrix ..')
                # Covariance matrix of the samples, and eigenvalues (in w)
                # and eigenvectors (in v):
                ChainCov = np.cov(samples_raw.T)
                eigenVal, eigenVec = np.linalg.eig(ChainCov)
                Jacobian = math.sqrt(np.linalg.det(ChainCov))
                # Prewhiten: first diagonalise:
                samples = np.dot(samples_raw, eigenVec)
                # ... and renormalise new parameters to have unit covariance:
                for i in range(ndim):
                    samples[:, i] = samples[:, i] / math.sqrt(eigenVal[i])
            else:
                # no diagonalisation
                Jacobian = 1
                samples = samples_raw

            # Use the sklearn nearest-neighbour routine, which chooses the
            # 'best' algorithm.  This is where the hard work is done:
            nbrs = NearestNeighbors(n_neighbors=kmax + 1,
                                    algorithm='auto', n_jobs=nproc).fit(samples)
            DkNN, indices = nbrs.kneighbors(samples)

            # Create the posterior for 'a' from the distances (volumes) to
            # the nearest neighbour:
            for k in range(1, self.kmax):
                for j in range(0, S):
                    # Use the analytic formula for the volume of an ndim-sphere:
                    volume[j, k] = math.pow(math.pi, ndim / 2) * math.pow(DkNN[j, k], ndim) / sp.gamma(1 + ndim / 2)

                # dotp is the summation term in the notes:
                dotp = np.dot(volume[:, k] / weight[:], np.exp(fs))

                # The MAP value of 'a' is obtained analytically from the
                # expression for the posterior:
                amax = dotp / (S * k + 1.0)

                # Maximum likelihood estimator for the evidence
                SumW = np.sum(self.gd.adjusted_weights)
                print('********sumW=', SumW, np.sum(weight))
                MLE[ipow, k] = math.log(SumW * amax * Jacobian) + logLmax - logPriorVolume

                print('SumW,S,amax,Jacobian,logLmax,logPriorVolume,MLE:',
                      SumW, S, amax, Jacobian, logLmax, logPriorVolume, MLE[ipow, k])
                print('---')

                # Output is: for each sample size (S), the evidence for
                # kmax-1 different values of k.  The values for different k
                # are clearly not independent; if ndim is large, k=1 does best.
                if self.brange is None:
                    if verbose > 1:
                        self.logger.info('k={},nsample={}, dotp={}, median_volume={}, a_max={}, MLE={}'.format(
                            k, S, dotp, statistics.median(volume[:, k]), amax, MLE[ipow, k]))
                else:
                    if verbose > 1:
                        if ipow == 0:
                            # BUG FIX: these logging calls passed values as
                            # extra positional messages; use %-style args.
                            self.logger.info('(iter,mean,min,max) of LogLikelihood: %s %s %s %s',
                                             ipow, fs.mean(), fs.min(), fs.max())
                            self.logger.info('-------------------- useful intermediate parameter values ------- ')
                            self.logger.info('nsample, dotp, median volume, amax, MLE')
                        self.logger.info('%s %s %s %s %s %s',
                                         S, k, dotp, statistics.median(volume[:, k]), amax, MLE[ipow, k])

        # MLE[:,0] is zero - return only from k=1
        if self.brange is None:
            MLE = MLE[0, 1:]
        else:
            MLE = MLE[:, 1:]

        if verbose > 0:
            print('')
            print('MLE[k=(1,2,3,4)] = ', MLE)
            print('')

        if info:
            return MLE, self.info
        else:
            return MLE