def estep():
    # E-step helper: LP, T, K and dat are expected to come from the enclosing scope.
    # Computes the component log-likelihoods LP and the responsibilities T, and
    # returns the current log-likelihood of the data.
    # Clip the mixture weights away from zero for numerical stability.
    if any(self.param['alpha'] < 1e-20):
        self.param['alpha'][where(self.param['alpha'] < 1e-20)] = 1e-20
        self.param['alpha'] /= sum(self.param['alpha'])
    for k in xrange(K):
        LP[k,:] = self.param['P'][k].loglik(dat) + log(self.param['alpha'][k])
    for k in xrange(K):
        T[k,:] = exp(LP[k,:] - logsumexp(LP, axis=0))
    # return sum((T*LP).flatten())
    return sum(logsumexp(LP, axis=0))
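# A minimal, self-contained sketch (not part of natter) of the log-domain
# normalization used in the E-step above: responsibilities are computed as
# exp(LP - logsumexp(LP, axis=0)), so each column sums to one even when
# exp(LP) itself would underflow. scipy.special.logsumexp stands in for the
# library's own helper here.
import numpy as np
from scipy.special import logsumexp  # scipy.misc.logsumexp in older SciPy versions

def responsibilities_sketch(LP):
    # LP: (K, m) array of log(alpha_k) + log p_k(x_i)
    return np.exp(LP - logsumexp(LP, axis=0))

LP_demo = np.log(np.array([[0.2, 0.5], [0.8, 0.5]])) - 700.0  # exp(LP_demo) underflows to 0
assert np.allclose(responsibilities_sketch(LP_demo).sum(axis=0), 1.0)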
def check_loglik_importance(dic):
    """
    Checks a log-likelihood function via importance sampling.

    dic['nsamples'] samples are drawn from the proposal_high distribution
    (given in the dictionary), whose log-likelihood function is assumed to be
    correct. If the log-likelihood function of the distribution under test
    (dic['dist']) is also correct, the importance sampling estimate of the
    partition function should be close to 1. An absolute error of
    dic['tolerance'] is accepted to pass the test.

    :param dic: dictionary filled with (at least) proposal_high, nsamples, tolerance
    :type dic: dictionary
    """
    dic = fill_dict_with_defaults(dic)
    data = dic['proposal_high'].sample(dic['nsamples'])
    logQ = dic['proposal_high'].loglik(data)
    logP = dic['dist'].loglik(data)
    logZ = logsumexp(logP - logQ) - np.log(dic['nsamples'])
    diff = np.abs(np.exp(logZ) - 1)
    assert diff < dic['tolerance'], \
        "Testing loglik failed for %s, difference is %g, which is bigger than %g, number of samples is: %d" % (
            dic['dist'].name, diff, dic['tolerance'], dic['nsamples'])
def check_sample(dic):
    """
    Checks a sample function via importance sampling.

    dic['nsamples'] samples are drawn from the distribution under test. The
    partition function of the proposal_low distribution is then estimated
    using these samples and the log-likelihood function of the distribution
    under test. If sampling and both log-likelihood functions are correct,
    the importance sampling estimate of the partition function should be
    close to 1. An absolute error of dic['tolerance'] is accepted to pass
    the test.

    :param dic: dictionary filled with (at least) proposal_low, nsamples, tolerance
    :type dic: dictionary
    """
    dic = fill_dict_with_defaults(dic)
    data = dic['dist'].sample(dic['nsamples'])
    logP = dic['proposal_low'].loglik(data)
    logQ = dic['dist'].loglik(data)
    logZ = logsumexp(logP - logQ) - np.log(dic['nsamples'])
    diff = np.abs(np.exp(logZ) - 1)
    assert diff < dic['tolerance'], \
        "Testing sampling failed for %s, difference is %g, which is bigger than %g, number of samples is: %d" % (
            dic['dist'].name, diff, dic['tolerance'], dic['nsamples'])
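# A minimal, self-contained illustration (not natter code) of the identity behind
# both check functions above: for a normalized density p and proposal q,
# E_q[p(x)/q(x)] = 1, so the log of the Monte Carlo average of the importance
# weights should be close to 0. Two scipy.stats normals stand in for the
# distribution under test and the proposal.
import numpy as np
from scipy.special import logsumexp
from scipy.stats import norm

rng = np.random.RandomState(0)
nsamples = 100000
p = norm(loc=0.0, scale=1.0)   # "distribution under test"
q = norm(loc=0.0, scale=2.0)   # broad proposal
x = q.rvs(size=nsamples, random_state=rng)
logZ = logsumexp(p.logpdf(x) - q.logpdf(x)) - np.log(nsamples)
assert abs(np.exp(logZ) - 1.0) < 1e-2  # partition function estimate is close to 1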
def test_loglik(self):
    p1 = Distributions.TruncatedExponentialPower({'a': -1.0, 'b': 2.0, 'p': 1.0, 's': 2.0})
    p2 = Distributions.TruncatedExponentialPower({'a': -1.0, 'b': 2.0, 'p': 1.5, 's': 2.0})
    nsamples = 1000000
    data = p2.sample(nsamples)
    logZ = logsumexp(p1.loglik(data) - p2.loglik(data) - np.log(nsamples))
    print "Estimated partition function: ", np.exp(logZ)
    self.assertTrue(np.abs(np.exp(logZ) - 1.0) < 0.1 * self.TolParam,
                    'Difference in estimated partition function (1.0) greater than ' + str(0.1 * self.TolParam))
def test_loglik(self):
    nsamples = 1000000
    Gauss = Gaussian(n=1, mu=array([0]), sigma=array([[4]]))
    dat = Gauss.sample(nsamples)
    logWeights = self.mog.loglik(dat) - Gauss.loglik(dat)
    Z = logsumexp(logWeights) - log(nsamples)
    print "test_loglik: z: ", exp(Z)
    self.assertTrue(abs(exp(Z) - 1) < 1e-01)
def test_loglik(self):
    p1 = Distributions.Kumaraswamy({'a': 2.0, 'b': 3.0})
    p2 = Distributions.Kumaraswamy({'a': 1.0, 'b': 1.0})
    nsamples = 1000000
    data = p2.sample(nsamples)
    logZ = logsumexp(p1.loglik(data) - p2.loglik(data) - np.log(nsamples))
    print "Estimated partition function: ", np.exp(logZ)
    self.assertTrue(np.abs(np.exp(logZ) - 1.0) < 0.1 * self.TolParam,
                    'Difference in estimated partition function (1.0) greater than ' + str(0.1 * self.TolParam))
def test_loglik(self):
    nsamples = 100000
    q = self.q.copy()
    q['s'] = 2.0 * self.q['s']
    dataImportance = q.sample(nsamples)
    # from matplotlib.pyplot import show
    # self.P.histogram(dataImportance, bins=200)
    # show()
    # raw_input()
    logweights = self.P.loglik(dataImportance) - q.loglik(dataImportance)
    Z = logsumexp(logweights) - log(nsamples)
    err = abs(exp(Z) - 1)
    self.assertTrue(err < 1e-01, 'Estimated partition function deviates from 1.0 by %.4g' % (err,))
def test_loglik(self):
    p1 = self.p
    p2 = self.p.copy()
    p2['mu'] *= 1.1
    nsamples = 1000000
    data = p2.sample(nsamples)
    logZ = logsumexp(p1.loglik(data) - p2.loglik(data) - np.log(nsamples))
    print "Estimated partition function: ", np.exp(logZ)
    self.assertTrue(np.abs(np.exp(logZ) - 1.0) < 0.1 * self.TolParam,
                    'Difference in estimated partition function (1.0) greater than ' + str(0.1 * self.TolParam))
def test_loglik(self):
    p1 = Distributions.Gamma({'u': 2.0, 's': 3.0})
    p2 = Distributions.Gamma({'u': 1.0, 's': 1.0})
    nsamples = 1000000
    data = p2.sample(nsamples)
    logZ = logsumexp(p1.loglik(data) - p2.loglik(data) - np.log(nsamples))
    print "Estimated partition function: ", np.exp(logZ)

    print "Testing log-likelihood of Gamma distribution ... "
    sys.stdout.flush()
    p = Distributions.Gamma({'u': 2.0, 's': 3.0})
    l = p.loglik(self.X)
    for k in range(len(self.LL)):
        self.assertFalse(np.abs(l[k] - self.LL[k]) > self.Tol,
                         'Difference in log-likelihood for Gamma greater than ' + str(self.Tol))
def loglik(self, dat):
    '''
    Computes the loglikelihood of the data points in dat.

    :param dat: Data points for which the loglikelihood will be computed.
    :type dat: natter.DataModule.Data
    :returns: An array containing the loglikelihoods.
    :rtype: numpy.array
    '''
    ret = zeros((self.param['K'], dat.size(1)))
    for k in range(self.param['K']):
        ret[k, :] = log(self.param['pi'][k]) + squeeze(self.__kloglik(dat, k))
    return squeeze(logsumexp(ret, 0))
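# A minimal numpy sketch (independent of natter) of what the mixture loglik above
# computes: log p(x) = logsumexp_k( log pi_k + log p_k(x) ), illustrated with a
# two-component 1-D Gaussian mixture built from scipy.stats and checked against
# the direct (non-log-domain) evaluation.
import numpy as np
from scipy.special import logsumexp
from scipy.stats import norm

def mixture_loglik_sketch(x, pi, mus, sigmas):
    # rows: components, columns: data points
    comp = np.array([np.log(w) + norm(mu, s).logpdf(x)
                     for w, mu, s in zip(pi, mus, sigmas)])
    return logsumexp(comp, axis=0)

x = np.array([-1.0, 0.0, 2.5])
ll = mixture_loglik_sketch(x, pi=[0.3, 0.7], mus=[0.0, 2.0], sigmas=[1.0, 0.5])
direct = np.log(0.3 * norm(0.0, 1.0).pdf(x) + 0.7 * norm(2.0, 0.5).pdf(x))
assert np.allclose(ll, direct)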
def loglik(self, dat):
    '''
    Computes the loglikelihood of the data points in dat.

    :param dat: Data points for which the loglikelihood will be computed.
    :type dat: natter.DataModule.Data
    :returns: An array containing the loglikelihoods.
    :rtype: numpy.array
    '''
    self._checkAlpha()
    n, m = dat.size()
    X = zeros((m, len(self.param['P'])))
    for k, p in enumerate(self.param['P']):
        X[:, k] = p.loglik(dat) + log(self.param['alpha'][k])
    return logsumexp(X, axis=1)
def mixturePosterior(self, dat):
    """
    Returns the posterior p(k|x) over the indicator variable for the mixture
    components given the data points in dat.

    :param dat: data points at which the posterior is computed
    :type dat: natter.DataModule.Data
    :returns: posterior over the mixture components
    :rtype: numpy.ndarray
    """
    n, m = dat.size()
    K = len(self.param['P'])
    T = zeros((K, m))   # alpha(i)*p_i(x|theta) / (sum_j alpha(j)*p_j(x|theta))
    LP = zeros((K, m))  # log-likelihoods of the single mixture components
    for k, p in enumerate(self.param['P']):
        LP[k, :] = p.loglik(dat) + log(self.param['alpha'][k])
    for k in xrange(K):
        T[k, :] = exp(LP[k, :] - logsumexp(LP, axis=0))
    return T
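# A self-contained sketch (not natter code) of what mixturePosterior computes:
# Bayes' rule p(k|x) = alpha_k p_k(x) / sum_j alpha_j p_j(x), evaluated in the
# log domain, followed by a hypothetical hard assignment of each point via argmax.
import numpy as np
from scipy.special import logsumexp
from scipy.stats import norm

alpha = np.array([0.4, 0.6])
components = [norm(0.0, 1.0), norm(3.0, 0.5)]
x = np.array([-0.5, 1.5, 3.2])
LP = np.array([np.log(a) + c.logpdf(x) for a, c in zip(alpha, components)])  # (K, m)
T = np.exp(LP - logsumexp(LP, axis=0))   # posterior over components, columns sum to 1
labels = T.argmax(axis=0)                # hard assignment of each point to a component
assert np.allclose(T.sum(axis=0), 1.0)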
def estimate(self, dat, errTol=1e-4, maxiter=1000):
    '''
    Estimates the parameters from the data in dat. Parameters of the
    distribution can be fitted selectively by setting the primary array
    accordingly (see :doc:`Tutorial on the Distributions module
    <tutorial_Distributions>`).

    The estimation method uses EM to fit the mixture distribution.

    :param dat: Data points on which the Mixture of Gaussians will be estimated.
    :type dat: natter.DataModule.Data
    :param errTol: Stopping criterion for the iteration
    :type errTol: float
    :param maxiter: maximal number of EM iterations
    :type maxiter: int
    '''
    if len(dat.X.shape) == 1:
        print "\tReshaping data to right shape"
        dat.X = reshape(dat.X, (1, dat.X.shape[0]))

    print "\tEstimating Mixture of Gaussians with EM ..."
    K = self.param['K']
    mu = self.param['mu'].copy()
    s = self.param['s'].copy()
    m = dat.size(1)
    p = self.param['pi'].copy()
    X = dat.X
    H = zeros((K, m))
    ALLold = ALL = Inf

    # initialize the means with the averages of K consecutive chunks of the data
    nr = int(floor(m / K))
    for k in range(K):
        mu[k] = mean(X[0, k * nr:(k + 1) * nr + 1])

    for i in range(maxiter):
        ALLold = ALL
        sumH = zeros((1, m))
        # keep mixture weights and standard deviations away from zero
        for j in range(K):
            if p[j] < 1e-3:
                p[j] = 1e-3
            if s[j] < 1e-3:
                s[j] = 1e-3

        # E-Step
        # the next few lines have been transferred to the log-domain for numerical stability
        for k in range(K):
            H[k, :] = log(p[k]) + squeeze(-.5 * log(pi * 2.0) - log(s[k])
                                          - (dat.X - mu[k]) ** 2 / (2.0 * s[k] ** 2.0))
        sumH = logsumexp(H, 0)
        for k in range(K):
            H[k, :] = H[k, :] - sumH
        H = exp(H)  # leave log-domain here
        sumHk = sum(H, 1)

        # M-Step: weighted updates of the primary parameters
        if 'mu' in self.primary:
            mu = squeeze(dot(H, X.T)) / sumHk
            self.param['mu'] = mu
        if 'pi' in self.primary:
            p = squeeze(mean(H, 1))
            self.param['pi'] = p
        if 's' in self.primary:
            for k in range(K):
                s[k] = sqrt(sum(H[k, :] * (X - mu[k]) ** 2) / sumHk[k])
            self.param['s'] = s

        if i >= 2:
            ALL = self.all(dat)
            print "\r\t Mixture Of Gaussians ALL: %.8f [Bits]" % ALL,
            sys.stdout.flush()
            if abs(ALLold - ALL) < errTol:
                break
    print "\t[EM finished]"
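# A compact, self-contained numpy sketch of one EM iteration in the same
# log-domain style as estimate() above, illustrated with two components on
# synthetic 1-D data. It is an illustration of the update equations, not the
# natter implementation.
import numpy as np
from scipy.special import logsumexp

def em_step_sketch(x, p, mu, s):
    # E-step in the log domain: H[k, i] is the responsibility of component k for x_i
    H = np.array([np.log(p[k]) - 0.5 * np.log(2 * np.pi) - np.log(s[k])
                  - (x - mu[k]) ** 2 / (2.0 * s[k] ** 2) for k in range(len(p))])
    H = np.exp(H - logsumexp(H, axis=0))
    # M-step: weighted updates of weights, means and standard deviations
    Nk = H.sum(axis=1)
    p_new = Nk / Nk.sum()
    mu_new = H.dot(x) / Nk
    s_new = np.sqrt((H * (x - mu_new[:, None]) ** 2).sum(axis=1) / Nk)
    return p_new, mu_new, s_new

rng = np.random.RandomState(0)
x = np.concatenate([rng.normal(-2, 1, 500), rng.normal(3, 0.5, 500)])
p, mu, s = np.array([0.5, 0.5]), np.array([-1.0, 1.0]), np.array([1.0, 1.0])
for _ in range(50):
    p, mu, s = em_step_sketch(x, p, mu, s)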
def test_sample(self):
    nsamples = 10000
    data = self.ECG.sample(nsamples)
    logWeights = self.Gaussian.loglik(data) - self.ECG.loglik(data)
    Z = logsumexp(logWeights) - log(nsamples)
    self.assertTrue(abs(exp(Z) - 1) < 1e-01)
def test_loglik(self):
    nsamples = 100000
    dataImportance = self.Gaussian.sample(nsamples)
    logweights = self.ECG.loglik(dataImportance) - self.Gaussian.loglik(dataImportance)
    Z = logsumexp(logweights) - log(nsamples)
    self.assertTrue(abs(exp(Z) - 1) < 1e-01)