def set_parameters(self, d, e, f, g, B=None):
    """Specify the Tinker parameters and calculate quantities that only
    depend on them.

    Args:
        d (float): Tinker parameter.
        e (float): Tinker parameter.
        f (float): Tinker parameter.
        g (float): Tinker parameter.
        B (float; optional): Normalization coefficient. If B isn't specified
            then it's calculated from d, e, f, g such that the mass function
            is guaranteed to be normalized.

    """
    self.params = np.array([d, e, f, g, B])
    gamma_d2 = special.gamma(d*0.5)
    gamma_f2 = special.gamma(f*0.5)
    log_g = np.log(g)
    gnd2 = g**(-d*0.5)
    gnf2 = g**(-f*0.5)
    ed = e**d
    if B is None:  # derive B from d, e, f, g so the mass function is normalized
        self.B_coefficient = 2.0/(ed * gnd2 * gamma_d2 + gnf2 * gamma_f2)
        B2 = self.B_coefficient**2
        self.dBdd = 0.25 * B2 * ed * gnd2 * gamma_d2 * (log_g - 2.0 - special.digamma(d*0.5))
        self.dBde = -0.5 * B2 * d * ed/e * gnd2 * gamma_d2
        self.dBdf = 0.25 * B2 * gnf2 * gamma_f2 * (log_g - special.digamma(f*0.5))
        self.dBdg = 0.25 * B2 * (d * ed * gnd2/g * gamma_d2 + f * gnf2/g * gamma_f2)
    else:
        self.B_coefficient = B
        self.dBdd = self.dBde = self.dBdf = self.dBdg = 0
    self.make_dndlM_spline()
    return
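# A minimal numerical sanity check, separate from the class above: for the
# normalization B(d, e, f, g) = 2 / (e**d * g**(-d/2) * Gamma(d/2) + g**(-f/2) * Gamma(f/2))
# used in set_parameters, the analytic derivative dB/de coded there can be
# compared against a central finite difference. Parameter values here are
# arbitrary illustrative choices, not ones taken from the original code.
import numpy as np
from scipy import special

def _B(d, e, f, g):
    return 2.0 / (e**d * g**(-d*0.5) * special.gamma(d*0.5)
                  + g**(-f*0.5) * special.gamma(f*0.5))

d, e, f, g = 1.97, 1.0, 0.51, 1.228   # arbitrary values for the check
B2 = _B(d, e, f, g)**2
dBde_analytic = -0.5 * B2 * d * e**d / e * g**(-d*0.5) * special.gamma(d*0.5)
h = 1e-6
dBde_numeric = (_B(d, e + h, f, g) - _B(d, e - h, f, g)) / (2*h)
assert abs(dBde_analytic - dBde_numeric) < 1e-6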
def optAlpha(self, MAX_ALPHA_ITER=1000, NEWTON_THRESH=1e-5): """ Estimate new Dirichlet priors (actually just one scalar shared across all topics). """ initA = 100.0 logA = numpy.log(initA) # keep computations in log space logging.debug("optimizing old alpha %s" % self.alpha) for i in xrange(MAX_ALPHA_ITER): a = numpy.exp(logA) if not numpy.isfinite(a): initA = initA * 10.0 logging.warning("alpha is NaN; new init alpha=%f" % initA) a = initA logA = numpy.log(a) f = ( self.numDocs * (gammaln(self.numTopics * a) - self.numTopics * gammaln(a)) + (a - 1) * self.alphaSuffStats ) df = self.alphaSuffStats + self.numDocs * ( self.numTopics * digamma(self.numTopics * a) - self.numTopics * digamma(a) ) d2f = self.numDocs * ( self.numTopics * self.numTopics * trigamma(self.numTopics * a) - self.numTopics * trigamma(a) ) logA -= df / (d2f * a + df) # logging.debug("alpha maximization: f=%f, df=%f" % (f, df)) if numpy.abs(df) <= NEWTON_THRESH: break result = numpy.exp(logA) # convert back from log space logging.info("estimated old alpha %s to new alpha %s" % (self.alpha, result)) return result
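# Why `logA -= df / (d2f * a + df)` above is a Newton step in log space (a
# standard trick, used e.g. in Blei's original lda-c code, from which this
# routine appears to be ported): writing a = exp(lam), the chain rule gives
# dF/dlam = a*f'(a) and d2F/dlam2 = a**2*f''(a) + a*f'(a), so the Newton
# update lam -= (dF/dlam)/(d2F/dlam2) simplifies to lam -= f'/(a*f'' + f').
# Minimal check on a toy function f(a) = log(a) - a (maximum at a = 1),
# independent of the class above:
import numpy as np
lam = np.log(5.0)
for _ in range(50):
    a = np.exp(lam)
    df, d2f = 1.0/a - 1.0, -1.0/a**2
    lam -= df / (d2f * a + df)
assert abs(np.exp(lam) - 1.0) < 1e-8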
def get_h(x, k=1, norm=np.inf, min_dist=0.):
    """
    Estimates the entropy H of a random variable x (in nats) based on
    the kth-nearest neighbour distances between point samples.

    @reference:
    Kozachenko, L., & Leonenko, N. (1987). Sample estimate of the entropy of
    a random vector. Problemy Peredachi Informatsii, 23(2), 9-16.

    Arguments:
    ----------
    x: (n, d) ndarray
        n samples from a d-dimensional multivariate distribution

    k: int (default 1)
        kth nearest neighbour to use in density estimate;
        imposes smoothness on the underlying probability distribution

    norm: 1, 2, or np.inf (default np.inf)
        p-norm used when computing k-nearest neighbour distances
            1: absolute-value norm
            2: euclidean norm
            np.inf: max norm

    min_dist: float (default 0.)
        minimum distance between data points;
        smaller distances will be capped using this value

    Returns:
    --------
    h: float
        entropy H(X)
    """
    n, d = x.shape

    # volume of the d-dimensional unit ball...
    # if norm == np.inf:  # max norm
    #     log_c_d = 0
    # elif norm == 2:  # euclidean norm
    #     log_c_d = (d/2.) * log(np.pi) - log(gamma(d/2. + 1))
    # elif norm == 1:
    #     raise NotImplementedError
    # else:
    #     raise NotImplementedError("Variable 'norm' either 1, 2 or np.inf")
    log_c_d = 0.

    kdtree = cKDTree(x)

    # query all points -- k+1 as query point also in initial set
    # distances, idx = kdtree.query(x, k + 1, eps=0, p=norm)
    distances, idx = kdtree.query(x, k + 1, eps=0, p=np.inf)
    distances = distances[:, -1]

    # enforce non-zero distances
    distances[distances < min_dist] = min_dist

    # the factor 2 converts the kNN radius into the diameter of the max-norm
    # ball; with that convention the unit ball has volume 1, hence log_c_d = 0
    sum_log_dist = np.sum(log(2*distances))

    h = -digamma(k) + digamma(n) + log_c_d + (d / float(n)) * sum_log_dist

    return h
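# Minimal usage sketch for get_h above (assumes numpy as np, scipy's cKDTree,
# digamma and log are importable as in the snippet). For a d-dimensional
# standard normal the true entropy is 0.5 * d * log(2*pi*e) nats, so the
# estimate should land close to that for a few thousand samples.
import numpy as np
rng = np.random.RandomState(0)
dim = 2
samples = rng.randn(3000, dim)
h_est = get_h(samples, k=3)
h_true = 0.5 * dim * np.log(2 * np.pi * np.e)   # ~2.838 nats for dim=2
print("estimated %.3f vs analytic %.3f" % (h_est, h_true))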
def getLPfromResp(self, Resp, smoothMass=0.001): ''' Create full local parameter (LP) dictionary for HDPModel, given responsibility matrix Resp Returns -------- LP : dict with fields word_variational, alphaPi, E_logPi, DocTopicCount ''' Data = self.Data D = Data.nDoc K = Resp.shape[1] # DocTopicCount matrix : D x K matrix DocTopicC = np.zeros((D, K)) for dd in range(D): start,stop = Data.doc_range[dd,:] DocTopicC[dd,:] = np.dot(Data.word_count[start:stop], Resp[start:stop,:] ) assert np.allclose(DocTopicC.sum(), Data.word_count.sum()) # Alpha and ElogPi : D x K+1 matrices padCol = smoothMass * np.ones((D,1)) alph = np.hstack( [DocTopicC + smoothMass, padCol]) ElogPi = digamma(alph) - digamma(alph.sum(axis=1))[:,np.newaxis] assert ElogPi.shape == (D,K+1) return dict(word_variational =Resp, E_logPi=ElogPi, alphaPi=alph, DocTopicCount=DocTopicC)
def objectiveGradient(lambda_k, nu, tau, Elog_eta_k, nDoc):
    ''' Calculate gradient of objectiveFunc, objective for HDP variational

        Returns
        -------
        gvec : vector of the same length as lambda_k, where each entry gives
               the partial derivative with respect to the corresponding entry
               of Cvec
    '''
    # lvec is the derivative of log(lambda_k) via chain rule
    lvec = 1/(lambda_k)
    W = lvec.size

    # Derivative of log eta
    digammaAll = digamma(np.sum(lambda_k))
    Elog_lambda_k = digamma(lambda_k) - digammaAll

    # Derivative of Elog_phi_k and E_phi_k
    polygammaAll = polygamma(1, np.sum(lambda_k))
    dElog_phi_k = polygamma(1, lambda_k) - polygammaAll
    lambda_k_sum = np.sum(lambda_k)
    dE_phi_k = (lambda_k_sum - lambda_k) / np.power(lambda_k_sum, 2)

    # NOTE: N is not an argument of this function; it must be defined in the
    # enclosing scope.
    gvec = dElog_phi_k * (N + tau - lambda_k) \
        + dE_phi_k * nu * Elog_eta_k
    gvec = -1 * gvec

    # Apply chain rule!
    gvecC = lvec * gvec
    return gvecC
def E_step( self, X): N,D = X.shape lpr = np.zeros( (N, self.gmm.K) ) logdet = np.zeros( self.gmm.K ) dterms = np.arange( 1,D+1 ) # 1,2,3... D self.invWchol = list() for k in range(self.gmm.K): dXm = X - self.qMixComp[k].m L = scipy.linalg.cholesky( self.qMixComp[k].invW, lower=True) self.invWchol.append( L ) if np.any( np.isnan(L) | np.isinf(L) ): print 'NaN!', self.qMixComp[k] #invL = scipy.linalg.inv( L ) # want: Q = invL * X.T # so we solve for matrix Q s.t. L*Q = X.T lpr[:,k] = -0.5*self.qMixComp[k].dF \ * np.sum( scipy.linalg.solve_triangular( L, dXm.T,lower=True)**2, axis=0) lpr[:,k] -= 0.5*D/self.qMixComp[k].beta # det( W ) = 1/det(invW) # = 1/det( L )**2 # det of triangle matrix = prod of diag entries logdet[k] = -2*np.sum( np.log(np.diag(L) ) ) + D*np.log(2.0) logdet[k] += digamma( 0.5*(dterms+1+self.qMixComp[k].dF) ).sum() self.logwtilde = digamma( self.alpha ) - digamma( self.alpha.sum() ) self.logLtilde = logdet lpr += self.logwtilde lpr += logdet lprSUM = logsumexp(lpr, axis=1) resp = np.exp(lpr - lprSUM[:, np.newaxis]) resp /= resp.sum( axis=1)[:,np.newaxis] # row normalize return resp
def objective(X, Y, C, mu, a, b, e, f, a0, b0, e0, f0): log2pi = np.log(2*np.pi) N, D = X.shape # E(lnX) = digamma(a) - ln(b) for X ~ Gamma(a,b) E_ln_lambda = digamma(e) - np.log(f) E_ln_alpha = digamma(a) - np.log(b) # model likelihood total = (N/2.0)*(E_ln_lambda - log2pi) data_total = 0 for i in xrange(N): delta = Y[i] - X[i].dot(mu) data_total += delta*delta + X[i].dot(C).dot(X[i]) total -= (float(e)/f)/2.0 * data_total # print "total after model likelihood:", total # w likelihood total -= (D/2.0)*log2pi for k in xrange(D): total += 0.5*(E_ln_alpha[k] - (float(a[k])/b[k])*(C[k,k] + mu[k]*mu[k])) # print "total after w likelihood:", total # lambda likelihood total += e0*np.log(f0) - np.log(gamma(e0)) + (e0 - 1)*E_ln_lambda - f0*(float(e)/f) # print "total after lambda likelihood:", total # alpha likelihood for k in xrange(D): total += a0*np.log(b0) - np.log(gamma(a0)) + (a0 - 1)*E_ln_alpha[k] - b0*(float(a[k])/b[k]) # print "total after alpha likelihood:", total # entropy # TODO: calculate this manually # total -= mvn.entropy(mean=mu, cov=C) # e1 = mvn.entropy(cov=C) # e2 = 0.5*np.log( np.linalg.det(2*np.pi*np.e*C) ) # print "e1:", e1, "e2:", e2 # total += 0.5*np.log( np.linalg.det(2*np.pi*np.e*C) ) total += mvn.entropy(cov=C) # print "det(C):", np.linalg.det(C) # print "total after lnq(w):", total # total -= gamma_dist.entropy(e, scale=1.0/f) # e3 = gamma_dist.entropy(e, scale=1.0/f) # e4 = -e_ln_q_gamma(e, f) # print "e3:", e3, "e4:", e4 # assert(np.abs(e3 - e4) < 1e-8) total += gamma_dist.entropy(e, scale=1.0/f) # total -= e_ln_q_gamma(e, f) # print "total after lnq(lambda):", total for k in xrange(D): # total -= e_ln_q_gamma(a[k], b[k]) total += gamma_dist.entropy(a[k], scale=1.0/b[k]) return total
def _negative_binomial_gradient_sparse(X, counts, alpha=-3, beta=1., dispersion=None, bias=None, use_zero_counts=False): if use_zero_counts: raise NotImplementedError bias = bias.flatten() dis = np.sqrt(((X[counts.row] - X[counts.col])**2).sum(axis=1)) fdis = bias[counts.row] * bias[counts.col] * beta * dis ** alpha diff = X[counts.row] - X[counts.col] d = dispersion.predict(fdis) d_prime = (dispersion.derivate(fdis) * alpha * beta * bias[counts.row] * bias[counts.col] * dis ** (alpha - 2))[:, np.newaxis] * diff grad = -(special.digamma(counts.data + d)[:, np.newaxis] * d_prime) grad += special.digamma(d)[:, np.newaxis] * d_prime grad -= (counts.data * alpha / dis ** 2)[:, np.newaxis] * diff grad -= (np.log(d) + 1)[:, np.newaxis] * d_prime grad += np.log(d + fdis)[:, np.newaxis] * d_prime grad += ((counts.data + d) / (d + fdis))[:, np.newaxis] * ( (fdis * alpha / dis**2)[:, np.newaxis] * diff + d_prime) grad_ = np.zeros(X.shape) for i in range(X.shape[0]): grad_[i] += grad[counts.row == i].sum(axis=0) grad_[i] -= grad[counts.col == i].sum(axis=0) return grad_
def fit_betabinom_minka(counts, maxiter=1000, tol=1e-6, initial_guess=None):
    '''See Estimating a Dirichlet Distribution, Thomas P. Minka, 2003, eq. 55.
    See also the code for polya_fit_simple.m in his fastfit matlab toolbox,
    which this code is a translation of.

    counts should be NxK with N samples over K classes.'''
    counts = matrix(counts).astype(float)
    # remove observations with no trials
    counts = counts[sum(counts.A, axis=1) > 0, :]

    if initial_guess is None:
        alpha = polya_moment_match(counts).T
    else:
        alpha = matrix(initial_guess).T

    # Abstraction barrier: now in Dirichlet/Polya mode, following naming in
    # Minka's paper.
    n = counts.T
    N = n.shape[1]
    n_i = n.sum(axis=0)

    change = 2*tol
    iter = 0
    while (change > tol) and (iter < maxiter):
        numerator = digamma(n + alpha.repeat(N, axis=1)).sum(axis=1) \
            - N * digamma(alpha)
        denominator = digamma(n_i + alpha.sum()).sum() - N * digamma(alpha.sum())
        old_alpha = alpha
        alpha = multiply(alpha, numerator / denominator)
        change = abs(old_alpha - alpha).max()
        iter = iter + 1
    # now leaving Abstraction Barrier
    return array(alpha[:, 0]).T, iter
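# Minimal usage sketch for fit_betabinom_minka above. It assumes the module's
# own imports (numpy's matrix/array/multiply/sum and scipy's digamma) and the
# companion polya_moment_match helper are available; alpha_true is arbitrary.
import numpy as np
rng = np.random.RandomState(0)
alpha_true = np.array([2.0, 5.0, 1.0])
pis = rng.dirichlet(alpha_true, size=500)
counts = np.array([rng.multinomial(50, p) for p in pis])   # 500 x 3 count matrix
alpha_hat, n_iter = fit_betabinom_minka(counts)
print("true", alpha_true, "estimated", np.asarray(alpha_hat).ravel(), "iters", n_iter)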
def KL_divergence(self, variational_posterior): mu, S, gamma, tau = ( variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma_group.values, variational_posterior.tau.values, ) var_mean = np.square(mu) / self.variance var_S = S / self.variance - np.log(S) part1 = (gamma * (np.log(self.variance) - 1.0 + var_mean + var_S)).sum() / 2.0 ad = self.alpha / self.input_dim from scipy.special import betaln, digamma part2 = ( (gamma * np.log(gamma)).sum() + ((1.0 - gamma) * np.log(1.0 - gamma)).sum() + betaln(ad, 1.0) * self.input_dim - betaln(tau[:, 0], tau[:, 1]).sum() + ((tau[:, 0] - gamma - ad) * digamma(tau[:, 0])).sum() + ((tau[:, 1] + gamma - 2.0) * digamma(tau[:, 1])).sum() + ((2.0 + ad - tau[:, 0] - tau[:, 1]) * digamma(tau.sum(axis=1))).sum() ) return part1 + part2
def update_global_params( self, SS, rho=None, Ntotal=None, **kwargs ): ''' ''' ampF = 1 if Ntotal is not None: ampF = Ntotal/SS['Ntotal'] qalpha1 = self.alpha1 + ampF*SS['N'] qalpha0 = self.alpha0*np.ones( self.K ) qalpha0[:-1] += ampF*SS['N'][::-1].cumsum()[::-1][1:] if rho is None or rho==1: self.qalpha1 = qalpha1 self.qalpha0 = qalpha0 else: self.qalpha1 = rho*qalpha1 + (1-rho)*self.qalpha1 self.qalpha0 = rho*qalpha0 + (1-rho)*self.qalpha0 DENOM = digamma( self.qalpha0 + self.qalpha1 ) self.ElogV = digamma( self.qalpha1 ) - DENOM self.Elog1mV = digamma( self.qalpha0 ) - DENOM if self.truncType == 'v': self.qalpha1[-1] = 1 self.qalpha0[-1] = EPS #avoid digamma(0), which is way too HUGE self.ElogV[-1] = 0 # log(1) => 0 self.Elog1mV[-1] = np.log(1e-40) # log(0) => -INF, never used # Calculate expected mixture weights E[ log w_k ] self.Elogw = self.ElogV.copy() #copy so we can do += without modifying ElogV self.Elogw[1:] += self.Elog1mV[:-1].cumsum()
def testBetaBetaKL(self): with self.test_session() as sess: for shape in [(10,), (4,5)]: a1 = 6.0*np.random.random(size=shape) + 1e-4 b1 = 6.0*np.random.random(size=shape) + 1e-4 a2 = 6.0*np.random.random(size=shape) + 1e-4 b2 = 6.0*np.random.random(size=shape) + 1e-4 # Take inverse softplus of values to test BetaWithSoftplusAB a1_sp = np.log(np.exp(a1) - 1.0) b1_sp = np.log(np.exp(b1) - 1.0) a2_sp = np.log(np.exp(a2) - 1.0) b2_sp = np.log(np.exp(b2) - 1.0) d1 = tf.contrib.distributions.Beta(a=a1, b=b1) d2 = tf.contrib.distributions.Beta(a=a2, b=b2) d1_sp = tf.contrib.distributions.BetaWithSoftplusAB(a=a1_sp, b=b1_sp) d2_sp = tf.contrib.distributions.BetaWithSoftplusAB(a=a2_sp, b=b2_sp) kl_expected = (special.betaln(a2, b2) - special.betaln(a1, b1) + (a1 - a2)*special.digamma(a1) + (b1 - b2)*special.digamma(b1) + (a2 - a1 + b2 - b1)*special.digamma(a1 + b1)) for dist1 in [d1, d1_sp]: for dist2 in [d2, d2_sp]: kl = tf.contrib.distributions.kl(dist1, dist2) kl_val = sess.run(kl) self.assertEqual(kl.get_shape(), shape) self.assertAllClose(kl_val, kl_expected) # Make sure KL(d1||d1) is 0 kl_same = sess.run(tf.contrib.distributions.kl(d1, d1)) self.assertAllClose(kl_same, np.zeros_like(kl_expected))
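# Independent numerical check of the closed-form kl_expected used in the test
# above, done with numpy/scipy only (no TensorFlow): estimate
# KL(Beta(a1,b1) || Beta(a2,b2)) by Monte Carlo and compare with the
# betaln/digamma expression. Parameter values are arbitrary.
import numpy as np
from scipy import special, stats
rng = np.random.RandomState(0)
a1, b1, a2, b2 = 2.0, 3.0, 4.0, 1.5
x = stats.beta(a1, b1).rvs(size=200000, random_state=rng)
kl_mc = np.mean(stats.beta(a1, b1).logpdf(x) - stats.beta(a2, b2).logpdf(x))
kl_closed = (special.betaln(a2, b2) - special.betaln(a1, b1)
             + (a1 - a2)*special.digamma(a1)
             + (b1 - b2)*special.digamma(b1)
             + (a2 - a1 + b2 - b1)*special.digamma(a1 + b1))
print("monte carlo %.4f vs closed form %.4f" % (kl_mc, kl_closed))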
def computeLikelihood(self, doc, phi, gamma): """ Compute the document likelihood, given all model parameters. """ gammaSum = numpy.sum(gamma) digSum = digamma(gammaSum) dig = digamma(gamma) - digSum # precompute the difference likelihood = gammaln(self.alpha * self.numTopics) - \ self.numTopics * gammaln(self.alpha) - \ gammaln(gammaSum) likelihood += numpy.sum((self.alpha - 1) * dig + gammaln(gamma) - (gamma - 1) * dig) for n, (wordIndex, wordCount) in enumerate(doc): try: phin, lprob = phi[n], self.logProbW[:, wordIndex] code = """ const int num_terms = Nphin[0]; double result = 0.0; for (int i=0; i < num_terms; i++) { if (phin[i] > 1e-8 || phin[i] < -1e-8) result += phin[i] * (dig[i] - log(phin[i]) + LPROB1(i)); } return_val = wordCount * result; """ likelihood += weave.inline(code, ['dig', 'phin', 'lprob', 'wordCount']) except: partial = phi[n] * (dig - numpy.log(phi[n]) + self.logProbW[:, wordIndex]) partial[numpy.isnan(partial)] = 0.0 # replace NaNs (from 0 * log(0) in phi) with 0.0 likelihood += wordCount * numpy.sum(partial) return likelihood
def _score_nbp(y, X, beta, thet, Q): ''' Negative Binomial Score -- type P likelihood from Greene (2007) .. math:: \lambda_i = exp(X\beta)\\ g_i = \theta \lambda_i^Q \\ w_i = g_i/(g_i + \lambda_i) \\ r_i = \theta / (\theta+\lambda_i) \\ A_i = \left [ \Psi(y_i+g_i) - \Psi(g_i) + ln w_i \right ] \\ B_i = \left [ g_i (1-w_i) - y_iw_i \right ] \\ \partial ln \mathcal{L}_i / \partial \begin{pmatrix} \lambda_i \\ \theta \\ Q \end{pmatrix}= [A_i+B_i] \begin{pmatrix} Q/\lambda_i \\ 1/\theta \\ ln(\lambda_i) \end{pmatrix} -B_i \begin{pmatrix} 1/\lambda_i\\ 0 \\ 0 \end{pmatrix} \\ \frac{\partial \lambda}{\partial \beta} = \lambda_i \mathbf{x}_i \\ \frac{\partial \mathcal{L}_i}{\partial \beta} = \left (\frac{\partial\mathcal{L}_i}{\partial \lambda_i} \right ) \frac{\partial \lambda_i}{\partial \beta} ''' lamb = np.exp(np.dot(X, beta)) g = thet * lamb**Q w = g / (g + lamb) r = thet / (thet+lamb) A = digamma(y+g) - digamma(g) + np.log(w) B = g*(1-w) - y*w dl = (A+B) * Q/lamb - B * 1/lamb dt = (A+B) * 1/thet dq = (A+B) * np.log(lamb) db = X * (dl * lamb)[:,np.newaxis] sc = np.array([dt.sum(), dq.sum()]) sc = np.concatenate([db.sum(axis=0), sc]) return sc
def update_beta(state, a, b):
    # http://bit.ly/1yX1cZq
    i = 0
    num_iterations = 200
    alpha = state['beta']
    alpha0 = 0
    prec = 1e-5  # convergence threshold
    for i in range(num_iterations):
        summk = 0
        summ = 0
        for doc_index, _ in enumerate(state['docs']):
            summ += digamma(state['num_topics'] * alpha
                            + state['ss']['doc'][doc_index])
            for topic in state['used_topics']:
                summk += digamma(alpha + state['ss']['document_topic'][doc_index][topic])
        summ -= state['num_docs'] * digamma(state['num_topics'] * alpha)
        summk -= state['num_docs'] * state['num_topics'] * digamma(alpha)
        alpha = (a - 1 + alpha * summk) / (b + state['num_topics'] * summ)
        assert not np.isnan(alpha)
        if abs(alpha - alpha0) < prec:
            break
        else:
            alpha0 = alpha
    if i == num_iterations - 1:
        raise Exception("update_beta did not converge.")
    state['beta'] = alpha
    return state
def local_update(self, metaobs=None): """ Local update that handles minibatches. This needed to be reimplemented because forward_msgs and backward_msgs need to be specialized. """ if metaobs is None: loff = 0 uoff = self.T-1 else: loff, uoff = metaobs.i1, metaobs.i2 # update the modified parameter tables (don't do emissions b/c # pybasicbayes takes care of those). # Don't overwrite mod_init b/c we stored something in it self.mod_init = digamma(self.var_init + eps) - digamma(np.sum(self.var_init) + eps) tran_sum = np.sum(self.var_tran, axis=1) self.mod_tran = digamma(self.var_tran + eps) - digamma(tran_sum[:,npa] + eps) obs = self.obs # Compute likelihoods for k, odist in enumerate(self.var_emit): self.lliks[:,k] = np.nan_to_num(odist.expected_log_likelihood(obs[loff:(uoff+1),:])) # update forward, backward and scale coefficient tables self.forward_msgs(metaobs=metaobs) self.backward_msgs(metaobs=metaobs) # update weights self.var_x = self.lalpha + self.lbeta self.var_x -= np.max(self.var_x, axis=1)[:,npa] self.var_x = np.exp(self.var_x) self.var_x /= np.sum(self.var_x, axis=1)[:,npa]
def local_update(self, obs=None, mask=None): """ This is the local update for the batch version. Here we're creating modified parameters to run the forward-backward algorithm on to update the variational q distribution over the hidden states. These are always the same, and if we really need to change them we'll override the function. """ if obs is None: obs = self.obs if mask is None: mask = self.mask self.mod_init = digamma(self.var_init + eps) - digamma(np.sum(self.var_init) + eps) tran_sum = np.sum(self.var_tran, axis=1) self.mod_tran = digamma(self.var_tran + eps) - digamma(tran_sum[:,npa] + eps) # Compute likelihoods for k, odist in enumerate(self.var_emit): self.lliks[:,k] = np.nan_to_num(odist.expected_log_likelihood(obs)) # update forward, backward and scale coefficient tables self.forward_msgs() self.backward_msgs() self.var_x = self.lalpha + self.lbeta self.var_x -= np.max(self.var_x, axis=1)[:,npa] self.var_x = np.exp(self.var_x) self.var_x /= np.sum(self.var_x, axis=1)[:,npa]
def FFBS(self, var_init):
    """
    Forward Filter Backward Sampling to simulate a state sequence.
    """
    obs = self.obs
    T = self.T
    K = self.K
    A = self.var_tran

    mod_init = digamma(var_init + eps) - digamma(np.sum(var_init) + eps)

    tran_sum = np.sum(self.var_tran, axis=1)
    mod_tran = digamma(self.var_tran + eps) - digamma(tran_sum[:,npa] + eps)

    lalpha = np.empty((T, K))
    lliks = np.empty((T, K))

    # Compute likelihoods
    for k, odist in enumerate(self.var_emit):
        lliks[:,k] = np.nan_to_num(odist.expected_log_likelihood(obs))

    # Forward filter
    lalpha[0,:] = mod_init + lliks[0,:]
    for t in xrange(1, self.T):
        lalpha[t] = np.logaddexp.reduce(lalpha[t-1] + np.log(A+eps).T, axis=1) + lliks[t]

    # Backward sample, recomputing the sampling distribution at every step
    z = np.empty(T, dtype=np.int_)
    lp = lalpha[T-1,:] - np.max(lalpha[T-1,:])
    p = np.exp(lp)
    p /= np.sum(p)
    z[T-1] = np.random.choice(K, p=p)
    for t in xrange(T-2, -1, -1):
        lp = lalpha[t,:] + np.log(A[:,z[t+1]]+eps)
        lp -= np.max(lp)
        p = np.exp(lp)
        p /= np.sum(p)
        z[t] = np.random.choice(K, p=p)

    return z
def sample_profiles(base, num): # pylint: disable=inconsistent-return-statements """Generate unique profiles from a game Parameters ---------- base : RsGame Game to generate random profiles from. num : int Number of profiles to sample from the game. """ if num == base.num_all_profiles: # pylint: disable=no-else-return return base.all_profiles() elif num == 0: return np.empty((0, base.num_strats), int) elif base.num_all_profiles <= np.iinfo(int).max: inds = rand.choice(base.num_all_profiles, num, replace=False) return base.profile_from_id(inds) else: # Number of times we have to re-query ratio = (sps.digamma(float(base.num_all_profiles)) - sps.digamma(float(base.num_all_profiles - num))) # Max is for underflow num_per = max(round(float(ratio * base.num_all_profiles)), num) profiles = set() while len(profiles) < num: profiles.update( utils.hash_array(p) for p in base.random_profiles(num_per)) profiles = np.stack([h.array for h in profiles]) inds = rand.choice(profiles.shape[0], num, replace=False) return profiles[inds]
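# Why the digamma ratio in sample_profiles above works: by the coupon-collector
# argument, the expected number of uniform draws (with replacement) needed to
# see `num` distinct items out of M is M*(H_M - H_{M-num}), and the harmonic
# number H_n equals digamma(n+1) + Euler's gamma, so the expectation is
# M*(digamma(M+1) - digamma(M-num+1)), which is approximately
# M*(digamma(M) - digamma(M-num)) for large M. Quick illustration with small,
# arbitrary M and num:
import numpy as np
from scipy import special
M, num = 1000, 200
expected_draws = M * (special.digamma(M + 1) - special.digamma(M - num + 1))
rng = np.random.RandomState(0)
trials = []
for _ in range(200):
    seen, draws = set(), 0
    while len(seen) < num:
        seen.add(rng.randint(M))
        draws += 1
    trials.append(draws)
print("simulated %.1f vs analytic %.1f" % (np.mean(trials), expected_draws))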
def entropy_2(data, length=1): """ Estimate the entropy of length `length` subsequences in `data`. Parameters ---------- data : iterable An iterable of samples. length : int The length to group samples into. Returns ------- h2 : float An estimate of the entropy. Notes ----- If M is the alphabet size and N is the number of samples, then the bias of this estimator is: B ~ (M+1)/(2N) """ counts = get_counts(data, length) total = counts.sum() digamma_N = digamma(total) log2 = np.log(2) jss = [np.arange(1, count) for count in counts] alt_terms = np.array([(((-1)**js)/js).sum() for js in jss]) h2 = np.log2(np.e)*(counts/total*(digamma_N - digamma(counts) + log2 + alt_terms)).sum() return h2
def entropy_1(data, length=1): """ Estimate the entropy of length `length` subsequences in `data`. Parameters ---------- data : iterable An iterable of samples. length : int The length to group samples into. Returns ------- h1 : float An estimate of the entropy. Notes ----- If M is the alphabet size and N is the number of samples, then the bias of this estimator is: B ~ M/N """ counts = get_counts(data, length) total = counts.sum() digamma_N = digamma(total) h1 = np.log2(np.e)*(counts/total*(digamma_N - digamma(counts))).sum() return h1
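# Minimal usage sketch for entropy_1 / entropy_2 above. Both rely on the
# module's get_counts helper (not shown here) plus its numpy/digamma imports.
# For a fair six-sided die the true entropy is log2(6) ~ 2.585 bits; the
# digamma-corrected estimates should be less biased than a naive plug-in
# estimate for short sequences.
import numpy as np
rng = np.random.RandomState(0)
data = rng.randint(6, size=200)
print("H1 =", entropy_1(data), "H2 =", entropy_2(data), "true =", np.log2(6))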
def logL_dbl_prime(contrast): M = 1.0 / (contrast + 1e-100) n = np.arange( len(empirical_pmf) ) t1 = np.sum( pmf * ((np.square(k_bar) - n*M) / (M * np.square(k_bar + M))) ) t2 = - N * digamma(M) t3 = np.sum( pmf * digamma(n + M) ) return t1 + t2 + t3
def estimate_dirichlet_param(samples, param):
    """
    Uses a Newton-Raphson scheme to estimate the parameter of a K-dimensional
    Dirichlet distribution

    :param samples: an NxK matrix of K-dimensional vectors drawn from a
    Dirichlet distribution
    :param param: the old value of the parameter. This is overwritten
    :return: a K-dimensional vector which is the new parameter estimate
    """
    N, K = samples.shape
    p = np.sum(np.log(samples), axis=0)

    for _ in range(60):
        g = -N * fns.digamma(param)
        g += N * fns.digamma(param.sum())
        g += p

        q = -N * fns.polygamma(1, param)
        np.reciprocal(q, out=q)

        z = N * fns.polygamma(1, param.sum())

        b = np.sum(g * q)
        b /= 1 / z + q.sum()

        param -= (g - b) * q
        print("%.2f" % param.mean(), end=" --> ")
    print()

    return param
def gradient(weights, k, W, sample_count, n_dk_samples, X, sigma): D, K = X.shape[0], W.shape[0] result = 0.0 alpha = np.empty((BatchSize, K), dtype=np.float64) scale = np.empty((BatchSize,), dtype=np.float64) for d in range(0, D, BatchSize): max_d = min(D, d + BatchSize) top = max_d - d alpha[:top,:] = X[d:max_d,:].dot(W.T) alpha[:top,k] = X[d:max_d,:].dot(weights) np.exp(alpha[:top], out=alpha[:top]) alpha_sum = alpha[:top].sum(axis=1) scale[:top] = fns.digamma(alpha_sum) scale[:top] -= fns.digamma(alpha_sum[:,np.newaxis] + n_dk_samples[d:max_d,:,:sample_count].sum(axis=1)).sum(axis=1) / sample_count scale[:top] += fns.digamma(alpha[:top,k,np.newaxis] + n_dk_samples[d:max_d,k,:sample_count]).sum(axis=1) / sample_count scale[:top] -= fns.digamma(alpha[:top,k]) P_1 = ssp.diags(alpha[:top,k], 0).dot(X[d:max_d,:]) P_2 = ssp.diags(scale[:top], 0).dot(P_1) batch_result = np.array(P_2.sum(axis=0)) result += batch_result result -= weights / sigma return -np.squeeze(np.asarray(result))
def full_local_update(self): """ Local update on full data set. Reimplements member functions because we don't want to use the object's internal variables. This is only useful if we can store the whole state sequence in memory. """ # update the modified parameter tables (don't do emissions b/c # pybasicbayes takes care of those). mod_init = digamma(self.var_init + eps) - digamma(np.sum(self.var_init) + eps) tran_sum = np.sum(self.var_tran, axis=1) mod_tran = digamma(self.var_tran + eps) - digamma(tran_sum[:,npa] + eps) T = self.T K = self.K obs = self.obs mask = self.mask # Mask out missing data (restored below) obs_full = obs.copy() obs[mask,:] = np.nan lalpha = np.empty((T, K)) lbeta = np.empty((T, K)) ll = np.empty((T, K)) # Compute likelihoods for k, odist in enumerate(self.var_emit): ll[:,k] = np.nan_to_num(odist.expected_log_likelihood(obs)) # Forward messages ltran = mod_tran lalpha[0,:] = mod_init + ll[0,:] for t in xrange(1,self.T): lalpha[t] = np.logaddexp.reduce(lalpha[t-1] + ltran.T, axis=1) + ll[t] # Backward messages ltran = mod_tran lbeta[self.T-1,:] = 0. for t in xrange(self.T-2,-1,-1): np.logaddexp.reduce(ltran + lbeta[t+1] + ll[t+1], axis=1, out=lbeta[t]) # Update weights var_x = lalpha + lbeta var_x -= np.max(var_x, axis=1)[:,npa] var_x = np.exp(var_x) var_x /= np.sum(var_x, axis=1)[:,npa] # Restore full observations self.obs = obs_full return var_x
def _compute_mi_cc(x, y, n_neighbors): """Compute mutual information between two continuous variables. Parameters ---------- x, y : ndarray, shape (n_samples,) Samples of two continuous random variables, must have an identical shape. n_neighbors : int Number of nearest neighbors to search for each point, see [1]_. Returns ------- mi : float Estimated mutual information. If it turned out to be negative it is replace by 0. Notes ----- True mutual information can't be negative. If its estimate by a numerical method is negative, it means (providing the method is adequate) that the mutual information is close to 0 and replacing it by 0 is a reasonable strategy. References ---------- .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual information". Phys. Rev. E 69, 2004. """ n_samples = x.size x = x.reshape((-1, 1)) y = y.reshape((-1, 1)) xy = np.hstack((x, y)) # Here we rely on NearestNeighbors to select the fastest algorithm. nn = NearestNeighbors(metric='chebyshev', n_neighbors=n_neighbors) nn.fit(xy) radius = nn.kneighbors()[0] radius = np.nextafter(radius[:, -1], 0) # Algorithm is selected explicitly to allow passing an array as radius # later (not all algorithms support this). nn.set_params(algorithm='kd_tree') nn.fit(x) ind = nn.radius_neighbors(radius=radius, return_distance=False) nx = np.array([i.size for i in ind]) nn.fit(y) ind = nn.radius_neighbors(radius=radius, return_distance=False) ny = np.array([i.size for i in ind]) mi = (digamma(n_samples) + digamma(n_neighbors) - np.mean(digamma(nx + 1)) - np.mean(digamma(ny + 1))) return max(0, mi)
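# Minimal usage sketch for _compute_mi_cc above (assumes numpy as np,
# scipy.special.digamma and sklearn's NearestNeighbors are imported as in the
# snippet). For jointly Gaussian x, y with correlation rho the true mutual
# information is -0.5*log(1 - rho**2) nats, which the k-NN estimate should
# approach for a few thousand samples.
import numpy as np
rng = np.random.RandomState(0)
rho, n = 0.8, 5000
x = rng.randn(n)
y = rho * x + np.sqrt(1 - rho**2) * rng.randn(n)
mi_est = _compute_mi_cc(x, y, n_neighbors=3)
mi_true = -0.5 * np.log(1 - rho**2)   # ~0.511 nats
print("estimated %.3f vs analytic %.3f" % (mi_est, mi_true))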
def Fprime(x): df = outsum(digamma(eta.total+x)*etaestim) \ - digamma(x)*outsum(etaestim) + C Df = -1.*df.ravel() if np.isnan(Df).any() or np.isinf(Df).any(): return np.array([np.inf, np.inf]) else: return Df
def exp_T(self, eta): """ @arg eta: The natural parameters. The expectation of T, the sufficient statistics, given eta. """ theta = self.theta(eta) return (digamma(theta) - digamma(theta.sum(axis=-1)).reshape(theta.shape[:-1] + (1,)))
def _dll(self, x): alpha = self.get_alpha(x) return -(np.sum(self._dll_common(x)\ * (special.digamma(np.sum(alpha, axis=1))[:,np.newaxis,np.newaxis]\ - special.digamma(np.sum(self.n_m_z+alpha, axis=1))[:,np.newaxis,np.newaxis]\ + special.digamma(self.n_m_z+alpha)[:,:,np.newaxis]\ - special.digamma(alpha)[:,:,np.newaxis]), axis=0)\ - x / (self.sigma ** 2))
def next_alpha(alpha): das = digamma(alpha.sum()) g = alpha * N * (das - digamma(alpha) + g_offset) h = alpha * N * (das + g_offset) z = N * das x = (alpha * g / h).sum() w = (alpha ** 2 / h).sum() return np.exp(np.log(alpha) - (g - x * alpha / (1/z + w)) / h)
def select_desired_action(self, tau, t, posterior_policies, actions, *args): npi = posterior_policies.shape[0] likelihood = args[0] prior = args[1] #np.ones_like(likelihood)/npi # # likelihood = np.array([0.5,0.5]) # prior = np.array([0.5,0.5]) # posterior_policies = prior * likelihood # posterior_policies /= posterior_policies.sum() #print(posterior_policies, prior, likelihood) self.accepted_pis = np.zeros(100000, dtype=np.int32) - 1 dir_counts = np.ones(npi, np.double) curr_ess = 0 i = 0 H_0 = + (dir_counts.sum()-npi)*scs.digamma(dir_counts.sum()) \ - ((dir_counts - 1)*scs.digamma(dir_counts)).sum() \ + logBeta(dir_counts) #print("H", H_0) pi = np.random.choice(npi, p=prior) self.accepted_pis[i] = pi dir_counts[pi] += 1 H_dir = + (dir_counts.sum()-npi)*scs.digamma(dir_counts.sum()) \ - ((dir_counts - 1)*scs.digamma(dir_counts)).sum() \ + logBeta(dir_counts) #print("H", H_dir) if t == 0: i += 1 while H_dir > H_0 - self.factor + self.factor * H_0: pi = np.random.choice(npi, p=prior) r = np.random.rand() #print(i, curr_ess) #acc_prob = min(1, posterior_policies[pi]/posterior_policies[self.accepted_pis[i-1]]) if likelihood[self.accepted_pis[i - 1]] > 0: acc_prob = min( 1, likelihood[pi] / likelihood[self.accepted_pis[i - 1]]) else: acc_prob = 1 if acc_prob >= r: #posterior_policies[pi]/posterior_policies[self.accepted_pis[i-1]] > r: self.accepted_pis[i] = pi dir_counts[pi] += 1 #acc_prob else: self.accepted_pis[i] = self.accepted_pis[i - 1] dir_counts[self.accepted_pis[i - 1]] += 1 #1-acc_prob H_dir = + (dir_counts.sum()-npi)*scs.digamma(dir_counts.sum()) \ - ((dir_counts - 1)*scs.digamma(dir_counts)).sum() \ + logBeta(dir_counts) #print("H", H_dir) i += 1 self.RT[tau, t] = i - 1 #print(tau, t, i-1) else: self.RT[tau, t] = 0 if self.draw_true_post: chosen_pol = np.random.choice(npi, p=posterior_policies) else: chosen_pol = self.accepted_pis[i - 1] u = actions[chosen_pol] #print(tau,t,iself.accepted_pis[i-1],u,H_rel) # if tau in range(100,110) and t==0: # plt.figure() # plt.plot(posterior_policies) # plt.show() if self.calc_dkl: # autocorr = acov(self.accepted_pis[:i+1]) # if autocorr[0] > 0: # ACT = 1 + 2*np.abs(autocorr[1:]).sum()/autocorr[0] # ess = i/ACT # ess = round(ess) # else: # ess = 1 dist = dir_counts / dir_counts.sum() D_KL = entropy(posterior_policies, dist) self.DKL_post[tau, t] = D_KL D_KL = entropy(prior, dist) self.DKL_prior[tau, t] = D_KL if self.calc_entropy: self.entropy_post[tau, t] = entropy(posterior_policies) self.entropy_prior[tau, t] = entropy(prior) self.entropy_like[tau, t] = entropy(likelihood) # if t==0: # print(tau) # n = 12 # ind = np.argpartition(posterior_policies, -n)[-n:] # print(np.sort(ind)) # print(np.sort(posterior_policies[ind])) #estimate action probability self.estimate_action_probability(tau, t, posterior_policies, actions) return u
def wnn_entropy(points, k=None, weights=True, n_est=None, gmm=None): r""" Weighted Kozachenko-Leonenko nearest-neighbour entropy calculation. *k* is the number of neighbours to consider, with default $k=n^{1/3}$ *n_est* is the number of points to use for estimating the entropy, with default $n_\rm{est} = n$ *weights* is True for default weights, False for unweighted (using the distance to the kth neighbour only), or a vector of weights of length *k*. *gmm* is the number of gaussians to use to model the distribution using a gaussian mixture model. Default is 0, and the points represent an empirical distribution. Returns entropy H in bits and its uncertainty. Berrett, T. B., Samworth, R.J., Yuan, M., 2016. Efficient multivariate entropy estimation via k-nearest neighbour distances. DOI:10.1214/18-AOS1688 https://arxiv.org/abs/1606.00304 """ from sklearn.neighbors import NearestNeighbors n, d = points.shape # Default to the full set if n_est is None: n_est = 10000 elif n_est == 0: n_est = n # reduce size of draw to n_est if n_est >= n: x = points n_est = n else: x = points[permutation(n)[:n_est]] n = n_est # Default k based on n if k is None: # Private communication: cube root of n is a good choice for k # Personal observation: k should be much bigger than d k = max(int(n**(1 / 3)), 3 * d) # If weights are given then use them (setting the appropriate k), # otherwise use the default weights. if isinstance(weights, bool): weights = _wnn_weights(k, d, weights) else: k = len(weights) #print("weights", weights, sum(weights)) # select knn algorithm algorithm = 'auto' #algorithm = 'kd_tree' #algorithm = 'ball_tree' #algorithm = 'brute' n_components = 0 if gmm is None else gmm # H = 1/n sum_i=1^n sum_j=1^k w_j log E_{j,i} # E_{j,i} = e^-Psi(j) V_d (n-1) z_{j,i}^d = C z^d # logC = -Psi(j) + log(V_d) + log(n-1) # H = 1/n sum sum w_j logC + d/n sum sum w_j log(z) # = sum w_j logC + d/n sum sum w_j log(z) # = A + d/n B # H^2 = 1/n sum Psi = digamma(np.arange(1, k + 1)) logVd = d / 2 * log(pi) - gammaln(1 + d / 2) logC = -Psi + logVd + log(n - 1) # TODO: standardizing points doesn't work. # Standardize the data so that distances conform. This is equivalent to # a u-substitution u = sigma x + mu, so the integral needs to be corrected # for dU = det(sigma) dx. Since the standardization squishes the dimensions # independently, sigma is a diagonal matrix, with the determinant equal to # the product of the diagonal elements. #x, mu, sigma = standardize(x) # Note: sigma may be zero #detDU = np.prod(sigma) detDU = 1. if n_components > 0: # Use Gaussian mixture to model the distribution from sklearn.mixture import GaussianMixture as GMM predictor = GMM(n_components=gmm, covariance_type='full') predictor.fit(x) eval_x, _ = predictor.sample(n_est) #weight_x = predictor.score_samples(eval_x) skip = 0 else: # Empirical distribution # TODO: should we use the full draw for kNN and a subset for eval points? # Choose a subset for evaluating the entropy estimate, if desired #print(n_est, n) #eval_x = x if n_est >= n else x[permutation(n)[:n_est]] eval_x = x #weight_x = 1 skip = 1 tree = NearestNeighbors(algorithm=algorithm, n_neighbors=k + skip) tree.fit(x) dist, _ind = tree.kneighbors(eval_x, n_neighbors=k + skip, return_distance=True) # Remove first column. Since test points are in x, the first column will # be a point from x with distance 0, and can be ignored. if skip: dist = dist[:, skip:] # Find log distances. 
    # This can be problematic for MCMC runs where a
    # step is rejected, and therefore identical points are in the distribution.
    # Ignore them by replacing these points with nan and using nanmean.
    # TODO: need proper analysis of duplicated points in MCMC chain
    dist[dist == 0] = nan
    logdist = log(dist)
    H_unweighted = logC + d * np.nanmean(logdist, axis=0)
    H = np.dot(H_unweighted, weights)[0]
    Hsq_k = np.nanmean((logC[-1] + d * logdist[:, -1])**2)
    # TODO: abs shouldn't be needed?
    if Hsq_k < H**2:
        print("warning: avg(H^2) < avg(H)^2")
    dH = sqrt(abs(Hsq_k - H**2) / n_est)
    #print("unweighted", H_unweighted)
    #print("weighted", H, Hsq_k, H**2, dH, detDU, LN2)
    return H * detDU / LN2, dH * detDU / LN2
def predict_s(x, K=3, iter_num=20, my_seed=0): #set seed np.random.seed(seed=my_seed) # sample num N = len(x) ### prior parameters # first value gamma distribution a = 200 b = 5 # first value parameter of Dirichlet distribution alpha = np.array([30, 20, 10]) # parameter of gammma distribution a_update = np.array([200, 200, 200]) b_update = np.array([5, 5, 5]) # parameter of Dirichlet distribution for \pi alpha_update = np.array([30, 20, 10]) # set s first value s_mean = [] for n in range(N): s_mean.append([0.4, 0.3, 0.3]) s_mean = np.array(s_mean) for i in range(iter_num): #print("\r Iteration:{}".format(i)) ##################################################### # expectation of λ、lnλ、π lam_mean = np.zeros(K) ln_lam_mean = np.zeros(K) ln_pi_mean = np.zeros(K) lam_mean = a_update / b_update ln_lam_mean = sp.digamma(a_update) - np.log(b_update) ln_pi_mean = sp.digamma(alpha_update) - sp.digamma( np.sum(alpha_update)) ##################################################### # q(sn) s_mean = np.exp( x.reshape(len(x), 1) * ln_lam_mean - lam_mean + ln_pi_mean) s_mean /= np.sum(s_mean, axis=1).reshape(N, 1) ########################################### # update a, b a_update = np.sum(x.reshape(len(x), 1) * s_mean, axis=0) + a b_update = np.sum(s_mean, axis=0) + b # update α alpha_update = np.sum(s_mean, axis=0) + alpha ##################################################### # determine group number by order of λ s_order = st.gamma(a=a_update, scale=1 / b_update).mean().argsort() s_mean_ordered = s_mean[:, s_order] return s_mean_ordered
def func_1vN(Ecb, mu, T, Dm, Dp, itype, limit): """ Function used when generating 1vN, Redfield approach kernel. Parameters ---------- Ecb : float Energy. mu : float Chemical potential. T : float Temperature. Dm, Dp : float Bandwidth. itype : int Type of integral for first order approach calculations. itype=0: the principal parts are evaluated using Fortran integration package QUADPACK routine dqawc through SciPy. itype=1: the principal parts are kept, but approximated by digamma function valid for large bandwidht D. itype=2: the principal parts are neglected. itype=3: the principal parts are neglected and infinite bandwidth D is assumed. limit : int For itype=0 dqawc_limit determines the maximum number of subintervals in the partition of the given integration interval. Returns ------- array Array of four complex numbers [cur0, cur1, en0, en1] containing momentum-integrated current amplitudes. cur0 - particle current amplitude. cur1 - hole current amplitude. en0 - particle energy current amplitude. en1 - hol energy current amplitude. """ if itype == 0: alpha, Rm, Rp = (Ecb - mu) / T, (Dm - mu) / T, (Dp - mu) / T cur0, err = quad(fermi_func, Rm, Rp, weight='cauchy', wvar=alpha, epsabs=1.0e-6, epsrel=1.0e-6, limit=limit) cur0 = cur0 + (-1.0j * pi * fermi_func(alpha) if alpha < Rp and alpha > Rm else 0) cur1 = cur0 + log(abs((Rm - alpha) / (Rp - alpha))) cur1 = cur1 + (1.0j * pi if alpha < Rp and alpha > Rm else 0) # const0 = T * ((-Rm if Rm < -40 else log(1 + exp(-Rm))) - (-Rp if Rp < -40 else log(1 + exp(-Rp)))) const1 = const0 + Dm - Dp # en0 = const0 + Ecb * cur0 en1 = const1 + Ecb * cur1 elif itype == 1: alpha, Rm, Rp = (Ecb - mu) / T, Dm / T, Dp / T cur0 = digamma(0.5 + 1.0j * alpha / (2 * pi)).real - log(abs(Rm) / (2 * pi)) cur0 = cur0 - 1.0j * pi * fermi_func(alpha) cur1 = cur0 + log(abs(Rm / Rp)) cur1 = cur1 + 1.0j * pi # en0 = -T * Rm + Ecb * cur0 en1 = -T * Rp + Ecb * cur1 elif itype == 2: alpha, Rm, Rp = (Ecb - mu) / T, (Dm - mu) / T, (Dp - mu) / T cur0 = -1.0j * pi * fermi_func( alpha) if alpha < Rp and alpha > Rm else 0 cur1 = cur0 + (1.0j * pi if alpha < Rp and alpha > Rm else 0) en0 = Ecb * cur0 en1 = Ecb * cur1 elif itype == 3: alpha = (Ecb - mu) / T cur0 = -1.0j * pi * fermi_func(alpha) cur1 = cur0 + 1.0j * pi en0 = Ecb * cur0 en1 = Ecb * cur1 #------------------------- return np.array([cur0, cur1, en0, en1])
def var_bound(data, model, query, z_dnk=None): ''' Determines the variational bounds. ''' bound = 0 # Unpack the the structs, for ease of access and efficiency K, topicPrior, wordPrior, wordDists, dtype = \ model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.dtype docLens, topicDists = \ query.docLens, query.topicDists # Initialize z matrix if necessary W, X = data.words, data.links D, T = W.shape # Perform the digamma transform for E[ln \theta] etc. topicDists = topicDists.copy() diTopicDists = fns.digamma(topicDists) diSumTopicDists = fns.digamma(topicDists.sum(axis=1)) diWordDists = fns.digamma(model.wordDists) diSumWordDists = fns.digamma(model.wordDists.sum(axis=1)) # E[ln p(topics|topicPrior)] according to q(topics) # prob_topics = D * (fns.gammaln(topicPrior.sum()) - fns.gammaln(topicPrior).sum()) \ + np.sum((topicPrior - 1)[np.newaxis, :] * (diTopicDists - diSumTopicDists[:, np.newaxis])) bound += prob_topics # and its entropy ent_topics = _dirichletEntropy(topicDists) bound += ent_topics # E[ln p(vocabs|vocabPrior)] # if type(model.vocabPrior) is float or type(model.vocabPrior) is int: prob_vocabs = K * (fns.gammaln(wordPrior * T) - T * fns.gammaln(wordPrior)) \ + np.sum((wordPrior - 1) * (diWordDists - diSumWordDists[:, np.newaxis] )) else: prob_vocabs = K * (fns.gammaln(wordPrior.sum()) - fns.gammaln(wordPrior).sum()) \ + np.sum((wordPrior - 1)[np.newaxis,:] * (diWordDists - diSumWordDists[:, np.newaxis] )) bound += prob_vocabs # and its entropy ent_vocabs = _dirichletEntropy(wordDists) bound += ent_vocabs # P(z|topic) is tricky as we don't actually store this. However # we make a single, simple estimate for this case. topicMeans = _convertDirichletParamToMeans(docLens, topicDists, topicPrior) prob_words = 0 prob_z = 0 ent_z = 0 for d in range(D): wordIdx, z = _infer_topics_at_d(d, data, docLens, topicMeans, topicPrior, diWordDists, diSumWordDists) # E[ln p(Z|topics) = sum_d sum_n sum_k E[z_dnk] E[ln topicDist_dk] exLnTopic = diTopicDists[d, :] - diSumTopicDists[d] prob_z += np.dot(z * exLnTopic[:, np.newaxis], W[d, :].data).sum() # E[ln p(W|Z)] = sum_d sum_n sum_k sum_t E[z_dnk] w_dnt E[ln vocab_kt] prob_words += np.sum( W[d, :].data[np.newaxis, :] * z * (diWordDists[:, wordIdx] - diSumWordDists[:, np.newaxis])) # And finally the entropy of Z ent_z -= np.dot(z * safe_log(z), W[d, :].data).sum() bound += (prob_z + ent_z + prob_words) _convertMeansToDirichletParam(docLens, topicMeans, topicPrior) return bound
def get_E_log_hi(i,k,v,mu,covariance,X): D = mu.shape[1] term1 = np.matmul((X[:,i].reshape(-1,1)-mu[k]).T , inv(covariance[k])) term2 = np.matmul(term1, X[:,i].reshape(-1,1)-mu[k])[0,0] val = digamma((v[k]+D)/2) - np.log( (v[k] + term2)/2 ) return val
def objFunc_constrained(rhoomega, sumLogPi=0, sumLogPiActiveVec=None, sumLogPiRemVec=None, nDoc=0, gamma=1.0, alpha=1.0, kappa=0.0, startAlphaLogPi=0.0, approx_grad=False, **kwargs): ''' Returns constrained objective function and its gradient. Args ------- rhoomega := 1D array, size 2*K Returns ------- f := -1 * L(rhoomega), where L is ELBO objective function (log posterior prob) g := gradient of f ''' assert not np.any(np.isnan(rhoomega)) assert not np.any(np.isinf(rhoomega)) rho, omega, K = _unpack(rhoomega) g1 = rho * omega g0 = (1 - rho) * omega digammaomega = digamma(omega) assert not np.any(np.isinf(digammaomega)) Elogu = digamma(g1) - digammaomega Elog1mu = digamma(g0) - digammaomega if nDoc > 0: # Any practical call to this will have nDoc > 0 if kappa > 0: scale = 1.0 ONcoef = K + 1.0 - g1 OFFcoef = K * kvec(K) + 1.0 + gamma - g0 Tvec = alpha * sumLogPi + startAlphaLogPi Tvec[:-1] += np.log(alpha + kappa) - np.log(kappa) # Calc local term Ebeta = np.hstack([rho, 1.0]) Ebeta[1:] *= np.cumprod(1 - rho) elbo_local = np.inner(Ebeta, Tvec) elif sumLogPiRemVec is not None: scale = nDoc ONcoef = 1 + (1.0 - g1) / scale OFFcoef = kvec(K) + (gamma - g0) / scale Pvec = alpha * sumLogPiActiveVec / scale Qvec = alpha * sumLogPiRemVec / scale # Calc local term Ebeta_gtm1 = np.hstack([1.0, np.cumprod(1 - rho[:-1])]) elbo_local = np.inner(rho * Ebeta_gtm1, Pvec) + \ np.inner((1-rho) * Ebeta_gtm1, Qvec) else: scale = nDoc ONcoef = 1 + (1.0 - g1) / scale OFFcoef = kvec(K) + (gamma - g0) / scale Tvec = alpha * sumLogPi / scale + startAlphaLogPi / scale # Calc local term Ebeta = np.hstack([rho, 1.0]) Ebeta[1:] *= np.cumprod(1 - rho) elbo_local = np.inner(Ebeta, Tvec) else: # This is special case for unit tests that make sure the optimizer # finds the parameters that set q(u) equal to its prior when nDoc=0 scale = 1 ONcoef = 1 - g1 OFFcoef = gamma - g0 elbo_local = 0 elbo = -1 * c_Beta(g1, g0) / scale \ + np.inner(ONcoef, Elogu) \ + np.inner(OFFcoef, Elog1mu) \ + elbo_local if approx_grad: return -1.0 * elbo # Gradient computation! trigamma_omega = polygamma(1, omega) trigamma_g1 = polygamma(1, g1) trigamma_g0 = polygamma(1, g0) assert np.all(np.isfinite(trigamma_omega)) assert np.all(np.isfinite(trigamma_g1)) gradrho = ONcoef * omega * trigamma_g1 \ - OFFcoef * omega * trigamma_g0 gradomega = ONcoef * (rho * trigamma_g1 - trigamma_omega) \ + OFFcoef * ((1 - rho) * trigamma_g0 - trigamma_omega) if nDoc > 0: if sumLogPiRemVec is None: # TODO make this line faster. This is the hot spot. Delta = calc_dEbeta_drho(Ebeta, rho, K) gradrho += np.dot(Delta, Tvec) else: Ebeta = np.hstack([rho, 1.0]) Ebeta[1:] *= np.cumprod(1 - rho) Psi = calc_Psi(Ebeta, rho, K) gradrho += np.dot(Psi, Qvec) Delta = calc_dEbeta_drho(Ebeta, rho, K)[:, :K] gradrho += np.dot(Delta, Pvec) grad = np.hstack([gradrho, gradomega]) return -1.0 * elbo, -1.0 * grad
def _grad_E_log_p_pi_given_beta(self, beta, gamma, alphatildes): # NOTE: switched argument name gamma <-> alpha retval = gamma*(digamma(alphatildes[:-1]) - digamma(alphatildes[-1])) \ - gamma * (digamma(gamma*beta) - digamma(gamma)) return retval
def bql_f(x_): return np.log(x_) - digamma(x_)
def e_ln_pi_k(gama0, Nk): gammak = gama0 + Nk return digamma(gammak) - digamma(gammak.sum())
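# Quick Monte Carlo check of the identity behind e_ln_pi_k above: for
# pi ~ Dirichlet(gamma), E[log pi_k] = digamma(gamma_k) - digamma(sum(gamma)).
# The gama0 and Nk values here are arbitrary.
import numpy as np
from scipy.special import digamma
rng = np.random.RandomState(0)
gama0, Nk = 1.5, np.array([10., 3., 0.5])
gammak = gama0 + Nk
samples = rng.dirichlet(gammak, size=200000)
print(np.log(samples).mean(axis=0))              # Monte Carlo E[log pi_k]
print(digamma(gammak) - digamma(gammak.sum()))   # closed form from e_ln_pi_k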
for i in clustered_img.keys(): m0.append(np.asarray(clustered_img.get(i)).mean()) mk = np.asarray(m0) mk_list.append(mk) test_mk_num = z.copy() test_mk_den = z.copy() e_x_mean_lambda_ = z.copy() shape = np.asarray([2 for _ in range(k)]) gammak = gamma0 + Nk alphak = Nk / 2 + alpha0 - 1 betak = beta0 + (rnk * e_x_mean_lambda_).sum(axis=0) e_ln_pi = e_ln_pi_k(gammak, Nk) e_ln_precision_ = digamma(alphak) - np.log(betak) e_precision_ = alphak / betak # Feature term1 = (rnk * (digamma(alphak) - np.log(betak))).sum(axis=1) / 2 term2 = 1 / 2 * (rnk * (alphak / betak) * ( (x.reshape(-1, 1) - mk.reshape(-1, 1).T)**2 + 1 / sk)).sum(axis=1) row_in_e = np.exp(term1 - term2) w = np.asarray([1 for _ in range(k)]) epsolon = mk var_test = sk epsolon_in = np.exp(-1 / 2 * 1 / var_test * ( (x.reshape(-1, 1) - epsolon.reshape(-1, 1).T)**2) + 1 / 2 * np.log(1 / var_test))
def perform_E_step(self, Y, params, terms_in_int_approx=5): n,d = Y.shape a = np.amin(Y) # for computing E[x] for j in xrange(n): ### posterior membership prob for h in range(self.nb_components): params['tau'][j,h] = self.weights[h] * self.component_dists[h].pdf(Y[j]) params['tau'][j,:] /= params['tau'][j,:].sum() ### update e variables for h in range(self.nb_components): # S integral S = gamma((self.component_dists[h].df + 2.*self.dim)/2.) / gamma((self.component_dists[h].df + self.dim)/2.) for r in xrange(terms_in_int_approx): r += 1 for s in xrange(r): S += ((-1)**(2*r-s-1) / r) * (gamma(r+1)/(gamma(s+1)*gamma(r-s+1))) * gamma((self.component_dists[h].df + self.dim)/2. + s) / gamma((self.component_dists[h].df + 2.*self.dim)/2. + s) * self.component_dists[h].impSamp_cdf(self.component_dists[h].get_c(Y[j]), mu=np.zeros((Y[j].shape[0],)), Sigma=((self.component_dists[h].df + self.component_dists[h].get_d(Y[j]))/(self.component_dists[h].df + self.dim + 2.*s))*self.component_dists[h].Lambda, df=self.component_dists[h].df+self.dim+2.*s) params['e'][0][j,h] = digamma(self.component_dists[h].df/2. + self.dim) - np.log((self.component_dists[h].df + self.component_dists[h].get_d(Y[j]))/2.) - S/self.component_dists[h].impSamp_cdf(Y[j], mu=np.zeros((Y[j].shape[0],)), Sigma=self.component_dists[h].Lambda, df=self.component_dists[h].df+self.dim) params['e'][1][j,h] = (self.component_dists[h].df + self.dim)/(self.component_dists[h].df + self.component_dists[h].get_d(Y[j])) * \ self.component_dists[h].impSamp_cdf(Y[j], mu=np.zeros((Y[j].shape[0],)), Sigma=self.component_dists[h].Lambda, df=self.component_dists[h].df+self.dim+2)/\ self.component_dists[h].impSamp_cdf(Y[j], mu=np.zeros((Y[j].shape[0],)), Sigma=self.component_dists[h].Lambda, df=self.component_dists[h].df+self.dim) # compute moment E[x] c = self.component_dists[h].impSamp_cdf(self.component_dists[h].mu[0] - a, mu = np.zeros(self.component_dists[h].mu.shape)) xi = np.zeros(self.component_dists[h].mu.shape) for d in range(self.dim): mu_minus_d = np.delete(self.component_dists[h].mu, d, axis=1) Sigma_minus_d = np.delete(np.delete(self.component_dists[h].Sigma, d, axis=0), d, axis=1) sigma_d = np.delete(self.component_dists[h].Sigma[:,d], d, axis=0) a_star = (mu_minus_d-a) - (mu_minus_d-a) * 1./self.component_dists[h].Sigma[d,d] Sigma_star = (self.component_dists[h].df + 1./self.component_dists[h].Sigma[d,d] * (self.component_dists[h].mu[0,d] - a)**2)/(self.component_dists[h].df-1) *\ Sigma_minus_d - 1./self.component_dists[h].Sigma[d,d] * np.dot(sigma_d, sigma_d.T) xi[0,d] = 1./(2*np.pi*self.component_dists[h].Sigma[d,d]) * (self.component_dists[h].df/(self.component_dists[h].df+(1./self.component_dists[h].Sigma[d,d])*(self.component_dists[h].mu[0,d] - a)**2))**((self.component_dists[h].df-1)/2.) * np.sqrt(self.component_dists[h].df/2) * gamma((self.component_dists[h].df-1)/2.)/gamma(self.component_dists[h].df/2.) 
* self.component_dists[h].impSamp_cdf(a_star, mu = np.zeros((1, Sigma_star.shape[0])), Sigma = Sigma_star, df=self.component_dists[h].df-1) epsilon = 1./c * np.dot(xi, self.component_dists[h].Sigma) E_x = self.component_dists[h].mu + epsilon params['e'][2][j,h,:] = params['e'][1][j,h] * E_x # compute moment E[xx] H = np.zeros((self.dim, self.dim)) for i in range(self.dim): for j in range(self.dim): if self.dim < 3: break if j != i: # precompute the necessary slices mu_ij = np.array([[self.component_dists[h].mu[0,i], self.component_dists[h].mu[0,j]]]) Sigma_ij = np.array([[self.component_dists[h].Sigma[i,i], self.component_dists[h].Sigma[i,j]], [self.component_dists[h].Sigma[j,i], self.component_dists[h].Sigma[j,j]]]) if j > i: mu_negij = np.delete(np.delete(self.component_dists[h].mu, i, axis=1), j-1, axis=1) Sigma_parenij = np.delete(np.delete(np.array([self.component_dists[h].Sigma[:,i], self.component_dists[h].Sigma[:,j]]).T, i, axis=0), j-1, axis=0) Sigma_negij = np.delete(np.delete(np.delete(np.delete(self.component_dists[h].Sigma, i, axis=0), j-1, axis=0), i, axis=1), j-1, axis=1) else: mu_negij = np.delete(np.delete(self.component_dists[h].mu, i, axis=1), j, axis=1) Sigma_parenij = np.delete(np.delete(np.array([self.component_dists[h].Sigma[:,i], self.component_dists[h].Sigma[:,j]]).T, i, axis=0), j, axis=0) Sigma_negij = np.delete(np.delete(np.delete(np.delete(self.component_dists[h].Sigma, i, axis=0), j, axis=0), i, axis=1), j, axis=1) df_star = self.component_dists[h].df + np.dot(np.dot((mu_ij - a), inv(Sigma_ij)), (mu_ij - a).T) a_star_star = (mu_negij - a) - np.dot(np.dot(Sigma_parenij, inv(Sigma_ij)), mu_ij - a) Sigma_star_star = df_star/(self.component_dists[h].df - 2) * (Sigma_negij - np.dot(np.dot(Sigma_parenij, inv(Sigma_ij)), Sigma_parenij.T)) H[i,j] = 1./(2 * np.pi * np.sqrt(self.component_dists[h].Sigma[i,i]*self.component_dists[h].Sigma[j,j] - self.component_dists[h].Sigma[i,j]**2)) H[i,j] *= (self.component_dists[h].df)/(self.component_dists[h].df-2) * (self.component_dists[h].df/df_star)**(self.component_dists[h].df/2 - 1) H[i,j] *= self.component_dists[h].impSamp_cdf(a_star_star, mu = 0., Sigma = Sigma_star_star, df = self.component_dists[h].df-2) H[i,i] = 1./self.component_dists[h].Sigma[i,i] * ((self.component_dists[h].mu[0,i] - a) * xi[0,i] - np.sum([self.component_dists[h].Sigma[i,k]*H[i,k] for k in range(self.dim) if k!=i])) E_xx = np.dot(self.component_dists[h].mu.T, self.component_dists[h].mu) + np.dot(self.component_dists[h].mu.T, epsilon) + np.dot(epsilon.T, self.component_dists[h].mu) - 1./c * np.dot(np.dot(self.component_dists[h].Sigma, H), self.component_dists[h].Sigma) + 1./c * (self.component_dists[h].df)/(self.component_dists[h].df-2) * self.component_dists[h].impSamp_cdf(self.component_dists[h].mu[0] - a, mu = np.zeros(self.component_dists[h].mu.shape), Sigma = (self.component_dists[h].df)/(self.component_dists[h].df-2) * self.component_dists[h].Sigma, df = self.component_dists[h].df-2) * self.component_dists[h].Sigma params['e'][3][j, h, :, :] = params['e'][1][j, h] * E_xx return params
def e_ln_precision(alpha, beta): return digamma(alpha) - np.log(np.abs(beta))
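# Quick Monte Carlo check of the identity behind e_ln_precision above: for
# lam ~ Gamma(shape=alpha, rate=beta), E[log lam] = digamma(alpha) - log(beta).
# The alpha and beta values here are arbitrary.
import numpy as np
from scipy import stats
from scipy.special import digamma
alpha, beta = 3.0, 2.0
lam = stats.gamma(a=alpha, scale=1.0/beta).rvs(size=200000, random_state=0)
print(np.log(lam).mean())              # Monte Carlo estimate
print(digamma(alpha) - np.log(beta))   # closed form used above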
def fit(self, curpus): word_indexes = [] word_counts = [] for row_curpus in curpus: row_indexes = [] row_counts = [] for w_i, w_c in row_curpus: row_indexes.append(w_i) row_counts.append(w_c) word_indexes.append(row_indexes) word_counts.append(row_counts) n_documents = len(word_indexes) max_index = 0 for d in range(n_documents): document_max = np.max(word_indexes[d]) if max_index < document_max: max_index = document_max n_word_types = max_index + 1 theta = np.random.uniform(size=(n_documents, self.n_topic)) old_theta = np.copy(theta) phi = np.random.uniform(size=(self.n_topic, n_word_types)) for n in range(self.n_iter): sum_phi = [] for k in range(self.n_topic): sum_phi.append(sum(phi[k])) ndk = theta nkv = np.zeros((self.n_topic, n_word_types)) sampe_X = [] for d in range(n_documents): n_words_in_doc = len(word_indexes[d]) sum_theta_d = sum(theta[d]) prob_d = digamma(theta[d]) - digamma(sum_theta_d) ndk[d, :] = 0. dummies = np.array([0.] * self.n_topic) for w in range(n_words_in_doc): word_no = word_indexes[d][w] prob_w = digamma(phi[:, word_no]) - digamma(sum_phi) latent_z = np.exp(prob_w + prob_d) latent_z /= np.sum(latent_z) ndk[d, :] += latent_z * word_counts[d][w] nkv[:, word_no] += latent_z * word_counts[d][w] z = np.argmax(latent_z) dummies[z] += 1. sampe_X.append(dummies / n_words_in_doc) theta = ndk + self.alpha phi = nkv + self.beta print(n, np.max(theta - old_theta)) old_theta = np.copy(theta) for k in range(self.n_topic): phi[k] = phi[k] / np.sum(phi[k]) for d in range(n_documents): theta[d] = theta[d] / np.sum(theta[d]) return phi, theta, np.array(sampe_X)
def myfunc(r, d): tot = 0 N = len(d) for thing in d: tot += digamma(r + thing) return N * np.log(r / (r + np.sum(d) / N)) - N * digamma(r) + tot
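# Cross-check for myfunc above (assumes numpy as np and scipy.special.digamma
# are imported as in the snippet): it matches the r-derivative of the negative
# binomial profile log-likelihood with the success probability profiled out at
# p = r/(r + mean(d)), so a finite difference of that profile log-likelihood
# (via scipy.stats.nbinom) should agree with it. Data here are arbitrary.
import numpy as np
from scipy import stats

def profile_ll(r, d):
    p = r / (r + np.mean(d))
    return stats.nbinom.logpmf(d, r, p).sum()

d = np.array([0, 3, 1, 7, 2, 4, 0, 5], dtype=float)
r, h = 2.5, 1e-6
fd = (profile_ll(r + h, d) - profile_ll(r - h, d)) / (2 * h)
print("finite difference %.6f vs myfunc %.6f" % (fd, myfunc(r, d)))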
def _compute_mi_cd(c, d, n_neighbors): """Compute mutual information between continuous and discrete variables. Parameters ---------- c : ndarray, shape (n_samples,) Samples of a continuous random variable. d : ndarray, shape (n_samples,) Samples of a discrete random variable. n_neighbors : int Number of nearest neighbors to search for each point, see [1]_. Returns ------- mi : float Estimated mutual information. If it turned out to be negative it is replace by 0. Notes ----- True mutual information can't be negative. If its estimate by a numerical method is negative, it means (providing the method is adequate) that the mutual information is close to 0 and replacing it by 0 is a reasonable strategy. References ---------- .. [1] B. C. Ross "Mutual Information between Discrete and Continuous Data Sets". PLoS ONE 9(2), 2014. """ n_samples = c.shape[0] c = c.reshape((-1, 1)) radius = np.empty(n_samples) label_counts = np.empty(n_samples) k_all = np.empty(n_samples) nn = NearestNeighbors() for label in np.unique(d): mask = d == label count = np.sum(mask) if count > 1: k = min(n_neighbors, count - 1) nn.set_params(n_neighbors=k) nn.fit(c[mask]) r = nn.kneighbors()[0] radius[mask] = np.nextafter(r[:, -1], 0) k_all[mask] = k label_counts[mask] = count # Ignore points with unique labels. mask = label_counts > 1 n_samples = np.sum(mask) label_counts = label_counts[mask] k_all = k_all[mask] c = c[mask] radius = radius[mask] nn.set_params(algorithm='kd_tree') nn.fit(c) ind = nn.radius_neighbors(radius=radius, return_distance=False) m_all = np.array([i.size for i in ind]) mi = (digamma(n_samples) + np.mean(digamma(k_all)) - np.mean(digamma(label_counts)) - np.mean(digamma(m_all + 1))) return max(0, mi)
def _objective(l_count, dg, sub): alpha = np.exp(l_count) + 1 nom = digamma(alpha) result = nom - sub return (result - dg)**2
def makeDeleteMoveCandidate_LP(Data, curLP, curModel, targetCompID=10, deleteStrategy='truelabels', minResp=0.001, **curLPkwargs): ''' Returns ------- propcurLP : dict of local params Replaces targetCompID with K "new" states, each one tracking exactly one existing state. ''' curResp = curLP['resp'] maxRespValBelowThr = curResp[curResp < minResp].max() assert maxRespValBelowThr < 1e-90 Natom, Korig = curResp.shape remCompIDs = np.setdiff1d(np.arange(Korig), [targetCompID]) relDocIDs = np.flatnonzero( curLP['DocTopicCount'][:, targetCompID] > minResp) propResp = 1e-100 * np.ones((Natom, 2 * (Korig - 1))) propResp[:, :Korig - 1] = curResp[:, remCompIDs] if deleteStrategy.count('truelabels'): relAtoms = curResp[:, targetCompID] > minResp reltrueResp = Data.TrueParams['resp'][relAtoms].copy() reltrueResp[reltrueResp < minResp] = 1e-100 reltrueResp /= reltrueResp.sum(axis=1)[:, np.newaxis] propResp[relAtoms, Korig-1:] = \ reltrueResp * curResp[relAtoms, targetCompID][:,np.newaxis] propcurLP = curModel.allocModel.initLPFromResp(Data, dict(resp=propResp)) return propcurLP Lik = curLP['E_log_soft_ev'][:, remCompIDs].copy() # From-scratch strategy for d in relDocIDs: mask_d = np.arange(Data.doc_range[d], Data.doc_range[d + 1]) relAtomIDs_d = mask_d[curLP['resp'][mask_d, targetCompID] > minResp] fixedDocTopicCount_d = curLP['DocTopicCount'][d, remCompIDs] relLik_d = Lik[relAtomIDs_d, :] relwc_d = Data.word_count[relAtomIDs_d] targetsumResp_d = curLP['resp'][relAtomIDs_d, targetCompID] * relwc_d sumResp_d = np.zeros_like(targetsumResp_d) DocTopicCount_d = np.zeros_like(fixedDocTopicCount_d) DocTopicProb_d = np.zeros_like(DocTopicCount_d) sumalphaEbeta = curModel.allocModel.alpha_E_beta()[targetCompID] alphaEbeta = sumalphaEbeta * 1.0 / (Korig - 1.0) * np.ones(Korig - 1) for riter in range(10): np.add(DocTopicCount_d, alphaEbeta, out=DocTopicProb_d) digamma(DocTopicProb_d, out=DocTopicProb_d) DocTopicProb_d -= DocTopicProb_d.max() np.exp(DocTopicProb_d, out=DocTopicProb_d) # Update sumResp for all tokens in document np.dot(relLik_d, DocTopicProb_d, out=sumResp_d) # Update DocTopicCount_d: 1D array, shape K # sum(DocTopicCount_d) equals Nd[targetCompID] np.dot(targetsumResp_d / sumResp_d, relLik_d, out=DocTopicCount_d) DocTopicCount_d *= DocTopicProb_d DocTopicCount_d += fixedDocTopicCount_d DocTopicCount_dj = curLP['DocTopicCount'][d, targetCompID] DocTopicCount_dnew = np.sum(DocTopicCount_d) - \ fixedDocTopicCount_d.sum() assert np.allclose(DocTopicCount_dj, DocTopicCount_dnew, rtol=0, atol=1e-6) # Create proposal resp for relevant atoms in this doc only propResp_d = relLik_d.copy() propResp_d *= DocTopicProb_d[np.newaxis, :] propResp_d /= sumResp_d[:, np.newaxis] propResp_d *= curLP['resp'][relAtomIDs_d, targetCompID][:, np.newaxis] for n in range(propResp_d.shape[0]): size_n = curLP['resp'][relAtomIDs_d[n], targetCompID] sizeOrder_n = np.argsort(propResp_d[n, :]) for k, compID in enumerate(sizeOrder_n): if propResp_d[n, compID] > minResp: break propResp_d[n, compID] = 1e-100 biggerCompIDs = sizeOrder_n[k + 1:] propResp_d[n, biggerCompIDs] /= \ propResp_d[n,biggerCompIDs].sum() propResp_d[n, biggerCompIDs] *= size_n # Fill in huge resp matrix with specific values propResp[relAtomIDs_d, Korig - 1:] = propResp_d assert np.allclose(propResp.sum(axis=1), 1.0, rtol=0, atol=1e-8) propcurLP = curModel.allocModel.initLPFromResp(Data, dict(resp=propResp)) return propcurLP
def logL_prime(M):
    # Looks like the derivative, with respect to the dispersion parameter M,
    # of a negative-binomial log-likelihood profiled over the mean; pmf, N and
    # k_bar are taken from the enclosing scope (pmf[n] = number of samples
    # with count n, N = total samples, k_bar = sample mean).
    t1 = np.sum([pmf[n] * digamma(n + M) for n in range(len(pmf))])
    t2 = -N * digamma(M)
    t3 = N * np.log(M / (M + k_bar))
    return t1 + t2 + t3
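# Illustrative sketch (not from the original source): under the assumption
# stated in the comment above (pmf = histogram of counts, N = sum(pmf),
# k_bar = sample mean), the zero of this derivative gives the maximum-
# likelihood dispersion M of a negative-binomial fit. A standalone copy is
# used here so the snippet does not depend on the closure's variables.
import numpy as np
from scipy.special import digamma
from scipy.optimize import brentq
from scipy.stats import nbinom

M_true, k_true = 4.0, 6.0
samples = nbinom.rvs(M_true, M_true / (M_true + k_true), size=5000, random_state=0)

pmf = np.bincount(samples)        # number of samples observed at each count value
N = pmf.sum()
k_bar = samples.mean()

def logL_prime_demo(M):           # same form as logL_prime above
    t1 = np.sum([pmf[n] * digamma(n + M) for n in range(len(pmf))])
    return t1 - N * digamma(M) + N * np.log(M / (M + k_bar))

M_hat = brentq(logL_prime_demo, 1e-3, 1e3)
print("estimated M: %.3f (true %.3f)" % (M_hat, M_true))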
def _updateTopicHyperParamsFromMeans(model, query, max_iters=100):
    '''
    Update the hyperparameters on the Dirichlet prior over topics.

    This is a Newton-Raphson style method. We iterate until convergence or
    until the maximum number of iterations is hit. We converge if the 1-norm
    of the difference between the previous and current estimate is less than
    0.001 / K where K is the number of topics.

    This is taken from Tom Minka's tech-note on "Estimating a Dirichlet
    Distribution", specifically the section on estimating a Polya
    distribution, which performed best in experiments. We substitute in the
    expected counts of topic assignments to variables.

    At each iteration, the new value of a_k is set to

               \sum_d \Psi(n_dk + a_k) - \Psi(a_k)
        a_k * ----------------------------------------------
               \sum_d \Psi(n_d + \sum_j a_j) - \Psi(a_k)

    where n_dk is the count of times topic k was assigned to tokens in
    document d, and its expected value is the same as the parameter of the
    posterior over topics for that document d, minus the hyper-parameter used
    to estimate that posterior.

    In this case, we assume that this method has been called from within the
    training routine, so topicDists is essentially the mean of per-token
    topic-assignments, and thus needs to be scaled appropriately.

    :param model: all the model parameters, notably the topicPrior, which is
    mutated IN-PLACE.
    :param query: all the document-level parameters, notably the topicDists,
    from which an appropriate prior is estimated. It's expected that this
    contains the topic hyper-parameters, as usual, and not any intermediate
    representations (i.e. means) used by the inference procedure.
    '''
    print("Updating hyper-parameters")
    topic_prior = model.topicPrior
    old_topic_prior = topic_prior.copy()

    doc_lens = query.docLens
    doc_topic_counts = query.topicDists * doc_lens[:, np.newaxis] \
        + old_topic_prior[np.newaxis, :]

    D, K = doc_topic_counts.shape

    psi_old_tprior = np.ndarray(topic_prior.shape, dtype=topic_prior.dtype)

    for _ in range(max_iters):
        doc_topic_counts += (topic_prior - old_topic_prior)[np.newaxis, :]
        old_topic_prior[:] = topic_prior

        fns.digamma(old_topic_prior, out=psi_old_tprior)

        numer = fns.psi(doc_topic_counts).sum(axis=0) - D * psi_old_tprior
        denom = fns.psi(doc_lens + old_topic_prior.sum()).sum() - D * psi_old_tprior
        topic_prior[:] = old_topic_prior * (numer / denom)

        if la.norm(np.subtract(old_topic_prior, topic_prior), 1) < (0.001 * K):
            break

    # Undo the in-place changes we've been making to the topic distributions
    doc_topic_counts -= old_topic_prior[np.newaxis, :]
    doc_topic_counts /= doc_lens[:, np.newaxis]

    # Make sure it never is zero or negative
    for k in range(K):
        topic_prior[k] = max(topic_prior[k], 1E-6)
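# Self-contained sketch (an illustration, not the project's code) of the
# Minka-style fixed-point update described in the docstring above, applied to
# synthetic Dirichlet-multinomial counts. The update used here follows Minka's
# tech-note, where the denominator is a scalar shared across components:
#   a_k <- a_k * (sum_d psi(n_dk + a_k) - D psi(a_k))
#              / (sum_d psi(n_d + sum_j a_j) - D psi(sum_j a_j))
import numpy as np
from scipy.special import psi

rng = np.random.default_rng(0)
D, K, n_d = 500, 5, 200
a_true = np.array([0.5, 1.0, 2.0, 4.0, 8.0])
theta = rng.dirichlet(a_true, size=D)
counts = np.vstack([rng.multinomial(n_d, theta[d]) for d in range(D)])
doc_lens = counts.sum(axis=1)

a = np.ones(K)
for _ in range(1000):
    numer = psi(counts + a).sum(axis=0) - D * psi(a)
    denom = psi(doc_lens + a.sum()).sum() - D * psi(a.sum())
    a_new = a * numer / denom
    if np.abs(a_new - a).sum() < 1e-8:
        a = a_new
        break
    a = a_new

print("estimated prior:", np.round(a, 2), "true:", a_true)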
m = np.mean(int_uniform.values, axis=None)
uci, lci = lib.compute_ci(int_uniform.values, axis=None)
print("Mean, confidence interval for entire uniform grid: %.5f, [%.5f, %.5f]."
      % (m, uci, lci))

m = np.mean(int_normal.values, axis=None)
uci, lci = lib.compute_ci(int_normal.values, axis=None)
print("Mean, confidence interval for entire normal grid: %.5f, [%.5f, %.5f]."
      % (m, uci, lci))

# =============================================================================
# Chapter 3 - Difference between ln(n) and psi(n)
# =============================================================================
n = np.arange(1, 51, 1)
ln = np.log(n)
psi = scp_sp.digamma(n)

plt.figure()
plt.plot(n, ln, label=r"$\ln(n)$")
plt.plot(n, psi, label=r"$\psi(n)$")
plt.xlabel(r"$n$")
plt.legend()
plt.savefig("output/ln_psi.png", dpi=500)

plt.figure()
plt.plot(n, np.abs(ln - psi))
plt.xlabel(r"$n$")
plt.ylabel(r"$|\ln(n) - \psi(n)|$")
plt.savefig("output/ln_psi_diff.png", dpi=500)

# =============================================================================
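# Sketch (not part of the original script), reusing n, ln and psi defined just
# above: the gap plotted above shrinks like the leading terms of the
# asymptotic expansion psi(n) ~ ln(n) - 1/(2n) - 1/(12 n^2).
approx = ln - 1.0 / (2 * n) - 1.0 / (12 * n**2)
print("max |psi(n) - ln(n)|               :", np.max(np.abs(psi - ln)))
print("max |psi(n) - asymptotic expansion|:", np.max(np.abs(psi - approx)))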
def _entropy(self, x): return digamma(self.n) - digamma(self.n_neighbors) + self._epsilon(x)
def _old_train(data, model, query, plan, updateVocab=True): ''' Infers the topic distributions in general, and specifically for each individual datapoint, Params: data - the training data, we just use the DxT document-term matrix model - the initial model configuration. This is MUTATED IN-PLACE qyery - the query results - essentially all the "local" variables matched to the given observations. Also MUTATED IN-PLACE plan - how to execute the training process (e.g. iterations, log-interval etc.) Return: The updated model object (note parameters are updated in place, so make a defensive copy if you want it) The query object with the update query parameters ''' iterations, epsilon, logFrequency, fastButInaccurate, debug, batchSize = \ plan.iterations, plan.epsilon, plan.logFrequency, plan.fastButInaccurate, plan.debug, plan.batchSize docLens, topicMeans = \ query.docLens, query.topicDists K, topicPrior, vocabPrior, wordDists ,dtype = \ model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.dtype # Quick sanity check if np.any(docLens < 1): raise ValueError( "Input document-term matrix contains at least one document with no words" ) assert model.dtype == np.float64, "Only implemented for 64-bit floats" # Prepare the data for inference topicMeans = _convertDirichletParamToMeans(docLens, topicMeans, topicPrior) W = data.words D, T = W.shape iters, bnds, likes = [], [], [] # A few parameters for handling adaptive step-sizes in SGD grad = 0 grad_inner = 0 grad_rate = 1 log_likely = 0 # complete dataset likelihood for gradient adjustments stepSize = np.array([1.] * K, dtype=model.dtype) # Instead of storing the full topic assignments for every individual word, we # re-estimate from scratch. I.e for the memberships z which is DxNxT in dimension, # we only store a 1xNxT = NxT part. 
diWordDistSums = np.empty((K, ), dtype=dtype) diWordDists = np.empty(wordDists.shape, dtype=dtype) wordUpdates = wordDists.copy() if batchSize > 0 else None batchProcessCount = 0 # Amend the name if batchSize == 0 implying we're using SGD modelName = "lda/svbp/%s" % _sgd_desc(plan) \ if batchSize > 0 else model.name print(modelName) for itr in range(iterations): diWordDistSums[:] = wordDists.sum(axis=1) fns.digamma(diWordDistSums, out=diWordDistSums) fns.digamma(wordDists, out=diWordDists) if updateVocab: # Perform inference, updating the vocab if batchSize == 0: wordDists[:, :] = vocabPrior else: wordUpdates[:, :] = 0 for d in range(D): batchProcessCount += 1 #if debug and d % 100 == 0: printAndFlushNoNewLine(".") wordIdx, z = _update_topics_at_d(d, data, docLens, topicMeans, topicPrior, diWordDists, diWordDistSums) wordDists[:, wordIdx] += W[d, :].data[np.newaxis, :] * z if plan.rate_algor == RateAlgorAmaria: log_likely += 0 elif plan.rate_algor == RateAlgorVariance: g = wordDists.mean(axis=0) + vocabPrior grad *= (1 - grad_rate) grad += grad_rate * wordDists grad += grad_rate * vocabPrior gg += 0 elif plan.rate_algor != RateAlgorTimeKappa: raise ValueError("Unknown rate algorithm " + str(plan.rate_algor)) if batchSize > 0 and batchProcessCount == batchSize: batch_index = ( itr * D + d ) / batchSize #TODO Will not be right if batchSize is not a multiple of D stepSize = _step_sizes(stepSize, batch_index, g, gg, log_likely, plan) wordDists *= (1 - stepSize) wordDists += stepSize * vocabPrior stepSize *= float(D) / batchSize wordUpdates *= stepSize wordDists += wordUpdates diWordDistSums[:] = wordDists.sum(axis=1) fns.digamma(diWordDistSums, out=diWordDistSums) fns.digamma(wordDists, out=diWordDists) wordUpdates[:, :] = 0 batchProcessCount = 0 log_likely = 0 if debug: bnds.append(_var_bound_internal(data, model, query)) likes.append( _log_likelihood_internal(data, model, query)) perp = perplexity_from_like(likes[-1], W.sum()) print( "Iteration %d, after %d docs: Train Perp = %4.0f Bound = %.3f" % (itr, batchSize, perp, bnds[-1])) sys.stdout.flush() # Log bound and the determine if we can stop early if itr % logFrequency == 0 or debug: iters.append(itr) bnds.append(_var_bound_internal(data, model, query)) likes.append(_log_likelihood_internal(data, model, query)) perp = perplexity_from_like(likes[-1], W.sum()) print("Iteration %d : Train Perp = %4.0f Bound = %.3f" % (itr, perp, bnds[-1])) if len(iters) > 2 and (iters[-1] > 20 or (iters[-1] > 2 and batchSize > 0)): lastPerp = perplexity_from_like(likes[-2], W.sum()) if lastPerp - perp < 1: print("Converged, existing early") break # Update hyperparameters (do this after bound, to make sure bound # calculation is internally consistent) if HyperUpdateEnabled and itr > 0 and itr % HyperParamUpdateInterval == 0: if debug: print("Topic Prior was " + str(topicPrior)) _updateTopicHyperParamsFromMeans(model, query) if debug: print("Topic Prior is now " + str(topicPrior)) else: for d in range(D): _ = _update_topics_at_d(d, data, docLens, topicMeans, topicPrior, diWordDists, diWordDistSums) topicMeans = _convertMeansToDirichletParam(docLens, topicMeans, topicPrior) return ModelState(K, topicPrior, vocabPrior, wordDists, True, dtype, modelName), \ QueryState(docLens, topicMeans, True), \ (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
def updateExpectations(self):
    # For a Gamma(a, b) distribution with rate b: E[X] = a/b and
    # E[ln X] = digamma(a) - ln(b).
    E = self.params['a'] / self.params['b']
    lnE = special.digamma(self.params['a']) - s.log(self.params['b'])
    self.expectations = {'E': E, 'lnE': lnE}
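# Quick Monte Carlo sanity check (illustrative, not part of the original
# class): for X ~ Gamma(shape=a, rate=b), E[X] = a/b and E[ln X] = psi(a) - ln b.
# Note that numpy parameterises the Gamma by shape and scale = 1/rate.
import numpy as np
from scipy.special import digamma

a, b = 3.0, 2.0
x = np.random.default_rng(0).gamma(shape=a, scale=1.0 / b, size=200000)
print("E[X]    sample %.4f  analytic %.4f" % (x.mean(), a / b))
print("E[lnX]  sample %.4f  analytic %.4f" % (np.log(x).mean(), digamma(a) - np.log(b)))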
def train(data, model, query, plan, updateVocab=True): ''' Infers the topic distributions in general, and specifically for each individual datapoint, Params: data - the training data, we just use the DxT document-term matrix model - the initial model configuration. This is MUTATED IN-PLACE qy=uery - the query results - essentially all the "local" variables matched to the given observations. Also MUTATED IN-PLACE plan - how to execute the training process (e.g. iterations, log-interval etc.) Return: The updated model object (note parameters are updated in place, so make a defensive copy if you want it) The query object with the update query parameters ''' iterations, epsilon, logFrequency, fastButInaccurate, debug, batchSize, rateAlgor = \ plan.iterations, plan.epsilon, plan.logFrequency, plan.fastButInaccurate, plan.debug, plan.batchSize, plan.rate_algor docLens, topicMeans = \ query.docLens, query.topicDists K, topicPrior, vocabPrior, wordDists, dtype = \ model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.dtype # Quick sanity check if np.any(docLens < 1): raise ValueError( "Input document-term matrix contains at least one document with no words" ) assert model.dtype == np.float64, "Only implemented for 64-bit floats" # Prepare the data for inference topicMeans = _convertDirichletParamToMeans(docLens, topicMeans, topicPrior) W = data.words D, T = W.shape iters, bnds, likes = [], [], [] # A few parameters for handling adaptive step-sizes in SGD if plan.rate_algor == RateAlgorBatch: batchSize = D batchCount = 1 else: batchSize = plan.batchSize batchCount = D // batchSize + 1 gradStep = constantArray((K, ), 1. / float(batchSize), dtype=dtype) grad = np.zeros((K, T), dtype=dtype) ex_grad = grad.copy() exp_gtg = np.zeros((K, ), dtype=dtype) stepSize = np.ones((K, ), dtype=dtype) # The digamma terms for the vocabularly diWordDists = fns.digamma(wordDists) diWordDistSums = np.sum(wordDists, axis=1) fns.digamma(diWordDistSums, out=diWordDistSums) # Amend the name to incorporate training information rateAlgor = plan.rate_algor modelName = "lda/svbp/%s" % _sgd_desc(plan) print(modelName) # Start traininng d = -1 for b in range(batchCount * iterations): grad.fill(vocabPrior) # firstD = d for s in range(batchSize): d = d + 1 if (d + 1) < D else 0 wordIdx, z = _update_topics_at_d(d, data, docLens, topicMeans, topicPrior, diWordDists, diWordDistSums) grad[:, wordIdx] += W[d, :].data[np.newaxis, :] * z if rateAlgor == RateAlgorBatch: wordDists[:, :] = grad[:, :] else: if rateAlgor == RateAlgorTimeKappa: stepSize[:] = (b + plan.rate_delay)**(-plan.forgetting_rate) elif rateAlgor == RateAlgorVariance: update_inplace_v(gradStep, ex_grad, change=grad) gtg = stepSize.copy() for k in range(K): stepSize[k] = np.dot(ex_grad[k, :], ex_grad[k, :]) gtg[k] = np.dot(grad[k, :], grad[k, :]) update_inplace_s(gradStep, old=exp_gtg, change=gtg) stepSize /= exp_gtg gradStep = gradStep * (1 - stepSize) + 1 elif rateAlgor == RateAlgorAmaria: topicMeans = _convertMeansToDirichletParam( docLens, topicMeans, topicPrior) # doc_indices = np.linspace(firstD, firstD + batchSize -1, batchSize) % D log_likely = var_bound( data, # data._reorder(doc_indices), ModelState(K, topicPrior, vocabPrior, wordDists, True, dtype, modelName), QueryState(docLens, topicMeans, True)) p = stepSize[0] a, b = plan.rate_a, plan.rate_b p *= exp(a * (b * -log_likely - p)) stepSize[:] = p topicMeans = _convertMeansToDirichletParam( docLens, topicMeans, topicPrior) else: raise ValueError("No code to support the '" + str(plan.rate_algor) + 
"' learning rate adaptation algorithm") update_inplace_v(stepSize, old=wordDists, change=grad) if debug: print("%s : t=%d : step=%s" % (rateAlgor, b, str(stepSize))) if is_not_all_real(wordDists): print("Worddists nan") fns.digamma(wordDists, out=diWordDists) if is_not_all_real(diWordDists): print("Digamma worddists nan") np.sum(wordDists, axis=1, out=diWordDistSums) fns.digamma(diWordDistSums, out=diWordDistSums) topicMeans = _convertMeansToDirichletParam(docLens, topicMeans, topicPrior) return ModelState(K, topicPrior, vocabPrior, wordDists, True, dtype, modelName), \ QueryState(docLens, topicMeans, True), \ (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
def gmm(X, K, max_iter=100): N, D = X.shape # parameters for pi, mu, and precision alphas = np.ones(K, dtype=np.float32) # prior parameter for pi (dirichlet) orig_alphas = np.ones( K, dtype=np.float32) # prior parameter for pi (dirichlet) # mu_means = np.zeros((K, D), dtype=np.float32) # prior mean for mu (normal) ### No! # mu_covs = np.empty((K, D, D), dtype=np.float32) # prior covariance for mu (normal) orig_c = 10.0 # for k in xrange(K): # mu_covs[k] = np.eye(D)*orig_c orig_a = np.ones(K, dtype=np.float32) * D a = np.ones(K, dtype=np.float32) * D # prior for precision (wishart) orig_B = np.empty((K, D, D)) B = np.empty((K, D, D)) # precision (wishart) empirical_cov = np.cov(X.T) for k in xrange(K): B[k] = (D / 10.0) * empirical_cov orig_B[k] = (D / 10.0) * empirical_cov # try random init instead # mu_means = np.random.randn(K, D)*orig_c mu_means = np.empty((K, D)) for j in xrange(K): mu_means[j] = X[np.random.choice(N)] mu_covs = wishart.rvs(df=orig_a[0], scale=np.linalg.inv(B[0]), size=K) costs = np.zeros(max_iter) for iter_idx in xrange(max_iter): # calculate q(c[i]) # phi = np.empty((N,K)) # index i = sample, index j = cluster t1 = np.empty(K) t2 = np.empty((N, K)) t3 = np.empty(K) t4 = np.empty(K) # calculate this first because we will use it multiple times Binv = np.empty((K, D, D)) for j in range(K): Binv[j] = np.linalg.inv(B[j]) for j in xrange(K): # calculate t1 t1[j] = -np.log(np.linalg.det(B[j])) for d in xrange(D): t1[j] += digamma((1 - d + a[j]) / 2.0) # calculate t2 for i in xrange(N): diff_ij = X[i] - mu_means[j] t2[i, j] = diff_ij.dot((a[j] * Binv[j]).dot(diff_ij)) # calculate t3 t3[j] = np.trace(a[j] * Binv[j].dot(mu_covs[j])) # calculate t4 t4[j] = digamma(alphas[j]) - digamma(alphas.sum()) # calculate phi from t's # MAKE SURE 1-d array gets added to 2-d array correctly phi = np.exp(0.5 * t1 - 0.5 * t2 - 0.5 * t3 + t4) # print "phi before normalize:", phi phi = phi / phi.sum(axis=1, keepdims=True) # print "phi:", phi cluster_assignments = phi.argmax(axis=1) n = phi.sum(axis=0) # there should be K of these # print "n[j]:", n # update q(pi) alphas = orig_alphas + n # print "alphas:", alphas # update q(mu) for j in xrange(K): mu_covs[j] = np.linalg.inv((1.0 / orig_c) * np.eye(D) + n[j] * a[j] * Binv[j]) mu_means[j] = mu_covs[j].dot(a[j] * Binv[j]).dot(phi[:, j].dot(X)) # print "means:", mu_means # print "mu_covs:", mu_covs # update q(lambda) a = orig_a + n for j in xrange(K): B[j] = orig_B[j].copy() for i in xrange(N): diff_ij = X[i] - mu_means[j] B[j] += phi[i, j] * (np.outer(diff_ij, diff_ij) + mu_covs[j]) # print "a[j]:", a # print "B[j]:", B costs[iter_idx] = get_cost(X, K, cluster_assignments, phi, alphas, mu_means, mu_covs, a, B, orig_alphas, orig_c, orig_a, orig_B) plt.plot(costs) plt.title("Costs") plt.show() print "cluster assignments:\n", cluster_assignments plt.scatter(X[:, 0], X[:, 1], c=cluster_assignments, s=100, alpha=0.7) plt.show()
def get_cost(X, K, cluster_assignments, phi, alphas, mu_means, mu_covs, a, B, orig_alphas, orig_c, orig_a, orig_B): N, D = X.shape total = 0 ln2pi = np.log(2 * np.pi) # calculate B inverse since we will need it Binv = np.empty((K, D, D)) for j in xrange(K): Binv[j] = np.linalg.inv(B[j]) # calculate expectations first Elnpi = digamma(alphas) - digamma(alphas.sum()) # E[ln(pi)] Elambda = np.empty((K, D, D)) Elnlambda = np.empty(K) for j in xrange(K): Elambda[j] = a[j] * Binv[j] Elnlambda[j] = D * np.log(2) - np.log(np.linalg.det(B[j])) for d in xrange(D): Elnlambda[j] += digamma(a[j] / 2.0 + (1 - d) / 2.0) # now calculate the log joint likelihood # Gaussian part # total -= N*D*ln2pi # total += 0.5*Elnlambda.sum() # for j in xrange(K): # # total += 0.5*Elnlambda[j] # vectorized # for i in xrange(N): # if cluster_assignments[i] == j: # diff_ij = X[i] - mu_means[j] # total -= 0.5*( diff_ij.dot(Elambda[j]).dot(diff_ij) + np.trace(Elambda[j].dot(mu_covs[j])) ) # mixture coefficient part # total += Elnpi.sum() # use phi instead for j in xrange(K): for i in xrange(N): diff_ij = X[i] - mu_means[j] inside = Elnlambda[j] - D * ln2pi inside += -diff_ij.dot(Elambda[j]).dot(diff_ij) - np.trace( Elambda[j].dot(mu_covs[j])) # inside += Elnpi[j] total += phi[i, j] * (0.5 * inside + Elnpi[j]) # E{lnp(mu)} - based on original prior for j in xrange(K): E_mu_dot_mu = np.trace(mu_covs[j]) + mu_means[j].dot(mu_means[j]) total += -0.5 * D * np.log( 2 * np.pi * orig_c) - 0.5 * E_mu_dot_mu / orig_c # print "total:", total # E{lnp(lambda)} - based on original prior for j in xrange(K): total += (orig_a[j] - D - 1) / 2.0 * Elnlambda[j] - 0.5 * np.trace( orig_B[j].dot(Elambda[j])) # print "total 1:", total total += -orig_a[j] * D / 2.0 * np.log(2) + 0.5 * orig_a[j] * np.log( np.linalg.det(orig_B[j])) # print "total 2:", total total -= D * (D - 1) / 4.0 * np.log(np.pi) # print "total 3:", total for d in xrange(D): total -= np.log(gamma(orig_a[j] / 2.0 + (1 - d) / 2.0)) # E{lnp(pi)} - based on original prior # - lnB(orig_alpha) + sum[j]{ orig_alpha[j] - 1}*E[lnpi_j] total += np.log(gamma(orig_alphas.sum())) - np.log( gamma(orig_alphas)).sum() total += ((orig_alphas - 1) * Elnpi).sum() # should be 0 since orig_alpha = 1 # calculate entropies of the q distributions # q(c) for i in xrange(N): total += stats.entropy(phi[i]) # categorical entropy # q(pi) total += dirichlet.entropy(alphas) # q(mu) for j in xrange(K): total += mvn.entropy(cov=mu_covs[j]) # q(lambda) for j in xrange(K): total += wishart.entropy(df=a[j], scale=Binv[j]) return total
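# Usage sketch for gmm()/get_cost() above (illustrative only): fit K = 3
# components to three well-separated 2-D Gaussian blobs. It assumes the
# module-level imports used by those functions (numpy as np,
# matplotlib.pyplot as plt, scipy.special digamma/gamma, scipy.stats wishart,
# dirichlet, multivariate_normal as mvn, stats) are in place; note the code
# above is written for Python 2 (xrange, print statements).
def _demo_gmm():
    np.random.seed(0)
    X = np.vstack([
        np.random.randn(100, 2),                          # blob at the origin
        np.random.randn(100, 2) + np.array([6.0, 0.0]),   # blob shifted in x
        np.random.randn(100, 2) + np.array([0.0, 6.0]),   # blob shifted in y
    ])
    gmm(X, K=3, max_iter=50)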
def gamma_gradient(self, k):
    """
    :param k: value of the Gamma shape parameter k
    :return: value of the gradient ln(k) - psi(k) - c at k
    """
    return np.log(k) - special.digamma(k) - self.c
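# Illustrative sketch under an assumed interpretation: if self.c holds the
# statistic c = ln(mean(x)) - mean(ln(x)) of positive data x, then the zero of
# ln(k) - psi(k) - c is the maximum-likelihood estimate of the Gamma shape k.
import numpy as np
from scipy.special import digamma
from scipy.optimize import brentq

x = np.random.default_rng(0).gamma(shape=2.5, scale=1.3, size=10000)
c = np.log(x.mean()) - np.log(x).mean()
k_hat = brentq(lambda k: np.log(k) - digamma(k) - c, 1e-6, 1e6)
print("estimated shape k: %.3f (true 2.5)" % k_hat)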
def df_eq(x):
    # Zero of this expression in x solves tmp = log(x/2) - digamma(x/2) + 1;
    # tmp comes from the enclosing scope.
    return tmp - (np.log(x / 2.) - digamma(x / 2.) + 1.)
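# Illustrative standalone version (df_eq above closes over tmp): when tmp > 1
# the equation tmp = log(x/2) - digamma(x/2) + 1 has a unique positive root,
# which resembles the degrees-of-freedom update used in EM for Student-t
# models. The value tmp = 1.2 here is chosen by hand.
import numpy as np
from scipy.special import digamma
from scipy.optimize import brentq

tmp = 1.2
nu = brentq(lambda x: tmp - (np.log(x / 2.) - digamma(x / 2.) + 1.), 1e-6, 1e6)
print("root: %.4f" % nu)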
def invwishart_entropy(sigma, nu, chol=None):
    D = sigma.shape[0]
    chol = np.linalg.cholesky(sigma) if chol is None else chol
    Elogdetlmbda = special.digamma((nu - np.arange(D)) / 2).sum() \
        + D * np.log(2) - 2 * np.log(chol.diagonal()).sum()
    return invwishart_log_partitionfunction(sigma, nu, chol) \
        - (nu - D - 1) / 2 * Elogdetlmbda + nu * D / 2
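# Monte Carlo sanity check (illustrative) of the Elogdetlmbda term used above,
# reading it as E[log det(Lambda)] for Lambda ~ Wishart(nu, sigma^{-1}):
#   E[log det Lambda] = sum_i psi((nu - i)/2) + D log 2 - log det sigma
import numpy as np
from scipy import special
from scipy.stats import wishart

D, nu = 3, 7
sigma = np.array([[2.0, 0.3, 0.0],
                  [0.3, 1.0, 0.2],
                  [0.0, 0.2, 0.5]])
chol = np.linalg.cholesky(sigma)
analytic = special.digamma((nu - np.arange(D)) / 2).sum() \
    + D * np.log(2) - 2 * np.log(chol.diagonal()).sum()

samples = wishart.rvs(df=nu, scale=np.linalg.inv(sigma), size=20000, random_state=0)
mc = np.mean([np.linalg.slogdet(S)[1] for S in samples])
print("E[log det]: analytic %.4f  Monte Carlo %.4f" % (analytic, mc))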