def set_parameters(self, d, e, f, g, B=None):
        """Specify the tinker parameters and calculate
        quantities that only depend on them.

        Args:
            d (float): Tinker parameter.
            e (float): Tinker parameter.
            f (float): Tinker parameter.
            g (float): Tinker parameter.
            B (float; optional): Normalization coefficient. If B isn't specified,
               it is calculated from d, e, f, g so that the mass function is
               guaranteed to be normalized.
        """
        self.params = np.array([d, e, f, g, B])
        gamma_d2 = special.gamma(d*0.5)
        gamma_f2 = special.gamma(f*0.5)
        log_g = np.log(g)
        gnd2 = g**(-d*0.5)
        gnf2 = g**(-f*0.5)
        ed = e**d
        if B is None:
            self.B_coefficient = 2.0/(ed * gnd2 * gamma_d2 + gnf2 * gamma_f2)
            B2 = self.B_coefficient**2
            self.dBdd = 0.25 * B2 * ed * gnd2 * gamma_d2 * (log_g - 2.0 - special.digamma(d*0.5))
            self.dBde = -0.5 * B2 * d * ed/e * gnd2 * gamma_d2
            self.dBdf = 0.25 * B2 * gnf2 * gamma_f2 * (log_g - special.digamma(f*0.5))
            self.dBdg = 0.25 * B2 * (d * ed * gnd2/g * gamma_d2 + f* gnf2/g * gamma_f2)
        else:
            self.B_coefficient = B
            self.dBdd = self.dBde = self.dBdf = self.dBdg = 0
        self.make_dndlM_spline()
        return
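
# Usage sketch (assumption, not taken from this module): evaluates the same
# normalization B = 2 / (e^d * g^(-d/2) * Gamma(d/2) + g^(-f/2) * Gamma(f/2))
# that set_parameters stores in self.B_coefficient, for illustrative parameter
# values that are assumptions for demonstration only.
from scipy import special

def tinker_B(d, e, f, g):
    return 2.0 / (e**d * g**(-d * 0.5) * special.gamma(d * 0.5)
                  + g**(-f * 0.5) * special.gamma(f * 0.5))

print(tinker_B(1.97, 1.0, 0.51, 1.228))  # illustrative values only
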
    def optAlpha(self, MAX_ALPHA_ITER=1000, NEWTON_THRESH=1e-5):
        """
        Estimate new Dirichlet priors (actually just one scalar shared across all
        topics).
        """
        initA = 100.0
        logA = numpy.log(initA)  # keep computations in log space
        logging.debug("optimizing old alpha %s" % self.alpha)

        for i in xrange(MAX_ALPHA_ITER):
            a = numpy.exp(logA)
            if not numpy.isfinite(a):
                initA = initA * 10.0
                logging.warning("alpha is NaN; new init alpha=%f" % initA)
                a = initA
                logA = numpy.log(a)
            f = (
                self.numDocs * (gammaln(self.numTopics * a) - self.numTopics * gammaln(a))
                + (a - 1) * self.alphaSuffStats
            )
            df = self.alphaSuffStats + self.numDocs * (
                self.numTopics * digamma(self.numTopics * a) - self.numTopics * digamma(a)
            )
            d2f = self.numDocs * (
                self.numTopics * self.numTopics * trigamma(self.numTopics * a) - self.numTopics * trigamma(a)
            )
            logA -= df / (d2f * a + df)
            #            logging.debug("alpha maximization: f=%f, df=%f" % (f, df))
            if numpy.abs(df) <= NEWTON_THRESH:
                break
        result = numpy.exp(logA)  # convert back from log space
        logging.info("estimated old alpha %s to new alpha %s" % (self.alpha, result))
        return result
def get_h(x, k=1, norm=np.inf, min_dist=0.):
    """
    Estimates the entropy H of a random variable x (in nats) based on
    the kth-nearest neighbour distances between point samples.

    @reference:
    Kozachenko, L., & Leonenko, N. (1987). Sample estimate of the entropy of a random vector. Problemy Peredachi Informatsii, 23(2), 9–16.

    Arguments:
    ----------
    x: (n, d) ndarray
        n samples from a d-dimensional multivariate distribution

    k: int (default 1)
        kth nearest neighbour to use in density estimate;
        imposes smoothness on the underlying probability distribution

    norm: 1, 2, or np.inf (default np.inf)
        p-norm used when computing k-nearest neighbour distances
            1: absolute-value norm
            2: euclidean norm
            np.inf: max norm
        (note: the implementation below currently always queries with the max norm)

    min_dist: float (default 0.)
        minimum distance between data points;
        smaller distances will be capped using this value

    Returns:
    --------
    h: float
        entropy H(X)
    """

    n, d = x.shape

    # volume of the d-dimensional unit ball...
    # if norm == np.inf: # max norm:
    #     log_c_d = 0
    # elif norm == 2: # euclidean norm
    #     log_c_d = (d/2.) * log(np.pi) -log(gamma(d/2. +1))
    # elif norm == 1:
    #     raise NotImplementedError
    # else:
    #     raise NotImplementedError("Variable 'norm' either 1, 2 or np.inf")
    log_c_d = 0.  # max norm: the query below always uses p=np.inf

    kdtree = cKDTree(x)

    # query all points -- k+1 because each query point is itself in the initial set
    distances, idx = kdtree.query(x, k + 1, eps=0, p=np.inf)
    distances = distances[:, -1]

    # enforce non-zero distances
    distances[distances < min_dist] = min_dist

    sum_log_dist = np.sum(log(2*distances)) # factor of 2: the estimator uses the diameter (twice the kNN distance)
    h = -digamma(k) + digamma(n) + log_c_d + (d / float(n)) * sum_log_dist

    return h
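
# Usage sketch (assumption): checks get_h against the analytic entropy of a
# standard normal, H = 0.5 * d * ln(2*pi*e) nats; assumes get_h and its imports
# (numpy as np, scipy.spatial.cKDTree, scipy.special.digamma, log) are in scope.
rng = np.random.default_rng(0)
samples = rng.standard_normal((5000, 2))
h_est = get_h(samples, k=5)
h_true = 0.5 * 2 * np.log(2 * np.pi * np.e)
print("estimated %.3f vs analytic %.3f nats" % (h_est, h_true))
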
  def getLPfromResp(self, Resp, smoothMass=0.001):
    ''' Create full local parameter (LP) dictionary for HDPModel,
          given responsibility matrix Resp

        Returns
        --------
        LP : dict with fields word_variational, alphaPi, E_logPi, DocTopicCount
    '''
    Data = self.Data
    D = Data.nDoc
    K = Resp.shape[1]
    # DocTopicCount matrix : D x K matrix
    DocTopicC = np.zeros((D, K))
    for dd in range(D):
      start,stop = Data.doc_range[dd,:]
      DocTopicC[dd,:] = np.dot(Data.word_count[start:stop],        
                               Resp[start:stop,:]
                               )
    assert np.allclose(DocTopicC.sum(), Data.word_count.sum())
    # Alpha and ElogPi : D x K+1 matrices
    padCol = smoothMass * np.ones((D,1))
    alph = np.hstack( [DocTopicC + smoothMass, padCol])    
    ElogPi = digamma(alph) - digamma(alph.sum(axis=1))[:,np.newaxis]
    assert ElogPi.shape == (D,K+1)
    return dict(word_variational=Resp,
                E_logPi=ElogPi, alphaPi=alph,
                DocTopicCount=DocTopicC)
def objectiveGradient(lambda_k, nu, tau, Elog_eta_k, nDoc):
  ''' Calculate gradient of objectiveFunc, the objective for the HDP variational
      approximation.

      Returns
      -------
        gvec : vector with the same length as lambda_k,
              where each entry gives the partial derivative with respect to
                  the corresponding entry of Cvec
  '''
  # lvec is the derivative of log(lambda_k) via chain rule
  lvec = 1/(lambda_k)
  W = lvec.size
  
  # Derivative of log eta
  digammaAll = digamma(np.sum(lambda_k))
  Elog_lambda_k = digamma(lambda_k) - digammaAll

  # Derivative of Elog_phi_k and E_phi_k
  polygammaAll = polygamma(1,np.sum(lambda_k))
  dElog_phi_k = polygamma(1,lambda_k) - polygammaAll
  lambda_k_sum = np.sum(lambda_k)
  dE_phi_k = (lambda_k_sum - lambda_k) / np.power(lambda_k_sum,2)

  # NOTE: N (not among this function's arguments) appears to be the expected
  # word-count vector for topic k; it must be available in the enclosing scope.
  gvec = dElog_phi_k * (N + tau - lambda_k) \
       + dE_phi_k * nu * Elog_eta_k
  gvec = -1 * gvec

  # Apply chain rule!
  gvecC = lvec * gvec
  return gvecC
Example #6
 def E_step( self, X):
   N,D = X.shape
   lpr = np.zeros( (N, self.gmm.K) )
   logdet = np.zeros( self.gmm.K )
   dterms = np.arange( 1,D+1 ) # 1,2,3... D
   self.invWchol = list()
   for k in range(self.gmm.K):
     dXm  = X - self.qMixComp[k].m
     L = scipy.linalg.cholesky(  self.qMixComp[k].invW, lower=True)
     self.invWchol.append( L )
     
     if np.any( np.isnan(L) | np.isinf(L) ):
       print('NaN!', self.qMixComp[k])
     #invL = scipy.linalg.inv( L )
     #  want: Q =  invL * X.T
     #    so we solve for matrix Q s.t. L*Q = X.T
     lpr[:,k] = -0.5*self.qMixComp[k].dF \
                   * np.sum( scipy.linalg.solve_triangular( L, dXm.T,lower=True)**2, axis=0)
     lpr[:,k] -= 0.5*D/self.qMixComp[k].beta
     # det( W ) = 1/det(invW)
     #          = 1/det( L )**2 
     # det of triangle matrix = prod of diag entries
     logdet[k] = -2*np.sum( np.log(np.diag(L) ) ) + D*np.log(2.0) 
     logdet[k] += digamma( 0.5*(dterms+1+self.qMixComp[k].dF)  ).sum()
   self.logwtilde = digamma( self.alpha ) - digamma( self.alpha.sum() )
   self.logLtilde = logdet
   lpr += self.logwtilde
   lpr += logdet
   lprSUM = logsumexp(lpr, axis=1)
   resp   = np.exp(lpr - lprSUM[:, np.newaxis])
   resp   /= resp.sum( axis=1)[:,np.newaxis] # row normalize
   return resp
def objective(X, Y, C, mu, a, b, e, f, a0, b0, e0, f0):
  log2pi = np.log(2*np.pi)
  N, D = X.shape

  # E(lnX) = digamma(a) - ln(b) for X ~ Gamma(a,b)
  E_ln_lambda = digamma(e) - np.log(f)
  E_ln_alpha = digamma(a) - np.log(b)

  # model likelihood
  total = (N/2.0)*(E_ln_lambda - log2pi)
  data_total = 0
  for i in xrange(N):
    delta = Y[i] - X[i].dot(mu)
    data_total += delta*delta + X[i].dot(C).dot(X[i])
  total -= (float(e)/f)/2.0 * data_total

  # print "total after model likelihood:", total

  # w likelihood
  total -= (D/2.0)*log2pi
  for k in xrange(D):
    total += 0.5*(E_ln_alpha[k] - (float(a[k])/b[k])*(C[k,k] + mu[k]*mu[k]))

  # print "total after w likelihood:", total

  # lambda likelihood
  total += e0*np.log(f0) - np.log(gamma(e0)) + (e0 - 1)*E_ln_lambda - f0*(float(e)/f)

  # print "total after lambda likelihood:", total

  # alpha likelihood
  for k in xrange(D):
    total += a0*np.log(b0) - np.log(gamma(a0)) + (a0 - 1)*E_ln_alpha[k] - b0*(float(a[k])/b[k])

  # print "total after alpha likelihood:", total

  # entropy
  # TODO: calculate this manually
  # total -= mvn.entropy(mean=mu, cov=C)
  # e1 = mvn.entropy(cov=C)
  # e2 = 0.5*np.log( np.linalg.det(2*np.pi*np.e*C) )
  # print "e1:", e1, "e2:", e2
  # total += 0.5*np.log( np.linalg.det(2*np.pi*np.e*C) )

  total += mvn.entropy(cov=C)
  # print "det(C):", np.linalg.det(C)
  # print "total after lnq(w):", total

  # total -= gamma_dist.entropy(e, scale=1.0/f)
  # e3 = gamma_dist.entropy(e, scale=1.0/f)
  # e4 = -e_ln_q_gamma(e, f)
  # print "e3:", e3, "e4:", e4
  # assert(np.abs(e3 - e4) < 1e-8)
  total += gamma_dist.entropy(e, scale=1.0/f)
  # total -= e_ln_q_gamma(e, f)
  # print "total after lnq(lambda):", total
  for k in xrange(D):
    # total -= e_ln_q_gamma(a[k], b[k])
    total += gamma_dist.entropy(a[k], scale=1.0/b[k])
  return total
def _negative_binomial_gradient_sparse(X, counts, alpha=-3, beta=1.,
                                       dispersion=None,
                                       bias=None,
                                       use_zero_counts=False):
    if use_zero_counts:
        raise NotImplementedError
    bias = bias.flatten()

    dis = np.sqrt(((X[counts.row] - X[counts.col])**2).sum(axis=1))
    fdis = bias[counts.row] * bias[counts.col] * beta * dis ** alpha

    diff = X[counts.row] - X[counts.col]

    d = dispersion.predict(fdis)

    d_prime = (dispersion.derivate(fdis) * alpha * beta * bias[counts.row] *
               bias[counts.col] * dis ** (alpha - 2))[:, np.newaxis] * diff

    grad = -(special.digamma(counts.data + d)[:, np.newaxis] * d_prime)
    grad += special.digamma(d)[:, np.newaxis] * d_prime
    grad -= (counts.data * alpha / dis ** 2)[:, np.newaxis] * diff
    grad -= (np.log(d) + 1)[:, np.newaxis] * d_prime
    grad += np.log(d + fdis)[:, np.newaxis] * d_prime
    grad += ((counts.data + d) / (d + fdis))[:, np.newaxis] * (
        (fdis * alpha / dis**2)[:, np.newaxis] * diff + d_prime)

    grad_ = np.zeros(X.shape)

    for i in range(X.shape[0]):
        grad_[i] += grad[counts.row == i].sum(axis=0)
        grad_[i] -= grad[counts.col == i].sum(axis=0)

    return grad_
Example #9
def fit_betabinom_minka(counts, maxiter=1000, tol=1e-6, initial_guess=None):
    ''' See Estimating a Dirichlet Distribution, Thomas P. Minka, 2003,
    eq. 55.  see also the code for polya_fit_simple.m in his fastfit
    matlab toolbox, which this code is a translation of.

    counts should be NxK with N samples over K classes.'''

    counts = matrix(counts).astype(float)

    # remove observations with no trials
    counts = counts[sum(counts.A, axis=1) > 0, :]
    if initial_guess is None:
        alpha = polya_moment_match(counts).T
    else:
        alpha = matrix(initial_guess).T

    # Abstraction barrier: now in Dirichlet/Polya mode, following naming in Minka's paper.
    n = counts.T
    N = n.shape[1]
    n_i = n.sum(axis=0)

    change = 2*tol
    iter = 0
    while (change > tol) and (iter < maxiter):
        numerator = digamma(n + alpha.repeat(N, axis=1)).sum(axis=1) - N * digamma(alpha)
        denominator = digamma(n_i + alpha.sum()).sum() - N * digamma(alpha.sum())
        old_alpha = alpha
        alpha = multiply(alpha, numerator / denominator)
        change = abs(old_alpha - alpha).max()
        iter = iter + 1

    # now leaving Abstraction Barrier

    return array(alpha[:,0]).T, iter
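
# Usage sketch (assumption): fits the Polya/Dirichlet-multinomial parameter on
# synthetic counts, assuming fit_betabinom_minka and its imports (numpy's
# matrix/array/multiply/sum and scipy.special.digamma) are in scope; an explicit
# initial_guess is passed so the helper polya_moment_match is not needed here.
import numpy as np

rng = np.random.default_rng(1)
true_alpha = np.array([2.0, 5.0, 3.0])
p = rng.dirichlet(true_alpha, size=500)
counts = np.array([rng.multinomial(50, pi) for pi in p])
alpha_hat, n_iter = fit_betabinom_minka(counts, initial_guess=np.ones(3))
print(alpha_hat, "after", n_iter, "iterations")
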
Example #10
    def KL_divergence(self, variational_posterior):
        mu, S, gamma, tau = (
            variational_posterior.mean.values,
            variational_posterior.variance.values,
            variational_posterior.gamma_group.values,
            variational_posterior.tau.values,
        )

        var_mean = np.square(mu) / self.variance
        var_S = S / self.variance - np.log(S)
        part1 = (gamma * (np.log(self.variance) - 1.0 + var_mean + var_S)).sum() / 2.0

        ad = self.alpha / self.input_dim
        from scipy.special import betaln, digamma

        part2 = (
            (gamma * np.log(gamma)).sum()
            + ((1.0 - gamma) * np.log(1.0 - gamma)).sum()
            + betaln(ad, 1.0) * self.input_dim
            - betaln(tau[:, 0], tau[:, 1]).sum()
            + ((tau[:, 0] - gamma - ad) * digamma(tau[:, 0])).sum()
            + ((tau[:, 1] + gamma - 2.0) * digamma(tau[:, 1])).sum()
            + ((2.0 + ad - tau[:, 0] - tau[:, 1]) * digamma(tau.sum(axis=1))).sum()
        )

        return part1 + part2
Example #11
  def update_global_params( self, SS, rho=None, Ntotal=None, **kwargs ):
    '''
    '''
    ampF = 1
    if Ntotal is not None:
      ampF = Ntotal/SS['Ntotal']
    qalpha1 = self.alpha1 + ampF*SS['N']
    qalpha0 = self.alpha0*np.ones( self.K )
    qalpha0[:-1] += ampF*SS['N'][::-1].cumsum()[::-1][1:]
    
    if rho is None or rho==1:
      self.qalpha1 = qalpha1
      self.qalpha0 = qalpha0
    else:
      self.qalpha1 = rho*qalpha1 + (1-rho)*self.qalpha1
      self.qalpha0 = rho*qalpha0 + (1-rho)*self.qalpha0
    
    DENOM = digamma( self.qalpha0 + self.qalpha1 )
    self.ElogV      = digamma( self.qalpha1 ) - DENOM
    self.Elog1mV    = digamma( self.qalpha0 ) - DENOM

    if self.truncType == 'v':
      self.qalpha1[-1] = 1
      self.qalpha0[-1] = EPS  # avoid digamma(0), which diverges
      self.ElogV[-1] = 0  # log(1) => 0
      self.Elog1mV[-1] = np.log(1e-40) # log(0) => -INF, never used

    # Calculate expected mixture weights E[ log w_k ]
    self.Elogw = self.ElogV.copy() # copy so we can do += without modifying ElogV
    self.Elogw[1:] += self.Elog1mV[:-1].cumsum()
Example #12
  def testBetaBetaKL(self):
    with self.test_session() as sess:
      for shape in [(10,), (4,5)]:
        a1 = 6.0*np.random.random(size=shape) + 1e-4
        b1 = 6.0*np.random.random(size=shape) + 1e-4 
        a2 = 6.0*np.random.random(size=shape) + 1e-4
        b2 = 6.0*np.random.random(size=shape) + 1e-4 
        # Take inverse softplus of values to test BetaWithSoftplusAB
        a1_sp = np.log(np.exp(a1) - 1.0)
        b1_sp = np.log(np.exp(b1) - 1.0)
        a2_sp = np.log(np.exp(a2) - 1.0)
        b2_sp = np.log(np.exp(b2) - 1.0)

        d1 = tf.contrib.distributions.Beta(a=a1, b=b1)
        d2 = tf.contrib.distributions.Beta(a=a2, b=b2)
        d1_sp = tf.contrib.distributions.BetaWithSoftplusAB(a=a1_sp, b=b1_sp)
        d2_sp = tf.contrib.distributions.BetaWithSoftplusAB(a=a2_sp, b=b2_sp)

        kl_expected = (special.betaln(a2, b2) - special.betaln(a1, b1)
                     + (a1 - a2)*special.digamma(a1)
                     + (b1 - b2)*special.digamma(b1)
                     + (a2 - a1 + b2 - b1)*special.digamma(a1 + b1))

        for dist1 in [d1, d1_sp]:
          for dist2 in [d2, d2_sp]:
            kl = tf.contrib.distributions.kl(dist1, dist2)
            kl_val = sess.run(kl)
            self.assertEqual(kl.get_shape(), shape)
            self.assertAllClose(kl_val, kl_expected)
        
        # Make sure KL(d1||d1) is 0
        kl_same = sess.run(tf.contrib.distributions.kl(d1, d1))
        self.assertAllClose(kl_same, np.zeros_like(kl_expected))
 def computeLikelihood(self, doc, phi, gamma):
     """
     Compute the document likelihood, given all model parameters.
     """
     gammaSum = numpy.sum(gamma)
     digSum = digamma(gammaSum)
     dig = digamma(gamma) - digSum # precompute the difference
     
     likelihood = gammaln(self.alpha * self.numTopics) - \
                  self.numTopics * gammaln(self.alpha) - \
                  gammaln(gammaSum)
     
     likelihood += numpy.sum((self.alpha - 1) * dig + gammaln(gamma) - (gamma - 1) * dig)
     
     for n, (wordIndex, wordCount) in enumerate(doc):
         try:
             phin, lprob = phi[n], self.logProbW[:, wordIndex]
             code = """
             const int num_terms = Nphin[0];
             double result = 0.0;
             for (int i=0; i < num_terms; i++) {
                 if (phin[i] > 1e-8 || phin[i] < -1e-8)
                     result += phin[i] * (dig[i] - log(phin[i]) + LPROB1(i));
             }
             return_val = wordCount * result;
             """
             likelihood += weave.inline(code, ['dig', 'phin', 'lprob', 'wordCount'])
         except:
             partial = phi[n] * (dig - numpy.log(phi[n]) + self.logProbW[:, wordIndex])
             partial[numpy.isnan(partial)] = 0.0 # replace NaNs (from 0 * log(0) in phi) with 0.0
             likelihood += wordCount * numpy.sum(partial)
     return likelihood
Example #14
def _score_nbp(y, X, beta, thet, Q):
    r'''
    Negative Binomial Score -- type P likelihood from Greene (2007)
    .. math::

        \lambda_i = exp(X\beta)\\
        g_i = \theta \lambda_i^Q \\
        w_i = g_i/(g_i + \lambda_i) \\
        r_i = \theta / (\theta+\lambda_i) \\
        A_i = \left [ \Psi(y_i+g_i) - \Psi(g_i) + ln w_i \right ] \\
        B_i = \left [ g_i (1-w_i) - y_iw_i \right ] \\
        \partial ln \mathcal{L}_i / \partial
            \begin{pmatrix} \lambda_i \\ \theta \\ Q \end{pmatrix}=
            [A_i+B_i]
            \begin{pmatrix} Q/\lambda_i \\ 1/\theta \\ ln(\lambda_i) \end{pmatrix}
            -B_i
            \begin{pmatrix} 1/\lambda_i\\ 0 \\ 0 \end{pmatrix} \\
        \frac{\partial \lambda}{\partial \beta} = \lambda_i \mathbf{x}_i \\
        \frac{\partial \mathcal{L}_i}{\partial \beta} =
            \left (\frac{\partial\mathcal{L}_i}{\partial \lambda_i} \right )
            \frac{\partial \lambda_i}{\partial \beta}
    '''
    lamb = np.exp(np.dot(X, beta))
    g = thet * lamb**Q
    w = g / (g + lamb)
    r = thet / (thet+lamb)
    A = digamma(y+g) - digamma(g) + np.log(w)
    B = g*(1-w) - y*w
    dl = (A+B) * Q/lamb - B * 1/lamb
    dt = (A+B) * 1/thet
    dq = (A+B) * np.log(lamb)
    db = X * (dl * lamb)[:,np.newaxis]
    sc = np.array([dt.sum(), dq.sum()])
    sc = np.concatenate([db.sum(axis=0), sc])
    return sc
Example #15
def update_beta(state, a, b):
    # http://bit.ly/1yX1cZq
    num_iterations = 200
    alpha = state['beta']
    alpha0 = 0
    prec = 1e-5  # convergence tolerance (the previous 1 ** -5 is just 1)
    for i in range(num_iterations):
        summk = 0
        summ = 0
        for doc_index, _ in enumerate(state['docs']):
            summ += digamma(state['num_topics'] * alpha + state['ss']['doc'][doc_index])
            for topic in state['used_topics']:
                summk += digamma(alpha + state['ss']['document_topic'][doc_index][topic])
        summ -= state['num_docs'] * digamma(state['num_topics'] * alpha)
        summk -= state['num_docs'] * state['num_topics'] * digamma(alpha)
        alpha = (a - 1 + alpha * summk) / (b + state['num_topics'] * summ)
        assert not np.isnan(alpha)
        if abs(alpha - alpha0) < prec:
            break
        else:
            alpha0 = alpha

        if i == num_iterations - 1:
            raise Exception("update_beta did not converge.")
    state['beta'] = alpha
    return state
Example #16
    def local_update(self, metaobs=None):
        """ Local update that handles minibatches.  This needed to be
            reimplemented because forward_msgs and backward_msgs need to be
            specialized.
        """

        if metaobs is None:
            loff = 0
            uoff = self.T-1
        else:
            loff, uoff = metaobs.i1, metaobs.i2

        # update the modified parameter tables (don't do emissions b/c
        # pybasicbayes takes care of those).
        # Don't overwrite mod_init b/c we stored something in it
        self.mod_init = digamma(self.var_init + eps) - digamma(np.sum(self.var_init) + eps)
        tran_sum = np.sum(self.var_tran, axis=1)
        self.mod_tran = digamma(self.var_tran + eps) - digamma(tran_sum[:,npa] + eps)

        obs = self.obs
        # Compute likelihoods
        for k, odist in enumerate(self.var_emit):
            self.lliks[:,k] = np.nan_to_num(odist.expected_log_likelihood(obs[loff:(uoff+1),:]))

        # update forward, backward and scale coefficient tables
        self.forward_msgs(metaobs=metaobs)
        self.backward_msgs(metaobs=metaobs)

        # update weights
        self.var_x = self.lalpha + self.lbeta
        self.var_x -= np.max(self.var_x, axis=1)[:,npa]
        self.var_x = np.exp(self.var_x)
        self.var_x /= np.sum(self.var_x, axis=1)[:,npa]
Example #17
    def local_update(self, obs=None, mask=None):
        """ This is the local update for the batch version. Here we're creating
            modified parameters to run the forward-backward algorithm on to
            update the variational q distribution over the hidden states.

            These are always the same, and if we really need to change them
            we'll override the function.
        """
        if obs is None:
            obs = self.obs
        if mask is None:
            mask = self.mask

        self.mod_init = digamma(self.var_init + eps) - digamma(np.sum(self.var_init) + eps)
        tran_sum = np.sum(self.var_tran, axis=1)
        self.mod_tran = digamma(self.var_tran + eps) - digamma(tran_sum[:,npa] + eps)

        # Compute likelihoods
        for k, odist in enumerate(self.var_emit):
            self.lliks[:,k] = np.nan_to_num(odist.expected_log_likelihood(obs))

        # update forward, backward and scale coefficient tables
        self.forward_msgs()
        self.backward_msgs()

        self.var_x = self.lalpha + self.lbeta
        self.var_x -= np.max(self.var_x, axis=1)[:,npa]
        self.var_x = np.exp(self.var_x)
        self.var_x /= np.sum(self.var_x, axis=1)[:,npa]
Example #18
    def FFBS(self, var_init):
        """ Forward Filter Backward Sampling to simulate state sequence.
        """
        obs = self.obs
        T = self.T
        K = self.K
        A = self.var_tran

        mod_init = digamma(var_init + eps) - digamma(np.sum(var_init) + eps)
        tran_sum = np.sum(self.var_tran, axis=1)
        mod_tran = digamma(self.var_tran + eps) - digamma(tran_sum[:,npa] + eps)

        lalpha = np.empty((T, K))
        lliks = np.empty((T, K))
        # Compute likelihoods
        for k, odist in enumerate(self.var_emit):
            lliks[:,k] = np.nan_to_num(odist.expected_log_likelihood(obs))

        lalpha[0,:] = mod_init + lliks[0,:]

        for t in xrange(1,self.T):
            lalpha[t] = np.logaddexp.reduce(lalpha[t-1] + np.log(A+eps).T, axis=1) + lliks[t]

        z = np.empty(T, dtype=np.int_)
        lp = lalpha[T-1,:] - np.max(lalpha[T-1,:])
        p = np.exp(lp)
        p /= np.sum(p)
        z[T-1] = np.random.choice(K, p=p)
        for t in xrange(T-2, -1, -1):
            lp = lalpha[t,:] + np.log(A[:,z[t+1]]+eps)
            lp -= np.max(lp)
            p = np.exp(lp)
            p /= np.sum(p)
            z[t] = np.random.choice(K, p=p)

        return z
Example #19
def sample_profiles(base, num): # pylint: disable=inconsistent-return-statements
    """Generate unique profiles from a game

    Parameters
    ----------
    base : RsGame
        Game to generate random profiles from.
    num : int
        Number of profiles to sample from the game.
    """
    if num == base.num_all_profiles: # pylint: disable=no-else-return
        return base.all_profiles()
    elif num == 0:
        return np.empty((0, base.num_strats), int)
    elif base.num_all_profiles <= np.iinfo(int).max:
        inds = rand.choice(base.num_all_profiles, num, replace=False)
        return base.profile_from_id(inds)
    else:
        # Number of times we have to re-query
        ratio = (sps.digamma(float(base.num_all_profiles)) -
                 sps.digamma(float(base.num_all_profiles - num)))
        # Max is for underflow
        num_per = max(round(float(ratio * base.num_all_profiles)), num)
        profiles = set()
        while len(profiles) < num:
            profiles.update(
                utils.hash_array(p) for p in base.random_profiles(num_per))
        profiles = np.stack([h.array for h in profiles])
        inds = rand.choice(profiles.shape[0], num, replace=False)
        return profiles[inds]
Example #20
def entropy_2(data, length=1):
    """
    Estimate the entropy of length `length` subsequences in `data`.

    Parameters
    ----------
    data : iterable
        An iterable of samples.
    length : int
        The length to group samples into.

    Returns
    -------
    h2 : float
        An estimate of the entropy.

    Notes
    -----
    If M is the alphabet size and N is the number of samples, then the bias of this estimator is:
        B ~ (M+1)/(2N)
    """
    counts = get_counts(data, length)
    total = counts.sum()
    digamma_N = digamma(total)
    log2 = np.log(2)
    jss = [np.arange(1, count) for count in counts]

    alt_terms = np.array([(((-1)**js)/js).sum() for js in jss])

    h2 = np.log2(np.e)*(counts/total*(digamma_N - digamma(counts) + log2 + alt_terms)).sum()

    return h2
Example #21
def entropy_1(data, length=1):
    """
    Estimate the entropy of length `length` subsequences in `data`.

    Parameters
    ----------
    data : iterable
        An iterable of samples.
    length : int
        The length to group samples into.

    Returns
    -------
    h1 : float
        An estimate of the entropy.

    Notes
    -----
    If M is the alphabet size and N is the number of samples, then the bias of this estimator is:
        B ~ M/N
    """
    counts = get_counts(data, length)
    total = counts.sum()
    digamma_N = digamma(total)

    h1 = np.log2(np.e)*(counts/total*(digamma_N - digamma(counts))).sum()

    return h1
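
# Worked numeric check (assumption): applies the same formula as entropy_1 to a
# fair-coin count vector, whose true entropy is 1 bit; assumes numpy as np and
# scipy.special.digamma are in scope.
counts_example = np.array([500.0, 500.0])
total_example = counts_example.sum()
h1_example = np.log2(np.e)*(counts_example/total_example*
                            (digamma(total_example) - digamma(counts_example))).sum()
print("h1 for a fair coin: %.4f bits (true value: 1.0)" % h1_example)
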
Example #22
 def logL_dbl_prime(contrast):
     M = 1.0 / (contrast + 1e-100)
     n = np.arange( len(empirical_pmf) )
     t1 = np.sum( pmf * ((np.square(k_bar) - n*M) / (M * np.square(k_bar + M))) )
     t2 = - N * digamma(M)
     t3 = np.sum( pmf * digamma(n + M) )
     return t1 + t2 + t3
def estimate_dirichlet_param(samples, param):
    """
    Uses a Newton-Raphson scheme to estimating the parameter of a
    K-dimensional Dirichlet distribution

    :param samples: an NxK matrix of K-dimensional vectors drawn from
    a Dirichlet distribution
    :param param: the old value of the parameter. This is overwritten
    :return: a K-dimensional vector which is the new parameter estimate
    """

    N, K = samples.shape
    p = np.sum(np.log(samples), axis=0)

    for _ in range(60):
        g = -N * fns.digamma(param)
        g += N * fns.digamma(param.sum())
        g += p

        q = -N * fns.polygamma(1, param)
        np.reciprocal(q, out=q)

        z = N * fns.polygamma(1, param.sum())

        b = np.sum(g * q)
        b /= 1 / z + q.sum()

        param -= (g - b) * q

        print("%.2f" % param.mean(), end=" --> ")
    print()

    return param
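
# Usage sketch (assumption): recovers a Dirichlet parameter from synthetic draws,
# assuming estimate_dirichlet_param, numpy as np, and scipy.special imported as
# fns are in scope.
rng = np.random.default_rng(0)
true_param = np.array([2.0, 3.0, 4.0])
draws = rng.dirichlet(true_param, size=2000)
estimate = estimate_dirichlet_param(draws, np.ones(3))
print(estimate)
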
Example #24
def gradient(weights, k, W, sample_count, n_dk_samples, X, sigma):
    D, K = X.shape[0], W.shape[0]

    result = 0.0
    alpha = np.empty((BatchSize, K), dtype=np.float64)
    scale = np.empty((BatchSize,),   dtype=np.float64)
    for d in range(0, D, BatchSize):
        max_d = min(D, d + BatchSize)
        top   = max_d - d

        alpha[:top,:] = X[d:max_d,:].dot(W.T)
        alpha[:top,k] = X[d:max_d,:].dot(weights)
        np.exp(alpha[:top], out=alpha[:top])

        alpha_sum = alpha[:top].sum(axis=1)
        scale[:top]  = fns.digamma(alpha_sum)
        scale[:top] -= fns.digamma(alpha_sum[:,np.newaxis] + n_dk_samples[d:max_d,:,:sample_count].sum(axis=1)).sum(axis=1) / sample_count
        scale[:top] += fns.digamma(alpha[:top,k,np.newaxis] + n_dk_samples[d:max_d,k,:sample_count]).sum(axis=1) / sample_count
        scale[:top] -= fns.digamma(alpha[:top,k])

        P_1 = ssp.diags(alpha[:top,k], 0).dot(X[d:max_d,:])
        P_2 = ssp.diags(scale[:top], 0).dot(P_1)

        batch_result = np.array(P_2.sum(axis=0))
        result += batch_result

    result -= weights / sigma

    return -np.squeeze(np.asarray(result))
Example #25
    def full_local_update(self):
        """ Local update on full data set.  Reimplements member functions
            because we don't want to use the object's internal variables.

            This is only useful if we can store the whole state sequence in
            memory.
        """

        # update the modified parameter tables (don't do emissions b/c
        # pybasicbayes takes care of those).
        mod_init = digamma(self.var_init + eps) - digamma(np.sum(self.var_init) + eps)
        tran_sum = np.sum(self.var_tran, axis=1)
        mod_tran = digamma(self.var_tran + eps) - digamma(tran_sum[:,npa] + eps)

        T = self.T
        K = self.K
        obs = self.obs
        mask = self.mask

        # Mask out missing data (restored below)
        obs_full = obs.copy()
        obs[mask,:] = np.nan

        lalpha = np.empty((T, K))
        lbeta = np.empty((T, K))

        ll = np.empty((T, K))
        # Compute likelihoods
        for k, odist in enumerate(self.var_emit):
            ll[:,k] = np.nan_to_num(odist.expected_log_likelihood(obs))

        # Forward messages
        ltran = mod_tran

        lalpha[0,:] = mod_init + ll[0,:]

        for t in xrange(1,self.T):
            lalpha[t] = np.logaddexp.reduce(lalpha[t-1] + ltran.T, axis=1) + ll[t]

        # Backward messages
        ltran = mod_tran

        lbeta[self.T-1,:] = 0.

        for t in xrange(self.T-2,-1,-1):
            np.logaddexp.reduce(ltran + lbeta[t+1] + ll[t+1], axis=1,
                                out=lbeta[t])


        # Update weights
        var_x = lalpha + lbeta
        var_x -= np.max(var_x, axis=1)[:,npa]
        var_x = np.exp(var_x)
        var_x /= np.sum(var_x, axis=1)[:,npa]

        # Restore full observations
        self.obs = obs_full

        return var_x
Example #26
def _compute_mi_cc(x, y, n_neighbors):
    """Compute mutual information between two continuous variables.

    Parameters
    ----------
    x, y : ndarray, shape (n_samples,)
        Samples of two continuous random variables, must have an identical
        shape.

    n_neighbors : int
        Number of nearest neighbors to search for each point, see [1]_.

    Returns
    -------
    mi : float
        Estimated mutual information. If it turned out to be negative it is
        replaced by 0.

    Notes
    -----
    True mutual information can't be negative. If its estimate by a numerical
    method is negative, it means (providing the method is adequate) that the
    mutual information is close to 0 and replacing it by 0 is a reasonable
    strategy.

    References
    ----------
    .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
           information". Phys. Rev. E 69, 2004.
    """
    n_samples = x.size

    x = x.reshape((-1, 1))
    y = y.reshape((-1, 1))
    xy = np.hstack((x, y))

    # Here we rely on NearestNeighbors to select the fastest algorithm.
    nn = NearestNeighbors(metric='chebyshev', n_neighbors=n_neighbors)

    nn.fit(xy)
    radius = nn.kneighbors()[0]
    radius = np.nextafter(radius[:, -1], 0)

    # Algorithm is selected explicitly to allow passing an array as radius
    # later (not all algorithms support this).
    nn.set_params(algorithm='kd_tree')

    nn.fit(x)
    ind = nn.radius_neighbors(radius=radius, return_distance=False)
    nx = np.array([i.size for i in ind])

    nn.fit(y)
    ind = nn.radius_neighbors(radius=radius, return_distance=False)
    ny = np.array([i.size for i in ind])

    mi = (digamma(n_samples) + digamma(n_neighbors) -
          np.mean(digamma(nx + 1)) - np.mean(digamma(ny + 1)))

    return max(0, mi)
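
# Usage sketch (assumption): compares the Kraskov k-NN estimate with the analytic
# mutual information of a correlated bivariate Gaussian, I = -0.5*ln(1 - rho^2)
# nats; assumes _compute_mi_cc and its imports (numpy as np, scipy.special.digamma,
# sklearn.neighbors.NearestNeighbors) are in scope.
rng = np.random.default_rng(0)
rho = 0.8
xy = rng.multivariate_normal([0.0, 0.0], [[1.0, rho], [rho, 1.0]], size=2000)
mi_est = _compute_mi_cc(xy[:, 0], xy[:, 1], n_neighbors=5)
mi_true = -0.5 * np.log(1.0 - rho**2)
print("estimated %.3f vs analytic %.3f nats" % (mi_est, mi_true))
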
 def Fprime(x):
     df = outsum(digamma(eta.total+x)*etaestim) \
         - digamma(x)*outsum(etaestim) + C
     Df = -1.*df.ravel()
     if np.isnan(Df).any() or np.isinf(Df).any():
         return np.array([np.inf, np.inf])
     else:
         return Df
Example #28
 def exp_T(self, eta):
     """
     @arg eta: The natural parameters.
     The expectation of T, the sufficient statistics, given eta.
     """
     theta = self.theta(eta)
     return (digamma(theta)
             - digamma(theta.sum(axis=-1)).reshape(theta.shape[:-1] + (1,)))
Example #29
 def _dll(self, x):
     alpha = self.get_alpha(x)
     return -(np.sum(self._dll_common(x)\
         * (special.digamma(np.sum(alpha, axis=1))[:,np.newaxis,np.newaxis]\
         - special.digamma(np.sum(self.n_m_z+alpha, axis=1))[:,np.newaxis,np.newaxis]\
         + special.digamma(self.n_m_z+alpha)[:,:,np.newaxis]\
         - special.digamma(alpha)[:,:,np.newaxis]), axis=0)\
         - x / (self.sigma ** 2))
Example #30
 def next_alpha(alpha):
     das = digamma(alpha.sum())
     g = alpha * N * (das - digamma(alpha) + g_offset)
     h = alpha * N * (das + g_offset)
     z = N * das
     x = (alpha * g / h).sum()
     w = (alpha ** 2 / h).sum()
     return np.exp(np.log(alpha) - (g - x * alpha / (1/z + w)) / h)
Example #31
    def select_desired_action(self, tau, t, posterior_policies, actions,
                              *args):

        npi = posterior_policies.shape[0]
        likelihood = args[0]
        prior = args[1]  #np.ones_like(likelihood)/npi #
        # likelihood = np.array([0.5,0.5])
        # prior = np.array([0.5,0.5])
        # posterior_policies = prior * likelihood
        # posterior_policies /= posterior_policies.sum()
        #print(posterior_policies, prior, likelihood)
        self.accepted_pis = np.zeros(100000, dtype=np.int32) - 1
        dir_counts = np.ones(npi, np.double)

        curr_ess = 0
        i = 0

        H_0 = (dir_counts.sum()-npi)*scs.digamma(dir_counts.sum()) \
            - ((dir_counts - 1)*scs.digamma(dir_counts)).sum() \
            + logBeta(dir_counts)
        #print("H", H_0)

        pi = np.random.choice(npi, p=prior)
        self.accepted_pis[i] = pi
        dir_counts[pi] += 1
        H_dir = (dir_counts.sum()-npi)*scs.digamma(dir_counts.sum()) \
            - ((dir_counts - 1)*scs.digamma(dir_counts)).sum() \
            + logBeta(dir_counts)
        #print("H", H_dir)

        if t == 0:
            i += 1
            while H_dir > H_0 - self.factor + self.factor * H_0:

                pi = np.random.choice(npi, p=prior)
                r = np.random.rand()
                #print(i, curr_ess)

                #acc_prob = min(1, posterior_policies[pi]/posterior_policies[self.accepted_pis[i-1]])
                if likelihood[self.accepted_pis[i - 1]] > 0:
                    acc_prob = min(
                        1,
                        likelihood[pi] / likelihood[self.accepted_pis[i - 1]])
                else:
                    acc_prob = 1
                if acc_prob >= r:  #posterior_policies[pi]/posterior_policies[self.accepted_pis[i-1]] > r:
                    self.accepted_pis[i] = pi
                    dir_counts[pi] += 1  #acc_prob
                else:
                    self.accepted_pis[i] = self.accepted_pis[i - 1]
                    dir_counts[self.accepted_pis[i - 1]] += 1  #1-acc_prob

                H_dir = (dir_counts.sum()-npi)*scs.digamma(dir_counts.sum()) \
                    - ((dir_counts - 1)*scs.digamma(dir_counts)).sum() \
                    + logBeta(dir_counts)
                #print("H", H_dir)

                i += 1

            self.RT[tau, t] = i - 1
            #print(tau, t, i-1)
        else:
            self.RT[tau, t] = 0

        if self.draw_true_post:
            chosen_pol = np.random.choice(npi, p=posterior_policies)
        else:
            chosen_pol = self.accepted_pis[i - 1]

        u = actions[chosen_pol]
        #print(tau,t,iself.accepted_pis[i-1],u,H_rel)
        # if tau in range(100,110) and t==0:
        #     plt.figure()
        #     plt.plot(posterior_policies)
        #     plt.show()

        if self.calc_dkl:
            # autocorr = acov(self.accepted_pis[:i+1])

            # if autocorr[0] > 0:
            #     ACT = 1 + 2*np.abs(autocorr[1:]).sum()/autocorr[0]
            #     ess = i/ACT
            #     ess = round(ess)
            # else:
            #     ess = 1

            dist = dir_counts / dir_counts.sum()
            D_KL = entropy(posterior_policies, dist)
            self.DKL_post[tau, t] = D_KL
            D_KL = entropy(prior, dist)
            self.DKL_prior[tau, t] = D_KL

        if self.calc_entropy:
            self.entropy_post[tau, t] = entropy(posterior_policies)
            self.entropy_prior[tau, t] = entropy(prior)
            self.entropy_like[tau, t] = entropy(likelihood)
            # if t==0:
            #     print(tau)
            #     n = 12
            #     ind = np.argpartition(posterior_policies, -n)[-n:]
            #     print(np.sort(ind))
            #     print(np.sort(posterior_policies[ind]))

        #estimate action probability
        self.estimate_action_probability(tau, t, posterior_policies, actions)

        return u
Example #32
def wnn_entropy(points, k=None, weights=True, n_est=None, gmm=None):
    r"""
    Weighted Kozachenko-Leonenko nearest-neighbour entropy calculation.

    *k* is the number of neighbours to consider, with default $k=n^{1/3}$

    *n_est* is the number of points to use for estimating the entropy,
    with default $n_\rm{est} = n$

    *weights* is True for default weights, False for unweighted (using the
    distance to the kth neighbour only), or a vector of weights of length *k*.

    *gmm* is the number of gaussians to use to model the distribution using
    a gaussian mixture model.  Default is 0, and the points represent an
    empirical distribution.

    Returns entropy H in bits and its uncertainty.

    Berrett, T. B., Samworth, R.J., Yuan, M., 2016. Efficient multivariate
    entropy estimation via k-nearest neighbour distances.
    DOI:10.1214/18-AOS1688 https://arxiv.org/abs/1606.00304
    """
    from sklearn.neighbors import NearestNeighbors
    n, d = points.shape

    # Default cap on the number of evaluation points (n_est=0 means use the full set)
    if n_est is None:
        n_est = 10000
    elif n_est == 0:
        n_est = n

    # reduce size of draw to n_est
    if n_est >= n:
        x = points
        n_est = n
    else:
        x = points[permutation(n)[:n_est]]
        n = n_est

    # Default k based on n
    if k is None:
        # Private communication: cube root of n is a good choice for k
        # Personal observation: k should be much bigger than d
        k = max(int(n**(1 / 3)), 3 * d)

    # If weights are given then use them (setting the appropriate k),
    # otherwise use the default weights.
    if isinstance(weights, bool):
        weights = _wnn_weights(k, d, weights)
    else:
        k = len(weights)
    #print("weights", weights, sum(weights))

    # select knn algorithm
    algorithm = 'auto'
    #algorithm = 'kd_tree'
    #algorithm = 'ball_tree'
    #algorithm = 'brute'

    n_components = 0 if gmm is None else gmm

    # H = 1/n sum_i=1^n sum_j=1^k w_j log E_{j,i}
    # E_{j,i} = e^-Psi(j) V_d (n-1) z_{j,i}^d = C z^d
    # logC = -Psi(j) + log(V_d) + log(n-1)
    # H = 1/n sum sum w_j logC + d/n sum sum w_j log(z)
    #   = sum w_j logC + d/n sum sum w_j log(z)
    #   = A + d/n B
    # H^2 = 1/n sum
    Psi = digamma(np.arange(1, k + 1))
    logVd = d / 2 * log(pi) - gammaln(1 + d / 2)
    logC = -Psi + logVd + log(n - 1)

    # TODO: standardizing points doesn't work.
    # Standardize the data so that distances conform.  This is equivalent to
    # a u-substitution u = sigma x + mu, so the integral needs to be corrected
    # for dU = det(sigma) dx.  Since the standardization squishes the dimensions
    # independently, sigma is a diagonal matrix, with the determinant equal to
    # the product of the diagonal elements.
    #x, mu, sigma = standardize(x)  # Note: sigma may be zero
    #detDU = np.prod(sigma)
    detDU = 1.

    if n_components > 0:
        # Use Gaussian mixture to model the distribution
        from sklearn.mixture import GaussianMixture as GMM
        predictor = GMM(n_components=gmm, covariance_type='full')
        predictor.fit(x)
        eval_x, _ = predictor.sample(n_est)
        #weight_x = predictor.score_samples(eval_x)
        skip = 0
    else:
        # Empirical distribution
        # TODO: should we use the full draw for kNN and a subset for eval points?
        # Choose a subset for evaluating the entropy estimate, if desired
        #print(n_est, n)
        #eval_x = x if n_est >= n else x[permutation(n)[:n_est]]
        eval_x = x
        #weight_x = 1
        skip = 1

    tree = NearestNeighbors(algorithm=algorithm, n_neighbors=k + skip)
    tree.fit(x)
    dist, _ind = tree.kneighbors(eval_x,
                                 n_neighbors=k + skip,
                                 return_distance=True)
    # Remove first column. Since test points are in x, the first column will
    # be a point from x with distance 0, and can be ignored.
    if skip:
        dist = dist[:, skip:]
    # Find log distances.  This can be problematic for MCMC runs where a
    # step is rejected, and therefore identical points are in the distribution.
    # Ignore them by replacing these points with nan and using nanmean.
    # TODO: need proper analysis of duplicated points in MCMC chain
    dist[dist == 0] = nan
    logdist = log(dist)
    H_unweighted = logC + d * np.nanmean(logdist, axis=0)
    H = np.dot(H_unweighted, weights)[0]
    Hsq_k = np.nanmean((logC[-1] + d * logdist[:, -1])**2)
    # TODO: abs shouldn't be needed?
    if Hsq_k < H**2:
        print("warning: avg(H^2) < avg(H)^2")
    dH = sqrt(abs(Hsq_k - H**2) / n_est)
    #print("unweighted", H_unweighted)
    #print("weighted", H, Hsq_k, H**2, dH, detDU, LN2)
    return H * detDU / LN2, dH * detDU / LN2
def predict_s(x, K=3, iter_num=20, my_seed=0):

    #set seed
    np.random.seed(seed=my_seed)

    # sample num
    N = len(x)

    ### prior parameters
    # gamma prior hyperparameters
    a = 200
    b = 5

    # Dirichlet prior parameter
    alpha = np.array([30, 20, 10])

    # initial values of the gamma posterior parameters
    a_update = np.array([200, 200, 200])
    b_update = np.array([5, 5, 5])

    # initial value of the Dirichlet posterior parameter for \pi
    alpha_update = np.array([30, 20, 10])

    # initialize the responsibilities s
    s_mean = []
    for n in range(N):
        s_mean.append([0.4, 0.3, 0.3])
    s_mean = np.array(s_mean)

    for i in range(iter_num):
        #print("\r Iteration:{}".format(i))

        #####################################################
        # expectations of λ, ln λ, and π

        lam_mean = np.zeros(K)
        ln_lam_mean = np.zeros(K)
        ln_pi_mean = np.zeros(K)

        lam_mean = a_update / b_update
        ln_lam_mean = sp.digamma(a_update) - np.log(b_update)
        ln_pi_mean = sp.digamma(alpha_update) - sp.digamma(
            np.sum(alpha_update))

        #####################################################
        # q(sn)
        s_mean = np.exp(
            x.reshape(len(x), 1) * ln_lam_mean - lam_mean + ln_pi_mean)
        s_mean /= np.sum(s_mean, axis=1).reshape(N, 1)

        ###########################################
        # update a, b
        a_update = np.sum(x.reshape(len(x), 1) * s_mean, axis=0) + a
        b_update = np.sum(s_mean, axis=0) + b

        # update α
        alpha_update = np.sum(s_mean, axis=0) + alpha
        #####################################################

    # determine group number by order of λ
    s_order = st.gamma(a=a_update, scale=1 / b_update).mean().argsort()
    s_mean_ordered = s_mean[:, s_order]

    return s_mean_ordered
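
# Usage sketch (assumption): runs the variational Poisson-mixture update on a
# synthetic three-component sample, assuming predict_s and its imports (numpy as
# np, scipy.special as sp, scipy.stats as st) are in scope.
np.random.seed(1)
x_demo = np.concatenate([np.random.poisson(lam, 100) for lam in (2, 5, 10)])
responsibilities = predict_s(x_demo, K=3, iter_num=30)
print(responsibilities.shape)           # (300, 3)
print(responsibilities[:5].round(2))    # soft assignments for the first few points
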
Example #34
def func_1vN(Ecb, mu, T, Dm, Dp, itype, limit):
    """
    Function used when generating 1vN, Redfield approach kernel.

    Parameters
    ----------
    Ecb : float
        Energy.
    mu : float
        Chemical potential.
    T : float
        Temperature.
    Dm, Dp : float
        Bandwidth.
    itype : int
        Type of integral for first order approach calculations.
        itype=0: the principal parts are evaluated using Fortran integration package QUADPACK
                 routine dqawc through SciPy.
        itype=1: the principal parts are kept, but approximated by digamma function valid for
                 large bandwidth D.
        itype=2: the principal parts are neglected.
        itype=3: the principal parts are neglected and infinite bandwidth D is assumed.
    limit : int
        For itype=0 dqawc_limit determines the maximum number of subintervals
        in the partition of the given integration interval.

    Returns
    -------
    array
        Array of four complex numbers [cur0, cur1, en0, en1] containing
        momentum-integrated current amplitudes.
        cur0 - particle current amplitude.
        cur1 - hole current amplitude.
        en0 - particle energy current amplitude.
        en1 - hole energy current amplitude.
    """
    if itype == 0:
        alpha, Rm, Rp = (Ecb - mu) / T, (Dm - mu) / T, (Dp - mu) / T
        cur0, err = quad(fermi_func,
                         Rm,
                         Rp,
                         weight='cauchy',
                         wvar=alpha,
                         epsabs=1.0e-6,
                         epsrel=1.0e-6,
                         limit=limit)
        cur0 = cur0 + (-1.0j * pi *
                       fermi_func(alpha) if alpha < Rp and alpha > Rm else 0)
        cur1 = cur0 + log(abs((Rm - alpha) / (Rp - alpha)))
        cur1 = cur1 + (1.0j * pi if alpha < Rp and alpha > Rm else 0)
        #
        const0 = T * ((-Rm if Rm < -40 else log(1 + exp(-Rm))) -
                      (-Rp if Rp < -40 else log(1 + exp(-Rp))))
        const1 = const0 + Dm - Dp
        #
        en0 = const0 + Ecb * cur0
        en1 = const1 + Ecb * cur1
    elif itype == 1:
        alpha, Rm, Rp = (Ecb - mu) / T, Dm / T, Dp / T
        cur0 = digamma(0.5 + 1.0j * alpha /
                       (2 * pi)).real - log(abs(Rm) / (2 * pi))
        cur0 = cur0 - 1.0j * pi * fermi_func(alpha)
        cur1 = cur0 + log(abs(Rm / Rp))
        cur1 = cur1 + 1.0j * pi
        #
        en0 = -T * Rm + Ecb * cur0
        en1 = -T * Rp + Ecb * cur1
    elif itype == 2:
        alpha, Rm, Rp = (Ecb - mu) / T, (Dm - mu) / T, (Dp - mu) / T
        cur0 = -1.0j * pi * fermi_func(
            alpha) if alpha < Rp and alpha > Rm else 0
        cur1 = cur0 + (1.0j * pi if alpha < Rp and alpha > Rm else 0)
        en0 = Ecb * cur0
        en1 = Ecb * cur1
    elif itype == 3:
        alpha = (Ecb - mu) / T
        cur0 = -1.0j * pi * fermi_func(alpha)
        cur1 = cur0 + 1.0j * pi
        en0 = Ecb * cur0
        en1 = Ecb * cur1
    #-------------------------
    return np.array([cur0, cur1, en0, en1])
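
# Usage sketch (assumption): the itype=3 branch (infinite bandwidth, principal
# parts neglected) needs no numerical integration, so it is the quickest to try;
# assumes func_1vN plus its module-level imports (numpy as np, pi, fermi_func)
# are in scope. All argument values here are illustrative only.
print(func_1vN(Ecb=0.5, mu=0.0, T=1.0, Dm=-1e4, Dp=1e4, itype=3, limit=100))
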
def var_bound(data, model, query, z_dnk=None):
    '''
    Determines the variational bounds.
    '''
    bound = 0

    # Unpack the structs, for ease of access and efficiency
    K, topicPrior, wordPrior, wordDists, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.dtype
    docLens, topicDists = \
        query.docLens, query.topicDists

    # Initialize z matrix if necessary
    W, X = data.words, data.links
    D, T = W.shape

    # Perform the digamma transform for E[ln \theta] etc.
    topicDists = topicDists.copy()
    diTopicDists = fns.digamma(topicDists)
    diSumTopicDists = fns.digamma(topicDists.sum(axis=1))
    diWordDists = fns.digamma(model.wordDists)
    diSumWordDists = fns.digamma(model.wordDists.sum(axis=1))

    # E[ln p(topics|topicPrior)] according to q(topics)
    #
    prob_topics = D * (fns.gammaln(topicPrior.sum()) - fns.gammaln(topicPrior).sum()) \
        + np.sum((topicPrior - 1)[np.newaxis, :] * (diTopicDists - diSumTopicDists[:, np.newaxis]))

    bound += prob_topics

    # and its entropy
    ent_topics = _dirichletEntropy(topicDists)
    bound += ent_topics

    # E[ln p(vocabs|vocabPrior)]
    #
    if type(model.vocabPrior) is float or type(model.vocabPrior) is int:
        prob_vocabs = K * (fns.gammaln(wordPrior * T) - T * fns.gammaln(wordPrior)) \
               + np.sum((wordPrior - 1) * (diWordDists - diSumWordDists[:, np.newaxis] ))
    else:
        prob_vocabs = K * (fns.gammaln(wordPrior.sum()) - fns.gammaln(wordPrior).sum()) \
               + np.sum((wordPrior - 1)[np.newaxis,:] * (diWordDists - diSumWordDists[:, np.newaxis] ))

    bound += prob_vocabs

    # and its entropy
    ent_vocabs = _dirichletEntropy(wordDists)
    bound += ent_vocabs

    # P(z|topic) is tricky as we don't actually store this. However
    # we make a single, simple estimate for this case.
    topicMeans = _convertDirichletParamToMeans(docLens, topicDists, topicPrior)

    prob_words = 0
    prob_z = 0
    ent_z = 0
    for d in range(D):
        wordIdx, z = _infer_topics_at_d(d, data, docLens, topicMeans,
                                        topicPrior, diWordDists,
                                        diSumWordDists)

        # E[ln p(Z|topics) = sum_d sum_n sum_k E[z_dnk] E[ln topicDist_dk]
        exLnTopic = diTopicDists[d, :] - diSumTopicDists[d]
        prob_z += np.dot(z * exLnTopic[:, np.newaxis], W[d, :].data).sum()

        # E[ln p(W|Z)] = sum_d sum_n sum_k sum_t E[z_dnk] w_dnt E[ln vocab_kt]
        prob_words += np.sum(
            W[d, :].data[np.newaxis, :] * z *
            (diWordDists[:, wordIdx] - diSumWordDists[:, np.newaxis]))

        # And finally the entropy of Z
        ent_z -= np.dot(z * safe_log(z), W[d, :].data).sum()

    bound += (prob_z + ent_z + prob_words)

    _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)
    return bound
Example #36
def get_E_log_hi(i,k,v,mu,covariance,X):
    D = mu.shape[1]
    term1 = np.matmul((X[:,i].reshape(-1,1)-mu[k]).T , inv(covariance[k]))
    term2 = np.matmul(term1, X[:,i].reshape(-1,1)-mu[k])[0,0]
    val = digamma((v[k]+D)/2) - np.log( (v[k] + term2)/2 )
    return val
Example #37
def objFunc_constrained(rhoomega,
                        sumLogPi=0,
                        sumLogPiActiveVec=None,
                        sumLogPiRemVec=None,
                        nDoc=0,
                        gamma=1.0,
                        alpha=1.0,
                        kappa=0.0,
                        startAlphaLogPi=0.0,
                        approx_grad=False,
                        **kwargs):
    ''' Returns constrained objective function and its gradient.

    Args
    -------
    rhoomega := 1D array, size 2*K

    Returns
    -------
    f := -1 * L(rhoomega),
         where L is ELBO objective function (log posterior prob)
    g := gradient of f
    '''
    assert not np.any(np.isnan(rhoomega))
    assert not np.any(np.isinf(rhoomega))
    rho, omega, K = _unpack(rhoomega)

    g1 = rho * omega
    g0 = (1 - rho) * omega
    digammaomega = digamma(omega)
    assert not np.any(np.isinf(digammaomega))

    Elogu = digamma(g1) - digammaomega
    Elog1mu = digamma(g0) - digammaomega

    if nDoc > 0:
        # Any practical call to this will have nDoc > 0
        if kappa > 0:
            scale = 1.0
            ONcoef = K + 1.0 - g1
            OFFcoef = K * kvec(K) + 1.0 + gamma - g0
            Tvec = alpha * sumLogPi + startAlphaLogPi
            Tvec[:-1] += np.log(alpha + kappa) - np.log(kappa)
            # Calc local term
            Ebeta = np.hstack([rho, 1.0])
            Ebeta[1:] *= np.cumprod(1 - rho)
            elbo_local = np.inner(Ebeta, Tvec)

        elif sumLogPiRemVec is not None:
            scale = nDoc
            ONcoef = 1 + (1.0 - g1) / scale
            OFFcoef = kvec(K) + (gamma - g0) / scale
            Pvec = alpha * sumLogPiActiveVec / scale
            Qvec = alpha * sumLogPiRemVec / scale

            # Calc local term
            Ebeta_gtm1 = np.hstack([1.0, np.cumprod(1 - rho[:-1])])
            elbo_local = np.inner(rho * Ebeta_gtm1, Pvec) + \
                np.inner((1-rho) * Ebeta_gtm1, Qvec)
        else:
            scale = nDoc
            ONcoef = 1 + (1.0 - g1) / scale
            OFFcoef = kvec(K) + (gamma - g0) / scale
            Tvec = alpha * sumLogPi / scale + startAlphaLogPi / scale

            # Calc local term
            Ebeta = np.hstack([rho, 1.0])
            Ebeta[1:] *= np.cumprod(1 - rho)
            elbo_local = np.inner(Ebeta, Tvec)

    else:
        # This is special case for unit tests that make sure the optimizer
        # finds the parameters that set q(u) equal to its prior when nDoc=0
        scale = 1
        ONcoef = 1 - g1
        OFFcoef = gamma - g0
        elbo_local = 0

    elbo = -1 * c_Beta(g1, g0) / scale \
        + np.inner(ONcoef, Elogu) \
        + np.inner(OFFcoef, Elog1mu) \
        + elbo_local

    if approx_grad:
        return -1.0 * elbo

    # Gradient computation!
    trigamma_omega = polygamma(1, omega)
    trigamma_g1 = polygamma(1, g1)
    trigamma_g0 = polygamma(1, g0)
    assert np.all(np.isfinite(trigamma_omega))
    assert np.all(np.isfinite(trigamma_g1))

    gradrho = ONcoef * omega * trigamma_g1 \
        - OFFcoef * omega * trigamma_g0
    gradomega = ONcoef * (rho * trigamma_g1 - trigamma_omega) \
        + OFFcoef * ((1 - rho) * trigamma_g0 - trigamma_omega)
    if nDoc > 0:
        if sumLogPiRemVec is None:
            # TODO make this line faster. This is the hot spot.
            Delta = calc_dEbeta_drho(Ebeta, rho, K)
            gradrho += np.dot(Delta, Tvec)
        else:
            Ebeta = np.hstack([rho, 1.0])
            Ebeta[1:] *= np.cumprod(1 - rho)

            Psi = calc_Psi(Ebeta, rho, K)
            gradrho += np.dot(Psi, Qvec)

            Delta = calc_dEbeta_drho(Ebeta, rho, K)[:, :K]
            gradrho += np.dot(Delta, Pvec)
    grad = np.hstack([gradrho, gradomega])

    return -1.0 * elbo, -1.0 * grad
Example #38
 def _grad_E_log_p_pi_given_beta(self, beta, gamma, alphatildes):
     # NOTE: switched argument name gamma <-> alpha
     retval = gamma*(digamma(alphatildes[:-1]) - digamma(alphatildes[-1])) \
             - gamma * (digamma(gamma*beta) - digamma(gamma))
     return retval
 def bql_f(x_):
     return np.log(x_) - digamma(x_)
def e_ln_pi_k(gama0, Nk):
    gammak = gama0 + Nk
    return digamma(gammak) - digamma(gammak.sum())
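
# Usage sketch (assumption): expected log mixture weights E[ln pi_k] under the
# Dirichlet(gamma0 + Nk) posterior computed by e_ln_pi_k; assumes numpy as np and
# scipy.special.digamma are in scope.
Nk_demo = np.array([120.0, 60.0, 20.0])
print(e_ln_pi_k(1.0, Nk_demo))  # largest (least negative) entry for the biggest cluster
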
        for i in clustered_img.keys():
            m0.append(np.asarray(clustered_img.get(i)).mean())
        mk = np.asarray(m0)
        mk_list.append(mk)

        test_mk_num = z.copy()
        test_mk_den = z.copy()

        e_x_mean_lambda_ = z.copy()
        shape = np.asarray([2 for _ in range(k)])
        gammak = gamma0 + Nk
        alphak = Nk / 2 + alpha0 - 1
        betak = beta0 + (rnk * e_x_mean_lambda_).sum(axis=0)

        e_ln_pi = e_ln_pi_k(gammak, Nk)
        e_ln_precision_ = digamma(alphak) - np.log(betak)
        e_precision_ = alphak / betak

        # Feature
        term1 = (rnk * (digamma(alphak) - np.log(betak))).sum(axis=1) / 2
        term2 = 1 / 2 * (rnk * (alphak / betak) * (
            (x.reshape(-1, 1) - mk.reshape(-1, 1).T)**2 + 1 / sk)).sum(axis=1)

        row_in_e = np.exp(term1 - term2)
        w = np.asarray([1 for _ in range(k)])

        epsolon = mk
        var_test = sk
        epsolon_in = np.exp(-1 / 2 * 1 / var_test * (
            (x.reshape(-1, 1) - epsolon.reshape(-1, 1).T)**2) +
                            1 / 2 * np.log(1 / var_test))
Example #42
    def perform_E_step(self, Y, params, terms_in_int_approx=5):
        n,d = Y.shape
        a = np.amin(Y) # for computing E[x]
        
        for j in xrange(n):
            ### posterior membership prob
            for h in range(self.nb_components):
                params['tau'][j,h] = self.weights[h] * self.component_dists[h].pdf(Y[j])
            params['tau'][j,:] /= params['tau'][j,:].sum()

            ### update e variables
            for h in range(self.nb_components):

                # S integral
                S = gamma((self.component_dists[h].df + 2.*self.dim)/2.) / gamma((self.component_dists[h].df + self.dim)/2.)
                for r in xrange(terms_in_int_approx):
                    r += 1
                    for s in xrange(r):
                        S += ((-1)**(2*r-s-1) / r) * (gamma(r+1)/(gamma(s+1)*gamma(r-s+1))) * gamma((self.component_dists[h].df + self.dim)/2. + s) / gamma((self.component_dists[h].df + 2.*self.dim)/2. + s) * self.component_dists[h].impSamp_cdf(self.component_dists[h].get_c(Y[j]), mu=np.zeros((Y[j].shape[0],)), Sigma=((self.component_dists[h].df + self.component_dists[h].get_d(Y[j]))/(self.component_dists[h].df + self.dim + 2.*s))*self.component_dists[h].Lambda, df=self.component_dists[h].df+self.dim+2.*s)  

                params['e'][0][j,h] = digamma(self.component_dists[h].df/2. + self.dim) - np.log((self.component_dists[h].df + self.component_dists[h].get_d(Y[j]))/2.) - S/self.component_dists[h].impSamp_cdf(Y[j], mu=np.zeros((Y[j].shape[0],)), Sigma=self.component_dists[h].Lambda, df=self.component_dists[h].df+self.dim)

                params['e'][1][j,h] = (self.component_dists[h].df + self.dim)/(self.component_dists[h].df + self.component_dists[h].get_d(Y[j])) * \
                    self.component_dists[h].impSamp_cdf(Y[j], mu=np.zeros((Y[j].shape[0],)), Sigma=self.component_dists[h].Lambda, df=self.component_dists[h].df+self.dim+2)/\
                    self.component_dists[h].impSamp_cdf(Y[j], mu=np.zeros((Y[j].shape[0],)), Sigma=self.component_dists[h].Lambda, df=self.component_dists[h].df+self.dim)

                # compute moment E[x]
                c = self.component_dists[h].impSamp_cdf(self.component_dists[h].mu[0] - a, mu = np.zeros(self.component_dists[h].mu.shape))
                xi = np.zeros(self.component_dists[h].mu.shape)

                for d in range(self.dim):
                    mu_minus_d = np.delete(self.component_dists[h].mu, d, axis=1)
                    Sigma_minus_d = np.delete(np.delete(self.component_dists[h].Sigma, d, axis=0), d, axis=1)
                    sigma_d = np.delete(self.component_dists[h].Sigma[:,d], d, axis=0)
                    
                    a_star = (mu_minus_d-a) - (mu_minus_d-a) * 1./self.component_dists[h].Sigma[d,d]
                    Sigma_star = (self.component_dists[h].df + 1./self.component_dists[h].Sigma[d,d] * (self.component_dists[h].mu[0,d] - a)**2)/(self.component_dists[h].df-1) *\
                        Sigma_minus_d - 1./self.component_dists[h].Sigma[d,d] * np.dot(sigma_d, sigma_d.T)

                    xi[0,d] = 1./(2*np.pi*self.component_dists[h].Sigma[d,d]) * (self.component_dists[h].df/(self.component_dists[h].df+(1./self.component_dists[h].Sigma[d,d])*(self.component_dists[h].mu[0,d] - a)**2))**((self.component_dists[h].df-1)/2.) * np.sqrt(self.component_dists[h].df/2) * gamma((self.component_dists[h].df-1)/2.)/gamma(self.component_dists[h].df/2.) * self.component_dists[h].impSamp_cdf(a_star, mu = np.zeros((1, Sigma_star.shape[0])), Sigma = Sigma_star, df=self.component_dists[h].df-1)

                epsilon = 1./c * np.dot(xi, self.component_dists[h].Sigma)
                E_x = self.component_dists[h].mu + epsilon

                params['e'][2][j,h,:] = params['e'][1][j,h] * E_x

                # compute moment E[xx]
                H = np.zeros((self.dim, self.dim))
                for i in range(self.dim):
                    # use jj here so the outer per-sample index j is not clobbered
                    for jj in range(self.dim):

                        if self.dim < 3: break

                        if jj != i:
                            # precompute the necessary slices
                            mu_ij = np.array([[self.component_dists[h].mu[0,i], self.component_dists[h].mu[0,jj]]])
                            Sigma_ij = np.array([[self.component_dists[h].Sigma[i,i], self.component_dists[h].Sigma[i,jj]], [self.component_dists[h].Sigma[jj,i], self.component_dists[h].Sigma[jj,jj]]])
                            if jj > i:
                                mu_negij = np.delete(np.delete(self.component_dists[h].mu, i, axis=1), jj-1, axis=1)
                                Sigma_parenij = np.delete(np.delete(np.array([self.component_dists[h].Sigma[:,i], self.component_dists[h].Sigma[:,jj]]).T, i, axis=0), jj-1, axis=0)
                                Sigma_negij = np.delete(np.delete(np.delete(np.delete(self.component_dists[h].Sigma, i, axis=0), jj-1, axis=0), i, axis=1), jj-1, axis=1)
                            else:
                                mu_negij = np.delete(np.delete(self.component_dists[h].mu, i, axis=1), jj, axis=1)
                                Sigma_parenij = np.delete(np.delete(np.array([self.component_dists[h].Sigma[:,i], self.component_dists[h].Sigma[:,jj]]).T, i, axis=0), jj, axis=0)
                                Sigma_negij = np.delete(np.delete(np.delete(np.delete(self.component_dists[h].Sigma, i, axis=0), jj, axis=0), i, axis=1), jj, axis=1)

                            df_star = self.component_dists[h].df + np.dot(np.dot((mu_ij - a), inv(Sigma_ij)), (mu_ij - a).T)
 
                            
                            a_star_star = (mu_negij - a) - np.dot(np.dot(Sigma_parenij, inv(Sigma_ij)), mu_ij - a)
                            Sigma_star_star = df_star/(self.component_dists[h].df - 2) * (Sigma_negij - np.dot(np.dot(Sigma_parenij, inv(Sigma_ij)), Sigma_parenij.T))

                            H[i,jj] = 1./(2 * np.pi * np.sqrt(self.component_dists[h].Sigma[i,i]*self.component_dists[h].Sigma[jj,jj] - self.component_dists[h].Sigma[i,jj]**2))
                            H[i,jj] *= (self.component_dists[h].df)/(self.component_dists[h].df-2) * (self.component_dists[h].df/df_star)**(self.component_dists[h].df/2 - 1)
                            H[i,jj] *= self.component_dists[h].impSamp_cdf(a_star_star, mu = 0., Sigma = Sigma_star_star, df = self.component_dists[h].df-2)
                        
                    H[i,i] = 1./self.component_dists[h].Sigma[i,i] * ((self.component_dists[h].mu[0,i] - a) * xi[0,i] - np.sum([self.component_dists[h].Sigma[i,k]*H[i,k] for k in range(self.dim) if k!=i])) 

                E_xx = np.dot(self.component_dists[h].mu.T, self.component_dists[h].mu) + np.dot(self.component_dists[h].mu.T, epsilon) + np.dot(epsilon.T, self.component_dists[h].mu) - 1./c * np.dot(np.dot(self.component_dists[h].Sigma, H), self.component_dists[h].Sigma) + 1./c * (self.component_dists[h].df)/(self.component_dists[h].df-2) * self.component_dists[h].impSamp_cdf(self.component_dists[h].mu[0] - a, mu = np.zeros(self.component_dists[h].mu.shape), Sigma = (self.component_dists[h].df)/(self.component_dists[h].df-2) * self.component_dists[h].Sigma, df = self.component_dists[h].df-2) * self.component_dists[h].Sigma 
                
                params['e'][3][j, h, :, :] = params['e'][1][j, h] * E_xx

        return params
Example #43
def e_ln_precision(alpha, beta):
    return digamma(alpha) - np.log(np.abs(beta))
Example #44
    def fit(self, corpus):
        word_indexes = []
        word_counts = []
        for row_corpus in corpus:
            row_indexes = []
            row_counts = []
            for w_i, w_c in row_corpus:
                row_indexes.append(w_i)
                row_counts.append(w_c)
            word_indexes.append(row_indexes)
            word_counts.append(row_counts)

        n_documents = len(word_indexes)

        max_index = 0
        for d in range(n_documents):
            document_max = np.max(word_indexes[d])
            if max_index < document_max:
                max_index = document_max

        n_word_types = max_index + 1

        theta = np.random.uniform(size=(n_documents, self.n_topic))
        old_theta = np.copy(theta)
        phi = np.random.uniform(size=(self.n_topic, n_word_types))

        for n in range(self.n_iter):
            sum_phi = []
            for k in range(self.n_topic):
                sum_phi.append(sum(phi[k]))
            ndk = theta
            nkv = np.zeros((self.n_topic, n_word_types))

            sample_X = []
            for d in range(n_documents):
                n_words_in_doc = len(word_indexes[d])
                sum_theta_d = sum(theta[d])
                prob_d = digamma(theta[d]) - digamma(sum_theta_d)
                ndk[d, :] = 0.
                dummies = np.array([0.] * self.n_topic)
                for w in range(n_words_in_doc):
                    word_no = word_indexes[d][w]
                    prob_w = digamma(phi[:, word_no]) - digamma(sum_phi)
                    latent_z = np.exp(prob_w + prob_d)
                    latent_z /= np.sum(latent_z)

                    ndk[d, :] += latent_z * word_counts[d][w]
                    nkv[:, word_no] += latent_z * word_counts[d][w]
                    z = np.argmax(latent_z)
                    dummies[z] += 1.
                sample_X.append(dummies / n_words_in_doc)

            theta = ndk + self.alpha
            phi = nkv + self.beta
            print(n, np.max(theta - old_theta))
            old_theta = np.copy(theta)

        for k in range(self.n_topic):
            phi[k] = phi[k] / np.sum(phi[k])

        for d in range(n_documents):
            theta[d] = theta[d] / np.sum(theta[d])

        return phi, theta, np.array(sample_X)
Example #45
 def myfunc(r, d):
     tot = 0
     N = len(d)
     for thing in d:
         tot += digamma(r + thing)
     return N * np.log(r / (r + np.sum(d) / N)) - N * digamma(r) + tot
Example #46
def _compute_mi_cd(c, d, n_neighbors):
    """Compute mutual information between continuous and discrete variables.

    Parameters
    ----------
    c : ndarray, shape (n_samples,)
        Samples of a continuous random variable.

    d : ndarray, shape (n_samples,)
        Samples of a discrete random variable.

    n_neighbors : int
        Number of nearest neighbors to search for each point, see [1]_.

    Returns
    -------
    mi : float
        Estimated mutual information. If it turns out to be negative, it is
        replaced by 0.

    Notes
    -----
    True mutual information can't be negative. If its estimate by a numerical
    method is negative, it means (providing the method is adequate) that the
    mutual information is close to 0 and replacing it by 0 is a reasonable
    strategy.

    References
    ----------
    .. [1] B. C. Ross "Mutual Information between Discrete and Continuous
       Data Sets". PLoS ONE 9(2), 2014.
    """
    n_samples = c.shape[0]
    c = c.reshape((-1, 1))

    radius = np.empty(n_samples)
    label_counts = np.empty(n_samples)
    k_all = np.empty(n_samples)
    nn = NearestNeighbors()
    for label in np.unique(d):
        mask = d == label
        count = np.sum(mask)
        if count > 1:
            k = min(n_neighbors, count - 1)
            nn.set_params(n_neighbors=k)
            nn.fit(c[mask])
            r = nn.kneighbors()[0]
            radius[mask] = np.nextafter(r[:, -1], 0)
            k_all[mask] = k
        label_counts[mask] = count

    # Ignore points with unique labels.
    mask = label_counts > 1
    n_samples = np.sum(mask)
    label_counts = label_counts[mask]
    k_all = k_all[mask]
    c = c[mask]
    radius = radius[mask]

    nn.set_params(algorithm='kd_tree')
    nn.fit(c)
    ind = nn.radius_neighbors(radius=radius, return_distance=False)
    m_all = np.array([i.size for i in ind])

    mi = (digamma(n_samples) + np.mean(digamma(k_all)) -
          np.mean(digamma(label_counts)) - np.mean(digamma(m_all + 1)))

    return max(0, mi)
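A minimal usage sketch for the function above (illustrative only: it assumes numpy is available as np and that digamma and NearestNeighbors are already imported exactly as the snippet requires; the synthetic data and variable names are hypothetical):

import numpy as np

rng = np.random.default_rng(0)
d_labels = rng.integers(0, 3, size=1000)                # discrete variable
c_values = d_labels + 0.5 * rng.standard_normal(1000)   # continuous variable correlated with d_labels

mi_estimate = _compute_mi_cd(c_values, d_labels, n_neighbors=3)
print("Estimated MI (nats):", mi_estimate)              # clearly above 0 for dependent variables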
Example #47
def _objective(l_count, dg, sub):
    alpha = np.exp(l_count) + 1
    nom = digamma(alpha)
    result = nom - sub
    return (result - dg)**2
Example #48
def makeDeleteMoveCandidate_LP(Data,
                               curLP,
                               curModel,
                               targetCompID=10,
                               deleteStrategy='truelabels',
                               minResp=0.001,
                               **curLPkwargs):
    '''

    Returns
    -------
    propcurLP : dict of local params
        Replaces targetCompID with K "new" states,
        each one tracking exactly one existing state.
    '''

    curResp = curLP['resp']
    maxRespValBelowThr = curResp[curResp < minResp].max()
    assert maxRespValBelowThr < 1e-90

    Natom, Korig = curResp.shape
    remCompIDs = np.setdiff1d(np.arange(Korig), [targetCompID])
    relDocIDs = np.flatnonzero(
        curLP['DocTopicCount'][:, targetCompID] > minResp)
    propResp = 1e-100 * np.ones((Natom, 2 * (Korig - 1)))
    propResp[:, :Korig - 1] = curResp[:, remCompIDs]

    if deleteStrategy.count('truelabels'):
        relAtoms = curResp[:, targetCompID] > minResp

        reltrueResp = Data.TrueParams['resp'][relAtoms].copy()
        reltrueResp[reltrueResp < minResp] = 1e-100
        reltrueResp /= reltrueResp.sum(axis=1)[:, np.newaxis]

        propResp[relAtoms, Korig-1:] = \
            reltrueResp * curResp[relAtoms, targetCompID][:,np.newaxis]
        propcurLP = curModel.allocModel.initLPFromResp(Data,
                                                       dict(resp=propResp))
        return propcurLP

    Lik = curLP['E_log_soft_ev'][:, remCompIDs].copy()

    # From-scratch strategy
    for d in relDocIDs:
        mask_d = np.arange(Data.doc_range[d], Data.doc_range[d + 1])
        relAtomIDs_d = mask_d[curLP['resp'][mask_d, targetCompID] > minResp]
        fixedDocTopicCount_d = curLP['DocTopicCount'][d, remCompIDs]
        relLik_d = Lik[relAtomIDs_d, :]
        relwc_d = Data.word_count[relAtomIDs_d]

        targetsumResp_d = curLP['resp'][relAtomIDs_d, targetCompID] * relwc_d
        sumResp_d = np.zeros_like(targetsumResp_d)

        DocTopicCount_d = np.zeros_like(fixedDocTopicCount_d)
        DocTopicProb_d = np.zeros_like(DocTopicCount_d)
        sumalphaEbeta = curModel.allocModel.alpha_E_beta()[targetCompID]
        alphaEbeta = sumalphaEbeta * 1.0 / (Korig - 1.0) * np.ones(Korig - 1)
        for riter in range(10):
            np.add(DocTopicCount_d, alphaEbeta, out=DocTopicProb_d)
            digamma(DocTopicProb_d, out=DocTopicProb_d)
            DocTopicProb_d -= DocTopicProb_d.max()
            np.exp(DocTopicProb_d, out=DocTopicProb_d)

            # Update sumResp for all tokens in document
            np.dot(relLik_d, DocTopicProb_d, out=sumResp_d)

            # Update DocTopicCount_d: 1D array, shape K
            #     sum(DocTopicCount_d) equals Nd[targetCompID]
            np.dot(targetsumResp_d / sumResp_d, relLik_d, out=DocTopicCount_d)
            DocTopicCount_d *= DocTopicProb_d
            DocTopicCount_d += fixedDocTopicCount_d

        DocTopicCount_dj = curLP['DocTopicCount'][d, targetCompID]
        DocTopicCount_dnew = np.sum(DocTopicCount_d) - \
            fixedDocTopicCount_d.sum()
        assert np.allclose(DocTopicCount_dj,
                           DocTopicCount_dnew,
                           rtol=0,
                           atol=1e-6)

        # Create proposal resp for relevant atoms in this doc only
        propResp_d = relLik_d.copy()
        propResp_d *= DocTopicProb_d[np.newaxis, :]
        propResp_d /= sumResp_d[:, np.newaxis]
        propResp_d *= curLP['resp'][relAtomIDs_d, targetCompID][:, np.newaxis]

        for n in range(propResp_d.shape[0]):
            size_n = curLP['resp'][relAtomIDs_d[n], targetCompID]
            sizeOrder_n = np.argsort(propResp_d[n, :])
            for k, compID in enumerate(sizeOrder_n):
                if propResp_d[n, compID] > minResp:
                    break
                propResp_d[n, compID] = 1e-100
                biggerCompIDs = sizeOrder_n[k + 1:]
                propResp_d[n, biggerCompIDs] /= \
                    propResp_d[n,biggerCompIDs].sum()
                propResp_d[n, biggerCompIDs] *= size_n

        # Fill in huge resp matrix with specific values
        propResp[relAtomIDs_d, Korig - 1:] = propResp_d
        assert np.allclose(propResp.sum(axis=1), 1.0, rtol=0, atol=1e-8)

    propcurLP = curModel.allocModel.initLPFromResp(Data, dict(resp=propResp))
    return propcurLP
Example #49
 def logL_prime(M):
     t1 = np.sum([ pmf[n] * digamma(n + M) for n in range(len(pmf)) ])
     t2 = -N * digamma(M)
     t3 = N * np.log(M / (M + k_bar))
     return t1 + t2 + t3
def _updateTopicHyperParamsFromMeans(model, query, max_iters=100):
    '''
    Update the hyperparameters on the Dirichlet prior over topics.

    This is a Newton Raphson method. We iterate until convergence or
    the maximum number of iterations is hit. We converge if the 1-norm of
    the difference between the previous and current estimate is less than
    0.001 / K where K is the number of topics.

    This is taken from Tom Minka's tech-note on "Estimating a Dirichlet
    Distribution", specifically the section on estimating a Polya distribution,
    which performed best in experiments. We substitute in the expected
    counts of topic assignments.

    At each iteration, the new value of a_k is set to

             \sum_d [ \Psi(n_dk + a_k) - \Psi(a_k) ]
    a_k <--  ---------------------------------------------  *  a_k
             \sum_d [ \Psi(n_d + \sum_j a_j) - \Psi(a_k) ]

    where the n_dk is the count of times topic k was assigned to tokens in
    document d, and its expected value is the same as the parameter of the
    posterior over topics for that document d, minus the hyper-parameter used
    to estimate that posterior. In this case, we assume that this method have
    been called from within the training routine, so we topicDists is essentially
    the mean of per-token topic-assignments, and thus needs to be scaled
    appropriately

    :param model: all the model parameters, notably the topicPrior, which is
    mutated IN-PLACE.
    :param query: all the document-level parameters, notably the topicDists,
    from which an appropriate prior is noted. It's expected that this contains
    the topic hyper-parameters, as usual, and not any intermediate representations
    (i.e. means) used by the inference procedure.
    '''
    print("Updating hyper-parameters")
    topic_prior = model.topicPrior
    old_topic_prior = topic_prior.copy()

    doc_lens = query.docLens
    doc_topic_counts = query.topicDists * doc_lens[:, np.newaxis] + old_topic_prior[np.newaxis, :]

    D, K = doc_topic_counts.shape

    psi_old_tprior = np.ndarray(topic_prior.shape, dtype=topic_prior.dtype)

    for _ in range(max_iters):
        doc_topic_counts += (topic_prior - old_topic_prior)[np.newaxis, :]
        old_topic_prior[:] = topic_prior

        fns.digamma(old_topic_prior, out=psi_old_tprior)

        numer = fns.psi(doc_topic_counts).sum(axis=0) - D * psi_old_tprior
        denom = fns.psi(doc_lens +
                        old_topic_prior.sum()).sum() - D * psi_old_tprior
        topic_prior[:] = old_topic_prior * (numer / denom)

        if la.norm(np.subtract(old_topic_prior, topic_prior), 1) < (0.001 * K):
            break

    # Undo the in-place changes we've been making to the topic distributions

    doc_topic_counts -= old_topic_prior[np.newaxis, :]
    doc_topic_counts /= doc_lens[:, np.newaxis]

    # Make sure it never is zero or negative
    for k in range(K):
        topic_prior[k] = max(topic_prior[k], 1E-6)
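The fixed-point update from the docstring can also be exercised on its own; the sketch below mirrors the loop above on a synthetic document-topic count matrix (the data and names such as n_dk are hypothetical, and scipy.special.psi stands in for fns.psi):

import numpy as np
from scipy.special import psi

rng = np.random.default_rng(1)
D, K = 200, 5
n_dk = rng.dirichlet(np.full(K, 0.1), size=D) * 100.0   # expected per-document topic counts
n_d = n_dk.sum(axis=1)                                   # document lengths

a = np.ones(K)                                           # current estimate of the prior a_k
for _ in range(100):
    a_old = a.copy()
    numer = psi(n_dk + a).sum(axis=0) - D * psi(a)
    denom = psi(n_d + a.sum()).sum() - D * psi(a)
    a = np.maximum(a_old * numer / denom, 1e-6)          # floor at 1e-6, as in the routine above
    if np.abs(a - a_old).sum() < 0.001 * K:
        break
print("estimated topic prior:", a)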
Example #51
m = np.mean(int_uniform.values, axis=None)
uci, lci = lib.compute_ci(int_uniform.values, axis=None)
print("Mean, confidence interval for entire uniform grid: %.5f, [%.5f, %.5f]."
                                                                %(m, uci, lci))

m = np.mean(int_normal.values, axis=None)
uci, lci = lib.compute_ci(int_normal.values, axis=None)
print("Mean, confidence interval for entire normal grid: %.5f, [%.5f, %.5f]."
                                                                %(m, uci, lci))

# =============================================================================
# Chapter 3 - Difference between ln(n) and psi(n)
# =============================================================================

n = np.arange(1, 51, 1)
ln = np.log(n)
psi = scp_sp.digamma(n)

plt.figure()
plt.plot(n, ln, label=r"$\ln(n)$")
plt.plot(n, psi, label=r"$\psi(n)$")
plt.xlabel("$n$")
plt.legend()
plt.savefig("output/ln_psi.png", dpi=500)

plt.figure()
plt.plot(n, np.abs(ln - psi))
plt.xlabel("$n$")
plt.ylabel(r"$|\ln(n) - \psi(n)|$")
plt.savefig("output/ln_psi_diff.png", dpi=500)

# =============================================================================
 def _entropy(self, x):
     return digamma(self.n) - digamma(self.n_neighbors) + self._epsilon(x)
def _old_train(data, model, query, plan, updateVocab=True):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint,

    Params:
    data - the training data, we just use the DxT document-term matrix
    model - the initial model configuration. This is MUTATED IN-PLACE
    query - the query results - essentially all the "local" variables
            matched to the given observations. Also MUTATED IN-PLACE
    plan  - how to execute the training process (e.g. iterations,
            log-interval etc.)

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the updated query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug, batchSize = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.fastButInaccurate, plan.debug, plan.batchSize
    docLens, topicMeans = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists ,dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.dtype

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError(
            "Input document-term matrix contains at least one document with no words"
        )
    assert model.dtype == np.float64, "Only implemented for 64-bit floats"

    # Prepare the data for inference
    topicMeans = _convertDirichletParamToMeans(docLens, topicMeans, topicPrior)

    W = data.words
    D, T = W.shape

    iters, bnds, likes = [], [], []

    # A few parameters for handling adaptive step-sizes in SGD
    grad = 0
    grad_inner = 0
    grad_rate = 1
    log_likely = 0  # complete dataset likelihood for gradient adjustments
    stepSize = np.array([1.] * K, dtype=model.dtype)

    # Instead of storing the full topic assignments for every individual word, we
    # re-estimate from scratch. I.e for the memberships z which is DxNxT in dimension,
    # we only store a 1xNxT = NxT part.
    diWordDistSums = np.empty((K, ), dtype=dtype)
    diWordDists = np.empty(wordDists.shape, dtype=dtype)
    wordUpdates = wordDists.copy() if batchSize > 0 else None
    batchProcessCount = 0

    # Amend the name if batchSize > 0, implying we're using SGD
    modelName = "lda/svbp/%s" % _sgd_desc(plan) \
                if batchSize > 0 else model.name
    print(modelName)

    for itr in range(iterations):
        diWordDistSums[:] = wordDists.sum(axis=1)
        fns.digamma(diWordDistSums, out=diWordDistSums)
        fns.digamma(wordDists, out=diWordDists)

        if updateVocab:
            # Perform inference, updating the vocab
            if batchSize == 0:
                wordDists[:, :] = vocabPrior
            else:
                wordUpdates[:, :] = 0

            for d in range(D):
                batchProcessCount += 1
                #if debug and d % 100 == 0: printAndFlushNoNewLine(".")
                wordIdx, z = _update_topics_at_d(d, data, docLens, topicMeans,
                                                 topicPrior, diWordDists,
                                                 diWordDistSums)
                wordDists[:, wordIdx] += W[d, :].data[np.newaxis, :] * z

                if plan.rate_algor == RateAlgorAmaria:
                    log_likely += 0
                elif plan.rate_algor == RateAlgorVariance:
                    g = wordDists.mean(axis=0) + vocabPrior
                    grad *= (1 - grad_rate)
                    grad += grad_rate * wordDists
                    grad += grad_rate * vocabPrior
                    gg += 0
                elif plan.rate_algor != RateAlgorTimeKappa:
                    raise ValueError("Unknown rate algorithm " +
                                     str(plan.rate_algor))

                if batchSize > 0 and batchProcessCount == batchSize:
                    batch_index = (
                        itr * D + d
                    ) / batchSize  #TODO  Will not be right if batchSize is not a multiple of D
                    stepSize = _step_sizes(stepSize, batch_index, g, gg,
                                           log_likely, plan)
                    wordDists *= (1 - stepSize)
                    wordDists += stepSize * vocabPrior

                    stepSize *= float(D) / batchSize
                    wordUpdates *= stepSize
                    wordDists += wordUpdates

                    diWordDistSums[:] = wordDists.sum(axis=1)
                    fns.digamma(diWordDistSums, out=diWordDistSums)
                    fns.digamma(wordDists, out=diWordDists)

                    wordUpdates[:, :] = 0
                    batchProcessCount = 0
                    log_likely = 0

                    if debug:
                        bnds.append(_var_bound_internal(data, model, query))
                        likes.append(
                            _log_likelihood_internal(data, model, query))

                        perp = perplexity_from_like(likes[-1], W.sum())
                        print(
                            "Iteration %d, after %d docs: Train Perp = %4.0f  Bound = %.3f"
                            % (itr, batchSize, perp, bnds[-1]))
                        sys.stdout.flush()

            # Log bound and the determine if we can stop early
            if itr % logFrequency == 0 or debug:
                iters.append(itr)
                bnds.append(_var_bound_internal(data, model, query))
                likes.append(_log_likelihood_internal(data, model, query))

                perp = perplexity_from_like(likes[-1], W.sum())
                print("Iteration %d : Train Perp = %4.0f  Bound = %.3f" %
                      (itr, perp, bnds[-1]))

                if len(iters) > 2 and (iters[-1] > 20 or
                                       (iters[-1] > 2 and batchSize > 0)):
                    lastPerp = perplexity_from_like(likes[-2], W.sum())
                    if lastPerp - perp < 1:
                        print("Converged, existing early")
                        break

            # Update hyperparameters (do this after bound, to make sure bound
            # calculation is internally consistent)
            if HyperUpdateEnabled and itr > 0 and itr % HyperParamUpdateInterval == 0:
                if debug: print("Topic Prior was " + str(topicPrior))
                _updateTopicHyperParamsFromMeans(model, query)
                if debug: print("Topic Prior is now " + str(topicPrior))
        else:
            for d in range(D):
                _ = _update_topics_at_d(d, data, docLens, topicMeans,
                                        topicPrior, diWordDists,
                                        diWordDistSums)

    topicMeans = _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)

    return ModelState(K, topicPrior, vocabPrior, wordDists, True, dtype, modelName), \
           QueryState(docLens, topicMeans, True), \
           (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
Example #54
 def updateExpectations(self):
     E = self.params['a'] / self.params['b']
     lnE = special.digamma(self.params['a']) - s.log(self.params['b'])
     self.expectations = {'E': E, 'lnE': lnE}
def train(data, model, query, plan, updateVocab=True):
    '''
    Infers the topic distributions in general, and specifically for
    each individual datapoint,

    Params:
    data - the training data, we just use the DxT document-term matrix
    model - the initial model configuration. This is MUTATED IN-PLACE
    query - the query results - essentially all the "local" variables
            matched to the given observations. Also MUTATED IN-PLACE
    plan  - how to execute the training process (e.g. iterations,
            log-interval etc.)

    Return:
    The updated model object (note parameters are updated in place, so make a
    defensive copy if you want it)
    The query object with the updated query parameters
    '''
    iterations, epsilon, logFrequency, fastButInaccurate, debug, batchSize, rateAlgor = \
        plan.iterations, plan.epsilon, plan.logFrequency, plan.fastButInaccurate, plan.debug, plan.batchSize, plan.rate_algor
    docLens, topicMeans = \
        query.docLens, query.topicDists
    K, topicPrior, vocabPrior, wordDists, dtype = \
        model.K, model.topicPrior, model.vocabPrior, model.wordDists, model.dtype

    # Quick sanity check
    if np.any(docLens < 1):
        raise ValueError(
            "Input document-term matrix contains at least one document with no words"
        )
    assert model.dtype == np.float64, "Only implemented for 64-bit floats"

    # Prepare the data for inference
    topicMeans = _convertDirichletParamToMeans(docLens, topicMeans, topicPrior)

    W = data.words
    D, T = W.shape

    iters, bnds, likes = [], [], []

    # A few parameters for handling adaptive step-sizes in SGD
    if plan.rate_algor == RateAlgorBatch:
        batchSize = D
        batchCount = 1
    else:
        batchSize = plan.batchSize
        batchCount = D // batchSize + 1

    gradStep = constantArray((K, ), 1. / float(batchSize), dtype=dtype)
    grad = np.zeros((K, T), dtype=dtype)
    ex_grad = grad.copy()
    exp_gtg = np.zeros((K, ), dtype=dtype)
    stepSize = np.ones((K, ), dtype=dtype)

    # The digamma terms for the vocabulary
    diWordDists = fns.digamma(wordDists)
    diWordDistSums = np.sum(wordDists, axis=1)
    fns.digamma(diWordDistSums, out=diWordDistSums)

    # Amend the name to incorporate training information
    rateAlgor = plan.rate_algor
    modelName = "lda/svbp/%s" % _sgd_desc(plan)
    print(modelName)

    # Start training
    d = -1
    for b in range(batchCount * iterations):
        grad.fill(vocabPrior)
        # firstD = d
        for s in range(batchSize):
            d = d + 1 if (d + 1) < D else 0

            wordIdx, z = _update_topics_at_d(d, data, docLens, topicMeans,
                                             topicPrior, diWordDists,
                                             diWordDistSums)
            grad[:, wordIdx] += W[d, :].data[np.newaxis, :] * z

        if rateAlgor == RateAlgorBatch:
            wordDists[:, :] = grad[:, :]
        else:
            if rateAlgor == RateAlgorTimeKappa:
                stepSize[:] = (b + plan.rate_delay)**(-plan.forgetting_rate)
            elif rateAlgor == RateAlgorVariance:
                update_inplace_v(gradStep, ex_grad, change=grad)
                gtg = stepSize.copy()
                for k in range(K):
                    stepSize[k] = np.dot(ex_grad[k, :], ex_grad[k, :])
                    gtg[k] = np.dot(grad[k, :], grad[k, :])
                update_inplace_s(gradStep, old=exp_gtg, change=gtg)
                stepSize /= exp_gtg
                gradStep = gradStep * (1 - stepSize) + 1
            elif rateAlgor == RateAlgorAmaria:
                topicMeans = _convertMeansToDirichletParam(
                    docLens, topicMeans, topicPrior)
                # doc_indices = np.linspace(firstD, firstD + batchSize -1, batchSize) % D
                log_likely = var_bound(
                    data,  # data._reorder(doc_indices),
                    ModelState(K, topicPrior, vocabPrior, wordDists, True,
                               dtype, modelName),
                    QueryState(docLens, topicMeans, True))
                p = stepSize[0]
                a, b = plan.rate_a, plan.rate_b
                p *= exp(a * (b * -log_likely - p))
                stepSize[:] = p
                topicMeans = _convertMeansToDirichletParam(
                    docLens, topicMeans, topicPrior)
            else:
                raise ValueError("No code to support the '" +
                                 str(plan.rate_algor) +
                                 "' learning rate adaptation algorithm")

            update_inplace_v(stepSize, old=wordDists, change=grad)

        if debug:
            print("%s : t=%d : step=%s" % (rateAlgor, b, str(stepSize)))

        if is_not_all_real(wordDists):
            print("Worddists nan")
        fns.digamma(wordDists, out=diWordDists)
        if is_not_all_real(diWordDists):
            print("Digamma worddists nan")
        np.sum(wordDists, axis=1, out=diWordDistSums)
        fns.digamma(diWordDistSums, out=diWordDistSums)

    topicMeans = _convertMeansToDirichletParam(docLens, topicMeans, topicPrior)

    return ModelState(K, topicPrior, vocabPrior, wordDists, True, dtype, modelName), \
           QueryState(docLens, topicMeans, True), \
           (np.array(iters, dtype=np.int32), np.array(bnds), np.array(likes))
def gmm(X, K, max_iter=100):
    N, D = X.shape

    # parameters for pi, mu, and precision
    alphas = np.ones(K, dtype=np.float32)  # prior parameter for pi (dirichlet)
    orig_alphas = np.ones(
        K, dtype=np.float32)  # prior parameter for pi (dirichlet)
    # mu_means = np.zeros((K, D), dtype=np.float32) # prior mean for mu (normal) ### No!
    # mu_covs = np.empty((K, D, D), dtype=np.float32) # prior covariance for mu (normal)

    orig_c = 10.0
    # for k in xrange(K):
    #   mu_covs[k] = np.eye(D)*orig_c

    orig_a = np.ones(K, dtype=np.float32) * D
    a = np.ones(K, dtype=np.float32) * D  # prior for precision (wishart)
    orig_B = np.empty((K, D, D))
    B = np.empty((K, D, D))  # precision (wishart)
    empirical_cov = np.cov(X.T)
    for k in xrange(K):
        B[k] = (D / 10.0) * empirical_cov
        orig_B[k] = (D / 10.0) * empirical_cov

    # try random init instead
    # mu_means = np.random.randn(K, D)*orig_c
    mu_means = np.empty((K, D))
    for j in xrange(K):
        mu_means[j] = X[np.random.choice(N)]
    mu_covs = wishart.rvs(df=orig_a[0], scale=np.linalg.inv(B[0]), size=K)

    costs = np.zeros(max_iter)
    for iter_idx in xrange(max_iter):
        # calculate q(c[i])
        # phi = np.empty((N,K)) # index i = sample, index j = cluster
        t1 = np.empty(K)
        t2 = np.empty((N, K))
        t3 = np.empty(K)
        t4 = np.empty(K)

        # calculate this first because we will use it multiple times
        Binv = np.empty((K, D, D))
        for j in range(K):
            Binv[j] = np.linalg.inv(B[j])

        for j in xrange(K):
            # calculate t1
            t1[j] = -np.log(np.linalg.det(B[j]))
            for d in xrange(D):
                t1[j] += digamma((1 - d + a[j]) / 2.0)

            # calculate t2
            for i in xrange(N):
                diff_ij = X[i] - mu_means[j]
                t2[i, j] = diff_ij.dot((a[j] * Binv[j]).dot(diff_ij))

            # calculate t3
            t3[j] = np.trace(a[j] * Binv[j].dot(mu_covs[j]))

            # calculate t4
            t4[j] = digamma(alphas[j]) - digamma(alphas.sum())

        # calculate phi from t's
        # MAKE SURE 1-d array gets added to 2-d array correctly
        phi = np.exp(0.5 * t1 - 0.5 * t2 - 0.5 * t3 + t4)
        # print "phi before normalize:", phi
        phi = phi / phi.sum(axis=1, keepdims=True)

        # print "phi:", phi

        cluster_assignments = phi.argmax(axis=1)

        n = phi.sum(axis=0)  # there should be K of these
        # print "n[j]:", n

        # update q(pi)
        alphas = orig_alphas + n
        # print "alphas:", alphas

        # update q(mu)
        for j in xrange(K):
            mu_covs[j] = np.linalg.inv((1.0 / orig_c) * np.eye(D) +
                                       n[j] * a[j] * Binv[j])
            mu_means[j] = mu_covs[j].dot(a[j] * Binv[j]).dot(phi[:, j].dot(X))

        # print "means:", mu_means
        # print "mu_covs:", mu_covs

        # update q(lambda)
        a = orig_a + n
        for j in xrange(K):
            B[j] = orig_B[j].copy()
            for i in xrange(N):
                diff_ij = X[i] - mu_means[j]
                B[j] += phi[i, j] * (np.outer(diff_ij, diff_ij) + mu_covs[j])

        # print "a[j]:", a
        # print "B[j]:", B

        costs[iter_idx] = get_cost(X, K, cluster_assignments, phi, alphas,
                                   mu_means, mu_covs, a, B, orig_alphas,
                                   orig_c, orig_a, orig_B)

    plt.plot(costs)
    plt.title("Costs")
    plt.show()

    print "cluster assignments:\n", cluster_assignments
    plt.scatter(X[:, 0], X[:, 1], c=cluster_assignments, s=100, alpha=0.7)
    plt.show()
def get_cost(X, K, cluster_assignments, phi, alphas, mu_means, mu_covs, a, B,
             orig_alphas, orig_c, orig_a, orig_B):
    N, D = X.shape
    total = 0
    ln2pi = np.log(2 * np.pi)

    # calculate B inverse since we will need it
    Binv = np.empty((K, D, D))
    for j in xrange(K):
        Binv[j] = np.linalg.inv(B[j])

    # calculate expectations first
    Elnpi = digamma(alphas) - digamma(alphas.sum())  # E[ln(pi)]
    Elambda = np.empty((K, D, D))
    Elnlambda = np.empty(K)
    for j in xrange(K):
        Elambda[j] = a[j] * Binv[j]
        Elnlambda[j] = D * np.log(2) - np.log(np.linalg.det(B[j]))
        for d in xrange(D):
            Elnlambda[j] += digamma(a[j] / 2.0 + (1 - d) / 2.0)

    # now calculate the log joint likelihood
    # Gaussian part
    # total -= N*D*ln2pi
    # total += 0.5*Elnlambda.sum()
    # for j in xrange(K):
    #   # total += 0.5*Elnlambda[j] # vectorized
    #   for i in xrange(N):
    #     if cluster_assignments[i] == j:
    #       diff_ij = X[i] - mu_means[j]
    #       total -= 0.5*( diff_ij.dot(Elambda[j]).dot(diff_ij) + np.trace(Elambda[j].dot(mu_covs[j])) )

    # mixture coefficient part
    # total += Elnpi.sum()

    # use phi instead
    for j in xrange(K):
        for i in xrange(N):
            diff_ij = X[i] - mu_means[j]
            inside = Elnlambda[j] - D * ln2pi
            inside += -diff_ij.dot(Elambda[j]).dot(diff_ij) - np.trace(
                Elambda[j].dot(mu_covs[j]))
            # inside += Elnpi[j]
            total += phi[i, j] * (0.5 * inside + Elnpi[j])

    # E{lnp(mu)} - based on original prior
    for j in xrange(K):
        E_mu_dot_mu = np.trace(mu_covs[j]) + mu_means[j].dot(mu_means[j])
        total += -0.5 * D * np.log(
            2 * np.pi * orig_c) - 0.5 * E_mu_dot_mu / orig_c

    # print "total:", total

    # E{lnp(lambda)} - based on original prior
    for j in xrange(K):
        total += (orig_a[j] - D - 1) / 2.0 * Elnlambda[j] - 0.5 * np.trace(
            orig_B[j].dot(Elambda[j]))
        # print "total 1:", total
        total += -orig_a[j] * D / 2.0 * np.log(2) + 0.5 * orig_a[j] * np.log(
            np.linalg.det(orig_B[j]))
        # print "total 2:", total
        total -= D * (D - 1) / 4.0 * np.log(np.pi)
        # print "total 3:", total
        for d in xrange(D):
            total -= np.log(gamma(orig_a[j] / 2.0 + (1 - d) / 2.0))

    # E{lnp(pi)} - based on original prior
    # - lnB(orig_alpha) + sum[j]{ orig_alpha[j] - 1}*E[lnpi_j]
    total += np.log(gamma(orig_alphas.sum())) - np.log(
        gamma(orig_alphas)).sum()
    total += ((orig_alphas - 1) *
              Elnpi).sum()  # should be 0 since orig_alpha = 1

    # calculate entropies of the q distributions
    # q(c)
    for i in xrange(N):
        total += stats.entropy(phi[i])  # categorical entropy

    # q(pi)
    total += dirichlet.entropy(alphas)

    # q(mu)
    for j in xrange(K):
        total += mvn.entropy(cov=mu_covs[j])

    # q(lambda)
    for j in xrange(K):
        total += wishart.entropy(df=a[j], scale=Binv[j])

    return total
Example #58
 def gamma_gradient(self, k):
     """
     :param k: the value k of the Gamma function
     :return: the value
     """
     return np.log(k) - special.digamma(k) - self.c
Example #59
 def df_eq(x):
     return tmp - (np.log(x/2.) - digamma(x/2.) + 1.)
Example #60
def invwishart_entropy(sigma,nu,chol=None):
    D = sigma.shape[0]
    chol = np.linalg.cholesky(sigma) if chol is None else chol
    Elogdetlmbda = special.digamma((nu-np.arange(D))/2).sum() + D*np.log(2) - 2*np.log(chol.diagonal()).sum()
    return invwishart_log_partitionfunction(sigma,nu,chol)-(nu-D-1)/2*Elogdetlmbda + nu*D/2