def test_parameter_estimation():
    N = 100  # number of observations
    K = 50   # dimension of Dirichlet
    _alpha = np.random.gamma(1, 1) * np.random.dirichlet([1.] * K)  # ground truth alpha
    obs = np.random.dirichlet(_alpha, size=N) + eps  # draw N samples from Dir(_alpha)
    obs /= np.sum(obs, 1)[:, np.newaxis]  # renormalize for added eps
    initial_alpha = np.ones(K)  # first guess on alpha

    # estimating
    est_alpha = parameter_estimation(obs, initial_alpha)

    g_ll = 0  # log-likelihood with ground truth parameter
    b_ll = 0  # log-likelihood with initial guess of alpha
    ll = 0    # log-likelihood with estimated parameter
    for i in range(N):
        g_ll += dirichlet.logpdf(obs[i], _alpha)
        b_ll += dirichlet.logpdf(obs[i], initial_alpha)
        ll += dirichlet.logpdf(obs[i], est_alpha)

    print('Test with parameter estimation')
    print('likelihood p(obs|_alpha) = %.3f' % g_ll)
    print('likelihood p(obs|initial_alpha) = %.3f' % b_ll)
    print('likelihood p(obs|estimate_alpha) = %.3f' % ll)
    print('likelihood difference = %.3f' % (g_ll - ll))
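# The test above calls parameter_estimation(obs, initial_alpha) without showing it.
# Below is a minimal sketch of one common choice, Minka's fixed-point iteration for the
# Dirichlet MLE; the helper _inv_digamma and all names here are illustrative assumptions,
# not the original implementation.
import numpy as np
from scipy.special import digamma, polygamma

def _inv_digamma(y, n_iter=5):
    # Newton iterations for the inverse digamma function (Minka-style initialization).
    x = np.where(y >= -2.22, np.exp(y) + 0.5, -1.0 / (y - digamma(1.0)))
    for _ in range(n_iter):
        x = x - (digamma(x) - y) / polygamma(1, x)
    return x

def parameter_estimation(obs, alpha, max_iter=1000, tol=1e-7):
    # Fixed-point update: psi(alpha_k) = psi(sum(alpha)) + mean(log obs[:, k]).
    log_p_bar = np.mean(np.log(obs), axis=0)
    for _ in range(max_iter):
        alpha_new = _inv_digamma(digamma(np.sum(alpha)) + log_p_bar)
        if np.max(np.abs(alpha_new - alpha)) < tol:
            return alpha_new
        alpha = alpha_new
    return alpha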
def _sample_alpha(self, step_size=0.01):
    # sampling the hyperparameter for the dirichlets
    if not self.unannealed_dists:
        return

    old_f_val = 0.0
    new_f_val = 0.0
    for dist in self.unannealed_dists.values():
        alpha_vec = np.zeros_like(dist)
        alpha_vec.fill(self.alpha)
        old_f_val += dirichlet.logpdf(dist, alpha_vec)

    # random-walk proposal, rejected until it falls inside the allowed range
    new_alpha = 0
    while new_alpha < self.alpha_range[0] or new_alpha > self.alpha_range[1]:
        new_alpha = self.alpha + np.random.normal(0.0, step_size)

    for dist in self.unannealed_dists.values():
        alpha_vec = np.zeros_like(dist)
        alpha_vec.fill(new_alpha)
        new_f_val += dirichlet.logpdf(dist, alpha_vec)

    acceptance_thres = np.log(np.random.uniform(0.0, 1.0))
    mh_ratio = new_f_val - old_f_val
    if mh_ratio > acceptance_thres:
        self.alpha = new_alpha
        logging.info('pcfg alpha samples a new value {} with log ratio {}/{}'.format(
            new_alpha, mh_ratio, acceptance_thres))
def propose(self):
    if len(self.value) == 1:
        return PriorDirichletDistribution(value=self.value, alpha=self.value), 0.0  # handle singleton rules

    ret = PriorDirichletDistribution(value=numpy.random.dirichlet(self.alpha), alpha=self.alpha)
    fb = dirichlet.logpdf(ret.value, self.alpha) - dirichlet.logpdf(self.value, self.alpha)
    return ret, fb
def fit(self, data, p_transitions, p_emissions, p_start, start_pseudocounts,
        transition_pseudocounts, emission_pseudocounts, verbose=True):
    """
    Run the EM algorithm to find the maximum likelihood or maximum a posteriori
    (if pseudocounts > 0) estimates of the model parameters.

    :param data: Training data. Shape = (number of sequences X length of sequence)
    :param p_transitions: Initial guess for transition probability matrix. Shape = (num_states X num_states)
    :param p_emissions: Initial guess for emission probability matrix. Shape = (num_states X num_emissions)
    :param p_start: Initial guess for first state occupancy probability vector. Shape = (num_states)
    :param start_pseudocounts: Parameters for the Dirichlet prior on first state occupancy. Shape = (num_states)
    :param transition_pseudocounts: Parameters for the Dirichlet priors on transition probabilities. Shape = (num_states X num_states)
    :param emission_pseudocounts: Parameters for the Dirichlet priors on emission probabilities. Shape = (num_states X num_emissions)
    :param verbose: Show the improvement in log likelihood/log posterior through training. Default = True
    :return:
    """
    self.p_transitions = p_transitions / np.sum(p_transitions, axis=1, keepdims=True)
    self.p_emissions = p_emissions / np.sum(p_emissions, axis=1, keepdims=True)
    self.p_start = p_start / np.sum(p_start)
    self.start_pseudocounts = start_pseudocounts
    self.transition_pseudocounts = transition_pseudocounts
    self.emission_pseudocounts = emission_pseudocounts
    self.converged = False

    for iter in range(self.max_iter):
        self.get_state_likelihood(data.astype("int"))
        alpha, beta, scaling, expected_latent_state, expected_latent_state_pair = self.E_step()
        self.M_step(data.astype("int"), expected_latent_state, expected_latent_state_pair)

        current_log_likelihood = np.sum(np.log(scaling))
        current_log_prior = np.sum(
            [dirichlet.logpdf(self.p_transitions[state, :], self.transition_pseudocounts[state, :])
             + dirichlet.logpdf(self.p_emissions[state, :], self.emission_pseudocounts[state, :])
             for state in range(self.num_states)]) \
            + dirichlet.logpdf(self.p_start, self.start_pseudocounts)
        self.log_posterior.append(current_log_likelihood + current_log_prior)

        if iter >= 1:
            improvement = self.log_posterior[-1] - self.log_posterior[-2]
            if verbose:
                print("Training improvement in log posterior:", improvement)
            if improvement <= self.threshold:
                self.converged = True
                break
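# A hypothetical call sketch for fit() above, assuming a 2-state / 4-symbol model and an
# already-instantiated `model` of the surrounding HMM class; all names and values here are
# illustrative, not part of the original code.
import numpy as np

num_states, num_emissions = 2, 4
data = np.random.randint(0, num_emissions, size=(10, 50))             # 10 sequences of length 50
p_transitions = np.random.dirichlet(np.ones(num_states), num_states)  # rows sum to 1
p_emissions = np.random.dirichlet(np.ones(num_emissions), num_states)
p_start = np.ones(num_states) / num_states
start_pseudo = 2.0 * np.ones(num_states)                              # pseudocounts > 1 give a MAP estimate
trans_pseudo = 2.0 * np.ones((num_states, num_states))
emit_pseudo = 2.0 * np.ones((num_states, num_emissions))
# model.fit(data, p_transitions, p_emissions, p_start,
#           start_pseudo, trans_pseudo, emit_pseudo, verbose=True)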
def test_numpy_rvs_shape_compatibility(self):
    np.random.seed(2846)
    alpha = np.array([1.0, 2.0, 3.0])
    x = np.random.dirichlet(alpha, size=7)
    assert_equal(x.shape, (7, 3))
    assert_raises(ValueError, dirichlet.pdf, x, alpha)
    assert_raises(ValueError, dirichlet.logpdf, x, alpha)
    dirichlet.pdf(x.T, alpha)
    dirichlet.pdf(x.T[:-1], alpha)
    dirichlet.logpdf(x.T, alpha)
    dirichlet.logpdf(x.T[:-1], alpha)
def log_likelihood(cls, x, params=None, nat_params=None):
    # Compute P( x | Ѳ; α )
    assert (params is None) ^ (nat_params is None)
    (alpha, ) = params if params is not None else cls.natToStandard(*nat_params)

    if isinstance(x, tuple):
        assert len(x) == 1
        x, = x

    assert isinstance(x, np.ndarray)
    if x.ndim == 2:
        return sum([dirichlet.logpdf(_x, alpha=alpha) for _x in x])

    assert isinstance(x, np.ndarray) and x.ndim == 1
    return dirichlet.logpdf(x, alpha=alpha)
def log_prior(self):
    lp = 0
    for k in range(self.nb_states):
        alpha = self.prior['alpha'] * np.ones(self.nb_states) \
                + self.prior['kappa'] * (np.arange(self.nb_states) == k)
        lp += dirichlet.logpdf(self.matrix[k], alpha)
    return lp
def logprob(phi_c, phi_parent, root, alpha):
    x = mkarray(phi_c.values())
    if phi_parent == root:
        a = np.ones_like(x)
    else:
        a = alpha * mkarray(phi_parent.values())
    return dirichlet.logpdf(x, a)
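# `mkarray` is not shown above; a minimal sketch of what it presumably does (collect a dict's
# values into a float array). This is an assumption for illustration, not the original helper.
import numpy as np

def mkarray(values):
    return np.asarray(list(values), dtype=float)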
def calculate_nav_log_joint(self):
    log_joint = 0
    for nav_topic_id in range(self.num_nav_topics):
        log_joint += calculate_multivariate_normal_logpdf(
            self.nav_topic_means[-1][nav_topic_id],
            self.nav_topic_mean_prior_means[nav_topic_id],
            CovarianceMatrix.scaled(self.nav_topic_covariances[-1][nav_topic_id],
                                    (1 / self.nav_topic_mean_prior_kappa)))
        log_joint += invwishart.logpdf(
            self.nav_topic_covariances[-1][nav_topic_id].matrix,
            self.nav_topic_covariance_prior_dof,
            self.nav_topic_covariance_prior_scale)
    for article_id in range(len(self.training_data.articles)):
        log_joint += dirichlet.logpdf(
            self.nav_article_proportions[-1][article_id],
            self.nav_article_topic_proportions_prior_alpha)
        for article_nav_id, nav_id in enumerate(self.training_data.article_navs[article_id]):
            nav_topic_assignment = self.nav_article_nav_assignments[-1][article_id][article_nav_id]
            log_joint += np.log(
                self.nav_article_proportions[-1][article_id][nav_topic_assignment])
            log_joint += self.calculate_topic_nav_logprob(nav_topic_assignment, nav_id)
    return log_joint
def propose(self):
    if len(self.value) == 1:
        return copy(self), 0.0  # handle singleton rules

    v = numpy.random.dirichlet(self.value * self.proposal_scale)
    # add a tiny bit of smoothing away from 0/1
    v = (1.0 - DirichletDistribution.SMOOTHING) * v + DirichletDistribution.SMOOTHING / 2.0
    # and renormalize it (both slightly breaking MCMC)
    v = v / sum(v)

    ret = copy(self)
    ret.set_value(v)

    fb = dirichlet.logpdf(ret.value, self.value * self.proposal_scale) - \
         dirichlet.logpdf(self.value, ret.value * self.proposal_scale)
    return ret, fb
def _log_joint(self, x, Z):
    temp = (dirichlet.logpdf(self.pi, self.alpha)
            + np.sum(np.log(self.pi[Z]))
            + np.sum(norm.logpdf(self.beta, 0, self.sigma))
            + np.sum(norm.logpdf(self.rho, 0, self.sigma)))
    loc = np.array([
        self.beta[Z[v], :] + self.epsilon_up * np.sum(self.rho[Z[v], Z[self.graph[v]], :], axis=0)
        for v in range(self.n_cells)
    ])
    return temp + np.sum(norm.logpdf(x, loc, self.S * np.ones(loc.shape)))
def likelihood(position):
    pars = np.copy(position)
    # pars = 10 ** pars
    print(pars)
    try:
        cost = np.sum([dirichlet.logpdf(sample, pars) for sample in data_3states])
        print('cost', cost)
    except ValueError as e:
        print(e)
        cost = -np.inf
    return cost
def sampleproposal(B, D, K):
    # sample pis from Dirichlet(0.1, ..., 0.1)
    pi_alpha = 0.1
    pi_pr = np.random.dirichlet(pi_alpha * np.ones(K), size=B)
    logpidensity = dirichlet.logpdf(np.transpose(pi_pr), pi_alpha * np.ones(K))
    # sample mus from normal(0, 25)
    mu_std = 5
    mu_pr = mu_std * np.random.randn(B, K, D)
    logmudensity = np.sum(norm.logpdf(mu_pr, scale=mu_std), axis=(1, 2))
    # sample sigmas from lognormal(0, 1)
    sigma_pr = np.exp(np.random.randn(B, K, D))
    logsigmadensity = np.sum(lognorm.logpdf(sigma_pr, s=1), axis=(1, 2))

    logpropdensity = logpidensity + logmudensity + logsigmadensity
    logpipriordensity = dirichlet.logpdf(np.transpose(pi_pr), np.ones(K))
    logpriordensity = (logpipriordensity
                       + np.sum(norm.logpdf(mu_pr, scale=1), axis=(1, 2))
                       + logsigmadensity)
    return pi_pr, mu_pr, sigma_pr, logpriordensity, logpropdensity
def semisupervisedEM(self, Xl, Yl, Xu, tol=1e-3):
    # semi-supervised training by EM
    # stack labelled + unlabelled inputs
    X = np.row_stack([Xl, Xu])
    Yl = np.squeeze(Yl)

    # init model (post) with labelled dataset
    self.train_supervised(Xl, Yl)
    self.N = X.shape[0]  # update n labelled from train_supervised

    # init responsibility
    rl = self.r
    ru = self.predict(Xu)  # unlabelled data
    r = np.row_stack([rl, ru])

    self.ll = []  # joint-log-likelihood

    # EM iterations
    while (np.size(self.ll) < 5
           or abs(sum(self.ll[-1] - self.ll[-5:-1])) > tol):
        # M-step
        for k in range(self.K):  # update for each class
            # clusters
            self.base[k].M_step(X, r[:, k])  # update posterior params
            # dirichlet
            nk = np.sum(r[:, k])
            self.pi_map[k] = ((nk + self.alpha[k] - 1)
                              / (self.N + np.sum(self.alpha) - self.K))

        # E-step
        ru = self.predict(Xu)  # update resp. for unlabelled
        r = np.row_stack([rl, ru])

        # log-lik unlabelled
        ll_ul = np.sum(self.lpx)
        # log-lik of base params
        lpth_D = np.array([
            self.base[k].post_logpdf(self.base[k].mu_map, self.base[k].Sig_map)
            for k in range(self.K)
        ]).sum()
        # mixing props
        self.alpha_n = np.sum(r, 0) + self.alpha  # posterior alpha
        lpi_D = dirichlet.logpdf(self.pi_map, self.alpha_n)

        # track log-lik of the joint of the model
        self.ll = np.append(self.ll, ll_ul + self.lpX + lpth_D + lpi_D)
        print('log-joint-likelihood:' + '%.4f' % self.ll[-1])

    # store the final responsibility and likelihood
    self.r = r  # resp. for whole semisupervised set
    # for Xu
    self.r_ul = ru
    self.lpx_ul = self.lpx
def calculate_ne_log_joint(self):
    log_joint = 0
    for ne_topic_id in range(self.num_ne_topics):
        log_joint += dirichlet.logpdf(
            self.ne_topic_proportions[-1][ne_topic_id],
            self.ne_topic_vocab_prior_alpha)
    for article_id in range(len(self.training_data.articles)):
        log_joint += dirichlet.logpdf(
            self.ne_article_proportions[-1][article_id],
            self.ne_article_topic_proportions_prior_alpha)
        for article_ne_id, ne_id in enumerate(self.training_data.article_nes[article_id]):
            ne_topic_assignment = self.ne_article_ne_assignments[-1][article_id][article_ne_id]
            log_joint += np.log(
                self.ne_article_proportions[-1][article_id][ne_topic_assignment])
            log_joint += np.log(
                self.ne_topic_proportions[-1][ne_topic_assignment][ne_id])
    return log_joint
def test_frozen_dirichlet():
    np.random.seed(2846)
    n = np.random.randint(1, 32)
    alpha = np.random.uniform(10e-10, 100, n)
    d = dirichlet(alpha)

    assert_equal(d.var(), dirichlet.var(alpha))
    assert_equal(d.mean(), dirichlet.mean(alpha))
    assert_equal(d.entropy(), dirichlet.entropy(alpha))

    num_tests = 10
    for i in range(num_tests):
        x = np.random.uniform(10e-10, 100, n)
        x /= np.sum(x)
        assert_equal(d.pdf(x[:-1]), dirichlet.pdf(x[:-1], alpha))
        assert_equal(d.logpdf(x[:-1]), dirichlet.logpdf(x[:-1], alpha))
def EM(self, X, tol=1e-3):
    # train unsupervised via MAP EM
    self.N = X.shape[0]

    # init mixing props
    self.pi_map = self.alpha / np.sum(self.alpha)
    # init posterior for each cluster (offsetting the prior)
    m = X[np.random.choice(self.N, self.K, replace=False), :]  # data as mu
    [self.base[k].post_init(m[k, :]) for k in range(self.K)]

    # init responsibility
    r = self.predict(X)

    self.lml = []  # log-marg-lik
    self.ll = []   # log-joint-lik

    # EM iterations
    while (np.size(self.lml) < 5
           or abs(sum(self.lml[-1] - self.lml[-5:-1])) > tol):
        # M-step
        for k in range(self.K):  # update for each class
            # clusters
            self.base[k].M_step(X, r[:, k])  # update posterior params.
            # dirichlet
            nk = np.sum(r[:, k])
            self.pi_map[k] = ((nk + self.alpha[k] - 1)
                              / (self.N + np.sum(self.alpha) - self.K))

        # E-step
        r = self.predict(X)  # update responsibility

        # log-lik of base params
        lpth_D = np.array([
            self.base[k].post_logpdf(self.base[k].mu_map, self.base[k].Sig_map)
            for k in range(self.K)
        ]).sum()
        # mixing props
        self.alpha_n = np.sum(r, 0) + self.alpha  # posterior alpha
        lpi_D = dirichlet.logpdf(self.pi_map, self.alpha_n)

        # append lml/ll
        self.lml = np.append(self.lml, np.sum(self.lpx))
        self.ll = np.append(self.ll, np.sum(self.lpx) + lpth_D + lpi_D)
        print('log-marginal-likelihood:' + '%.4f' % self.lml[-1])

    # store the final responsibility
    self.r = r
def test_log_pdf_with_broadcast(self, dtype, a, a_is_samples, rv, rv_is_samples, num_samples):
    # Add sample dimension if variable is not samples
    a_mx = mx.nd.array(a, dtype=dtype)
    if not a_is_samples:
        a_mx = add_sample_dimension(mx.nd, a_mx)
    a = a_mx.asnumpy()

    rv_mx = mx.nd.array(rv, dtype=dtype)
    if not rv_is_samples:
        rv_mx = add_sample_dimension(mx.nd, rv_mx)
    rv = rv_mx.asnumpy()

    is_samples_any = a_is_samples or rv_is_samples
    rv_shape = rv.shape[1:]

    n_dim = 1 + len(rv.shape) if is_samples_any and not rv_is_samples else len(rv.shape)
    a_np = np.broadcast_to(a, (num_samples, 3, 2))
    rv_np = numpy_array_reshape(rv, is_samples_any, n_dim)

    # Initialize rand_gen
    rand = np.random.rand(num_samples, *rv_shape)
    rand_gen = MockMXNetRandomGenerator(mx.nd.array(rand.flatten(), dtype=dtype))

    # Calculate correct Dirichlet logpdf
    r = []
    for s in range(len(rv_np)):
        a = []
        for i in range(len(rv_np[s])):
            a.append(scipy_dirichlet.logpdf(rv_np[s][i] / sum(rv_np[s][i]), a_np[s][i]))
        r.append(a)
    log_pdf_np = np.array(r)

    dirichlet = Dirichlet.define_variable(alpha=Variable(), shape=rv_shape, dtype=dtype,
                                          rand_gen=rand_gen).factor
    variables = {dirichlet.alpha.uuid: a_mx, dirichlet.random_variable.uuid: rv_mx}
    log_pdf_rt = dirichlet.log_pdf(F=mx.nd, variables=variables)

    assert np.issubdtype(log_pdf_rt.dtype, dtype)
    assert array_has_samples(mx.nd, log_pdf_rt) == is_samples_any
    if is_samples_any:
        assert get_num_samples(mx.nd, log_pdf_rt) == num_samples, \
            (get_num_samples(mx.nd, log_pdf_rt), num_samples)
    assert np.allclose(log_pdf_np, log_pdf_rt.asnumpy())
def score_words(train_tokens):
    words = set()
    all_counts = {}
    for label, docs in train_tokens.items():
        counts = defaultdict(int)
        for d in docs:
            for t in d:
                counts[t] += 1
                words.add(t)
        all_counts[label] = counts

    # d filter
    word_score = []
    for w in words:
        get_count = lambda d: d.get(w, 0)
        x = normalize(np.array([get_count(d) for d in all_counts.values()]))
        score = dirichlet.logpdf(normalize(x), np.ones(len(all_counts)) * 2)
        word_score.append((w, score))
    return word_score
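# `normalize` is not defined in the snippet above; a plausible minimal sketch (L1 normalization
# of a count vector), given as an assumption rather than the original helper.
import numpy as np

def normalize(x):
    x = np.asarray(x, dtype=float)
    total = x.sum()
    return x / total if total > 0 else x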
def _calc_expectation(Mu, P, V, Gamma, A, Alpha, W):
    """Calculates the conditional expectation in the E-step of the EM-Algorithm,
    given the observations and the current estimates of the classifier.

    Parameters
    ----------
    Mu : numpy.ndarray, shape (n_samples, n_classes)
        Mu[i,k] contains the probability of a sample X[i] to be of class classes_[k]
        estimated according to the EM-algorithm.
    V : numpy.ndarray, shape (n_samples, n_classes)
        Describes an intermediate result.
    P : numpy.ndarray, shape (n_samples, n_classes)
        P[i,k] contains the probabilities of sample X[i] belonging to class classes_[k],
        as estimated by the classifier (i.e., sigmoid(W.T, X[i])).

    Returns
    -------
    expectation : float
        The conditional expectation.
    """
    # Evaluate prior of weight vectors.
    all_zeroes = not np.any(Gamma)
    Gamma = Gamma if all_zeroes else np.linalg.inv(Gamma)
    prior_W = np.sum([
        multi_normal.logpdf(x=W[:, k], cov=Gamma, allow_singular=True)
        for k in range(W.shape[1])
    ])

    # Evaluate prior of alpha matrices.
    prior_Alpha = np.sum([[
        dirichlet.logpdf(x=Alpha[j, k, :], alpha=A[j, k, :])
        for k in range(Alpha.shape[1])
    ] for j in range(Alpha.shape[0])])

    # Evaluate log-likelihood for data.
    log_likelihood = np.sum(Mu * np.log(P * V + np.finfo(float).eps))

    expectation = log_likelihood + prior_W + prior_Alpha
    return expectation
def sample_delta(self, nodepair, eventtime, ite):
    a_delta = 0.1
    b_delta = 0.1
    a_taus = 0.1
    b_taus = 0.1

    delta_old = self.Delta_pis
    delta_new = delta_old + norm.rvs() * ((np.sqrt(ite + 1)) ** (-1))
    taus_old = self.Taus_kij
    taus_new = taus_old + norm.rvs() * ((np.sqrt(ite + 1)) ** (-1))

    if delta_new > 0:
        ll_old = 0
        ll_new = 0
        for tt in range(len(eventtime)):
            i_parameter_contribute_from_J_old = np.zeros(self.KK)
            i_parameter_contribute_from_J_new = np.zeros(self.KK)
            if len(self.receiving_j_list[tt]) > 0:
                for nn in range(len(self.receiving_j_list[tt])):
                    exp_time_current_s_old = np.exp(
                        -delta_old * (eventtime[tt] - self.receiving_j_list[tt][nn][2]))
                    val_1_old = self.betas[self.receiving_j_list[tt][nn][0], (nodepair[tt, 0])] \
                        * exp_time_current_s_old \
                        * (self.pis_list[self.receiving_j_list[tt][nn][0]][self.receiving_j_list[tt][nn][1]])
                    i_parameter_contribute_from_J_old += val_1_old

                    exp_time_current_s_new = np.exp(
                        -delta_new * (eventtime[tt] - self.receiving_j_list[tt][nn][2]))
                    val_1_new = self.betas[self.receiving_j_list[tt][nn][0], (nodepair[tt, 0])] \
                        * exp_time_current_s_new \
                        * (self.pis_list[self.receiving_j_list[tt][nn][0]][self.receiving_j_list[tt][nn][1]])
                    i_parameter_contribute_from_J_new += val_1_new

            i_parameter_contribute_from_prei = self.betas[(nodepair[tt, 0]), (nodepair[tt, 0])] \
                * (self.pis_list[(nodepair[tt, 0])][self.sender_receiver_num[tt][0]])
            psi_i_s_old = i_parameter_contribute_from_J_old + i_parameter_contribute_from_prei
            psi_i_s_new = i_parameter_contribute_from_J_new + i_parameter_contribute_from_prei
            ll_old += dirichlet.logpdf(
                self.pis_list[(nodepair[tt, 0])][self.sender_receiver_num[tt][0] + 1], psi_i_s_old)
            ll_new += dirichlet.logpdf(
                self.pis_list[(nodepair[tt, 0])][self.sender_receiver_num[tt][0] + 1], psi_i_s_new)

        ll_old += gamma.logpdf(delta_old, a=a_delta, scale=b_delta)
        ll_new += gamma.logpdf(delta_new, a=a_delta, scale=b_delta)
        if np.log(np.random.rand()) < (ll_new - ll_old):
            self.Delta_pis = delta_new

    if taus_new > 0:
        ll_old = 0
        ll_new = 0
        judge = np.where(self.b_ij > 0)[0]
        b_nonzero = self.b_ij[judge]
        receiving_nozero1 = [self.mutually_exciting_pair[judge_i] for judge_i in judge]
        receiving_time = [eventtime[receiving_nozero1[it][b_nonzero[it] - 1]]
                          for it in range(len(b_nonzero))]

        ll_old += np.sum(-taus_old * (eventtime[judge] - receiving_time))
        ll_new += np.sum(-taus_new * (eventtime[judge] - receiving_time))
        ll_old -= self.alpha * np.sum(
            (taus_old ** (-1)) * (1 - np.exp(-taus_old * (eventtime[-1] - eventtime))))
        ll_new -= self.alpha * np.sum(
            (taus_new ** (-1)) * (1 - np.exp(-taus_new * (eventtime[-1] - eventtime))))
        ll_old += gamma.logpdf(taus_old, a=a_taus, scale=b_taus)
        ll_new += gamma.logpdf(taus_new, a=a_taus, scale=b_taus)
        if np.log(np.random.rand()) < (ll_new - ll_old):
            self.Taus_kij = taus_new
def _learn_global_mixture_weights(alpha, multinomials, val_data, num_em_iter=100, tol=0.001):
    """
    Learning the mixing weights for a mixture of multinomials. Each observation is considered
    as a data point and the mixing weights (\pi) are learned using all the points.

    NOTE: In order for the algorithm to work, there can be no location that gets 0 probability
    from both the mem_mult and the mf_mult. In my runs, I use MPE to estimate the mf_mult while
    using MLE for the mem_mult. That way the mf_mult has no 0 values.

    INPUT:
    -------
        1. alpha:        <float / (2, ) ndarray> Dirichlet prior for the pi learning. If <float> is given
                         it is treated as a flat prior. Has to be bigger than 1.
        2. multinomials: list[<(U, C) ndarray>] each row is the multinomial parameter according to the "self" data
        3. val_data:     <(N, 3) ndarray> each row is [ind_id, loc_id, counts]
        4. num_em_iter:  <int> number of em iterations
        5. tol:          <float> convergence threshold

    OUTPUT:
    --------
        1. pi: <(N, ) ndarray> mixing weights.
        2. log likelihood reached.

    RAISE:
    -------
        1. ValueError:
            a. alphas are not bigger than 1
            b. the multinomial's rows don't sum to 1
            c. there is a location with both mults 0 (see NOTE)
    """
    num_comp = len(multinomials)
    if np.any(alpha <= 1):
        raise ValueError('alpha values have to be bigger than 1')
    for i, mult in enumerate(multinomials):
        if np.any(np.abs(np.sum(mult, axis=1) - 1) > 0.001):
            raise ValueError('component %d param is not a proper multinomial -- all rows must sum to 1' % i)

    if type(alpha) == float or type(alpha) == int:
        alpha = np.ones(num_comp) * alpha * 1.

    # Creating responsibility matrix and initializing it with hard assignment at random
    log_like_tracker = [-np.inf]
    pi = np.ones(num_comp) / num_comp
    start = time.time()
    em_iter = 0
    for em_iter in range(1, num_em_iter + 1):
        # Every other iteration we compute the posterior log probability to see if we converged.
        if em_iter % 2 == 0:
            event_prob = _data_prob(pi, multinomials, val_data)
            event_prob = np.sum(event_prob, axis=0)  # prob

            # The data likelihood was computed for each location, but it should be in the power of the
            # number of observations there, or a product in the log space.
            data_likelihood = np.log(np.array(event_prob)) * val_data[:, 2]
            prior_probability = dirichlet.logpdf(pi, alpha=alpha)
            log_likelihood = np.sum(data_likelihood + prior_probability) / np.sum(val_data[:, 2])

            if np.abs(log_likelihood - log_like_tracker[-1]) < tol:
                log.debug('[iter %d] [Reached convergence.]' % em_iter)
                break

            log.debug('[iter %d] [Likelihood: [%.4f -> %.4f]]' % (em_iter, log_like_tracker[-1], log_likelihood))
            log_like_tracker.append(log_likelihood)

        # E-Step
        resp = _data_prob(pi, multinomials, val_data)
        if np.all(resp == 0):
            raise ValueError('0 mix probability')
        resp = np.array(resp).T
        resp = normalize(resp, 'l1', axis=1)
        resp = np.multiply(resp, val_data[:, 2][:, np.newaxis])

        # M-Step. Only on the \pi with Dirichlet prior alpha > 1
        pi = np.sum(resp, axis=0)
        pi += alpha - 1
        pi /= np.sum(pi)

    total_time = time.time() - start
    log.debug('Finished EM. Total time = %d secs -- %.3f per iteration' % (total_time, total_time / em_iter))

    data_log_like = _data_prob(pi, multinomials, val_data)
    data_log_like = np.sum(data_log_like, axis=0)
    ll = np.sum(np.log(np.array(data_log_like)) * val_data[:, 2]) / np.sum(val_data[:, 2])
    return pi, ll
def lda_inference(doc, lda_model, adagrad=True):
    S = 10  # samples
    converged = 100.0
    rho = 1e-4  # learning rate
    if adagrad:
        epsilon = 1e-6  # fudge factor
        g_phi = np.zeros([doc.length, lda_model.num_topics])
        g_var_gamma = np.zeros([lda_model.num_topics])

    # variational parameters
    phi = np.ones([doc.length, lda_model.num_topics]) / lda_model.num_topics  # N * k matrix
    var_gamma = np.ones([lda_model.num_topics]) * lda_model.alpha \
        + doc.total / float(lda_model.num_topics)

    likelihood_old = 0
    var_ite = 0
    while (converged > 1e-3 and var_ite < 1e3):
        var_ite += 1

        # sample S theta
        sample_theta = np.random.dirichlet(var_gamma, S)
        # sample S z for each word n
        sample_zs = np.zeros([doc.length, S], dtype=np.int32)
        for n in range(doc.length):
            sample_z = np.random.multinomial(1, phi[n, :], S)  # S * k matrix
            which_j = np.argmax(sample_z, 1)  # S length vector
            sample_zs[n, :] = which_j

        # compute gamma gradient
        dig = digamma(var_gamma)
        var_gamma_sum = np.sum(var_gamma)
        digsum = digamma(var_gamma_sum)
        ln_theta = np.log(sample_theta)  # S * k matrix
        dqdg = ln_theta - dig + digsum   # S * k matrix
        ln_p_theta = dirichlet.logpdf(np.transpose(sample_theta),
                                      [lda_model.alpha] * lda_model.num_topics)  # S length vector
        ln_q_theta = dirichlet.logpdf(np.transpose(sample_theta), var_gamma)     # S length vector

        # explicitly evaluate expectation
        # E_p_z = np.sum(ln_theta * np.sum(phi, 0), 1)  # S length vector
        # monte-carlo estimated expectation
        E_p_z = np.zeros(S)  # S length vector
        for sample_id in range(S):
            cur_ln_theta = ln_theta[sample_id, :]
            sampled_ln_theta = []
            for n in range(doc.length):
                which_j = sample_zs[n, :]
                sampled_ln_theta += list(cur_ln_theta[which_j])  # (doc.counts[n] * list(cur_ln_theta[which_j]))
            E_p_z[sample_id] = np.average(sampled_ln_theta)

        grad_gamma = np.average(dqdg * np.reshape(ln_p_theta - ln_q_theta + E_p_z, (S, 1)), 0)

        # update gamma
        if adagrad:
            g_var_gamma += grad_gamma ** 2
            grad_gamma = grad_gamma / (np.sqrt(g_var_gamma) + epsilon)
        var_gamma = var_gamma + rho * grad_gamma

        # for phi: resample from updated gamma
        # (for explicit evaluation of the expectation, recompute dig/digsum from var_gamma instead)
        sample_theta = np.random.dirichlet(var_gamma, S)
        ln_theta = np.log(sample_theta)  # S * k matrix

        for n in range(doc.length):
            # compute phi gradient
            which_j = sample_zs[n, :]
            dqdphi = 1 / phi[n][which_j]                              # S length vector
            ln_p_w = lda_model.log_prob_w[which_j][:, doc.words[n]]   # S length vector
            ln_q_phi = np.log(phi[n][which_j])                        # S length vector

            # explicitly evaluate expectation
            # E_p_z_theta = dig[which_j] - digsum  # S length vector
            # monte-carlo estimated expectation
            E_p_z_theta = np.zeros(S)  # S length vector
            for sample_id in range(S):
                cur_ln_theta = ln_theta[sample_id, :]
                E_p_z_theta += cur_ln_theta[which_j]
            E_p_z_theta = E_p_z_theta / S

            grad_phi = doc.counts[n] * dqdphi * (ln_p_w - ln_q_phi + E_p_z_theta)

            # update phi
            for i, j in enumerate(which_j):
                if adagrad:
                    g_phi[n][j] += grad_phi[i] ** 2
                    grad_phi[i] = grad_phi[i] / (np.sqrt(g_phi[n][j]) + epsilon)
                phi[n][j] = phi[n][j] + rho * grad_phi[i]
                if phi[n][j] < 0:  # bound phi
                    phi[n][j] = 0
            phi[n] /= np.sum(phi[n])  # normalization

        # compute likelihood
        likelihood = compute_likelihood(doc, lda_model, phi, var_gamma)
        assert (not isnan(likelihood))
        converged = abs((likelihood_old - likelihood) / likelihood_old)
        likelihood_old = likelihood

    return likelihood
def compute_prior(self):
    return dirichlet.logpdf(self.value, self.alpha)
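# Minimal usage sketch for the scipy.stats.dirichlet.logpdf call above; values are illustrative.
# The point must lie on the simplex (non-negative, summing to 1) and alpha must be strictly positive.
import numpy as np
from scipy.stats import dirichlet

alpha = np.array([2.0, 3.0, 5.0])
value = np.random.dirichlet(alpha)          # a point on the simplex
log_prior = dirichlet.logpdf(value, alpha)  # scalar log density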
def test_alpha_correct_depth(self):
    alpha = np.array([1.0, 2.0, 3.0])
    x = np.ones((3, 7)) / 3
    dirichlet.pdf(x, alpha)
    dirichlet.logpdf(x, alpha)
def _learn_mix_mult(alpha, mem_mult, mf_mult, val_data, num_em_iter=100, tol=0.00001):
    """
    Learning the mixing weights for a mixture of two multinomials. Each observation is considered
    as a data point and the mixing weights (\pi) are learned using all the points.

    NOTE: In order for the algorithm to work, there can be no location that gets 0 probability
    from both the mem_mult and the mf_mult. In my runs, I use MPE to estimate the mf_mult while
    using MLE for the mem_mult. That way the mf_mult has no 0 values.

    INPUT:
    -------
        1. alpha:       <float / (2, ) ndarray> Dirichlet prior for the pi learning. If <float> is given
                        it is treated as a flat prior. Has to be bigger than 1.
        2. mem_mult:    <(I, L) ndarray> each row is the multinomial parameter according to the "self" data
        3. mf_mult:     <(I, L) ndarray> each row is the multinomial parameter according to the matrix factorization
        4. val_data:    <(N, 3) ndarray> each row is [ind_id, loc_id, counts]
        5. num_em_iter: <int> number of em iterations
        6. tol:         <float> convergence threshold

    OUTPUT:
    --------
        1. pi: <(2, ) ndarray> mixing weights.

    RAISE:
    -------
        1. ValueError:
            a. alphas are not bigger than 1
            b. the multinomial's rows don't sum to 1
            c. there is a location with both mults 0 (see NOTE)
    """
    if np.any(alpha <= 1):
        raise ValueError('alpha values have to be bigger than 1')
    if np.any(np.abs(np.sum(mem_mult, axis=1) - 1) > 0.001):
        raise ValueError('mem_mult param is not a multinomial -- all rows must sum to 1')
    if np.any(np.abs(np.sum(mf_mult, axis=1) - 1) > 0.001):
        raise ValueError('mf_mult param is not a multinomial -- all rows must sum to 1')

    if type(alpha) == float or type(alpha) == int:
        alpha = np.array([alpha, alpha])

    # Creating responsibility matrix and initializing it with hard assignment at random
    log_like_tracker = [-np.inf]
    pi = np.array([0.5, 0.5])
    start = time.time()
    for em_iter in range(1, num_em_iter + 1):
        # Every 5 iterations we compute the posterior log probability to see if we converged.
        if em_iter % 5 == 0:
            data_log_like = pi[0] * mem_mult[val_data[:, 0].astype(int), val_data[:, 1].astype(int)] + \
                            pi[1] * mf_mult[val_data[:, 0].astype(int), val_data[:, 1].astype(int)]

            # The data likelihood was computed for each location, but it should be in the power of the
            # number of observations there, or a product in the log space.
            data_likelihood = np.log(data_log_like) * val_data[:, 2]
            prior_probability = dirch.logpdf(pi, alpha=alpha)
            log_likelihood = np.mean(data_likelihood + prior_probability)

            if np.abs(log_likelihood - log_like_tracker[-1]) < tol:
                break
            log_like_tracker.append(log_likelihood)

        # E-Step
        resp = [pi[0] * mem_mult[val_data[:, 0].astype(int), val_data[:, 1].astype(int)],
                pi[1] * mf_mult[val_data[:, 0].astype(int), val_data[:, 1].astype(int)]]
        if np.all(resp == 0):
            raise ValueError('0 mix probability')
        resp = np.array(resp).T
        resp = normalize_mat_row(resp)

        # M-Step. Only on the \pi with Dirichlet prior alpha > 1
        pi = np.sum(resp * col_vector(val_data[:, 2]), axis=0)
        pi += alpha - 1
        pi /= np.sum(pi)

    total_time = time.time() - start
    log.debug('Finished EM. Total time = %d secs -- %.3f per iteration' % (total_time, total_time / em_iter))
    return pi
def compute_marg_likelihood_and_NSE_galaxies(y, iters, init, hypers):
    '''
    Compute the marginal likelihood from the Gibbs Sampler output according to Chib (1995).

    y: (array-like) endogenous variables
    iters: (int) length of the MCMC
    init: (array-like) initialisation parameters
    hypers: (array-like) hyper-parameters

    returns: (float, float) the log marginal likelihood/normalizing constant and its numerical standard error
    '''
    # Initialisation
    d = init['d']
    mu_params, sigma_square_params, q_params = init['mu_params'], init['sigma_square_params'], init['q_params']

    mu, sigma_square, q, mu_hat, B, n_for_estim_sigma, delta, n_for_estim_q = GibbsSampler_galaxies(
        y, iters, init, hypers)

    mu_star = np.array(mu).mean(axis=0)
    sigma_square_star = np.array(sigma_square).mean(axis=0)
    q_star = np.array(q).mean(axis=0)

    ## Marginal likelihood computation (P7, right column)
    # First term: log likelihood at the posterior means
    y_given_mu_and_sigma2_stars_pdf = np.stack([
        norm.pdf(x=y, loc=mu_star[i], scale=sigma_square_star[i]) for i in range(d)
    ])[:, :, 0].T
    log_like = np.log((q_star * y_given_mu_and_sigma2_stars_pdf).sum(axis=1)).sum()

    # Second term: log prior at the posterior means
    mu_prior = multivariate_normal.logpdf(x=mu_star, mean=mu_params[0], cov=mu_params[1]).sum()  # sum because of the use of logpdf instead of pdf
    sigma_square_prior = invgamma.logpdf(x=sigma_square_star, a=sigma_square_params[0],
                                         scale=np.sqrt(sigma_square_params[1])).sum()
    q_square_prior = dirichlet.logpdf(x=q_star, alpha=q_params).sum()
    log_prior = mu_prior + sigma_square_prior + q_square_prior

    # Third term: log posterior ordinate estimated from the Gibbs output
    conditional_densities_mu = np.array([
        np.prod(multivariate_normal.pdf(x=mu_star, mean=mu_hat[i], cov=B[i])) for i in range(iters)
    ])
    conditional_densities_sigma = np.array([
        np.prod(invgamma.pdf(x=sigma_square_star,
                             a=(sigma_square_params[0] + n_for_estim_sigma[i]) / 2,
                             scale=(sigma_square_params[1] + delta[i]) / 2)) for i in range(iters)
    ])
    conditional_densities_q = np.array([
        dirichlet.pdf(x=q_star, alpha=q_params + n_for_estim_q[i]) for i in range(iters)
    ])
    conditional_densities = conditional_densities_mu * conditional_densities_sigma * conditional_densities_q
    log_posterior = np.log(conditional_densities.mean())

    log_marg_likelihood = log_like + log_prior - log_posterior

    # Numerical Standard Error computation
    h = np.array([conditional_densities_mu, conditional_densities_sigma, conditional_densities_q])
    h_hat = np.array([
        np.mean(conditional_densities_mu),
        np.mean(conditional_densities_sigma),
        np.mean(conditional_densities_q)
    ])
    var = compute_var_h_hat(h, h_hat)
    NSE = np.dot(np.dot((1 / h_hat).reshape(1, -1), var), (1 / h_hat).reshape(-1, 1))[0, 0]

    return log_marg_likelihood, NSE
def prior_probs(param, val):
    if param == "pi":
        return dirichlet.logpdf(val, alpha=prior_pi)
    elif param == "rates":
        return dirichlet.logpdf(val, alpha=prior_er)
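# Minimal usage sketch for prior_probs() above. The snippet assumes module-level concentration
# vectors `prior_pi` and `prior_er`; the sizes and flat values below are illustrative only.
import numpy as np
from scipy.stats import dirichlet

prior_pi = np.ones(4)   # illustrative flat Dirichlet prior
prior_er = np.ones(6)   # illustrative flat Dirichlet prior
pi_val = np.random.dirichlet(prior_pi)
print(prior_probs("pi", pi_val))  # log density of pi_val under Dirichlet(prior_pi)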