def stochastic_lda(corpus, batches=None, lambda_=None, ordering=False, S=1,
                   num_topics=10, max_iter=300, tau=1, kappa=0.5, alpha=0.5,
                   eta=0.001, threshold=0.000001):
    '''Stochastic Variational Inference EM algorithm for LDA.
    (from algorithm 2 in Blei 2010)

    corpus is a matrix of counts of shape (docs, voca).

    Args:
        batches: to set a specific ordering of mini-batches over the corpus;
            when None, mini-batches are drawn with get_samples.
        lambda_: to set a specific lambda (num_topics, V) for the
            initialization; copied, never mutated in place.
        ordering: unused here; kept for interface compatibility.
        S: size of the mini-batches.
        num_topics: number of latent topics.
        max_iter: number of mini-batch steps when `batches` is not given.
        tau, kappa: learning-rate schedule, rho_t = (tau + t)**(-kappa).
        alpha: prior on the per-document topic proportions (passed to e_step).
        eta: prior on the topic-word distributions.
        threshold: convergence threshold for the local E-step.

    Returns:
        (lambda_, gamma_d_k): topic-word variational parameters
        (num_topics, V) and per-document topic parameters (C, num_topics).
    '''
    C, V = corpus.shape

    # Initialisation.
    # BUGFIX: test identity against None rather than `not np.any(lambda_)`,
    # which would wrongly discard a caller-supplied all-zero lambda_.
    if lambda_ is None:
        lambda_ = np.random.gamma(100, 1. / 100, size=(num_topics, V))
    else:
        lambda_ = lambda_.copy()
    gamma_d_k = np.ones((C, num_topics))

    # Sampling of the mini-batches, unless an explicit schedule was given
    # (same None-sentinel fix as for lambda_ above).
    if batches is None:
        batches = get_samples(C, S, max_iter)

    # `range` instead of Python-2-only `xrange`: identical iteration,
    # works on both Python 2 and 3.
    for t in range(len(batches)):
        # #### E-step: local variational updates for each document
        # in the current mini-batch.
        lambda_int = np.zeros((num_topics, V))
        for d in batches[t]:
            gamma_d_k, lambda_int = e_step(d, corpus, gamma_d_k, lambda_,
                                           lambda_int, alpha, threshold)

        # #### M-step: noisy natural-gradient step with decaying rate rho.
        rho = (tau + t) ** (-kappa)
        # Only update columns for words actually seen in this mini-batch.
        indices = np.unique(np.nonzero(corpus[batches[t], :])[1])
        # Rescale the mini-batch statistic to the full corpus (C / S).
        lambda_int = eta + C / (1. * S) * lambda_int
        lambda_[:, indices] = ((1 - rho) * lambda_[:, indices]
                               + rho * lambda_int[:, indices])
    return lambda_, gamma_d_k
def batch_lda(corpus, lambda_=None, num_topics=10, num_iter=10, alpha=0.5,
              eta=0.001, threshold=0.000001):
    '''Batch Variational Inference EM algorithm for LDA; goes over all
    the data at each iteration. (from algorithm 1 in Blei 2010)

    corpus is a matrix of counts of shape (docs, voca).

    Args:
        lambda_: to set a specific lambda (num_topics, V) for the
            initialization; copied, never mutated in place.
        num_topics: number of latent topics.
        num_iter: maximum number of full passes over the corpus.
        alpha: prior on the per-document topic proportions (passed to e_step).
        eta: prior on the topic-word distributions.
        threshold: convergence threshold, both for the local E-step and for
            the relative change of lambda_ between iterations.

    Returns:
        (lambda_, gamma_d_k): topic-word variational parameters
        (num_topics, V) and per-document topic parameters (C, num_topics).
    '''
    C, V = corpus.shape

    # Initialisation.
    # BUGFIX: test identity against None rather than `not np.any(lambda_)`,
    # which would wrongly discard a caller-supplied all-zero lambda_.
    if lambda_ is None:
        lambda_ = np.random.gamma(100, 1. / 100, size=(num_topics, V))
    else:
        lambda_ = lambda_.copy()
    gamma_d_k = np.ones((C, num_topics))

    # BUGFIX: materialize the range as a list before shuffling; on
    # Python 3 `range(C)` is immutable and np.random.shuffle would fail.
    sample = list(range(C))
    np.random.shuffle(sample)

    # `range` instead of Python-2-only `xrange`: identical iteration.
    for t in range(num_iter):
        # lambda_ is rebound (not mutated) below, so keeping a reference
        # is enough to compare against the previous iterate.
        old_lambda_ = lambda_

        # #### E-step: local variational updates over every document.
        lambda_int = np.zeros((num_topics, V))
        for d in sample:
            gamma_d_k, lambda_int = e_step(d, corpus, gamma_d_k, lambda_,
                                           lambda_int, alpha, threshold)

        # #### M-step: closed-form global update.
        lambda_ = eta + lambda_int

        # Stop when the mean relative change of lambda_ falls below threshold.
        if np.mean(np.abs((lambda_ - old_lambda_) / old_lambda_)) < threshold:
            break
    return lambda_, gamma_d_k