def fit(self, documents):
    '''
    Implement a parallel version of CGS over the documents.

    Each thread stores a local copy of sum_K and topics to avoid write
    conflicts; at a fixed synchronization period the global sum_K/topics
    are reconciled with each thread's local copies.

    An additional feature is setting split_words = True. This splits the
    vocabulary V into num_threads regions and provides locking mechanisms
    to update topics. It avoids storing the local topics and synchronizing
    when the corpus is very large, but the extra locking may hinder speedups.
    '''
    # topic -> word distribution
    topics = np.zeros((self.num_topics, documents.shape[1]), dtype=np.float64)
    # document -> topic distribution
    gamma = np.zeros((documents.shape[0], self.num_topics), dtype=np.float64)
    # number of tokens assigned to each topic
    sum_K = np.zeros(self.num_topics, dtype=np.int32)
    # sampling distributions
    sampling = np.zeros(
        (documents.shape[0], documents.shape[1], np.max(documents)),
        dtype=np.int32)

    self._pCGS(documents, topics, gamma, sum_K, sampling)

    assert np.sum(sum_K) == np.sum(documents), \
        "sum_K not synced: {}, {}".format(np.sum(sum_K), np.sum(documents))
    assert np.sum(topics) == np.sum(documents), \
        "topics not synced: {}, {}".format(np.sum(topics), np.sum(documents))

    self.topics = topics
    self.gamma = gamma
    self.sum_K = sum_K
    # Compute the perplexity of the trained model on the training data
    self.perplexity_train = Evaluation._log_likelihood(self, gamma, documents)
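# ----------------------------------------------------------------------
# Illustrative sketch (not part of this module): one way to implement the
# reconciliation described in the docstring above, where each worker
# thread periodically folds its local copies of topics/sum_K back into
# the globals. All names here (_reconcile, snapshot_*, lock) are
# hypothetical; the actual worker logic lives in self._pCGS.
def _reconcile(global_topics, global_sum_K,
               local_topics, local_sum_K,
               snapshot_topics, snapshot_sum_K, lock):
    """Fold one thread's local count deltas into the global counts.

    The local arrays hold (global state at last sync) + (this thread's
    updates); the snapshots remember that last-sync global state, so
    local - snapshot isolates just this thread's contribution.
    """
    with lock:
        global_topics += local_topics - snapshot_topics
        global_sum_K += local_sum_K - snapshot_sum_K
        # Restart the local copies and snapshots from the fresh globals
        local_topics[:] = global_topics
        local_sum_K[:] = global_sum_K
        snapshot_topics[:] = global_topics
        snapshot_sum_K[:] = global_sum_K
# Typical call site: each thread runs _reconcile(...) once per
# synchronization period, all threads sharing a single threading.Lock().
# ----------------------------------------------------------------------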
def fit(self, dtm):
    '''
    Parallel version of online variational LDA: the intermediate topic
    statistics are computed in parallel over the documents of each
    mini-batch, then merged in a single M-step.
    '''
    # Initialisation
    num_docs, num_words = dtm.shape
    topics = np.random.gamma(100., 1. / 100., (self.num_topics, num_words))
    gamma = np.ones((num_docs, self.num_topics))
    ExpELogBeta = np.zeros((self.num_topics, num_words))
    # per-thread accumulators for the intermediate topic statistics
    topics_int = np.zeros((self.num_threads, self.num_topics, num_words))

    num_batch = num_docs // self.batch_size  # integer division for range()
    batches = np.array_split(np.arange(num_docs, dtype=np.int32), num_batch)

    for it_batch in range(num_batch):
        ovi_cython.exp_digamma2d(topics, ExpELogBeta)
        docs_thread = np.array_split(batches[it_batch], self.num_threads)

        # E-step: one thread per shard of the mini-batch
        threads = [None] * self.num_threads
        for tid in range(self.num_threads):
            threads[tid] = threading.Thread(
                target=self._worker_estep,
                args=(docs_thread[tid], dtm, topics_int[tid, :, :],
                      gamma, ExpELogBeta))
            threads[tid].start()
        for thread in threads:
            thread.join()

        # Synchronize the per-thread statistics
        topics_int_tot = np.sum(topics_int, axis=0)
        # Reset the accumulators for the next batch
        topics_int[:, :, :] = 0

        # M-step
        indices = (np.sum(dtm[batches[it_batch], :], axis=0) > 0).astype(
            np.int32)
        ovi_cython.m_step(topics, topics_int_tot, indices, num_docs,
                          self.batch_size, self.tau, self.kappa, it_batch)

    self.topics = topics
    self.gamma = gamma
    # Compute the perplexity of the trained model on the training data
    self.perplexity_train = Evaluation._log_likelihood(self, gamma, dtm)
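# ----------------------------------------------------------------------
# Illustrative sketch (not part of this module): a pure-NumPy version of
# the per-thread E-step that _worker_estep performs via ovi_cython,
# following the online variational update of Hoffman et al. (2010).
# The signature mirrors the threading.Thread call above, but alpha,
# max_iters and tol are assumed hyperparameters (the real class likely
# stores them on self), and the exact sufficient-statistics convention
# of the Cython kernel is a guess.
import numpy as np
from scipy.special import digamma


def _worker_estep_sketch(doc_ids, dtm, topics_int_local, gamma, ExpELogBeta,
                         alpha=0.1, max_iters=100, tol=1e-4):
    for d in doc_ids:
        ids = np.nonzero(dtm[d, :])[0]        # word types present in doc d
        cts = dtm[d, ids].astype(np.float64)  # their counts
        expElogbeta_d = ExpELogBeta[:, ids]   # K x n_d slice of exp(E[log beta])
        gamma_d = np.ones(gamma.shape[1])
        # Fixed-point iteration on the document's variational parameters
        for _ in range(max_iters):
            last = gamma_d
            expElogtheta_d = np.exp(digamma(gamma_d) - digamma(gamma_d.sum()))
            phi_norm = expElogtheta_d @ expElogbeta_d + 1e-100
            gamma_d = alpha + expElogtheta_d * (expElogbeta_d @ (cts / phi_norm))
            if np.mean(np.abs(gamma_d - last)) < tol:
                break
        gamma[d, :] = gamma_d
        # Accumulate this document's sufficient statistics for the M-step
        expElogtheta_d = np.exp(digamma(gamma_d) - digamma(gamma_d.sum()))
        phi_norm = expElogtheta_d @ expElogbeta_d + 1e-100
        topics_int_local[:, ids] += (np.outer(expElogtheta_d, cts / phi_norm)
                                     * expElogbeta_d)
# Each thread writes only its own topics_int slice and disjoint rows of
# gamma, so no locking is needed during the E-step.
# ----------------------------------------------------------------------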