Example no. 1
File: cgs.py Project: virgodi/plda
    def fit(self, documents):
        '''
            Implement a parallel version of CGS over documents.
            Each thread keeps a local copy of sum_K and topics to avoid
            write conflicts, and a synchronization period controls how often
            the global sum_K/topics are reconciled with each thread's local
            copies.
            An additional option is split_words = True: it splits V into
            num_threads regions and uses locks to guard updates to topics.
            This avoids storing and synchronizing local copies of topics
            when the corpus is very large, although the extra locking may
            hinder speedups.
        '''
        # topic -> words distribution
        topics = np.zeros((self.num_topics, documents.shape[1]), dtype=np.float64)
        # documents -> topic distribution
        gamma = np.zeros((documents.shape[0], self.num_topics), dtype=np.float64)
        # sum of types per topic
        sum_K = np.zeros((self.num_topics), dtype=np.dtype("i"))
        # sampling distributions
        sampling = np.zeros(
            (documents.shape[0], documents.shape[1], np.max(documents)), dtype=np.dtype("i"))

        self._pCGS(documents, topics, gamma, sum_K, sampling)
        assert np.sum(sum_K) == np.sum(documents), "Sum_K not synced: {}, {}".format(
            np.sum(sum_K), np.sum(documents))
        assert np.sum(topics) == np.sum(documents), "topics not synced: {}, {}".format(
            np.sum(topics), np.sum(documents))

        self.topics = topics
        self.gamma = gamma
        self.sum_K = sum_K

        # Compute the perplexity of the trained model on the train data
        self.perplexity_train = Evaluation._log_likelihood(self, gamma, documents)
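
self._pCGS itself is not reproduced in this listing. As a minimal, self-contained sketch of the reconciliation scheme the docstring describes (not plda's actual sampler), each thread below accumulates count deltas in private buffers and folds them into the shared sum_K/topics under a lock at a fixed synchronization period; the topic assignment is stubbed with a uniform draw, and worker, reconcile, and sync_every are illustrative names:

import threading
import numpy as np

def reconcile(local_topics, local_sum_K, topics, sum_K, lock):
    # Fold this thread's accumulated count deltas into the shared arrays.
    with lock:
        topics += local_topics
        sum_K += local_sum_K
    local_topics[:] = 0
    local_sum_K[:] = 0

def worker(doc_ids, documents, topics, sum_K, lock, num_topics, rng,
           sync_every=10):
    # Private delta buffers: sampling only ever touches these.
    local_topics = np.zeros_like(topics)
    local_sum_K = np.zeros_like(sum_K)
    for i, d in enumerate(doc_ids, start=1):
        for w in np.nonzero(documents[d])[0]:
            k = rng.integers(num_topics)  # stand-in for the real CGS draw
            local_topics[k, w] += documents[d, w]
            local_sum_K[k] += documents[d, w]
        if i % sync_every == 0:  # synchronization period
            reconcile(local_topics, local_sum_K, topics, sum_K, lock)
    reconcile(local_topics, local_sum_K, topics, sum_K, lock)

rng = np.random.default_rng(0)
documents = rng.integers(0, 3, size=(40, 25))
topics = np.zeros((5, 25))
sum_K = np.zeros(5, dtype=np.dtype("i"))
lock = threading.Lock()
threads = [threading.Thread(target=worker,
                            args=(split, documents, topics, sum_K, lock, 5,
                                  np.random.default_rng(tid)))
           for tid, split in enumerate(np.array_split(np.arange(40), 4))]
for t in threads:
    t.start()
for t in threads:
    t.join()
assert np.sum(sum_K) == np.sum(documents)
assert np.sum(topics) == np.sum(documents)

The final reconcile flushes any leftover deltas, which is exactly what the post-fit assertions on np.sum(sum_K) and np.sum(topics) check.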
Example no. 2
File: ovi.py Project: virgodi/plda
    def fit(self, dtm):
        '''
        Parallel version of the LDA fit: the temporary topic statistics
        are computed in parallel for the documents inside each mini-batch.
        '''
        # Initialisation
        num_docs, num_words = dtm.shape
        topics = np.random.gamma(100., 1./100., (self.num_topics, num_words))
        gamma = np.ones((num_docs, self.num_topics))
        ExpELogBeta = np.zeros((self.num_topics, num_words))
        topics_int = np.zeros((self.num_threads, self.num_topics, num_words))

        num_batch = num_docs // self.batch_size
        batches = np.array_split(
            np.arange(num_docs, dtype=np.int32), num_batch)

        for it_batch in range(num_batch):
            ovi_cython.exp_digamma2d(topics, ExpELogBeta)

            docs_thread = np.array_split(batches[it_batch], self.num_threads)

            # vector of threads
            threads = [None]*self.num_threads

            for tid in range(self.num_threads):
                threads[tid] = threading.Thread(target=self._worker_estep,
                                                args=(docs_thread[tid], dtm,
                                                      topics_int[tid, :, :],
                                                      gamma, ExpELogBeta))
                threads[tid].start()

            for thread in threads:
                thread.join()

            # Synchronize the per-thread topics_int buffers
            topics_int_tot = np.sum(topics_int, axis=0)
            # Reset the per-thread buffers for the next batch
            topics_int[:, :, :] = 0
            # M-step
            indices = (np.sum(dtm[batches[it_batch], :], axis=0) > 0).astype(
                np.int32)
            ovi_cython.m_step(topics, topics_int_tot, indices, num_docs,
                              self.batch_size, self.tau, self.kappa, it_batch)

        self.topics = topics
        self.gamma = gamma

        # Compute the perplexity of the trained model on the train data
        self.perplexity_train = Evaluation._log_likelihood(self, gamma, dtm)
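
The per-thread E-step, self._worker_estep, is likewise not shown. Under the standard online variational Bayes treatment of LDA (Hoffman et al.), it would plausibly run per-document coordinate ascent on gamma and accumulate topic sufficient statistics into the thread's private topics_int slice, which is what lets the batch end with a lock-free np.sum over threads. A sketch under that assumption; alpha, the fixed iteration count, and the function name are illustrative rather than plda's API:

import numpy as np
from scipy.special import psi  # digamma

def worker_estep(doc_ids, dtm, topics_int, gamma, ExpELogBeta,
                 alpha=0.1, iters=50):
    num_topics = ExpELogBeta.shape[0]
    for d in doc_ids:
        ids = np.nonzero(dtm[d])[0]      # word types present in document d
        cts = dtm[d, ids]                # their counts
        gamma_d = np.ones(num_topics)
        for _ in range(iters):
            expElogtheta = np.exp(psi(gamma_d) - psi(gamma_d.sum()))
            phinorm = expElogtheta @ ExpELogBeta[:, ids] + 1e-100
            gamma_d = alpha + expElogtheta * (
                ExpELogBeta[:, ids] @ (cts / phinorm))
        gamma[d] = gamma_d
        # Sufficient statistics for the M-step, private to this thread.
        topics_int[:, ids] += (np.outer(expElogtheta, cts / phinorm)
                               * ExpELogBeta[:, ids])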
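
ovi_cython.m_step is compiled and also not shown. In the same online variational Bayes scheme, the corresponding M-step blends a corpus-rescaled mini-batch estimate into topics with a decaying learning rate rho_t = (tau + t)^(-kappa). A NumPy sketch taking the same arguments as the call above (eta, the topic Dirichlet prior, is an added assumption):

import numpy as np

def m_step_numpy(topics, topics_int_tot, indices, num_docs,
                 batch_size, tau, kappa, it_batch, eta=0.01):
    # Decaying learning rate; 0.5 < kappa <= 1 guarantees convergence.
    rho = (tau + it_batch) ** (-kappa)
    # Rescale the mini-batch statistics as if the whole corpus were seen.
    topics_hat = eta + (num_docs / batch_size) * topics_int_tot
    # Blend, restricted to words that occurred in this mini-batch.
    cols = indices.astype(bool)
    topics[:, cols] = (1.0 - rho) * topics[:, cols] + rho * topics_hat[:, cols]

Restricting the update to the columns flagged by indices mirrors the mask computed from the mini-batch just before the call in fit.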