Ejemplo n.º 1
0
    def train(self, n_iterations=100, verbose=1, **kwargs):
        """
        Takes an optional argument, `n_iterations` and updates the model
        `n_iterations` times.

        :param n_iterations: Number of iterations. Default is 100.
        :type n_iterations: int, optional

        :param verbose: If 1, a progress bar tracks the iterations; if 2,
            per-iteration log probability and timing are printed instead.
            Default is 1.
        :type verbose: int, optional
        
        :param kwargs: For compatability with calls to LdaCgsMulti.
        :type kwargs: optional
        """
        # NOTE(review): this RandomState object is constructed and seeded but
        # never used below -- the raw `self._mtrand_state` tuple is what is
        # actually passed to cgs_update.  Its only observable effect here is
        # that set_state() raises if the stored state tuple is malformed;
        # confirm whether that validation is intentional before removing.
        random_state = np.random.RandomState(self.seed)
        random_state.set_state(self._mtrand_state)


        if verbose > 0:
            print ('Begin LDA training for {0} iterations'\
                   .format(n_iterations))
            start = time.time()
            t = start

        # Training loop
        # `self.iteration` is the model's absolute iteration counter and
        # persists across train() calls, so the loop runs over the absolute
        # range [self.iteration, stop).
        stop = self.iteration + n_iterations
        # The progress bar is created and finished regardless of `verbose`,
        # but only advanced when verbose == 1.
        pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=n_iterations).start()
        #print("Stop ", stop)
        for itr in xrange(self.iteration , stop):

            # One collapsed Gibbs sampling sweep.  The five trailing
            # arguments are the components of the saved numpy RNG state.
            results = cgs_update(self.iteration, self.corpus, self.word_top,
                                 self.inv_top_sums, self.top_doc, self.Z,
                                 self.indices, self._mtrand_state[0],
                                 self._mtrand_state[1], self._mtrand_state[2],
                                 self._mtrand_state[3], self._mtrand_state[4])

            # results[4] is this sweep's log probability; record it against
            # the absolute iteration number.
            lp = results[4]
            self.log_probs.append((self.iteration, lp))

            if verbose == 2:
                itr_time = np.around(time.time()-t, decimals=1)
                t = time.time()
                # NOTE(review): this inner test is redundant -- we only reach
                # it when verbose == 2, so `verbose > 1` is always true and
                # the `itr==stop-1` alternative can never decide the outcome.
                if verbose > 1 or itr==stop-1:
                    print ('\nIteration {0} complete: log_prob={1}, time={2}'
                           .format(self.iteration, lp, itr_time))

            if verbose == 1:
                #print("Self iteration", self.iteration)
                # Progress is measured relative to the start of this train()
                # call, not the model's absolute iteration count.
                pbar.update(self.iteration - (stop - n_iterations))
                # Brief pause, presumably to give the progress bar a chance
                # to redraw -- it does add ~0.01 s per iteration.
                time.sleep(0.01)

            self.iteration += 1

            # results[5:] is the updated RNG state; carrying it over lets the
            # next sweep (or a later train() call) resume the same stream.
            self._mtrand_state = results[5:]

        pbar.finish();
        if verbose > 1:
            # `t` holds the timestamp of the last verbose==2 measurement and
            # `start`/`t` are only defined when verbose > 0.
            print '-'*60, ('\n\nWalltime per iteration: {0} seconds'
                           .format(np.around((t-start)/n_iterations, decimals=2)))
Ejemplo n.º 2
0
def update(args):
    """
    Perform one collapsed Gibbs sampling sweep over a slice of documents.

    For LdaCgsMulti: runs in a worker process, reading the shared-memory
    module globals (``_corpus``, ``_Z``, ``_word_top``, ``_inv_top_sums``,
    ``_top_doc``, ``_V``, ``_K``, ``_iteration``) set up by the parent.

    :param args: A single ``(docs, doc_indices, mtrand_state)`` tuple.  Kept
        as one packed argument (rather than three parameters) so existing
        callers that pass a tuple -- e.g. through ``Pool.map`` -- keep
        working; this replaces the Python-2-only tuple-parameter syntax.
        ``docs`` is a sequence of ``(start, stop)`` token spans, one per
        document; ``doc_indices`` is the ``(first, last)`` column range of
        this worker's documents in the global topic/document matrix;
        ``mtrand_state`` is the 5-component numpy RNG state.
    :returns: ``(Z, top_doc, word_top_delta, log_p, mtrand_str, mtrand_keys,
        mtrand_pos, mtrand_has_gauss, mtrand_cached_gaussian)``.
    """
    docs, doc_indices, mtrand_state = args

    # Token span covered by this worker's documents.
    start, stop = docs[0][0], docs[-1][1]

    # Build local views/copies of the shared-memory arrays.  Z, the word/topic
    # matrix, inv_top_sums and top_doc are copied so the shared buffers are
    # not written directly -- presumably cgs_update mutates its arguments in
    # place; confirm against its implementation.
    corpus = np.frombuffer(_corpus, dtype='i')[start:stop]
    Z = np.frombuffer(_Z, dtype='i')[start:stop].copy()

    gbl_word_top = np.frombuffer(_word_top, dtype='d')
    gbl_word_top = gbl_word_top.reshape(_V.value, _K.value)
    loc_word_top = gbl_word_top.copy()
    inv_top_sums = np.frombuffer(_inv_top_sums, dtype='d').copy()

    top_doc = np.frombuffer(_top_doc, dtype='d')
    # Floor division so the reshape dimension is an int under both Python 2
    # and Python 3 division semantics.
    top_doc = top_doc.reshape(_K.value, top_doc.size // _K.value)
    top_doc = top_doc[:, doc_indices[0]:doc_indices[1]].copy()

    # (The original computed log_wk/log_kc and initialized log_p here; all
    # three were unused -- log_p is taken from `results` below -- so the two
    # full-array np.log passes have been removed.)

    # Document end offsets re-based to this worker's token slice.
    indices = np.array([(j - start) for (i, j) in docs], dtype='i')

    results = cgs_update(_iteration.value,
                         corpus,
                         loc_word_top,
                         inv_top_sums,
                         top_doc,
                         Z,
                         indices,
                         mtrand_state[0],
                         mtrand_state[1],
                         mtrand_state[2],
                         mtrand_state[3],
                         mtrand_state[4])

    (loc_word_top, inv_top_sums, top_doc, Z, log_p, mtrand_str, mtrand_keys,
     mtrand_pos, mtrand_has_gauss, mtrand_cached_gaussian) = results

    # Return the word/topic matrix as a *delta* against the shared global
    # copy rather than as absolute counts.
    loc_word_top -= gbl_word_top

    return (Z, top_doc, loc_word_top, log_p,
            mtrand_str, mtrand_keys, mtrand_pos,
            mtrand_has_gauss, mtrand_cached_gaussian)