コード例 #1
0
ファイル: lda.py プロジェクト: gitter-badger/pymake
    def sample_heldout(self):
        """Resample the topic assignment of every held-out token once.

        Documents are visited in a random order and, inside each document,
        token positions are visited in a random order.  For each token the
        current assignment is removed from ``doc_topic_counts_t``, a new topic
        is drawn with ``categorical`` from the log-normalized weights, and the
        count matrix and ``z_t`` are updated in place.

        Returns
        -------
        The updated ``self.z_t``.
        """
        n_docs = self.data_t.shape[0]
        n_topics = self.get_K()

        # Alpha prior: use `logalpha` when the instance defines it, otherwise
        # fall back to `log_alpha_beta` without its last entry.
        if hasattr(self, 'logalpha'):
            alpha = np.exp(self.logalpha)
        else:
            alpha = np.exp(self.log_alpha_beta[:-1])

        for rank, doc in enumerate(np.random.permutation(n_docs)):
            n_tokens = self.data_t[doc].sum()
            lgg.debug('%d \t %d \t %d' % (rank, n_tokens, n_topics))

            for pos in np.random.permutation(n_tokens):
                # Take the token out of the counts before computing weights.
                old_topic = self.z_t[doc][pos]
                self.doc_topic_counts_t[doc, old_topic] -= 1

                log_prior = np.log(self.doc_topic_counts_t[doc] + alpha)
                # NOTE(review): the likelihood is looked up at the *old* topic,
                # so (if `_phi` is 2-D) it is one scalar added to every entry
                # -- confirm this is the intended weighting.
                log_lik = np.log(self._phi[self.data_t_w[doc][pos], old_topic])
                weights = lognormalize((log_prior + log_lik)[:n_topics])

                new_topic = categorical(weights)
                self.z_t[doc][pos] = new_topic
                self.doc_topic_counts_t[doc, new_topic] += 1

        return self.z_t
コード例 #2
0
ファイル: mmsb.py プロジェクト: dtrckd/ml
    def sample(self):
        """Run one sweep over the pairs yielded by the likelihood.

        A pair of classes ``(k_j, k_i)`` is drawn for every ``(j, i)`` returned
        by ``self.likelihood.data_iter(randomize=True)``.  During the sweep the
        count matrices carry one extra trailing slot standing for a new topic;
        when that slot is drawn ``self._K`` is incremented and the matrices are
        grown again.  The extra slot is sliced off at the end and empty topics
        are purged.

        Returns
        -------
        The updated ``self.z``.
        """
        # Add the "new topic" container.
        self._update_log_alpha_beta()
        self.update_matrix_shape()

        lgg.debug('Sample z...')
        lgg.trace('#J \t #I \t  #topic')
        for j, i in self.likelihood.data_iter(randomize=True):
            lgg.trace('%d \t %d \t %d' %
                      (j, i, self.doc_topic_counts.shape[1] - 1))

            flat_draw = categorical(self.prob_zji(j, i, self._K + 1))

            # The draw indexes a flattened (K+1) x (K+1) grid of class pairs.
            grid = (self._K + 1, self._K + 1)
            rows, cols = np.unravel_index(flat_draw, grid)
            # unravel_index gives back sequences here; keep the first entry.
            k_j = rows[0]
            k_i = cols[0]
            self.z[j, i, 0] = k_j
            self.z[j, i, 1] = k_i

            # Grow the matrices when either side picked the trailing slot.
            drew_new = (k_j == self.doc_topic_counts.shape[1] - 1
                        or k_i == self.doc_topic_counts.shape[1] - 1)
            if drew_new:
                self._K += 1
                self.update_matrix_shape(new_topic=True)

            self.update_matrix_count(j, i, k_j, k_i)

        # Remove the "new topic" container.
        self.doc_topic_counts = self.doc_topic_counts[:, :-1]
        trimmed = self.likelihood.word_topic_counts[:, :-1, :-1]
        self.likelihood.word_topic_counts = trimmed
        self.purge_empty_topics()

        return self.z
コード例 #3
0
    def sample(self):
        """Run one sweep over all tokens, allowing a new topic to be created.

        A zero column is appended to ``doc_topic_counts`` and to the
        likelihood's ``word_topic_counts`` as a slot for a new topic.  Documents
        and token positions are visited in random order; each token gets a
        topic drawn from ``prob_zji``.  If the trailing slot is drawn,
        ``self._K`` is incremented and a fresh zero column is appended.  At the
        end the trailing slot is sliced off and empty topics are purged.

        Returns
        -------
        The updated ``self.z``.
        """
        # Add the "new topic" container.
        self._update_log_alpha_beta()
        lik = self.likelihood
        self.doc_topic_counts = np.column_stack(
            (self.doc_topic_counts, np.zeros(self.J, dtype=int)))
        lik.word_topic_counts = np.column_stack(
            (lik.word_topic_counts, np.zeros(lik.nfeat, dtype=int)))
        lik.total_w_k = lik.word_topic_counts.sum(0)

        lgg.info('Sample z...')
        lgg.debug('#Doc \t nnz \t  #topic')
        for rank, j in enumerate(np.random.permutation(self.J)):
            n_tokens = self.data_dims[j]
            lgg.debug('%d \t %d \t %d' %
                      (rank, n_tokens, self.doc_topic_counts.shape[1] - 1))

            for i in np.random.permutation(n_tokens):
                topic = categorical(self.prob_zji(j, i, self._K + 1))
                self.z[j][i] = topic

                # The trailing slot was drawn: promote it to a real topic and
                # append a new empty slot behind it.
                if topic == self.doc_topic_counts.shape[1] - 1:
                    self._K += 1
                    doc_pad = np.zeros((self.J, 1), dtype=int)
                    word_pad = np.zeros((lik.nfeat, 1), dtype=int)
                    self.doc_topic_counts = np.hstack(
                        (self.doc_topic_counts, doc_pad))
                    lik.word_topic_counts = np.hstack(
                        (lik.word_topic_counts, word_pad))
                    lik.total_w_k = lik.word_topic_counts.sum(0)

                # Register the new assignment in every count structure.
                self.doc_topic_counts[j, topic] += 1
                lik.word_topic_counts[lik.data[j][i], topic] += 1
                lik.total_w_k[topic] += 1

        # Remove the "new topic" container.
        self.doc_topic_counts = self.doc_topic_counts[:, :-1]
        lik.word_topic_counts = lik.word_topic_counts[:, :-1]
        self.purge_empty_topics()

        return self.z
コード例 #4
0
    def sample(self):
        """Run one sweep over the pairs yielded by the likelihood.

        For every ``(j, i)`` returned by
        ``self.likelihood.data_iter(randomize=True)`` a flat index is drawn
        from ``prob_zji`` and unravelled on a ``_K x _K`` grid into the class
        pair ``(k_j, k_i)``, which is stored in ``self.z`` and passed to
        ``update_matrix_count``.

        Returns
        -------
        The updated ``self.z``.
        """
        lgg.debug('Sample z...')
        lgg.vdebug('#J \t #I \t #topic')
        for j, i in self.likelihood.data_iter(randomize=True):
            lgg.vdebug('%d \t %d \t %d' % (j, i, self.doc_topic_counts.shape[1] - 1))
            # NOTE(review): `self.K` is used here while the grid below uses
            # `self._K` -- presumably the same value; confirm.
            params = self.prob_zji(j, i, self.K)
            sample_topic_raveled = categorical(params)
            k_j, k_i = np.unravel_index(sample_topic_raveled, (self._K, self._K))
            # unravel_index gives back sequences here; keep the first entry.
            k_j, k_i = k_j[0], k_i[0]
            self.z[j, i, 0] = k_j
            self.z[j, i, 1] = k_i

            self.update_matrix_count(j, i, k_j, k_i)
        return self.z
コード例 #5
0
ファイル: lda.py プロジェクト: gitter-badger/pymake
    def sample(self):
        """Run one sweep over all tokens with a fixed number of topics.

        Documents and token positions are visited in random order.  Each token
        receives a topic drawn from ``prob_zji(j, i, self.K)``; the assignment
        is written to ``self.z`` and added to the document/topic, word/topic
        and per-topic total counts.

        Returns
        -------
        The updated ``self.z``.
        """
        print('Sample z...')
        lgg.debug('#Doc \t #nnz\t #Topic')
        lik = self.likelihood
        for rank, j in enumerate(np.random.permutation(self.J)):
            n_tokens = self.data_dims[j]
            lgg.debug('%d \t %d \t %d' % (rank, n_tokens, self.K))

            for i in np.random.permutation(n_tokens):
                topic = categorical(self.prob_zji(j, i, self.K))
                self.z[j][i] = topic

                # Register the new assignment in every count structure.
                self.doc_topic_counts[j, topic] += 1
                word = lik.data[j][i]
                lik.word_topic_counts[word, topic] += 1
                lik.total_w_k[topic] += 1

        return self.z
コード例 #6
0
    def sample(self):
        """Resample ``self.m`` for every ``(j, k)`` cell of ``count_k_by_j``.

        Cells whose count is positive get ``categorical(prob_jk(j, k)) + 1``;
        all other cells are set to 0.  Afterwards the column sums are stored in
        ``self.m_dotk`` and ``purge_empty_tables`` is called.

        Returns
        -------
        The updated ``self.m``.
        """
        self._update_m()

        cells = np.ndenumerate(self.count_k_by_j)

        lgg.debug('Sample m...')
        for (j, k), count in cells:
            # Number of tables in j serving dish k: at least one as soon as
            # the cell has a positive count, none otherwise.
            self.m[j, k] = categorical(self.prob_jk(j, k)) + 1 if count > 0 else 0

        self.m_dotk = self.m.sum(0)
        self.purge_empty_tables()

        return self.m
コード例 #7
0
ファイル: generate.py プロジェクト: dtrckd/ml
    def pvalue(self, _type='global'):
        """Fill the goodness-of-fit table of degree distributions and print it.

        Similar to the Zipf (burstiness) analysis, but instead of plotting,
        the values returned by ``gofit`` are stored in the table returned by
        ``init_fit_tables``, one entry per measure in ``Meas``.  When the
        current experiment is the last one
        (``self._it == self.expe_size - 1``), every table in
        ``self.gramexp.tables`` is printed as "mean +/- std" over its last
        axis and, if ``expe._write`` is set, written with ``write_frames``.

        Nothing is done when ``self.model`` is None.

        Parameters
        ----------
        _type : str
            'global': fit the degree histogram of each sample in ``self._Y``.
            'local': fit the degree distribution restricted to each class
            (only pairs with ``k == l`` are processed).
            'feature': not implemented.

        Raises
        ------
        NotImplementedError
            If ``_type`` is 'feature'.
        """
        if self.model is None:
            return
        expe = self.expe

        Y = self._Y
        N = Y[0].shape[0]
        model = self.model

        Table, Meas = self.init_fit_tables(_type, Y)

        self.log.info('using `%s\' burstiness' % _type)

        if _type == 'global':
            ### Global degree
            for it_dat, data in enumerate(Y):
                d, dc = degree_hist(adj_to_degree(data), filter_zeros=True)
                gof = gofit(d, dc)
                if not gof:
                    # No usable fit for this sample; leave its cells untouched.
                    continue

                for i, v in enumerate(Meas):
                    Table[self.corpus_pos, i, it_dat] = gof[v]

        elif _type == 'local':
            ### Z assignment method
            a, b = model.get_params()
            N, K = a.shape
            print('theta shape: %s' % (str((N, K))))
            now = Now()
            if 'mmsb' in expe.model:
                ZZ = []
                for _i, _ in enumerate(Y):
                    #for _ in Y: # Do not reflect real local degree !
                    theta = self._Theta[_i]
                    phi = self._Phi[_i]  # looked up but not used below
                    Z = np.empty((2, N, N))
                    order = np.arange(N**2).reshape((N, N))
                    if expe.symmetric:
                        # Only visit the upper triangle (diagonal included).
                        triu = np.triu_indices(N)
                        order = order[triu]
                    else:
                        order = order.flatten()
                    order = zip(*np.unravel_index(order, (N, N)))

                    # Draw one class per endpoint of every visited pair.
                    for i, j in order:
                        Z[0, i, j] = categorical(theta[i])
                        Z[1, i, j] = categorical(theta[j])
                    # Mirror the upper triangle onto the lower one.
                    Z[0] = np.triu(Z[0]) + np.triu(Z[0], 1).T
                    Z[1] = np.triu(Z[1]) + np.triu(Z[1], 1).T
                    ZZ.append(Z)
                self.log.info('Z formation %s second', nowDiff(now))

            clustering = 'modularity'
            comm = model.communities_analysis(data=Y[0], clustering=clustering)
            # Count the clusters with a positive histogram entry; `len()` of
            # the boolean mask would always return the total number of entries.
            n_active = np.count_nonzero(np.asarray(comm['block_hist']) > 0)
            print('clustering method: %s, active clusters ratio: %f' %
                  (clustering, n_active / K))

            ### Iterate over all classes couple
            if expe.symmetric:
                # Unordered pairs: each (k, l) is collapsed to a set first.
                # NOTE(review): the resulting lists have length 1 or 2, so the
                # input of np.unique is ragged when K > 1 -- confirm this works
                # with the NumPy version in use.
                k_perm = np.unique([
                    list(set(pair))
                    for pair in itertools.product(range(K), repeat=2)
                ])
            else:
                k_perm = itertools.product(range(K), repeat=2)

            for it_k, c in enumerate(k_perm):
                if isinstance(c, (np.int64, np.float64)):
                    k = l = c
                elif len(c) == 2:
                    # Stochastic equivalence (extra class bind)
                    k, l = c
                else:
                    # Communities (intra class bind)
                    k = l = c.pop()
                if k != l:
                    # Only intra-class pairs are fitted.
                    continue

                YY = []
                if 'mmsb' in expe.model:
                    for y, z in zip(Y, ZZ):  # take the len of ZZ if < Y
                        # UNDIRECTED !
                        # Keep only the entries whose sampled classes are (k, l).
                        in_block = (z[0] == k) & (z[1] == l)
                        y_c = y.copy()
                        y_c[~in_block] = 0
                        YY.append(y_c)
                elif 'ilfm' in expe.model:
                    for _i, y in enumerate(Y):
                        theta = self._Theta[_i]
                        if theta.shape[1] <= max(k, l):
                            # Same guard as in `burstiness`: this sample has no
                            # column for class k or l.
                            print('warning: not all block converted.')
                            continue
                        YY.append(
                            (y *
                             np.outer(theta[:, k], theta[:, l])).astype(int))

                d, dc, yerr = random_degree(YY)
                if len(d) == 0:
                    continue
                gof = gofit(d, dc)
                if not gof:
                    continue

                for i, v in enumerate(Meas):
                    Table[self.corpus_pos, i, it_k] = gof[v]

        elif _type == 'feature':
            raise NotImplementedError

        if self._it == self.expe_size - 1:
            for _model, table in self.gramexp.tables.items():

                # Mean and standard deviation over the last axis, as byte strings
                table_mean = np.char.array(np.around(
                    table.mean(2), decimals=3)).astype("|S20")
                table_std = np.char.array(np.around(table.std(2),
                                                    decimals=3)).astype("|S20")
                # Raw literal: same bytes as before, without the invalid
                # `\p` escape sequence.
                table = table_mean + rb' $\pm$ ' + table_std

                # Table formatting
                corpuses = self.specname(self.gramexp.get_set('corpus'))
                table = np.column_stack((self.specname(corpuses), table))
                tablefmt = 'simple'
                table = tabulate(table,
                                 headers=['__' + _model.upper() + '__'] + Meas,
                                 tablefmt=tablefmt,
                                 floatfmt='.3f')
                print()
                print(table)
                if expe._write:
                    if expe._mode == 'predictive':
                        base = '%s_%s_%s' % (self.specname(
                            expe.corpus), self.specname(_model), _type)
                    else:
                        base = '%s_%s_%s' % ('MG', self.specname(_model),
                                             _type)
                    self.write_frames(table, base=base, ext='md')
コード例 #8
0
ファイル: generate.py プロジェクト: dtrckd/ml
    def burstiness(self, _type='all'):
        '''Zipf analysis: plot global and local degree distributions.

        'global' plots the degree distribution returned by ``random_degree``
        for the samples in ``self._Y``.  'local' plots, on one figure, the
        degree distribution restricted to each class (only pairs with
        ``k == l`` are processed).  The figures are written with
        ``write_frames`` when ``expe._write`` is set.

        Nothing is done when ``self.model`` is None.

        Parameters
        ----------
        _type : str
            type of burstiness to compute in ('global', 'local', 'feature', 'all').
            NOTE: 'feature' currently adds no figure; no active code handles it.
        '''
        if self.model is None:
            return
        expe = self.expe
        figs = []

        Y = self._Y
        N = Y[0].shape[0]
        model = self.model

        if _type in ('global', 'all'):
            # Global burstiness
            d, dc, yerr = random_degree(Y)
            plt.figure()
            title = 'global | %s, %s' % (self.specname(
                expe.get('corpus')), self.specname(expe.model))
            plot_degree_2((d, dc, yerr), logscale=True, title=title)

            figs.append(plt.gcf())

        if _type in ('local', 'all'):
            # Local burstiness
            print('Computing Local Preferential attachment')
            a, b = model.get_params()
            N, K = a.shape
            print('theta shape: %s' % (str((N, K))))
            now = Now()
            if 'mmsb' in expe.model:
                ### Z assignment method #
                ZZ = []
                for _i, _ in enumerate(Y):
                    #for _ in Y: # Do not reflect real local degree !

                    theta = self._Theta[_i]
                    phi = self._Phi[_i]  # looked up but not used below
                    Z = np.empty((2, N, N))
                    order = np.arange(N**2).reshape((N, N))
                    if expe.symmetric:
                        # Only visit the upper triangle (diagonal included).
                        triu = np.triu_indices(N)
                        order = order[triu]
                    else:
                        order = order.flatten()
                    order = zip(*np.unravel_index(order, (N, N)))

                    # Draw one class per endpoint of every visited pair.
                    for i, j in order:
                        Z[0, i, j] = categorical(theta[i])
                        Z[1, i, j] = categorical(theta[j])
                    # Mirror the upper triangle onto the lower one.
                    Z[0] = np.triu(Z[0]) + np.triu(Z[0], 1).T
                    Z[1] = np.triu(Z[1]) + np.triu(Z[1], 1).T
                    ZZ.append(Z)
                self.log.info('Z formation %s second', nowDiff(now))

            clustering = 'modularity'
            comm = model.communities_analysis(data=Y[0], clustering=clustering)
            # Count the clusters with a positive histogram entry; `len()` of
            # the boolean mask would always return the total number of entries.
            n_active = np.count_nonzero(np.asarray(comm['block_hist']) > 0)
            print('clustering method: %s, active clusters ratio: %f' %
                  (clustering, n_active / K))

            ### Iterate over all classes couple
            if expe.symmetric:
                # Unordered pairs: each (k, l) is collapsed to a set first.
                # NOTE(review): the resulting lists have length 1 or 2, so the
                # input of np.unique is ragged when K > 1 -- confirm this works
                # with the NumPy version in use.
                k_perm = np.unique([
                    list(set(pair))
                    for pair in itertools.product(range(K), repeat=2)
                ])
            else:
                k_perm = itertools.product(range(K), repeat=2)

            # One shared figure for all the per-class curves.
            plt.figure()
            for c in k_perm:
                if isinstance(c, (np.int64, np.float64)):
                    k = l = c
                elif len(c) == 2:
                    # Stochastic equivalence (outer class)
                    k, l = c
                else:
                    # Communities (inner class)
                    k = l = c.pop()
                if k != l:
                    # Only intra-class pairs are plotted.
                    continue

                YY = []
                if 'mmsb' in expe.model:
                    for y, z in zip(Y, ZZ):  # take the len of ZZ if < Y
                        # UNDIRECTED !
                        # Indicator of the entries whose sampled classes are (k, l).
                        phi_c = np.zeros(y.shape)
                        phi_c[(z[0] == k) & (z[1] == l)] = 1
                        YY.append(y * phi_c)
                elif 'ilfm' in expe.model:  # or Corpus !
                    for _i, y in enumerate(Y):
                        theta = self._Theta[_i]
                        if theta.shape[1] <= max(k, l):
                            # This sample has no column for class k or l.
                            print('warning: not all block converted.')
                            continue
                        YY.append(
                            (y *
                             np.outer(theta[:, k], theta[:, l])).astype(int))

                d, dc, yerr = random_degree(YY)
                if len(d) == 0:
                    continue
                title = 'local | %s, %s' % (self.specname(
                    expe.get('corpus')), self.specname(expe.model))
                plot_degree_2((d, dc, yerr),
                              logscale=True,
                              colors=True,
                              line=True,
                              title=title)
            figs.append(plt.gcf())

        # NOTE: the blockmodel ('feature') analysis is not active; no figure
        # is produced for it.

        if expe._write:
            if expe._mode == 'predictive':
                base = '%s_%s' % (self.specname(
                    expe.corpus), self.specname(expe.model))
            else:
                base = '%s_%s' % ('MG', self.specname(expe.model))
            self.write_frames(figs, base=base)