def sample_heldout(self):
    """Resample the topic assignment of every token in ``self.data_t``.

    Documents are visited in a random order and, inside each document, the
    token positions are visited in a random order too. For each token the
    current assignment is removed from ``self.doc_topic_counts_t``, a new
    topic is drawn with ``categorical`` and the count is added back.
    ``self._phi`` is only read.

    The score of topic ``k`` for a token with word ``w`` in document ``j`` is
    ``log(count[j, k] + alpha[k]) + log(phi[w, k])``. The previous
    implementation added ``log(phi[w, k_ji])`` (a scalar, taken at the
    *current* topic) to every topic; a constant offset cancels in the
    normalisation, so the word never influenced the draw.

    Returns
    -------
    ``self.z_t``, updated in place.
    """
    J = self.data_t.shape[0]
    K = self.get_K()

    # Manage alpha prior: prefer an explicit `logalpha`, otherwise use
    # `log_alpha_beta` without its last entry.
    if hasattr(self, 'logalpha'):
        alpha = np.exp(self.logalpha)
    else:
        alpha = np.exp(self.log_alpha_beta[:-1])

    for doc_iter, j in enumerate(np.random.permutation(J)):
        nnz = self.data_t[j].sum()
        lgg.debug('%d \t %d \t %d' % (doc_iter, nnz, K))

        for i in np.random.permutation(nnz):
            # Take the token out of the counts before scoring the topics.
            k_ji = self.z_t[j][i]
            self.doc_topic_counts_t[j, k_ji] -= 1

            word = self.data_t_w[j][i]
            log_prior = np.log(self.doc_topic_counts_t[j] + alpha)[:K]
            # Per-topic likelihood of the word (row of phi), not a scalar.
            # NOTE(review): assumes `_phi` has at least K topic columns, as
            # the former `_phi[word, k_ji]` indexing suggests -- confirm.
            log_likelihood = np.log(self._phi[word, :K])
            params = lognormalize(log_prior + log_likelihood)

            sample_topic = categorical(params)
            self.z_t[j][i] = sample_topic
            self.doc_topic_counts_t[j, sample_topic] += 1

    return self.z_t
def sample(self):
    """Draw a pair of class assignments for every (j, i) of the data.

    The count matrices are first extended through ``update_matrix_shape()``
    so that the last index stands for a not-yet-used class. Each draw is a
    raveled index over a ``(_K + 1, _K + 1)`` grid; when either coordinate
    hits the last index, ``_K`` is incremented and the matrices are extended
    again. The spare last index is sliced off at the end and
    ``purge_empty_topics()`` is called.

    Returns
    -------
    ``self.z``, updated in place (``z[j, i, 0]`` and ``z[j, i, 1]``).
    """
    # Add pnew container
    self._update_log_alpha_beta()
    self.update_matrix_shape()

    lgg.debug('Sample z...')
    lgg.trace('#J \t #I \t #topic')

    for j, i in self.likelihood.data_iter(randomize=True):
        lgg.trace('%d \t %d \t %d' %
                  (j, i, self.doc_topic_counts.shape[1] - 1))

        params = self.prob_zji(j, i, self._K + 1)
        raveled = categorical(params)

        grid = (self._K + 1, self._K + 1)
        rows, cols = np.unravel_index(raveled, grid)
        # Unwrap the first element of each index (presumably `categorical`
        # returns an array-like -- confirm).
        k_j = rows[0]
        k_i = cols[0]

        self.z[j, i, 0] = k_j
        self.z[j, i, 1] = k_i

        # Regularize matrices when the spare (new) class has been sampled
        if (k_j == self.doc_topic_counts.shape[1] - 1
                or k_i == self.doc_topic_counts.shape[1] - 1):
            self._K += 1
            self.update_matrix_shape(new_topic=True)

        self.update_matrix_count(j, i, k_j, k_i)

    # Remove pnew container
    self.doc_topic_counts = self.doc_topic_counts[:, :-1]
    self.likelihood.word_topic_counts = \
        self.likelihood.word_topic_counts[:, :-1, :-1]

    self.purge_empty_topics()
    return self.z
def sample(self):
    """Draw a topic for every token, growing the topic set on demand.

    A zero column is appended to ``doc_topic_counts`` and to
    ``likelihood.word_topic_counts`` so that the last column stands for a
    not-yet-used topic. Documents and, inside each document, token positions
    are visited in random order. When the last column is sampled, ``_K`` is
    incremented and a fresh zero column is appended. The spare column is
    dropped at the end and ``purge_empty_topics()`` is called.

    Returns
    -------
    ``self.z``, updated in place.
    """
    # Add pnew container
    self._update_log_alpha_beta()
    spare_doc_col = np.zeros(self.J, dtype=int)
    self.doc_topic_counts = np.column_stack(
        (self.doc_topic_counts, spare_doc_col))
    spare_word_col = np.zeros(self.likelihood.nfeat, dtype=int)
    self.likelihood.word_topic_counts = np.column_stack(
        (self.likelihood.word_topic_counts, spare_word_col))
    self.likelihood.total_w_k = self.likelihood.word_topic_counts.sum(0)

    lgg.info('Sample z...')
    lgg.debug('#Doc \t nnz \t #topic')

    for doc_iter, j in enumerate(np.random.permutation(self.J)):
        nnz = self.data_dims[j]
        lgg.debug('%d \t %d \t %d' %
                  (doc_iter, nnz, self.doc_topic_counts.shape[1] - 1))

        for i in np.random.permutation(nnz):
            params = self.prob_zji(j, i, self._K + 1)
            sample_topic = categorical(params)
            self.z[j][i] = sample_topic

            # Regularize matrices when the spare (new) topic is sampled
            if sample_topic == self.doc_topic_counts.shape[1] - 1:
                self._K += 1
                self.doc_topic_counts = np.hstack(
                    (self.doc_topic_counts,
                     np.zeros((self.J, 1), dtype=int)))
                self.likelihood.word_topic_counts = np.hstack(
                    (self.likelihood.word_topic_counts,
                     np.zeros((self.likelihood.nfeat, 1), dtype=int)))
                self.likelihood.total_w_k = \
                    self.likelihood.word_topic_counts.sum(0)

            # Update count matrices
            word = self.likelihood.data[j][i]
            self.doc_topic_counts[j, sample_topic] += 1
            self.likelihood.word_topic_counts[word, sample_topic] += 1
            self.likelihood.total_w_k[sample_topic] += 1

    # Remove pnew container
    # NOTE(review): `likelihood.total_w_k` is not trimmed here and keeps the
    # spare entry until it is recomputed -- confirm this is intended.
    self.doc_topic_counts = self.doc_topic_counts[:, :-1]
    self.likelihood.word_topic_counts = \
        self.likelihood.word_topic_counts[:, :-1]

    self.purge_empty_topics()
    return self.z
def sample(self):
    """Draw a pair of class assignments for every (j, i) of the data.

    For each pair yielded by ``likelihood.data_iter(randomize=True)`` a
    raveled index is drawn with ``categorical`` from ``prob_zji(j, i, K)``,
    unraveled on a ``(_K, _K)`` grid into ``(k_j, k_i)``, stored in
    ``z[j, i, 0]`` / ``z[j, i, 1]`` and passed to ``update_matrix_count``.

    Returns
    -------
    ``self.z``, updated in place.
    """
    lgg.debug('Sample z...')
    lgg.vdebug('#J \t #I \t #topic')

    for j, i in self.likelihood.data_iter(randomize=True):
        lgg.vdebug('%d \t %d \t %d' %
                   (j, i, self.doc_topic_counts.shape[1] - 1))

        params = self.prob_zji(j, i, self.K)
        sample_topic_raveled = categorical(params)

        k_j, k_i = np.unravel_index(sample_topic_raveled,
                                    (self._K, self._K))
        # Unwrap the first element of each index (presumably `categorical`
        # returns an array-like -- confirm).
        k_j, k_i = k_j[0], k_i[0]

        self.z[j, i, 0] = k_j
        self.z[j, i, 1] = k_i

        self.update_matrix_count(j, i, k_j, k_i)

    return self.z
def sample(self):
    """Draw a topic for every token with a fixed number of topics.

    Documents and, inside each document, token positions are visited in
    random order. Each draw comes from ``prob_zji(j, i, K)`` through
    ``categorical``; the assignment is stored in ``z`` and the document-topic,
    word-topic and per-topic total counts are incremented.

    Returns
    -------
    ``self.z``, updated in place.
    """
    print('Sample z...')
    lgg.debug('#Doc \t #nnz\t #Topic')

    for doc_iter, j in enumerate(np.random.permutation(self.J)):
        nnz = self.data_dims[j]
        lgg.debug('%d \t %d \t %d' % (doc_iter, nnz, self.K))

        for i in np.random.permutation(nnz):
            params = self.prob_zji(j, i, self.K)
            sample_topic = categorical(params)
            self.z[j][i] = sample_topic

            # Account for the new assignment in every count table.
            self.doc_topic_counts[j, sample_topic] += 1
            word = self.likelihood.data[j][i]
            self.likelihood.word_topic_counts[word, sample_topic] += 1
            self.likelihood.total_w_k[sample_topic] += 1

    return self.z
def sample(self):
    """Resample ``self.m`` entry by entry from ``self.count_k_by_j``.

    For every index ``(j, k)``: when the count is positive, ``m[j, k]`` is
    set to ``categorical(prob_jk(j, k)) + 1``; otherwise it is set to 0.
    ``m_dotk`` is then refreshed as the column sums of ``m`` and
    ``purge_empty_tables()`` is called.

    Returns
    -------
    ``self.m``, updated in place.
    """
    self._update_m()
    entries = np.ndenumerate(self.count_k_by_j)

    lgg.debug('Sample m...')
    for (j, k), count in entries:
        if not count > 0:
            self.m[j, k] = 0
            continue
        # Sample the number of tables in j serving dish k
        params = self.prob_jk(j, k)
        self.m[j, k] = categorical(params) + 1

    self.m_dotk = self.m.sum(0)
    self.purge_empty_tables()

    return self.m
def pvalue(self, _type='global'):
    """Fill the goodness-of-fit table of the degree distributions.

    Similar to the Zipf analysis, but the measures returned by ``gofit`` are
    stored in the table obtained from ``self.init_fit_tables`` instead of
    being plotted. On the last experiment
    (``self._it == self.expe_size - 1``) every table of
    ``self.gramexp.tables`` is summarised as ``mean $\\pm$ std`` over its
    third axis, printed, and written with ``self.write_frames`` when
    ``expe._write`` is set.

    Nothing is done when ``self.model`` is None.

    Parameters
    ----------
    _type : str
        'global': one fit per dataset of ``self._Y``, on its degree histogram.
        'local': one fit per class ``k`` (only pairs with ``k == l`` are
        kept), on the degrees of the links restricted to that class.
        'feature': not implemented.

    Raises
    ------
    NotImplementedError
        If ``_type`` is 'feature'.
    """
    if self.model is None:
        return

    expe = self.expe
    Y = self._Y
    model = self.model

    Table, Meas = self.init_fit_tables(_type, Y)

    self.log.info('using `%s\' burstiness' % _type)

    if _type == 'global':
        # Global degree
        for it_dat, data in enumerate(Y):
            d, dc = degree_hist(adj_to_degree(data), filter_zeros=True)
            gof = gofit(d, dc)
            if not gof:
                continue

            for i, v in enumerate(Meas):
                Table[self.corpus_pos, i, it_dat] = gof[v]

    elif _type == 'local':
        # Z assignment method
        a, _ = model.get_params()
        N, K = a.shape
        print('theta shape: %s' % (str((N, K))))
        now = Now()

        if 'mmsb' in expe.model:
            ZZ = []
            for _i, _ in enumerate(Y):
                theta = self._Theta[_i]
                Z = np.empty((2, N, N))

                # Flat indices of the cells to sample: the upper triangle
                # only for symmetric data, every cell otherwise.
                order = np.arange(N**2).reshape((N, N))
                if expe.symmetric:
                    order = order[np.triu_indices(N)]
                else:
                    order = order.flatten()
                order = zip(*np.unravel_index(order, (N, N)))

                for i, j in order:
                    Z[0, i, j] = categorical(theta[i])
                    Z[1, i, j] = categorical(theta[j])

                # Mirror the upper triangle onto the lower one.
                Z[0] = np.triu(Z[0]) + np.triu(Z[0], 1).T
                Z[1] = np.triu(Z[1]) + np.triu(Z[1], 1).T
                ZZ.append(Z)
            self.log.info('Z formation %s second', nowDiff(now))

        clustering = 'modularity'
        comm = model.communities_analysis(data=Y[0], clustering=clustering)
        # Count the strictly positive entries: `len(mask)` is the size of
        # the mask, not the number of True values.
        n_active = np.count_nonzero(np.asarray(comm['block_hist']) > 0)
        print('clustering method: %s, active clusters ratio: %f' %
              (clustering, n_active / K))

        # Iterate over all classes couple
        if expe.symmetric:
            # NOTE(review): for K > 1 the nested list is ragged; recent
            # NumPy releases refuse to build an array from it. Left as is
            # because `it_k` below indexes the table -- verify on the NumPy
            # version in use.
            k_perm = np.unique(
                list(
                    map(
                        list,
                        map(
                            list,
                            map(set, itertools.product(range(K),
                                                       repeat=2))))))
        else:
            k_perm = itertools.product(range(K), repeat=2)

        for it_k, c in enumerate(k_perm):
            if isinstance(c, (np.int64, np.float64)):
                k = l = c
            elif len(c) == 2:
                # Stochastic equivalence (extra class bind)
                k, l = c
            else:
                # Communities (intra class bind)
                k = l = c.pop()

            if k != l:
                continue

            YY = []
            if 'mmsb' in expe.model:
                # zip stops at the shortest of Y and ZZ
                for y, z in zip(Y, ZZ):
                    y_c = y.copy()
                    phi_c = np.zeros(y.shape)
                    # UNDIRECTED: keep only the cells assigned to (k, l)
                    phi_c[(z[0] == k) & (z[1] == l)] = 1
                    y_c[phi_c != 1] = 0
                    YY.append(y_c)
            elif 'ilfm' in expe.model:
                for _i, y in enumerate(Y):
                    theta = self._Theta[_i]
                    YY.append(
                        (y * np.outer(theta[:, k], theta[:, l])).astype(int))

            d, dc, yerr = random_degree(YY)
            if len(d) == 0:
                continue
            gof = gofit(d, dc)
            if not gof:
                continue

            for i, v in enumerate(Meas):
                Table[self.corpus_pos, i, it_k] = gof[v]

    elif _type == 'feature':
        raise NotImplementedError

    if self._it == self.expe_size - 1:
        for _model, table in self.gramexp.tables.items():
            # Mean and standard deviation
            table_mean = np.char.array(
                np.around(table.mean(2), decimals=3)).astype("|S20")
            table_std = np.char.array(
                np.around(table.std(2), decimals=3)).astype("|S20")
            # `\\p` spells the same bytes as the former invalid escape `\p`.
            table = table_mean + b' $\\pm$ ' + table_std

            # Table formatting
            corpuses = self.specname(self.gramexp.get_set('corpus'))
            table = np.column_stack((self.specname(corpuses), table))
            tablefmt = 'simple'
            table = tabulate(table,
                             headers=['__' + _model.upper() + '__'] + Meas,
                             tablefmt=tablefmt,
                             floatfmt='.3f')
            print()
            print(table)

            if expe._write:
                if expe._mode == 'predictive':
                    base = '%s_%s_%s' % (self.specname(expe.corpus),
                                         self.specname(_model), _type)
                else:
                    base = '%s_%s_%s' % ('MG', self.specname(_model), _type)
                self.write_frames(table, base=base, ext='md')
def burstiness(self, _type='all'):
    """Plot the degree distributions (Zipf analysis).

    Figures are collected and, when ``expe._write`` is set, handed to
    ``self.write_frames``. Nothing is done when ``self.model`` is None.

    Parameters
    ----------
    _type : str
        Type of burstiness to compute, in ('global', 'local', 'feature',
        'all'). 'global' plots the degree distribution of ``self._Y``;
        'local' plots, on a single figure, one distribution per class ``k``
        (only pairs with ``k == l`` are kept). 'feature' currently produces
        no figure.

    Returns
    -------
    None
    """
    if self.model is None:
        return

    expe = self.expe
    figs = []

    Y = self._Y
    model = self.model

    if _type in ('global', 'all'):
        # Global burstiness
        d, dc, yerr = random_degree(Y)
        plt.figure()
        title = 'global | %s, %s' % (self.specname(
            expe.get('corpus')), self.specname(expe.model))
        plot_degree_2((d, dc, yerr), logscale=True, title=title)

        figs.append(plt.gcf())

    if _type in ('local', 'all'):
        # Local burstiness
        print('Computing Local Preferential attachment')
        a, _ = model.get_params()
        N, K = a.shape
        print('theta shape: %s' % (str((N, K))))
        now = Now()

        if 'mmsb' in expe.model:
            # Z assignment method
            ZZ = []
            for _i, _ in enumerate(Y):
                theta = self._Theta[_i]
                Z = np.empty((2, N, N))

                # Flat indices of the cells to sample: the upper triangle
                # only for symmetric data, every cell otherwise.
                order = np.arange(N**2).reshape((N, N))
                if expe.symmetric:
                    order = order[np.triu_indices(N)]
                else:
                    order = order.flatten()
                order = zip(*np.unravel_index(order, (N, N)))

                for i, j in order:
                    Z[0, i, j] = categorical(theta[i])
                    Z[1, i, j] = categorical(theta[j])

                # Mirror the upper triangle onto the lower one.
                Z[0] = np.triu(Z[0]) + np.triu(Z[0], 1).T
                Z[1] = np.triu(Z[1]) + np.triu(Z[1], 1).T
                ZZ.append(Z)
            self.log.info('Z formation %s second', nowDiff(now))

        clustering = 'modularity'
        comm = model.communities_analysis(data=Y[0], clustering=clustering)
        # Count the strictly positive entries: `len(mask)` is the size of
        # the mask, not the number of True values.
        n_active = np.count_nonzero(np.asarray(comm['block_hist']) > 0)
        print('clustering method: %s, active clusters ratio: %f' %
              (clustering, n_active / K))

        # Iterate over all classes couple
        if expe.symmetric:
            # NOTE(review): for K > 1 the nested list is ragged; recent
            # NumPy releases refuse to build an array from it -- verify on
            # the NumPy version in use.
            k_perm = np.unique(
                list(
                    map(
                        list,
                        map(
                            list,
                            map(set, itertools.product(range(K),
                                                       repeat=2))))))
        else:
            k_perm = itertools.product(range(K), repeat=2)

        # One figure shared by every class curve.
        plt.figure()
        for c in k_perm:
            if isinstance(c, (np.int64, np.float64)):
                k = l = c
            elif len(c) == 2:
                # Stochastic equivalence (outer class)
                k, l = c
            else:
                # Communities (inner class)
                k = l = c.pop()

            if k != l:
                continue

            YY = []
            if 'mmsb' in expe.model:
                # zip stops at the shortest of Y and ZZ
                for y, z in zip(Y, ZZ):
                    phi_c = np.zeros(y.shape)
                    # UNDIRECTED: keep only the cells assigned to (k, l)
                    phi_c[(z[0] == k) & (z[1] == l)] = 1
                    YY.append(y * phi_c)
            elif 'ilfm' in expe.model:  # or Corpus !
                for _i, y in enumerate(Y):
                    theta = self._Theta[_i]
                    if theta.shape[1] <= max(k, l):
                        print('warning: not all block converted.')
                        continue
                    YY.append(
                        (y * np.outer(theta[:, k], theta[:, l])).astype(int))

            d, dc, yerr = random_degree(YY)
            if len(d) == 0:
                continue
            title = 'local | %s, %s' % (self.specname(
                expe.get('corpus')), self.specname(expe.model))
            plot_degree_2((d, dc, yerr),
                          logscale=True,
                          colors=True,
                          line=True,
                          title=title)
        figs.append(plt.gcf())

    # Blockmodel analysis ('feature' burstiness, i.e. block-size histograms)
    # is disabled: `_type='feature'` adds no figure.

    if expe._write:
        if expe._mode == 'predictive':
            base = '%s_%s' % (self.specname(
                expe.corpus), self.specname(expe.model))
        else:
            base = '%s_%s' % ('MG', self.specname(expe.model))
        self.write_frames(figs, base=base)
    return