def find_disjoint_biclusters(self, biclusters_number=50):
    """Co-cluster ``self.matrix`` and return the pruned, non-empty biclusters.

    The matrix is split into ``biclusters_number`` co-clusters. Every
    co-cluster that has at least one row and one column is then pruned:
    columns, and afterwards rows, whose density is below a quarter of the
    co-cluster's initial density are removed.

    Args:
        biclusters_number: number of co-clusters requested from
            ``SpectralCoclustering``.

    Returns:
        A set of ``Bicluster`` objects, one for each co-cluster that still
        has at least one row and one column after pruning. Each carries the
        density recomputed on the pruned rows and columns.

    Raises:
        ValueError: if ``self.matrix`` contains NaN or infinite values
            (raised by ``np.asarray_chkfinite``).
    """
    # Work on a private float copy. ``asarray_chkfinite`` hands back the
    # input itself when it is already an ndarray, so patching zeros in place
    # would silently modify ``self.matrix``; on an integer matrix the tiny
    # replacement value would additionally be truncated back to 0.
    data = np.array(np.asarray_chkfinite(self.matrix), dtype=float)
    # Presumably avoids all-zero rows/columns, which the spectral
    # normalisation cannot handle -- TODO confirm.
    data[data == 0] = 0.000001

    coclustering = SpectralCoclustering(n_clusters=biclusters_number,
                                        random_state=0)
    coclustering.fit(data)

    biclusters = set()
    for label in range(biclusters_number):
        rows, columns = coclustering.get_indices(label)
        row_set = set(rows)
        columns_set = set(columns)
        if not row_set or not columns_set:
            continue

        # The pruning threshold is fixed from the unpruned co-cluster.
        density = self._calculate_box_cluster_density(row_set, columns_set)

        odd_columns = {
            column for column in columns_set
            if self._calculate_column_density(column, row_set) < density / 4
        }
        columns_set.difference_update(odd_columns)
        if not columns_set:
            continue

        # Row densities are measured against the already pruned columns.
        odd_rows = {
            row for row in row_set
            if self._calculate_row_density(row, columns_set) < density / 4
        }
        row_set.difference_update(odd_rows)
        if not row_set:
            continue

        density = self._calculate_box_cluster_density(row_set, columns_set)
        biclusters.add(Bicluster(row_set, columns_set, density))
    return biclusters
def print_similarity_matrix(sphns, model, model2=None): print " ", for phn1 in sphns: print phn1, " ", print "" m = np.ndarray((len(sphns), len(sphns)), dtype=np.float32) for i, phn1 in enumerate(sphns): print phn1.ljust(4) + ":", for j, phn2 in enumerate(sphns): sim = model.similarity(phn1, phn2) if model2 != None: sim -= model2.similarity(phn1, phn2) print "%0.2f" % sim, m[i][j] = sim print "" phn_order = [phn for phn in sphns] if BICLUSTER: #model = SpectralBiclustering(n_clusters=4, method='log', model = SpectralCoclustering(n_clusters=n_clusters, random_state=0) model.fit(m) print "INDICES:", indices = [model.get_indices(i) for i in xrange(n_clusters)] print indices tmp = [] for i in xrange(n_clusters): tmp.extend([phn_order[indices[i][0][j]] for j in xrange(len(indices[i][0]))]) phn_order = tmp fit_data = m[np.argsort(model.row_labels_)] fit_data = fit_data[:, np.argsort(model.column_labels_)] m = fit_data return phn_order, m
def biclustering(input, num_clusters):
    """Co-cluster ``input`` and map each cluster label to its row indices.

    Side effect: the result is also stored in the module-level global
    ``agent1_dict``.

    Args:
        input: array-like data to co-cluster; a 1-D sequence is treated as
            a single row.
        num_clusters: number of co-clusters to fit.

    Returns:
        dict mapping each cluster label ``0 .. num_clusters - 1`` to the
        list of row indices assigned to that cluster.
    """
    global agent1_dict
    # Use a plain 2-D ndarray: current scikit-learn input validation rejects
    # np.matrix with a TypeError.
    data = np.atleast_2d(np.asarray(input))
    model = SpectralCoclustering(n_clusters=num_clusters, random_state=0)
    model.fit(data)

    # create agent 1 dictionary
    agent1_dict = {}
    for c in range(num_clusters):
        # get_indices returns (row indices, column indices)
        row_indices, _column_indices = model.get_indices(c)
        agent1_dict[c] = row_indices.tolist()
    return agent1_dict
def get_clusters(data, n_clusters=5):
    """Co-cluster ``data`` and list the row and column members per cluster.

    Args:
        data: matrix accepted by ``SpectralCoclustering.fit``.
        n_clusters: number of co-clusters to fit (defaults to 5, the value
            that used to be hard-coded).

    Returns:
        ``(word_clusters, hidden_clusters)``: two lists with one entry per
        cluster label, holding that cluster's row indices and column
        indices respectively, each as a plain list.
    """
    coclusters = SpectralCoclustering(n_clusters=n_clusters, random_state=0)
    coclusters.fit(data)

    word_clusters = []
    hidden_clusters = []
    for label in range(n_clusters):
        # One call yields both index arrays: (rows, columns).
        row_indices, column_indices = coclusters.get_indices(label)
        word_clusters.append(row_indices.tolist())
        hidden_clusters.append(column_indices.tolist())
    return word_clusters, hidden_clusters
def biclustering(data, num_clusters):
    """Co-cluster ``data`` and map each cluster label to its row indices.

    Args:
        data: array-like data to co-cluster; a 1-D sequence is treated as
            a single row.
        num_clusters: number of co-clusters to fit.

    Returns:
        dict mapping each cluster label ``0 .. num_clusters - 1`` to the
        list of row indices assigned to that cluster.
    """
    # Use a plain 2-D ndarray: current scikit-learn input validation rejects
    # np.matrix with a TypeError.
    data = np.atleast_2d(np.asarray(data))
    model = SpectralCoclustering(n_clusters=num_clusters, random_state=0)
    model.fit(data)

    clusters = {}
    for c in range(num_clusters):
        # get_indices returns (row indices, column indices)
        row_indices, _column_indices = model.get_indices(c)
        clusters[c] = row_indices.tolist()
    return clusters
def biclustering(input, num_clusters):
    """Co-cluster ``input``, plot the rearranged data, return row clusters.

    Side effects: the result is also stored in the module-level global
    ``agent1_dict``, and the data, with rows and columns rearranged by
    cluster label, is passed to ``plot`` (defined elsewhere in this
    module).

    Args:
        input: array-like data to co-cluster; a 1-D sequence is treated as
            a single row.
        num_clusters: number of co-clusters to fit.

    Returns:
        dict mapping each cluster label ``0 .. num_clusters - 1`` to the
        list of row indices assigned to that cluster.
    """
    global agent1_dict
    # Use a plain 2-D ndarray: current scikit-learn input validation rejects
    # np.matrix with a TypeError.
    data = np.atleast_2d(np.asarray(input))
    model = SpectralCoclustering(n_clusters=num_clusters, random_state=0)
    model.fit(data)

    # create agent 1 dictionary
    agent1_dict = {}
    for c in range(num_clusters):
        # get_indices returns (row indices, column indices)
        row_indices, _column_indices = model.get_indices(c)
        agent1_dict[c] = row_indices.tolist()

    # Group rows, then columns, of the same cluster next to each other.
    row_order = np.argsort(model.row_labels_)
    column_order = np.argsort(model.column_labels_)
    fit_data = data[row_order]
    fit_data = fit_data[:, column_order]
    plot(fit_data)
    return agent1_dict
def biclustering(db): #mydata = genfromtxt('/home/fan/intern/process_db/analysis/viewtime_matrix_524.csv',dtype=None,delimiter=',',names=True,skip_header=1) df = pd.read_csv( '/home/fan/intern/process_db/analysis/viewtime_matrix_501_0.1.csv') dma = 501 #print df.head() print df.shape dev_list = df.ix[:, 0].values prog_list = df.columns.values #print type(dev_list) #print type(prog_list) df.drop(df.columns[0], axis=1, inplace=True) #df[df==0] = 1 df = df.apply(fraction, axis=1) #print df.head() #print df.values #print type(df.values) #mydata = df.values #mydata=np.delete(mydata, 0, axis=0) #mydata=np.delete(mydata, 0, axis=1) #mydata[mydata==0] = 0.01 #print 'data format is:',mydata,type(mydata) # model=SpectralCoclustering(n_clusters=5, random_state=0) #n_clusters=(1000,20) # 4*3 = 12 clusters #model = SpectralBiclustering(random_state=None) model = SpectralCoclustering(n_clusters=10) model.fit(df) #fit_data=mydata[np.argsort(model.row_labels_)] #fit_data=fit_data[:,np.argsort(model.column_labels_)] #plt.matshow(fit_data[0:40],cmap=plt.cm.Blues) # plt.show() print model.get_params() for i in range(0, 5): print 'Size of one cluster:', model.get_shape(i) indices = model.get_indices(i) #print indices[1] print prog_list[indices[1]] print model.get_submatrix(i, df.values) dev_in_cluster = dev_list[indices[0]] #print type(dev_in_cluster) print 'number of devices within this cluster:', len(dev_in_cluster) get_income(db, dma, dev_in_cluster.tolist())
def biclustering(input_list, num_clusters):
    """Co-cluster ``input_list``, plot the result, return row clusters.

    Uses spectral co-clustering; see
    http://scikit-learn.org/stable/auto_examples/bicluster/plot_spectral_coclustering.html

    Side effects: the result is also stored in the module-level global
    ``agent1_dict``, and the data, with rows and columns rearranged by
    cluster label, is passed to ``plot`` (defined elsewhere in this
    module).

    Args:
        input_list: array-like data to co-cluster; a 1-D sequence is
            treated as a single row.
        num_clusters: number of co-clusters to fit.

    Returns:
        dict mapping each cluster label ``0 .. num_clusters - 1`` to the
        list of row indices assigned to that cluster.
    """
    global agent1_dict
    # clustering agent 1
    # Use a plain 2-D ndarray: current scikit-learn input validation rejects
    # np.matrix with a TypeError.
    data = np.atleast_2d(np.asarray(input_list))
    model = SpectralCoclustering(n_clusters=num_clusters, random_state=0)
    model.fit(data)

    # create agent 1 dictionary
    agent1_dict = {}
    for c in range(num_clusters):
        # get_indices returns (row indices, column indices)
        row_indices, _column_indices = model.get_indices(c)
        agent1_dict[c] = row_indices.tolist()

    # Group rows, then columns, of the same cluster next to each other.
    row_order = np.argsort(model.row_labels_)
    column_order = np.argsort(model.column_labels_)
    fit_data = data[row_order]
    fit_data = fit_data[:, column_order]
    plot(fit_data)
    return agent1_dict
weight = X[rows[:, np.newaxis], cols].sum() cut = (X[row_complement[:, np.newaxis], cols].sum() + X[rows[:, np.newaxis], col_complement].sum()) return cut / weight bicluster_ncuts = list(bicluster_ncut(i) for i in xrange(len(newsgroups.target_names))) best_idx = np.argsort(bicluster_ncuts)[:5] print() print("Best biclusters:") print("----------------") for idx, cluster in enumerate(best_idx): n_rows, n_cols = cocluster.get_shape(cluster) cluster_docs, cluster_words = cocluster.get_indices(cluster) if not len(cluster_docs) or not len(cluster_words): continue # categories cluster_categories = list(document_names[i] for i in cluster_docs) counter = Counter(cluster_categories) cat_string = ", ".join("{:.0f}% {}".format(float(c) / n_rows * 100, name) for name, c in counter.most_common()[:3]) # words out_of_cluster_docs = cocluster.row_labels_ != cluster out_of_cluster_docs = np.where(out_of_cluster_docs)[0] word_col = X[:, cluster_words] word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -
Like Counter.most_common in Python >=2.7. """ return sorted(iteritems(d), key=operator.itemgetter(1), reverse=True) bicluster_ncuts = list(bicluster_ncut(i) for i in range(len(newsgroups.target_names))) best_idx = np.argsort(bicluster_ncuts)[:5] print() print("Best biclusters:") print("----------------") for idx, cluster in enumerate(best_idx): n_rows, n_cols = cocluster.get_shape(cluster) cluster_docs, cluster_words = cocluster.get_indices(cluster) if not len(cluster_docs) or not len(cluster_words): continue # categories counter = defaultdict(int) for i in cluster_docs: counter[document_names[i]] += 1 cat_string = ", ".join("{:.0f}% {}".format(float(c) / n_rows * 100, name) for name, c in most_common(counter)[:3]) # words out_of_cluster_docs = cocluster.row_labels_ != cluster out_of_cluster_docs = np.where(out_of_cluster_docs)[0] word_col = X[:, cluster_words] word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -
ind += 1 proposicoes = dict() ind = 0 for line in label_proposicoes: proposicoes[ind] = line ind += 1 with open('deputados.pkl', 'rb') as dep: deputados = cPickle.load(dep) with open('deputados_antigos.pkl', 'rb') as dep: deputados_antigos = cPickle.load(dep) #salvando os biclusters for i in range(n_clusters): indices = model.get_indices(i) linhas = map(lambda a: int(depdict[a]), indices[0]) colunas = [proposicoes[a] for a in indices[1]] salva = open('solution/' + str(n_clusters) + 'bics_' + str(i) + '.txt', 'w') conteudo = [] #map(lambda a: str(a), linhas); for l in linhas: if l > 0: a = deputados[l] nome_parlamentar = a['nome_parlamentar'] partido = a['partido'] uf = a['uf'] #if not(isinstance(nome_parlamentar, str) and isinstance(partido, str) and isinstance(uf, str)) : # print(type(nome_parlamentar), type(partido), type(uf)) texto = [nome_parlamentar, partido, uf] else:
avg_data[row_sel, col_sel] = np.average(data[row_sel, col_sel]) avg_data = avg_data[np.argsort(model.row_labels_)] avg_data = avg_data[:, np.argsort(model.column_labels_)] plt.matshow(avg_data, cmap=plt.cm.Blues) plt.title("Average cluster intensity") plt.savefig('%s_averaged.png' % (identifier), bbox_inches='tight') if args.write: print "Writing clusters to database." # No need to clean up here, just overwrite by _id. for c in range(n_clusters): (nr, nc) = model.get_shape(c) (row_ind, col_ind) = model.get_indices(c) cluster_val = None if nr > 25 or nc > 50: print "Nulling cluster %d: shape (%d, %d)" % (c, nr, nc) else: cluster_val = c for ri in row_ind: data_list[ri]['cluster'] = cluster_val datastream.save(data_list[ri]) for ci in col_ind: events_list[ci]['cluster'] = cluster_val events.save(events_list[ci]) # plt.show()
avg_data[row_sel, col_sel] = np.average(data[row_sel, col_sel]) avg_data = avg_data[np.argsort(model.row_labels_)] avg_data = avg_data[:, np.argsort(model.column_labels_)] plt.matshow(avg_data, cmap=plt.cm.Blues) plt.title("Average cluster intensity") plt.savefig('%s_averaged.png' % (identifier), bbox_inches='tight') if args.write: print "Writing clusters to database." # No need to clean up here, just overwrite by _id. for c in range(n_clusters): (nr, nc) = model.get_shape(c) (row_ind, col_ind) = model.get_indices(c) cluster_val = None if nr > 25 or nc > 50: print "Nulling cluster %d: shape (%d, %d)" % (c, nr, nc) else: cluster_val = c for ri in row_ind: data_list[ri]['cluster'] = cluster_val datastream.save(data_list[ri]) for ci in col_ind: events_list[ci]['cluster'] = cluster_val events.save(events_list[ci]) # plt.show()