def __init__(self, data, k_min=4, clusters=5):
    """Cluster the samples of *data* and build per-cluster summaries.

    Steps performed:

    1. Store *data* and build ``self.snames``: one row per column of
       *data* with a freshly generated UUID string in ``'id'``.
    2. Correlate the columns of *data* and pass the correlation matrix
       to ``cluster_ordered_agglomerative`` together with *clusters*.
    3. Embed the correlation matrix in two dimensions with t-SNE and,
       for every sample, look up its nearest neighbours there.
    4. Extend every cluster with the neighbours of its original
       members and store the result in ``self.clusters``.
    5. For every cluster store in ``self.models`` a frame holding, per
       row of *data*, the min / max / mean / std and the raw values
       across the cluster's samples.

    Args:
        data: DataFrame whose columns are the samples.
        k_min: Number of neighbours (besides the sample itself)
            requested per sample in the t-SNE embedding.
        clusters: Forwarded unchanged to
            ``cluster_ordered_agglomerative`` (presumably the number
            of clusters -- confirm against that helper).
    """
    self._data = data

    # Keep the sample names together with a unique id for each one.
    sample_table = pd.DataFrame({'samples': data.columns})
    sample_table['id'] = [
        str(uuid.uuid4()) for _ in range(sample_table.shape[0])
    ]
    self.snames = sample_table

    # Initial cluster assignment, computed on the correlation matrix.
    # NOTE(review): the result is used as a frame indexed by sample
    # with a 'cluster_id' column -- confirm against the helper.
    corr = data.corr()
    assignments = cluster_ordered_agglomerative(corr, clusters)

    # Two-dimensional t-SNE embedding of the correlation matrix.
    embedding = pd.DataFrame(
        TSNE(2).fit_transform(corr),
        index=corr.columns,
        columns=['x', 'y'],
    )

    # Dense neighbour graph; k_min + 1 because querying the fitted
    # points themselves also reports each point as its own neighbour.
    knn = NearestNeighbors(n_neighbors=k_min + 1).fit(embedding)
    row_labels = corr.columns.copy()
    row_labels.name = 'sample_1'
    col_labels = corr.columns.copy()
    col_labels.name = 'sample_2'
    adjacency = pd.DataFrame(
        knn.kneighbors_graph(embedding).toarray(),
        index=row_labels,
        columns=col_labels,
    )

    # Long format: one row per (sample_2, sample_1) pair, keeping only
    # real neighbour links between two different samples.
    pairs = adjacency.unstack().reset_index().rename(columns={0: 'match'})
    is_link = pairs['match'] != 0
    is_other = pairs['sample_1'] != pairs['sample_2']
    pairs = pairs[is_link & is_other]

    # For each cluster, add the nearest neighbours of its members.
    expanded = {}
    for cluster_id in assignments['cluster_id'].unique():
        in_cluster = assignments['cluster_id'] == cluster_id
        members = set(assignments[in_cluster].index)
        # Iterate over a snapshot: only the original members pull in
        # their neighbours, not the neighbours that get added.
        for member in list(members):
            from_member = pairs['sample_1'] == member
            members |= set(list(pairs.loc[from_member, 'sample_2']))
        expanded[cluster_id] = members
    self.clusters = expanded

    stat_names = ['min', 'max', 'mean', 'std', 'values']

    def summarize(row):
        # Summary statistics plus the raw values of one row of `data`.
        stats = [np.min(row), np.max(row), np.mean(row), np.std(row)]
        return pd.Series(OrderedDict(zip(stat_names, stats + [list(row)])))

    models = {}
    for cluster_id, members in self.clusters.items():
        models[cluster_id] = data[list(members)].apply(summarize, 1)
    self.models = models
# Scatter plot of the 1st vs. 2nd principal component
# (samples colored by their transition-temperature value, taken from
# column 86 of `dataset`).
plt.scatter(
    score.iloc[:, 0],
    score.iloc[:, 1],
    c=dataset.iloc[:, 86],
    cmap=plt.get_cmap('jet'),
)
plt.colorbar()
plt.xlabel('t_1 (PCA)')
plt.ylabel('t_2 (PCA)')
plt.show()

# t-SNE
# Only every 5th sample is embedded; `autoscaled_x` is overwritten with
# that subset.
selected_indexes = np.arange(0, autoscaled_x.shape[0], 5)
autoscaled_x = autoscaled_x.iloc[selected_indexes, :]
tsne_model = TSNE(
    perplexity=perplexity,
    n_components=2,
    init='pca',
    random_state=10,
)
t = pd.DataFrame(
    tsne_model.fit_transform(autoscaled_x),
    index=autoscaled_x.index,
    columns=['t_1 (t-SNE)', 't_2 (t-SNE)'],
)
t.to_csv('tsne_t.csv')

# Scatter plot of t1 vs. t2
# (samples colored by their transition-temperature value).
plt.rcParams['font.size'] = 18
plt.scatter(
    t.iloc[:, 0],
    t.iloc[:, 1],
    c=dataset.iloc[selected_indexes, 86],
    cmap=plt.get_cmap('jet'),
)
plt.colorbar()
plt.xlabel('t_1 (t-SNE)')
plt.ylabel('t_2 (t-SNE)')
plt.show()
# train_topics = get_top_topics(nmf, spleen.index, n_top_words)
# print_top_words(nmf, spleen.index, n_top_words)

##-----------------------------------------------------------------------------
## visualization
# Project `topic_matrix` to two dimensions with three different methods
# and save one scatter plot per method.

# PCA -> file "pca"
pca = PCA(n_components=2)
embedding = pd.DataFrame(
    pca.fit_transform(topic_matrix),
    columns=['PC1', 'PC2'],
)
f = sns.lmplot(
    x='PC1',
    y='PC2',
    data=embedding,
    fit_reg=False,
    legend=False,
    scatter_kws={"s": 5},
)
f.savefig("pca", dpi=300)

# t-SNE -> file "tsne" (saved through pyplot, i.e. the current figure)
embedding = pd.DataFrame(
    TSNE(n_components=2).fit_transform(topic_matrix),
    columns=['tSNE1', 'tSNE2'],
)
sns.lmplot(
    x='tSNE1',
    y='tSNE2',
    data=embedding,
    fit_reg=False,
    legend=False,
    scatter_kws={"s": 5, "color": "red"},
)
plt.savefig("tsne", dpi=300)

# UMAP -> file "UMAP" (saved through pyplot, i.e. the current figure)
umap_model = umap.UMAP(n_neighbors=5, min_dist=0.3, metric='correlation')
embedding = pd.DataFrame(
    umap_model.fit_transform(topic_matrix),
    columns=['UMAP1', 'UMAP2'],
)
sns.lmplot(
    x='UMAP1',
    y='UMAP2',
    data=embedding,
    fit_reg=False,
    legend=False,
    scatter_kws={"s": 5, "color": "green"},
)
plt.savefig("UMAP", dpi=300)

##-----------------------------------------------------------------------------
## clustering