def performance(encoder, models, K):
    """Benchmark clustering models on encoded features, averaged over K runs.

    For each of ``K`` repetitions the (possibly stochastic) ``encoder`` is
    re-fit on the module-level ``features``/``target`` and every model in
    ``models`` is fit-predicted on the encoded features.  Three scores are
    accumulated as running means: adjusted mutual information (``ami``),
    Calinski-Harabasz (``chs``) and silhouette (``sil``).

    Parameters
    ----------
    encoder : transformer with ``fit_transform(X, y)``
    models : dict mapping model name -> clusterer with ``fit_predict``
    K : int, number of repetitions to average over

    Side effects: appends a result section to
    ``../results/<name_prefix>_results.txt`` and prints a summary line.
    Relies on module-level ``features``, ``target`` and ``name_prefix``.
    """
    # Running means, one accumulator per model name.
    mean_ami = {key: 0.0 for key in models}
    mean_chs = {key: 0.0 for key in models}
    mean_sil = {key: 0.0 for key in models}
    tic = time.perf_counter()
    for _ in range(K):
        features_enc = encoder.fit_transform(features, target)
        for key, model in models.items():
            y_predict = model.fit_predict(features_enc, target)
            # Divide by K on each pass so the sum ends up being the mean.
            mean_ami[key] += ami(target, y_predict) / K
            mean_chs[key] += chs(features_enc, y_predict) / K
            mean_sil[key] += sil(features_enc, y_predict, metric='euclidean') / K
    toc = time.perf_counter()
    # Write results to file; 'with' guarantees the handle is closed even if
    # a write fails (the original left the file open on error).
    with open('../results/' + name_prefix + '_results.txt', 'a') as res:
        # [0:-7] strips the trailing "Encoder" from the class name.
        res.write(type(encoder).__name__[0:-7] + ' Encoder\n')
        for key in mean_ami:
            res.write(' ' + key + ': ' + str(mean_ami[key]) + ', '
                      + str(mean_chs[key]) + ', ' + str(mean_sil[key]) + '\n')
        res.write('Total time: ' + str(round(toc - tic, 3)) + '\n')
    print('Evaluation of', type(encoder).__name__[0:-7],
          'Encoder completed in', round(toc - tic, 3), 's')
def test_approx(datasets_dimred, genes, labels, idx, distr, xlabels):
    """Score exact (non-approximate) nearest-neighbor alignment and plot.

    Appends a silhouette distribution to ``distr`` and its label to
    ``xlabels`` (both mutated in place), then saves a box plot of every
    distribution gathered so far.
    """
    # Align with exact nearest neighbors (approx disabled).
    panoramas = assemble(datasets_dimred[:], approx=False, sigma=150)
    X = np.concatenate(panoramas)
    distr.append(sil(X[idx, :], labels[idx]))
    print(ttest_ind(X[idx, :], labels[idx]))
    xlabels.append('Exact NN')
    print('')

    # Box plot of all distributions accumulated so far.
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('approx'))
def silhouette_score(estimator, X, y=None):
    """Scorer wrapper for metrics.silhouette_score.

    Returns NaN when the estimator works on a precomputed affinity matrix
    (no raw feature space to score against) or when the predicted labeling
    is degenerate (fewer than 2 clusters, or one cluster per sample), since
    the silhouette is undefined in those cases.
    """
    if hasattr(estimator, 'affinity') and estimator.affinity == 'precomputed':
        return np.nan
    _y = estimator.predict(X)
    n_labels = len(np.unique(_y))
    # Silhouette is only defined for 2 <= n_labels <= n_samples - 1.
    if 1 < n_labels < X.shape[0]:
        return sil(X, _y)
    # logging.warn() is a deprecated alias since Python 3.3; use warning().
    logging.warning("adenine.utils.extension.silhouette_score() returned NaN "
                    "because the number of labels is {}. Valid values are 2 "
                    "to n_samples - 1 (inclusive) = {}".format(
                        n_labels, X.shape[0] - 1))
    return np.nan
def predict_from_tuned(self):
    """Returns labels from hierarchical clustering with n_clusters that
    maximizes Silhouette measure"""
    assert self.Mk is not None, "First run fit"
    # Similarity matrix Mk is turned into a distance for the clustering.
    distance = 1 - self.Mk
    candidate_labels = []
    candidate_scores = []
    for n_clusters in range(self.L_, self.K_):
        clustering = AgglomerativeClustering(
            n_clusters=n_clusters,
            linkage='average',
            affinity='precomputed').fit(distance)
        candidate_labels.append(clustering.labels_)
        # Silhouette is computed on the raw data X, not the distance matrix.
        candidate_scores.append(sil(self.X, clustering.labels_))
    best = int(np.argmax(candidate_scores))
    return np.array(candidate_labels)[best]
def silhouette_score(estimator, X, y=None):
    """Scorer wrapper for metrics.silhouette_score.

    Returns NaN when the estimator works on a precomputed affinity matrix
    (no raw feature space to score against) or when the predicted labeling
    is degenerate (fewer than 2 clusters, or one cluster per sample), since
    the silhouette is undefined in those cases.
    """
    if hasattr(estimator, 'affinity') and estimator.affinity == 'precomputed':
        return np.nan
    _y = estimator.predict(X)
    n_labels = len(np.unique(_y))
    # Silhouette is only defined for 2 <= n_labels <= n_samples - 1.
    if 1 < n_labels < X.shape[0]:
        return sil(X, _y)
    # logging.warn() is a deprecated alias since Python 3.3; use warning().
    logging.warning("adenine.utils.extension.silhouette_score() returned NaN "
                    "because the number of labels is {}. Valid values are 2 "
                    "to n_samples - 1 (inclusive) = {}"
                    .format(n_labels, X.shape[0]-1))
    return np.nan
def test_alpha(datasets_dimred, genes, labels, idx, distr, xlabels):
    """Sweep the alignment ``alpha`` parameter and plot silhouette distributions.

    Mutates ``distr`` and ``xlabels`` in place, one entry per alpha value,
    then saves a box plot of everything accumulated so far.
    """
    for alpha in (0, 0.05, 0.20, 0.50):
        panoramas = assemble(datasets_dimred[:], alpha=alpha, sigma=150)
        X = np.concatenate(panoramas)
        distr.append(sil(X[idx, :], labels[idx]))
        print(ttest_ind(X[idx, :], labels[idx]))
        xlabels.append(str(alpha))
    print('')

    # Box plot of all distributions accumulated so far.
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('alpha'))
def test_sigma(datasets_dimred, genes, labels, idx, distr, xlabels):
    """Sweep the alignment ``sigma`` parameter and plot silhouette distributions.

    Mutates ``distr`` and ``xlabels`` in place, one entry per sigma value,
    then saves a box plot of everything accumulated so far.
    """
    for sigma in (10, 50, 100, 200):
        panoramas = assemble(datasets_dimred[:], sigma=sigma)
        X = np.concatenate(panoramas)
        distr.append(sil(X[idx, :], labels[idx]))
        print(ttest_ind(X[idx, :], labels[idx]))
        xlabels.append(str(sigma))
    print('')

    # Box plot of all distributions accumulated so far.
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('sigma'))
def test_learn_rate(datasets_dimred, genes, labels, idx, distr, xlabels):
    """Sweep the t-SNE learning rate and plot silhouette distributions.

    Mutates ``distr`` and ``xlabels`` in place, one entry per learning
    rate, then saves a box plot of everything accumulated so far.
    """
    X = np.concatenate(datasets_dimred)
    for learn_rate in (50., 100., 500., 1000.):
        embedding = fit_tsne(X, learn_rate=learn_rate)
        distr.append(sil(embedding[idx, :], labels[idx]))
        # NOTE(review): the t-test runs on X, not on the embedding —
        # preserved exactly as in the original; confirm this is intended.
        print(ttest_ind(X[idx, :], labels[idx]))
        xlabels.append(str(learn_rate))
    print('')

    # Box plot of all distributions accumulated so far.
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('learn_rate'))
def test_perplexity(datasets_dimred, genes, labels, idx, distr, xlabels):
    """Sweep the t-SNE perplexity and plot silhouette distributions.

    Mutates ``distr`` and ``xlabels`` in place, one entry per perplexity,
    then saves a box plot of everything accumulated so far.
    """
    X = np.concatenate(datasets_dimred)
    for perplexity in (10, 100, 500, 2000):
        embedding = fit_tsne(X, perplexity=perplexity)
        distr.append(sil(embedding[idx, :], labels[idx]))
        # NOTE(review): the t-test runs on X, not on the embedding —
        # preserved exactly as in the original; confirm this is intended.
        print(ttest_ind(X[idx, :], labels[idx]))
        xlabels.append(str(perplexity))
    print('')

    # Box plot of all distributions accumulated so far.
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('perplexity'))
def cluster_and_evaluate(models, X, y, score):
    """Cluster X for every k in [2, len(X)) and record score curves.

    Builds silhouette, Davies-Bouldin and ``ae`` score lists across all
    cluster counts, evaluates each curve via ``evaluate`` into a fresh
    model dict, and appends that dict to ``models`` (mutated in place).
    """
    n_samples = len(X)
    candidate_ks = range(2, n_samples)
    silhouettes, davies_bouldins, aes = [], [], []
    for k in candidate_ks:
        assignment = cluster(X, k)
        silhouettes.append(sil(X, assignment))
        davies_bouldins.append(db(X, assignment))
        aes.append(ae(assignment))
    model = {}
    evaluate(model, candidate_ks, silhouettes, score, 'sil')
    # Davies-Bouldin is better when smaller, hence the min selector.
    evaluate(model, candidate_ks, davies_bouldins, score, 'db', min)
    evaluate(model, candidate_ks, aes, score, 'ae')
    models.append(model)
def test_knn(datasets_dimred, genes, labels, idx, distr, xlabels):
    """Sweep the alignment ``knn`` parameter and plot silhouette distributions.

    Mutates ``distr`` and ``xlabels`` in place, one entry per knn value;
    for each sweep point the new silhouettes are t-tested against every
    distribution that was present before the sweep started.
    """
    n_baseline = len(distr)
    for knn in (5, 10, 50, 100):
        panoramas = assemble(datasets_dimred[:], knn=knn, sigma=150)
        X = np.concatenate(panoramas)
        distr.append(sil(X[idx, :], labels[idx]))
        # Compare against the pre-sweep distributions only.
        for baseline in distr[:n_baseline]:
            print(ttest_ind(np.ravel(X[idx, :]), np.ravel(baseline)))
        xlabels.append(str(knn))
    print('')

    # Box plot of all distributions accumulated so far.
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('knn'))
def test_dimred(datasets, genes, labels, idx, distr, xlabels):
    """Sweep the SVD dimensionality used before alignment and plot results.

    Mutates ``distr`` and ``xlabels`` in place, one entry per dimension;
    then saves a box plot of everything accumulated so far.
    """
    for dimred in (10, 20, 50, 200, 6000):
        datasets_dimred, genes = process_data(datasets, genes, dimred=dimred)
        datasets_dimred = assemble(datasets_dimred, sigma=150)
        X = np.concatenate(datasets_dimred)
        distr.append(sil(X[idx, :], labels[idx]))
        print(ttest_ind(X[idx, :], labels[idx]))
        xlabels.append(str(dimred))
    print('')
    # The last (largest) setting effectively disables the SVD step.
    xlabels[-1] = 'No SVD'

    # Box plot of all distributions accumulated so far.
    plt.figure()
    plt.boxplot(distr, showmeans=True, whis='range')
    plt.xticks(range(1, len(xlabels) + 1), xlabels)
    plt.ylabel('Silhouette Coefficient')
    plt.ylim((-1, 1))
    plt.savefig('param_sensitivity_{}.svg'.format('dimred'))
def plot_projections(holder, labels, preprocess_lda='PCA',
                     class_name='Antioxidants', only_pca=False,
                     binarize_class=True, standardize=True, cluster=True,
                     return_distances=False):
    '''
    holder should be a dictionary with df's as values and fp-filenames as keys
    labels should be a mapping of DrugCombID: ATC_class

    Projects each fingerprint table in ``holder`` to 2D (PCA, or a
    preprocessing step followed by LDA), scatter-plots the 9 projections in
    a 3x3 grid, and optionally (``cluster=True``) annotates each panel with
    inter-centroid distance, silhouette and Calinski-Harabasz scores.
    Returns the figure, or (fig, distances, sil_scores, chs_scores) when
    ``return_distances`` is True.
    '''
    fp_keys = [
        'fps_e3fp_1024bit', 'fps_morgan_1024bit', 'fps_topo_1024bit',
        'fps_infomax_new', 'fps_VAE_256bit_new', 'fps_VAE_16bit_new',
        'fps_transformer_1024bit_new', 'fps_transformer_64bit_new',
        'fps_gae_64bit_new'
    ]
    if only_pca:
        from sklearn.decomposition import PCA
        df = dict()
        for ind, i in enumerate(fp_keys):
            df_cluster = holder[i].copy()
            # Keep only labeled compounds, dropping duplicated IDs.
            df_cluster = df_cluster.loc[df_cluster.index.isin(labels.keys())]
            df_cluster = df_cluster[~df_cluster.index.duplicated(keep='last')]
            if standardize:
                from mlxtend.preprocessing import standardize as st
                classes = df_cluster.index.copy()
                df_cluster.reset_index(inplace=True, drop=True)
                df_cluster = st(df_cluster)
            else:
                classes = df_cluster.index.copy()
            pca = PCA(n_components=2)
            temp = pca.fit_transform(df_cluster)
            df[ind] = pd.DataFrame(index=df_cluster.index, data=temp)
            df[ind]['classes'] = classes
            df[ind]['classes'] = df[ind]['classes'].map(labels)
        title = 'PCA'
    else:  # to LDA
        from mlxtend.feature_extraction import LinearDiscriminantAnalysis as LDA
        from sklearn.preprocessing import LabelEncoder
        # binary https://stats.stackexchange.com/questions/178587/why-is-the-rank-of-covariance-matrix-at-most-n-1/180366#180366
        df = dict()
        for ind, i in enumerate(fp_keys):
            df_cluster = holder[i].copy()
            df_cluster = df_cluster.loc[df_cluster.index.isin(labels.keys())]
            df_cluster = df_cluster[~df_cluster.index.duplicated(keep='last')]
            if standardize:
                from mlxtend.preprocessing import standardize as st
                from sklearn.preprocessing import MinMaxScaler
                classes = df_cluster.index.copy()
                df_cluster.reset_index(inplace=True, drop=True)
                mms = MinMaxScaler()
                # BUGFIX: was ``columns=df.columns`` — ``df`` is a dict here
                # and has no .columns; the scaled frame keeps its own columns.
                df_cluster = pd.DataFrame(data=mms.fit_transform(df_cluster),
                                          index=df_cluster.index,
                                          columns=df_cluster.columns)
            else:
                classes = df_cluster.index.copy()
            df_cluster['classes'] = classes
            df_cluster['classes'] = df_cluster['classes'].map(labels)
            if binarize_class:
                # BUGFIX: was ``'not ' + 'class_name'`` (a literal), so every
                # negative label read "not class_name" regardless of the class.
                df_cluster.loc[df_cluster.classes != class_name,
                               'classes'] = 'not ' + class_name
            # change labels from str to int
            enc = LabelEncoder()
            real_classes = df_cluster.loc[:, 'classes']
            df_cluster.loc[:, 'classes'] = enc.fit_transform(
                df_cluster['classes'])
            classes = df_cluster.pop('classes')
            # Optional dimensionality reduction before the LDA itself.
            if preprocess_lda == 'PLS':
                from sklearn.cross_decomposition import PLSRegression
                pls = PLSRegression(n_components=10, scale=False)
                temp = pls.fit_transform(df_cluster.values, classes.values)[0]
            elif preprocess_lda == 'PCA':
                from sklearn.decomposition import PCA
                pca = PCA(n_components=0.95, svd_solver='full', whiten=False)
                temp = pca.fit_transform(df_cluster.values)
            elif preprocess_lda == 'kernelPCA':
                from sklearn.decomposition import KernelPCA
                pca = KernelPCA(kernel="rbf", gamma=5)
                temp = pca.fit_transform(df_cluster.values)
            elif preprocess_lda == 'NONE':
                temp = df_cluster.values
            # lda
            lda = LDA(n_discriminants=2)
            lda.fit(temp, classes.values)
            temp = lda.transform(temp)
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    'ignore',
                    'Casting complex values to real discards the imaginary part'
                )
                # BUGFIX: np.float was removed in NumPy 1.24; the builtin
                # float is the documented replacement.
                temp = temp.astype(float)  # in case of complex numbers///
            df[ind] = pd.DataFrame(index=df_cluster.index, columns=[0, 1],
                                   data=temp)
            df[ind]['classes'] = real_classes
        title = 'LDA'

    sns.set_context(context='talk')
    sns.set_style('dark')
    sns.set_style({'font.family': 'serif', 'font.sans-serif': ['Helvetica']})
    fig, ((ax1, ax2, ax3), (ax4, ax5, ax6),
          (ax7, ax8, ax9)) = plt.subplots(3, 3, figsize=(13, 14))
    cm = plt.cm.get_cmap('Spectral')
    my_cmap = cm(np.linspace(0, 1, len(np.unique(df[ind]['classes']))),
                 alpha=0.6)
    if return_distances:
        distances = dict()
        sil_scores = dict()
        chs_scores = dict()
    for ax_n, key, x, name in zip(
            [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9],
            df.keys(), df.values(),
            ['E3FP', 'Morgan_300', 'Topo_1024', 'Infomax', 'VAE_256',
             'VAE_16', 'Trans_1024', 'Trans_64', 'GAE_64']):
        if not binarize_class:
            # One scatter series per class; the class of interest is drawn
            # in black with a pixel marker so it stands out.
            for ind, i in enumerate(np.unique(x['classes'])):
                color = my_cmap[ind]
                marker = '.'
                if i == class_name:
                    # BUGFIX: original had ``color = 'black',`` — the
                    # trailing comma made color a 1-tuple, not a color name.
                    color = 'black'
                    marker = ','
                ax_n.scatter(
                    x.loc[x.classes == i, 0],
                    x.loc[x.classes == i, 1],
                    marker=marker,
                    label=i + f' (n={str(len(x.loc[x.classes==i, 0]))}) vs Rest ({str(len(x.loc[x.classes!=i, 0]))})',
                    color=color)
                ax_n.title.set_text(name)
        else:
            ax_n.scatter(x.loc[:, 0], x.loc[:, 1], marker='.')
            ax_n.scatter(
                x.loc[x.classes == class_name, 0],
                x.loc[x.classes == class_name, 1],
                marker=',',
                label=class_name + f' (n={str(len(x.loc[x.classes==class_name, 0]))}) vs rest (n={str(len(x.loc[x.classes!=class_name, 0]))})',
                color='darkorange')
            ax_n.title.set_text(name)
        if cluster:
            from sklearn.cluster import KMeans
            from scipy.spatial.distance import pdist
            from sklearn.metrics import silhouette_score as sil
            from sklearn.metrics import calinski_harabasz_score as chs
            # One-cluster KMeans per group = the group centroid.
            km = KMeans(init='k-means++', n_clusters=1, n_init=10)
            km.fit(x.loc[x.classes != class_name, [0, 1]])
            km1 = KMeans(init='k-means++', n_clusters=1, n_init=10)
            km1.fit(x.loc[x.classes == class_name, [0, 1]])
            ax_n.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
                         marker='X', color='darkblue', s=100, linewidth=3)
            ax_n.scatter(km1.cluster_centers_[:, 0], km1.cluster_centers_[:, 1],
                         marker='X', color='red', s=100, linewidth=3)
            d = round(
                pdist([km.cluster_centers_[0], km1.cluster_centers_[0]],
                      metric='euclidean')[0], 3)
            d_sc = round(sil(x.loc[:, [0, 1]], x['classes']), 3)
            d_chs = round(chs(x.loc[:, [0, 1]], x['classes']), 3)
            if return_distances:
                cl_name = class_name + ' ' + name
                distances[cl_name] = d
                sil_scores[cl_name] = d_sc
                chs_scores[cl_name] = d_chs
            name = name + '\n|d:' + str(d) + '|sil:' + str(
                d_sc) + '|chs:' + str(d_chs)
            ax_n.title.set_text(name)
    for ax in [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9]:
        ax.set_xticks([])
        ax.set_yticks([])
    labels = ax_n.get_legend_handles_labels()[1]
    if only_pca:
        fig.suptitle(labels[0] + "\n classified with: " + title)
    else:
        fig.suptitle(labels[0] + "\n classified with: " + title
                     + f', preprocessed with: {preprocess_lda}')
    fig.tight_layout()
    if not return_distances:
        return fig
    else:
        return fig, distances, sil_scores, chs_scores
plt.savefig('param_sensitivity_{}.svg'.format('learn_rate'))

# Driver: load the panorama datasets, score the uncorrected baseline and the
# scran MNN / Seurat CCA corrected matrices, then kick off the parameter
# sensitivity sweeps (copies of distr/xlabels are passed so each sweep starts
# from the same three baselines).
if __name__ == '__main__':
    with open('conf/panorama.txt') as f:
        data_names = f.read().split()

    # One label per cell, whitespace-separated in a single text file.
    labels = np.array(open('data/cell_labels/all.txt').read().rstrip().split())
    idx = range(labels.shape[0])

    datasets, genes_list, n_cells = load_names(data_names)
    datasets, genes = merge_datasets(datasets, genes_list)
    datasets_dimred, genes = process_data(datasets, genes)

    # Baseline: no batch correction.
    # NOTE(review): np.median is applied, so 'sil' appears to return
    # per-sample silhouette values (silhouette_samples) — confirm the alias.
    X = np.concatenate(datasets_dimred)
    sil_non = sil(X[idx, :], labels[idx])
    print(np.median(sil_non))

    # scran MNN-corrected expression matrix.
    X = np.loadtxt('data/corrected_mnn.txt')
    sil_mnn = sil(X[idx, :], labels[idx])
    print(np.median(sil_mnn))

    # Seurat CCA-corrected expression matrix.
    X = np.loadtxt('data/corrected_seurat.txt')
    sil_cca = sil(X[idx, :], labels[idx])
    print(np.median(sil_cca))

    distr = [sil_non, sil_mnn, sil_cca]
    xlabels = ['No correction', 'scran MNN', 'Seurat CCA']

    # Test alignment parameters.
    test_approx(datasets_dimred[:], genes, labels, idx, distr[:], xlabels[:])
# Script fragment: loads cell labels and datasets, then scores the
# uncorrected baseline. Relies on 'path', 'output' and 'data_names' being
# bound earlier (outside this chunk).
labels = np.array(
    open(path + 'cell_labels/all.txt').read().rstrip().split()
)
idx = range(labels.shape[0])
#idx = np.random.choice(len(labels), size=int(len(labels)/2), replace=False)

datasets, genes_list, cells_list, n_cells = load_names(data_names)
datasets, genes = merge_datasets(datasets, genes_list)
datasets_dimred, datasets_norm, genes = process_data(datasets, genes)
#mmwrite(output + 'panorama_nocorrection_silh.mtx', vstack(datasets_dimred))

# Baseline without correction.
# NOTE(review): np.median is applied below, so 'sil' appears to return
# per-sample silhouette values (silhouette_samples) — confirm the alias.
X = np.concatenate(datasets_dimred)
sil_non = sil(X[idx, :], labels[idx])
print(np.median(sil_non))

# Disabled: comparison against precomputed MNN/CCA corrections.
'''
# scran MNN.
X = np.loadtxt(path + 'corrected_mnn.txt')
sil_mnn = sil(X[idx, :], labels[idx])
print(np.median(sil_mnn))

# Seurat CCA.
X = np.loadtxt(path + 'corrected_seurat.txt')
sil_cca = sil(X[idx, :], labels[idx])
print(np.median(sil_cca))
'''

# Scanorama.