def kmedoidsWithScores(filenameData, filenameSilhMean, nameDBS, nameCHS, kClusters, measure):
    """Cluster one sample file with k-medoids and append its scores to text files.

    The sample file is looked up inside the module-level ``root`` directory.
    When no such file exists the function returns without doing anything.

    Args:
        filenameData: Name of the sample file, relative to ``root``.
        filenameSilhMean: Output name passed to ``witTXT`` for the mean
            silhouette value returned by ``meanSilh``.
        nameDBS: Output name passed to ``witTXT`` for the ``dbs`` score.
        nameCHS: Output name passed to ``witTXT`` for the ``chs`` score.
        kClusters: Number of clusters handed to ``kmedoidsRun``.
        measure: Distance measure handed to ``kmedoidsRun``.

    Returns:
        None.
    """
    # Join with pathlib instead of a hard-coded backslash so the lookup also
    # works on platforms whose path separator is not '\\'.
    path = pathlib.Path(str(root)) / filenameData
    if not path.is_file():
        return

    data = read_sample(path)
    clusters, predicted = kmedoidsRun(data, kClusters, measure)

    # Every score line is tagged with the data set and the cluster count.
    note = filenameData + " k: " + str(kClusters)

    meanSilhouetteScore = meanSilh(data, clusters)
    witTXT(meanSilhouetteScore, filenameSilhMean, filepath=root, note=note)

    dbsScore = dbs(data, predicted)
    witTXT(dbsScore, nameDBS, filepath=root, note=note)

    chsScore = chs(data, predicted)
    witTXT(chsScore, nameCHS, filepath=root, note=note)
def kmedoidsWithScores(filenameData, filenameSilhMean, filenameDBS, filenameCHS, kClusters):
    """Run k-medoids on one sample file and append three scores to text files.

    The sample is read from ``root``; initial medoids come from
    ``randomCenters`` and the distance metric is the module-level
    ``metricResearch``. The cluster count is used exactly as given
    (automatic selection is not performed here).

    Args:
        filenameData: Name of the sample file inside ``root``.
        filenameSilhMean: Output name for the mean of the silhouette scores.
        filenameDBS: Output name for the value returned by ``dbs``.
        filenameCHS: Output name for the value returned by ``chs``.
        kClusters: Number of clusters / initial medoids.

    Returns:
        None.
    """
    sample_path = '\\'.join((str(root), filenameData))
    data = read_sample(sample_path)

    seeds = randomCenters(len(data), kClusters)
    model = kmedoids(data, seeds, metric=metricResearch)
    model.process()
    clusters = model.get_clusters()
    predicted = model.predict(data)

    # One silhouette value per point; only the mean is stored.
    per_point = silhouette(data, clusters).process().get_score()
    tag = 'k: ' + str(kClusters)
    witTXT(np.mean(per_point), filenameSilhMean, filepath=root, note=tag)
    witTXT(dbs(data, predicted), filenameDBS, filepath=root, note=tag)
    witTXT(chs(data, predicted), filenameCHS, filepath=root, note=tag)
def calculate_cluster(self, dist_mat, max_iter=21):
    """Try every cluster count in ``range(2, max_iter)`` and keep the best one.

    The input is transformed to ``sqrt(2 * dist_mat)``, its upper triangle is
    fed to ``ward`` and the resulting linkage is cut with ``fcluster`` for
    each candidate count. The CH score of every cut is recorded.

    Side effects:
        ``self.store_communities[k]`` receives the labels for each count k.
        ``self.store_score`` is rebuilt as a dict ordered by descending score.
        ``self.best_community`` is set to the labels with the highest score
        (only when some score exceeds 0).

    Args:
        dist_mat (numpy.array) : 1 - cross-correlation matrix.
        max_iter (int) : Exclusive upper bound on the cluster counts tried.

    Returns:
        True.
    """
    scaled = numpy.sqrt(2 * dist_mat)
    linkage = ward(numpy.triu(scaled))

    self.store_score = {}
    top_score = 0
    for n_clusters in range(2, max_iter):
        labels = fcluster(linkage, n_clusters, criterion='maxclust')
        self.store_communities[n_clusters] = labels

        current = chs(scaled, labels)
        self.store_score[n_clusters] = current
        if current > top_score:
            self.best_community, top_score = labels, current

    # Ascending sort followed by a reversal, so ties keep the same relative
    # order as in the reversed ascending list.
    ranked = sorted(self.store_score.items(), key=lambda pair: pair[1])
    ranked.reverse()
    self.store_score = dict(ranked)
    return True
def performance(encoder, models, K):
    """Evaluate an encoder with several clustering models and log mean scores.

    For each of ``K`` rounds the module-level ``features``/``target`` are
    re-encoded with ``encoder.fit_transform`` and every model in ``models`` is
    fitted on the encoded features. The per-round values of ``ami``, ``chs``
    and ``sil`` are averaged over the rounds, appended to
    ``../results/<name_prefix>_results.txt`` and a summary line is printed.

    Args:
        encoder: Object with ``fit_transform(features, target)``. Its class
            name minus the last seven characters is used as display name.
        models: Mapping of model name to an object with ``fit_predict``.
        K: Number of evaluation rounds to average over.

    Returns:
        None.
    """
    # np.float64 zeros keep the accumulator type of the original np.zeros init.
    mean_ami = dict.fromkeys(models, np.float64(0.0))
    mean_chs = dict.fromkeys(models, np.float64(0.0))
    mean_sil = dict.fromkeys(models, np.float64(0.0))

    tic = time.perf_counter()
    for _ in range(K):
        features_enc = encoder.fit_transform(features, target)
        for key, model in models.items():
            y_predict = model.fit_predict(features_enc, target)
            mean_ami[key] += ami(target, y_predict) / K
            mean_chs[key] += chs(features_enc, y_predict) / K
            mean_sil[key] += sil(features_enc, y_predict, metric='euclidean') / K
    toc = time.perf_counter()

    encoder_name = type(encoder).__name__[0:-7]
    elapsed = round(toc - tic, 3)

    # Write results to file; the context manager closes the handle even when
    # one of the writes raises.
    with open('../results/' + name_prefix + '_results.txt', 'a') as res:
        res.write(encoder_name + ' Encoder\n')
        for key in mean_ami:
            res.write(' ' + key + ': ' + str(mean_ami[key]) + ', '
                      + str(mean_chs[key]) + ', ' + str(mean_sil[key]) + '\n')
        res.write('Total time: ' + str(elapsed) + '\n')

    print('Evaluation of', encoder_name, 'Encoder completed in', elapsed, 's')
def kmediansWithScore(nameData, nameSilhouetteMean, nameDBS, nameCHS, k_clusters, measure, kmin, kmax):
    """Run k-medians on one sample file and compute its quality scores.

    The sample is read from ``root``, initial medians come from
    ``kppi(...).initialize()``, and after clustering the silhouette mean, the
    ``dbs`` and ``chs`` scores and an elbow analysis over ``kmin``..``kmax``
    are computed. None of the results is written or returned at present;
    ``nameSilhouetteMean``, ``nameDBS``, ``nameCHS`` and ``measure`` are
    accepted but not used.

    Args:
        nameData: Name of the sample file inside ``root``.
        nameSilhouetteMean: Unused output name for the mean silhouette score.
        nameDBS: Unused output name for the ``dbs`` score.
        nameCHS: Unused output name for the ``chs`` score.
        k_clusters: Number of clusters to seed and fit.
        measure: Unused.
        kmin: Lower bound handed to ``elbow``.
        kmax: Upper bound handed to ``elbow``.

    Returns:
        None.
    """
    data = read_sample('\\'.join((str(root), nameData)))

    seeds = kppi(data, k_clusters).initialize()
    model = kmedians(data, seeds)
    model.process()
    clusters = model.get_clusters()
    predicted = model.predict(data)

    # Scores are evaluated but not persisted.
    per_point_silhouette = silhouette(data, clusters).process().get_score()
    silhouette_mean = np.mean(per_point_silhouette)
    dbs_value = dbs(data, predicted)
    chs_value = chs(data, predicted)

    # Elbow analysis: suggested cluster count and the error values per k.
    elbow_search = elbow(data, kmin, kmax)
    elbow_search.process()
    suggested_k = elbow_search.get_amount()
    wce_values = elbow_search.get_wce()
def plot_projections(holder,
                     labels,
                     preprocess_lda='PCA',
                     class_name='Antioxidants',
                     only_pca=False,
                     binarize_class=True,
                     standardize=True,
                     cluster=True,
                     return_distances=False):
    '''
    Project nine fingerprint tables to 2-D and draw them in a 3x3 scatter grid.

    holder should be a dictionary with df's as values and fp-filenames as keys
    labels should be a mapping of DrugCombID: ATC_class

    Only rows whose index appears in ``labels`` are used, and duplicated index
    values keep their last occurrence.

    Parameters
    ----------
    preprocess_lda : one of 'PLS', 'PCA', 'kernelPCA', 'NONE'
        Transformation applied before LDA (ignored when ``only_pca``).
    class_name : label that is highlighted and compared against the rest.
    only_pca : project with a 2-component PCA instead of LDA.
    binarize_class : in LDA mode, relabel every other class as
        ``'not ' + class_name``; in the plot, draw all points plus the
        highlighted class instead of one series per class.
    standardize : scale the features first (mlxtend ``standardize`` for PCA,
        ``MinMaxScaler`` for LDA).
    cluster : mark the centre of each of the two groups and add centre
        distance, silhouette and Calinski-Harabasz scores to the panel title.
    return_distances : also return the three score dictionaries.

    Returns
    -------
    ``fig`` or, when ``return_distances`` is set,
    ``(fig, distances, sil_scores, chs_scores)``.

    Raises
    ------
    ValueError
        If LDA is requested with an unknown ``preprocess_lda`` value.
    '''
    fingerprint_keys = [
        'fps_e3fp_1024bit', 'fps_morgan_1024bit', 'fps_topo_1024bit',
        'fps_infomax_new', 'fps_VAE_256bit_new', 'fps_VAE_16bit_new',
        'fps_transformer_1024bit_new', 'fps_transformer_64bit_new',
        'fps_gae_64bit_new'
    ]
    panel_names = [
        'E3FP', 'Morgan_300', 'Topo_1024', 'Infomax', 'VAE_256', 'VAE_16',
        'Trans_1024', 'Trans_64', 'GAE_64'
    ]

    def _labelled_rows(fp_key):
        # Labelled rows only, last occurrence of each index value.
        frame = holder[fp_key].copy()
        frame = frame.loc[frame.index.isin(labels.keys())]
        return frame[~frame.index.duplicated(keep='last')]

    df = dict()
    if only_pca:
        from sklearn.decomposition import PCA
        if standardize:
            from mlxtend.preprocessing import standardize as st
        for ind, fp_key in enumerate(fingerprint_keys):
            df_cluster = _labelled_rows(fp_key)
            classes = df_cluster.index.copy()
            if standardize:
                df_cluster.reset_index(inplace=True, drop=True)
                df_cluster = st(df_cluster)
            pca = PCA(n_components=2)
            temp = pca.fit_transform(df_cluster)
            df[ind] = pd.DataFrame(index=df_cluster.index, data=temp)
            df[ind]['classes'] = classes
            df[ind]['classes'] = df[ind]['classes'].map(labels)
        title = 'PCA'
    else:  # to LDA
        from mlxtend.feature_extraction import LinearDiscriminantAnalysis as LDA
        from sklearn.preprocessing import LabelEncoder
        if preprocess_lda not in ('PLS', 'PCA', 'kernelPCA', 'NONE'):
            # Previously an unknown value fell through every branch and failed
            # later on an undefined variable.
            raise ValueError(
                "preprocess_lda must be one of 'PLS', 'PCA', 'kernelPCA', "
                f"'NONE'; got {preprocess_lda!r}")
        # binary https://stats.stackexchange.com/questions/178587/why-is-the-rank-of-covariance-matrix-at-most-n-1/180366#180366
        for ind, fp_key in enumerate(fingerprint_keys):
            df_cluster = _labelled_rows(fp_key)
            classes = df_cluster.index.copy()
            if standardize:
                from sklearn.preprocessing import MinMaxScaler
                df_cluster.reset_index(inplace=True, drop=True)
                mms = MinMaxScaler()
                # Column names come from the feature frame itself (the
                # original read them from the result dict, which has none).
                df_cluster = pd.DataFrame(data=mms.fit_transform(df_cluster),
                                          index=df_cluster.index,
                                          columns=df_cluster.columns)
            df_cluster['classes'] = classes
            df_cluster['classes'] = df_cluster['classes'].map(labels)
            if binarize_class:
                df_cluster.loc[df_cluster.classes != class_name,
                               'classes'] = 'not ' + class_name
            # Detach the string labels before encoding so they can never be
            # overwritten by the integer codes.
            real_classes = df_cluster.pop('classes')
            # change labels from str to int
            class_codes = LabelEncoder().fit_transform(real_classes)

            features = df_cluster.values
            if preprocess_lda == 'PLS':
                from sklearn.cross_decomposition import PLSRegression
                pls = PLSRegression(n_components=10, scale=False)
                temp = pls.fit_transform(features, class_codes)[0]
            elif preprocess_lda == 'PCA':
                from sklearn.decomposition import PCA
                pca = PCA(n_components=0.95, svd_solver='full', whiten=False)
                temp = pca.fit_transform(features)
            elif preprocess_lda == 'kernelPCA':
                from sklearn.decomposition import KernelPCA
                pca = KernelPCA(kernel="rbf", gamma=5)
                temp = pca.fit_transform(features)
            else:  # 'NONE'
                temp = features

            # lda
            lda = LDA(n_discriminants=2)
            lda.fit(temp, class_codes)
            temp = lda.transform(temp)
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    'ignore',
                    'Casting complex values to real discards the imaginary part'
                )
                # in case of complex numbers; builtin float replaces the
                # removed np.float alias
                temp = temp.astype(float)
            df[ind] = pd.DataFrame(index=df_cluster.index,
                                   columns=[0, 1],
                                   data=temp)
            df[ind]['classes'] = real_classes
        title = 'LDA'

    sns.set_context(context='talk')
    sns.set_style('dark')
    sns.set_style({'font.family': 'serif', 'font.sans-serif': ['Helvetica']})
    fig, axes_grid = plt.subplots(3, 3, figsize=(13, 14))
    axes = list(axes_grid.flat)  # row-major: ax1 .. ax9

    # plt.get_cmap is available across Matplotlib versions, unlike
    # plt.cm.get_cmap.
    cm = plt.get_cmap('Spectral')
    # One colour per class of the last projection, as before.
    last_projection = df[len(fingerprint_keys) - 1]
    my_cmap = cm(np.linspace(0, 1, len(np.unique(last_projection['classes']))),
                 alpha=0.6)

    distances = dict()
    sil_scores = dict()
    chs_scores = dict()
    for ax_n, x, name in zip(axes, df.values(), panel_names):
        is_target = x.classes == class_name
        if not binarize_class:
            for color_ind, cls in enumerate(np.unique(x['classes'])):
                in_class = x.classes == cls
                color = my_cmap[color_ind]
                marker = '.'
                if cls == class_name:
                    color = 'black'
                    marker = ','
                n_in = int(in_class.sum())
                n_out = int((~in_class).sum())
                ax_n.scatter(x.loc[in_class, 0],
                             x.loc[in_class, 1],
                             marker=marker,
                             label=cls + f' (n={n_in}) vs Rest ({n_out})',
                             color=color)
        else:
            ax_n.scatter(x.loc[:, 0], x.loc[:, 1], marker='.')
            n_in = int(is_target.sum())
            n_out = int((~is_target).sum())
            ax_n.scatter(x.loc[is_target, 0],
                         x.loc[is_target, 1],
                         marker=',',
                         label=class_name +
                         f' (n={n_in}) vs rest (n={n_out})',
                         color='darkorange')
        ax_n.title.set_text(name)

        if cluster:
            from sklearn.cluster import KMeans
            from scipy.spatial.distance import pdist
            from sklearn.metrics import silhouette_score as sil
            from sklearn.metrics import calinski_harabasz_score as chs
            # A single-cluster KMeans per group yields that group's centre.
            km = KMeans(init='k-means++', n_clusters=1, n_init=10)
            km.fit(x.loc[~is_target, [0, 1]])
            km1 = KMeans(init='k-means++', n_clusters=1, n_init=10)
            km1.fit(x.loc[is_target, [0, 1]])
            ax_n.scatter(km.cluster_centers_[:, 0],
                         km.cluster_centers_[:, 1],
                         marker='X',
                         color='darkblue',
                         s=100,
                         linewidth=3)
            ax_n.scatter(km1.cluster_centers_[:, 0],
                         km1.cluster_centers_[:, 1],
                         marker='X',
                         color='red',
                         s=100,
                         linewidth=3)
            d = round(
                pdist([km.cluster_centers_[0], km1.cluster_centers_[0]],
                      metric='euclidean')[0], 3)
            d_sc = round(sil(x.loc[:, [0, 1]], x['classes']), 3)
            d_chs = round(chs(x.loc[:, [0, 1]], x['classes']), 3)
            if return_distances:
                cl_name = class_name + ' ' + name
                distances[cl_name] = d
                sil_scores[cl_name] = d_sc
                chs_scores[cl_name] = d_chs
            name = name + '\n|d:' + str(d) + '|sil:' + str(
                d_sc) + '|chs:' + str(d_chs)
            ax_n.title.set_text(name)

    for ax in axes:
        ax.set_xticks([])
        ax.set_yticks([])

    # The figure title reuses the first legend label of the last panel; a
    # separate name avoids shadowing the ``labels`` argument.
    legend_labels = ax_n.get_legend_handles_labels()[1]
    if only_pca:
        fig.suptitle(legend_labels[0] + "\n classified with: " + title)
    else:
        fig.suptitle(legend_labels[0] + "\n classified with: " + title +
                     f', preprocessed with: {preprocess_lda}')
    fig.tight_layout()

    if not return_distances:
        return fig
    return fig, distances, sil_scores, chs_scores
def kmedoidsWithScore(nameData, nameSilhouetteMean, nameDBS, nameCHS, k_clusters, measure, kmin, kmax):
    """Run k-medoids on one sample file and compute its quality scores.

    The sample is read from ``root``. The cluster count is chosen by
    ``canoc(data, kmin, kmax)`` and the initial medoids by
    ``rci(...).initialize()``. The silhouette mean and the ``dbs``/``chs``
    scores are computed but currently neither written nor returned.

    Args:
        nameData: Name of the sample file inside ``root``.
        nameSilhouetteMean: Unused output name for the mean silhouette score.
        nameDBS: Unused output name for the ``dbs`` score.
        nameCHS: Unused output name for the ``chs`` score.
        k_clusters: Unused; the count comes from ``canoc``.
        measure: Unused.
        kmin: Lower bound handed to ``canoc``.
        kmax: Upper bound handed to ``canoc``.

    Returns:
        None.
    """
    # Read the file named by the argument; the original read the global
    # ``filenameData`` and silently ignored ``nameData``.
    data = read_sample(str(root) + '\\' + nameData)
    kClusters = canoc(data, kmin, kmax)
    initial_medoids = rci(data, kClusters).initialize()
    kmedoids_instance = kmedoids(data, initial_medoids)
    kmedoids_instance.process()
    clusters = kmedoids_instance.get_clusters()
    predicted = kmedoids_instance.predict(data)

    # Scores are evaluated but not persisted.
    silhouetteScore = silhouette(data, clusters).process().get_score()
    meanSilhouetteScore = np.mean(silhouetteScore)
    dbsScore = dbs(data, predicted)
    chsScore = chs(data, predicted)


kmedoidsWithScore(filenameData, filenameSilhouetteMean, filenameDBS, filenameCHS, k, metric, k_min, k_max)