def plot_single_projection(holder, labels, class_name='Antioxidants',
                           fp_name='fps_e3fp_1024bit', standardize=True,
                           preprocess_lda='PCA'):
    """Plot a 2-D LDA projection of one class against all other samples.

    The fingerprint table ``holder[fp_name]`` is restricted to the rows whose
    index appears in ``labels``, optionally standardized, optionally reduced
    with the method selected by ``preprocess_lda``, and finally projected
    onto two linear discriminants. Both groups (``class_name`` and the rest)
    are drawn as scatter plots together with their centroids; the Euclidean
    distance between the two centroids is appended to the figure title.

    Args:
        holder: dictionary with DataFrames as values and fp-filenames as keys.
        labels: mapping of DrugCombID -> ATC_class; its keys are matched
            against the index of the selected DataFrame.
        class_name: the class that is contrasted with every other sample.
        fp_name: key of ``holder`` selecting the fingerprint table.
        standardize: if true, the table is passed through mlxtend's
            ``standardize`` before any further processing.
        preprocess_lda: transformation applied before LDA; one of
            ``'PLS'``, ``'PCA'``, ``'kernelPCA'``, ``'NCA'`` or ``'NONE'``.

    Returns:
        The matplotlib figure holding the scatter plot.

    Raises:
        ValueError: if ``preprocess_lda`` is not a supported value, or if the
            selected rows do not contain both ``class_name`` samples and
            samples of other classes.
        KeyError: if ``fp_name`` is not a key of ``holder``.
    """
    # Validate the cheap arguments first so a typo fails fast with a clear
    # message instead of a NameError after all the heavy work has been done.
    valid_preprocessors = ('PLS', 'PCA', 'kernelPCA', 'NONE', 'NCA')
    if preprocess_lda not in valid_preprocessors:
        raise ValueError(
            'preprocess_lda must be one of {}, got {!r}'.format(
                valid_preprocessors, preprocess_lda))

    from mlxtend.preprocessing import standardize as st
    from sklearn.preprocessing import LabelEncoder
    from sklearn.cluster import KMeans
    # mlxtend's LDA is used on purpose: with sklearn's LDA a dummy class
    # would have to be added to obtain 2 components after transformation.
    from mlxtend.feature_extraction import LinearDiscriminantAnalysis
    from scipy.spatial.distance import pdist

    # Keep only labelled rows and drop duplicated ids (last occurrence wins).
    # The trailing copy() makes the frame independent of ``holder``.
    df_cluster = holder[fp_name]
    df_cluster = df_cluster.loc[df_cluster.index.isin(labels.keys())]
    df_cluster = df_cluster[~df_cluster.index.duplicated(keep='last')].copy()

    # Remember the sample ids before the index is (possibly) reset.
    sample_ids = df_cluster.index.copy()
    if standardize:
        df_cluster = df_cluster.reset_index(drop=True)
        df_cluster = st(df_cluster)

    # The ids are assigned positionally and then mapped to their class.
    df_cluster['classes'] = sample_ids
    df_cluster['classes'] = df_cluster['classes'].map(labels)
    # Collapse every other class into a single "not <class_name>" group.
    other_rows = df_cluster['classes'] != class_name
    df_cluster.loc[other_rows, 'classes'] = 'not ' + class_name

    # Detach the string labels from the frame so that later modifications of
    # the frame can never leak into them.
    real_classes = df_cluster.pop('classes').copy()
    is_target = real_classes == class_name
    if not is_target.any() or is_target.all():
        raise ValueError(
            'Both {!r} samples and samples of other classes are required '
            'for the projection.'.format(class_name))

    # Integer labels (as a plain ndarray) for the supervised estimators.
    encoded = LabelEncoder().fit_transform(real_classes)
    features = df_cluster.values

    if preprocess_lda == 'PLS':
        from sklearn.cross_decomposition import PLSRegression
        pls = PLSRegression(n_components=10, scale=False)
        reduced = pls.fit_transform(features, encoded)[0]
    elif preprocess_lda == 'PCA':
        from sklearn.decomposition import PCA
        pca = PCA(n_components=0.95, svd_solver='full', whiten=False)
        reduced = pca.fit_transform(features)
    elif preprocess_lda == 'kernelPCA':
        from sklearn.decomposition import KernelPCA
        kpca = KernelPCA(kernel="rbf", gamma=5)
        reduced = kpca.fit_transform(features)
    elif preprocess_lda == 'NCA':
        from sklearn.neighbors import NeighborhoodComponentsAnalysis
        nca = NeighborhoodComponentsAnalysis()
        reduced = nca.fit_transform(features, encoded)
    else:  # 'NONE': feed the (standardized) features to LDA unchanged
        reduced = features

    lda = LinearDiscriminantAnalysis(n_discriminants=2)
    lda.fit(reduced, encoded)
    projected = lda.transform(reduced)
    with warnings.catch_warnings():
        warnings.filterwarnings(
            'ignore',
            'Casting complex values to real discards the imaginary part')
        # The projection may come back complex; keep the real part only.
        # (``np.float`` was removed from NumPy, the builtin is equivalent.)
        projected = projected.astype(float)

    df = pd.DataFrame(index=df_cluster.index, columns=[0, 1], data=projected)
    df['classes'] = real_classes
    target_mask = df['classes'] == class_name
    rest_mask = ~target_mask

    # A single-cluster KMeans yields the centroid of each group.
    km = KMeans(init='k-means++', n_clusters=1, n_init=10)
    km.fit(df.loc[rest_mask, [0, 1]])
    km1 = KMeans(init='k-means++', n_clusters=1, n_init=10)
    km1.fit(df.loc[target_mask, [0, 1]])

    # Distance between the two centroids, shown in the title.
    d = pdist([km.cluster_centers_[0], km1.cluster_centers_[0]])
    d = str(round(d[0], 3))

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(df.loc[rest_mask, 0], df.loc[rest_mask, 1],
               marker=',', color='grey')
    ax.scatter(df.loc[target_mask, 0], df.loc[target_mask, 1],
               marker=',', color='orange')
    ax.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
               marker='X', color='green', linewidths=30)
    ax.scatter(km1.cluster_centers_[:, 0], km1.cluster_centers_[:, 1],
               marker='X', color='red', linewidths=30)
    fig.suptitle(class_name + ' ' + d)
    return fig
def LDA(X_train, y_train, X):
    """Fit a two-discriminant LDA on the training data and project ``X``.

    Args:
        X_train: feature matrix used to fit the model.
        y_train: class labels belonging to ``X_train``
            (presumably integer-encoded -- TODO confirm against callers).
        X: feature matrix to project with the fitted model.

    Returns:
        ``X`` transformed by the fitted LinearDiscriminantAnalysis.
    """
    # Imported locally so the function does not depend on a module-level
    # import that may not exist.
    from mlxtend.feature_extraction import LinearDiscriminantAnalysis

    lda = LinearDiscriminantAnalysis(n_discriminants=2)
    lda.fit(X_train, y_train)
    X_lda = lda.transform(X)
    return X_lda