def compute_tsne(self):
    """Project the molecules of ``self.Database2`` onto two t-SNE axes.

    Morgan bit-vector fingerprints (second argument 3, i.e. the radius) are
    generated from the ``SMILES`` column, their pairwise Tanimoto
    similarities are expanded into a full symmetric matrix, and the
    ``1 - similarity`` matrix is passed to ``TSNE.fit_transform`` using
    ``self.perplexity``.

    The embedding is stored in ``self.tsne_result`` with columns ``PC1``,
    ``PC2``, ``LIBRARY``, ``SMILES`` and ``NAME``, indexed by ``TIPO``
    (which is filled from the ``LIBRARY`` column). Nothing is returned.
    """
    Database = self.Database2

    # Fingerprint every molecule described in the SMILES column.
    molecules = [Chem.MolFromSmiles(entry) for entry in list(Database["SMILES"])]
    fingerprints = [
        AllChem.GetMorganFingerprintAsBitVect(mol, 3) for mol in molecules
    ]

    # GetTanimotoSimMat yields only the strict lower triangle; start from an
    # all-ones matrix (unit diagonal), drop the values in, then mirror them.
    lower_triangle = GetTanimotoSimMat(fingerprints)
    size = len(fingerprints)
    similarity = np.ones((size, size))
    similarity[np.tril_indices(size, k=-1)] = lower_triangle
    upper = np.triu_indices(size, k=1)
    similarity[upper] = similarity.T[upper]

    distances = 1 - similarity

    reducer = TSNE(
        n_components=2,
        init='pca',
        random_state=1992,
        angle=0.3,
        perplexity=self.perplexity,
    )
    embedding = reducer.fit_transform(distances)

    frame = pd.DataFrame(data=embedding, columns=["PC1", "PC2"])
    # Plain lists are assigned positionally, independent of Database's index.
    frame["LIBRARY"] = list(Database.LIBRARY)
    frame["TIPO"] = list(Database.LIBRARY)
    frame["SMILES"] = list(Database.SMILES)
    frame["NAME"] = list(Database.NAME)

    self.tsne_result = frame.set_index('TIPO')
def compute_pca(self):
    """Run a 2-component PCA on the Tanimoto similarity matrix.

    Morgan bit-vector fingerprints (second argument 2, i.e. the radius) are
    generated from the ``SMILES`` column of ``self.Database2``. Their
    pairwise Tanimoto similarities are expanded into a full symmetric
    matrix, which is used directly as the PCA input.

    Side effects:
        ``self.pca_result`` -- the projection indexed by ``TIPO``.
        ``self.a`` / ``self.b`` -- explained variance of PC1 / PC2 as a
        percentage rounded to two decimals.

    Returns:
        The projection as a DataFrame with columns ``PC1``, ``PC2``,
        ``LIBRARY``, ``TIPO``, ``SMILES`` and ``NAME`` (default integer
        index; ``TIPO`` is filled from the ``LIBRARY`` column).
    """
    Database = self.Database2
    smiles = list(Database.SMILES)
    mols = [Chem.MolFromSmiles(x) for x in smiles]
    fps = [AllChem.GetMorganFingerprintAsBitVect(x, 2) for x in mols]

    # GetTanimotoSimMat yields only the strict lower triangle; start from an
    # all-ones matrix (unit diagonal), fill the lower part and mirror it.
    tanimoto_sim_mat_lower_triangle = GetTanimotoSimMat(fps)
    n_mol = len(fps)
    similarity_matrix = np.ones([n_mol, n_mol])
    i_lower = np.tril_indices(n=n_mol, m=n_mol, k=-1)
    i_upper = np.triu_indices(n=n_mol, m=n_mol, k=1)
    similarity_matrix[i_lower] = tanimoto_sim_mat_lower_triangle
    similarity_matrix[i_upper] = similarity_matrix.T[i_upper]

    sklearn_pca = sklearn.decomposition.PCA(
        n_components=2, svd_solver="full", whiten=True
    )
    sklearn_pca.fit(similarity_matrix)

    pca_result = pd.DataFrame(
        sklearn_pca.transform(similarity_matrix), columns=['PC1', 'PC2']
    )
    # Assign labels as plain lists so they are matched by position. Assigning
    # the Series directly would align on index labels and yield NaN or
    # misplaced labels whenever Database2 does not carry a default RangeIndex.
    pca_result["LIBRARY"] = list(Database.LIBRARY)
    pca_result["TIPO"] = list(Database.LIBRARY)
    pca_result["SMILES"] = list(Database.SMILES)
    pca_result["NAME"] = list(Database.NAME)
    self.pca_result = pca_result.set_index('TIPO')

    # Explained variance of each component, expressed as a percentage.
    variance = list(sklearn_pca.explained_variance_ratio_)
    self.a = round(variance[0] * 100, 2)
    self.b = round(variance[1] * 100, 2)
    return pca_result
def compute_maccskeys_fp(Data):
    """Build a MACCS-key Tanimoto distance matrix for the rows of ``Data``.

    Molecules are parsed from the ``canonical_smiles`` column and converted
    to MACCS-key fingerprints. Their pairwise Tanimoto similarities are
    expanded into a full symmetric matrix and turned into distances
    (``1 - similarity``) rounded to two decimals. The matrix is also printed
    to stdout.

    Returns:
        A ``(distance_matrix, ids)`` tuple, where ``ids`` is the
        ``chembl_id`` column as a list.
    """
    smiles_column = list(Data["canonical_smiles"])  # SMILES column to read
    molecules = [Chem.MolFromSmiles(entry) for entry in smiles_column]
    fingerprints = [MACCSkeys.GenMACCSKeys(mol) for mol in molecules]

    # GetTanimotoSimMat yields only the strict lower triangle; start from an
    # all-ones matrix (unit diagonal), drop the values in, then mirror them.
    lower_triangle = GetTanimotoSimMat(fingerprints)
    size = len(fingerprints)
    similarity = np.ones((size, size))
    similarity[np.tril_indices(size, k=-1)] = lower_triangle
    upper = np.triu_indices(size, k=1)
    similarity[upper] = similarity.T[upper]

    distance_matrix = np.round(1 - similarity, 2)
    print(distance_matrix)

    return distance_matrix, list(Data["chembl_id"])  # identifier column