def fp_ap_std_mp(mol, mol_can, i, nBits, chiral):
    """Fingerprint a molecule and its canonical counterpart with hashed atom pairs.

    Args:
        mol: Molecule handed to ``Pairs.GetHashedAtomPairFingerprint``.
        mol_can: Canonical counterpart of ``mol``; may be the very same object.
        i: Identifier returned unchanged as the first tuple element.
        nBits: Forwarded as the ``nBits`` keyword.
        chiral: Forwarded as the ``includeChirality`` keyword.

    Returns:
        The tuple ``(i, fp_mol, fp_can)``.
    """

    def hashed_atom_pairs(target):
        return Pairs.GetHashedAtomPairFingerprint(
            target, nBits=nBits, includeChirality=chiral)

    fp_mol = hashed_atom_pairs(mol)
    # When both arguments are the same object the fingerprint is shared
    # instead of being computed a second time.
    fp_can = fp_mol if mol is mol_can else hashed_atom_pairs(mol_can)
    return (i, fp_mol, fp_can)
def score_model(self, model_configuration: dict, fragments_file: str,
                descriptors_file: str, output_file: str):
    """Score every molecule in ``fragments_file`` against the active set.

    Each input line is parsed as JSON and must provide ``"smiles"`` and
    ``"name"``. The score is the highest Tanimoto similarity between the
    molecule's atom-pair fingerprint and the fingerprints of the molecules
    listed under ``model_configuration["data"]["active"]``. One JSON object
    ``{"name": ..., "score": ...}`` is written per input line, separated by
    newlines (no trailing newline).

    ``descriptors_file`` is accepted but not used by this implementation.
    """
    inputoutput_utils.create_parent_directory(output_file)

    # Fingerprints of the known actives; surrounding double quotes are
    # stripped from each SMILES before parsing.
    active_fingerprints = [
        Pairs.GetAtomPairFingerprint(
            Chem.MolFromSmiles(active_smiles.strip("\"")))
        for active_smiles in model_configuration["data"]["active"]
    ]

    with open(output_file, "w", encoding="utf-8") as output_stream, \
            open(fragments_file, "r", encoding="utf-8") as input_stream:
        for position, raw_line in enumerate(input_stream):
            record = json.loads(raw_line)
            candidate = Chem.MolFromSmiles(record["smiles"].strip("\""))
            candidate_fp = Pairs.GetAtomPairFingerprint(candidate)
            best_similarity = max(
                DataStructs.TanimotoSimilarity(candidate_fp, active_fp)
                for active_fp in active_fingerprints)
            score = {"name": record["name"], "score": best_similarity}
            # Records are newline separated, so every record except the
            # first is preceded by a line break.
            if position > 0:
                output_stream.write("\n")
            json.dump(score, output_stream)
def atom_pairs():
    """Print a short demonstration of atom-pair fingerprints.

    Builds three example molecules and prints, in order: the non-zero
    elements of the last molecule's count fingerprint, the explanation of
    bit 558115, the Dice similarity of the first two count fingerprints, and
    the Dice similarity of the first two bit-vector fingerprints.
    """
    example_smiles = ('C1CCC1OCC', 'CC(C)OCC', 'CCOCC')
    molecules = [Chem.MolFromSmiles(smiles) for smiles in example_smiles]

    # Count-based fingerprints.
    count_fps = [Pairs.GetAtomPairFingerprint(mol) for mol in molecules]

    # Bits and their counts for the last molecule, as a dictionary.
    nonzero = count_fps[-1].GetNonzeroElements()
    print(nonzero)

    # Human-readable explanation of one bit id.
    print(Pairs.ExplainPairScore(558115))

    # Dice similarity between the first two count fingerprints.
    first_count, second_count = count_fps[0], count_fps[1]
    print(DataStructs.DiceSimilarity(first_count, second_count))

    # Same comparison on the bit-vector form (presence only, no counts).
    bit_fps = [Pairs.GetAtomPairFingerprintAsBitVect(mol) for mol in molecules]
    first_bits, second_bits = bit_fps[0], bit_fps[1]
    print(DataStructs.DiceSimilarity(first_bits, second_bits))
def extract_atompair_fragments(molecule: object) -> list:
    """List the atom-pair fragments of a molecule, one entry per occurrence.

    For every non-zero element of the molecule's atom-pair fingerprint the
    pair is decoded with ``Pairs.ExplainPairScore`` and an ``atom_pair_key``
    is rebuilt from the decoded atom descriptions.

    Args:
        molecule: Molecule passed to ``Pairs.GetAtomPairFingerprint``.

    Returns:
        A list of dicts with keys ``"smiles"`` (the two decoded atom strings
        concatenated), ``"index"`` (the rebuilt key), ``"type"`` (always
        ``"AP"``) and ``"size"`` (decoded distance plus one). A pair that
        occurs ``n`` times contributes ``n`` separate dicts.
    """
    output = []
    pair_counts = Pairs.GetAtomPairFingerprint(molecule).GetNonzeroElements()
    for pair, count in pair_counts.items():
        # Decode the pair once; the result is indexed as
        # (first atom, distance, second atom).
        explained = Pairs.ExplainPairScore(pair)
        first, second = explained[0], explained[2]

        atom1 = rdkit.Chem.AtomFromSmarts(first[0])
        atom2 = rdkit.Chem.AtomFromSmarts(second[0])
        smiles = first[0] + second[0]

        # Per-atom code: atomic number, then element [2] (pi bonds), then
        # element [1] (neighbours) of the decoded atom description.
        atom1_property_value = 64 * atom1.GetAtomicNum() + 16 * first[2] + first[1]
        atom2_property_value = 64 * atom2.GetAtomicNum() + 16 * second[2] + second[1]

        dist = explained[1] + 1
        low = min(atom1_property_value, atom2_property_value)
        high = max(atom1_property_value, atom2_property_value)
        atom_pair_key = low + 1024 * (high + 1024 * dist)

        # A fresh dict per occurrence so entries never alias each other.
        output.extend(
            {
                "smiles": smiles,
                "index": atom_pair_key,
                "type": "AP",
                "size": dist
            }
            for _ in range(count))
    return output
def fp_ap_std(mols, nBits, chiral):
    """Attach hashed atom-pair fingerprints to every entry of ``mols`` in place.

    Each ``mols[key]`` must provide ``"mol"`` and ``"mol_can"``. The
    fingerprint of ``"mol"`` is stored under ``"fp"`` and that of
    ``"mol_can"`` under ``"fp_can"``. When both entries are the same object
    the fingerprint object is shared rather than recomputed.

    Args:
        mols: Mapping whose values are mutable per-molecule records.
        nBits: Forwarded as the ``nBits`` keyword.
        chiral: Forwarded as the ``includeChirality`` keyword.
    """
    for key in mols:
        record = mols[key]
        molecule = record["mol"]
        fingerprint = Pairs.GetHashedAtomPairFingerprint(
            molecule, nBits=nBits, includeChirality=chiral)
        record["fp"] = fingerprint

        canonical = record["mol_can"]
        if molecule is canonical:
            record["fp_can"] = fingerprint
            continue
        record["fp_can"] = Pairs.GetHashedAtomPairFingerprint(
            canonical, nBits=nBits, includeChirality=chiral)
def caculate_similarity_atomPairs(smiles_A, smiles_B):
    """Return the Dice similarity of two SMILES using atom-pair fingerprints.

    Args:
        smiles_A: SMILES of the first molecule.
        smiles_B: SMILES of the second molecule.

    Returns:
        The similarity rounded to four decimals, or ``-1`` when parsing or
        fingerprinting either input fails.
    """
    try:
        fp_a = Pairs.GetAtomPairFingerprint(Chem.MolFromSmiles(smiles_A))
        fp_b = Pairs.GetAtomPairFingerprint(Chem.MolFromSmiles(smiles_B))
        return round(DataStructs.DiceSimilarity(fp_a, fp_b), 4)
    except Exception:
        # -1 is the documented failure sentinel. ``Exception`` (rather than
        # a bare ``except``) lets KeyboardInterrupt and SystemExit propagate.
        return -1
def sim_rdk_topo_fps(smiA, smisT):
    """Dice similarities between one SMILES and a list of SMILES.

    Uses RDKit atom-pair fingerprints.

    Args:
        smiA: SMILES string of the query molecule.
        smisT: List of SMILES strings to compare against.

    Returns:
        A list with one similarity per entry of ``smisT``, in the same order.
    """
    query_fp = Pairs.GetAtomPairFingerprint(getMolFromSmiles(smiA))
    # All target fingerprints are built before any similarity is computed.
    target_fps = [
        Pairs.GetAtomPairFingerprint(getMolFromSmiles(smiles))
        for smiles in smisT
    ]
    return [
        DataStructs.DiceSimilarity(query_fp, target_fp)
        for target_fp in target_fps
    ]
def findCluster(self, smiles):
    """Assign a SMILES to a scaffold cluster.

    The Murcko scaffold of the molecule is written as non-isomeric SMILES
    and used as the cluster name. If that name is not already known, the
    scaffold's atom-pair fingerprint is compared with the known
    fingerprints and the closest cluster is used when its similarity
    reaches ``self.minsimilarity``.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        ``(cluster, fingerprint, is_new)``. ``("", "", False)`` is returned
        when the SMILES cannot be parsed or no scaffold can be derived.
        ``is_new`` is True only when neither the cluster name nor a
        sufficiently similar fingerprint was found.
    """
    failure = ("", "", False)

    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return failure
    try:
        scaffold = MurckoScaffold.GetScaffoldForMol(mol)
    except Exception:
        # Scaffold extraction is best effort; interpreter-level signals
        # such as KeyboardInterrupt are no longer swallowed here.
        return failure
    if not scaffold:
        return failure

    cluster = Chem.MolToSmiles(scaffold, isomericSmiles=False)
    fp = Pairs.GetAtomPairFingerprint(scaffold)  # Change to Tanimoto?

    # Fetch the known fingerprints once so that the keys and values used
    # below are guaranteed to come from the same mapping.
    known = self.getFingerprints()
    if cluster in known:
        return cluster, fp, False

    sims = DataStructs.BulkTanimotoSimilarity(fp, list(known.values()))
    if len(sims) == 0:
        return cluster, fp, True

    closest = np.argmax(sims)
    if sims[closest] >= self.minsimilarity:
        return list(known.keys())[closest], fp, False
    return cluster, fp, True
def fp_atompairs_taut(query, nBits, chiral):
    """Store a hashed atom-pair fingerprint for every tautomer, in place.

    For each ``query[key]`` the molecules in ``query[key]["tauts"]`` are
    fingerprinted and the result for position ``j`` is stored under
    ``query[key]["fp<j>"]``.

    Args:
        query: Mapping of records, each holding a ``"tauts"`` sequence.
        nBits: Forwarded as the ``nBits`` keyword.
        chiral: Forwarded as the ``includeChirality`` keyword.
    """
    for key in query:
        record = query[key]
        for position, tautomer in enumerate(record["tauts"]):
            record[f"fp{position}"] = Pairs.GetHashedAtomPairFingerprint(
                tautomer, nBits=nBits, includeChirality=chiral)
def atom_pair_fp(self):
    """Build a (feature, label) array from the CSV at ``self.csv_path``.

    The CSV must provide the columns ``Smiles`` and ``Label``. For each
    SMILES the total value of its atom-pair integer-vector fingerprint is
    used as the single feature; labels are encoded with ``LabelEncoder``.
    Rows whose fingerprint cannot be computed are dropped, and rows with a
    duplicate feature value are reduced to one row.

    Returns:
        A float32 NumPy array with the feature in the first column and the
        encoded label in the second, in the row order produced by
        ``np.unique``.
    """
    df = pd.read_csv(self.csv_path)
    smiles_list = df['Smiles'].tolist()

    fingerprints = []
    not_found = []
    for i, smiles in enumerate(tqdm(smiles_list)):
        try:
            mol = Chem.MolFromSmiles(smiles)
            fp = Pairs.GetAtomPairFingerprintAsIntVect(mol)
            # The vector itself is huge, so only its total value is kept.
            fp._sumCache = fp.GetTotalVal()
            fingerprints.append(fp._sumCache)
            # NOTE(review): prints the whole list on every iteration;
            # looks like leftover debug output -- confirm before removing.
            print('fing', fingerprints)
        except Exception:
            # Keep a placeholder so positions stay aligned with the CSV
            # rows; ``Exception`` lets KeyboardInterrupt propagate.
            fingerprints.append(np.nan)
            not_found.append(i)

    df.drop(not_found, axis=0, inplace=True)
    print('Number of FPs not found: {}'.format(len(not_found)))
    df.reset_index(drop=True, inplace=True)

    labelencoder = LabelEncoder()
    Y = labelencoder.fit_transform(df['Label'].values)
    Y = Y.reshape(Y.shape[0], 1)
    print('Output shape: {}'.format(Y.shape))

    # Remove the placeholders of failed rows, then stack into a column.
    fp_array = np.asarray(fingerprints, dtype=object)
    X = np.delete(fp_array, not_found, axis=0)
    X = np.vstack(X).astype(np.float32)
    print('Typeof X', type(X))
    print(X)
    print('Input shape: {}'.format(X.shape))

    final_array = np.concatenate((X, Y), axis=1)

    # Remove rows whose feature columns (everything but the label) repeat.
    final_array_slice = final_array[:, 0:(final_array.shape[1] - 1)]
    _, unq_row_indices = np.unique(final_array_slice,
                                   return_index=True,
                                   axis=0)
    final_array_unique = final_array[unq_row_indices]
    print('Number of Duplicate FPs: {}'.format(final_array.shape[0] -
                                               final_array_unique.shape[0]))
    print('Final Numpy array shape: {}'.format(final_array_unique.shape))
    print('Type of final array: {}'.format(type(final_array_unique)))

    return np.asarray(final_array_unique, dtype=np.float32)
def compute_pca(self):
    """Project the molecules of ``self.Database2`` onto two principal components.

    A full Tanimoto similarity matrix is built from atom-pair bit-vector
    fingerprints and fed to a whitened two-component PCA.

    Side effects:
        ``self.pca_result`` is set to the result indexed by ``TIPO``;
        ``self.a`` and ``self.b`` hold the explained variance of the two
        components in percent, rounded to two decimals.

    Returns:
        The un-indexed result DataFrame with columns ``PC1``, ``PC2``,
        ``LIBRARY``, ``TIPO``, ``SMILES`` and ``NAME``.
    """
    Database = self.Database2
    molecules = [Chem.MolFromSmiles(smiles) for smiles in list(Database.SMILES)]
    fps = [Pairs.GetAtomPairFingerprintAsBitVect(mol) for mol in molecules]

    # GetTanimotoSimMat supplies the values for the strict lower triangle;
    # the diagonal stays at 1 and the upper triangle is mirrored.
    lower_triangle_values = GetTanimotoSimMat(fps)
    n_mol = len(fps)
    similarity_matrix = np.ones([n_mol, n_mol])
    lower = np.tril_indices(n=n_mol, m=n_mol, k=-1)
    upper = np.triu_indices(n=n_mol, m=n_mol, k=1)
    similarity_matrix[lower] = lower_triangle_values
    similarity_matrix[upper] = similarity_matrix.T[upper]

    sklearn_pca = sklearn.decomposition.PCA(n_components=2,
                                            svd_solver="full",
                                            whiten=True)
    sklearn_pca.fit(similarity_matrix)

    pca_result = pd.DataFrame(sklearn_pca.transform(similarity_matrix),
                              columns=['PC1', 'PC2'])
    pca_result["LIBRARY"] = Database.LIBRARY
    pca_result["TIPO"] = Database.LIBRARY
    pca_result["SMILES"] = Database.SMILES
    pca_result["NAME"] = Database.NAME
    self.pca_result = pca_result.set_index('TIPO')

    explained = list(sklearn_pca.explained_variance_ratio_)
    self.a = round(explained[0] * 100, 2)
    self.b = round(explained[1] * 100, 2)
    return pca_result
def computeFP(self, typeFP):
    """Compute the requested fingerprint(s) for ``self.smiclean``.

    Args:
        typeFP: One of ``"Mol"``, ``"MACCS"``, ``"pairs"``, ``"Torsion"``,
            ``"Morgan"`` or ``"All"``. Any other value yields an empty
            result dictionary.

    Returns:
        ``1`` when no ``smiclean`` attribute has been set on the instance
        (a message is appended to ``self.log``), otherwise ``0``.

    Side effects:
        Sets ``self.mol`` to the parsed molecule and ``self.FP`` to a dict
        keyed by fingerprint name.
    """
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs, Torsions
    from rdkit.Chem import AllChem

    if "smiclean" not in self.__dict__:
        self.log = self.log + "No smiles prepared\n"
        return 1

    self.mol = Chem.MolFromSmiles(self.smiclean)

    everything = typeFP == "All"
    dFP = {}
    if typeFP == "Mol" or everything:
        dFP["Mol"] = FingerprintMols.FingerprintMol(self.mol)
    if typeFP == "MACCS" or everything:
        dFP["MACCS"] = MACCSkeys.GenMACCSKeys(self.mol)
    if typeFP == "pairs" or everything:
        dFP["pairs"] = Pairs.GetAtomPairFingerprint(self.mol)
    if typeFP == "Torsion" or everything:
        dFP["Torsion"] = Torsions.GetTopologicalTorsionFingerprint(self.mol)
    if typeFP == "Morgan" or everything:
        dFP["Morgan"] = AllChem.GetMorganFingerprint(self.mol, 2)

    self.FP = dFP
    return 0
def atom_pairs(self):
    """Build an atom-pair count matrix for the SMILES in ``self.data.SMILES``.

    The columns are the sorted union of all atom-pair bit ids seen in any
    molecule; each row holds one molecule's count for every column (0 when
    the bit is absent).

    Returns:
        A list with one integer NumPy array per molecule, all of the same
        length.
    """
    molecules = [Chem.MolFromSmiles(smiles) for smiles in self.data.SMILES]

    # Sparse {bit id: count} mapping for every molecule.
    counts_per_molecule = [
        Pairs.GetAtomPairFingerprint(Chem.RemoveHs(mol)).GetNonzeroElements()
        for mol in molecules
    ]

    # Column layout: every bit id that occurs anywhere, in ascending order.
    all_bits = sorted(set().union(*counts_per_molecule))

    # Look each count up by bit id. The previous positional assignment of
    # ``item.values()`` was only correct when the dict happened to iterate
    # in ascending key order.
    return [
        np.array([counts.get(bit, 0) for bit in all_bits], dtype=int)
        for counts in counts_per_molecule
    ]
def compute_tsne(self):
    """Embed the molecules of ``self.Database2`` in two dimensions with t-SNE.

    A Tanimoto similarity matrix is built from atom-pair bit-vector
    fingerprints, converted to distances (``1 - similarity``) and passed to
    t-SNE using ``self.perplexity``.

    Side effects:
        ``self.tsne_result`` is set to a DataFrame indexed by ``TIPO`` with
        columns ``PC1``, ``PC2``, ``LIBRARY``, ``SMILES`` and ``NAME``.
    """
    Database = self.Database2
    molecules = [
        Chem.MolFromSmiles(smiles) for smiles in list(Database["SMILES"])
    ]
    fps = [Pairs.GetAtomPairFingerprintAsBitVect(mol) for mol in molecules]

    # Strict lower triangle from GetTanimotoSimMat; diagonal stays at 1 and
    # the upper triangle is mirrored from the lower one.
    lower_triangle_values = GetTanimotoSimMat(fps)
    n_mol = len(fps)
    similarity_matrix = np.ones([n_mol, n_mol])
    lower = np.tril_indices(n=n_mol, m=n_mol, k=-1)
    upper = np.triu_indices(n=n_mol, m=n_mol, k=1)
    similarity_matrix[lower] = lower_triangle_values
    similarity_matrix[upper] = similarity_matrix.T[upper]
    distance_matrix = np.subtract(1, similarity_matrix)

    embedder = TSNE(n_components=2,
                    init='pca',
                    random_state=1992,
                    angle=0.3,
                    perplexity=self.perplexity)
    embedding = embedder.fit_transform(distance_matrix)

    tsne_result = pd.DataFrame(data=embedding, columns=["PC1", "PC2"])
    tsne_result["LIBRARY"] = list(Database.LIBRARY)
    tsne_result["TIPO"] = list(Database.LIBRARY)
    tsne_result["SMILES"] = list(Database.SMILES)
    tsne_result["NAME"] = list(Database.NAME)
    self.tsne_result = tsne_result.set_index('TIPO')
def _atomsFingerprintsClustering(rdkit_mols):
    """
    Returns the dice distance matrix based on the atom-pair fingerprints method

    Parameters
    ----------
    rdkit_mols: list
        The list of rdkit.Chem.rdchem.Mol objects

    Returns
    -------
    dicematrix: np.array
        The numpy array built from one ``DiceDistances`` result per molecule
    """
    from rdkit.Chem.AtomPairs import Pairs

    # One atom-pair fingerprint per molecule, with a progress bar.
    fingerprints = [Pairs.GetAtomPairFingerprint(mol) for mol in tqdm(rdkit_mols)]

    # Every fingerprint is compared against the full list; n_jobs=-1 is
    # handed to ParallelExecutor as-is.
    executor = ParallelExecutor(n_jobs=-1)
    run = executor(total=len(fingerprints), desc='AtomsFingerprints Distance')
    rows = run(
        delayed(DiceDistances)(fingerprint, fingerprints)
        for fingerprint in fingerprints)
    return np.array(rows)
def atom_pairs_similarity(active_molecules1, test_molecules):
    """Best Dice similarity of each test molecule to any active molecule.

    Args:
        active_molecules1: Molecules forming the reference set.
        test_molecules: Molecules to score.

    Returns:
        A list with one value per test molecule: the largest atom-pair Dice
        similarity to an active molecule, or ``0`` when none exceeds zero
        (including when the reference set is empty).
    """
    active_fps = [Pairs.GetAtomPairFingerprint(mol) for mol in active_molecules1]
    test_fps = [Pairs.GetAtomPairFingerprint(mol) for mol in test_molecules]

    best_scores = []
    for test_fp in test_fps:
        # Seeding with the integer 0 keeps the "nothing beats zero" result
        # identical to a running maximum that starts at 0.
        candidates = [0]
        candidates.extend(
            DataStructs.DiceSimilarity(test_fp, active_fp)
            for active_fp in active_fps)
        best_scores.append(max(candidates))
    return best_scores
def testPairsRegression(self):
    """Check atom-pair fingerprints against the pickled reference set.

    Each molecule in ``self.mols`` must reproduce the stored fingerprint at
    the same position and must differ from the one stored at the previous
    position (for the first molecule that is the last stored entry).
    """
    reference_path = os.path.join(self.testDataPath, 'mols1000.aps.pkl.gz')
    # The context manager closes the gzip handle even if unpickling fails.
    # The pickle is a fixture shipped with the tests, i.e. trusted input.
    with gzip.open(reference_path, 'rb') as inF:
        atomPairs = cPickle.load(inF, encoding='bytes')

    for i, m in enumerate(self.mols):
        ap = Pairs.GetAtomPairFingerprint(m)
        if ap != atomPairs[i]:  # pragma: nocover
            debugFingerprint(m, ap, atomPairs[i])
        self.assertEqual(ap, atomPairs[i])
        self.assertNotEqual(ap, atomPairs[i - 1])
def fingerprint_smile(smile, fp_type):
    """Fingerprint the Murcko scaffold of a SMILES string.

    Args:
        smile: SMILES passed to ``get_murcko_smile``.
        fp_type: ``"atom-pair"`` or ``"maccs"``; any other value selects a
            1024-bit Morgan fingerprint using the ``radius`` defined
            outside this function.

    Returns:
        The fingerprint of the scaffold molecule.
    """
    scaffold_mol = Chem.MolFromSmiles(get_murcko_smile(smile))

    if fp_type == "atom-pair":
        return Pairs.GetAtomPairFingerprintAsBitVect(scaffold_mol)
    if fp_type == "maccs":
        return MACCSkeys.GenMACCSKeys(scaffold_mol)
    # Fallback for every other fp_type value.
    return AllChem.GetMorganFingerprintAsBitVect(scaffold_mol, radius, nBits=1024)
def orng_sim_rdk_atompair_fps(smile_active, train_instance):
    """
    Calculate the fingerprint similarity using the RDK atom pair fingerprints.

    Input is a SMILES string and an Orange data instance; the SMILES of the
    instance is read from the attribute named by ``getSMILESAttr``.
    Returns the Dice similarity value, or None when the instance has no
    SMILES attribute or either SMILES cannot be converted to a molecule.
    """
    smiles_attribute = getSMILESAttr(train_instance)
    if not smiles_attribute:
        return None

    train_smiles = str(train_instance[smiles_attribute].value)
    active_mol = getMolFromSmiles(smile_active)
    train_mol = getMolFromSmiles(train_smiles)
    if not active_mol or not train_mol:
        return None

    active_fp = Pairs.GetAtomPairFingerprint(active_mol)
    train_fp = Pairs.GetAtomPairFingerprint(train_mol)
    return DataStructs.DiceSimilarity(active_fp, train_fp)
def fingerprint(mol, fp_type="DL"):
    """Return the fingerprint of ``mol`` selected by ``fp_type``.

    Args:
        mol: Molecule to fingerprint.
        fp_type: ``"DL"``, ``"circular"``, ``"MACCS"``, ``"torsions"`` or
            ``"pharm"``.

    Returns:
        The requested fingerprint, or ``None`` for an unrecognised
        ``fp_type``.
    """
    if fp_type == "DL":
        return FingerprintMols.FingerprintMol(mol)
    if fp_type == "circular":
        return AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=1024)
    if fp_type == "MACCS":
        return MACCSkeys.GenMACCSKeys(mol)
    if fp_type == "torsions":
        # NOTE(review): this returns an atom-pair bit vector although the
        # key says "torsions" -- confirm whether that is intended.
        return Pairs.GetAtomPairFingerprintAsBitVect(mol)
    if fp_type == "pharm":
        return Generate.Gen2DFingerprint(mol, Gobbi_Pharm2D.factory)
    return None
def get_similarity():
    """Compare every compound with the first one using four fingerprints.

    Works on the names ``mols``, ``compounds``, ``radius`` and ``bit_size``
    defined outside this function. The first entry of ``mols`` /
    ``compounds`` is taken as the reference and is REMOVED from both lists.

    Returns:
        One text line per remaining compound, concatenated into a single
        string: reference, compound (right-stripped), then the Dice
        similarities for atom pairs, Morgan bits, Morgan counts and
        feature-Morgan bits, separated by single spaces.
    """
    # Reference fingerprints are taken before the reference is removed.
    reference_mol = mols[0]
    ref_morgan2 = AllChem.GetMorganFingerprintAsBitVect(reference_mol, radius, bit_size)
    ref_cmorgan2 = AllChem.GetMorganFingerprint(reference_mol, radius)
    ref_fmorgan2 = AllChem.GetMorganFingerprintAsBitVect(
        reference_mol, radius, bit_size, useFeatures=True)
    ref_ap = Pairs.GetAtomPairFingerprint(reference_mol)

    reference = compounds[0]
    del compounds[0]
    del mols[0]  # remove the reference compound from the list

    # First pass: fingerprints for all test molecules. The bitInfo
    # dictionaries are filled by the calls but not used afterwards.
    per_molecule = []
    for mol in mols:
        morgan2 = AllChem.GetMorganFingerprintAsBitVect(
            mol, radius, bit_size, bitInfo={})
        cmorgan2 = AllChem.GetMorganFingerprint(mol, radius, bitInfo={})
        fmorgan2 = AllChem.GetMorganFingerprintAsBitVect(
            mol, radius, bit_size, useFeatures=True, bitInfo={})
        atom_pair = Pairs.GetAtomPairFingerprint(mol)
        per_molecule.append((atom_pair, morgan2, cmorgan2, fmorgan2))

    # Second pass: similarities against the reference, one line each.
    lines = []
    for i, (atom_pair, morgan2, cmorgan2, fmorgan2) in enumerate(per_molecule):
        ap_simil = DataStructs.DiceSimilarity(ref_ap, atom_pair)
        morgan2_simil = DataStructs.DiceSimilarity(ref_morgan2, morgan2)
        cmorgan2_simil = DataStructs.DiceSimilarity(ref_cmorgan2, cmorgan2)
        fmorgan2_simil = DataStructs.DiceSimilarity(ref_fmorgan2, fmorgan2)
        fields = (reference, compounds[i].rstrip(), ap_simil, morgan2_simil,
                  cmorgan2_simil, fmorgan2_simil)
        lines.append(' '.join(str(field) for field in fields) + '\n')
    return ''.join(lines)
def atom_fp(Library):
    """Pairwise atom-pair similarities for a fixed random 20 % sample.

    The global random generator is seeded with 43, then
    ``round(len(Library) * .2)`` SMILES are sampled from ``Library``.

    Args:
        Library: Sequence of SMILES strings.

    Returns:
        ``(sim, y)`` where ``sim`` is the ascending list of pairwise
        similarities of the sampled molecules and ``y`` is
        ``arange(1, len(sim) + 1) / len(sim)``.
    """
    random.seed(43)
    sample_size = round(len(Library) * .2)
    sampled_smiles = random.sample(Library, sample_size)

    molecules = [Chem.MolFromSmiles(smiles) for smiles in sampled_smiles]
    fingerprints = [
        Pairs.GetAtomPairFingerprintAsBitVect(mol) for mol in molecules
    ]

    similarities = sorted(
        DataStructs.FingerprintSimilarity(second, first)
        for first, second in it.combinations(fingerprints, 2))
    cumulative = np.arange(1, len(similarities) + 1) / len(similarities)
    return similarities, cumulative
def Atompair_fp(mol, rc_names):
    """Build a dendrogram dictionary from pairwise atom-pair Dice similarities.

    Args:
        mol: Sequence of molecules.
        rc_names: Names for the molecules, in the same order as ``mol``;
            used as row and column labels of the similarity table.

    Returns:
        The dict ``d3Dendro`` after ``add_node`` and ``label_tree`` have
        populated it from the Ward-linkage tree of the similarity table.
    """
    fp = [Pairs.GetAtomPairFingerprint(x) for x in mol]

    tc_df = pd.DataFrame(index=rc_names, columns=rc_names).fillna(0)
    for position, reference_fp in enumerate(fp):
        tc_df[rc_names[position]] = [
            DataStructs.DiceSimilarity(reference_fp, other_fp)
            for other_fp in fp
        ]

    # ``DataFrame.as_matrix`` no longer exists in current pandas;
    # ``.values`` yields the same array on old and new versions.
    clusters = linkage(tc_df.values, "ward")
    clust_tree = to_tree(clusters, rd=False)

    d3Dendro = dict(children=[], name=" ")
    add_node(clust_tree, d3Dendro)
    label_tree(d3Dendro["children"][0], rc_names)
    return d3Dendro
def getCountInfo(m, fpType):
    """Return the non-zero fingerprint elements of ``m`` as a dictionary.

    Args:
        m: Molecule to fingerprint.
        fpType: ``'AtomPair'`` / ``'atom'``, ``'morgan'`` / ``'circular'``
            or ``'Topological'`` / ``'topo'``. Apart from the exact
            spellings ``'AtomPair'`` and ``'Topological'`` the comparison
            ignores case.

    Returns:
        The ``{bit: count}`` dictionary (keys converted to ``int`` for the
        topological-torsion case), or ``None`` for an unrecognised type.
    """
    lowered = fpType.lower()

    if fpType == 'AtomPair' or lowered == 'atom':
        return Pairs.GetAtomPairFingerprint(m).GetNonzeroElements()

    if lowered in ('morgan', 'circular'):
        return AllChem.GetMorganFingerprint(m, 2).GetNonzeroElements()

    if fpType == 'Topological' or lowered == 'topo':
        elements = Torsions.GetTopologicalTorsionFingerprint(
            m).GetNonzeroElements()
        return {int(bit): count for bit, count in elements.items()}

    return None
def calculate_atom_pair_fp(molecular_df, col):
    """
    Calculates atom pair fingerprints and stores them in ``atom_pair_fp``.

    :param molecular_df: pandas data frame containing molecules
    :param col: name of the column holding the SMILES strings
    :return: the same data frame, with the new ``atom_pair_fp`` column
        holding a bit-vector fingerprint per row, or the string ``'N/A'``
        where the fingerprint could not be computed
    """
    fps = []
    for smiles in molecular_df[col]:
        try:
            fps.append(
                Pairs.GetAtomPairFingerprintAsBitVect(
                    Chem.MolFromSmiles(smiles)))
        except Exception:
            # Keep one entry per row so the column stays aligned;
            # ``Exception`` lets KeyboardInterrupt/SystemExit propagate.
            fps.append('N/A')
    molecular_df['atom_pair_fp'] = fps
    return molecular_df
def Fingerprints(mols, fingerprint):
    """Compute the named fingerprint for every molecule in ``mols``.

    The name is looked up in the groups ``indigofps``, ``rdkitfps``,
    ``rdkitnonbitfps`` and ``rdkitestatefps`` (defined outside this
    function), in that order.

    Args:
        mols: Iterable of molecules of the kind the chosen toolkit expects.
        fingerprint: Name of the fingerprint.

    Returns:
        A list with one fingerprint per molecule, or ``None`` when the name
        is not handled by any group.
    """

    def for_each(build):
        # Apply one fingerprint builder to every molecule, keeping order.
        return [build(mol) for mol in mols]

    # Indigo fingerprints
    if fingerprint in indigofps:
        return for_each(lambda mol: mol.fingerprint(fingerprint))

    # RDKit fingerprints
    if fingerprint in rdkitfps:
        if fingerprint == "atompair":
            return for_each(lambda mol: Pairs.GetAtomPairFingerprintAsBitVect(mol))
        if fingerprint == "avalon":
            return for_each(lambda mol: pyAvalonTools.GetAvalonFP(mol))
        if fingerprint == "daylight":
            return for_each(lambda mol: Chem.RDKFingerprint(mol, fpSize=2048))
        if fingerprint == "maccs":
            return for_each(lambda mol: MACCSkeys.GenMACCSKeys(mol))
        if fingerprint == "morgan":
            return for_each(
                lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024))
        if fingerprint == "pharm2d":
            return for_each(
                lambda mol: Generate.Gen2DFingerprint(mol, Gobbi_Pharm2D.factory))
        if fingerprint == "topological":
            return for_each(lambda mol: FingerprintMols.FingerprintMol(mol))

    # RDKit non-bit (integer or float) fingerprints
    if fingerprint in rdkitnonbitfps:
        if fingerprint == "sheridan":
            return for_each(lambda mol: Sheridan.GetBPFingerprint(mol))
        if fingerprint == "topotorsion":
            return for_each(
                lambda mol: Torsions.GetTopologicalTorsionFingerprint(mol))

    # E-state fingerprints
    if fingerprint in rdkitestatefps:
        if fingerprint == "estate1":
            return for_each(lambda mol: Fingerprinter.FingerprintMol(mol)[0])
        if fingerprint == "estate2":
            return for_each(lambda mol: Fingerprinter.FingerprintMol(mol)[1])

    # Unknown fingerprint
    return None
def computeFP(self, typeFP):
    """Compute the requested fingerprint(s) for ``self.mol``.

    Args:
        typeFP: One of ``"Mol"``, ``"MACCS"``, ``"pairs"``, ``"Torsion"``,
            ``"Morgan"`` or ``"All"``. Any other value yields an empty
            result dictionary.

    Side effects:
        When the instance has no ``mol`` attribute, a message is appended to
        ``self.log`` and ``self.err`` is set to 1. Otherwise ``self.d_FP``
        is set to a dict keyed by fingerprint name.
    """
    if "mol" not in self.__dict__:
        self.log = self.log + "No smiles prepared\n"
        self.err = 1
        return

    everything = typeFP == "All"
    d_FP = {}
    if typeFP == "Mol" or everything:
        d_FP["Mol"] = FingerprintMols.FingerprintMol(self.mol)
    if typeFP == "MACCS" or everything:
        d_FP["MACCS"] = MACCSkeys.GenMACCSKeys(self.mol)
    if typeFP == "pairs" or everything:
        d_FP["pairs"] = Pairs.GetAtomPairFingerprint(self.mol)
    if typeFP == "Torsion" or everything:
        d_FP["Torsion"] = Torsions.GetTopologicalTorsionFingerprint(self.mol)
    if typeFP == "Morgan" or everything:
        d_FP["Morgan"] = AllChem.GetMorganFingerprint(self.mol, 2)
    self.d_FP = d_FP
def compare_structure(smiles1, smiles2, fp_type="Morgan", sim_type="Dice"):
    """
    Compare the structural similarity of two compounds based on fingerprints.

    Parameters:
        smiles1: str, SMILES of compound 1
        smiles2: str, SMILES of compound 2
        fp_type: str, fingerprint type; one of "Morgan",
            "MorganWithFeature", "MACCS", "Topological", "AtomPairs"
        sim_type: str, similarity measure; one of "Dice", "Tanimoto",
            "Cosine", "Sokal", "Russel"

    Returns:
        The similarity value, or -1 when the fingerprint type or the
        similarity measure is unknown, or when fingerprinting or the
        comparison fails.
    """
    fingerprinters = {
        "Morgan": lambda mol: AllChem.GetMorganFingerprint(
            mol, 2, useFeatures=False),
        "MorganWithFeature": lambda mol: AllChem.GetMorganFingerprint(
            mol, 2, useFeatures=True),
        "MACCS": lambda mol: Chem.MACCSkeys.GenMACCSKeys(mol),
        "Topological": lambda mol: FingerprintMols.FingerprintMol(mol),
        "AtomPairs": lambda mol: Pairs.GetAtomPairFingerprint(mol),
    }
    # Names of the DataStructs functions implementing each measure.
    similarity_functions = {
        "Dice": "DiceSimilarity",
        "Tanimoto": "TanimotoSimilarity",
        "Cosine": "CosineSimilarity",
        "Sokal": "SokalSimilarity",
        "Russel": "RusselSimilarity",
    }

    try:
        make_fp = fingerprinters.get(fp_type)
        similarity_name = similarity_functions.get(sim_type)
        # An unknown option yields the -1 sentinel explicitly. Previously an
        # unknown sim_type raised UnboundLocalError at the return statement.
        if make_fp is None or similarity_name is None:
            return -1
        fp1 = make_fp(Chem.MolFromSmiles(smiles1))
        fp2 = make_fp(Chem.MolFromSmiles(smiles2))
        return getattr(DataStructs, similarity_name)(fp1, fp2)
    except Exception:
        return -1
def atom_pairs_fp(SMILES, Library):
    """Summarise the pairwise atom-pair similarities of a set of SMILES.

    Args:
        SMILES: Iterable of SMILES strings.
        Library: Label of the set; stored as ``str(Library)`` in the
            statistics table.

    Returns:
        A dict with ``"sim"`` (ascending list of pairwise similarities),
        ``"y"`` (``arange(1, n + 1) / n`` for ``n`` similarities) and
        ``"df"`` (one-row DataFrame with MIN, 1Q, MEDIAN, MEAN, 3Q, MAX,
        STD and Library).

    Raises:
        ValueError / statistics.StatisticsError: when there are too few
        similarities for ``min`` or ``stdev`` to be computed.
    """
    molecules = [Chem.MolFromSmiles(smiles) for smiles in SMILES]
    fingerprints = [
        Pairs.GetAtomPairFingerprintAsBitVect(mol) for mol in molecules
    ]
    sim = sorted(
        DataStructs.FingerprintSimilarity(second, first)
        for first, second in it.combinations(fingerprints, 2))

    # All rounded statistics use two decimals. 1Q and MEDIAN were
    # previously rounded to whole numbers, unlike their neighbours.
    stat = {
        "MIN": [round(min(sim), 2)],
        "1Q": [round(np.percentile(sim, 25), 2)],
        "MEDIAN": [round(st.median(sim), 2)],
        "MEAN": [round(st.mean(sim), 2)],
        "3Q": [round(np.percentile(sim, 75), 2)],
        "MAX": [max(sim)],
        "STD": [round(st.stdev(sim), 2)],
        "Library": [str(Library)],
    }
    df = pd.DataFrame.from_dict(stat)
    return {
        "sim": sim,
        "y": np.arange(1, len(sim) + 1) / len(sim),
        "df": df,
    }
def testPairsRegression(self):
    """Check atom-pair fingerprints against the pickled reference set.

    Each molecule in ``self.mols`` must reproduce the stored fingerprint at
    the same position and must differ from the one stored at the previous
    position (for the first molecule that is the last stored entry).
    """
    reference_path = os.path.join(self.testDataPath, 'mols1000.aps.pkl.gz')
    # The context manager closes the gzip handle even if unpickling fails.
    # The pickle is a fixture shipped with the tests, i.e. trusted input.
    with gzip.open(reference_path, 'rb') as inF:
        atomPairs = cPickle.load(inF, encoding='bytes')

    for i, m in enumerate(self.mols):
        ap = Pairs.GetAtomPairFingerprint(m)
        # assertEqual / assertNotEqual evaluate the same == and !=
        # comparisons as before but report both operands on failure.
        self.assertEqual(ap, atomPairs[i])
        self.assertNotEqual(ap, atomPairs[i - 1])
fps_fmorgan2.append(AllChem.GetMorganFingerprintAsBitVect(m, radius, bit_size, useFeatures=True, bitInfo=info)) info_fmorgan2.append(info) fps_ap.append(Pairs.GetAtomPairFingerprint(m)) ### ATOM PAIRS print "generate atom pairs similarity maps" # calculate weights mol_weights = [] for i,m in enumerate(mols): weights = [] orig_simil = DataStructs.DiceSimilarity(ref_ap, fps_ap[i]) matrix = rdmolops.GetDistanceMatrix(m) for at1 in range(m.GetNumAtoms()): new_fp = copy.deepcopy(fps_ap[i]) for at2 in range(m.GetNumAtoms()): bit = Pairs.pyScorePair(m.GetAtomWithIdx(at1), m.GetAtomWithIdx(at2), matrix[at1][at2]) new_fp[bit] -= 1 new_simil = DataStructs.DiceSimilarity(ref_ap, new_fp) weights.append(orig_simil - new_simil) mol_weights.append(weights) # normalization mol_weights = getNormalizedWeights(mol_weights) # draw similarity maps generateSimilarityMaps(mols, mol_weights, 'ap') ### MORGAN2 print "generate morgan2 similarity maps" # calculate weights mol_weights = [] for i,m in enumerate(mols): weights = []