def score_model(self, model_configuration: dict, fragments_file: str,
                descriptors_file: str, output_file: str):
    """Score molecules by their best Tanimoto match against the actives.

    Every SMILES listed under ``model_configuration["data"]["active"]`` is
    turned into a topological-torsion fingerprint. Each JSON line of
    ``fragments_file`` (keys ``"smiles"`` and ``"name"``) is fingerprinted
    the same way and scored with the highest Tanimoto similarity to any
    active fingerprint.

    Args:
        model_configuration: Model description; ``["data"]["active"]`` and
            ``["configuration"]["fragments"][0]["size"]`` are read.
        fragments_file: Path of the JSON-lines input file.
        descriptors_file: Not used by this method.
        output_file: Path that receives one ``{"name", "score"}`` JSON
            object per input line, newline-separated with no trailing
            newline.
    """
    inputoutput_utils.create_parent_directory(output_file)
    actives = model_configuration["data"]["active"]
    # The configured fragment size is forwarded as the second positional
    # argument of the fingerprint call.
    size = int(model_configuration["configuration"]["fragments"][0]["size"])

    reference_fps = [
        Torsions.GetTopologicalTorsionFingerprintAsIntVect(
            Chem.MolFromSmiles(smiles.strip('"')), size)
        for smiles in actives
    ]

    with open(output_file, "w", encoding="utf-8") as sink, \
            open(fragments_file, "r", encoding="utf-8") as source:
        for position, raw_record in enumerate(source):
            record = json.loads(raw_record)
            candidate = Chem.MolFromSmiles(record["smiles"].strip('"'))
            candidate_fp = Torsions.GetTopologicalTorsionFingerprintAsIntVect(
                candidate, size)
            best = max([
                DataStructs.TanimotoSimilarity(candidate_fp, reference)
                for reference in reference_fps
            ])
            result = {"name": record["name"], "score": best}
            # Records are separated by, not terminated with, a newline.
            if position:
                sink.write("\n")
            json.dump(result, sink)
def testTorsionsRegression(self):
    """Compare freshly computed torsion fingerprints with pickled references.

    Loads the reference fingerprints from ``mols1000.tts.pkl.gz`` under
    ``self.testDataPath`` and checks, for every molecule in ``self.mols``,
    that the computed fingerprint equals the reference at the same index
    and differs from the reference at the previous index (index ``-1``,
    i.e. the last entry, for the first molecule).

    On a mismatch the element-wise differences are printed before the
    assertion fails:
      ``>>>1`` / ``>>>3``  key present on both sides with different counts
      ``>>>2``             key only in the computed fingerprint
      ``>>>4``             key only in the reference fingerprint
    """
    reference_path = os.path.join(self.testDataPath, 'mols1000.tts.pkl.gz')
    # Close the gzip handle deterministically instead of leaking it.
    with gzip.open(reference_path, 'rb') as inF:
        # NOTE: unpickling is only acceptable because this is a trusted
        # test fixture shipped with the test data.
        torsions = cPickle.load(inF, encoding='bytes')

    for i, m in enumerate(self.mols):
        tt = Torsions.GetTopologicalTorsionFingerprintAsIntVect(m)
        if tt != torsions[i]:
            print(Chem.MolToSmiles(m))
            computed = tt.GetNonzeroElements()
            expected = torsions[i].GetNonzeroElements()
            # dict.iteritems()/has_key() do not exist on Python 3 (which
            # the encoding= keyword above already requires), so use
            # items() and the ``in`` operator.
            for k, v in computed.items():
                if k in expected:
                    if expected[k] != v:
                        print('>>>1', k, v, expected[k])
                else:
                    print('>>>2', k, v)
            for k, v in expected.items():
                if k in computed:
                    if computed[k] != v:
                        print('>>>3', k, v, computed[k])
                else:
                    print('>>>4', k, v)
        self.assertTrue(tt == torsions[i])
        self.assertTrue(tt != torsions[i - 1])
def testGetTopologicalTorsionFingerprintAsIds(self):
    """The three torsion fingerprint entry points agree for ``C1CCCCN1``.

    The sparse fingerprint and the int-vector fingerprint must expose the
    same non-zero elements, and the id list must contain each id once per
    count.
    """
    expected_counts = {4437590049: 2, 8732557345: 2, 4445978657: 2}
    expected_ids = [
        4437590049, 4437590049,
        4445978657, 4445978657,
        8732557345, 8732557345,
    ]
    piperidine = Chem.MolFromSmiles('C1CCCCN1')

    sparse_fp = Torsions.GetTopologicalTorsionFingerprint(piperidine)
    self.assertEqual(sparse_fp.GetNonzeroElements(), expected_counts)

    id_list = Torsions.GetTopologicalTorsionFingerprintAsIds(piperidine)
    self.assertEqual(sorted(id_list), expected_ids)

    int_vect_fp = Torsions.GetTopologicalTorsionFingerprintAsIntVect(piperidine)
    self.assertEqual(int_vect_fp.GetNonzeroElements(), expected_counts)
def torsions_fp(self):
    """Build a ``[torsion total, encoded label]`` array from ``self.csv_path``.

    The CSV must provide ``Smiles`` and ``Label`` columns. For every SMILES
    the topological-torsion fingerprint is reduced to its total count
    (``GetTotalVal()``), because the fingerprint itself is too large to
    expand. Rows whose fingerprint cannot be computed are dropped, labels
    are encoded with ``LabelEncoder``, and rows with a duplicate feature
    value are removed.

    Returns:
        A 2-D ``float32`` NumPy array whose last column is the encoded
        label and whose remaining column is the torsion total. Rows are
        the ones selected by ``np.unique`` on the feature column.
    """
    df = pd.read_csv(self.csv_path)
    smiles_list = df['Smiles'].tolist()

    fingerprints = []
    not_found = []
    for i, smiles in enumerate(tqdm(smiles_list)):
        try:
            mol = Chem.MolFromSmiles(smiles)
            fp = Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol)
            # The sparse vector would be huge, so keep only its total.
            fingerprints.append(fp.GetTotalVal())
        except Exception:
            # Deliberate best effort: an unusable SMILES is recorded and
            # skipped. ``Exception`` (not a bare except) so that
            # KeyboardInterrupt/SystemExit still propagate.
            fingerprints.append(np.nan)
            not_found.append(i)

    # Positions double as index labels for the frame read above.
    df.drop(not_found, axis=0, inplace=True)
    print('Number of FPs not found: {}'.format(len(not_found)))
    df.reset_index(drop=True, inplace=True)

    labelencoder = LabelEncoder()
    Y = labelencoder.fit_transform(df['Label'].values)
    Y = Y.reshape(Y.shape[0], 1)
    print('Output shape: {}'.format(Y.shape))

    fp_array = np.asarray(fingerprints, dtype=object)
    # Remove the NaN placeholders so X lines up with the filtered labels.
    X = np.delete(fp_array, not_found, axis=0)
    X = np.vstack(X).astype(np.float32)
    print('Input shape: {}'.format(X.shape))

    final_array = np.concatenate((X, Y), axis=1)

    # Removing rows, from final_array, where duplicate FPs are present
    feature_columns = final_array[:, :-1]
    _, unq_row_indices = np.unique(feature_columns, return_index=True, axis=0)
    final_array_unique = final_array[unq_row_indices]

    print('Number of Duplicate FPs: {}'.format(
        final_array.shape[0] - final_array_unique.shape[0]))
    print('Final Numpy array shape: {}'.format(final_array_unique.shape))
    print('Type of final array: {}'.format(type(final_array_unique)))
    return np.asarray(final_array_unique, dtype=np.float32)
def testTorsionsRegression(self):
    """Regression-test torsion fingerprints against the pickled references.

    For every molecule in ``self.mols`` the computed fingerprint must equal
    the reference stored at the same index in ``mols1000.tts.pkl.gz`` and
    must differ from the reference at the previous index (the last entry
    for the first molecule). A mismatch is passed to ``debugFingerprint``
    before the assertion fails.
    """
    reference_path = os.path.join(self.testDataPath, 'mols1000.tts.pkl.gz')
    # Use a context manager so the gzip handle is closed even when
    # unpickling fails; the original left it open.
    with gzip.open(reference_path, 'rb') as inF:
        # NOTE: unpickling is only acceptable because this is a trusted
        # test fixture.
        torsions = cPickle.load(inF, encoding='bytes')

    for i, m in enumerate(self.mols):
        tt = Torsions.GetTopologicalTorsionFingerprintAsIntVect(m)
        if tt != torsions[i]:  # pragma: nocover
            debugFingerprint(m, tt, torsions[i])
        self.assertEqual(tt, torsions[i])
        self.assertNotEqual(tt, torsions[i - 1])
def TORSIONSfpDataFrame(chempandas, namecol, smicol):
    """Build a DataFrame of topological-torsion fingerprints.

    Args:
        chempandas: Input frame; must not have more than ``MAXLINES`` rows.
        namecol: Positional index of the column holding molecule names.
        smicol: Positional index of the column holding SMILES strings.

    Returns:
        A DataFrame indexed by molecule name with the fingerprint data plus
        ``SMILES`` (RDKit output SMILES) and ``CHEMBL`` (copy of the index)
        columns. Rows whose SMILES cannot be parsed are omitted.

    NOTE(review): the fingerprint call takes no size argument, so the
    earlier "2048 bits" description is not backed by this code. Confirm
    what ``np.matrix`` produces for these fingerprint objects.
    """
    assert chempandas.shape[0] <= MAXLINES

    molsmi = []
    # enumerate() ties each name lookup to the row its SMILES came from,
    # so names cannot drift after an unparsable SMILES is skipped.
    for row, smiles in enumerate(chempandas.iloc[:, smicol]):
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            mol.SetProp("_Name", chempandas.iloc[row, namecol])
            molsmi.append(mol)

    # TORSIONS Fingerprints.
    fps = [
        Torsions.GetTopologicalTorsionFingerprintAsIntVect(x) for x in molsmi
    ]
    fpsmat = np.matrix(fps)

    names = [x.GetProp("_Name") for x in molsmi]
    df = DataFrame(fpsmat, index=names)
    df['SMILES'] = [Chem.MolToSmiles(x) for x in molsmi]
    df['CHEMBL'] = df.index
    return df
# Fingerprint registry entries: each value is a one-argument callable that
# takes a molecule and returns the named fingerprint. The lambdas read
# ``nbits`` / ``nbits_long`` at call time, not at registration time.

# Feature-based Morgan fingerprint, radius 3, as a bit vector of nbits bits.
FPDICT['fcfp6'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 3, useFeatures=True, nBits=nbits)
# Feature-based Morgan fingerprints (radius 1, 2, 3) without an nBits size.
FPDICT['fcfc2'] = lambda m: Chem.GetMorganFingerprint(m, 1, useFeatures=True)
FPDICT['fcfc4'] = lambda m: Chem.GetMorganFingerprint(m, 2, useFeatures=True)
FPDICT['fcfc6'] = lambda m: Chem.GetMorganFingerprint(m, 3, useFeatures=True)
# "l" prefix: same Morgan bit vectors but sized with nbits_long.
FPDICT['lecfp4'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 2, nBits=nbits_long)
FPDICT['lecfp6'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 3, nBits=nbits_long)
FPDICT['lfcfp4'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 2, useFeatures=True, nBits=nbits_long)
FPDICT['lfcfp6'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 3, useFeatures=True, nBits=nbits_long)
# MACCS keys, atom pairs and topological torsions with default arguments.
FPDICT['maccs'] = lambda m: MACCSkeys.GenMACCSKeys(m)
FPDICT['ap'] = lambda m: Pairs.GetAtomPairFingerprint(m)
FPDICT['tt'] = lambda m: Torsions.GetTopologicalTorsionFingerprintAsIntVect(m)
# Hashed atom-pair / torsion variants as bit vectors of nbits bits.
FPDICT['hashap'] = lambda m: Desc.GetHashedAtomPairFingerprintAsBitVect(
    m, nBits=nbits)
FPDICT[
    'hashtt'] = lambda m: Desc.GetHashedTopologicalTorsionFingerprintAsBitVect(
        m, nBits=nbits)
# RDKit fingerprints with maxPath 5, 6 and 7; nbits bits, 2 bits per hash.
FPDICT['rdk5'] = lambda m: Chem.RDKFingerprint(
    m, maxPath=5, fpSize=nbits, nBitsPerHash=2)
FPDICT['rdk6'] = lambda m: Chem.RDKFingerprint(
    m, maxPath=6, fpSize=nbits, nBitsPerHash=2)
FPDICT['rdk7'] = lambda m: Chem.RDKFingerprint(
    m, maxPath=7, fpSize=nbits, nBitsPerHash=2)
# Avalon fingerprints are registered only when USE_AVALON is truthy.
if USE_AVALON:
    FPDICT['avalon'] = lambda m: pyAv.GetAvalonFP(m, nbits)
    FPDICT['avalon_l'] = lambda m: pyAv.GetAvalonFP(m, nbits_long)
def ClusterOnFingerprint(filename, mols=None, fingerprint=0, cutoff=0.8,
                         metric='Tanimoto', outMatrix=False):
    '''Clustering Structure based on Fingerprints in RDKit

    filename: Smile format file saving molecules. If set to None, use given "mols"
    mols: Input molecules. No use if set up "filename"
    cutoff: Cutoff using for Butina Clustering (passed on as 1-cutoff)
    fingerprint: Fingerprint to use:
        0 or else: RDKit Topological Fingerprint
        1: MACCS Fingerprint
        2: Atom Pair Fingerprint (AP)
        3: Topological Torsion Fingerprint (TT)
        4: Morgan Fingerprint similar to ECFP4 Fingerprint
        5: Morgan Fingerprint similar to FCFP4 Fingerprint
    metric: Similarity metric for the matrix output (case-insensitive):
        Tanimoto, Dice, Cosine, Sokal, Russel, RogotGoldberg, AllBit,
        Kulczynski, McConnaughey, Asymmetric, BraunBlanquet.
        An unknown name prints a warning and falls back to Tanimoto.
        Only used when outMatrix is true; clustering always passes
        metric='Tanimoto' to ClusterFps.
    outMatrix: Change output to a similarity matrix

    Return:
        outMatrix false (default): "clusters, clusterOut"
            clusters: Clusters containing molecule number.
            clusterOut: Molecular Cluster Number in List (1-based).
        outMatrix true: the similarity matrix as a list of row lists; it is
            also printed, one row per line.

    Raises:
        ValueError: if neither "filename" nor "mols" is given.
    '''
    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs, Torsions

    if filename:
        suppl = Chem.SmilesMolSupplier(filename)
        mols = [mol for mol in suppl]
    if mols is None:
        raise ValueError('Either "filename" or "mols" must be provided')
    molnums = len(mols)

    ### Calculate Molecular Fingerprint
    if fingerprint == 1:
        ## MACCS Fingerprint
        fps = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]
    elif fingerprint == 2:
        ## Atom Pair Fingerprint (AP)
        fps = [Pairs.GetAtomPairFingerprint(mol) for mol in mols]
    elif fingerprint == 3:
        ## Topological Torsion Fingerprint (TT)
        fps = [Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol)
               for mol in mols]
    elif fingerprint == 4:
        ## Morgan Fingerprint similar to ECFP4 Fingerprint
        fps = [AllChem.GetMorganFingerprint(mol, 2) for mol in mols]
    elif fingerprint == 5:
        ## Morgan Fingerprint similar to FCFP4 Fingerprint
        fps = [AllChem.GetMorganFingerprint(mol, 2, useFeatures=True)
               for mol in mols]
    else:
        ## RDKit Topological Fingerprint (fingerprint == 0 or anything else)
        fps = [FingerprintMols.FingerprintMol(mol) for mol in mols]

    if outMatrix:
        ### Output the Fingerprint similarity Matrix
        # Keys must be lower-case because the lookup uses metric.lower();
        # the former "rogotGoldberg" key could never be selected.
        metricsAvailable = {
            'tanimoto': DataStructs.TanimotoSimilarity,
            'dice': DataStructs.DiceSimilarity,
            'cosine': DataStructs.CosineSimilarity,
            'sokal': DataStructs.SokalSimilarity,
            'russel': DataStructs.RusselSimilarity,
            'rogotgoldberg': DataStructs.RogotGoldbergSimilarity,
            'allbit': DataStructs.AllBitSimilarity,
            'kulczynski': DataStructs.KulczynskiSimilarity,
            'mcconnaughey': DataStructs.McConnaugheySimilarity,
            'asymmetric': DataStructs.AsymmetricSimilarity,
            'braunblanquet': DataStructs.BraunBlanquetSimilarity,
        }
        metric_key = metric.lower()
        if metric_key not in metricsAvailable:
            print("The given metric is unknown!")
            metric_key = 'tanimoto'
        simMetrics = metricsAvailable[metric_key]

        ### Calculate Fingerprint similarity Matrix
        # One independent list per row: "[[0.0]*n]*n" would alias every row
        # to the same list, and rows are indexed [i][j] because a list
        # cannot be indexed with an (i, j) tuple.
        # NOTE(review): FingerprintSimilarity is used for every fingerprint
        # type here; confirm it accepts the count-based ones (2-5).
        simdm = [[0.0] * molnums for _ in range(molnums)]
        for i in range(molnums):
            simdm[i][i] = 1.0
            for j in range(i + 1, molnums):
                # Both directions are evaluated separately, as before.
                simdm[i][j] = DataStructs.FingerprintSimilarity(
                    fps[i], fps[j], metric=simMetrics)
                simdm[j][i] = DataStructs.FingerprintSimilarity(
                    fps[j], fps[i], metric=simMetrics)

        # Single-argument print() behaves the same on Python 2 and 3.
        for row in simdm:
            print(' '.join('%3.2f' % value for value in row))
        return simdm

    clusters = ClusterFps(fps, cutoff=1 - cutoff, metric='Tanimoto')
    clusterOut = [0] * len(mols)
    # Cluster numbers start at 1; 0 marks "not assigned".
    for clusterID, cluster in enumerate(clusters, 1):
        for idx in cluster:
            clusterOut[idx] = clusterID
    return clusters, clusterOut
def sim_two_serial():
    """Prompt for two SMILES CSV files and score every cross pair.

    Both CSV files must have a ``smiles`` column. For each pair (molecule i
    of list 1, molecule j of list 2) eight fingerprint similarities are
    combined into one weighted consensus value, which is then divided by
    ``maxs`` (the value the weighted sum takes when every individual
    similarity is 1).

    Returns:
        A ``pandas.DataFrame`` of shape ``(len(list1), len(list2))`` with
        ``float32`` values. Rows/columns belonging to SMILES that could not
        be parsed are filled with the sentinel value 10.
    """
    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs
    from rdkit.Chem.AtomPairs import Torsions
    from rdkit.Chem import AllChem

    def _parse(smiles_column, label):
        """Return (valid indices, bad indices, molecules) for one list."""
        valid, bad, molecules = [], [], []
        for i, smi in enumerate(smiles_column):
            m = Chem.MolFromSmiles(smi)
            if m is None:
                print('smile with number:', i,
                      'in list %s could not be converted to molecule' % label)
                bad.append(i)
                continue
            valid.append(i)
            molecules.append(m)
        return valid, bad, molecules

    def _fingerprint_sets(molecules):
        """Return the eight fingerprint lists used by the consensus score."""
        return {
            'topol': [FingerprintMols.FingerprintMol(x) for x in molecules],
            'maccs': [MACCSkeys.GenMACCSKeys(x) for x in molecules],
            'pairs': [Pairs.GetAtomPairFingerprint(x) for x in molecules],
            'tts': [Torsions.GetTopologicalTorsionFingerprintAsIntVect(x)
                    for x in molecules],
            'ecfp4': [AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=1024)
                      for x in molecules],
            'ecfp6': [AllChem.GetMorganFingerprintAsBitVect(x, 3, nBits=1024)
                      for x in molecules],
            'fcfp4': [AllChem.GetMorganFingerprintAsBitVect(
                x, 2, nBits=1024, useFeatures=True) for x in molecules],
            'fcfp6': [AllChem.GetMorganFingerprintAsBitVect(
                x, 3, nBits=1024, useFeatures=True) for x in molecules],
        }

    # Load Data
    path1 = input("Path for list 1: ")
    path2 = input("Path for list 2: ")
    smis1 = pd.read_csv(path1)["smiles"]
    smis2 = pd.read_csv(path2)["smiles"]
    l1 = len(smis1)
    l2 = len(smis2)
    l = l1 * l2
    # Progress step; at least 1 so that "k % lp" cannot divide by zero when
    # fewer than ~10 pairs exist (round(l / 20) is 0 there).
    lp = max(1, round(l / 20))

    # Get molecules from smiles
    valid1, bad1, molecules1 = _parse(smis1, 1)
    valid2, bad2, molecules2 = _parse(smis2, 2)

    # Final output matrix
    similarity = np.zeros(shape=(l1, l2), dtype=np.float32)

    print('Begining fingerprint calculation...wait')
    fps1 = _fingerprint_sets(molecules1)
    print('Begining fingerprint calculation...50%')
    fps2 = _fingerprint_sets(molecules2)
    print('Begining fingerprint calculation...done\n')

    print('Begining of fingerprints similarity calculation\n')
    k = 0
    maxs = 2 / (0.65 * 10) + 2 / (0.6 * 10) + 2 / (0.7 * 10) + 1 / (
        0.75 * 5) + 1 / (0.85 * 5)

    # valid1[a] / valid2[b] map fingerprint positions back to the original
    # row numbers, so no placeholder entries or "i in bad" scans are needed.
    for a, i in enumerate(valid1):
        for b, j in enumerate(valid2):
            similarities_topol = DataStructs.FingerprintSimilarity(
                fps1['topol'][a], fps2['topol'][b])
            similarities_maccs = DataStructs.FingerprintSimilarity(
                fps1['maccs'][a], fps2['maccs'][b])
            similarities_pairs = DataStructs.DiceSimilarity(
                fps1['pairs'][a], fps2['pairs'][b])
            similarities_tts = DataStructs.DiceSimilarity(
                fps1['tts'][a], fps2['tts'][b])
            similarities_ecfp4 = DataStructs.FingerprintSimilarity(
                fps1['ecfp4'][a], fps2['ecfp4'][b])
            similarities_ecfp6 = DataStructs.FingerprintSimilarity(
                fps1['ecfp6'][a], fps2['ecfp6'][b])
            similarities_fcfp4 = DataStructs.FingerprintSimilarity(
                fps1['fcfp4'][a], fps2['fcfp4'][b])
            similarities_fcfp6 = DataStructs.FingerprintSimilarity(
                fps1['fcfp6'][a], fps2['fcfp6'][b])
            similarity[i, j] = (
                0.5 * (similarities_ecfp4 / 0.65 + similarities_ecfp6 / 0.6) +
                0.5 * (similarities_fcfp4 / 0.65 + similarities_fcfp6 / 0.6) +
                0.5 * (similarities_tts / 0.7 + similarities_pairs / 0.7) +
                similarities_maccs / 0.85 + similarities_topol / 0.75) / 5
            k = k + 1
            if k % lp == 0:
                print('running:', (k / l) * 100, '%')

    similarity = similarity / maxs
    # Sentinel 10 marks rows/columns of unparsable SMILES.
    similarity[bad1, :] = 10
    similarity[:, bad2] = 10
    print('End of fingerprints similarity calculation')
    return pd.DataFrame(similarity)
def sim_one_serial():
    """Prompt for one SMILES CSV file and score every pair within it.

    The CSV file must have a ``smiles`` column. For each pair of parsable
    molecules eight fingerprint similarities are combined into one weighted
    consensus value, mirrored across the diagonal, and divided by ``maxs``
    (the value the weighted sum takes when every individual similarity
    is 1).

    Returns:
        A square ``pandas.DataFrame`` of ``float32`` values. Rows/columns
        belonging to SMILES that could not be parsed are filled with the
        sentinel value 10.
    """
    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs
    from rdkit.Chem.AtomPairs import Torsions
    from rdkit.Chem import AllChem

    # Load Data
    path = input("Path for list : ")
    smis = pd.read_csv(path)["smiles"]
    l = len(smis)
    # Progress step; at least 1 so that "k % lp" cannot divide by zero for
    # very short lists (round(l * l / 20) is 0 when l <= 3).
    lp = max(1, round(l * l / 20))

    # Get molecules from smiles
    valid = []
    bad = []
    molecules = []
    for i, smi in enumerate(smis):
        m = Chem.MolFromSmiles(smi)
        if m is None:
            print('smile with number:', i,
                  'in list could not be converted to molecule')
            bad.append(i)
            continue
        valid.append(i)
        molecules.append(m)

    # Final output matrix
    similarity = np.zeros(shape=(l, l), dtype=np.float32)

    print('Begining fingerprint calculation...wait')
    fps_topol = [FingerprintMols.FingerprintMol(x) for x in molecules]
    fps_maccs = [MACCSkeys.GenMACCSKeys(x) for x in molecules]
    fps_pairs = [Pairs.GetAtomPairFingerprint(x) for x in molecules]
    fps_tts = [
        Torsions.GetTopologicalTorsionFingerprintAsIntVect(x)
        for x in molecules
    ]
    fps_ecfp4 = [
        AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=1024)
        for x in molecules
    ]
    fps_ecfp6 = [
        AllChem.GetMorganFingerprintAsBitVect(x, 3, nBits=1024)
        for x in molecules
    ]
    fps_fcfp4 = [
        AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=1024,
                                              useFeatures=True)
        for x in molecules
    ]
    fps_fcfp6 = [
        AllChem.GetMorganFingerprintAsBitVect(x, 3, nBits=1024,
                                              useFeatures=True)
        for x in molecules
    ]
    print('Begining fingerprint calculation...done\n')

    print('Begining of fingerprints similarity calculation\n')
    k = 0
    maxs = 2 / (0.65 * 10) + 2 / (0.6 * 10) + 2 / (0.7 * 10) + 1 / (
        0.75 * 5) + 1 / (0.85 * 5)

    # Walk the lower triangle (including the diagonal) of the parsable
    # molecules only; valid[a] / valid[b] are the original row numbers.
    for a, i in enumerate(valid):
        for b in range(a + 1):
            j = valid[b]
            similarities_topol = DataStructs.FingerprintSimilarity(
                fps_topol[a], fps_topol[b])
            similarities_maccs = DataStructs.FingerprintSimilarity(
                fps_maccs[a], fps_maccs[b])
            similarities_pairs = DataStructs.DiceSimilarity(
                fps_pairs[a], fps_pairs[b])
            similarities_tts = DataStructs.DiceSimilarity(
                fps_tts[a], fps_tts[b])
            similarities_ecfp4 = DataStructs.FingerprintSimilarity(
                fps_ecfp4[a], fps_ecfp4[b])
            similarities_ecfp6 = DataStructs.FingerprintSimilarity(
                fps_ecfp6[a], fps_ecfp6[b])
            similarities_fcfp4 = DataStructs.FingerprintSimilarity(
                fps_fcfp4[a], fps_fcfp4[b])
            similarities_fcfp6 = DataStructs.FingerprintSimilarity(
                fps_fcfp6[a], fps_fcfp6[b])
            similarity[i, j] = (
                0.5 * (similarities_ecfp4 / 0.65 + similarities_ecfp6 / 0.6) +
                0.5 * (similarities_fcfp4 / 0.65 + similarities_fcfp6 / 0.6) +
                0.5 * (similarities_tts / 0.7 + similarities_pairs / 0.7) +
                similarities_maccs / 0.85 + similarities_topol / 0.75) / 5
            # Mirror into the upper triangle.
            similarity[j, i] = similarity[i, j]
            k = k + 1
            if k % lp == 0:
                print('running:', (k / (l * l / 2)) * 100, '%')

    similarity = similarity / maxs
    # Sentinel 10 marks rows/columns of unparsable SMILES.
    similarity[bad, :] = 10
    similarity[:, bad] = 10
    print('End of fingerprints similarity calculation')
    return pd.DataFrame(similarity)
) fpdict["lecfp4"] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 2, nBits=longbits ) fpdict["lecfp6"] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 3, nBits=longbits ) fpdict["lfcfp4"] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 2, useFeatures=True, nBits=longbits ) fpdict["lfcfp6"] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 3, useFeatures=True, nBits=longbits ) fpdict["maccs"] = lambda m: MACCSkeys.GenMACCSKeys(m) fpdict["ap"] = lambda m: Pairs.GetAtomPairFingerprint(m) fpdict["tt"] = lambda m: Torsions.GetTopologicalTorsionFingerprintAsIntVect(m) fpdict[ "hashap" ] = lambda m: rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect( m, nBits=nbits ) fpdict[ "hashap_cas_length" ] = lambda m: rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect( m, nBits=n_cas_bits ) fpdict[ "hashtt" ] = lambda m: rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect( m, nBits=nbits )
similarities_pairs[i][j] = 1 if i % 500 == 0: print('running:', i / len(fps_pairs) * 100, '%') # In[ ]: df = pd.DataFrame(similarities_pairs) df.to_csv('similarities_pairs.csv') # ### Topological torsion descriptors # In[ ]: from rdkit.Chem.AtomPairs import Torsions fps_tts = [ Torsions.GetTopologicalTorsionFingerprintAsIntVect(x) for x in molecules ] similarities_tts = np.zeros(shape=((len(fps_tts), len(fps_tts)))) # In[ ]: #compute similarities. Comment this section if only the fingerprints are needed for i in range(len(fps_tts)): for j in range(len(fps_tts)): if i > j: similarities_tts[i][j] = DataStructs.DiceSimilarity( fps_tts[i], fps_tts[j]) #default is the Dice similarity for these fps similarities_tts[j][i] = similarities_tts[i][j] elif i == j: similarities_tts[i][j] = 1
def BuildTorsionsFP(mol):
    """Return the topological-torsion fingerprint of ``mol``.

    The fingerprint's total value is stored on the returned object as
    ``_sumCache`` before it is handed back.
    """
    from rdkit.Chem.AtomPairs import Torsions as torsion_module

    fingerprint = torsion_module.GetTopologicalTorsionFingerprintAsIntVect(mol)
    total = fingerprint.GetTotalVal()
    fingerprint._sumCache = total
    return fingerprint
'/drug_development/studyRdkit/st_rdcit/img/mol21.jpg' ) pairFps = [Pairs.GetAtomPairFingerprint(x) for x in ms] print(pairFps) # 由于包含在原子对指纹中的位空间很大,因此他们以稀疏的方式存储为字典形式 d = pairFps[-1].GetNonzeroElements() print(d) # {541732: 1, 558113: 2, 558115: 2, 558146: 1, 1606690: 2, 1606721: 2} print(d[541732]) # 1 # 位描述也可以像如下所示展示 de = Pairs.ExplainPairScore(558115) print(de) # (('C', 1, 0), 3, ('C', 2, 0)) # The above means: C with 1 neighbor and 0 pi electrons which is 3 bonds from a C with 2 neighbors and 0 pi electrons # 碳带有一个邻位孤电子和0个π电子,这是因为碳与两个邻位原子和氧原子形成3个化学键。 # # 2.4 拓扑扭曲topological torsions tts = [Torsions.GetTopologicalTorsionFingerprintAsIntVect(x) for x in ms] d_ds = DataStructs.DiceSimilarity(tts[0], tts[1]) print(d_ds) # 0.16666666666666666 # # 2.5 摩根指纹(圆圈指纹)AllChem.GetMorganFingerprint(mol,2) # 通过将Morgan算法应用于一组用户提供的原子不变式,可以构建这一系列的指纹。生成Morgan指纹时,还必须提供指纹的半径 m1 = Chem.MolFromSmiles('Cc1ccccc1') m2 = Chem.MolFromSmiles('Cc1ncccc1') fp1 = AllChem.GetMorganFingerprint(m1, 2) fp2 = AllChem.GetMorganFingerprint(m2, 2) d_mf = DataStructs.DiceSimilarity(fp1, fp2) print(d_mf) # 0.55 # Morgan指纹像原子对和拓扑扭转一样,默认情况系按使用计数,但有也可以将他们计算为位向量 fp1 = AllChem.GetMorganFingerprintAsBitVect(m1, 2, nBits=1024) fp2 = AllChem.GetMorganFingerprintAsBitVect(m2, 2, nBits=1024)
def Calc_Torsions(self):
    """Return one topological-torsion fingerprint per entry of ``self.sd``.

    The result is a list in the iteration order of ``self.sd``.
    """
    fingerprints = []
    for entry in self.sd:
        fingerprints.append(
            Torsions.GetTopologicalTorsionFingerprintAsIntVect(entry))
    return fingerprints
def FptTorsion(rdkmol):
    """Return the topological-torsion fingerprint (int vector) of ``rdkmol``."""
    fingerprint = Torsions.GetTopologicalTorsionFingerprintAsIntVect(rdkmol)
    return fingerprint