def __call__(self, smiles: List[str]) -> dict:
    """Score SMILES strings by their best Tanimoto match to ``self.ref_fps``.

    Each SMILES is parsed with ``Chem.MolFromSmiles``; entries that fail to
    parse keep a score of 0. Every parsed molecule gets a count-based Morgan
    fingerprint of radius 3, and its highest Tanimoto similarity ``t``
    against ``self.ref_fps`` is mapped to ``max(1 - 2 * |0.5 - t|, 0)``,
    so the score is largest at ``t == 0.5``.

    Args:
        smiles: SMILES strings to score.

    Returns:
        A dict with the single key ``"total_score"`` holding a float32 array
        with one entry per input SMILES, in input order.
    """
    parsed = [Chem.MolFromSmiles(entry) for entry in smiles]

    # Remember where the parsable molecules sit so scores land in input order.
    valid_positions = [pos for pos, mol in enumerate(parsed) if mol is not None]

    fingerprints = [
        AllChem.GetMorganFingerprint(
            parsed[pos], 3, useCounts=True, useFeatures=False
        )
        for pos in valid_positions
    ]

    best_similarity = np.array(
        [
            np.max(DataStructs.BulkTanimotoSimilarity(fingerprint, self.ref_fps))
            for fingerprint in fingerprints
        ]
    )
    # Triangular transform: 0 at t=0 and t=1, peak of 1 at t=0.5.
    transformed = np.maximum(1 - 2 * np.abs(0.5 - best_similarity), 0)

    scores = np.zeros(len(smiles), dtype=np.float32)
    scores[np.asarray(valid_positions, dtype=np.intp)] = transformed
    return {"total_score": scores.astype(np.float32)}
def ClusterFps(fps, similaridad):
    """Cluster fingerprints with the Butina algorithm from a similarity threshold.

    Args:
        fps: Sequence of fingerprints accepted by
            ``DataStructs.BulkTanimotoSimilarity``.
        similaridad: Similarity threshold; the distance cutoff handed to
            ``Butina.ClusterData`` is ``1 - similaridad``.

    Returns:
        The value returned by ``Butina.ClusterData``.
    """
    distance_cutoff = 1 - similaridad
    n_fps = len(fps)

    # Flattened lower triangle of the distance matrix (distance = 1 - similarity),
    # row by row: (1,0), (2,0), (2,1), ...
    condensed = [
        1 - similarity
        for row in range(1, n_fps)
        for similarity in DataStructs.BulkTanimotoSimilarity(fps[row], fps[:row])
    ]

    return Butina.ClusterData(condensed, n_fps, distance_cutoff, isDistData=True)
def Tanimotosimilarity(molecules=None, threshold=0.7, out_path="hogehoge.txt"):
    """Write every pair of molecules whose Tanimoto similarity reaches a threshold.

    Radius-2 Morgan fingerprints are computed for all molecules, each
    fingerprint is compared with the fingerprints that precede it, and one
    line ``"<ID_i>\\tsim\\t<ID_j>"`` is written for every pair ``(i, j)``
    with ``j < i`` whose similarity is at least ``threshold``.

    The previous implementation compared the whole similarity list with a
    float and referenced an undefined index ``j``; pairs are now checked one
    by one.

    Args:
        molecules: Molecules to compare; each must expose ``GetProp("ID")``.
            When ``None`` (the default), the module-level ``mols`` is used,
            as the original zero-argument function did.
        threshold: Minimum similarity for a pair to be reported.
        out_path: Path of the text file to (over)write.

    Returns:
        The lower-triangular similarity rows: element ``i`` is the list of
        similarities between molecule ``i`` and molecules ``0 .. i-1``.
    """
    if molecules is None:
        # Backward-compatible default: fall back to the module-level list.
        molecules = mols
    # Materialize so the input can be indexed by position below.
    molecules = list(molecules)

    fps = [AllChem.GetMorganFingerprint(m, 2) for m in molecules]
    simmat = []
    # The context manager closes the file even if a lookup or write fails.
    with open(out_path, "w") as out:
        for i, fp in enumerate(fps):
            tsims = DataStructs.BulkTanimotoSimilarity(fp, fps[:i])
            simmat.append(tsims)
            for j, sim in enumerate(tsims):
                if sim >= threshold:
                    out.write(
                        "%s\tsim\t%s\n"
                        % (molecules[i].GetProp("ID"), molecules[j].GetProp("ID"))
                    )
    return simmat
def calcDistMatrix(df, distMeasure):
    """Build the symmetric pairwise distance matrix for the fingerprints in ``df.FP``.

    Args:
        df: Table with an ``FP`` column of fingerprints, accessed through
            ``df.FP.iloc``.
        distMeasure: Name of the distance metric; only ``'Tanimoto'`` is
            implemented.

    Returns:
        A ``len(df) x len(df)`` array with a zero diagonal, or ``None``
        (after printing a message) when ``distMeasure`` is not supported.
    """
    n_rows = len(df)
    dists = np.zeros([n_rows, n_rows])

    if distMeasure != 'Tanimoto':
        print(distMeasure, 'distance metric not implemented.')
        return None

    for row in range(1, n_rows):
        # Distances from fingerprint `row` to every earlier fingerprint.
        row_dists = DataStructs.BulkTanimotoSimilarity(
            df.FP.iloc[row], list(df.FP.iloc[:row]), returnDistance=1
        )
        # Mirror the values so the matrix stays symmetric.
        dists[row, :row] = row_dists
        dists[:row, row] = row_dists

    return dists
def ClusterFps(fps, cutoff=0.2):
    """Group fingerprints into clusters with the Butina algorithm.

    Clustering by similarity is meant to cope with a compound set (the HERG
    set, per the original note) that is not a single series of related
    compounds.

    Args:
        fps: Sequence of fingerprints accepted by
            ``DataStructs.BulkTanimotoSimilarity``.
        cutoff: Distance cutoff passed to ``Butina.ClusterData``.

    Returns:
        The value returned by ``Butina.ClusterData``.
    """
    total = len(fps)

    # Step 1: flattened lower-triangular distance matrix (1 - similarity).
    flat_distances = []
    for idx in range(1, total):
        similarities = DataStructs.BulkTanimotoSimilarity(fps[idx], fps[:idx])
        flat_distances.extend(1 - value for value in similarities)

    # Step 2: cluster on the precomputed distances.
    clusters = Butina.ClusterData(flat_distances, total, cutoff, isDistData=True)
    return clusters
def cluster_mols(
    mols: List[Chem.rdchem.Mol],
    cutoff: float = 0.2,
    feature_fn: Callable = None,
    n_jobs: Optional[int] = 1,
):
    """Cluster molecules with the Butina algorithm at a given distance cutoff.

    Args:
        mols: The molecules to cluster.
        cutoff: Cutoff passed to ``Butina.ClusterData``. Defaults to 0.2.
        feature_fn: Function that takes a ``Chem.rdchem.Mol`` and returns its
            features. When ``None``, ``dm.to_fp`` with ``as_array=False`` is
            used.
        n_jobs: Number of jobs for parallelization. Leave at 1 for no
            parallelization; set to ``None`` to use all available cores.

    Returns:
        A tuple ``(cluster_indices, cluster_mols)``. ``cluster_indices`` is
        the result of ``Butina.ClusterData``. ``cluster_mols`` holds, for
        each cluster, the matching entries of ``mols``: a one-element list
        when the cluster selects a single ``Chem.rdchem.Mol``, otherwise
        whatever ``operator.itemgetter`` yields (a tuple for several indices).
    """
    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    features = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    # Flattened lower-triangular distance matrix, one row at a time.
    n_mols = len(mols)
    dists = []
    for row in range(1, n_mols):
        row_dists = DataStructs.BulkTanimotoSimilarity(
            features[row], features[:row], returnDistance=True
        )
        dists.extend(row_dists)

    # Cluster on the precomputed distances.
    cluster_indices = Butina.ClusterData(dists, n_mols, cutoff, isDistData=True)

    grouped = []
    for cluster in cluster_indices:
        members = operator.itemgetter(*cluster)(mols)
        # itemgetter returns the bare molecule for a one-index cluster;
        # wrap it so single-molecule clusters are lists.
        if isinstance(members, Chem.rdchem.Mol):
            members = [members]
        grouped.append(members)

    return cluster_indices, grouped
def tanimoto(mol, lib_in):
    """Return the Tanimoto similarity of ``mol`` to every entry of ``lib_in``.

    Both arguments are forwarded unchanged to
    ``DataStructs.BulkTanimotoSimilarity``.
    NOTE(review): despite its name, ``mol`` is presumably a fingerprint
    rather than a molecule; confirm against callers.

    Args:
        mol: The query passed as the first argument.
        lib_in: The collection to compare the query against.

    Returns:
        The result of ``DataStructs.BulkTanimotoSimilarity(mol, lib_in)``.
    """
    similarities = DataStructs.BulkTanimotoSimilarity(mol, lib_in)
    return similarities
def _calculate_tanimoto(self, query_fps, ref_fingerprints) -> np.array:
    """Return, for each query fingerprint, its best Tanimoto match in the references.

    Args:
        query_fps: Iterable of query fingerprints.
        ref_fingerprints: Reference fingerprints every query is compared to.

    Returns:
        An array with one value per query fingerprint: the maximum of its
        Tanimoto similarities to ``ref_fingerprints``.
    """
    best_matches = []
    for query_fp in query_fps:
        similarities = DataStructs.BulkTanimotoSimilarity(query_fp, ref_fingerprints)
        best_matches.append(np.max(similarities))
    return np.array(best_matches)
fp_bulk.append(fp1) for idy, parms_y in dataset2.iterrows(): try: smi_y = parms_y['SMILES'] exp_solb = parms_y['measured'] mol_y = Chem.MolFromSmiles(smi_y) AllChem.Compute2DCoords(mol_y) # fp2 = Chem.RDKFingerprint(mol_y) fp2 = AllChem.GetMorganFingerprintAsBitVect(mol_y, 2, 2048) except: print(smi_y + "was not valid SMILES\n") continue else: Tan = DataStructs.BulkTanimotoSimilarity(fp2, fp_bulk) Tan = np.asarray(Tan) Tan_mean = Tan.mean() Tan_min = Tan.min() Tan_max = Tan.max() Tan_bulk.append([smi_y, exp_solb, Tan_mean, Tan_min, Tan_max]) print("Tan= %s" % (Tan)) Tan_pd = pd.DataFrame(Tan_bulk, columns=[ 'smi', 'experimental_solubilty', 'smlty_mean', 'smlty_min', 'smlty_max' ]) pd_file = '%s_%s_smlty.csv' % (args.dataset1, args.dataset2) Tan_pd.to_csv(pd_file, index=False) print('mean= %s' % (Tan_pd['smlty_mean'].mean()))