Example #1
0
    def __call__(self, smiles: List[str]) -> dict:
        """Score each SMILES by how close its best reference similarity is to 0.5.

        Every SMILES that RDKit can parse is converted to a count-based Morgan
        fingerprint (radius 3, no feature invariants) and compared against
        ``self.ref_fps``. The highest Tanimoto similarity ``s`` is mapped to
        ``max(1 - 2 * |0.5 - s|, 0)``. Inputs that fail to parse keep a score
        of 0.

        Args:
            smiles: SMILES strings to score.

        Returns:
            A dict whose ``"total_score"`` entry is a float32 array holding
            one score per input string, in input order.
        """
        parsed = [Chem.MolFromSmiles(text) for text in smiles]
        # Positions of the inputs RDKit managed to parse (MolFromSmiles gave
        # back something other than None).
        usable = [pos for pos, mol in enumerate(parsed) if mol is not None]

        fingerprints = []
        for pos in usable:
            fingerprints.append(
                AllChem.GetMorganFingerprint(parsed[pos],
                                             3,
                                             useCounts=True,
                                             useFeatures=False))

        nearest = []
        for fingerprint in fingerprints:
            similarities = DataStructs.BulkTanimotoSimilarity(
                fingerprint, self.ref_fps)
            nearest.append(np.max(similarities))
        nearest = np.array(nearest)

        # Triangular reshaping: 1 at a similarity of 0.5, falling linearly
        # to 0 at similarities of 0 and 1.
        shaped = np.maximum(1 - 2 * np.absolute(0.5 - nearest), 0)

        result = np.zeros(len(smiles), dtype=np.float32)
        for pos, value in zip(usable, shaped):
            result[pos] = value
        return {"total_score": np.array(result, dtype=np.float32)}
Example #2
0
def ClusterFps(fps, similaridad):
    """Cluster fingerprints with Butina using a Tanimoto similarity threshold.

    Args:
        fps: indexable, sliceable sequence of fingerprints that
            ``DataStructs.BulkTanimotoSimilarity`` can compare.
        similaridad: similarity threshold; it is converted into the distance
            cutoff ``1 - similaridad`` handed to the clustering routine.

    Returns:
        The value returned by ``Butina.ClusterData``.
    """
    n_items = len(fps)
    distance_cutoff = 1 - similaridad

    # Flattened lower triangle of the distance matrix: entry i is compared
    # against entries 0..i-1, row after row.
    lower_triangle = []
    for row in range(1, n_items):
        for similarity in DataStructs.BulkTanimotoSimilarity(fps[row], fps[:row]):
            lower_triangle.append(1 - similarity)

    return Butina.ClusterData(lower_triangle,
                              n_items,
                              distance_cutoff,
                              isDistData=True)
Example #3
0
def Tanimotosimilarity():
    """Write every molecule pair with Tanimoto similarity >= 0.7 to hogehoge.txt.

    Reads the module-level ``mols`` sequence, builds a radius-2 Morgan
    fingerprint for each molecule and compares every molecule with all the
    molecules that precede it. For each pair at or above the 0.7 threshold
    one tab-separated line ``<ID of later mol>\\tsim\\t<ID of earlier mol>``
    is written, where the IDs come from each molecule's "ID" property.

    Returns:
        None. The only output is the file ``hogehoge.txt`` in the current
        working directory, which is overwritten on every call.
    """
    fps = [AllChem.GetMorganFingerprint(m, 2) for m in mols]

    # The context manager guarantees the file is closed even if a molecule
    # lacks the "ID" property or a similarity call fails midway.
    with open("hogehoge.txt", "w") as out:
        # Index 0 has no predecessors, so start at 1.
        for i in range(1, len(fps)):
            tsims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
            # tsims[j] is the similarity between molecule i and molecule j;
            # the threshold must be applied per pair, not to the whole list.
            for j, sim in enumerate(tsims):
                if sim >= 0.7:
                    out.write("%s\tsim\t%s\n" %
                              (mols[i].GetProp("ID"), mols[j].GetProp("ID")))
Example #4
0
def calcDistMatrix(df, distMeasure):
    """Build the symmetric pairwise distance matrix for the entries of ``df.FP``.

    Args:
        df: sized object exposing an ``FP`` attribute indexable through
            ``.iloc`` (presumably a pandas DataFrame with a fingerprint
            column -- confirm against the caller).
        distMeasure: name of the distance metric; only ``'Tanimoto'`` is
            implemented.

    Returns:
        A ``len(df) x len(df)`` numpy array with a zero diagonal, or ``None``
        (after printing a message) when ``distMeasure`` is not supported.
    """
    size = len(df)
    matrix = np.zeros([size, size])

    if distMeasure != 'Tanimoto':
        print(distMeasure, 'distance metric not implemented.')
        return

    for row in range(1, size):
        # Distances from entry `row` to every earlier entry.
        row_dists = DataStructs.BulkTanimotoSimilarity(df.FP.iloc[row],
                                                       list(df.FP.iloc[:row]),
                                                       returnDistance=1)
        # Fill the lower triangle and mirror it into the upper triangle.
        matrix[row, :row] = row_dists
        matrix[:row, row] = row_dists
    return matrix
Example #5
0
def ClusterFps(fps, cutoff=0.2):
    """Group fingerprints into Butina clusters by Tanimoto distance.

    Intended for grouping molecules by their similarity to other compounds
    in the HERG set, as a way around not having a single series of related
    compounds.

    Args:
        fps: indexable, sliceable sequence of fingerprints that
            ``DataStructs.BulkTanimotoSimilarity`` can compare.
        cutoff: distance cutoff forwarded to ``Butina.ClusterData``.

    Returns:
        The value returned by ``Butina.ClusterData``.
    """
    count = len(fps)

    # Condensed distance list: each fingerprint against all earlier ones,
    # with similarity turned into distance as 1 - similarity.
    flat_distances = []
    for idx in range(1, count):
        earlier = fps[:idx]
        flat_distances.extend(
            1 - sim
            for sim in DataStructs.BulkTanimotoSimilarity(fps[idx], earlier))

    clusters = Butina.ClusterData(flat_distances, count, cutoff, isDistData=True)
    return clusters
Example #6
0
def cluster_mols(
    mols: List[Chem.rdchem.Mol],
    cutoff: float = 0.2,
    feature_fn: Callable = None,
    n_jobs: Optional[int] = 1,
):
    """Cluster molecules with the Butina algorithm on Tanimoto distances.

    Args:
        mols: molecules to cluster.
        cutoff: distance cutoff handed to ``Butina.ClusterData``.
            Defaults to 0.2.
        feature_fn: callable mapping a ``Chem.rdchem.Mol`` to the features
            compared with ``BulkTanimotoSimilarity``. When None,
            ``dm.to_fp`` with ``as_array=False`` is used.
        n_jobs: forwarded to ``dm.parallelized`` when computing the features
            (1 means no parallelization; None is documented upstream as
            "use all available cores").

    Returns:
        A pair ``(cluster_indices, cluster_mols)``: the index clusters
        returned by ``Butina.ClusterData`` and, in the same order, the
        molecules belonging to each cluster.
    """
    featurizer = feature_fn
    if featurizer is None:
        featurizer = functools.partial(dm.to_fp, as_array=False)
    features = dm.parallelized(featurizer, mols, n_jobs=n_jobs)

    total = len(mols)

    # Condensed distance list: molecule i against molecules 0..i-1.
    condensed = []
    for row in range(1, total):
        condensed.extend(
            DataStructs.BulkTanimotoSimilarity(features[row],
                                               features[:row],
                                               returnDistance=True))

    cluster_indices = Butina.ClusterData(condensed,
                                         total,
                                         cutoff,
                                         isDistData=True)

    grouped = []
    for members in cluster_indices:
        picked = operator.itemgetter(*members)(mols)
        # itemgetter with a single index returns the bare molecule rather
        # than a tuple, so wrap it to keep every cluster a sequence.
        if isinstance(picked, Chem.rdchem.Mol):
            picked = [picked]
        grouped.append(picked)

    return cluster_indices, grouped
def tanimoto(mol, lib_in):
    """Return the Tanimoto similarity of ``mol`` to every entry of ``lib_in``.

    Thin wrapper: both arguments are forwarded unchanged to
    ``DataStructs.BulkTanimotoSimilarity``, so despite its name ``mol``
    presumably has to be a fingerprint already (TODO confirm with callers).
    """
    similarities = DataStructs.BulkTanimotoSimilarity(mol, lib_in)
    return similarities
Example #8
0
 def _calculate_tanimoto(self, query_fps, ref_fingerprints) -> np.array:
     """Return, for each query fingerprint, its best Tanimoto match.

     Args:
         query_fps: iterable of fingerprints to look up.
         ref_fingerprints: reference fingerprints each query is compared
             against via ``DataStructs.BulkTanimotoSimilarity``.

     Returns:
         A numpy array holding, per query, the maximum similarity to any
         reference fingerprint.
     """
     best_matches = []
     for query_fp in query_fps:
         scores = DataStructs.BulkTanimotoSimilarity(query_fp, ref_fingerprints)
         best_matches.append(np.max(scores))
     return np.array(best_matches)
Example #9
0
        fp_bulk.append(fp1)

# For every molecule of dataset2, summarise its Tanimoto similarity (mean,
# min, max) to the fingerprints collected in fp_bulk, then write the summary
# table to '<dataset1>_<dataset2>_smlty.csv'.
for _, parms_y in dataset2.iterrows():
    # Column lookups stay outside the try block: a missing column is a schema
    # problem and must not be reported as an invalid SMILES.
    smi_y = parms_y['SMILES']
    exp_solb = parms_y['measured']

    try:
        mol_y = Chem.MolFromSmiles(smi_y)
        if mol_y is None:
            # MolFromSmiles signals a parse failure by returning None.
            raise ValueError("unparsable SMILES")
        AllChem.Compute2DCoords(mol_y)
        # Morgan fingerprint, radius 2, folded to 2048 bits.
        fp2 = AllChem.GetMorganFingerprintAsBitVect(mol_y, 2, 2048)
    except Exception:
        # RDKit raises Boost.Python errors that cannot be named here, so
        # Exception is the narrowest practical catch; unlike a bare except
        # it lets KeyboardInterrupt/SystemExit through. %s formatting also
        # copes with non-string SMILES values such as NaN.
        print("%s was not valid SMILES\n" % (smi_y,))
        continue

    Tan = np.asarray(DataStructs.BulkTanimotoSimilarity(fp2, fp_bulk))
    Tan_mean = Tan.mean()
    Tan_min = Tan.min()
    Tan_max = Tan.max()
    Tan_bulk.append([smi_y, exp_solb, Tan_mean, Tan_min, Tan_max])
    print("Tan=  %s" % (Tan))

Tan_pd = pd.DataFrame(Tan_bulk,
                      columns=[
                          'smi', 'experimental_solubilty', 'smlty_mean',
                          'smlty_min', 'smlty_max'
                      ])
pd_file = '%s_%s_smlty.csv' % (args.dataset1, args.dataset2)
Tan_pd.to_csv(pd_file, index=False)
print('mean= %s' % (Tan_pd['smlty_mean'].mean()))