def find_unique_confs(best_conformers, mol_files, threshold=0.5):
    """Pick the entries of ``best_conformers`` that are Butina cluster centroids.

    The first record of every .sdf file in ``mol_files`` is read with
    hydrogens removed and without sanitization. The conformers of all files
    are attached to the molecule read from the first file, clustered on
    their pairwise conformer RMS matrix with ``threshold`` as the Butina
    distance threshold, and the ``best_conformers`` entries at the centroid
    positions are returned.

    Presumably ``best_conformers[i]`` belongs to ``mol_files[i]`` -- the
    code only relies on both sharing the same positions.
    """

    def _read_first(sdf_path):
        # Only the first molecule of each file is used.
        supplier = rdmolfiles.ForwardSDMolSupplier(sdf_path, sanitize=False, removeHs=True)
        return next(supplier)

    # Collect every conformer on the molecule from the first file.
    combined = _read_first(mol_files[0])
    for sdf_path in mol_files[1:]:
        extra = _read_first(sdf_path)
        combined.AddConformer(extra.GetConformer(), assignId=True)

    # Pairwise RMS between conformers (threshold ~0.5, sanitize=False).
    # Alternative metric: TorsionFingerprints.GetTFDMatrix(combined)
    # (threshold ~0.01, needs sanitize=True).
    rms_matrix = AllChem.GetConformerRMSMatrix(combined, prealigned=False)

    clusters = Butina.ClusterData(
        rms_matrix,
        combined.GetNumConformers(),
        threshold,
        isDistData=True,
        reordering=True,
    )

    # The first index of each cluster is its centroid.
    return [best_conformers[members[0]] for members in clusters]
Exemple #2
0
def cluster_fingerprints(fps, cutoff=0.2):
    """
    Group fingerprints into clusters with the Butina algorithm.

    Adapted from the RDKit cookbook http://rdkit.org/docs_temp/Cookbook.html.

    Args:
        fps (list of rdkit.ExplicitBitVect): Fingerprint bit vectors to cluster.

        cutoff (float): Distance threshold handed to ``Butina.ClusterData``;
            distances are computed as ``1 - Tanimoto similarity``.

    Returns:
        tuple of tuple: Indices of the fingerprints assigned to each cluster.

    """
    n_fps = len(fps)

    # Flattened lower triangle of the distance matrix, built row by row:
    # fingerprint `row` is compared against every fingerprint before it.
    lower_triangle = [
        1 - similarity
        for row in range(1, n_fps)
        for similarity in DataStructs.BulkTanimotoSimilarity(fps[row], fps[:row])
    ]

    return Butina.ClusterData(lower_triangle, n_fps, cutoff, isDistData=True)
def cluster_conformers(mol, mode="RMSD", threshold=0.2):
    """
    Butina-cluster the conformers of ``mol`` on pairwise heavy-atom RMSD.

    Every conformer pair is passed to ``AlignMol`` with an atom map limited
    to the non-hydrogen atoms; the value it returns is used as the distance
    of that pair. ``mode`` is accepted but not used by this implementation.

    Returns the clusters produced by ``Butina.ClusterData`` (with
    reordering enabled).
    """
    # Identity atom map over the non-hydrogen atoms only.
    heavy_atom_ids = [atom.GetIdx() for atom in mol.GetAtoms()
                      if atom.GetAtomicNum() != 1]
    heavy_atom_map = [(idx, idx) for idx in heavy_atom_ids]

    # Lower triangle of the distance matrix, row by row; the loop indices
    # are passed to AlignMol as the two conformer ids.
    conformer_count = mol.GetNumConformers()
    pair_rmsd = [
        Chem.rdMolAlign.AlignMol(mol, mol, probe, ref, atomMap=heavy_atom_map)
        for probe in range(conformer_count)
        for ref in range(probe)
    ]

    return Butina.ClusterData(pair_rmsd,
                              mol.GetNumConformers(),
                              threshold,
                              isDistData=True,
                              reordering=True)
def cluster_ligands(ligands, cutoff=0.2):
    """Cluster ligands on Morgan fingerprints and return one per cluster.

    Every ligand is converted with ``ccdc_to_rdkit``, fingerprinted
    (Morgan bit vector, radius 2, 1024 bits) and clustered with the Butina
    algorithm on ``1 - Tanimoto similarity``. The first member of each
    cluster is converted back with ``rdkit_to_ccdc`` and returned.

    Conversion is best effort in both directions: a ligand whose conversion
    raises is skipped, so the result may hold fewer entries than there are
    clusters.

    Args:
        ligands: iterable of ligands accepted by ``ccdc_to_rdkit``.
        cutoff: distance threshold passed to ``Butina.ClusterData``.

    Returns:
        list: the converted-back first member of each cluster.
    """
    rdkit_ligands = []
    for lig in ligands:
        try:
            rdkit_ligands.append(ccdc_to_rdkit(lig))
        except Exception:
            # Best effort: skip ligands that cannot be converted, but let
            # KeyboardInterrupt / SystemExit propagate.
            continue

    # from RDKit Cookbook
    fps = [
        AllChem.GetMorganFingerprintAsBitVect(lig, 2, 1024)
        for lig in rdkit_ligands
    ]

    # Flattened lower triangle of the Tanimoto distance matrix.
    dists = []
    nfps = len(fps)
    for i in range(1, nfps):
        sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
        dists.extend(1 - x for x in sims)

    clusters = Butina.ClusterData(dists, nfps, cutoff, isDistData=True)

    representatives = []
    for members in clusters:
        try:
            representatives.append(rdkit_to_ccdc(rdkit_ligands[members[0]]))
        except Exception:
            # Best effort: drop representatives that cannot be converted back.
            continue
    return representatives
def gen_cluster_subset_algButina(fps, cutoff):
    """Butina-cluster fingerprints on ``1 - Tanimoto similarity``.

    Returns a tuple of tuples with the sequential numbers of the compounds
    in each cluster.
    """
    lower_triangle = []
    for position, fingerprint in enumerate(fps):
        # Similarities to all earlier fingerprints (none for the first one).
        similarities = DataStructs.BulkTanimotoSimilarity(fingerprint, fps[:position])
        lower_triangle += [1 - similarity for similarity in similarities]
    return Butina.ClusterData(lower_triangle, len(fps), cutoff, isDistData=True)
Exemple #6
0
def ClusterAlignments(mol,
                      alignments,
                      builder,
                      neighborTol=0.1,
                      distMetric=SubshapeDistanceMetric.PROTRUDE,
                      tempConfId=1001):
    """Cluster alignments by shape distance; return one alignment per cluster.

    For every pair of alignments, ``mol`` is transformed into two scratch
    conformers (ids ``tempConfId`` and ``tempConfId + 1``), a subshape is
    generated from each, and their distance under ``distMetric`` is
    recorded. The distances are clustered with the Butina algorithm using
    ``neighborTol`` as threshold. Both scratch conformers are removed from
    ``mol`` again once they are no longer needed.

    Returns a list with the first alignment of each cluster.
    """
    from rdkit.ML.Cluster import Butina

    outer_conf_id = tempConfId
    inner_conf_id = tempConfId + 1

    pair_distances = []
    for row, outer in enumerate(alignments):
        TransformMol(mol, outer.transform, newConfId=outer_conf_id)
        outer_shape = builder.GenerateSubshapeShape(mol,
                                                    outer_conf_id,
                                                    addSkeleton=False)
        # Compare against every earlier alignment (lower triangle).
        for col in range(row):
            TransformMol(mol, alignments[col].transform, newConfId=inner_conf_id)
            inner_shape = builder.GenerateSubshapeShape(mol,
                                                        inner_conf_id,
                                                        addSkeleton=False)
            pair_distances.append(
                GetShapeShapeDistance(outer_shape, inner_shape, distMetric))
            mol.RemoveConformer(inner_conf_id)
        mol.RemoveConformer(outer_conf_id)

    clusters = Butina.ClusterData(pair_distances,
                                  len(alignments),
                                  neighborTol,
                                  isDistData=True)
    return [alignments[members[0]] for members in clusters]
def cluster_chemicals(
    *,
    rebuild: bool = False,
    chemicals_dict,
):
    """Cluster chemicals based on their similarities.

    Unless ``rebuild`` is true, a table previously written to
    ``DEFAULT_CLUSTERED_CHEMICALS`` is loaded and returned as is. Otherwise
    the fingerprints are clustered with the Butina algorithm on
    ``1 - Tanimoto similarity`` (distance threshold 0.3), the result is
    written to ``DEFAULT_CLUSTERED_CHEMICALS`` as a tab-separated file and
    returned.

    :param rebuild: recompute even if the cached table exists.
    :param chemicals_dict: mapping from chemical identifier to a fingerprint
        accepted by ``DataStructs.BulkTanimotoSimilarity``.
    :return: DataFrame with the columns ``PubchemID`` and ``Cluster``;
        cluster numbers and row labels both start at 1.
    """
    if not rebuild and os.path.exists(DEFAULT_CLUSTERED_CHEMICALS):
        return pd.read_csv(DEFAULT_CLUSTERED_CHEMICALS,
                           sep="\t",
                           index_col=False,
                           dtype={'PubchemID': str})

    # Keys and fingerprints stay position-aligned: fps[k] belongs to
    # drugs[k]. Building them separately also copes with an empty mapping.
    drugs = list(chemicals_dict)
    fps = list(chemicals_dict.values())

    dists = []
    nfps = len(fps)
    for i in tqdm(range(1, nfps), desc='Calculating distance for clustering'):
        sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
        dists.extend(1 - x for x in sims)
    cs = Butina.ClusterData(dists, nfps, 0.3, isDistData=True)

    # Butina returns 0-based positions into `fps`, so they index `drugs`
    # directly (the earlier `drug - 1` shifted every assignment by one and
    # wrapped position 0 to the last chemical).
    rows = [
        (drugs[member], cluster_number)
        for cluster_number, members in enumerate(cs, start=1)
        for member in members
    ]
    df = pd.DataFrame(rows, columns=['PubchemID', 'Cluster'])
    # Keep the 1-based row labels callers of the returned frame already see.
    df.index = range(1, len(df) + 1)

    df.to_csv(DEFAULT_CLUSTERED_CHEMICALS, sep='\t', index=False)
    return df
Exemple #8
0
def leven_butina_cs(smiles, distThresh=3, reordering=False):
    """Butina-cluster ``smiles`` with ``levenshtein`` as the distance function.

    ``distThresh`` and ``reordering`` are forwarded to
    ``Butina.ClusterData``; its result is returned unchanged.
    """
    clustering_options = {
        "data": smiles,
        "nPts": len(smiles),
        "distThresh": distThresh,
        "distFunc": levenshtein,
        "reordering": reordering,
    }
    return Butina.ClusterData(**clustering_options)
Exemple #9
0
def cluster(
    mol: Chem.rdchem.Mol,
    rms_cutoff: float = 1,
    already_aligned: bool = False,
    centroids: bool = True,
):
    """Group the conformers of a molecule by Butina clustering on pairwise RMS.

    All work is done on a deep copy, so the molecule passed in is not
    modified by this function.

    Args:
        mol: a molecule
        rms_cutoff: The RMS cutoff in Angstrom.
        already_aligned: Whether or not the conformers are aligned. If False,
            they will be aligned during the RMS computation.
        centroids: If True, return one molecule with centroid conformers
            only. If False return a list of molecules per cluster with all
            the conformers of the cluster. Defaults to True.
    """
    working_mol = copy.deepcopy(mol)

    # Pairwise RMS between all conformers of the copy.
    rms_matrix = AllChem.GetConformerRMSMatrix(working_mol,
                                               prealigned=already_aligned)

    clusters = Butina.ClusterData(
        rms_matrix,
        nPts=working_mol.GetNumConformers(),
        distThresh=rms_cutoff,
        isDistData=True,
        reordering=False,
    )

    return return_centroids(working_mol, clusters, centroids=centroids)
Exemple #10
0
def PerformButinaClustering(Mols, MolsFingerprints):
    """Cluster molecules with the Butina algorithm and group them by cluster.

    The similarity cutoff and the reordering flag are read from
    ``OptionsInfo``. Returns a list with one list of molecules per cluster,
    where each molecule is looked up in ``Mols`` by its clustered index.
    """

    MiscUtil.PrintInfo(
        "\nClustering molecules using Butina methodology and %s similarity metric..."
        % OptionsInfo["SimilarityMetric"])

    NumFingerprints = len(MolsFingerprints)
    # Butina works on distances; the option stores a similarity cutoff.
    MaxDistance = 1 - OptionsInfo["ButinaSimilarityCutoff"]
    UseReordering = OptionsInfo["ButinaReordering"]

    LowerTriangle = GenerateLowerTriangularDistanceMatrix(MolsFingerprints)

    ClusterIndices = Butina.ClusterData(LowerTriangle,
                                        NumFingerprints,
                                        MaxDistance,
                                        reordering=UseReordering,
                                        isDistData=True)

    # Translate index tuples back into the molecules they refer to.
    return [[Mols[Index] for Index in Members] for Members in ClusterIndices]
def butina_clustering_m(rdkit_mol, difference_matrix='tfd', threshold=0.001):
    """Reduce a molecule's conformers to their Butina cluster centroids.

    Args:
        rdkit_mol: molecule whose conformers are clustered.
        difference_matrix: ``'tfd'`` to cluster on
            ``TorsionFingerprints.GetTFDMatrix`` or ``'rms'`` to cluster on
            ``AllChem.GetConformerRMSMatrix`` (case-insensitive).
        threshold: distance threshold passed to ``Butina.ClusterData``.

    Returns:
        A deep copy of ``rdkit_mol`` that holds only the centroid conformer
        of each cluster, with freshly assigned conformer ids.

    Raises:
        ValueError: if ``difference_matrix`` is neither ``'tfd'`` nor
            ``'rms'``.
    """
    # Validate first: previously an unknown value left `diffmat` unbound and
    # surfaced later as an UnboundLocalError.
    metric = difference_matrix.lower()
    if metric not in ('tfd', 'rms'):
        raise ValueError(
            "difference_matrix must be 'tfd' or 'rms', got %r" % (difference_matrix,))

    # calculate difference matrix
    if metric == 'tfd':
        diffmat = TorsionFingerprints.GetTFDMatrix(rdkit_mol)
    else:
        diffmat = AllChem.GetConformerRMSMatrix(rdkit_mol, prealigned=False)

    # cluster conformers
    clusters = Butina.ClusterData(diffmat,
                                  rdkit_mol.GetNumConformers(),
                                  threshold,
                                  isDistData=True,
                                  reordering=True)

    # The first index of each cluster is its centroid.
    centroid_indices = [members[0] for members in clusters]

    new_rdkit_mol = copy.deepcopy(rdkit_mol)
    new_rdkit_mol.RemoveAllConformers()

    # NOTE(review): the cluster indices are used as conformer ids here, which
    # assumes the ids run 0..n-1 -- TODO confirm for molecules whose
    # conformers were removed or re-numbered.
    for idx in centroid_indices:
        new_rdkit_mol.AddConformer(rdkit_mol.GetConformer(idx), assignId=True)

    return new_rdkit_mol
    def cluster_butina(self, cutoff=0.7):
        '''
        Assign every entry to a Butina cluster.

        Returns a list with one element per name that holds the index of
        the cluster the entry at that position belongs to.
        The cutoff variable can be used to specify the clustering threshold.
        '''
        # flatten the lower triangle of the distance matrix, row by row
        matrix = self.distance().values
        lower_triangle = [
            matrix[row, col]
            for row in range(len(self.names()))
            for col in range(row)
        ]

        clusters = Butina.ClusterData(lower_triangle,
                                      len(self.names()),
                                      cutoff,
                                      isDistData=True)

        # translate the cluster tuples into a per-entry membership list
        membership = [None] * len(self.names())
        for cluster_id, members in enumerate(clusters):
            for member in members:
                membership[member] = cluster_id

        return membership
Exemple #13
0
def ClusterFps(fps, cutoff=0.2, metric='Tanimoto'):
    '''Cluster structures with the Butina algorithm on the given fingerprints.

    fps: Fingerprint input for clustering.
    cutoff: Distance threshold for Butina clustering; distances are computed
        as 1 - similarity.
    metric: Case-insensitive name of the similarity metric. Available:
        Tanimoto, Dice, Cosine, Sokal, Russel, RogotGoldberg, AllBit,
        Kulczynski, McConnaughey, Asymmetric and BraunBlanquet.
        An unknown name prints a warning and falls back to Tanimoto.

    Returns the clusters produced by Butina.ClusterData.
    '''
    from rdkit import DataStructs
    from rdkit.ML.Cluster import Butina

    # Keys are lower case because the lookup lower-cases the requested name
    # (the former "rogotGoldberg" key could never be matched).
    bulk_similarity = {
        'tanimoto': DataStructs.BulkTanimotoSimilarity,
        'dice': DataStructs.BulkDiceSimilarity,
        'cosine': DataStructs.BulkCosineSimilarity,
        'sokal': DataStructs.BulkSokalSimilarity,
        'russel': DataStructs.BulkRusselSimilarity,
        'rogotgoldberg': DataStructs.BulkRogotGoldbergSimilarity,
        'allbit': DataStructs.BulkAllBitSimilarity,
        'kulczynski': DataStructs.BulkKulczynskiSimilarity,
        'mcconnaughey': DataStructs.BulkMcConnaugheySimilarity,
        'asymmetric': DataStructs.BulkAsymmetricSimilarity,
        'braunblanquet': DataStructs.BulkBraunBlanquetSimilarity,
    }

    metric_key = metric.lower()
    if metric_key not in bulk_similarity:
        # Parenthesised single argument prints the same text on Python 2 and 3.
        print("The given metric is unknown!")
        metric_key = 'tanimoto'
    bulk = bulk_similarity[metric_key]

    # first generate the distance matrix (flattened lower triangle):
    dists = []
    nfps = len(fps)
    for i in range(1, nfps):
        sims = bulk(fps[i], fps[:i])
        dists.extend([1 - x for x in sims])

    # now cluster the data:
    return Butina.ClusterData(dists, nfps, cutoff, isDistData=True)
def cluster(smile_keys, fp_type, cutoff=0.15):
    """Fingerprint the given SMILES keys and Butina-cluster them.

    ``cutoff`` is the distance threshold for the clustering; the distance
    used here is ``1 - Tanimoto similarity``.

    Returns a pair: the result of ``form_cluster_with_algorithm_results``
    and the export matrix built by ``create_similarity_export_matrix`` from
    all key pairs and their distances.
    """
    key_count = len(smile_keys)

    # One fingerprint per key, in key order.
    fingerprints = [fingerprint_smile(smile_keys[pos], fp_type)
                    for pos in range(key_count)]

    # Distances and the matching (earlier key, current key) pairs, both laid
    # out as the flattened lower triangle of the distance matrix.
    distances = []
    key_pairs = []
    for current in range(1, key_count):
        similarities = DataStructs.BulkTanimotoSimilarity(fingerprints[current],
                                                          fingerprints[:current])
        distances += [1 - similarity for similarity in similarities]
        key_pairs += [(smile_keys[earlier], smile_keys[current])
                      for earlier in range(current)]

    # Export table of every pair with its distance.
    matrix_df = create_similarity_export_matrix(key_pairs, distances)

    butina_result = Butina.ClusterData(distances, key_count, cutoff, isDistData=True)
    return form_cluster_with_algorithm_results(smile_keys, butina_result), matrix_df
Exemple #15
0
def do_clustering(simm, queue_fps, threshold):
    """Cluster the entries of a library and return them as a JSON response.

    Entries are read from ``queue_fps`` until it raises ``Closed``. Each
    entry is compared with all earlier ones through ``simm.find_sim``;
    ``1 - similarity`` is the distance fed to the Butina clustering with
    ``threshold``. Every entry gets its cluster number stored under
    ``["values"]["cluster"]`` and the entries are returned, cluster by
    cluster, in an ``HttpResponse`` after passing through ``remove_keys``.
    """
    # Drain the queue; a closed queue signals the end of the input.
    library = []
    while True:
        try:
            entry = queue_fps.get()
        except Closed:
            break
        library.append(entry)

    # Build the flattened lower triangle of the distance matrix.
    entry_count = len(library)
    distances = []
    for position in range(1, entry_count):
        # find_sim expects a closed queue holding the entries to screen.
        earlier_entries = CloseableQueue.CloseableQueue()
        for earlier in library[:position]:
            earlier_entries.put(earlier)
        earlier_entries.close()

        hits = simm.find_sim(library[position], earlier_entries, -1.0)
        similarities = [hit["values"]["similarity"] for hit in hits]
        distances.extend(1 - similarity for similarity in similarities)

    clusters = Butina.ClusterData(distances, entry_count, threshold, isDistData=True)

    # Tag every entry with its cluster number, keeping cluster order.
    clustered = []
    for cluster_number, members in enumerate(clusters):
        for member in members:
            entry = library[member]
            entry["values"]["cluster"] = cluster_number
            clustered.append(entry)

    return HttpResponse(json.dumps(remove_keys(clustered)))
Exemple #16
0
def ClusterFps(fps, cutoff=0.2):
    """Butina-cluster fingerprints on ``1 - Tanimoto similarity``.

    ``cutoff`` is the distance threshold; the clusters returned by
    ``Butina.ClusterData`` are passed back unchanged.
    """
    count = len(fps)
    distances = []
    for idx in range(1, count):
        # Compare fingerprint `idx` with every fingerprint before it.
        for similarity in DataStructs.BulkTanimotoSimilarity(fps[idx], fps[:idx]):
            distances.append(1 - similarity)
    return Butina.ClusterData(distances, count, cutoff, isDistData=True)
Exemple #17
0
def ButinaClusteringOriginal(dists, nfps, distThresh=0.7):
    """Run Butina clustering with reordering and report the elapsed time.

    dists: flattened distance data handed to ``Butina.ClusterData``.
    nfps: number of points that were compared.
    distThresh: distance threshold for the clustering (default 0.7, the
        value that used to be hard-coded).

    Returns the clusters produced by ``Butina.ClusterData``.
    """
    # Each print gets one pre-formatted string so the output is the same on
    # Python 2 (where the print statement was used before) and Python 3.
    print("-------------------------------------------------")
    print("starting Butina clustering")

    # now cluster the data:
    start_time = time.time()
    cs = Butina.ClusterData(dists, nfps, distThresh, isDistData=True, reordering=True)

    print("time taken:  %s" % (time.time() - start_time))
    return cs
Exemple #18
0
    def test1(self):
        # Six points, distance threshold 1.1, no reordering.
        # Lower triangle of the distance matrix, one row per line.
        dists = [
            1,
            2, 1,
            4, 3, 2,
            6, 5, 4, 2,
            7, 6, 5, 3, 1,
        ]
        nPts = 6
        cs = Butina.ClusterData(dists, nPts, 1.1, isDistData=1)

        # assertEqual replaces failUnless (removed in Python 3.12) and
        # reports both values when a check fails.
        self.assertEqual(len(cs), 3)
        self.assertEqual(cs[0], (1, 0, 2))
        self.assertEqual(cs[1], (5, 4))
        self.assertEqual(cs[2], (3, ))
Exemple #19
0
def ClusterFps(fps, cutoff=0.2):
    """Cluster fingerprints with the Butina algorithm on Tanimoto distance.

    Returns the clusters produced by ``Butina.ClusterData`` for the
    distance threshold ``cutoff``.
    """
    # (ytz): this follows Greg Landrum's clustering example.
    total = len(fps)
    tanimoto_dists = [
        1 - sim
        for k in range(1, total)
        for sim in DataStructs.BulkTanimotoSimilarity(fps[k], fps[:k])
    ]
    return Butina.ClusterData(tanimoto_dists, total, cutoff, isDistData=True)
Exemple #20
0
def ClusterFps(fps, cutoff=0.2):
    """Butina-cluster fingerprints using ``Tanimoto_distance_matrix``.

    ``cutoff`` is handed to ``Butina.ClusterData`` as the distance
    threshold; its clusters are returned unchanged.
    """
    # The distance data comes precomputed from the helper.
    tanimoto_distances = Tanimoto_distance_matrix(fps)
    return Butina.ClusterData(tanimoto_distances, len(fps), cutoff, isDistData=True)
Exemple #21
0
def cluster_from_mol_list(mol_list, cutoff=0.8, fp="ecfp6", activity_prop=None,
                          summary_only=True, generate_cores=False, align_to_core=False):
    """Clusters the input Mol_List and prints a cluster size summary.

    Parameters:
        mol_list (tools.Mol_List): the input molecule list.
        cutoff (float): passed unchanged to ``Butina.ClusterData`` as the
            threshold on ``1 - Tanimoto similarity``.
            NOTE(review): earlier documentation called this a *similarity*
            cutoff -- confirm which meaning callers expect.
        fp (str): key into ``FPDICT`` selecting the fingerprint function.
        activity_prop: forwarded to ``add_cores``.
        summary_only (bool): if true, only the summary is printed and
            ``None`` is returned.
        generate_cores (bool): if true, the result is passed through
            ``add_cores``.
        align_to_core: forwarded to ``add_cores``.

    Returns:
        A new Mol_List containing the input molecules with their respective cluster number,
        as well as additionally the cluster cores, containing some statistics.
        ``None`` if ``summary_only`` is true or ``fp`` is not a known fingerprint."""

    try:
        fingerprint_of = FPDICT[fp]
    except KeyError:
        print("Fingerprint {} not found. Available fingerprints are: {}".format(fp, ", ".join(sorted(FPDICT.keys()))))
        return

    fingerprints = [fingerprint_of(mol) for mol in mol_list]

    # flattened lower triangle of the Tanimoto distance matrix
    fp_count = len(fingerprints)
    distances = []
    for pos in range(1, fp_count):
        similarities = DataStructs.BulkTanimotoSimilarity(fingerprints[pos], fingerprints[:pos])
        distances += [1 - sim for sim in similarities]

    clusters = Butina.ClusterData(distances, fp_count, cutoff, isDistData=True)

    # summary: how many clusters exist for each cluster size
    size_histogram = Counter(len(members) for members in clusters)
    print("    fingerprint:", fp)
    print("    clustersize  num_of_clusters")
    print("    ===========  ===============")
    for size in sorted(size_histogram, reverse=True):
        print("        {:4d}            {:3d}".format(size, size_histogram[size]))
    print()

    if summary_only:
        return None

    # collect the molecules of each cluster, largest cluster first
    result = tools.Mol_List()
    clusters_by_size = sorted(clusters, key=len, reverse=True)
    for cluster_id, member_indices in enumerate(clusters_by_size, 1):
        members = get_mol_list_from_index_list(mol_list, member_indices, cluster_id)
        members[0].SetProp("is_repr", "yes")  # The first compound in a cluster is the representative
        result.extend(members)

    return add_cores(result, activity_prop, align_to_core) if generate_cores else result
Exemple #22
0
 def test4(self):
   " edge case: everything in one cluster "
   # Lower triangle of the 4x4 distance matrix, one row per line.
   dists = [1,
            2,1,
            3,2,1,
            ]
   nPts = 4
   # Every distance is within the threshold of 2, so one cluster is expected.
   cs = Butina.ClusterData(dists,nPts,2,isDistData=1)
   # assertEqual reports both values when a check fails.
   self.assertEqual(len(cs), 1)
   self.assertEqual(cs[0], (3,0,1,2))
Exemple #23
0
    def test8_reordering_changes(self):
        # " reordering: changes"
        # Lower triangle of the 7x7 distance matrix, one row per line.
        dists = [
            2,
            3.5, 1.5,
            5, 3, 1.5,
            7, 5, 3.5, 2,
            8, 6, 4.5, 3, 1,
            9, 7, 5.5, 4, 2, 1,
        ]
        nPts = 7

        # without reordering
        cs = Butina.ClusterData(dists, nPts, 2.1, isDistData=1)
        # assertEqual reports both values when a check fails.
        self.assertEqual(len(cs), 3)
        self.assertEqual(cs[0], (4, 3, 5, 6))
        self.assertEqual(cs[1], (2, 1))
        self.assertEqual(cs[2], (0, ))

        # with reordering
        cs = Butina.ClusterData(dists,
                                nPts,
                                2.1,
                                isDistData=1,
                                reordering=True)
        self.assertEqual(len(cs), 2)
        self.assertEqual(cs[0], (4, 3, 5, 6))
        self.assertEqual(cs[1], (1, 0, 2))
Exemple #24
0
def ClusterFps(fps, cutoff=0.2):
    """Butina clustering of fingerprints with ``cutoff`` as distance threshold.

    Distances are ``1 - Tanimoto similarity``; the clusters returned by
    ``Butina.ClusterData`` are passed back unchanged.
    """
    fingerprint_count = len(fps)

    def lower_triangle():
        # Yield the distance matrix row by row, below the diagonal.
        for row in range(1, fingerprint_count):
            for similarity in DataStructs.BulkTanimotoSimilarity(fps[row], fps[:row]):
                yield 1 - similarity

    distances = list(lower_triangle())
    return Butina.ClusterData(distances, fingerprint_count, cutoff, isDistData=True)
Exemple #25
0
def cluster_conformers(mol, mode="RMSD", threshold=2.0):
    """Butina-cluster the conformers of ``mol``.

    With ``mode == "TFD"`` the distances come from
    ``TorsionFingerprints.GetTFDMatrix``; any other value uses
    ``AllChem.GetConformerRMSMatrix``. ``threshold`` is the distance
    threshold; the clusters from ``Butina.ClusterData`` are returned.
    """
    distance_matrix = (
        TorsionFingerprints.GetTFDMatrix(mol)
        if mode == "TFD"
        else AllChem.GetConformerRMSMatrix(mol, prealigned=False)
    )
    return Butina.ClusterData(
        distance_matrix,
        mol.GetNumConformers(),
        threshold,
        isDistData=True,
        reordering=True,
    )
Exemple #26
0
 def ClusterFps_Butina(self, dists, nfps, cutoff):
     """Run Butina clustering and record the result on the instance.

     ``self.clustdict`` maps the 1-based cluster number to the cluster's
     members. ``self.cdict`` is rebuilt and maps every member to
     ``[cluster number, flag]``, where the flag marks the first member of
     the cluster.
     """
     self.cdict = {}
     clusters = Butina.ClusterData(dists, nfps, cutoff, isDistData=True)
     for number, members in enumerate(clusters, start=1):
         self.clustdict[number] = members
         first_member = members[0]
         for member in members:
             # NOTE(review): "flase" looks like a typo of "false"; the value
             # is kept as is because consumers may compare against it.
             flag = "true" if member == first_member else "flase"
             self.cdict[member] = [number, flag]
Exemple #27
0
def cluster_fingerprints(fingerprints, cutoff=0.2):
    """Butina-cluster fingerprints on ``1 - Tanimoto similarity``.

    ``cutoff`` is the distance threshold; the clusters returned by
    ``Butina.ClusterData`` are passed back unchanged.
    """
    from rdkit import DataStructs
    from rdkit.ML.Cluster import Butina

    total = len(fingerprints)

    # Flattened lower triangle of the distance matrix.
    distances = []
    for current in range(1, total):
        earlier = fingerprints[:current]
        similarities = DataStructs.BulkTanimotoSimilarity(fingerprints[current], earlier)
        distances += [1 - similarity for similarity in similarities]

    clusters = Butina.ClusterData(distances, total, cutoff, isDistData=True)
    return clusters
Exemple #28
0
  def test3(self):
    " edge case: everything a singleton "
    # Lower triangle of the 3x3 distance matrix, one row per line.
    dists = [1,
             2,1,
             ]
    nPts = 3
    # No distance is within the threshold of 0.9, so no two points may share
    # a cluster.
    cs = Butina.ClusterData(dists,nPts,0.9,isDistData=1)
    # assertEqual reports both values when a check fails.
    self.assertEqual(len(cs), 3)

    self.assertEqual(cs[0], (2,))
    self.assertEqual(cs[1], (1,))
    self.assertEqual(cs[2], (0,))
Exemple #29
0
 def test6(self):
   " edge case: zero distances: "
   # Lower triangle of the 5x5 distance matrix, one row per line.
   dists = [1,
            2,0,
            2,0,0,
            4,2,2,2,
            ]
   nPts = 5
   cs = Butina.ClusterData(dists,nPts,0.9,isDistData=1)
   # assertEqual reports both values when a check fails.
   self.assertEqual(len(cs), 3)
   self.assertEqual(cs[0], (3,1,2))
   self.assertEqual(cs[1], (4,))
   self.assertEqual(cs[2], (0,))
Exemple #30
0
 def test4(self):
   " edge case: one in the middle leaves the edges lonely "
   # Lower triangle of the 5x5 distance matrix, one row per line.
   dists = [1.5,
            2.5,1,
            3.5,2,1,
            5,3.5,2.5,1.5,
            ]
   nPts = 5
   cs = Butina.ClusterData(dists,nPts,1.1,isDistData=1)
   # assertEqual reports both values when a check fails.
   self.assertEqual(len(cs), 3)
   self.assertEqual(cs[0], (2,1,3))
   self.assertEqual(cs[1], (4,))
   self.assertEqual(cs[2], (0,))
   self.assertTrue(cs[2]==(0,))