Beispiel #1
0
 def _morgan(self, molecules):
     if self.vector == 'int':
         from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint
         self.fps_ = [
             GetMorganFingerprint(self._sanitary(mol), self.radius,
                                  **self.kwargs) for mol in molecules
         ]
         # get nonzero elements as a dictionary for each molecule
         dict_nonzero = [fp.GetNonzeroElements() for fp in self.fps_]
         # pairScores = []
         # for fp in dict_nonzero:
         #     pairScores += list(fp)
         data = pd.DataFrame(
             dict_nonzero)  #, columns=list(set(pairScores)))
         data.fillna(0, inplace=True)
         return data
     elif self.vector == 'bit':
         from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
         self.fps_ = [
             GetMorganFingerprintAsBitVect(self._sanitary(mol),
                                           self.radius,
                                           nBits=self.n_bits,
                                           **self.kwargs)
             for mol in molecules
         ]
         data = np.array(self.fps_)
         data = pd.DataFrame(data)
         return data
Beispiel #2
0
 def test_keep_similar_samples(self):
     """Without similarity filtering, some sampled pair must score >= 0.85.

     Draws 100 samples with ``filter_similar=False`` and compares every
     unordered pair with ``FingerprintSimilarity`` on radius-4, 1024-bit
     Morgan fingerprints.
     """
     samp = self.sampler.sample(100, filter_similar=False, verbose=False)
     n_samples = len(samp)
     # Fingerprint each sample once; the pair loop below only compares.
     fps = [
         GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(samp[i]), 4,
                                       nBits=1024)
         for i in range(n_samples)
     ]
     scores = [
         FingerprintSimilarity(fps[i], fps[j])
         for i in range(n_samples - 1)
         for j in range(i + 1, n_samples)
     ]
     self.assertFalse(all(s < 0.85 for s in scores))
Beispiel #3
0
def morgan_fingerprint(
    df: pd.DataFrame,
    mols_col: str,
    radius: int = 3,
    nbits: int = 2048,
    kind: str = "counts",
) -> pd.DataFrame:
    """
    Convert a column of RDKIT Mol objects into Morgan Fingerprints.

    Returns a new dataframe without any of the original data. This is
    intentional, as Morgan fingerprints are usually high-dimensional
    features.

    Method chaining usage:

    .. code-block:: python

        df = pd.DataFrame(...)
        morgans = df.morgan_fingerprint(mols_col='mols', radius=3, nbits=2048)

    If you wish to join the Morgans back into the original dataframe, this
    can be accomplished by doing a `join`, because the indices are
    preserved:

    .. code-block:: python

        joined = df.join(morgans)

    An empty input yields an empty frame with ``nbits`` columns and the
    input's index.

    :param df: A pandas DataFrame.
    :param mols_col: The name of the column that has the RDKIT mol objects
    :param radius: Radius of Morgan fingerprints. Defaults to 3.
    :param nbits: The length of the fingerprints. Defaults to 2048.
    :param kind: Whether to return counts or bits. Defaults to counts.
    :returns: A pandas DataFrame
    :raises ValueError: if ``kind`` is neither ``"counts"`` nor ``"bits"``.
    """
    acceptable_kinds = ["counts", "bits"]
    if kind not in acceptable_kinds:
        raise ValueError(f"`kind` must be one of {acceptable_kinds}")

    mols = df[mols_col]
    if len(mols) == 0:
        # np.vstack cannot stack an empty list, so build the empty result
        # directly with the same column layout as the non-empty case.
        return pd.DataFrame(np.zeros((0, nbits)), index=df.index)

    if kind == "bits":
        fps = [GetMorganFingerprintAsBitVect(m, radius, nbits) for m in mols]
    else:
        fps = [GetHashedMorganFingerprint(m, radius, nbits) for m in mols]

    np_fps = []
    for fp in fps:
        # ConvertToNumpyArray resizes the destination to the fingerprint
        # length, so a one-element placeholder is enough.
        arr = np.zeros((1, ))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)

    fpdf = pd.DataFrame(np.vstack(np_fps))
    fpdf.index = df.index
    return fpdf
Beispiel #4
0
    def test__string_output_format__binary(self) -> None:
        """Check the "sparse_string" output of a binary folded featurizer.

        The featurizer is fitted on the SMILES fixtures; every on-bit of the
        RDKit reference fingerprint must appear in the parsed string output
        of the corresponding molecule.
        """
        featurizer = CircularFPFeaturizer(output_format="sparse_string", fp_mode="binary_folded")

        sparse_strings = featurizer.fit_transform(self.smis)  # using SMILES

        # One output string per molecule.
        self.assertEqual(self.n_mols, len(sparse_strings))

        for row, mol in enumerate(self.mols):
            reference = GetMorganFingerprintAsBitVect(
                mol,
                radius=featurizer.radius,
                useFeatures=featurizer.use_features_,
                useChirality=featurizer.use_chirality,
                nBits=featurizer.n_bits_,
            )

            # NOTE(review): eval() is applied to featurizer output; acceptable
            # only because the string is produced by the code under test.
            parsed = eval("{" + sparse_strings[row] + "}")

            for bit in reference.GetOnBits():
                self.assertIn(bit, parsed)
Beispiel #5
0
 def test_correct_filter_similar_samples(self):
     """With filtering at threshold 0.3, every sampled pair must score < 0.3.

     Draws 60 samples with ``filter_similar=True`` and compares every
     unordered pair with ``FingerprintSimilarity`` on radius-4, 2048-bit
     Morgan fingerprints.
     """
     samp = self.sampler.sample(60,
                                filter_similar=True,
                                threshold=0.3,
                                verbose=False)
     n_samples = len(samp)
     # Fingerprint each sample once; the pair loop below only compares.
     fps = [
         GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(samp[i]), 4,
                                       nBits=2048)
         for i in range(n_samples)
     ]
     scores = [
         FingerprintSimilarity(fps[i], fps[j])
         for i in range(n_samples - 1)
         for j in range(i + 1, n_samples)
     ]
     self.assertTrue(all(s < 0.3 for s in scores))
Beispiel #6
0
    def __represent(self, smiles):
        """Return the binary Morgan fingerprint (radius 2, 1024 bits) of a SMILES.

        The string is stripped of surrounding whitespace before parsing.

        Raises:
            ValueError: if RDKit cannot parse the SMILES string.
        """
        mol = Chem.MolFromSmiles(smiles.strip())
        if mol is None:
            raise ValueError('%s is not a valid SMILES representation' % smiles)

        bit_vect = GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)
        return np.array(bit_vect)
Beispiel #7
0
def ecfp(mol, r=3, nBits=4096, errors_as_zeros=True):
    """Return the Morgan bit fingerprint of ``mol`` as a float32 array.

    :param mol: an RDKit ``Mol``; anything else is passed to
        ``Chem.MolFromSmiles`` first.
    :param r: Morgan radius.
    :param nBits: fingerprint length.
    :param errors_as_zeros: when fingerprinting fails, return an all-zero
        vector of length ``nBits`` if True, otherwise ``np.nan``.
    :return: float32 array of length ``nBits`` (or ``np.nan`` on failure
        when ``errors_as_zeros`` is False).
    """
    mol = Chem.MolFromSmiles(mol) if not isinstance(
        mol, rdkit.Chem.rdchem.Mol) else mol
    try:
        # ConvertToNumpyArray resizes the destination to the fingerprint
        # length, so a one-element placeholder is enough.
        arr = np.zeros((1, ))
        ConvertToNumpyArray(GetMorganFingerprintAsBitVect(mol, r, nBits), arr)
        return arr.astype(np.float32)
    except Exception:
        # Best-effort by design (e.g. an unparsable SMILES gives mol=None),
        # but KeyboardInterrupt/SystemExit must still propagate.
        if errors_as_zeros:
            return np.zeros((nBits, ), dtype=np.float32)
        # np.NaN was removed in NumPy 2.0; np.nan exists in every version.
        return np.nan
Beispiel #8
0
    def _transform_mol(self, mol):
        """Private method to transform a skchem molecule.

        Use `transform` for the public method, which genericizes the argument
        to iterables of mols.

        Three outputs are possible, chosen from ``self.as_bits`` and
        ``self.n_feats``:

        * ``as_bits`` and ``n_feats > 0``: uint8 array from a folded bit
          vector of length ``n_feats``.
        * ``n_feats <= 0``: dict of the unfolded fingerprint's nonzero
          elements, with values reduced to 0/1 when ``as_bits`` is set.
        * otherwise: array built from the hashed count fingerprint with
          ``nBits=n_feats``.

        Args:
            mol (skchem.Mol): Molecule to calculate fingerprint for.

        Returns:
            np.array or dict:
                Fingerprint as an array (or a dict if sparse).
        """
        # Keyword options shared by all three RDKit calls.
        morgan_opts = dict(useFeatures=self.use_features,
                           useBondTypes=self.use_bond_types,
                           useChirality=self.use_chirality)

        if self.as_bits and self.n_feats > 0:
            bit_vect = GetMorganFingerprintAsBitVect(mol,
                                                     self.radius,
                                                     nBits=self.n_feats,
                                                     **morgan_opts)
            dense = np.array(0)
            ConvertToNumpyArray(bit_vect, dense)
            return dense.astype(np.uint8)

        if self.n_feats <= 0:
            sparse_fp = GetMorganFingerprint(mol, self.radius, **morgan_opts)
            nonzero = sparse_fp.GetNonzeroElements()
            if self.as_bits:
                return {k: int(v > 0) for k, v in nonzero.items()}
            return nonzero

        hashed_fp = GetHashedMorganFingerprint(mol,
                                               self.radius,
                                               nBits=self.n_feats,
                                               **morgan_opts)
        return np.array(list(hashed_fp))
def mol_to_1jxx_feats(mol: MyMol, atom0: int, atom1: int):
    """Assemble the feature dict for the atom pair ``(atom0, atom1)``.

    The nodes of interest are the atoms on ``mol.path(atom0, atom1)``
    (named by their position on the path) followed by the nodes returned
    by ``determine_surrounding_nodes_1jhx``. The result merges, in this
    order: per-node features, 3D distances, angles, distance statistics,
    dihedral statistics and angle statistics (the last two per node and
    per element in H, C, N, O), the Morgan bit vector under ``'fp'``,
    ``'molecule_name'``, and one ``'n<name>'`` entry per node. Later
    groups overwrite earlier ones on key collisions.
    """

    def merge_all(dicts):
        # Left-to-right merge; later mappings win on duplicate keys.
        combined = {}
        for mapping in dicts:
            combined.update(mapping)
        return combined

    path = mol.path(atom0, atom1)
    assert path[0] == atom0, 'wrong path'

    surrounding = determine_surrounding_nodes_1jhx(mol.G, path)
    path_nodes = [
        NamedNode(name=str(pos), n=atom) for pos, atom in enumerate(path)
    ]
    nodes_in_interest = path_nodes + surrounding

    node_feats = merge_all(
        [node_to_feat(mol, node.n, node.name) for node in nodes_in_interest])

    all_3d_dist_feats = calc_all_3d_dist(mol, nodes_in_interest)
    all_angle_feats = calc_all_angle_feats(mol, nodes_in_interest)
    dist_stats_feats = calc_dist_stats_feats(mol, nodes_in_interest)

    elements = ['H', 'C', 'N', 'O']
    dihedral_stats_feats = merge_all([
        calc_node_to_atom_dihedral_stats(mol, node, symbol)
        for node, symbol in product(nodes_in_interest, elements)
    ])
    angle_stats_feats = merge_all([
        calc_node_to_atom_angle_stats(mol, node, symbol)
        for node, symbol in product(nodes_in_interest, elements)
    ])

    fp = GetMorganFingerprintAsBitVect(mol.mol, 2, fromAtoms=path)

    return {
        **node_feats,
        **all_3d_dist_feats,
        **all_angle_feats,
        **dist_stats_feats,
        **dihedral_stats_feats,
        **angle_stats_feats,
        'fp': fp,
        # Not feature
        'molecule_name': mol.name,
        **{'n{}'.format(node.name): node.n
           for node in nodes_in_interest},
    }
Beispiel #10
0
def create_circular_fingerprint(mol, radius, size, chirality):
    """Return the Morgan bit fingerprint of ``mol`` as a numpy array.

    :param mol: RDKit molecule to fingerprint
    :param radius: Morgan radius
    :param size: number of bits (passed as ``nBits``)
    :param chirality: passed as ``useChirality``
    :return: np array of morgan fingerprint
    """
    bit_vect = GetMorganFingerprintAsBitVect(
        mol, radius, nBits=size, useChirality=chirality)
    return np.array(bit_vect)
Beispiel #11
0
 def CalculateECFP(self, mol):
     """Compute the Morgan bit fingerprint of a molecule.

     Uses ``self.radius`` and ``self.nBits``; no feature invariants are
     requested.

     :param mol: molecule
     :type mol: rdkit.Chem.rdchem.Mol
     :return: fingerprint, one list element per bit position
     :rtype: list

     """
     bit_vect = GetMorganFingerprintAsBitVect(
         mol, radius=self.radius, nBits=self.nBits)
     return list(bit_vect)
Beispiel #12
0
    def test__folded_binary_fingerprints__ecfp(self) -> None:
        """Folded binary output must match RDKit's bit vector bit for bit.

        The featurizer is run on both the SMILES and the Mol fixtures; each
        resulting matrix must have shape ``(n_mols, n_bits_folded)`` and be
        truthy exactly at the reference fingerprint's on-bits.
        """
        fprintr = CircularFPFeaturizer(fp_mode="binary_folded", n_bits_folded=512)

        fps_mat_smi = fprintr.fit_transform(self.smis)  # using SMILES
        fps_mat_mol = fprintr.fit_transform(self.mols)  # using Mol objects

        # Output shape
        self.assertEqual(fps_mat_smi.shape[0], self.n_mols)
        self.assertEqual(fps_mat_smi.shape[1], fprintr.n_bits_folded)
        self.assertEqual(fps_mat_mol.shape[0], self.n_mols)
        self.assertEqual(fps_mat_mol.shape[1], fprintr.n_bits_folded)

        # Fingerprint matrix structure
        for i, mol in enumerate(self.mols):
            fps_ref = GetMorganFingerprintAsBitVect(mol, radius=fprintr.radius, useFeatures=fprintr.use_features_,
                                                    useChirality=fprintr.use_chirality, nBits=fprintr.n_bits_folded)
            # A set keeps the per-bit membership test O(1).
            on_bits = set(fps_ref.GetOnBits())
            for j in range(fprintr.n_bits_folded):
                check = self.assertTrue if j in on_bits else self.assertFalse
                check(fps_mat_smi[i, j])
                check(fps_mat_mol[i, j])
Beispiel #13
0
 def CalculateFCFP(self, mol):
     """
     Compute the feature-based Morgan bit fingerprint of a molecule,
     using ``self.radius``, ``self.nBits`` and ``useFeatures=True``.

     Parameters:
     -----------
     mol: rdkit.Chem.rdchem.Mol
         The molecule to fingerprint

     Return:
     -----------
     fps: list
         One element per bit position
     """
     bit_vect = GetMorganFingerprintAsBitVect(
         mol, radius=self.radius, nBits=self.nBits, useFeatures=True)
     return list(bit_vect)
Beispiel #14
0
 def CalculateECFP(self, mol):
     """
     Compute the Morgan bit fingerprint of a molecule, using
     ``self.radius`` and ``self.nBits``.

     Parameters:
     -----------
     mol: rdkit.Chem.rdchem.Mol
         The molecule to fingerprint

     Return:
     -----------
     fps: list
         One element per bit position
     """
     bit_vect = GetMorganFingerprintAsBitVect(
         mol, radius=self.radius, nBits=self.nBits)
     return list(bit_vect)
Beispiel #15
0
 def _sample_w_filter(self, n_samples, data, threshold, verbose):
     """Draw ``n_samples`` SMILES from ``data``, rejecting similar ones.

     Candidates are picked at random. The SMILES is the text before the
     first comma of the chosen entry. A candidate is rejected when RDKit
     cannot parse it or when ``self._are_similar`` reports it as similar
     to an already accepted fingerprint at ``threshold``. Accepted SMILES
     are appended to ``self._samples``.

     Note that ``data`` is modified in place: rejected entries are removed
     from it.

     :param n_samples: number of samples to accept.
     :param data: list of comma-separated lines, SMILES first.
     :param threshold: similarity threshold passed to ``self._are_similar``.
     :param verbose: show a tqdm progress bar when true.
     :raises ValueError: if ``data`` runs out before ``n_samples`` are found.
     """

     def discard(position):
         # Swap-remove. The last element must be popped first and only
         # written back if the slot still exists; assigning
         # ``data[position] = data.pop()`` fails when position is the last.
         last = data.pop()
         if position < len(data):
             data[position] = last

     selected_mols_fp = list()
     pb = None
     if verbose:
         pb = tqdm(total=n_samples, ascii=True, desc="Sampling")
     try:
         count = 0
         while count < n_samples:
             if not data:
                 raise ValueError(
                     "candidate pool exhausted after %d of %d samples" %
                     (count, n_samples))
             idx = random.sample(range(len(data)), 1)[0]
             smiles = data[idx].split(",")[0]
             mol = Chem.MolFromSmiles(smiles)
             if mol is None:
                 discard(idx)
                 continue
             fp = GetMorganFingerprintAsBitVect(mol, 4, nBits=1024)
             if self._are_similar(fp, selected_mols_fp, threshold):
                 discard(idx)
                 continue
             self._samples.append(smiles)
             selected_mols_fp.append(fp)
             if pb is not None:
                 pb.update(1)
             count += 1
     finally:
         if pb is not None:
             pb.close()
Beispiel #16
0
def GetFoldedCircularFragment(mol, minRadius=1, maxRadius=2,
                              nBits=1024, maxFragment=True,
                              disposed=True):
    """Get folded circular fragment

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol object
        Compound to be Calculated
    minRadius : int, optional
        The probable minimum radius of circular fragment, by default 1
    maxRadius : int, optional
        The probable maximum radius of circular fragment, by default 2
    nBits : int, optional
        the number of bit of morgan, by default 1024
    maxFragment : bool, optional
        Whether only return the maximum fragment at a center atom, by default True
    disposed : bool, optional
        Whether dispose the original bitinfo, by default True

    Returns
    -------
    fragments : list of list
        The first element is the ID of all fragments generated
        the second one is the ID of output fragments.
        When ``disposed`` is False, the raw ``bitInfo`` dict filled by
        RDKit is returned instead.
    """
    bitInfo = {}
    # Called only to populate ``bitInfo``; the bit vector is not needed.
    GetMorganFingerprintAsBitVect(mol,
                                  radius=maxRadius,
                                  nBits=nBits,
                                  bitInfo=bitInfo)

    if not disposed:
        return bitInfo
    return _DisposeCircularBitInfo(bitInfo, minRadius, maxFragment)
Beispiel #17
0
def get_ecfp(mol):
    """Return the radius-2, 2048-bit Morgan fingerprint of ``mol``.

    The result is a numpy array holding the individual characters of the
    fingerprint's bit string, not integers.
    """
    from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
    bit_vect = GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    chars = list(bit_vect.ToBitString())
    return np.array(chars)
Beispiel #18
0
)
from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprint
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
from rdkit.DataStructs import (
    ExplicitBitVect,
    BulkTanimotoSimilarity,
    BulkDiceSimilarity,
    BulkTverskySimilarity,
)
from rdkit.ML.Cluster import Butina

# Module-level debug flag (its consumers are outside this excerpt).
DEBUG = True

# Fingerprint generators keyed by descriptor name. Each value is called with
# a single molecule and returns an RDKit fingerprint object.
DESCRIPTORS = {
    'path': RDKFingerprint,
    # Morgan bit vector with radius 2 and RDKit's default nBits.
    'ecfp4': lambda mol: GetMorganFingerprintAsBitVect(mol, radius=2),
    # Morgan bit vector with radius 2 folded to 512 bits.
    'zinc':
    lambda mol: GetMorganFingerprintAsBitVect(mol, radius=2, nBits=512),
    'apair': lambda mol: GetAtomPairFingerprint(mol)
}

# Bulk similarity functions keyed by coefficient name. Each takes a query
# fingerprint ``x`` and a collection ``ys``; 'tversky' additionally consumes
# the two weights ``a`` and ``b``. Extra positional arguments are ignored so
# all entries can be called with the same argument list.
COEFFICIENTS = {
    'tanimoto': lambda x, ys, *args: BulkTanimotoSimilarity(x, ys),
    'dice': lambda x, ys, *args: BulkDiceSimilarity(x, ys),
    'tversky': lambda x, ys, a, b, *args: BulkTverskySimilarity(x, ys, a, b),
}

# Names of the recognised clustering approaches.
# NOTE(review): 'butina' presumably maps to rdkit.ML.Cluster.Butina imported
# above - confirm against the code that consumes this list.
CLUSTERING_APPROACHES = [
    'butina',
    'cassidy',
]
def convert_to_morgan(mol):
    """Return the radius-2, 1024-bit Morgan bit vector of ``mol``."""
    return GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
Beispiel #20
0
    def Fingerprint(self):
        """Compute fingerprints for ``self.molecules`` as a pandas DataFrame.

        The fingerprint family is chosen by ``self.FPtype`` and the encoding
        by ``self.vector`` (``'int'`` or ``'bit'``). The raw RDKit
        fingerprint objects are stored on ``self.fps``; the returned frame
        has one row per molecule.

        Supported ``FPtype`` values: ``'Hashed_atom_pair'``/``'HAP'``,
        ``'Atom_pair'``/``'AP'``, ``'MACCS'`` (bit only), ``'Morgan'``,
        ``'Hashed_topological_torsion'``/``'HTT'`` and
        ``'Topological_torsion'``/``'TT'`` (int only). Hashed types use
        ``self.nBits``; Morgan uses ``self.radius`` (and ``self.nBits`` for
        bit vectors).

        Raises:
            ValueError: for an unknown ``FPtype``, an unknown ``vector``, or
                a combination that has no RDKit implementation.
        """

        def dense_frame():
            # One column per bit position of the stored bit vectors.
            return pd.DataFrame(np.array(self.fps))

        def nonzero_rows():
            # One {key: value} mapping of nonzero elements per molecule.
            return [fp.GetNonzeroElements() for fp in self.fps]

        def sparse_frame(rows, columns=None):
            # Columns default to the union of keys seen in any row; entries
            # missing for a molecule are filled with 0.
            if columns is None:
                keys = []
                for row in rows:
                    keys += list(row)
                columns = list(set(keys))
            frame = pd.DataFrame(rows, columns=columns)
            frame.fillna(0, inplace=True)
            return frame

        fp_type = self.FPtype
        vector = self.vector

        if fp_type == 'Hashed_atom_pair' or fp_type == 'HAP':
            if vector == 'int':
                from rdkit.Chem.AtomPairs.Pairs import GetHashedAtomPairFingerprint
                self.fps = [
                    GetHashedAtomPairFingerprint(m, nBits=self.nBits)
                    for m in self.molecules
                ]
                return sparse_frame(nonzero_rows(),
                                    columns=range(self.nBits))
            elif vector == 'bit':
                from rdkit.Chem.rdMolDescriptors import GetHashedAtomPairFingerprintAsBitVect
                self.fps = [
                    GetHashedAtomPairFingerprintAsBitVect(m, nBits=self.nBits)
                    for m in self.molecules
                ]
                return dense_frame()
            else:
                msg = "The argument vector can be 'int' or 'bit'"
                raise ValueError(msg)
        elif fp_type == 'Atom_pair' or fp_type == 'AP':
            if vector == 'int':
                from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprint
                self.fps = [GetAtomPairFingerprint(m) for m in self.molecules]
                return sparse_frame(nonzero_rows())
            elif vector == 'bit':
                from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprintAsBitVect
                self.fps = [
                    GetAtomPairFingerprintAsBitVect(m) for m in self.molecules
                ]
                # The former debug section here (Python 2 print statements
                # and a dense frame that was built and then discarded) had
                # no effect on the result and has been removed.
                rows = [{i: el
                         for i, el in enumerate(fp) if el != 0}
                        for fp in self.fps]
                return sparse_frame(rows)
            else:
                msg = "The argument vector can be 'int' or 'bit'"
                raise ValueError(msg)
        elif fp_type == 'MACCS':
            if vector == 'int':
                msg = "There is no RDKit function to encode int vectors for MACCS keys"
                raise ValueError(msg)
            elif vector == 'bit':
                from rdkit.Chem.MACCSkeys import GenMACCSKeys
                self.fps = [GenMACCSKeys(mol) for mol in self.molecules]
                return dense_frame()
            else:
                msg = "The vector argument can only be 'int' or 'bit'"
                raise ValueError(msg)
        elif fp_type == 'Morgan':
            if vector == 'int':
                from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint
                self.fps = [
                    GetMorganFingerprint(mol, self.radius)
                    for mol in self.molecules
                ]
                return sparse_frame(nonzero_rows())
            elif vector == 'bit':
                from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
                self.fps = [
                    GetMorganFingerprintAsBitVect(mol,
                                                  self.radius,
                                                  nBits=self.nBits)
                    for mol in self.molecules
                ]
                return dense_frame()
            else:
                msg = "The argument vector can only be 'int' or 'bit'"
                raise ValueError(msg)
        elif fp_type == 'Hashed_topological_torsion' or fp_type == 'HTT':
            if vector == 'int':
                from rdkit.Chem.rdMolDescriptors import GetHashedTopologicalTorsionFingerprint
                self.fps = [
                    GetHashedTopologicalTorsionFingerprint(m, nBits=self.nBits)
                    for m in self.molecules
                ]
                return sparse_frame(nonzero_rows(),
                                    columns=range(self.nBits))
            elif vector == 'bit':
                from rdkit.Chem.rdMolDescriptors import GetHashedTopologicalTorsionFingerprintAsBitVect
                self.fps = [
                    GetHashedTopologicalTorsionFingerprintAsBitVect(
                        m, nBits=self.nBits) for m in self.molecules
                ]
                return dense_frame()
            else:
                msg = "The argument vector can be 'int' or 'bit'"
                raise ValueError(msg)
        elif fp_type == 'Topological_torsion' or fp_type == 'TT':
            if vector == 'int':
                from rdkit.Chem.AtomPairs.Torsions import GetTopologicalTorsionFingerprintAsIntVect
                self.fps = [
                    GetTopologicalTorsionFingerprintAsIntVect(mol)
                    for mol in self.molecules
                ]
                return sparse_frame(nonzero_rows())
            elif vector == 'bit':
                msg = "There is no RDKit function to encode bit vectors for Topological Torsion Fingerprints"
                raise ValueError(msg)
            else:
                msg = "The argument vector can only be 'int'"
                raise ValueError(msg)
        else:
            msg = "The type argument '%s' is not a valid fingerprint type" % self.FPtype
            raise ValueError(msg)
Beispiel #21
0
def morgan_fingerprint(
    df: pd.DataFrame,
    mols_column_name: str,
    radius: int = 3,
    nbits: int = 2048,
    kind: str = "counts",
) -> pd.DataFrame:
    """
    Convert a column of RDKIT Mol objects into Morgan Fingerprints.

    Returns a new dataframe without any of the original data. This is
    intentional, as Morgan fingerprints are usually high-dimensional
    features.

    This method does not mutate the original DataFrame.

    Functional usage example:

    .. code-block:: python

        import pandas as pd
        import janitor.chemistry

        df = pd.DataFrame(...)

        # For "counts" kind
        morgans = janitor.chemistry.morgan_fingerprint(
            df=df.smiles2mol('smiles', 'mols'),
            mols_column_name='mols',
            radius=3,      # Defaults to 3
            nbits=2048,    # Defaults to 2048
            kind='counts'  # Defaults to "counts"
        )

        # For "bits" kind
        morgans = janitor.chemistry.morgan_fingerprint(
            df=df.smiles2mol('smiles', 'mols'),
            mols_column_name='mols',
            radius=3,      # Defaults to 3
            nbits=2048,    # Defaults to 2048
            kind='bits'    # Defaults to "counts"
        )

    Method chaining usage example:

    .. code-block:: python

        import pandas as pd
        import janitor.chemistry

        df = pd.DataFrame(...)

        # For "counts" kind
        morgans = (
            df.smiles2mol('smiles', 'mols')
              .morgan_fingerprint(mols_column_name='mols',
                                  radius=3,      # Defaults to 3
                                  nbits=2048,    # Defaults to 2048
                                  kind='counts'  # Defaults to "counts"
              )
        )

        # For "bits" kind
        morgans = (
            df.smiles2mol('smiles', 'mols')
              .morgan_fingerprint(mols_column_name='mols',
                                  radius=3,    # Defaults to 3
                                  nbits=2048,  # Defaults to 2048
                                  kind='bits'  # Defaults to "counts"
              )
        )

    If you wish to join the morgan fingerprints back into the original
    dataframe, this can be accomplished by doing a `join`,
    because the indices are preserved:

    .. code-block:: python

        joined = df.join(morgans)

    An empty input yields an empty frame with ``nbits`` columns and the
    input's index.

    :param df: A pandas DataFrame.
    :param mols_column_name: The name of the column that has the RDKIT
        mol objects
    :param radius: Radius of Morgan fingerprints. Defaults to 3.
    :param nbits: The length of the fingerprints. Defaults to 2048.
    :param kind: Whether to return counts or bits. Defaults to counts.
    :returns: A new pandas DataFrame of Morgan fingerprints.
    :raises ValueError: if ``kind`` is neither ``"counts"`` nor ``"bits"``.
    """
    acceptable_kinds = ["counts", "bits"]
    if kind not in acceptable_kinds:
        raise ValueError(f"`kind` must be one of {acceptable_kinds}")

    mols = df[mols_column_name]
    if len(mols) == 0:
        # np.vstack cannot stack an empty list, so build the empty result
        # directly with the same column layout as the non-empty case.
        return pd.DataFrame(np.zeros((0, nbits)), index=df.index)

    if kind == "bits":
        fps = [
            GetMorganFingerprintAsBitVect(m, radius, nbits, useChirality=True)
            for m in mols
        ]
    else:
        fps = [
            GetHashedMorganFingerprint(m, radius, nbits, useChirality=True)
            for m in mols
        ]

    np_fps = []
    for fp in fps:
        # ConvertToNumpyArray resizes the destination to the fingerprint
        # length, so a one-element placeholder is enough.
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)

    fpdf = pd.DataFrame(np.vstack(np_fps))
    fpdf.index = df.index
    return fpdf
Beispiel #22
0
from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
from rdkit import DataStructs
from tqdm import tqdm


if __name__ == "__main__":
    with open("./test.csv", "r") as f:
        header, *data = f.readlines()
    i = 0
    pb1 = tqdm(total=len(data), ascii=True, desc="Main progress")
    while i < len(data):
        pb2 = tqdm(total=len(data), ascii=True, desc="Look for similar")
        m1 = Chem.MolFromSmiles(data[i].split(",")[0])
        fp1 = GetMorganFingerprintAsBitVect(m1, 4, nBits=2048)
        j = i + 1
        while j < len(data):
            m2 = Chem.MolFromSmiles(data[j].split(",")[0])
            if m2 is None:
                data[j] = data.pop()
                pb2.update(1)
                continue
            fp2 = GetMorganFingerprintAsBitVect(m2, 4, nBits=2048)
            similarity = DataStructs.FingerprintSimilarity(fp1, fp2)
            if similarity > 0.85:
                if j == len(data) - 1:
                    data.pop()
                else:
                    data[j] = data.pop()
            else: