Exemple #1
0
 def CalculateMACCS(self, mol):
     """Compute the MACCS keys fingerprint of one molecule.

     Uses the SMARTS-based implementation of the 166 public MACCS keys
     (167 bits) and expands the resulting bit vector into a plain list
     holding one entry per bit.

     :param mol: molecule
     :type mol: rdkit.Chem.rdchem.Mol
     :return: fingerprint
     :rtype: list

     """
     bit_vector = GetMACCSKeysFingerprint(mol)
     return list(bit_vector)
Exemple #2
0
def maccs_keys_fingerprint(df: pd.DataFrame, mols_column_name) -> pd.DataFrame:
    """
    Convert a column of RDKIT mol objects into MACCS Keys Fingerprints.

    Returns a new dataframe without any of the original data.
    This is intentional to leave the user with the data requested.

    An empty input column yields an empty dataframe (zero rows, one column
    per MACCS bit) instead of raising.

    Method chaining usage:

    .. code-block:: python

        df = pd.DataFrame(...)
        maccs = df.maccs_keys_fingerprint(mols_column_name='mols')

    If you wish to join the MACCS keys fingerprints back into the
    original dataframe, this can be accomplished by doing a `join`,
    because the indices are preserved:

    .. code-block:: python

        joined = df.join(maccs)


    :param df: A pandas DataFrame.
    :param mols_column_name: The name of the column that has the RDKIT mol
        objects.
    :returns: A pandas DataFrame with one row per molecule, indexed like
        ``df``.
    """
    # Width of RDKit's MACCS keys bit vector (166 public keys; bit 0 unused).
    n_maccs_bits = 167

    rows = []
    for mol in df[mols_column_name]:
        bit_vect = GetMACCSKeysFingerprint(mol)
        # ConvertToNumpyArray resizes the destination array to the length
        # of the bit vector, so a one-element placeholder is sufficient.
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(bit_vect, arr)
        rows.append(arr)

    if rows:
        np_maccs = np.vstack(rows)
    else:
        # np.vstack refuses an empty sequence; build the empty matrix
        # explicitly so callers still get a well-formed frame.
        np_maccs = np.zeros((0, n_maccs_bits))

    fmaccs = pd.DataFrame(np_maccs)
    fmaccs.index = df.index
    return fmaccs
Exemple #3
0
def CalculateMACCS(mols):
    """
    Brief:
    -----------
    Compute the MACCS keys fingerprint of every molecule in ``mols`` using
    the SMARTS-based implementation of the 166 public MACCS keys.

    Ref.:
    -----------
    Using the 166 public keys implemented as SMARTS

    Parameters:
    -----------
    mols: Iterable object, each element is a rdkit.Chem.rdchem.Mol
        The molecule(s) to be scanned

    Return:
    -----------
    fps: numpy.ndarray
        The per-molecule fingerprint arrays stacked into a single array,
        in input order.
    """
    per_molecule = [np.array(GetMACCSKeysFingerprint(mol)) for mol in mols]
    return np.array(per_molecule)
Exemple #4
0
def maccs_keys(mol, **kwargs):
    """Return the on-bits of the MACCS keys fingerprint of ``mol`` as a list.

    Extra keyword arguments are accepted but not used.
    """
    fingerprint = GetMACCSKeysFingerprint(mol)
    on_bits = fingerprint.GetOnBits()
    return list(on_bits)
Exemple #5
0
class FingerprintsTransformer(MoleculeTransformer):
    r"""
    Fingerprint molecule transformer.
    This transformer is able to compute various fingerprints regularly used in QSAR modeling.

    Arguments
    ----------
        kind: str, optional
            Name of the fingerprinting method used. Should be one of the
            keys of ``MAPPING``:
            {'atom_pair', 'topological_torsion', 'ecfp2', 'ecfp4', 'ecfp6',
            'estate', 'avalon_bit', 'avalon_count', 'erg', 'rdkit', 'maccs'}
            (Default value = 'ecfp2')
        length: int, optional
            Length of the fingerprint to use. It is forwarded to the
            fingerprint function as ``fpSize`` for 'rdkit' and as ``nBits``
            otherwise; the 'estate', 'erg' and 'maccs' entries do not
            forward it.
            (Default value = 4096)

    Attributes
    ----------
        kind: str
            Name of the fingerprinting technique used
        length: int
            Length of the fingerprint to use
        fpfun: function
            function to call to compute the fingerprint
    """
    # Each entry takes (mol, params) so that all kinds share one call
    # signature, even those that ignore ``params``.
    MAPPING = OrderedDict(
        # global_properties=lambda x, params: augmented_mol_properties(x),
        # physiochemical=lambda x: GetBPFingerprint(x),
        atom_pair=lambda x, params: GetHashedAtomPairFingerprintAsBitVect(
            x, **params),
        topological_torsion=lambda x, params:
        GetHashedTopologicalTorsionFingerprintAsBitVect(x, **params),
        ecfp2=lambda x, params: GetMorganFingerprintAsBitVect(x, 1, **params),
        ecfp4=lambda x, params: GetMorganFingerprintAsBitVect(x, 2, **params),
        ecfp6=lambda x, params: GetMorganFingerprintAsBitVect(x, 3, **params),
        estate=lambda x, params: FingerprintMol(x)[0],
        avalon_bit=lambda x, params: GetAvalonFP(x, **params),
        avalon_count=lambda x, params: GetAvalonCountFP(x, **params),
        erg=lambda x, params: GetErGFingerprint(x),
        rdkit=lambda x, params: RDKFingerprint(x, **params),
        maccs=lambda x, params: GetMACCSKeysFingerprint(x))

    def __init__(self, kind='ecfp2', length=4096):
        super(FingerprintsTransformer, self).__init__()
        if not (isinstance(kind, str) and
                (kind in FingerprintsTransformer.MAPPING)):
            raise ValueError("Argument kind must be in: " +
                             ', '.join(FingerprintsTransformer.MAPPING.keys()))
        self.kind = kind
        self.length = length
        # Looked up through ``self`` so a subclass may provide its own
        # MAPPING; the guard below covers a subclass that drops a key.
        self.fpfun = self.MAPPING.get(kind, None)
        if not self.fpfun:
            raise ValueError("Fingerprint {} is not offered".format(kind))
        # The size keyword is named differently for the 'rdkit' fingerprint.
        self._params = {
            ('fpSize' if kind == 'rdkit' else 'nBits'): length
        }

    def _transform(self, mol):
        r"""
        Transforms a molecule into a fingerprint vector
        :raises ValueError: when the input molecule is None

        Arguments
        ----------
            mol: rdkit.Chem.Mol
                Molecule of interest

        Returns
        -------
            fp: array-like
                The computed fingerprint: the output of
                ``explicit_bit_vect_to_array`` when the fingerprint function
                returns an ``ExplicitBitVect``, a plain list otherwise.

        """

        if mol is None:
            raise ValueError("Expecting a Chem.Mol object, got None")
        # expect cryptic rdkit errors here if this fails, #rdkitdev
        fp = self.fpfun(mol, self._params)
        if isinstance(fp, ExplicitBitVect):
            fp = explicit_bit_vect_to_array(fp)
        else:
            fp = list(fp)
        return fp

    def transform(self, mols, **kwargs):
        r"""
        Transforms a batch of molecules into fingerprint vectors.

        .. note::
            The recommended way is to use the object as a callable.

        Arguments
        ----------
            mols: (str or rdkit.Chem.Mol) iterable
                List of SMILES or molecules
            kwargs: named parameters for transform (accepted but not used
                by this implementation)

        Returns
        -------
            fp: array
                computed fingerprints of size NxD, where D is the
                requested length of features and N is the number of input
                molecules that have been successfully featurized.

        See Also
        --------
            :func:`~ivbase.transformers.features.MoleculeTransformer.transform`

        """
        mol_list = [self.to_mol(mol, addHs=False) for mol in mols]
        # Drop entries for which ``to_mol`` produced None. An explicit
        # identity test is used because ``None.__ne__(x)`` returns
        # NotImplemented, whose truthiness is deprecated (Python 3.9) and
        # raises TypeError on Python 3.14.
        mol_list = [mol for mol in mol_list if mol is not None]
        features = np.array([self._transform(mol)
                             for mol in mol_list]).astype(np.float32)
        features = totensor(features, gpu=False)

        return features

    def __call__(self, mols, dtype=torch.long, cuda=False, **kwargs):
        r"""
        Transforms a batch of molecules into fingerprint vectors,
        and return the transformation in the desired data type format as well as
        the set of valid indexes.

        Arguments
        ----------
            mols: (str or rdkit.Chem.Mol) iterable
                The list of input smiles or molecules
            dtype: torch.dtype or numpy.dtype, optional
                Datatype of the transformed variable.
                A numpy array is returned when ``is_dtype_numpy_array(dtype)``
                holds, a tensor when ``is_dtype_torch_tensor(dtype)`` holds;
                any other value raises ``TypeError``.
                (Default value = torch.long)
            cuda: bool, optional
                Whether to transfer tensor on the GPU (if output is a tensor)
            kwargs: named parameters forwarded to the parent ``__call__``

        Returns
        -------
            fp: array
                computed fingerprints (in `dtype` datatype) of size NxD,
                where D is the requested length of features and N is the number
                of input molecules that have been successfully featurized.
            ids: array
                all valid molecule positions that did not fail during featurization

        Raises
        ------
            TypeError
                when ``dtype`` is neither a numpy nor a torch dtype

        See Also
        --------
            :func:`~ivbase.transformers.features.FingerprintsTransformer.transform`

        """
        fp, ids = super(FingerprintsTransformer, self).__call__(mols, **kwargs)
        if is_dtype_numpy_array(dtype):
            fp = np.array(fp, dtype=dtype)
        elif is_dtype_torch_tensor(dtype):
            fp = totensor(fp, gpu=cuda, dtype=dtype)
        else:
            raise TypeError('The type {} is not supported'.format(dtype))
        return fp, ids
Exemple #6
0
 def _transform_mol(self, mol):
     """Return the MACCS keys fingerprint of ``mol`` as an array, first bit dropped."""
     bits = list(GetMACCSKeysFingerprint(mol))
     fingerprint = np.array(bits)
     # Skip the element at index 0.
     return fingerprint[1:]
Exemple #7
0
def maccs_keys_fingerprint(
    df: pd.DataFrame, mols_column_name: Hashable
) -> pd.DataFrame:
    """
    Convert a column of RDKIT mol objects into MACCS Keys Fingerprints.

    Returns a new dataframe without any of the original data.
    This is intentional to leave the user with the data requested.

    This method does not mutate the original DataFrame.

    An empty input column yields an empty dataframe (zero rows, one column
    per MACCS bit) instead of raising.

    Functional usage example:

    .. code-block:: python

        import pandas as pd
        import janitor.chemistry

        df = pd.DataFrame(...)

        maccs = janitor.chemistry.maccs_keys_fingerprint(
            df=df.smiles2mol('smiles', 'mols'),
            mols_column_name='mols'
        )

    Method chaining usage example:

    .. code-block:: python

        import pandas as pd
        import janitor.chemistry

        df = pd.DataFrame(...)

        maccs = (
            df.smiles2mol('smiles', 'mols')
              .maccs_keys_fingerprint(mols_column_name='mols')
        )

    If you wish to join the maccs keys fingerprints back into the
    original dataframe, this can be accomplished by doing a `join`,
    because the indices are preserved:

    .. code-block:: python

        joined = df.join(maccs)


    :param df: A pandas DataFrame.
    :param mols_column_name: The name of the column that has the RDKIT mol
        objects.
    :returns: A new pandas DataFrame of MACCS keys fingerprints, one row
        per molecule, indexed like ``df``.
    """
    # Width of RDKit's MACCS keys bit vector (166 public keys; bit 0 unused).
    n_maccs_bits = 167

    rows = []
    for mol in df[mols_column_name]:
        bit_vect = GetMACCSKeysFingerprint(mol)
        # ConvertToNumpyArray resizes the destination array to the length
        # of the bit vector, so a one-element placeholder is sufficient.
        arr = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(bit_vect, arr)
        rows.append(arr)

    if rows:
        np_maccs = np.vstack(rows)
    else:
        # np.vstack refuses an empty sequence; build the empty matrix
        # explicitly so callers still get a well-formed frame.
        np_maccs = np.zeros((0, n_maccs_bits))

    fmaccs = pd.DataFrame(np_maccs)
    fmaccs.index = df.index
    return fmaccs
class FingerprintsTransformer(MoleculeTransformer):
    """Turn molecules into molecular fingerprints.

    Parameters
    ----------
    kind : {'atom_pair', 'topological_torsion', 'morgan_circular', 'estate',
        'avalon_bit', 'avalon_count', 'erg', 'rdkit', 'maccs'}, optional, default='morgan_circular'
        Name of the fingerprinting technique used
    length: int, optional, default=2000
        Length of the fingerprint to use. Forwarded to the fingerprint
        function as ``fpSize`` for 'rdkit' and as ``nBits`` otherwise; the
        'estate', 'erg' and 'maccs' entries do not forward it.

    Attributes
    ----------
    kind : str
        Name of the fingerprinting technique used
    length : int
        Length of the fingerprint to use
    fpfun : function
        function to call to compute the fingerprint
    """
    # Every entry is called as fn(mol, params); some ignore ``params``.
    mapping = OrderedDict(
        # physiochemical=lambda x: GetBPFingerprint(x),
        atom_pair=lambda x, params: GetHashedAtomPairFingerprintAsBitVect(
            x, **params),
        topological_torsion=lambda x, params:
        GetHashedTopologicalTorsionFingerprintAsBitVect(x, **params),
        morgan_circular=lambda x, params: GetMorganFingerprintAsBitVect(
            x, 2, **params),
        estate=lambda x, params: FingerprintMol(x)[0],
        avalon_bit=lambda x, params: GetAvalonFP(x, **params),
        avalon_count=lambda x, params: GetAvalonCountFP(x, **params),
        erg=lambda x, params: GetErGFingerprint(x),
        rdkit=lambda x, params: RDKFingerprint(x, **params),
        maccs=lambda x, params: GetMACCSKeysFingerprint(x))

    def __init__(self, kind='morgan_circular', length=2000):
        super(FingerprintsTransformer, self).__init__()
        known_kinds = FingerprintsTransformer.mapping
        if not isinstance(kind, str) or kind not in known_kinds:
            raise ValueError("Argument kind must be in: " +
                             ', '.join(known_kinds.keys()))
        self.kind = kind
        self.length = length
        self.fpfun = self.mapping.get(kind)
        if not self.fpfun:
            raise ValueError("Fingerprint {} is not offered".format(kind))
        # The size keyword is named differently for the 'rdkit' fingerprint.
        size_keyword = 'fpSize' if kind == 'rdkit' else 'nBits'
        self._params = {size_keyword: length}

    def _transform(self, mol):
        """Compute the fingerprint vector of a single molecule.

        A ``None`` molecule triggers a warning and is mapped to a zero
        vector of ``self.length`` elements.

        Parameters
        ----------
        mol: rdkit.Chem.Mol or None
            The molecule of interest

        Returns
        -------
        fp : np.ndarray
            The computed fingerprint
        """
        if mol is None:
            warnings.warn("None value received for argument mol")
            raw = np.zeros(self.length)
        else:
            raw = self.fpfun(mol, self._params)

        if isinstance(raw, ExplicitBitVect):
            return explicit_bit_vect_to_array(raw)
        return np.array(list(raw))

    def transform(self, mols):
        """Compute the fingerprint vectors of a batch of molecules.

        Delegates to the parent ``transform`` with ``as_numpy=True`` and
        wraps the result in a numpy array.

        Parameters
        ----------
        mols: (str or rdkit.Chem.Mol) list
            The list of smiles or molecule

        Returns
        -------
        fp : np.ndarray
            The computed fingerprint vectors
        """
        parent = super(FingerprintsTransformer, self)
        fingerprints = parent.transform(mols, as_numpy=True)
        return np.array(fingerprints)