コード例 #1
0
def getAtomPairFingerPrintFromSmile(smile):
    """Return the hashed atom-pair fingerprint of a SMILES string as a bit string.

    The SMILES is parsed with ``Chem.MolFromSmiles``, fingerprinted with
    ``GetHashedAtomPairFingerprintAsBitVect`` (default size) and rendered
    with ``ToBitString()``.

    Args:
        smile: SMILES text describing the molecule.

    Returns:
        The value of ``ToBitString()`` on the computed bit vector.

    Raises:
        Exception: with the message "Not able to Generate FigerPrint" when
            parsing or fingerprinting raises any ``Exception``.
    """
    try:
        molecule = Chem.MolFromSmiles(smile)
        atomPairFP = GetHashedAtomPairFingerprintAsBitVect(molecule)
        return atomPairFP.ToBitString()
    except Exception:
        # Only ordinary errors are translated; KeyboardInterrupt and
        # SystemExit must keep propagating instead of being masked.
        # The message text is kept verbatim because callers may match on it.
        raise Exception("Not able to Generate FigerPrint")
コード例 #2
0
def get_fp(mols):
    """Fingerprint every truthy molecule using the type named by ``args.fpType``.

    The fingerprint family is read from the module-level ``args.fpType``.
    Recognised names are 'ECFP4', 'ECFP6', 'ECFP12' (Morgan bit vectors with
    radius 2, 3 and 6), 'MACCS', 'Daylight' and 'AP' (hashed atom pairs with
    ``nBits=4096``).

    Args:
        mols: Iterable of molecules; falsy entries (e.g. ``None``) are skipped.

    Returns:
        list: One fingerprint per kept molecule, in input order. The list is
        empty when ``args.fpType`` matches none of the recognised names.
    """
    # Each builder is a lambda so the RDKit names are only resolved for the
    # fingerprint type that is actually selected.
    builders = (
        ('ECFP4', lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 2)),
        ('ECFP6', lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 3)),
        ('ECFP12', lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 6)),
        ('MACCS', lambda mol: Chem.MACCSkeys.GenMACCSKeys(mol)),
        ('Daylight', lambda mol: FingerprintMols.FingerprintMol(mol)),
        ('AP', lambda mol: GetHashedAtomPairFingerprintAsBitVect(
            mol, nBits=4096)),
    )
    fps = []
    for name, build in builders:
        if args.fpType == name:
            fps.extend(build(mol) for mol in mols if mol)
    return fps
コード例 #3
0
ファイル: chemstruct.py プロジェクト: vladchimescu/chemgen
def get_fingerprints(smiles_df, r=2, length=512,
                     type_='morgan'):
    """Compute one fingerprint per molecule and tabulate them by drug name.

    Args:
        smiles_df: Table addressed by column name. ``smiles_df['mol']`` is
            iterated to obtain the molecules and ``smiles_df['drug'].values``
            supplies the matching drug names.
        r: Radius handed to the Morgan fingerprint; only used by the
            'morgan' and 'fcpf' types.
        length: Fingerprint size, passed as ``nBits`` (``fpSize`` for 'rdkit').
        type_: Fingerprint family. One of 'morgan', 'fcpf' (Morgan with
            ``useFeatures=True``), 'atom pair', 'avalon', 'torsion', 'rdkit'.

    Returns:
        The result of ``fp_to_pandas(fp=..., drug_names=...)``.

    Raises:
        ValueError: If ``type_`` is not one of the supported names. The check
            happens before ``smiles_df`` is touched.
    """
    if type_ == 'morgan':
        fp = [AllChem.GetMorganFingerprintAsBitVect(m, r, nBits=length)
              for m in smiles_df['mol']]
    elif type_ == 'fcpf':
        fp = [AllChem.GetMorganFingerprintAsBitVect(m, r,
                                                    useFeatures=True,
                                                    nBits=length)
              for m in smiles_df['mol']]
    elif type_ == 'atom pair':
        fp = [GetHashedAtomPairFingerprintAsBitVect(m, nBits=length)
              for m in smiles_df['mol']]
    elif type_ == 'avalon':
        fp = [GetAvalonFP(m, nBits=length) for m in smiles_df['mol']]
    elif type_ == 'torsion':
        fp = [GetHashedTopologicalTorsionFingerprintAsBitVect(m, nBits=length)
              for m in smiles_df['mol']]
    elif type_ == 'rdkit':
        fp = [RDKFingerprint(m, fpSize=length) for m in smiles_df['mol']]
    else:
        # The message must list the values the branches above really accept
        # ('torsion', not the previously advertised 'torision').
        raise ValueError(
            "Unknown fingerprint type {!r}. Possible values: morgan, fcpf, "
            "atom pair, avalon, torsion and rdkit".format(type_))

    drug_names = smiles_df['drug'].values
    return fp_to_pandas(fp=fp, drug_names=drug_names)
コード例 #4
0
ファイル: fingerprints.py プロジェクト: yccai/scikit-chem
    def _transform_mol(self, mol):
        """Compute the atom-pair fingerprint of a single molecule.

        This is the private per-molecule worker; use `transform` for the
        public method, which accepts iterables of molecules.

        Three output forms are produced, depending on the configuration:

        * ``as_bits`` set and ``n_feats > 0``: hashed bit vector converted to
          a ``np.uint8`` array.
        * ``n_feats <= 0``: unhashed fingerprint as a dict of its non-zero
          elements (values reduced to 0/1 when ``as_bits`` is set).
        * otherwise: hashed count fingerprint as an array.

        Args:
            mol (skchem.Mol): Molecule to calculate fingerprint for.

        Returns:
            np.array or dict:
                Fingerprint as an array (or a dict if sparse).
        """
        # Keyword arguments shared by all three RDKit calls below.
        options = dict(
            nBits=self.n_feats,
            minLength=self.min_length,
            maxLength=self.max_length,
            includeChirality=self.use_chirality)

        if self.as_bits and self.n_feats > 0:
            bit_vect = GetHashedAtomPairFingerprintAsBitVect(mol, **options)
            # ConvertToNumpyArray fills the array passed to it in place.
            dense = np.array(0)
            ConvertToNumpyArray(bit_vect, dense)
            return dense.astype(np.uint8)

        if self.n_feats <= 0:
            sparse = GetAtomPairFingerprint(mol, **options)
            counts = sparse.GetNonzeroElements()
            if self.as_bits:
                counts = {key: int(value > 0)
                          for key, value in counts.items()}
            return counts

        hashed_counts = GetHashedAtomPairFingerprint(mol, **options)
        return np.array(list(hashed_counts))
コード例 #5
0
def get_fp(mols):
    """Fingerprint every truthy molecule using the type named by ``args.fpType``.

    The fingerprint family is read from the module-level ``args.fpType``.
    Recognised names are 'ECFP4', 'ECFP6', 'ECFP12' (Morgan bit vectors with
    radius 2, 3 and 6), 'MACCS', 'simple' (``MUVDescriptors`` descriptors),
    'Daylight' and 'AP' (hashed atom pairs with ``nBits=4096``).

    Args:
        mols: Iterable of molecules; falsy entries (e.g. ``None``) are skipped.

    Returns:
        list: One fingerprint per kept molecule, in input order. The list is
        empty when ``args.fpType`` matches none of the recognised names.
    """
    def collect(build):
        # Apply one fingerprint builder to every usable molecule.
        return [build(mol) for mol in mols if mol]

    kind = args.fpType
    fps = []
    if kind == 'ECFP4':
        fps += collect(
            lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 2))
    if kind == 'ECFP6':
        fps += collect(
            lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 3))
    if kind == 'ECFP12':
        fps += collect(
            lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 6))
    if kind == 'MACCS':
        fps += collect(lambda mol: Chem.MACCSkeys.GenMACCSKeys(mol))
    if kind == 'simple':
        # A single describer instance is shared by all molecules.
        describer = MUVDescriptors()
        fps += collect(lambda mol: describer.calculate_descriptors(mol))
    if kind == 'Daylight':
        fps += collect(lambda mol: FingerprintMols.FingerprintMol(mol))
    if kind == 'AP':
        fps += collect(
            lambda mol: GetHashedAtomPairFingerprintAsBitVect(mol,
                                                              nBits=4096))
    return fps
コード例 #6
0
ファイル: RDKFP.py プロジェクト: hachmannlab/chemml
 def _hap(self, molecules):
     if self.vector == 'int':
         from rdkit.Chem.AtomPairs.Pairs import GetHashedAtomPairFingerprint
         self.fps_ = [
             GetHashedAtomPairFingerprint(self._sanitary(m),
                                          nBits=self.n_bits,
                                          **self.kwargs) for m in molecules
         ]
         # get nonzero elements as a dictionary for each molecule
         dict_nonzero = [fp.GetNonzeroElements() for fp in self.fps_]
         data = pd.DataFrame(dict_nonzero)
         data.fillna(0, inplace=True)
         return data
     elif self.vector == 'bit':
         from rdkit.Chem.rdMolDescriptors import GetHashedAtomPairFingerprintAsBitVect
         self.fps_ = [
             GetHashedAtomPairFingerprintAsBitVect(self._sanitary(m),
                                                   nBits=self.n_bits,
                                                   **self.kwargs)
             for m in molecules
         ]
         data = np.array(self.fps_)
         data = pd.DataFrame(data)
         return data
コード例 #7
0
class FingerprintsTransformer(MoleculeTransformer):
    r"""
    Fingerprint molecule transformer.
    This transformer is able to compute various fingerprints regularly used in QSAR modeling.

    Arguments
    ----------
        kind: str, optional
            Name of the fingerprinting method used. Should be one of the
            keys of ``MAPPING``:
            {'atom_pair', 'topological_torsion', 'ecfp2', 'ecfp4', 'ecfp6',
            'estate', 'avalon_bit', 'avalon_count', 'erg', 'rdkit', 'maccs'}
            (Default value = 'ecfp2')
        length: int, optional
            Length of the fingerprint to use. It is forwarded as ``nBits``
            (``fpSize`` for 'rdkit'); 'estate', 'erg' and 'maccs' ignore it.
            (Default value = 4096)

    Attributes
    ----------
        kind: str
            Name of the fingerprinting technique used
        length: int
            Length of the fingerprint to use
        fpfun: function
            function to call to compute the fingerprint
    """
    # Every entry takes (molecule, params); params carries the size keyword
    # built in __init__ and is ignored by fingerprints without a size option.
    MAPPING = OrderedDict(
        # global_properties=lambda x, params: augmented_mol_properties(x),
        # physiochemical=lambda x: GetBPFingerprint(x),
        atom_pair=lambda x, params: GetHashedAtomPairFingerprintAsBitVect(
            x, **params),
        topological_torsion=lambda x, params:
        GetHashedTopologicalTorsionFingerprintAsBitVect(x, **params),
        ecfp2=lambda x, params: GetMorganFingerprintAsBitVect(x, 1, **params),
        ecfp4=lambda x, params: GetMorganFingerprintAsBitVect(x, 2, **params),
        ecfp6=lambda x, params: GetMorganFingerprintAsBitVect(x, 3, **params),
        estate=lambda x, params: FingerprintMol(x)[0],
        avalon_bit=lambda x, params: GetAvalonFP(x, **params),
        avalon_count=lambda x, params: GetAvalonCountFP(x, **params),
        erg=lambda x, params: GetErGFingerprint(x),
        rdkit=lambda x, params: RDKFingerprint(x, **params),
        maccs=lambda x, params: GetMACCSKeysFingerprint(x))

    def __init__(self, kind='ecfp2', length=4096):
        r"""
        Select the fingerprint function and remember the requested length.

        :raises ValueError: when ``kind`` is not a string key of ``MAPPING``
        """
        super(FingerprintsTransformer, self).__init__()
        if not (isinstance(kind, str) and
                (kind in FingerprintsTransformer.MAPPING.keys())):
            raise ValueError("Argument kind must be in: " +
                             ', '.join(FingerprintsTransformer.MAPPING.keys()))
        self.kind = kind
        self.length = length
        self.fpfun = self.MAPPING.get(kind, None)
        if not self.fpfun:
            raise ValueError("Fingerprint {} is not offered".format(kind))
        # RDKFingerprint names its size argument 'fpSize'; the other sized
        # fingerprints in MAPPING take 'nBits'.
        self._params = {}
        self._params.update({
            ('fpSize' if kind == 'rdkit' else 'nBits'): length
        })

    def _transform(self, mol):
        r"""
        Transforms a molecule into a fingerprint vector
        :raises ValueError: when the input molecule is None

        Arguments
        ----------
            mol: rdkit.Chem.Mol
                Molecule of interest

        Returns
        -------
            fp: np.ndarray or list
                The computed fingerprint: the output of
                ``explicit_bit_vect_to_array`` for an ``ExplicitBitVect``,
                otherwise a plain list of the fingerprint's elements.

        """

        if mol is None:
            raise ValueError("Expecting a Chem.Mol object, got None")
        # expect cryptic rdkit errors here if this fails, #rdkitdev
        fp = self.fpfun(mol, self._params)
        if isinstance(fp, ExplicitBitVect):
            fp = explicit_bit_vect_to_array(fp)
        else:
            fp = list(fp)
        return fp

    def transform(self, mols, **kwargs):
        r"""
        Transforms a batch of molecules into fingerprint vectors.

        .. note::
            The recommended way is to use the object as a callable.

        Arguments
        ----------
            mols: (str or rdkit.Chem.Mol) iterable
                List of SMILES or molecules
            kwargs: named parameters for transform (currently unused here)

        Returns
        -------
            fp: array
                computed fingerprints of size NxD, where D is the
                requested length of features and N is the number of input
                molecules that have been successfully featurized.

        See Also
        --------
            :func:`~ivbase.transformers.features.MoleculeTransformer.transform`

        """
        mol_list = [self.to_mol(mol, addHs=False) for mol in mols]
        # Drop molecules that could not be built. An explicit identity test
        # is used because filter(None.__ne__, ...) relies on the truthiness
        # of NotImplemented, which is deprecated since Python 3.9 and a
        # TypeError on 3.14.
        mol_list = [mol for mol in mol_list if mol is not None]
        features = np.array([self._transform(mol)
                             for mol in mol_list]).astype(np.float32)
        features = totensor(features, gpu=False)

        return features

    def __call__(self, mols, dtype=torch.long, cuda=False, **kwargs):
        r"""
        Transforms a batch of molecules into fingerprint vectors,
        and return the transformation in the desired data type format as well as
        the set of valid indexes.

        Arguments
        ----------
            mols: (str or rdkit.Chem.Mol) iterable
                The list of input smiles or molecules
            dtype: torch.dtype or numpy.dtype, optional
                Datatype of the transformed variable.
                A numpy dtype yields a numpy array and a torch dtype yields a
                tensor; anything else is rejected with a TypeError.
                (Default value = torch.long)
            cuda: bool, optional
                Whether to transfer tensor on the GPU (if output is a tensor)
            kwargs: named parameters forwarded to the parent ``__call__``

        Returns
        -------
            fp: array
                computed fingerprints (in `dtype` datatype) of size NxD,
                where D is the requested length of features and N is the number
                of input molecules that have been successfully featurized.
            ids: array
                all valid molecule positions that did not fail during featurization

        :raises TypeError: when ``dtype`` is neither a numpy nor a torch dtype

        See Also
        --------
            :func:`~ivbase.transformers.features.FingerprintsTransformer.transform`

        """
        fp, ids = super(FingerprintsTransformer, self).__call__(mols, **kwargs)
        if is_dtype_numpy_array(dtype):
            fp = np.array(fp, dtype=dtype)
        elif is_dtype_torch_tensor(dtype):
            fp = totensor(fp, gpu=cuda, dtype=dtype)
        else:
            raise (TypeError('The type {} is not supported'.format(dtype)))
        return fp, ids
コード例 #8
0
def convert_to_atompair(SMILES):
    """Return the atom-pair fingerprint of a SMILES string as a list of floats.

    The SMILES is turned into a molecule with ``MS``, fingerprinted with
    ``AtomPair`` and rendered through ``ToBitString()``; each character of
    that string becomes one float in the result.

    NOTE(review): ``MS`` and ``AtomPair`` are aliases bound elsewhere in the
    file - presumably RDKit's SMILES parser and an atom-pair bit-vector
    function; confirm against the imports.
    """
    bit_string = AtomPair(MS(SMILES)).ToBitString()
    bit_values = np.asarray(list(bit_string), dtype=float)
    return list(bit_values)
コード例 #9
0
    def Fingerprint(self):
        """Build a fingerprint table for ``self.molecules``.

        ``self.FPtype`` selects the fingerprint family and ``self.vector``
        selects the encoding ('int' for integer vectors, 'bit' for bit
        vectors). The raw RDKit fingerprint objects are stored on
        ``self.fps`` and a table with one row per molecule is returned.

        Supported ``FPtype`` values:
            * 'Hashed_atom_pair' / 'HAP' (int, bit) - uses ``self.nBits``
            * 'Atom_pair' / 'AP' (int, bit)
            * 'MACCS' (bit only)
            * 'Morgan' (int, bit) - uses ``self.radius``; 'bit' also uses
              ``self.nBits``
            * 'Hashed_topological_torsion' / 'HTT' (int, bit) - uses
              ``self.nBits``
            * 'Topological_torsion' / 'TT' (int only)

        Returns:
            pandas.DataFrame: Fingerprint values, one row per molecule.
            Hashed 'int' tables have columns ``range(self.nBits)``; unhashed
            tables have one column per element that is non-zero in at least
            one molecule; the other 'bit' tables are built directly from
            ``np.array(self.fps)``. Missing entries are filled with 0.

        Raises:
            ValueError: If ``FPtype`` is unknown, ``vector`` is unknown, or
                the requested combination has no RDKit implementation.
        """

        def hashed_count_frame(fps):
            # Fixed-width table: one column per hashed position.
            nonzero = [fp.GetNonzeroElements() for fp in fps]
            frame = pd.DataFrame(nonzero, columns=range(self.nBits))
            frame.fillna(0, inplace=True)
            return frame

        def sparse_frame(nonzero):
            # Variable-width table: one column per key seen in any molecule.
            seen_keys = []
            for entry in nonzero:
                seen_keys += list(entry)
            frame = pd.DataFrame(nonzero, columns=list(set(seen_keys)))
            frame.fillna(0, inplace=True)
            return frame

        def dense_frame(fps):
            # Table built directly from the array form of the fingerprints.
            return pd.DataFrame(np.array(fps))

        if self.FPtype == 'Hashed_atom_pair' or self.FPtype == 'HAP':
            if self.vector == 'int':
                from rdkit.Chem.AtomPairs.Pairs import GetHashedAtomPairFingerprint
                self.fps = [
                    GetHashedAtomPairFingerprint(m, nBits=self.nBits)
                    for m in self.molecules
                ]
                return hashed_count_frame(self.fps)
            elif self.vector == 'bit':
                from rdkit.Chem.rdMolDescriptors import GetHashedAtomPairFingerprintAsBitVect
                self.fps = [
                    GetHashedAtomPairFingerprintAsBitVect(m, nBits=self.nBits)
                    for m in self.molecules
                ]
                return dense_frame(self.fps)
            else:
                msg = "The argument vector can be 'int' or 'bit'"
                raise ValueError(msg)
        elif self.FPtype == 'Atom_pair' or self.FPtype == 'AP':
            if self.vector == 'int':
                from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprint
                self.fps = [GetAtomPairFingerprint(m) for m in self.molecules]
                return sparse_frame(
                    [fp.GetNonzeroElements() for fp in self.fps])
            elif self.vector == 'bit':
                from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprintAsBitVect
                self.fps = [
                    GetAtomPairFingerprintAsBitVect(m) for m in self.molecules
                ]

                # Diagnostic output only: column count before and after
                # removing columns whose mean is zero. The call form of
                # print works on Python 2 and 3, and the drop result is
                # assigned so the second count reflects the removal.
                data = dense_frame(self.fps)
                print(len(data.columns))
                d_des = data.describe()
                all_zero = [i for i in data.columns if d_des[i]['mean'] == 0]
                data = data.drop(all_zero, axis=1)
                print(len(data.columns))

                # The returned table is rebuilt from the set bits alone.
                nonzero = [
                    {i: el for i, el in enumerate(fp) if el != 0}
                    for fp in self.fps
                ]
                return sparse_frame(nonzero)
            else:
                msg = "The argument vector can be 'int' or 'bit'"
                raise ValueError(msg)
        elif self.FPtype == 'MACCS':
            if self.vector == 'int':
                msg = "There is no RDKit function to encode int vectors for MACCS keys"
                raise ValueError(msg)
            elif self.vector == 'bit':
                from rdkit.Chem.MACCSkeys import GenMACCSKeys
                self.fps = [GenMACCSKeys(mol) for mol in self.molecules]
                return dense_frame(self.fps)
            else:
                msg = "The vector argument can only be 'int' or 'bit'"
                raise ValueError(msg)
        elif self.FPtype == 'Morgan':
            if self.vector == 'int':
                from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint
                self.fps = [
                    GetMorganFingerprint(mol, self.radius)
                    for mol in self.molecules
                ]
                return sparse_frame(
                    [fp.GetNonzeroElements() for fp in self.fps])
            elif self.vector == 'bit':
                from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
                self.fps = [
                    GetMorganFingerprintAsBitVect(mol,
                                                  self.radius,
                                                  nBits=self.nBits)
                    for mol in self.molecules
                ]
                return dense_frame(self.fps)
            else:
                msg = "The argument vector can only be 'int' or 'bit'"
                raise ValueError(msg)
        elif self.FPtype == 'Hashed_topological_torsion' or self.FPtype == 'HTT':
            if self.vector == 'int':
                from rdkit.Chem.rdMolDescriptors import GetHashedTopologicalTorsionFingerprint
                self.fps = [
                    GetHashedTopologicalTorsionFingerprint(m, nBits=self.nBits)
                    for m in self.molecules
                ]
                return hashed_count_frame(self.fps)
            elif self.vector == 'bit':
                from rdkit.Chem.rdMolDescriptors import GetHashedTopologicalTorsionFingerprintAsBitVect
                self.fps = [
                    GetHashedTopologicalTorsionFingerprintAsBitVect(
                        m, nBits=self.nBits) for m in self.molecules
                ]
                return dense_frame(self.fps)
            else:
                msg = "The argument vector can be 'int' or 'bit'"
                raise ValueError(msg)
        elif self.FPtype == 'Topological_torsion' or self.FPtype == 'TT':
            if self.vector == 'int':
                from rdkit.Chem.AtomPairs.Torsions import GetTopologicalTorsionFingerprintAsIntVect
                self.fps = [
                    GetTopologicalTorsionFingerprintAsIntVect(mol)
                    for mol in self.molecules
                ]
                return sparse_frame(
                    [fp.GetNonzeroElements() for fp in self.fps])
            elif self.vector == 'bit':
                msg = "There is no RDKit function to encode bit vectors for Topological Torsion Fingerprints"
                raise ValueError(msg)
            else:
                msg = "The argument vector can only be 'int'"
                raise ValueError(msg)
        else:
            msg = "The type argument '%s' is not a valid fingerprint type" % self.FPtype
            raise ValueError(msg)
コード例 #10
0
class FingerprintsTransformer(MoleculeTransformer):
    """Molecule transformer producing molecular fingerprints.

    Parameters
    ----------
    kind : str, optional, default='morgan_circular'
        Name of the fingerprinting technique used; must be a key of
        ``mapping``: {'atom_pair', 'topological_torsion', 'morgan_circular',
        'estate', 'avalon_bit', 'avalon_count', 'erg', 'rdkit', 'maccs'}
    length : int, optional, default=2000
        Length of the fingerprint to use. Forwarded as ``nBits`` (``fpSize``
        for 'rdkit'); 'estate', 'erg' and 'maccs' do not receive it.

    Attributes
    ----------
    kind : str
        Name of the fingerprinting technique used
    length : int
        Length of the fingerprint to use
    fpfun : function
        function to call to compute the fingerprint
    """
    # Every entry is called as fpfun(molecule, size_options).
    mapping = OrderedDict(
        atom_pair=lambda mol, opts: GetHashedAtomPairFingerprintAsBitVect(
            mol, **opts),
        topological_torsion=lambda mol, opts:
        GetHashedTopologicalTorsionFingerprintAsBitVect(mol, **opts),
        morgan_circular=lambda mol, opts: GetMorganFingerprintAsBitVect(
            mol, 2, **opts),
        estate=lambda mol, opts: FingerprintMol(mol)[0],
        avalon_bit=lambda mol, opts: GetAvalonFP(mol, **opts),
        avalon_count=lambda mol, opts: GetAvalonCountFP(mol, **opts),
        erg=lambda mol, opts: GetErGFingerprint(mol),
        rdkit=lambda mol, opts: RDKFingerprint(mol, **opts),
        maccs=lambda mol, opts: GetMACCSKeysFingerprint(mol))

    def __init__(self, kind='morgan_circular', length=2000):
        """Pick the fingerprint function for ``kind`` and store ``length``.

        Raises
        ------
        ValueError
            If ``kind`` is not a string key of ``mapping``.
        """
        super(FingerprintsTransformer, self).__init__()
        known = FingerprintsTransformer.mapping
        if not isinstance(kind, str) or kind not in known:
            raise ValueError("Argument kind must be in: " +
                             ', '.join(known.keys()))
        self.kind = kind
        self.length = length
        self.fpfun = self.mapping.get(kind, None)
        if not self.fpfun:
            raise ValueError("Fingerprint {} is not offered".format(kind))
        # RDKFingerprint calls its size argument 'fpSize'; the rest use 'nBits'.
        size_keyword = 'fpSize' if kind == 'rdkit' else 'nBits'
        self._params = {size_keyword: length}

    def _transform(self, mol):
        """Transform one molecule into a fingerprint vector.

        Parameters
        ----------
        mol: rdkit.Chem.Mol or None
            The molecule of interest. ``None`` triggers a warning and yields
            an all-zero vector of size ``length``.

        Returns
        -------
        fp : np.ndarray
            The computed fingerprint
        """
        if mol is not None:
            raw = self.fpfun(mol, self._params)
        else:
            warnings.warn("None value received for argument mol")
            raw = np.zeros(self.length)

        if isinstance(raw, ExplicitBitVect):
            return explicit_bit_vect_to_array(raw)
        return np.array(list(raw))

    def transform(self, mols):
        """Transform a batch of molecules into fingerprint vectors.

        Parameters
        ----------
        mols: (str or rdkit.Chem.Mol) list
            The list of smiles or molecules, handed to the parent
            ``transform`` with ``as_numpy=True``.

        Returns
        -------
        fp : 2d np.ndarray
            The computed fingerprint vectors
        """
        batch = super(FingerprintsTransformer, self).transform(
            mols, as_numpy=True)
        return np.array(batch)