Example #1
0
    def get_fingerprint(self, mol):
        """
        Gets the fingerprint according to the chosen fingerprint type

        ``self.fp_type == 'rdkit'`` uses ``RDKFingerprint(mol)`` and
        ``self.fp_type == 'morgan'`` uses
        ``GetMorganFingerprintAsBitVect(mol, 2)``.

        :param mol: molecule handed to the selected fingerprint function
        :return: 1-D numpy array holding one integer (0 or 1) per character
            of the fingerprint's bit string
        :raises ValueError: if ``self.fp_type`` is neither ``'rdkit'`` nor
            ``'morgan'``
        """
        if self.fp_type == 'rdkit':
            fp = RDKFingerprint(mol)
        elif self.fp_type == 'morgan':
            fp = GetMorganFingerprintAsBitVect(mol, 2)
        else:
            # Without this branch an unknown type reached the return statement
            # with `fp` unbound and failed with an UnboundLocalError.
            raise ValueError(
                "Unsupported fingerprint type {!r}; expected 'rdkit' or "
                "'morgan'".format(self.fp_type))

        return np.array(list(map(int, fp.ToBitString())))
Example #2
0
def get_fingerprints(smiles_df, r=2, length=512,
                     type_='morgan'):
    """Compute one fingerprint per molecule in ``smiles_df['mol']``.

    Parameters
    ----------
    smiles_df : mapping / data frame
        Must provide a ``'mol'`` column of molecules and a ``'drug'`` column
        whose ``.values`` are used as the drug names.
    r : int, optional
        Radius passed to the Morgan-based fingerprints ('morgan', 'fcpf').
    length : int, optional
        Number of bits requested from every fingerprint function.
    type_ : str, optional
        One of 'morgan', 'fcpf', 'atom pair', 'avalon', 'torsion', 'rdkit'.

    Returns
    -------
    Whatever ``fp_to_pandas(fp=..., drug_names=...)`` returns.

    Raises
    ------
    ValueError
        If ``type_`` is not one of the supported names.  The check happens
        before ``smiles_df`` is accessed.
    """
    # The lambdas resolve the RDKit names only when called, so selecting one
    # entry never requires the other fingerprint functions to be imported.
    builders = {
        'morgan': lambda m: AllChem.GetMorganFingerprintAsBitVect(
            m, r, nBits=length),
        'fcpf': lambda m: AllChem.GetMorganFingerprintAsBitVect(
            m, r, useFeatures=True, nBits=length),
        'atom pair': lambda m: GetHashedAtomPairFingerprintAsBitVect(
            m, nBits=length),
        'avalon': lambda m: GetAvalonFP(m, nBits=length),
        'torsion': lambda m: GetHashedTopologicalTorsionFingerprintAsBitVect(
            m, nBits=length),
        'rdkit': lambda m: RDKFingerprint(m, fpSize=length),
    }

    try:
        build = builders[type_]
    except (KeyError, TypeError):
        # TypeError covers unhashable values, which the old elif chain also
        # rejected with a ValueError.
        raise ValueError("Possible values: morgan, fcpf, atom pair, avalon, "
                         "torsion and rdkit")

    fp = [build(m) for m in smiles_df['mol']]

    drug_names = smiles_df['drug'].values
    return fp_to_pandas(fp=fp, drug_names=drug_names)
def make_fingerprints(data, length=512, verbose=False):
    """Build a fixed list of ``fingerprint`` objects and apply each to ``data``.

    Parameters
    ----------
    data :
        Passed unchanged to ``apply_fp`` of every fingerprint object.
    length : int, optional
        Bit length requested from the fingerprints that accept a size
        (torsion, Morgan, Avalon, RDKit).
    verbose : bool, optional
        When true, print the name of each fingerprint before applying it.

    Returns
    -------
    list
        The ``fingerprint`` objects, in the order listed below, after
        ``apply_fp(data)`` has been called on each.
    """
    fp_list = [
        # The torsion entry used to be passed without nBits, so it ignored
        # `length` while the other sizable fingerprints honoured it.
        fingerprint(
            lambda x: Chem.rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(
                x, nBits=length),
            "Torsion "),
        fingerprint(lambda x: GetMorganFingerprintAsBitVect(x, 2, nBits=length),
                    "Morgan"),
        fingerprint(FingerprintMol, "Estate (1995)"),
        fingerprint(lambda x: GetAvalonFP(x, nBits=length),
                    "Avalon bit based (2006)"),
        fingerprint(lambda x: np.append(GetAvalonFP(x, nBits=length), Descriptors.MolWt(x)),
                    "Avalon+mol. weight"),
        fingerprint(lambda x: GetErGFingerprint(x), "ErG fingerprint (2006)"),
        fingerprint(lambda x: RDKFingerprint(x, fpSize=length),
                    "RDKit fingerprint"),
        fingerprint(lambda x: MACCSkeys.GenMACCSKeys(x),
                    "MACCS fingerprint"),
        fingerprint(lambda x: get_fingerprint(x, fp_type='pubchem'), "PubChem"),
        # Disabled: fingerprint(lambda x: get_fingerprint(x, fp_type='FP4'), "FP4")
        fingerprint(lambda x: Generate.Gen2DFingerprint(x, Gobbi_Pharm2D.factory,
                                                        dMat=Chem.Get3DDistanceMatrix(x)),
                    "3D pharmacophore"),
    ]

    for fp in fp_list:
        if verbose:
            print("doing", fp.name)
        fp.apply_fp(data)

    return fp_list
Example #4
0
def make_fingerprints(mols, length=1024, verbose=False):
    """Create the standard set of ``fingerprint`` objects and apply them.

    Parameters
    ----------
    mols :
        Passed unchanged to ``apply_fp`` of every fingerprint object.
    length : int, optional
        Bit length requested from the hashed fingerprints.
    verbose : bool, optional
        When true, print the name of each fingerprint before applying it.

    Returns
    -------
    list
        The ``fingerprint`` objects after ``apply_fp(mols)`` ran on each.
    """
    # (callable, label) pairs, in output order.  Variants that were disabled
    # in the original list (physiochemical BP, E-state + MW, E-state index
    # sum, Avalon + MW) stay disabled.
    recipes = (
        (lambda x: GetHashedAtomPairFingerprintAsBitVect(x, nBits=length),
         "Atom pair (1985)"),
        (lambda x: GetHashedTopologicalTorsionFingerprintAsBitVect(
            x, nBits=length),
         "Topological Torsion (1987)"),
        (lambda x: GetMorganFingerprintAsBitVect(x, 2, nBits=length),
         "ECFPs/Morgan Circular (2010) "),
        (fp_Estate_ints, "E-state (fixed length) (1995)"),
        (lambda x: GetAvalonFP(x, nBits=length), "Avalon (2006)"),
        (lambda x: GetErGFingerprint(x), "ErG (2006)"),
        (lambda x: RDKFingerprint(x, fpSize=length),
         "RDKit topological (2006)"),
    )
    fp_list = [fingerprint(func, label) for func, label in recipes]

    for entry in fp_list:
        if verbose:
            print("doing", entry.name)
        entry.apply_fp(mols)

    return fp_list
Example #5
0
def GetRDkitFPs(mol, nBits=2048, return_bitInfo=False):
    """Calculate the RDKit (Daylight-like) topological fingerprint.

    Parameters
    ----------
    mol :
        Molecule object passed to ``RDKFingerprint``.
    nBits : int, optional
        Fingerprint size, forwarded as ``fpSize`` (default 2048).
    return_bitInfo : bool, optional
        When true, also return the ``bitInfo`` dict that was handed to
        ``RDKFingerprint``.

    Returns
    -------
    arr : numpy.ndarray of bool
        The fingerprint written by ``DataStructs.ConvertToNumpyArray``.
    bitInfo : dict
        Only when ``return_bitInfo`` is true.
    """
    bitInfo = {}
    fp = RDKFingerprint(mol, fpSize=nBits, bitInfo=bitInfo)
    # `np.bool` was only an alias of the builtin and no longer exists in
    # current NumPy; the builtin yields the same dtype.
    arr = np.zeros((0, ), dtype=bool)
    DataStructs.ConvertToNumpyArray(fp, arr)
    if return_bitInfo:
        # The original returned the boolean flag itself here instead of the
        # collected bit information.
        return arr, bitInfo
    return arr
Example #6
0
def convert_smiles_into_fingerprints(smile):
    """
    Converts a SMILES text string into an RDKit standard fingerprint

    :param smile:       (str) SMILE chemical formula. E.g.: 'O=C(C)Oc1ccccc1C(=O)O'
    :return:            (list) the elements obtained by iterating the
                        fingerprint returned by RDKFingerprint
    :raises ValueError: if the SMILES string cannot be parsed into a molecule
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a readable message instead of inside RDKFingerprint.
        raise ValueError("Could not parse SMILES string: {!r}".format(smile))
    return list(RDKFingerprint(mol))
Example #7
0
    def _transform_mol(self, mol):
        """Return the RDKit fingerprint of ``mol`` as a numpy array.

        All fingerprint options are read from the instance attributes and
        forwarded to ``RDKFingerprint`` as keyword arguments.
        """
        options = dict(
            minPath=self.min_path,
            maxPath=self.max_path,
            fpSize=self.n_feats,
            nBitsPerHash=self.n_bits_per_hash,
            useHs=self.use_hs,
            tgtDensity=self.target_density,
            minSize=self.min_size,
            branchedPaths=self.branched_paths,
            useBondOrder=self.use_bond_types,
        )
        bit_vect = RDKFingerprint(mol, **options)
        return np.array(list(bit_vect))
Example #8
0
 def CalculateDaylight(self, mol):
     """Compute the RDKit path fingerprint of a molecule as a list.

     Path bounds and size come from ``self.minPath``, ``self.maxPath`` and
     ``self.nBits``.

     :param mol: molecule
     :type mol: rdkit.Chem.rdchem.Mol
     :return: fingerprint
     :rtype: list

     """
     bit_vect = RDKFingerprint(
         mol, minPath=self.minPath, maxPath=self.maxPath, fpSize=self.nBits)
     return list(bit_vect)
Example #9
0
def main():
    """Write one fingerprint line per molecule of the input file.

    Reads ``infile``, ``outfile``, ``fptype``, ``size`` and ``numbits`` from
    the parsed arguments.  ``fptype`` selects a Morgan fingerprint (``size``
    is the radius) or an RDKit path fingerprint (``size`` is ``maxPath``).
    Each output line is ``<_Name property> <bit vector text>``.

    Exits with status 1 when the input format is not recognised or the
    fingerprint type is not supported.
    """
    args = parse_arguments()
    print('Read from {}'.format(args.infile))
    print('Write to {}'.format(args.outfile))

    suppl = create_mol_supplier(args.infile)
    if not suppl:
        # The original referenced a bare `infile`, which raised NameError
        # instead of printing this message.
        print('ERROR - unrecognised file format for {}'.format(args.infile))
        raise SystemExit(1)

    # Validate up front: otherwise `fp` is unbound inside the loop and the
    # output file has already been truncated by then.
    if args.fptype not in ('Morgan', 'Path'):
        print('ERROR - unsupported fingerprint type {}'.format(args.fptype))
        raise SystemExit(1)

    with open(args.outfile, 'w') as of:
        for mol in suppl:
            if mol is None:
                # NOTE(review): assumes the supplier yields None for records
                # it cannot parse, as RDKit suppliers do - confirm.
                print('WARNING - skipping a molecule that could not be read')
                continue
            if args.fptype == 'Morgan':
                fp = GetMorganFingerprintAsBitVect(mol, args.size, nBits=args.numbits)
            else:
                fp = RDKFingerprint(mol, fpSize=args.numbits, maxPath=args.size)
            of.write('{} {}\n'.format(mol.GetProp('_Name'), DataStructs.BitVectToText(fp)))
Example #10
0
    def __call__(self, mol):
        """Compute the Daylight-style fingerprint of ``mol``.

        Parameters
        ----------
        mol : rdkit.Chem.rdchem.Mol
            Rdkit mol object.

        Returns
        -------
        fp_arr : np.ndarray, shape(self.nbits, )
            Result of ``_rdkit_fp_to_np_arr`` applied to the fingerprint
            built with ``self.nbits``, ``self.min_path`` and
            ``self.max_path``.
        """
        return _rdkit_fp_to_np_arr(
            RDKFingerprint(
                mol,
                fpSize=self.nbits,
                minPath=self.min_path,
                maxPath=self.max_path,
            )
        )
def make_fingerprints(length, verbose, mols, chosen=None):
    """Build the single fingerprint selected by ``chosen`` and apply it.

    Parameters
    ----------
    length : int
        Bit length requested from the fingerprints that accept a size.
    verbose : bool
        When true, print the fingerprint name before applying it.
    mols :
        Passed unchanged to ``apply_fp``.
    chosen : int, optional
        1 atom pair, 2 topological torsion, 3 Morgan (radius 3), 4 E-state,
        5 Avalon, 6 Avalon + molecular weight, 7 ErG, 8 RDKit,
        9 ``FingerprintMols.FingerprintMol``; any other value (including
        the default ``None``) selects MACCS keys.

    Returns
    -------
    list
        One-element list holding the applied ``fingerprint`` object.
    """
    if chosen == 1:
        # The label previously carried HTML-entity residue ("&qfuot;").
        selected = fingerprint(
            lambda x: GetHashedAtomPairFingerprintAsBitVect(x, nBits=length),
            "Atom pair (1985)")
    elif chosen == 2:
        selected = fingerprint(
            lambda x: GetHashedTopologicalTorsionFingerprintAsBitVect(x, nBits=length),
            "Topological torsion (1987)")
    elif chosen == 3:
        selected = fingerprint(
            lambda x: GetMorganFingerprintAsBitVect(x, 3, nBits=length),
            "Morgan circular ")
    elif chosen == 4:
        selected = fingerprint(FingerprintMol, "Estate (1995)")
    elif chosen == 5:
        selected = fingerprint(lambda x: GetAvalonFP(x, nBits=length),
                               "Avalon bit based (2006)")
    elif chosen == 6:
        selected = fingerprint(
            lambda x: np.append(GetAvalonFP(x, nBits=length), Descriptors.MolWt(x)),
            "Avalon+mol. weight")
    elif chosen == 7:
        selected = fingerprint(lambda x: GetErGFingerprint(x),
                               "ErG fingerprint (2006)")
    elif chosen == 8:
        selected = fingerprint(lambda x: RDKFingerprint(x, fpSize=length),
                               "RDKit fingerprint")
    elif chosen == 9:
        selected = fingerprint(lambda x: FingerprintMols.FingerprintMol(x),
                               "RDKit fingerprint2")
    else:
        selected = fingerprint(lambda x: MACCSkeys.GenMACCSKeys(x),
                               "RDKit MACCSkeys")

    fp_list = [selected]

    for fp in fp_list:
        if verbose:
            print("doing", fp.name)
        fp.apply_fp(mols)

    return fp_list
Example #12
0
def GetFoldedPathFragment(mol,
                          minPath=1, maxPath=7,
                          nBits=1024, maxFragment=True,
                          disposed=True):
    """Calculate folded path fragments.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol object
        Compound to be calculated
    minPath : int, optional
        Minimum path length forwarded to ``RDKFingerprint``, by default 1
    maxPath : int, optional
        Maximum path length forwarded to ``RDKFingerprint``, by default 7
    nBits : int, optional
        Size of the folded fingerprint (``fpSize``), by default 1024
    maxFragment : bool, optional
        Forwarded to ``_DisposePathFragments``; per the original notes, whether
        only the maximum fragment of a given start atom is returned,
        by default True
    disposed : bool, optional
        Whether to post-process the raw bit info with
        ``_DisposePathFragments``, by default True

    Returns
    -------
    fragments
        The raw ``bitInfo`` dict filled by ``RDKFingerprint`` when
        ``disposed`` is false, otherwise the result of
        ``_DisposePathFragments(bitInfo, maxFragment=maxFragment)``.
    """
    bitInfo = {}
    # Only the side effect matters here: the call fills `bitInfo`.
    RDKFingerprint(mol,
                   minPath=minPath,
                   maxPath=maxPath,
                   fpSize=nBits,
                   bitInfo=bitInfo)

    if not disposed:
        return bitInfo
    return _DisposePathFragments(bitInfo, maxFragment=maxFragment)
Example #13
0
def rdk_fingerprinter(mols,
                      min_path=1,
                      max_path=7,
                      fp_size=2048,
                      n_bits_per_hash=2,
                      use_hs=True,
                      tgt_density=0.0,
                      min_size=128,
                      branched_paths=True,
                      use_bond_order=True,
                      atom_invariants=0,
                      from_atoms=0,
                      atom_bits=None,
                      bit_info=None):
    """Compute RDKit fingerprints for ``mols`` plus a matching column header.

    Every keyword is forwarded to ``RDKFingerprint`` under its camelCase
    name (``min_path`` -> ``minPath``, ``fp_size`` -> ``fpSize``, ...).

    Returns
    -------
    tuple
        ``(fingerprints, header)`` where ``fingerprints`` is the result of
        ``_rdkit_dense_fingerprinter`` and ``header`` is the result of
        ``_zfill_dense_header`` applied to the string bit indices
        ``'0' .. str(fp_size - 1)``.
    """
    def compute(mol):
        return RDKFingerprint(
            mol,
            minPath=min_path,
            maxPath=max_path,
            fpSize=fp_size,
            nBitsPerHash=n_bits_per_hash,
            useHs=use_hs,
            tgtDensity=tgt_density,
            minSize=min_size,
            branchedPaths=branched_paths,
            useBondOrder=use_bond_order,
            atomInvariants=atom_invariants,
            fromAtoms=from_atoms,
            atomBits=atom_bits,
            bitInfo=bit_info,
        )

    return (
        _rdkit_dense_fingerprinter(mols=mols, which=compute),
        # `np.str` was only an alias of the builtin `str` and no longer
        # exists in current NumPy; the builtin gives the same dtype.
        _zfill_dense_header(np.array(range(fp_size), dtype=str)),
    )
Example #14
0
# Featurize every SMILES string in RF_on_gap_data1['SMILE'] three ways. Each
# result is appended to a list (Morgan_fingerprint, Estate_fingerprint,
# RDKit_fingerprint) that must already exist before this loop runs.
for Gap_opt_RF_SMILE in RF_on_gap_data1['SMILE']:
    # Build the molecule object from its SMILES string.
    # NOTE(review): presumably MolFromSmiles yields None for an unparsable
    # string, which would make the calls below fail - confirm the input is clean.
    mol = Chem.MolFromSmiles(Gap_opt_RF_SMILE)

    # Morgan fingerprint: radius 2, 1024 bits.
    # (debug) print the bit string with .ToBitString() to inspect it.
    Morgan_fingerprint.append(
        AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024))

    # E-state fingerprint: only element [0] of FingerprintMol's result is kept.
    # (debug) print(FingerprintMol(mol)[0]) to inspect it.
    Estate_fingerprint.append(FingerprintMol(mol)[0])

    # RDKit fingerprint, 1024 bits.
    RDKit_fingerprint.append(RDKFingerprint(mol, fpSize=1024))

# Random-forest regression of Experimental_Gap on the Morgan fingerprints.
# The grid search tries 25 n_estimators values spread linearly over 50..300
# (truncated to int) with 8-fold cross-validation on all available cores.
# Scoring is the negated mean absolute error, so larger is better.
# Available scorer names: sorted(sklearn.metrics.SCORERS.keys()).
# NOTE(review): newer scikit-learn versions expose
# sklearn.metrics.get_scorer_names() instead - confirm for the installed version.
RF_on_gap_Morgan = GridSearchCV(
    RandomForestRegressor(),
    cv=8,
    param_grid={"n_estimators": np.linspace(50, 300, 25).astype('int')},
    scoring='neg_mean_absolute_error',
    n_jobs=-1)
RF_on_gap_Morgan.fit(Morgan_fingerprint, Experimental_Gap)

# Keep the best fitted model and report the search outcome. The best score
# is multiplied by -1 to undo the negation applied by the scorer.
Best_RF_on_gap_Morgan = RF_on_gap_Morgan.best_estimator_
print("Best parameters", RF_on_gap_Morgan.best_params_)
print("Score function used", RF_on_gap_Morgan.scorer_)
print("Score for the best tuning", -1 * RF_on_gap_Morgan.best_score_)
Example #15
0
def get_rdk(mol):
    """Return the 2048-bit RDKit fingerprint of ``mol`` as a character array.

    The result is a numpy array with one single-character string ('0' or
    '1') per position of the fingerprint's bit string.  The positional ``1``
    is the second argument of ``RDKFingerprint`` (``minPath`` in RDKit's
    documented signature).
    """
    from rdkit.Chem.rdmolops import RDKFingerprint
    fingerprint_bits = RDKFingerprint(mol, 1, fpSize=2048)
    characters = list(fingerprint_bits.ToBitString())
    return np.array(characters)
def get_rdk_fps(mols):
    """Stack the default RDKit fingerprints of ``mols`` into a sparse matrix.

    Each molecule's ``RDKFingerprint`` is converted with ``np.array``; the
    stacked array is wrapped in a ``scipy.sparse.csr_matrix`` and cast to
    the ``'int'`` dtype.
    """
    rows = [np.array(RDKFingerprint(mol)) for mol in mols]
    dense = np.array(rows)
    return sparse.csr_matrix(dense).astype('int')
Example #17
0
    #read in input file
    try:
        t = pd.read_csv(args.input_file, dtype=str)
    except:
        t = pd.read_csv(args.input_file, dtype=str, encoding='ISO-8859-1')
    print('Original file length: {0}'.format(len(t)))

    t.drop_duplicates(subset=['Ligand_ID'], inplace=True)
    print('after drop duplicates: {0}'.format(len(t)))
    t['Mol'] = t['SMILES'].apply(molutil.s2m)
    t = t[t["Mol"] != None]
    print('after convert mol length: {0}'.format(len(t)))

    if args.fp == 'RDKit' or args.fp == 'rdkit':
        fp = fingerprint(lambda x: RDKFingerprint(x, fpSize=args.length),
                         "RDKit fingerprint")
    elif args.fp == 'Morgan' or args.fp == 'morgan':
        fp = fingerprint(
            lambda x: GetMorganFingerprintAsBitVect(x, 2, nBits=args.length),
            "Morgan circular")

    mol_id = t['Ligand_ID'].values
    fp.apply_fp(list(t['Mol']))
    print('Size of the fingerprint {0}'.format(len(fp.x), len(fp.x[0])))

    col_names = []
    for i in range(len(fp.x[0])):
        col_names.append('col' + str(i))
    ft = pd.DataFrame(fp.x, columns=col_names)
    ft.insert(loc=0, column='Ligand_ID', value=mol_id)
Example #18
0
class FingerprintsTransformer(MoleculeTransformer):
    r"""
    Fingerprint molecule transformer.
    This transformer is able to compute various fingerprints regularly used in QSAR modeling.

    Arguments
    ----------
        kind: str, optional
            Name of the fingerprinting method used. Should be one of the
            keys of ``MAPPING``:
            {'atom_pair', 'topological_torsion', 'ecfp2', 'ecfp4', 'ecfp6',
            'estate', 'avalon_bit', 'avalon_count', 'erg', 'rdkit', 'maccs'}
            (Default value = 'ecfp2')
        length: int, optional
            Length of the fingerprint to use. It is forwarded as ``fpSize``
            for 'rdkit' and as ``nBits`` otherwise; 'estate', 'erg' and
            'maccs' ignore it.
            (Default value = 4096)

    Attributes
    ----------
        kind: str
            Name of the fingerprinting technique used
        length: int
            Length of the fingerprint to use
        fpfun: function
            function to call to compute the fingerprint; takes the molecule
            and the keyword-parameter dict
    """
    MAPPING = OrderedDict(
        # global_properties=lambda x, params: augmented_mol_properties(x),
        # physiochemical=lambda x: GetBPFingerprint(x),
        atom_pair=lambda x, params: GetHashedAtomPairFingerprintAsBitVect(
            x, **params),
        topological_torsion=lambda x, params:
        GetHashedTopologicalTorsionFingerprintAsBitVect(x, **params),
        ecfp2=lambda x, params: GetMorganFingerprintAsBitVect(x, 1, **params),
        ecfp4=lambda x, params: GetMorganFingerprintAsBitVect(x, 2, **params),
        ecfp6=lambda x, params: GetMorganFingerprintAsBitVect(x, 3, **params),
        estate=lambda x, params: FingerprintMol(x)[0],
        avalon_bit=lambda x, params: GetAvalonFP(x, **params),
        avalon_count=lambda x, params: GetAvalonCountFP(x, **params),
        erg=lambda x, params: GetErGFingerprint(x),
        rdkit=lambda x, params: RDKFingerprint(x, **params),
        maccs=lambda x, params: GetMACCSKeysFingerprint(x))

    def __init__(self, kind='ecfp2', length=4096):
        super(FingerprintsTransformer, self).__init__()
        if not (isinstance(kind, str) and
                (kind in FingerprintsTransformer.MAPPING.keys())):
            raise ValueError("Argument kind must be in: " +
                             ', '.join(FingerprintsTransformer.MAPPING.keys()))
        self.kind = kind
        self.length = length
        # Looked up through `self` so a subclass may override MAPPING; the
        # second check covers a subclass table that lacks `kind`.
        self.fpfun = self.MAPPING.get(kind, None)
        if not self.fpfun:
            raise ValueError("Fingerprint {} is not offered".format(kind))
        # RDKFingerprint names its size argument differently from the others.
        self._params = {('fpSize' if kind == 'rdkit' else 'nBits'): length}

    def _transform(self, mol):
        r"""
        Transforms a molecule into a fingerprint vector
        :raises ValueError: when the input molecule is None

        Arguments
        ----------
            mol: rdkit.Chem.Mol
                Molecule of interest

        Returns
        -------
            fp: array-like
                The computed fingerprint: the result of
                ``explicit_bit_vect_to_array`` for an ``ExplicitBitVect``,
                otherwise a plain list of the fingerprint's elements

        """

        if mol is None:
            raise ValueError("Expecting a Chem.Mol object, got None")
        # expect cryptic rdkit errors here if this fails, #rdkitdev
        fp = self.fpfun(mol, self._params)
        if isinstance(fp, ExplicitBitVect):
            fp = explicit_bit_vect_to_array(fp)
        else:
            fp = list(fp)
        return fp

    def transform(self, mols, **kwargs):
        r"""
        Transforms a batch of molecules into fingerprint vectors.

        .. note::
            The recommended way is to use the object as a callable.

        Arguments
        ----------
            mols: (str or rdkit.Chem.Mol) iterable
                List of SMILES or molecules
            kwargs: named parameters for transform (currently unused here)

        Returns
        -------
            fp: array
                computed fingerprints of size NxD, where D is the
                requested length of features and N is the number of input
                molecules that have been successfully featurized. The
                float32 array is passed through ``totensor(..., gpu=False)``
                before being returned.

        See Also
        --------
            :func:`~ivbase.transformers.features.MoleculeTransformer.transform`

        """
        mol_list = [self.to_mol(mol, addHs=False) for mol in mols]
        # Drop molecules that could not be converted. The previous
        # `filter(None.__ne__, ...)` relied on NotImplemented being truthy,
        # which is deprecated since Python 3.9 and a TypeError in 3.14.
        mol_list = [mol for mol in mol_list if mol is not None]
        features = np.array([self._transform(mol)
                             for mol in mol_list]).astype(np.float32)
        features = totensor(features, gpu=False)

        return features

    def __call__(self, mols, dtype=torch.long, cuda=False, **kwargs):
        r"""
        Transforms a batch of molecules into fingerprint vectors,
        and return the transformation in the desired data type format as well as
        the set of valid indexes.

        Arguments
        ----------
            mols: (str or rdkit.Chem.Mol) iterable
                The list of input smiles or molecules
            dtype: torch.dtype or numpy.dtype, optional
                Datatype of the transformed variable.
                A torch dtype produces a tensor (via ``totensor``); a numpy
                dtype produces a numpy array. Which values count as either
                is decided by ``is_dtype_torch_tensor`` and
                ``is_dtype_numpy_array``. Any other value raises ``TypeError``.
                (Default value = torch.long)
            cuda: bool, optional
                Whether to transfer tensor on the GPU (if output is a tensor)
            kwargs: named parameters forwarded to the parent ``__call__``

        Returns
        -------
            fp: array
                computed fingerprints (in `dtype` datatype) of size NxD,
                where D is the requested length of features and N is the number
                of input molecules that have been successfully featurized.
            ids: array
                all valid molecule positions that did not fail during featurization

        Raises
        ------
            TypeError
                If ``dtype`` is neither a numpy nor a torch dtype.

        See Also
        --------
            :func:`~ivbase.transformers.features.FingerprintsTransformer.transform`

        """
        fp, ids = super(FingerprintsTransformer, self).__call__(mols, **kwargs)
        if is_dtype_numpy_array(dtype):
            fp = np.array(fp, dtype=dtype)
        elif is_dtype_torch_tensor(dtype):
            fp = totensor(fp, gpu=cuda, dtype=dtype)
        else:
            raise TypeError('The type {} is not supported'.format(dtype))
        return fp, ids
Example #19
0
def convert_to_rdkit(SMILES):
    """Return the default RDKit fingerprint of a SMILES string as floats.

    The SMILES is converted with ``MS``, fingerprinted with
    ``RDKFingerprint`` and the bit string is turned into a list with one
    float (0.0 or 1.0) per bit.
    """
    bit_string = RDKFingerprint(MS(SMILES)).ToBitString()
    as_floats = np.asarray(list(bit_string), dtype=float)
    return list(as_floats)
Example #20
0
class FingerprintsTransformer(MoleculeTransformer):
    """Molecule transformer producing molecular fingerprints.

    Parameters
    ----------
    kind : {'atom_pair', 'topological_torsion', 'morgan_circular', 'estate',
        'avalon_bit', 'avalon_count', 'erg', 'rdkit', 'maccs'}, optional,
        default='morgan_circular'
        Name of the fingerprinting technique used; must be a key of
        ``mapping``.
    length : int, optional, default=2000
        Length of the fingerprint to use. Forwarded as ``fpSize`` for
        'rdkit' and as ``nBits`` otherwise; 'estate', 'erg' and 'maccs'
        ignore it.

    Attributes
    ----------
    kind : str
        Name of the fingerprinting technique used
    length : int
        Length of the fingerprint to use
    fpfun : function
        Function called with the molecule and the parameter dict to compute
        the fingerprint
    """
    mapping = OrderedDict(
        # physiochemical=lambda x: GetBPFingerprint(x),
        atom_pair=lambda x, params:
        GetHashedAtomPairFingerprintAsBitVect(x, **params),
        topological_torsion=lambda x, params:
        GetHashedTopologicalTorsionFingerprintAsBitVect(x, **params),
        morgan_circular=lambda x, params:
        GetMorganFingerprintAsBitVect(x, 2, **params),
        estate=lambda x, params: FingerprintMol(x)[0],
        avalon_bit=lambda x, params: GetAvalonFP(x, **params),
        avalon_count=lambda x, params: GetAvalonCountFP(x, **params),
        erg=lambda x, params: GetErGFingerprint(x),
        rdkit=lambda x, params: RDKFingerprint(x, **params),
        maccs=lambda x, params: GetMACCSKeysFingerprint(x))

    def __init__(self, kind='morgan_circular', length=2000):
        super(FingerprintsTransformer, self).__init__()
        known = FingerprintsTransformer.mapping
        if not isinstance(kind, str) or kind not in known:
            raise ValueError("Argument kind must be in: " +
                             ', '.join(known.keys()))
        self.kind = kind
        self.length = length
        # Resolved through `self` so a subclass table is honoured.
        self.fpfun = self.mapping.get(kind)
        if not self.fpfun:
            raise ValueError("Fingerprint {} is not offered".format(kind))
        size_keyword = 'fpSize' if kind == 'rdkit' else 'nBits'
        self._params = {size_keyword: length}

    def _transform(self, mol):
        """Transform one molecule into a fingerprint vector.

        Parameters
        ----------
        mol : rdkit.Chem.Mol or None
            The molecule of interest. ``None`` triggers a warning and
            yields an all-zero vector of ``self.length`` entries.

        Returns
        -------
        fp : np.ndarray
            The computed fingerprint
        """
        if mol is not None:
            raw = self.fpfun(mol, self._params)
        else:
            warnings.warn("None value received for argument mol")
            raw = np.zeros(self.length)

        if isinstance(raw, ExplicitBitVect):
            return explicit_bit_vect_to_array(raw)
        return np.array(list(raw))

    def transform(self, mols):
        """Transform a batch of molecules into fingerprint vectors.

        Parameters
        ----------
        mols : (str or rdkit.Chem.Mol) list
            The list of smiles or molecules

        Returns
        -------
        fp : 2d np.ndarray
            The parent class' ``transform(mols, as_numpy=True)`` result,
            wrapped in ``np.array``
        """
        parent_result = super(FingerprintsTransformer, self).transform(
            mols, as_numpy=True)
        return np.array(parent_result)
Example #21
0
def rdk(fpSize=2048):
    """Return a one-argument callable computing ``RDKFingerprint(x, fpSize=fpSize)``."""
    def compute(x):
        return RDKFingerprint(x, fpSize=fpSize)

    return compute