def get_fingerprint(self, mol):
    """Compute the fingerprint of ``mol`` for the configured fingerprint type.

    The fingerprint family is selected by ``self.fp_type``:
    ``'rdkit'`` uses ``RDKFingerprint`` and ``'morgan'`` uses
    ``GetMorganFingerprintAsBitVect`` with radius 2.

    Parameters
    ----------
    mol
        Molecule handed to the selected fingerprint function.

    Returns
    -------
    numpy.ndarray
        One integer (0 or 1) per character of the fingerprint's bit string.

    Raises
    ------
    ValueError
        If ``self.fp_type`` is neither ``'rdkit'`` nor ``'morgan'``.
    """
    if self.fp_type == 'rdkit':
        fp = RDKFingerprint(mol)
    elif self.fp_type == 'morgan':
        fp = GetMorganFingerprintAsBitVect(mol, 2)
    else:
        # Without this branch an unknown type fell through and crashed on the
        # return line with an UnboundLocalError that named ``fp`` rather than
        # the real problem.
        raise ValueError(
            "Unsupported fingerprint type {!r}; expected 'rdkit' or "
            "'morgan'".format(self.fp_type))
    return np.array(list(map(int, fp.ToBitString())))
def get_fingerprints(smiles_df, r=2, length=512, type_='morgan'):
    """Compute one fingerprint per molecule and hand them to ``fp_to_pandas``.

    Parameters
    ----------
    smiles_df
        Table indexed by column name.  The ``'mol'`` column is iterated to
        obtain the molecules and ``smiles_df['drug'].values`` supplies the
        matching names (presumably a pandas DataFrame -- confirm with callers).
    r : int, optional
        Radius used by the two Morgan-based types (``'morgan'``, ``'fcpf'``).
    length : int, optional
        Number of bits requested from every fingerprint function.
    type_ : str, optional
        One of ``'morgan'``, ``'fcpf'``, ``'atom pair'``, ``'avalon'``,
        ``'torsion'`` or ``'rdkit'``.

    Returns
    -------
    Whatever ``fp_to_pandas(fp=..., drug_names=...)`` returns.

    Raises
    ------
    ValueError
        If ``type_`` is not one of the supported names.
    """
    # The lambdas are only evaluated for the selected type, so an unsupported
    # ``type_`` is rejected before any molecule is touched.
    # NOTE(review): 'fcpf' looks like a transposition of "FCFP"; the key is
    # kept as-is because callers may already pass it.
    makers = {
        'morgan': lambda m: AllChem.GetMorganFingerprintAsBitVect(
            m, r, nBits=length),
        'fcpf': lambda m: AllChem.GetMorganFingerprintAsBitVect(
            m, r, useFeatures=True, nBits=length),
        'atom pair': lambda m: GetHashedAtomPairFingerprintAsBitVect(
            m, nBits=length),
        'avalon': lambda m: GetAvalonFP(m, nBits=length),
        'torsion': lambda m: GetHashedTopologicalTorsionFingerprintAsBitVect(
            m, nBits=length),
        'rdkit': lambda m: RDKFingerprint(m, fpSize=length),
    }
    try:
        make_fp = makers[type_]
    except (KeyError, TypeError):
        # TypeError covers an unhashable ``type_``, which the old if/elif
        # chain also reported as an unsupported value.
        make_fp = None
    if make_fp is None:
        # The message previously spelled the accepted value as "torision".
        raise ValueError(
            "Possible values: morgan, fcpf, atom pair, avalon, torsion and "
            "rdkit")

    fp = [make_fp(m) for m in smiles_df['mol']]
    drug_names = smiles_df['drug'].values
    return fp_to_pandas(fp=fp, drug_names=drug_names)
def make_fingerprints(data, length=512, verbose=False):
    """Build every fingerprint wrapper in the catalogue and apply it to ``data``.

    Each catalogue entry pairs a per-molecule callable with a display name and
    is wrapped in a ``fingerprint`` object.  ``length`` is forwarded as the
    bit count to the Morgan, both Avalon and the RDKit entries.

    Parameters
    ----------
    data
        Passed unchanged to ``apply_fp`` of every wrapper.
    length : int, optional
        Bit count for the entries that accept one.
    verbose : bool, optional
        When true, print ``doing <name>`` before each wrapper is applied.

    Returns
    -------
    list
        The ``fingerprint`` wrappers in catalogue order, after ``apply_fp``
        has been called on each of them.
    """
    # NOTE(review): the torsion entry receives the bare RDKit function, so
    # ``length`` is not applied to it -- confirm this is intentional.
    catalogue = [
        (Chem.rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect,
         "Torsion "),
        (lambda x: GetMorganFingerprintAsBitVect(x, 2, nBits=length),
         "Morgan"),
        (FingerprintMol, "Estate (1995)"),
        (lambda x: GetAvalonFP(x, nBits=length), "Avalon bit based (2006)"),
        (lambda x: np.append(GetAvalonFP(x, nBits=length),
                             Descriptors.MolWt(x)),
         "Avalon+mol. weight"),
        (lambda x: GetErGFingerprint(x), "ErG fingerprint (2006)"),
        (lambda x: RDKFingerprint(x, fpSize=length), "RDKit fingerprint"),
        (lambda x: MACCSkeys.GenMACCSKeys(x), "MACCS fingerprint"),
        (lambda x: get_fingerprint(x, fp_type='pubchem'), "PubChem"),
        # An 'FP4' entry (get_fingerprint(x, fp_type='FP4')) is disabled.
        (lambda x: Generate.Gen2DFingerprint(
            x, Gobbi_Pharm2D.factory, dMat=Chem.Get3DDistanceMatrix(x)),
         "3D pharmacophore"),
    ]
    fp_list = [fingerprint(func, label) for func, label in catalogue]

    for wrapper in fp_list:
        if verbose:
            print("doing", wrapper.name)
        wrapper.apply_fp(data)
    return fp_list
def make_fingerprints(mols, length=1024, verbose=False):
    """Build the active fingerprint wrappers and apply each one to ``mols``.

    Every catalogue entry pairs a per-molecule callable with a display name
    and is wrapped in a ``fingerprint`` object.  ``length`` is forwarded as
    the bit count to the atom-pair, torsion, Morgan, Avalon and RDKit entries.

    Parameters
    ----------
    mols
        Passed unchanged to ``apply_fp`` of every wrapper.
    length : int, optional
        Bit count for the entries that accept one.
    verbose : bool, optional
        When true, print ``doing <name>`` before each wrapper is applied.

    Returns
    -------
    list
        The ``fingerprint`` wrappers in catalogue order, after ``apply_fp``
        has been called on each of them.
    """
    # Disabled variants, kept for reference only:
    #   - "Physiochemical properties (1996)" via GetBPFingerprint
    #     (noted as taking a long time to compute)
    #   - "E-state + MW weight (1995)" and "E-state, index sum (1995)"
    #   - "Avalon+mol. weight"
    catalogue = [
        (lambda x: GetHashedAtomPairFingerprintAsBitVect(x, nBits=length),
         "Atom pair (1985)"),
        (lambda x: GetHashedTopologicalTorsionFingerprintAsBitVect(
            x, nBits=length),
         "Topological Torsion (1987)"),
        (lambda x: GetMorganFingerprintAsBitVect(x, 2, nBits=length),
         "ECFPs/Morgan Circular (2010) "),
        (fp_Estate_ints, "E-state (fixed length) (1995)"),
        (lambda x: GetAvalonFP(x, nBits=length), "Avalon (2006)"),
        (lambda x: GetErGFingerprint(x), "ErG (2006)"),
        (lambda x: RDKFingerprint(x, fpSize=length),
         "RDKit topological (2006)"),
    ]
    fp_list = [fingerprint(func, label) for func, label in catalogue]

    for wrapper in fp_list:
        if verbose:
            print("doing", wrapper.name)
        wrapper.apply_fp(mols)
    return fp_list
def GetRDkitFPs(mol, nBits=2048, return_bitInfo=False):
    """Calculate the RDKit topological fingerprint of ``mol`` as an array.

    Parameters
    ----------
    mol
        Molecule object passed to ``RDKFingerprint``.
    nBits : int, optional
        Fingerprint size, forwarded as ``fpSize`` (default 2048).
    return_bitInfo : bool, optional
        When true, also return the dictionary that ``RDKFingerprint`` fills
        through its ``bitInfo`` argument.

    Returns
    -------
    arr : numpy.ndarray
        Boolean array filled by ``DataStructs.ConvertToNumpyArray``.
    bitInfo : dict
        Only when ``return_bitInfo`` is true; returned as the second element
        of an ``(arr, bitInfo)`` tuple.
    """
    bitInfo = {}
    fp = RDKFingerprint(mol, fpSize=nBits, bitInfo=bitInfo)
    # ``np.bool`` was a plain alias of the builtin ``bool`` and no longer
    # exists in NumPy >= 1.24; the builtin gives the same dtype.
    arr = np.zeros((0, ), dtype=bool)
    DataStructs.ConvertToNumpyArray(fp, arr)
    if return_bitInfo:
        # Return the collected dictionary; the flag itself used to be
        # returned here, so callers only ever received ``True``.
        return arr, bitInfo
    return arr
def convert_smiles_into_fingerprints(smile):
    """
    Convert a SMILES string into the elements of its RDKit standard fingerprint.

    :param smile: (str) SMILES chemical formula. E.g.: 'O=C(C)Oc1ccccc1C(=O)O'
    :return: (list) the items obtained by iterating over the fingerprint
        returned by ``RDKFingerprint``
    """
    # NOTE(review): the result of MolFromSmiles is passed on unchecked --
    # confirm how callers deal with SMILES that cannot be parsed.
    molecule = Chem.MolFromSmiles(smile)
    fingerprint_bits = RDKFingerprint(molecule)
    return list(fingerprint_bits)
def _transform_mol(self, mol):
    """Return the RDKit fingerprint of ``mol`` as a NumPy array.

    All fingerprint settings are read from the instance and forwarded to
    ``RDKFingerprint``; the resulting fingerprint is iterated into a list
    and wrapped in ``np.array``.
    """
    options = dict(
        minPath=self.min_path,
        maxPath=self.max_path,
        fpSize=self.n_feats,
        nBitsPerHash=self.n_bits_per_hash,
        useHs=self.use_hs,
        tgtDensity=self.target_density,
        minSize=self.min_size,
        branchedPaths=self.branched_paths,
        useBondOrder=self.use_bond_types,
    )
    bit_vector = RDKFingerprint(mol, **options)
    return np.array(list(bit_vector))
def CalculateDaylight(self, mol):
    """
    Compute the RDKit fingerprint of a molecule using the path lengths
    (``self.minPath``, ``self.maxPath``) and size (``self.nBits``) stored on
    the instance.

    :param mol: molecule
    :type mol: rdkit.Chem.rdchem.Mol
    :return: fingerprint, iterated into a plain list
    :rtype: list
    """
    return list(
        RDKFingerprint(
            mol,
            minPath=self.minPath,
            maxPath=self.maxPath,
            fpSize=self.nBits,
        )
    )
def main():
    """Write one fingerprint line per molecule of the input file.

    Reads the options from ``parse_arguments()`` (``infile``, ``outfile``,
    ``fptype``, ``size``, ``numbits``), obtains the molecules from
    ``create_mol_supplier(args.infile)`` and writes
    ``<_Name property> <fingerprint text>`` lines to ``args.outfile``.

    ``fptype`` selects the fingerprint: ``'Morgan'`` uses ``size`` as the
    radius, ``'Path'`` uses ``size`` as ``maxPath``; both use ``numbits`` bits.

    Exits with status 1 when no supplier is available for the input file or
    when ``fptype`` is not one of the two supported values.
    """
    args = parse_arguments()
    print('Read from {}'.format(args.infile))
    print('Write to {}'.format(args.outfile))

    suppl = create_mol_supplier(args.infile)
    if not suppl:
        # The message used the undefined bare name ``infile`` and therefore
        # died with a NameError instead of reporting the problem.
        print('ERROR - unrecognised file format for {}'.format(args.infile))
        raise SystemExit(1)

    # Pick the fingerprint function once, before the output file is opened,
    # so that an unsupported type is reported instead of failing with an
    # unbound ``fp`` at the first molecule.
    if args.fptype == 'Morgan':
        def make_fp(mol):
            return GetMorganFingerprintAsBitVect(
                mol, args.size, nBits=args.numbits)
    elif args.fptype == 'Path':
        def make_fp(mol):
            return RDKFingerprint(
                mol, fpSize=args.numbits, maxPath=args.size)
    else:
        print('ERROR - unsupported fingerprint type {}'.format(args.fptype))
        raise SystemExit(1)

    with open(args.outfile, 'w') as of:
        for mol in suppl:
            if mol is None:
                # presumably the supplier yields None for records it cannot
                # parse; skip them rather than crash half-way through.
                print('WARNING - skipping unreadable molecule')
                continue
            of.write('{} {}\n'.format(mol.GetProp('_Name'),
                                      DataStructs.BitVectToText(make_fp(mol))))
def __call__(self, mol):
    """Generate the Daylight fingerprint for the passed `mol` object.

    The fingerprint is computed by ``RDKFingerprint`` with the size and path
    lengths stored on the instance, then converted by
    ``_rdkit_fp_to_np_arr``.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        Rdkit mol object.

    Returns
    -------
    fp_arr : np.ndarray
        Result of ``_rdkit_fp_to_np_arr`` (presumably a row vector of
        ``self.nbits`` entries -- confirm against that helper).
    """
    return _rdkit_fp_to_np_arr(
        RDKFingerprint(
            mol,
            fpSize=self.nbits,
            minPath=self.min_path,
            maxPath=self.max_path,
        )
    )
def make_fingerprints(length, verbose, mols, chosen=None):
    """Build a single fingerprint wrapper selected by ``chosen`` and apply it.

    Parameters
    ----------
    length : int
        Bit count forwarded to the fingerprint functions that accept one
        (choices 1, 2, 3, 5, 6 and 8).
    verbose : bool
        When true, print ``doing <name>`` before the wrapper is applied.
    mols
        Passed unchanged to ``apply_fp`` of the wrapper.
    chosen : int, optional
        Selector for the fingerprint:

        ====  ===============================
        1     Atom pair
        2     Topological torsion
        3     Morgan circular (radius 3)
        4     Estate
        5     Avalon
        6     Avalon + molecular weight
        7     ErG
        8     RDKit fingerprint
        9     ``FingerprintMols.FingerprintMol``
        ====  ===============================

        Any other value, including the default ``None``, selects MACCS keys.

    Returns
    -------
    list
        One-element list holding the ``fingerprint`` wrapper after
        ``apply_fp`` has been called on it.
    """
    catalogue = {
        # The label used to read "&qfuot;Atom pair (1985)" -- a mangled HTML
        # quote entity that leaked into the display name.
        1: (lambda x: GetHashedAtomPairFingerprintAsBitVect(x, nBits=length),
            "Atom pair (1985)"),
        2: (lambda x: GetHashedTopologicalTorsionFingerprintAsBitVect(
                x, nBits=length),
            "Topological torsion (1987)"),
        3: (lambda x: GetMorganFingerprintAsBitVect(x, 3, nBits=length),
            "Morgan circular "),
        4: (FingerprintMol, "Estate (1995)"),
        5: (lambda x: GetAvalonFP(x, nBits=length),
            "Avalon bit based (2006)"),
        6: (lambda x: np.append(GetAvalonFP(x, nBits=length),
                                Descriptors.MolWt(x)),
            "Avalon+mol. weight"),
        7: (lambda x: GetErGFingerprint(x), "ErG fingerprint (2006)"),
        8: (lambda x: RDKFingerprint(x, fpSize=length), "RDKit fingerprint"),
        9: (lambda x: FingerprintMols.FingerprintMol(x),
            "RDKit fingerprint2"),
    }
    fallback = (lambda x: MACCSkeys.GenMACCSKeys(x), "RDKit MACCSkeys")
    try:
        func, label = catalogue.get(chosen, fallback)
    except TypeError:
        # An unhashable ``chosen`` matched no branch of the old if/elif chain
        # either, so it also ends up with the fallback.
        func, label = fallback

    fp_list = [fingerprint(func, label)]
    for fp in fp_list:
        if verbose:
            print("doing", fp.name)
        fp.apply_fp(mols)
    return fp_list
def GetFoldedPathFragment(mol, minPath=1, maxPath=7, nBits=1024,
                          maxFragment=True, disposed=True):
    """Calculate folded path fragments.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol object
        Compound to be calculated
    minPath : int, optional
        The minimum length of a path-based fragment, by default 1
    maxPath : int, optional
        The maximum length of a path-based fragment, by default 7
    nBits : int, optional
        The number of bits of the path fingerprint (passed as ``fpSize``),
        by default 1024
    maxFragment : bool, optional
        Forwarded to ``_DisposePathFragments``; whether only the maximum
        fragment of a given start atom is returned, by default True
    disposed : bool, optional
        Whether to post-process the raw bit information with
        ``_DisposePathFragments``, by default True

    Returns
    -------
    fragments
        The result of ``_DisposePathFragments(bitInfo, maxFragment=...)`` when
        ``disposed`` is true; otherwise the raw ``bitInfo`` dictionary filled
        by ``RDKFingerprint``.
    """
    bitInfo = {}
    # Called only for its side effect of filling ``bitInfo``; the bit vector
    # it returns is not needed here.
    RDKFingerprint(mol,
                   minPath=minPath,
                   maxPath=maxPath,
                   fpSize=nBits,
                   bitInfo=bitInfo)
    if not disposed:
        return bitInfo
    return _DisposePathFragments(bitInfo, maxFragment=maxFragment)
def rdk_fingerprinter(mols,
                      min_path=1,
                      max_path=7,
                      fp_size=2048,
                      n_bits_per_hash=2,
                      use_hs=True,
                      tgt_density=0.0,
                      min_size=128,
                      branched_paths=True,
                      use_bond_order=True,
                      atom_invariants=0,
                      from_atoms=0,
                      atom_bits=None,
                      bit_info=None):
    """Compute RDKit fingerprints for ``mols`` together with a column header.

    Every keyword except ``mols`` is forwarded to ``RDKFingerprint`` under
    its camelCase name (``min_path`` -> ``minPath``, ``fp_size`` ->
    ``fpSize``, and so on).

    Returns
    -------
    tuple
        ``(fingerprints, header)`` where ``fingerprints`` is the result of
        ``_rdkit_dense_fingerprinter`` and ``header`` is the result of
        ``_zfill_dense_header`` applied to the bit indices
        ``0 .. fp_size - 1`` rendered as strings.
    """
    def _fingerprint(mol):
        return RDKFingerprint(
            mol,
            minPath=min_path,
            maxPath=max_path,
            fpSize=fp_size,
            nBitsPerHash=n_bits_per_hash,
            useHs=use_hs,
            tgtDensity=tgt_density,
            minSize=min_size,
            branchedPaths=branched_paths,
            useBondOrder=use_bond_order,
            atomInvariants=atom_invariants,
            fromAtoms=from_atoms,
            atomBits=atom_bits,
            bitInfo=bit_info,
        )

    fingerprints = _rdkit_dense_fingerprinter(mols=mols, which=_fingerprint)
    # ``np.str`` was a plain alias of the builtin ``str`` and no longer exists
    # in NumPy >= 1.24; the builtin gives the same dtype.
    header = _zfill_dense_header(np.array(range(fp_size), dtype=str))
    return fingerprints, header
# Featurise every entry of the 'SMILE' column three ways. Each fingerprint is
# appended to its own list, so the three lists stay aligned with the rows.
for Gap_opt_RF_SMILE in RF_on_gap_data1['SMILE']:
    # translate mol from its SMILE formula
    mol = Chem.MolFromSmiles(Gap_opt_RF_SMILE)
    # calculate the Morgan fingerprint (radius 2, 1024 bits)
    Morgan_fingerprint.append(
        AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024))
    # calculate the Estate fingerprint (first element of FingerprintMol's
    # result)
    Estate_fingerprint.append(FingerprintMol(mol)[0])
    # calculate the RDKit fingerprint (1024 bits)
    RDKit_fingerprint.append(RDKFingerprint(mol, fpSize=1024))

# Morgan_fingerprint and bandgaps using RF model: 8-fold grid search over the
# number of trees (25 evenly spaced values from 50 to 300, cast to int),
# scored by negative mean absolute error.
# use sorted(sklearn.metrics.SCORERS.keys()) to find what are available in sklearn lib
RF_on_gap_Morgan = GridSearchCV(
    RandomForestRegressor(),
    cv=8,
    param_grid={"n_estimators": np.linspace(50, 300, 25).astype('int')},
    scoring='neg_mean_absolute_error',
    n_jobs=-1)
RF_on_gap_Morgan.fit(Morgan_fingerprint, Experimental_Gap)
Best_RF_on_gap_Morgan = RF_on_gap_Morgan.best_estimator_
print("Best parameters", RF_on_gap_Morgan.best_params_)
print("Score function used", RF_on_gap_Morgan.scorer_)
# The score is a negated error, so flip the sign to report the error itself.
print("Score for the best tuning", -1 * RF_on_gap_Morgan.best_score_)
def get_rdk(mol):
    """Return the 2048-bit RDKit fingerprint of ``mol`` as an array of characters.

    The fingerprint's bit string is split into its individual characters, so
    every element of the returned array is the string ``'0'`` or ``'1'``.
    """
    from rdkit.Chem.rdmolops import RDKFingerprint

    # NOTE(review): the positional ``1`` is presumably ``minPath`` -- confirm
    # against the RDKFingerprint signature.
    # NOTE(review): elements are characters, not integers -- confirm that
    # callers convert them as needed.
    fingerprint = RDKFingerprint(mol, 1, fpSize=2048)
    characters = list(fingerprint.ToBitString())
    return np.array(characters)
def get_rdk_fps(mols):
    """Return the RDKit fingerprints of ``mols`` as an integer CSR matrix.

    Each molecule's fingerprint (default ``RDKFingerprint`` settings) is
    converted with ``np.array``; the per-molecule arrays are stacked with
    ``np.array`` and the result is wrapped in ``sparse.csr_matrix`` and cast
    to ``'int'``.
    """
    dense = np.array([np.array(RDKFingerprint(molecule)) for molecule in mols])
    return sparse.csr_matrix(dense).astype('int')
# Read the input file; retry as Latin-1 when it is not valid in the default
# encoding. Only decoding failures trigger the retry -- any other error (for
# example a missing file) is raised immediately.
try:
    t = pd.read_csv(args.input_file, dtype=str)
except UnicodeDecodeError:
    t = pd.read_csv(args.input_file, dtype=str, encoding='ISO-8859-1')
print('Original file length: {0}'.format(len(t)))

t.drop_duplicates(subset=['Ligand_ID'], inplace=True)
print('after drop duplicates: {0}'.format(len(t)))

# Convert every SMILES string to a molecule and drop the rows that could not
# be converted. ``t["Mol"] != None`` is evaluated element-wise by pandas and
# is True for every row, so it never removed anything.
t['Mol'] = t['SMILES'].apply(molutil.s2m)
t = t[t['Mol'].notna()]
print('after convert mol length: {0}'.format(len(t)))

# Select the fingerprint; both spellings of each name are accepted.
if args.fp in ('RDKit', 'rdkit'):
    fp = fingerprint(lambda x: RDKFingerprint(x, fpSize=args.length),
                     "RDKit fingerprint")
elif args.fp in ('Morgan', 'morgan'):
    fp = fingerprint(
        lambda x: GetMorganFingerprintAsBitVect(x, 2, nBits=args.length),
        "Morgan circular")
else:
    # Previously an unknown name left ``fp`` undefined and the script failed
    # a few lines further down.
    raise ValueError(
        'Unsupported fingerprint {0!r}; expected RDKit or Morgan'.format(
            args.fp))

mol_id = t['Ligand_ID'].values
fp.apply_fp(list(t['Mol']))
# The second value used to be passed to format() without a placeholder, so
# only the row count was printed.
print('Size of the fingerprint {0} x {1}'.format(len(fp.x), len(fp.x[0])))

# One column per fingerprint position, with the ligand identifier in front.
col_names = ['col' + str(i) for i in range(len(fp.x[0]))]
ft = pd.DataFrame(fp.x, columns=col_names)
ft.insert(loc=0, column='Ligand_ID', value=mol_id)
class FingerprintsTransformer(MoleculeTransformer):
    r"""
    Fingerprint molecule transformer.
    This transformer is able to compute various fingerprints regularly used in QSAR modeling.

    Arguments
    ----------
        kind: str, optional
            Name of the fingerprinting method used. Should be one of
            {'atom_pair', 'topological_torsion', 'ecfp2', 'ecfp4', 'ecfp6',
            'estate', 'avalon_bit', 'avalon_count', 'erg', 'rdkit', 'maccs'}
            (Default value = 'ecfp2')
        length: int, optional
            Length of the fingerprint to use. It is forwarded to the
            fingerprint function as ``fpSize`` for 'rdkit' and as ``nBits``
            otherwise; the 'estate', 'erg' and 'maccs' functions ignore it.
            (Default value = 4096)

    Attributes
    ----------
        kind: str
            Name of the fingerprinting technique used
        length: int
            Length of the fingerprint to use
        fpfun: function
            function to call to compute the fingerprint

    """
    # Each entry takes (molecule, params) where ``params`` is the keyword
    # dictionary built in ``__init__``.
    # Disabled entries: 'global_properties' (augmented_mol_properties) and
    # 'physiochemical' (GetBPFingerprint).
    MAPPING = OrderedDict(
        atom_pair=lambda x, params: GetHashedAtomPairFingerprintAsBitVect(
            x, **params),
        topological_torsion=lambda x, params:
        GetHashedTopologicalTorsionFingerprintAsBitVect(x, **params),
        ecfp2=lambda x, params: GetMorganFingerprintAsBitVect(x, 1, **params),
        ecfp4=lambda x, params: GetMorganFingerprintAsBitVect(x, 2, **params),
        ecfp6=lambda x, params: GetMorganFingerprintAsBitVect(x, 3, **params),
        estate=lambda x, params: FingerprintMol(x)[0],
        avalon_bit=lambda x, params: GetAvalonFP(x, **params),
        avalon_count=lambda x, params: GetAvalonCountFP(x, **params),
        erg=lambda x, params: GetErGFingerprint(x),
        rdkit=lambda x, params: RDKFingerprint(x, **params),
        maccs=lambda x, params: GetMACCSKeysFingerprint(x))

    def __init__(self, kind='ecfp2', length=4096):
        super(FingerprintsTransformer, self).__init__()
        if not (isinstance(kind, str) and
                (kind in FingerprintsTransformer.MAPPING.keys())):
            raise ValueError("Argument kind must be in: " +
                             ', '.join(FingerprintsTransformer.MAPPING.keys()))
        self.kind = kind
        self.length = length
        # Looked up through ``self`` so that a subclass overriding MAPPING is
        # honoured; the check below covers a kind missing from that override.
        self.fpfun = self.MAPPING.get(kind, None)
        if not self.fpfun:
            raise ValueError("Fingerprint {} is not offered".format(kind))
        # RDKFingerprint names its size argument differently from the others.
        self._params = {('fpSize' if kind == 'rdkit' else 'nBits'): length}

    def _transform(self, mol):
        r"""
        Transforms a molecule into a fingerprint vector
        :raises ValueError: when the input molecule is None

        Arguments
        ----------
            mol: rdkit.Chem.Mol
                Molecule of interest

        Returns
        -------
            fp: np.ndarray or list
                The computed fingerprint: the result of
                ``explicit_bit_vect_to_array`` for an ``ExplicitBitVect``,
                otherwise the fingerprint iterated into a list

        """
        if mol is None:
            raise ValueError("Expecting a Chem.Mol object, got None")
        # expect cryptic rdkit errors here if this fails, #rdkitdev
        fp = self.fpfun(mol, self._params)
        if isinstance(fp, ExplicitBitVect):
            fp = explicit_bit_vect_to_array(fp)
        else:
            fp = list(fp)
        return fp

    def transform(self, mols, **kwargs):
        r"""
        Transforms a batch of molecules into fingerprint vectors.
        Inputs that ``to_mol`` turns into ``None`` are dropped.

        .. note::
            The recommended way is to use the object as a callable.

        Arguments
        ----------
            mols: (str or rdkit.Chem.Mol) iterable
                List of SMILES or molecules
            kwargs: named parameters for transform (currently unused here)

        Returns
        -------
            fp: array
                computed fingerprints of size NxD, where D is the
                requested length of features and N is the number of input
                molecules that have been successfully featurized.

        See Also
        --------
            :func:`~ivbase.transformers.features.MoleculeTransformer.transform`

        """
        mol_list = [self.to_mol(mol, addHs=False) for mol in mols]
        # ``filter(None.__ne__, ...)`` kept the valid molecules only because
        # ``NotImplemented`` happened to be truthy, which is deprecated since
        # Python 3.9; test for None explicitly instead.
        mol_list = [mol for mol in mol_list if mol is not None]
        features = np.array([self._transform(mol)
                             for mol in mol_list]).astype(np.float32)
        features = totensor(features, gpu=False)

        return features

    def __call__(self, mols, dtype=torch.long, cuda=False, **kwargs):
        r"""
        Transforms a batch of molecules into fingerprint vectors,
        and return the transformation in the desired data type format as well as
        the set of valid indexes.

        Arguments
        ----------
            mols: (str or rdkit.Chem.Mol) iterable
                The list of input smiles or molecules
            dtype: torch.dtype or numpy.dtype, optional
                Datatype of the transformed variable.
                A torch dtype yields a tensor, a numpy dtype yields a numpy
                array; anything else is rejected.
                (Default value = torch.long)
            cuda: bool, optional
                Whether to transfer tensor on the GPU (if output is a tensor)
            kwargs: named parameters forwarded to the parent ``__call__``

        Returns
        -------
            fp: array
                computed fingerprints (in `dtype` datatype) of size NxD,
                where D is the requested length of features and N is the number
                of input molecules that have been successfully featurized.
            ids: array
                all valid molecule positions that did not failed during featurization

        Raises
        ------
            TypeError
                When ``dtype`` is neither a numpy nor a torch dtype.

        See Also
        --------
            :func:`~ivbase.transformers.features.FingerprintsTransformer.transform`

        """
        fp, ids = super(FingerprintsTransformer, self).__call__(mols, **kwargs)
        if is_dtype_numpy_array(dtype):
            fp = np.array(fp, dtype=dtype)
        elif is_dtype_torch_tensor(dtype):
            fp = totensor(fp, gpu=cuda, dtype=dtype)
        else:
            raise TypeError('The type {} is not supported'.format(dtype))
        return fp, ids
def convert_to_rdkit(SMILES):
    """Return the default RDKit fingerprint of ``SMILES`` as a list of floats.

    The molecule is built with ``MS``, fingerprinted with ``RDKFingerprint``
    and the characters of the fingerprint's bit string are converted to
    floats, one list element per character.
    """
    molecule = MS(SMILES)
    bit_string = RDKFingerprint(molecule).ToBitString()
    as_floats = np.asarray(list(bit_string), dtype=float)
    return list(as_floats)
class FingerprintsTransformer(MoleculeTransformer):
    """Molecule transformer producing molecular fingerprints.

    Parameters
    ----------
    kind : {'atom_pair', 'topological_torsion', 'morgan_circular', 'estate',
            'avalon_bit', 'avalon_count', 'erg', 'rdkit', 'maccs'},
            optional, default='morgan_circular'
        Name of the fingerprinting technique used
    length : int, optional, default=2000
        Length of the fingerprint to use.  Forwarded to the fingerprint
        function as ``fpSize`` for 'rdkit' and as ``nBits`` otherwise; the
        'estate', 'erg' and 'maccs' functions ignore it.

    Attributes
    ----------
    kind : str
        Name of the fingerprinting technique used
    length : int
        Length of the fingerprint to use
    fpfun : function
        function to call to compute the fingerprint
    """

    # Each entry takes (molecule, params) where ``params`` is the keyword
    # dictionary built in ``__init__``.
    # Disabled entry: 'physiochemical' (GetBPFingerprint).
    mapping = OrderedDict(
        atom_pair=lambda x, params: GetHashedAtomPairFingerprintAsBitVect(
            x, **params),
        topological_torsion=lambda x, params:
        GetHashedTopologicalTorsionFingerprintAsBitVect(x, **params),
        morgan_circular=lambda x, params: GetMorganFingerprintAsBitVect(
            x, 2, **params),
        estate=lambda x, params: FingerprintMol(x)[0],
        avalon_bit=lambda x, params: GetAvalonFP(x, **params),
        avalon_count=lambda x, params: GetAvalonCountFP(x, **params),
        erg=lambda x, params: GetErGFingerprint(x),
        rdkit=lambda x, params: RDKFingerprint(x, **params),
        maccs=lambda x, params: GetMACCSKeysFingerprint(x))

    def __init__(self, kind='morgan_circular', length=2000):
        super(FingerprintsTransformer, self).__init__()
        known = FingerprintsTransformer.mapping
        if not isinstance(kind, str) or kind not in known:
            raise ValueError("Argument kind must be in: " +
                             ', '.join(known.keys()))
        self.kind = kind
        self.length = length
        # Looked up through ``self`` so a subclass overriding ``mapping`` is
        # honoured; the check below covers a kind missing from that override.
        self.fpfun = self.mapping.get(kind, None)
        if not self.fpfun:
            raise ValueError("Fingerprint {} is not offered".format(kind))
        # RDKFingerprint names its size argument differently from the others.
        size_keyword = 'fpSize' if kind == 'rdkit' else 'nBits'
        self._params = {size_keyword: length}

    def _transform(self, mol):
        """Transform a molecule into a fingerprint vector

        Parameters
        ----------
        mol: rdkit.Chem.Mol or None
            The molecule of interest.  ``None`` triggers a warning and yields
            a zero vector of ``self.length`` entries.

        Returns
        -------
        fp : np.ndarray
            The computed fingerprint
        """
        if mol is None:
            warnings.warn("None value received for argument mol")
            return np.zeros(self.length)
        raw = self.fpfun(mol, self._params)
        if isinstance(raw, ExplicitBitVect):
            return explicit_bit_vect_to_array(raw)
        return np.array(list(raw))

    def transform(self, mols):
        """Transform a batch of molecule into a fingerprint vectors

        Parameters
        ----------
        mols: (str or rdkit.Chem.Mol) list
            The list of smiles or molecule

        Returns
        -------
        fp : np.ndarray
            The parent class's ``transform(mols, as_numpy=True)`` result
            wrapped in ``np.array``
        """
        parent = super(FingerprintsTransformer, self)
        return np.array(parent.transform(mols, as_numpy=True))
def rdk(fpSize=2048):
    """Return a one-argument callable that computes an RDKit fingerprint.

    The returned callable forwards its argument to ``RDKFingerprint`` with the
    ``fpSize`` captured here.
    """
    def fingerprint_of(x):
        return RDKFingerprint(x, fpSize=fpSize)

    return fingerprint_of