def truncated_Estate_featurizer(mol_list, return_names=False):
    """Build truncated EState fingerprints for a list of molecules.

    For each molecule, the first element of ``FingerprintMol``'s result is
    sliced to positions 6..36 (31 values).

    Parameters
    ----------
    mol_list : iterable
        Molecules accepted by ``FingerprintMol``.
    return_names : bool, optional
        When truthy, also return the 31 labels matching the kept positions.
        (Default value = False)

    Returns
    -------
    X : np.ndarray
        One row of 31 values per molecule. An empty ``mol_list`` gives an
        empty 1-D array.
    (Estate_names, X) : tuple
        Returned instead of ``X`` when ``return_names`` is truthy.
    """
    X = np.array([FingerprintMol(mol)[0][6:37] for mol in mol_list])

    # Plain truthiness instead of ``== True`` so that any truthy flag works.
    if not return_names:
        return X

    # Labels contain LaTeX-style escapes ($<$, \#).
    # NOTE(review): '=0' and '-0-' use the digit zero and '=c=' is lower
    # case -- possibly typos for '=O', '-O-' and '=C='. Left unchanged
    # because callers may rely on the exact strings; confirm.
    Estate_names = [
        '-CH3', '=CH2', '—CH2—', '\\#CH', '=CH-', 'aCHa', '>CH-', '=c=',
        '\\#C-', '=C$<$', 'aCa', 'aaCa', '$>$C$<$', '-NH3[+1]', '-NH2',
        '-NH2-[+1]', '=NH', '-NH-', 'aNHa', '\\#N', '$>$NH-[+1]', '=N—',
        'aNa', '$>$N—', '—N$<$$<$', 'aaNs', '$>$N$<$[+1]', '-OH', '=0',
        '-0-', 'aOa',
    ]
    return Estate_names, X
# Print out the SMILE data to verify proper read in SMILE (Special characters in SMILE are properly stored.) # Also calculate the corresponding Morgan finger print for Gap_opt_RF_SMILE in RF_on_gap_data1['SMILE']: #print (Gap_opt_RF_SMILE) # translate mol from its SMILE formula mol = Chem.MolFromSmiles(Gap_opt_RF_SMILE) # calculate the Morgan fingerprint #print(AllChem.GetMorganFingerprintAsBitVect(mol,2,nBits=1024).ToBitString()) Morgan_fingerprint.append( AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)) # calculate the Estate fingerprint #print(FingerprintMol(mol)[0]) Estate_fingerprint.append(FingerprintMol(mol)[0]) # calculate the RDKit fingerprint RDKit_fingerprint.append(RDKFingerprint(mol, fpSize=1024)) # Morgan_fingerprint and bandgaps using RF model # use sorted(sklearn.metrics.SCORERS.keys()) to find what are available in sklearn lib RF_on_gap_Morgan = GridSearchCV( RandomForestRegressor(), cv=8, param_grid={"n_estimators": np.linspace(50, 300, 25).astype('int')}, scoring='neg_mean_absolute_error', n_jobs=-1) RF_on_gap_Morgan.fit(Morgan_fingerprint, Experimental_Gap) Best_RF_on_gap_Morgan = RF_on_gap_Morgan.best_estimator_
def fp_Estate_and_mw(mol):
    """Return the truncated EState fingerprint with molecular weight appended.

    Positions 6..36 of the first element of ``FingerprintMol(mol)`` are
    followed by ``Descriptors.MolWt(mol)`` as one extra trailing value.
    """
    estate_slice = FingerprintMol(mol)[0][6:37]
    mol_weight = Descriptors.MolWt(mol)
    return np.append(estate_slice, mol_weight)
def fp_Estate_reals(mol):
    """Return positions 6..36 of the second element of ``FingerprintMol(mol)``."""
    estate_result = FingerprintMol(mol)
    second_part = estate_result[1]
    return second_part[6:37]
def fp_Estate_ints(mol):
    """Return positions 6..36 of the first element of ``FingerprintMol(mol)``."""
    estate_result = FingerprintMol(mol)
    first_part = estate_result[0]
    return first_part[6:37]
def truncated_Estate_fp(mol_list):
    """Stack the truncated EState fingerprints of several molecules.

    Each row holds positions 6..36 of the first element of
    ``FingerprintMol`` for the corresponding molecule in ``mol_list``.
    """
    rows = []
    for molecule in mol_list:
        first_part = FingerprintMol(molecule)[0]
        rows.append(first_part[6:37])
    return np.array(rows)
class FingerprintsTransformer(MoleculeTransformer):
    r"""
    Fingerprint molecule transformer.
    This transformer is able to compute various fingerprints regularly used
    in QSAR modeling.

    Arguments
    ----------
        kind: str, optional
            Name of the fingerprinting method used. Should be one of the
            keys of ``MAPPING``:
            {'atom_pair', 'topological_torsion', 'ecfp2', 'ecfp4', 'ecfp6',
            'estate', 'avalon_bit', 'avalon_count', 'erg', 'rdkit', 'maccs'}
            (Default value = 'ecfp2')
        length: int, optional
            Length of the fingerprint to use. It is forwarded as ``fpSize``
            for 'rdkit' and as ``nBits`` otherwise; the 'estate', 'erg' and
            'maccs' entries do not use it.
            (Default value = 4096)

    Attributes
    ----------
        kind: str
            Name of the fingerprinting technique used
        length: int
            Length of the fingerprint to use
        fpfun: function
            function to call to compute the fingerprint
    """
    # Each entry takes (mol, params); entries that have no size parameter
    # simply ignore ``params``.
    MAPPING = OrderedDict(
        # global_properties=lambda x, params: augmented_mol_properties(x),
        # physiochemical=lambda x: GetBPFingerprint(x),
        atom_pair=lambda x, params: GetHashedAtomPairFingerprintAsBitVect(
            x, **params),
        topological_torsion=lambda x, params:
        GetHashedTopologicalTorsionFingerprintAsBitVect(x, **params),
        ecfp2=lambda x, params: GetMorganFingerprintAsBitVect(x, 1, **params),
        ecfp4=lambda x, params: GetMorganFingerprintAsBitVect(x, 2, **params),
        ecfp6=lambda x, params: GetMorganFingerprintAsBitVect(x, 3, **params),
        estate=lambda x, params: FingerprintMol(x)[0],
        avalon_bit=lambda x, params: GetAvalonFP(x, **params),
        avalon_count=lambda x, params: GetAvalonCountFP(x, **params),
        erg=lambda x, params: GetErGFingerprint(x),
        rdkit=lambda x, params: RDKFingerprint(x, **params),
        maccs=lambda x, params: GetMACCSKeysFingerprint(x))

    def __init__(self, kind='ecfp2', length=4096):
        super(FingerprintsTransformer, self).__init__()
        if not (isinstance(kind, str)
                and (kind in FingerprintsTransformer.MAPPING)):
            raise ValueError(
                "Argument kind must be in: " +
                ', '.join(FingerprintsTransformer.MAPPING.keys()))
        self.kind = kind
        self.length = length
        # Looked up through ``self`` so a subclass may override MAPPING; the
        # second check catches a kind missing from such an override.
        self.fpfun = self.MAPPING.get(kind, None)
        if not self.fpfun:
            raise ValueError("Fingerprint {} is not offered".format(kind))
        # The size keyword is 'fpSize' for the rdkit entry, 'nBits' otherwise.
        self._params = {}
        self._params.update({
            ('fpSize' if kind == 'rdkit' else 'nBits'): length
        })

    def _transform(self, mol):
        r"""
        Transforms a molecule into a fingerprint vector

        :raises ValueError: when the input molecule is None

        Arguments
        ----------
            mol: rdkit.Chem.Mol
                Molecule of interest

        Returns
        -------
            fp: array-like
                The computed fingerprint: the result of
                ``explicit_bit_vect_to_array`` for an ``ExplicitBitVect``,
                otherwise a plain ``list`` of the fingerprint values.
        """
        if mol is None:
            raise ValueError("Expecting a Chem.Mol object, got None")
        # expect cryptic rdkit errors here if this fails, #rdkitdev
        fp = self.fpfun(mol, self._params)
        if isinstance(fp, ExplicitBitVect):
            fp = explicit_bit_vect_to_array(fp)
        else:
            fp = list(fp)
        return fp

    def transform(self, mols, **kwargs):
        r"""
        Transforms a batch of molecules into fingerprint vectors.

        .. note::
            The recommended way is to use the object as a callable.

        Arguments
        ----------
            mols: (str or rdkit.Chem.Mol) iterable
                List of SMILES or molecules
            kwargs: named parameters for transform (accepted but not used
                by this implementation)

        Returns
        -------
            fp: array
                computed fingerprints of size NxD, where D is the requested
                length of features and N is the number of input molecules
                that have been successfully featurized.

        See Also
        --------
            :func:`~ivbase.transformers.features.MoleculeTransformer.transform`
        """
        mol_list = [self.to_mol(mol, addHs=False) for mol in mols]
        # Drop entries that are None. An explicit identity test replaces
        # ``filter(None.__ne__, ...)``, which relied on the truth value of
        # ``NotImplemented`` (deprecated since Python 3.9, TypeError in 3.14).
        mol_list = [mol for mol in mol_list if mol is not None]
        features = np.array([self._transform(mol)
                             for mol in mol_list]).astype(np.float32)
        features = totensor(features, gpu=False)
        return features

    def __call__(self, mols, dtype=torch.long, cuda=False, **kwargs):
        r"""
        Transforms a batch of molecules into fingerprint vectors,
        and return the transformation in the desired data type format as
        well as the set of valid indexes.

        Arguments
        ----------
            mols: (str or rdkit.Chem.Mol) iterable
                The list of input smiles or molecules
            dtype: torch.dtype or numpy.dtype, optional
                Datatype of the transformed variable. A numpy dtype yields a
                numpy array and a torch dtype yields a tensor; anything
                else raises ``TypeError``.
                (Default value = torch.long)
            cuda: bool, optional
                Whether to transfer tensor on the GPU (if output is a tensor)
            kwargs: named parameters forwarded to the parent ``__call__``

        Returns
        -------
            fp: array
                computed fingerprints (in `dtype` datatype) of size NxD,
                where D is the requested length of features and N is the
                number of input molecules that have been successfully
                featurized.
            ids: array
                all valid molecule positions that did not fail during
                featurization

        Raises
        ------
            TypeError: when ``dtype`` is neither a numpy nor a torch dtype

        See Also
        --------
            :func:`~ivbase.transformers.features.FingerprintsTransformer.transform`
        """
        fp, ids = super(FingerprintsTransformer, self).__call__(mols, **kwargs)
        if is_dtype_numpy_array(dtype):
            fp = np.array(fp, dtype=dtype)
        elif is_dtype_torch_tensor(dtype):
            fp = totensor(fp, gpu=cuda, dtype=dtype)
        else:
            raise TypeError('The type {} is not supported'.format(dtype))
        return fp, ids
def estate_fingerprint(mol):
    """Return the first element of ``FingerprintMol(mol)``."""
    estate_result = FingerprintMol(mol)
    return estate_result[0]
mymols = make_molecules(cno) #Make sum over bonds descriptor bond_types, bonds_in_molecule = sum_over_bonds(mymols) np.savetxt("sum_over_bonds.out", bonds_in_molecule, delimiter=" ") #*********** Generate Estate indices************************ # #Note that there are 79 possible Estate descriptors, #however only a subset are non-zero for the Huang-Massa/Mathieu dataset so I #remove the null vectors using scrub_null_columns() num_smiles = len(smi) icount = 0 estate_fingers = np.zeros((num_smiles, 79)) #There are 79 possible descriptors while icount < num_smiles: m = Chem.MolFromSmiles(smi[icount]) counts, sums = FingerprintMol(m) estate_fingers[icount, :] = np.transpose( counts) #can also use sums as descriptor icount += 1 nz_estate = scrub_null_columns(estate_fingers) np.savetxt("nz_estate.out", nz_estate, delimiter=" ") # # #**********Done with Estate Generation************************** # Make Morgan fingerprints using Dan's code dan_prints = make_fingerprints(mymols) morgan_prints = np.asarray(dan_prints[2].x) np.savetxt("morgan_prints.out", morgan_prints, delimiter=" ")
class FingerprintsTransformer(MoleculeTransformer):
    """Turn molecules into molecular fingerprints.

    Parameters
    ----------
    kind : {'atom_pair', 'topological_torsion', 'morgan_circular', 'estate',
        'avalon_bit', 'avalon_count', 'erg', 'rdkit', 'maccs'}, optional,
        default='morgan_circular'
        Name of the fingerprinting technique; must be a key of ``mapping``.
    length : int, optional, default=2000
        Requested fingerprint length. Passed on as ``fpSize`` for 'rdkit'
        and as ``nBits`` for every other kind.

    Attributes
    ----------
    kind : str
        Name of the fingerprinting technique used
    length : int
        Length of the fingerprint to use
    fpfun : function
        Callable taking ``(mol, params)`` that computes the fingerprint
    """

    # physiochemical (GetBPFingerprint) is intentionally not offered.
    mapping = OrderedDict([
        ('atom_pair',
         lambda x, params: GetHashedAtomPairFingerprintAsBitVect(x, **params)),
        ('topological_torsion',
         lambda x, params: GetHashedTopologicalTorsionFingerprintAsBitVect(
             x, **params)),
        ('morgan_circular',
         lambda x, params: GetMorganFingerprintAsBitVect(x, 2, **params)),
        ('estate', lambda x, params: FingerprintMol(x)[0]),
        ('avalon_bit', lambda x, params: GetAvalonFP(x, **params)),
        ('avalon_count', lambda x, params: GetAvalonCountFP(x, **params)),
        ('erg', lambda x, params: GetErGFingerprint(x)),
        ('rdkit', lambda x, params: RDKFingerprint(x, **params)),
        ('maccs', lambda x, params: GetMACCSKeysFingerprint(x)),
    ])

    def __init__(self, kind='morgan_circular', length=2000):
        super(FingerprintsTransformer, self).__init__()
        known_kinds = FingerprintsTransformer.mapping
        if not isinstance(kind, str) or kind not in known_kinds:
            raise ValueError("Argument kind must be in: " +
                             ', '.join(known_kinds.keys()))
        self.kind = kind
        self.length = length
        self.fpfun = self.mapping.get(kind, None)
        if not self.fpfun:
            raise ValueError("Fingerprint {} is not offered".format(kind))
        # Only the rdkit entry names its size argument 'fpSize'.
        size_keyword = 'fpSize' if kind == 'rdkit' else 'nBits'
        self._params = {size_keyword: length}

    def _transform(self, mol):
        """Compute the fingerprint vector of a single molecule.

        Parameters
        ----------
        mol : rdkit.Chem.Mol or None
            The molecule of interest. ``None`` triggers a warning and
            produces a zero vector of size ``self.length``.

        Returns
        -------
        fp : np.ndarray
            The computed fingerprint
        """
        if mol is None:
            warnings.warn("None value received for argument mol")
            return np.zeros(self.length)
        raw_fp = self.fpfun(mol, self._params)
        if isinstance(raw_fp, ExplicitBitVect):
            return explicit_bit_vect_to_array(raw_fp)
        return np.array(list(raw_fp))

    def transform(self, mols):
        """Compute fingerprint vectors for a batch of molecules.

        Parameters
        ----------
        mols : (str or rdkit.Chem.Mol) list
            The list of smiles or molecules

        Returns
        -------
        fp : np.ndarray
            The parent class's ``transform`` output (requested with
            ``as_numpy=True``) wrapped in ``np.array``
        """
        parent_output = super(FingerprintsTransformer, self).transform(
            mols, as_numpy=True)
        return np.array(parent_output)