def classify(sdf, label, lambdas):
    """Assign a class to every molecule of an SD file and write the result.

    For each molecule that RDKit can read, the ``label`` property is
    converted with ``floatify`` and tested against the predicates in
    ``lambdas``; the key of the first predicate that accepts the value is
    stored on the molecule as the ``<label>_class`` property.  Classified
    molecules are written to ``<stem>_class.sdf``, where ``<stem>`` is
    ``sdf`` up to its first ``.sdf``.  Molecules that cannot be read,
    lack the property, cannot be converted, match no predicate or cannot
    be written are reported on stderr and skipped.

    Parameters
    ----------
    sdf : str
        Path of the input SD file.
    label : str
        Name of the molecule property holding the value to classify.
    lambdas : dict
        Maps class name -> predicate called with the converted value.
        Predicates are tried in dict iteration order; the first hit wins.

    Returns
    -------
    None
    """
    new_filename = "%s_class.sdf" % sdf.split('.sdf')[0]
    new_label = label + "_class"
    sdm = ForwardSDMolSupplier(sdf, strictParsing=False, removeHs=False,
                               sanitize=False)
    sdw = SDWriter(new_filename)
    try:
        for counter, mol in enumerate(sdm):
            # Progress output: 0-based index of the record being processed.
            print(counter)
            sys.stdout.flush()
            if mol is None:
                print("%d rdkit couldn't read molecule" % counter,
                      file=sys.stderr)
                sys.stderr.flush()
                continue

            # A record without the property is skipped like any other
            # unusable record instead of aborting the run with a KeyError.
            if not mol.HasProp(label):
                print("%d molecule has no property '%s' ..skip"
                      % (counter, label), file=sys.stderr)
                sys.stderr.flush()
                continue

            raw_value = mol.GetProp(label)
            prop = floatify(raw_value)
            if prop is None:
                print("couldn't convert %s to float or int...skip" % raw_value,
                      file=sys.stderr)
                sys.stderr.flush()
                continue

            c = None
            for name, predicate in lambdas.items():
                if predicate(prop):
                    c = name
                    print("hit %s" % name)
                    sys.stdout.flush()
                    break
            if c is None:
                print("%d no prop range matched '%s' ..skip"
                      % (counter, raw_value), prop, type(prop),
                      file=sys.stderr)
                sys.stderr.flush()
                sys.stdout.flush()
                continue

            mol.SetProp(new_label, c)
            # The write messages use a 1-based index, as the original did.
            index = counter + 1
            try:
                sdw.write(mol)
            except Exception:
                print("couldn't write mol %d to file, try to build mol from "
                      "smiles" % index, file=sys.stderr)
                # The whole fallback sits in one try: a missing "SMILES"
                # property or an unparsable SMILES (MolFromSmiles returning
                # None) must skip this molecule, not crash the run.
                try:
                    rebuilt = MolFromSmiles(mol.GetProp("SMILES"))
                    AllChem.Compute2DCoords(rebuilt)
                    rebuilt.SetProp(new_label, c)
                    sdw.write(rebuilt)
                except Exception:
                    print("couldn't write mol %d to file...skip" % index,
                          file=sys.stderr)
    finally:
        # Always close the writer, even if iteration fails part-way.
        sdw.close()
def setUp(self):
    """Prepare the 'problematic' PubChem fixtures used by the tests.

    Populates ``self.dataset['problematic']`` with a molecule supplier over
    ``pubchem-hard-set.sdf.gz`` and ``self.dataset_inchi['problematic']``
    with the object unpickled from ``pubchem-hard-set.inchi``, then
    silences RDKit's ``rdApp.warning`` log channel.
    """
    test_data_dir = os.path.join(RDConfig.RDCodeDir, 'Chem/test_data')
    self.dataset = {}
    self.dataset_inchi = {}

    # The gzip handle is intentionally not closed here: the supplier is
    # handed the open file object and is consumed later by the tests
    # (presumably reading lazily -- confirm before adding a close()).
    inf = gzip.open(os.path.join(test_data_dir, 'pubchem-hard-set.sdf.gz'), 'r')
    self.dataset['problematic'] = ForwardSDMolSupplier(inf, sanitize=False,
                                                       removeHs=False)

    # The pickle is stored as text: normalise Windows line endings, then
    # recover the raw bytes via latin1 before unpickling.  The ``with``
    # block closes the file, so no explicit close() is needed.
    with open(os.path.join(test_data_dir, 'pubchem-hard-set.inchi'), 'r') as intF:
        buf = intF.read().replace('\r\n', '\n').encode('latin1')
    self.dataset_inchi['problematic'] = pickle.loads(buf, encoding='latin1')

    # disable logging
    RDLogger.DisableLog('rdApp.warning')
def open_molecule_file(uploadedfile, logfile=os.devnull, filetype=None):
    """Parse exactly one molecule from an uploaded SDF or MDL Mol file.

    Parameters
    ----------
    uploadedfile : file-like object
        Must provide ``name`` and ``seek``; a ``filetype`` attribute is
        read if present and set when the type is derived from the
        extension.
    logfile : str, optional
        Target passed to ``stdout_redirected`` for both stdout and stderr
        while parsing.  Defaults to ``os.devnull``.
    filetype : str, optional
        ``'sdf'`` or ``'mol'``.  When None, the file's ``filetype``
        attribute is used, falling back to a lookup of the file name
        extension in ``MOLECULE_EXTENSION_TYPES``.

    Returns
    -------
    The parsed molecule, with chiral tags assigned from its structure.

    Raises
    ------
    InvalidMoleculeFileExtension
        If the extension is unknown or the resolved type is not supported.
    MultipleMoleculesinSDF
        If the file contains more than one record.
    ParsingError
        If the file is empty or its only record cannot be parsed.
    """
    if filetype is None:
        # Attribute lookup, not ``"filetype" in uploadedfile``: ``in`` on a
        # file object tests its contents, so a cached type was never found.
        filetype = getattr(uploadedfile, 'filetype', None)
    if filetype is None:
        ext = os.path.splitext(uploadedfile.name)[1].lower().strip('.')
        if ext not in MOLECULE_EXTENSION_TYPES:
            raise InvalidMoleculeFileExtension(ext=ext)
        filetype = MOLECULE_EXTENSION_TYPES[ext]
        uploadedfile.filetype = filetype

    # Only these two formats are parsed below; any other type previously
    # fell through to an UnboundLocalError on ``mol``.
    if filetype not in ('sdf', 'mol'):
        raise InvalidMoleculeFileExtension(ext=filetype)

    no_more_records = object()
    with stdout_redirected(to=logfile, stdout=sys.stderr):
        with stdout_redirected(to=logfile, stdout=sys.stdout):
            print('Loading molecule...')
            uploadedfile.seek(0)
            suppl = ForwardSDMolSupplier(uploadedfile, removeHs=False)
            try:
                # An empty file yields None here and is reported as a
                # ParsingError below instead of leaking StopIteration.
                mol = next(suppl, None)
                # A sentinel is required: an unparsable second record is
                # yielded as None but still counts as a second molecule.
                if next(suppl, no_more_records) is not no_more_records:
                    raise MultipleMoleculesinSDF()
            finally:
                del suppl

            if mol is None:
                if filetype == 'sdf':
                    raise ParsingError("Invalid SDFile file.")
                raise ParsingError("Invalid MDL Mol file.")

            print('Assigning chirality from struture...')
            AssignAtomChiralTagsFromStructure(mol, replaceExistingTags=False)
            print('Finished loading molecule.')
    return mol
def read_sdf(sdf_file, requires_length=False):
    """Wrap an SDF file object in a molecule supplier.

    Parameters
    ----------
    sdf_file:
        A file-like object
    requires_length:
        If True, the number of entries is obtained with ``sdf_count`` and
        the file is rewound, so that the returned supplier carries a
        length (e.g. for progress monitoring)

    Returns
    -------
    a MolSupplier, or an EnumeratedMolSupplier when a length is required
    """
    mol_source = ForwardSDMolSupplier(sdf_file)
    if requires_length:
        n_mols = sdf_count(sdf_file)
        # Rewind after sdf_count has been given the file.
        sdf_file.seek(0)
        return EnumeratedMolSupplier(mol_source, n_mols)
    return MolSupplier(mol_source)
def read_sdf(sdf_file, requires_length=False):
    """Build a molecule supplier over an open SDF.

    Parameters
    ----------
    sdf_file : file-like object
        An open SDF.
    requires_length : bool, optional
        When True the entries are counted with ``sdf_count``, the file is
        rewound and an EnumeratedMolSupplier is returned, e.g. for
        monitoring progress. The default is False.

    Returns
    -------
    MolSupplier or EnumeratedMolSupplier
    """
    reader = ForwardSDMolSupplier(sdf_file)
    if requires_length:
        total = sdf_count(sdf_file)
        # Rewind after sdf_count has been given the file.
        sdf_file.seek(0)
        wrapped = EnumeratedMolSupplier(reader, total)
    else:
        wrapped = MolSupplier(reader)
    return wrapped
def _read_sdf() -> Iterator[Mol]:
    """Return an iterator over the molecules read from ``fileobj``.

    ``fileobj`` and ``removeHs`` are not parameters; both are taken from
    the enclosing scope.
    """
    return iter(ForwardSDMolSupplier(fileobj, removeHs=removeHs))
import warnings
import gzip
from itertools import islice

from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import ForwardSDMolSupplier
from nfp.preprocessing import MolAPreprocessor, GraphSequence

# Collect one (mol_id, Mol, n_atoms) tuple per usable SDF record; the
# molecule id is the integer stored in the record's '_Name' property.
mols = []
with gzip.open('../../../../data/DFT8K/DFT.sdf.gz', 'r') as sdfile:
    mol_supplier = ForwardSDMolSupplier(sdfile, removeHs=False, sanitize=False)
    for mol in tqdm(mol_supplier):
        if not mol:
            # Falsy entries (e.g. records yielded as None) are skipped.
            continue
        mols.append((int(mol.GetProp('_Name')), mol, mol.GetNumAtoms()))

# Index the molecule table by mol_id so it can be aligned with the CSV.
mols = pd.DataFrame(
    mols, columns=['mol_id', 'Mol', 'n_atoms']
).set_index('mol_id', drop=True)

df = pd.read_csv('../../../../data/DFT8K/DFT8K.csv.gz', index_col=0)

# Keep only rows whose atom_type equals 6.
# NOTE(review): the original comment read "only choose C and H", but this
# filter keeps atom_type == 6 only -- confirm which one is intended.
df = df.loc[df.atom_type == 6]

# Attach the matching Mol object to each remaining row.
df['Mol'] = mols.reindex(df.mol_id).Mol.values

grouped_df = df.groupby(['mol_id'])
df_Shift = []
import gzip
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import ForwardSDMolSupplier
from itertools import islice
from nfp.preprocessing import MolPreprocessor, GraphSequence
from sklearn.preprocessing import RobustScaler

# Property table; rows are re-indexed as 'gdb_<index>'.
df = pd.read_csv('../data/qm9.csv.gz')
df.index = df['index'].apply(lambda x: 'gdb_{}'.format(x))

# Collect one (mol_id, Mol, n_atoms) tuple per usable SDF record.  The
# archive is opened in a ``with`` block so the handle is closed once the
# supplier has been drained, instead of being left open for the rest of
# the script.
mols = []
total_mols = len(df)  # used only as the progress-bar total
with gzip.open('../data/gdb9.sdf.gz') as f:
    mol_supplier = ForwardSDMolSupplier(f, removeHs=False)
    for mol in tqdm(mol_supplier, total=total_mols):
        if mol:
            mols.append((mol.GetProp('_Name'), mol, mol.GetNumAtoms()))

mols = pd.DataFrame(mols, columns=['mol_id', 'Mol', 'n_atoms'])

# Fixed-seed split: 10000 test molecules, 10000 validation molecules drawn
# from the remainder, and everything else (shuffled) for training.
test = mols.sample(10000, random_state=0)
not_in_test = ~mols.mol_id.isin(test.mol_id)
valid = mols[not_in_test].sample(10000, random_state=0)
train = mols[not_in_test & ~mols.mol_id.isin(valid.mol_id)].sample(
    frac=1., random_state=0)