Esempio n. 1
0
def classify(sdf, label, lambdas):
    """Tag every molecule of an SD file with a class derived from a property.

    Each molecule read from ``sdf`` has its ``label`` property converted with
    ``floatify`` and tested against the predicates in ``lambdas`` (in the
    dict's iteration order).  The key of the first predicate returning a
    truthy value is stored on the molecule as property ``label + "_class"``
    and the molecule is written to ``"<stem>_class.sdf"``, where ``<stem>``
    is the part of ``sdf`` before the first occurrence of ``".sdf"``.

    Molecules that cannot be read, that lack the property, whose property
    cannot be converted, or that match no predicate are reported on stderr
    and skipped.  If writing a molecule fails, the molecule is rebuilt from
    its ``SMILES`` property with fresh 2D coordinates and written instead;
    if that fails as well the molecule is skipped.

    Parameters
    ----------
    sdf : str
        Path of the input SD file.
    label : str
        Name of the molecule property to classify.
    lambdas : dict
        Maps a class name to a one-argument predicate that receives the
        converted property value.  The class name is passed to
        ``mol.SetProp`` -- presumably it has to be a string.
    """
    new_filename = "%s_class.sdf" % sdf.split('.sdf')[0]
    new_label = label + "_class"
    sdm = ForwardSDMolSupplier(sdf,
                               strictParsing=False,
                               removeHs=False,
                               sanitize=False)
    sdw = SDWriter(new_filename)
    # try/finally guarantees the output file is closed (and flushed) even if
    # a predicate or an RDKit call raises part-way through the input.
    try:
        for counter, mol in enumerate(sdm):
            print(counter)
            sys.stdout.flush()
            if mol is None:
                print("%d rdkit couldn't read molecule" % counter,
                      file=sys.stderr)
                sys.stderr.flush()
                continue
            if not mol.HasProp(label):
                # GetProp would raise KeyError and abort the whole run.
                print("%d molecule has no property '%s' ..skip" %
                      (counter, label),
                      file=sys.stderr)
                sys.stderr.flush()
                continue
            raw_value = mol.GetProp(label)
            prop = floatify(raw_value)
            if prop is None:
                print("couldn't convert %s to float or int...skip" %
                      raw_value,
                      file=sys.stderr)
                sys.stderr.flush()
                continue
            c = None
            for k, predicate in lambdas.items():
                if predicate(prop):
                    c = k
                    print("hit %s" % k)
                    sys.stdout.flush()
                    break
            if c is None:
                print("%d no prop range matched '%s' ..skip" %
                      (counter, raw_value),
                      prop,
                      type(prop),
                      file=sys.stderr)
                sys.stderr.flush()
                sys.stdout.flush()
                continue
            mol.SetProp(new_label, c)
            try:
                sdw.write(mol)
                continue
            except Exception:
                # The write-failure messages have always reported the
                # 1-based position (index + 1); kept for stable log output.
                print(
                    "couldn't write mol %d to file, try to build mol from smiles"
                    % (counter + 1),
                    file=sys.stderr)
            # Fallback: rebuild from SMILES.  Any failure here (missing SMILES
            # property, unparsable SMILES -> None, coordinate generation or
            # the write itself) only skips this molecule.
            # NOTE: the rebuilt molecule carries only the class property.
            try:
                rebuilt = MolFromSmiles(mol.GetProp("SMILES"))
                AllChem.Compute2DCoords(rebuilt)
                rebuilt.SetProp(new_label, c)
                sdw.write(rebuilt)
            except Exception:
                print("couldn't write mol %d to file...skip" % (counter + 1),
                      file=sys.stderr)
    finally:
        sdw.close()
Esempio n. 2
0
 def setUp(self):
   """Load the 'problematic' PubChem fixtures and silence RDKit warnings.

   ``self.dataset['problematic']`` becomes a ForwardSDMolSupplier over the
   gzipped SD file; ``self.dataset_inchi['problematic']`` becomes the object
   unpickled from the companion ``.inchi`` file.
   """
   self.dataset = {}
   self.dataset_inchi = {}
   test_data_dir = os.path.join(RDConfig.RDCodeDir, 'Chem/test_data')

   # The gzip stream is handed to the supplier and not closed here --
   # presumably the supplier reads from it lazily while the tests iterate.
   sdf_stream = gzip.open(os.path.join(test_data_dir, 'pubchem-hard-set.sdf.gz'), 'r')
   self.dataset['problematic'] = ForwardSDMolSupplier(sdf_stream, sanitize=False,
                                                      removeHs=False)

   # The pickle is read in text mode, its line endings are normalised to
   # '\n', and the text is turned back into bytes via latin1 before loading.
   with open(os.path.join(test_data_dir, 'pubchem-hard-set.inchi'), 'r') as inchi_file:
     raw_text = inchi_file.read()
   pickled = raw_text.replace('\r\n', '\n').encode('latin1')
   self.dataset_inchi['problematic'] = pickle.loads(pickled, encoding='latin1')

   # disable logging
   RDLogger.DisableLog('rdApp.warning')
def open_molecule_file(uploadedfile, logfile=os.devnull, filetype=None):
    """Load the single molecule contained in an uploaded SD or Mol file.

    Parameters
    ----------
    uploadedfile : file-like object
        Must provide ``name`` and ``seek``; it is handed to
        ``ForwardSDMolSupplier``.  When the file type has to be inferred
        from the extension, the result is stored on ``uploadedfile.filetype``.
    logfile : path or file, optional
        Target passed to ``stdout_redirected`` for both ``sys.stdout`` and
        ``sys.stderr`` while the molecule is loaded.  Defaults to
        ``os.devnull``.
    filetype : str, optional
        ``'sdf'`` or ``'mol'``.  When None, ``uploadedfile.filetype`` is used
        if it is set; otherwise the type is looked up from the file
        extension in ``MOLECULE_EXTENSION_TYPES``.

    Returns
    -------
    The molecule, after ``AssignAtomChiralTagsFromStructure`` has been
    applied to it with ``replaceExistingTags=False``.

    Raises
    ------
    InvalidMoleculeFileExtension
        The extension is unknown, or the resolved file type is neither
        ``'sdf'`` nor ``'mol'``.
    MultipleMoleculesinSDF
        The supplier yields a second record.
    ParsingError
        The file is empty or its first record could not be parsed.
    """
    if filetype is None:
        # Attribute lookup: a membership test (`"filetype" in uploadedfile`)
        # would scan the file's *content* instead of checking the attribute.
        filetype = getattr(uploadedfile, "filetype", None)
        if filetype is None:
            ext = os.path.splitext(uploadedfile.name)[1].lower().strip('.')
            if ext not in MOLECULE_EXTENSION_TYPES:
                raise InvalidMoleculeFileExtension(ext=ext)
            filetype = MOLECULE_EXTENSION_TYPES[ext]
            uploadedfile.filetype = filetype

    # Only SD/Mol input is implemented below; fail explicitly rather than
    # with an UnboundLocalError on `mol` further down.
    if filetype not in ('sdf', 'mol'):
        raise InvalidMoleculeFileExtension(ext=filetype)

    with stdout_redirected(to=logfile, stdout=sys.stderr):
        with stdout_redirected(to=logfile, stdout=sys.stdout):
            print('Loading molecule...')
            uploadedfile.seek(0)

            suppl = ForwardSDMolSupplier(uploadedfile, removeHs=False)
            try:
                # Default of None turns an empty file into a ParsingError
                # below instead of leaking a bare StopIteration.
                mol = next(suppl, None)
                try:
                    next(suppl)
                except StopIteration:
                    pass
                else:
                    raise MultipleMoleculesinSDF()
            finally:
                del suppl
            if mol is None:
                if filetype == 'sdf':
                    raise ParsingError("Invalid SDFile file.")
                else:
                    raise ParsingError("Invalid MDL Mol file.")
            print('Assigning chirality from struture...')
            AssignAtomChiralTagsFromStructure(mol, replaceExistingTags=False)
            print('Finished loading molecule.')

    return mol
Esempio n. 4
0
def read_sdf(sdf_file, requires_length=False):
    """Wrap the molecules of an sdf file in a supplier.

    Parameters
    ----------
    sdf_file: An open file-like object holding the sdf data
    requires_length: If True the records are counted with
        sdf_count and an EnumeratedMolSupplier carrying that
        count is returned, i.e. when monitoring progress

    Returns
    -------
    a MolSupplier when no length is required, otherwise an
    EnumeratedMolSupplier
    """

    forward_supplier = ForwardSDMolSupplier(sdf_file)
    if requires_length:
        n_records = sdf_count(sdf_file)
        # rewind after counting so the supplier starts at the first record
        sdf_file.seek(0)
        return EnumeratedMolSupplier(forward_supplier, n_records)
    return MolSupplier(forward_supplier)
Esempio n. 5
0
def read_sdf(sdf_file, requires_length=False):
    """Create a molecule supplier for an open SDF.

    Parameters
    ----------
    sdf_file : file-like object
        An open SDF.
    requires_length : bool, optional
        If True the records are counted with ``sdf_count``
        and an EnumeratedMolSupplier carrying that count is
        returned, i.e. when monitoring progress. The default
        is False.

    Returns
    -------
    MolSupplier or EnumeratedMolSupplier

    """
    sd_reader = ForwardSDMolSupplier(sdf_file)
    if requires_length:
        total = sdf_count(sdf_file)
        # Rewind after counting so reading starts at the first record.
        sdf_file.seek(0)
        wrapped = EnumeratedMolSupplier(sd_reader, total)
    else:
        wrapped = MolSupplier(sd_reader)
    return wrapped
 def _read_sdf() -> Iterator[Mol]:
     """Return an iterator over the molecules supplied from ``fileobj``.

     ``fileobj`` and ``removeHs`` are not defined here; they are resolved
     from an enclosing scope.
     """
     return iter(ForwardSDMolSupplier(fileobj, removeHs=removeHs))
Esempio n. 7
0
import warnings
from tqdm import tqdm

import gzip
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import ForwardSDMolSupplier

from itertools import islice

from nfp.preprocessing import MolAPreprocessor, GraphSequence

# Collect (id, molecule, atom count) for every record the supplier yields
# as a truthy molecule; the id is the integer value of the '_Name' property.
mols = []
with gzip.open('../../../../data/DFT8K/DFT.sdf.gz', 'r') as sdfile:
    mol_supplier = ForwardSDMolSupplier(sdfile, removeHs=False, sanitize=False)
    for mol in tqdm(mol_supplier):
        if not mol:
            continue
        mols.append((int(mol.GetProp('_Name')), mol, mol.GetNumAtoms()))

# Molecule table indexed by mol_id.
mols = pd.DataFrame(
    mols, columns=['mol_id', 'Mol', 'n_atoms']).set_index('mol_id', drop=True)

df = pd.read_csv('../../../../data/DFT8K/DFT8K.csv.gz', index_col=0)
# keep only the rows whose atom_type is 6
is_type_6 = df.atom_type == 6
df = df.loc[is_type_6]

# Attach to every remaining row the molecule that its mol_id refers to.
df['Mol'] = mols['Mol'].reindex(df.mol_id).values

grouped_df = df.groupby(['mol_id'])
df_Shift = []
Esempio n. 8
0
import gzip
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import ForwardSDMolSupplier

from itertools import islice

from nfp.preprocessing import MolPreprocessor, GraphSequence
from sklearn.preprocessing import RobustScaler

df = pd.read_csv('../data/qm9.csv.gz')
df.index = df['index'].apply(lambda x: 'gdb_{}'.format(x))

# Number of rows in the CSV; used only as the progress-bar total below.
total_mols = len(df)

# Collect (name, molecule, atom count) for every record the supplier yields
# as a truthy molecule.  The context manager closes the gzip handle once the
# supplier has been consumed (it was previously left open).
mols = []
with gzip.open('../data/gdb9.sdf.gz') as f:
    mol_supplier = ForwardSDMolSupplier(f, removeHs=False)
    for mol in tqdm(mol_supplier, total=total_mols):
        if mol:
            mols.append((mol.GetProp('_Name'), mol, mol.GetNumAtoms()))

mols = pd.DataFrame(mols, columns=['mol_id', 'Mol', 'n_atoms'])

# Split: 10000 molecules for test, 10000 of the remainder for validation,
# and everything else (shuffled via frac=1.) for training.
test = mols.sample(10000, random_state=0)
not_in_test = ~mols.mol_id.isin(test.mol_id)
valid = mols[not_in_test].sample(10000, random_state=0)
train = mols[not_in_test
             & ~mols.mol_id.isin(valid.mol_id)].sample(frac=1.,
                                                       random_state=0)