Example #1
0
def read_rd_mols_from_sdf_file(sdf_file, sanitize=True):
    """Read every record of an SDF file and wrap each one in ``Molecule``.

    Paths ending in ``.gz`` are decompressed on the fly and parsed with the
    forward-only supplier; any other path is handed to ``SDMolSupplier``.
    ``sanitize`` is forwarded unchanged to the RDKit supplier.
    """
    if not sdf_file.endswith('.gz'):
        supplier = Chem.SDMolSupplier(sdf_file, sanitize=sanitize)
        return [Molecule(entry) for entry in supplier]

    with gzip.open(sdf_file) as stream:
        supplier = Chem.ForwardSDMolSupplier(stream, sanitize=sanitize)
        # Materialise inside the ``with`` so the stream is still open.
        return [Molecule(entry) for entry in supplier]
Example #2
0
def ExtractMol(List, score_best, prefix):
    """Collect the molecules named in ``List`` and write them to one SDF.

    Each item of ``List`` is indexed positionally: item[0] is the sort key
    for the output, item[1] the molecule name to look up, item[2] the name
    of the file the entry came from (used to locate the SD file) and
    item[3] is carried along unchanged.

    ``score_best`` is accepted for interface compatibility but is not used.
    The selected molecules are written to ``prefix + '.sdf'``, ordered by
    item[0].  The program exits via ``sys.exit`` when no SD file matches.
    """
    # Group the entries by the file they belong to (sorted by that name).
    ConsScore = {}
    for Mol in sorted(List, key=lambda tup: tup[2]):
        ConsScore.setdefault(Mol[2], []).append(Mol)

    Saved_Mol = []
    for file_id in tqdm(ConsScore, total=len(ConsScore)):
        file_prefix = file_id.split('txt')[0]
        SDF = glob.glob(file_prefix + 'sdf*')
        if not SDF:
            sys.exit('{0} or related SD file not found.'.format(file_prefix +
                                                                'sdf*'))

        # Index the molecules of this SD file by name; the handle is closed
        # as soon as the file has been read.
        handle = file_handle(SDF[0])
        try:
            SDMol = {}
            for mol in Chem.ForwardSDMolSupplier(handle, removeHs=False):
                if mol is None:
                    continue
                # Names coming from docking look like "name:suffix"; keep the
                # part before the first colon (a name without ':' is kept
                # whole, since split() then returns it unchanged).
                SDMol[mol.GetProp('_Name').split(':')[0]] = mol
        finally:
            handle.close()

        for Mol in ConsScore[file_id]:
            try:
                pose = SDMol[Mol[1]]
            except KeyError:
                print('{0} is not registered in database. Skip.'.format(
                    Mol[1]))
                continue
            Saved_Mol.append([Mol[0], Mol[1], pose, Mol[3]])
        del SDMol  # release the per-file index before the next file

    # Sort all molecules on item[0] and write them out.
    saved_sdf = Chem.SDWriter(prefix + '.sdf')
    try:
        for M in sorted(Saved_Mol, key=lambda tup: tup[0]):
            saved_sdf.write(M[2])
        saved_sdf.flush()
    finally:
        saved_sdf.close()
Example #3
0
def LoadSDF(filename,
            idName='ID',
            molColName='ROMol',
            includeFingerprints=False,
            isomericSmiles=True,
            smilesName=None,
            embedProps=False,
            removeHs=True,
            strictParsing=True):
    '''Read file in SDF format and return as Pandas data frame.

    ``filename`` is either a path (a ``.gz`` suffix selects gzip) or an
    already-open binary file object; a file object passed in is not closed.
    If embedProps=True all properties also get embedded in Mol objects in the
    molecule column.
    If molColName=None molecules would not be present in resulting DataFrame
    (only properties would be read) and the records are not sanitized.
    If smilesName is given, a SMILES column with that name is added; records
    for which no SMILES can be generated get ``None`` there.
    The frame is indexed by the record position in the file; records that
    fail to parse are skipped.
    '''
    if isinstance(filename, str):
        if filename.lower()[-3:] == ".gz":
            import gzip
            f = gzip.open(filename, "rb")
        else:
            f = open(filename, 'rb')
        close = f.close
    else:
        f = filename
        close = None  # don't close an open file that was passed in

    records = []
    indices = []
    try:
        supplier = Chem.ForwardSDMolSupplier(
            f,
            sanitize=(molColName is not None),
            removeHs=removeHs,
            strictParsing=strictParsing)
        for i, mol in enumerate(supplier):
            if mol is None:
                continue
            prop_names = list(mol.GetPropNames())
            row = {k: mol.GetProp(k) for k in prop_names}
            if molColName is not None and not embedProps:
                # Properties live in the row; strip them from the molecule.
                for prop in prop_names:
                    mol.ClearProp(prop)
            if mol.HasProp('_Name'):
                row[idName] = mol.GetProp('_Name')
            if smilesName is not None:
                try:
                    row[smilesName] = Chem.MolToSmiles(
                        mol, isomericSmiles=isomericSmiles)
                except Exception:
                    log.warning(
                        'No valid smiles could be generated for molecule %s',
                        i)
                    row[smilesName] = None
            if molColName is not None:
                if includeFingerprints:
                    row[molColName] = _MolPlusFingerprint(mol)
                else:
                    row[molColName] = mol
            records.append(row)
            indices.append(i)
    finally:
        # Close a file we opened ourselves even when parsing fails.
        if close is not None:
            close()

    RenderImagesInAllDataFrames(images=True)
    return pd.DataFrame(records, index=indices)
Example #4
0
def get_dataframe_from_library(file_path, library, value, index='VPC ID'):
    """Build one frame from all parseable molecules of an SD file.

    Every non-``None`` record is passed to ``get_molstring_from_library``
    together with ``library``, ``value`` and ``index``; the pieces it returns
    are concatenated along axis 0.
    """
    supplier = Chem.ForwardSDMolSupplier(file_path)
    parsed = list(filter(lambda entry: entry is not None, supplier))
    pieces = [
        get_molstring_from_library(entry, library, value, index=index)
        for entry in parsed
    ]
    return pd.concat(pieces, axis=0)
def load_from_gzip(input_filepath, filename):
    """
    Read the gzipped SD file ``filename`` located in ``input_filepath``
    and return its successfully parsed molecules as a plain list
    (records that parse to None are dropped).
    """
    archive_path = os.path.join(input_filepath, filename)
    with gzip.open(archive_path) as handle:
        supplier = Chem.ForwardSDMolSupplier(handle)
        molecules = []
        for record in supplier:
            if record is not None:
                molecules.append(record)
        return molecules
Example #6
0
def process(
    refmol_filename,
    inputs_filename,
    outputs_filename,
    refmol_index=None,
    refmol_format=None,
    tani=False,
    score_mode=FeatMaps.FeatMapScoreMode.All,
):
    """Score every molecule of an SD input against a reference molecule.

    The reference is read with ``utils.read_single_molecule``.  Each input
    molecule that scores successfully gets the properties ``SuCOS_Score``,
    ``SuCOS_FeatureMap_Score`` and either ``SuCOS_Tanimoto_Score`` (when
    ``tani`` is true) or ``SuCOS_Protrude_Score``, and is written to the
    output.  Molecules that fail to parse are skipped; molecules whose
    scoring raises ``ValueError`` are logged and counted as errors.
    """
    ref_mol = utils.read_single_molecule(refmol_filename,
                                         index=refmol_index,
                                         format=refmol_format)
    ref_features = getRawFeatures(ref_mol)

    input_file = utils.open_file_for_reading(inputs_filename)
    try:
        suppl = Chem.ForwardSDMolSupplier(input_file)
        output_file = utils.open_file_for_writing(outputs_filename)
        try:
            count, total, errors = _sucos_score_all(suppl, output_file,
                                                    ref_mol, ref_features,
                                                    tani, score_mode)
        finally:
            output_file.close()
    finally:
        input_file.close()

    # ``count`` is every record seen, ``total`` the ones scored and written.
    utils.log("Completed.", count, "processed, ", total, "succeeded, ", errors,
              "errors")


def _sucos_score_all(suppl, output_file, ref_mol, ref_features, tani,
                     score_mode):
    """Score and write all molecules; return (seen, written, errors)."""
    writer = Chem.SDWriter(output_file)
    count = 0
    total = 0
    errors = 0
    try:
        for mol in suppl:
            count += 1
            if mol is None:
                continue
            try:
                sucos_score, fm_score, val3 = get_SucosScore(
                    ref_mol,
                    mol,
                    tani=tani,
                    ref_features=ref_features,
                    score_mode=score_mode,
                )
                mol.SetDoubleProp("SuCOS_Score", sucos_score)
                mol.SetDoubleProp("SuCOS_FeatureMap_Score", fm_score)
                third_name = ("SuCOS_Tanimoto_Score"
                              if tani else "SuCOS_Protrude_Score")
                mol.SetDoubleProp(third_name, val3)
                utils.log("Scores:", sucos_score, fm_score, val3)
                writer.write(mol)
                total += 1
            except ValueError as e:
                errors += 1
                # str(e) works on Python 2 and 3; e.message is Python 2 only.
                utils.log("Molecule", count, "failed to score:", str(e))
        writer.flush()
    finally:
        writer.close()
    return count, total, errors
Example #7
0
def read_sdf(
    urlpath: Union[str, os.PathLike, TextIO],
    as_df: bool = False,
    smiles_column: Optional[str] = "smiles",
    mol_column: Optional[str] = None,
    include_private: bool = False,
    include_computed: bool = False,
) -> Union[List[Chem.rdchem.Mol], pd.DataFrame]:
    """Read an SDF file.

    Records that fail to parse are dropped. Paths ending in ``.gz`` or
    ``.gzip`` are decompressed transparently.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        as_df: Whether to return a list mol or a pandas DataFrame.
        smiles_column: Name of the SMILES column. Only relevant if `as_df` is True.
        mol_column: Name of the mol column. Only relevant if `as_df` is True.
        include_private: Include private properties in the columns.  Only relevant if
            `as_df` is True.
        include_computed: Include computed properties in the columns.  Only relevant if
            `as_df` is True.

    Returns:
        A list of molecules, or the result of `dm.to_df` when `as_df` is True.
    """

    # File-like object
    if isinstance(urlpath, io.IOBase):
        supplier = Chem.ForwardSDMolSupplier(urlpath)
        mols = [mol for mol in supplier if mol is not None]

    # Regular local or remote paths
    else:
        is_gzipped = str(urlpath).endswith((".gz", ".gzip"))
        with fsspec.open(urlpath) as f:
            if is_gzipped:
                # Close the decompression wrapper as well as the raw file.
                with gzip.open(f) as gz:
                    supplier = Chem.ForwardSDMolSupplier(gz)
                    mols = [mol for mol in supplier if mol is not None]
            else:
                supplier = Chem.ForwardSDMolSupplier(f)
                mols = [mol for mol in supplier if mol is not None]

    if as_df:
        return dm.to_df(
            mols,
            smiles_column=smiles_column,
            mol_column=mol_column,
            include_private=include_private,
            include_computed=include_computed,
        )  # type: ignore

    return mols
Example #8
0
def read_molecules(infile=None, stream=None, molecules=None):
    """Return a list of molecules from a stream, a ready list or a file.

    Sources are tried in this order: ``stream`` (a file-like object parsed
    as SD data), ``molecules`` (returned as-is when it is a list), then
    ``infile`` (a ``.sdf`` or ``.sdf.gz`` path).  Records that fail to parse
    are dropped.  For any other file extension ``'Wrong Format!'`` is
    printed and ``1`` is returned instead of a list.
    """
    if stream:
        mols = [x for x in Chem.ForwardSDMolSupplier(stream) if x is not None]
    elif isinstance(molecules, list):
        mols = molecules
    elif infile.endswith('.sdf.gz'):
        # Read everything before leaving the block so the handle is closed.
        with gzip.open(infile) as gz:
            mols = [x for x in Chem.ForwardSDMolSupplier(gz) if x is not None]
    elif infile.endswith('.sdf'):
        mols = [x for x in Chem.SDMolSupplier(infile) if x is not None]
    else:
        print('Wrong Format!')
        return 1

    LOGGER.info('{} valid molecules in {} dataset'.format(len(mols), infile))

    return mols
def load_shard(shard, shards_dir, id_prefix):
  """Load one gzipped SD shard and convert its molecules with mols_to_dict.

  Shards whose name does not contain "sdf.gz" are ignored (None is
  returned).  Records that fail to parse are dropped.
  """
  is_sdf_shard = "sdf.gz" in shard
  if not is_sdf_shard:
    return None
  print("Processing shard %s" % shard)
  shard_path = os.path.join(shards_dir, shard)
  with gzip.open(shard_path) as handle:
    parsed = []
    for record in Chem.ForwardSDMolSupplier(handle):
      if record is not None:
        parsed.append(record)
  return mols_to_dict(parsed, id_prefix)
Example #10
0
def makePrints(s):
    """Compute fingerprints for every molecule in the gzipped SD file ``s``.

    Returns a DataFrame built from ``finger(mol)`` for each parseable
    molecule, with rows containing missing values dropped.  This is a
    best-effort reader: on any error a message is printed and ``None`` is
    returned.
    """
    try:
        # The context manager closes the gzip handle on success and failure.
        with gzip.open(s) as inf:
            gzsuppl = Chem.ForwardSDMolSupplier(inf)
            mols = [x for x in gzsuppl if x is not None]
        prints = [finger(mol) for mol in mols]
        return pd.DataFrame(prints).dropna()
    except Exception:
        # Deliberately broad, but no longer swallows KeyboardInterrupt or
        # SystemExit the way a bare ``except:`` did.
        print('Unable to open...')
        return None
Example #11
0
def default_open_input_sdf(inputDef):
    """Open the input as a SD file (possibly gzipped if ending with .gz) according to RDKit's ForwardSDMolSupplier

    :param inputDef: The name of the file. If None then STDIN is used (and assumed not to be gzipped)
    :return: tuple of (the opened input stream, the RDKit supplier reading from it)
    """
    if inputDef:
        source = open_file(inputDef)
    else:
        # RDKit needs a binary stream on Python 3; ``sys.stdin.buffer`` is the
        # binary view of stdin there. Python 2's stdin has no ``buffer``
        # attribute and is used directly.
        source = getattr(sys.stdin, 'buffer', sys.stdin)
    suppl = Chem.ForwardSDMolSupplier(source)
    return source, suppl
def generate_pdbs(ligand_file, out_dir, out_template):
    """Generate pdb files for ligands.

    Reads the gzipped SD file ``ligand_file`` and writes every parseable
    molecule to ``out_dir`` under the name ``out_template % position``,
    where ``position`` is the zero-based index among the parseable molecules.
    """
    with gzip.open(ligand_file) as inf:
        gzsuppl = Chem.ForwardSDMolSupplier(inf)
        mols = [x for x in gzsuppl if x is not None]

    # print() with a single argument behaves the same on Python 2 and 3.
    print("Number molecules: " + str(len(mols)))
    for position, mol in enumerate(mols):
        ligand_pdb = os.path.join(out_dir, out_template % position)
        print("writing " + ligand_pdb)
        w = Chem.PDBWriter(ligand_pdb)
        try:
            w.write(mol)
        finally:
            # Close explicitly so the file is flushed and released now.
            w.close()
Example #13
0
def sdf_mol_supplier(
    filename: str, gen_ids: bool, **kwargs
) -> IterableType[Tuple[int, Chem.Mol]]:
    """Generator function that reads from a .sdf file.

    Files ending in ``.gz`` are decompressed on the fly; the gzip handle is
    closed when the generator finishes or is closed. Records that fail to
    parse are skipped.

    Parameters
    ----------
    filename : str
        .sdf filename.

    gen_ids: bool
        generate ids or not. When false, the id is read from the molecule
        property named by the ``mol_id_prop`` keyword argument.

    Yields
    -------
    tuple
        id and rdkit mol. Generated ids are the 1-based record position.

    Raises
    ------
    Exception
        If a molecule id cannot be converted to an integer.
    """
    gzf = None
    if filename.endswith('.gz'):
        import gzip
        gzf = gzip.open(filename)
        suppl = Chem.ForwardSDMolSupplier(gzf)
    else:
        suppl = Chem.ForwardSDMolSupplier(filename)

    try:
        for new_mol_id, rdmol in enumerate(suppl, 1):
            if not rdmol:
                continue
            if gen_ids:
                mol_id = new_mol_id
            else:
                mol_id = rdmol.GetProp(kwargs["mol_id_prop"])
            try:
                int(mol_id)
            except ValueError:
                raise Exception(
                    "FPSim only supports integer ids for molecules, "
                    "cosinder setting gen_ids=True when running "
                    "create_db_file to autogenerate them."
                )
            yield mol_id, rdmol
    finally:
        if gzf is not None:
            gzf.close()
Example #14
0
def parse_sdfgz(filename):
    """Yield ``parse_molobj(mol)`` for every parseable molecule in a gzipped SD file.

    Hydrogens are kept and molecules are sanitized. The gzip handle is
    closed when the generator is exhausted or closed.
    """
    with gzip.open(filename) as f:
        suppl = Chem.ForwardSDMolSupplier(f, removeHs=False, sanitize=True)
        for molobj in suppl:
            if molobj is None:
                continue
            yield parse_molobj(molobj)
Example #15
0
def save_sdf(mol_paths, mol_names, out_name=''):
    """Write the first molecule of each input SD file into one combined SDF.

    The output is ``<sys.argv[1]>/<out_name>.sdf``. Each molecule is renamed
    to the matching entry of ``mol_names``. Paths containing ``.sdfgz`` or
    ``.sdf.gz`` are read through gzip, other paths containing ``.sdf`` are
    read directly, and anything else is ignored.
    """
    out_file = os.path.join(os.path.abspath(sys.argv[1]), f'{out_name}.sdf')
    writer = AllChem.SDWriter(out_file)
    try:
        for path, name in zip(mol_paths, mol_names):
            if ('.sdfgz' in path) or ('.sdf.gz' in path):
                opener = gzip.open
            elif '.sdf' in path:
                opener = open
            else:
                continue
            # Binary mode: RDKit's forward supplier reads bytes on Python 3.
            with opener(path, 'rb') as rf:
                suppl = Chem.ForwardSDMolSupplier(rf, removeHs=False)
                mol = next(suppl)  # Grab first mol
                mol.SetProp('_Name', name)
                writer.write(mol)
        writer.flush()
    finally:
        writer.close()
    #st.write(f'Saved to: {out_file}')
    return
Example #16
0
def __read_stdin_sdf(sanitize=True):
    """Yield ``(mol, title)`` for each SD record read from standard input.

    Lines are accumulated until a ``$$$$`` record terminator, then the
    record is parsed with RDKit. The title is the first line of the record;
    when that line is empty, ``__get_smi_as_molname(mol)`` is used instead.
    """
    lines = []
    for line in iter(sys.stdin.readline, ''):
        lines.append(line)
        # Compare without the line ending so a final record whose terminator
        # lacks a trailing newline is still emitted.
        if line.rstrip('\r\n') != '$$$$':
            continue
        molblock = ''.join(lines)
        lines = []
        supplier = Chem.ForwardSDMolSupplier(
            BytesIO(molblock.encode('utf-8')), sanitize=sanitize)
        mol = list(supplier)[0]
        mol_title = molblock.split('\n', 1)[0]
        if not mol_title:
            mol_title = __get_smi_as_molname(mol)
        yield mol, mol_title
Example #17
0
def corpus(input, output, suffix='sdf'):
    """Build a SMILES corpus and its token vocabulary from a molecule file.

    With ``suffix == 'sdf'`` ``input`` is read as a gzipped SD file;
    otherwise it is read as a table with a ``Smiles`` column. Each molecule
    is standardised (metal disconnection, normalisation, largest fragment,
    uncharging) and converted to canonical SMILES. SMILES whose token list
    has no carbon token, contains ``[Na]`` or ``[Zn]``, or does not have
    between 11 and 100 tokens are left out.

    Writes ``output + '_voc.txt'`` (sorted tokens, one per line) and
    ``output + '_corpus.txt'`` (tab-separated ``Smiles`` and ``Token``).
    """
    inf = None
    smiles = set()
    try:
        if suffix == 'sdf':
            inf = gzip.open(input)
            mols = Chem.ForwardSDMolSupplier(inf)
        else:
            df = pd.read_table(input).Smiles.dropna()
            mols = [Chem.MolFromSmiles(s) for s in df]
        voc = Voc('data/voc_smiles.txt')
        charger = rdMolStandardize.Uncharger()
        chooser = rdMolStandardize.LargestFragmentChooser()
        disconnector = rdMolStandardize.MetalDisconnector()
        normalizer = rdMolStandardize.Normalizer()
        for mol in tqdm(mols):
            try:
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                mol = chooser.choose(mol)
                mol = charger.uncharge(mol)
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                smileR = Chem.MolToSmiles(mol, 0)
                smiles.add(Chem.CanonSmiles(smileR))
            except Exception:
                # Best effort: skip molecules that cannot be standardised.
                print('Parsing Error:')
    finally:
        # The supplier reads lazily, so the handle stays open for the loop.
        if inf is not None:
            inf.close()

    words = set()
    canons = []
    tokens = []
    for smile in tqdm(smiles):
        token = voc.split(smile) + ['EOS']
        if {'C', 'c'}.isdisjoint(token):
            print('Warning:', smile)
            continue
        if not {'[Na]', '[Zn]'}.isdisjoint(token):
            print('Redudent', smile)
            continue
        if 10 < len(token) <= 100:
            words.update(token)
            canons.append(smile)
            tokens.append(' '.join(token))

    with open(output + '_voc.txt', 'w') as voc_file:
        voc_file.write('\n'.join(sorted(words)))

    frame = pd.DataFrame()
    frame['Smiles'] = canons
    frame['Token'] = tokens
    # drop_duplicates returns a new frame; keep the result.
    frame = frame.drop_duplicates(subset='Smiles')
    frame.to_csv(output + '_corpus.txt', sep='\t', index=False)
Example #18
0
def getFPdict_sdf(fh, molID=None, fpType='ecfp', radius=4):
    """Map molecule names to ``[fingerprint, SMILES]`` for an SD stream.

    Each parseable record of ``fh`` is named via ``getName`` (which receives
    the 1-based record position and ``molID``), gets its property cache
    updated non-strictly, and is fingerprinted with explicit hydrogens
    added. The SMILES is generated from the molecule as read.
    """
    fingerprints = {}
    records = Chem.ForwardSDMolSupplier(fh, removeHs=False)
    for position, entry in enumerate(records, start=1):
        if entry is None:
            continue
        key = getName(entry, position, molID)
        entry.UpdatePropertyCache(strict=False)
        with_hs = Chem.AddHs(entry, addCoords=True)
        fingerprint = getFP(with_hs, fpType, radius)
        fingerprints[key] = [fingerprint, Chem.MolToSmiles(entry)]
    return fingerprints
Example #19
0
def get_molecules_from_sdf_bytes(dataset):
    """
    Parse an in-memory SD dataset into RDKit molecules.

    :param dataset: bytes-like object holding the SD records
    :return: list of the molecules RDKit could parse (falsy records dropped)
    :type dataset: bytearray
    :rtype: list
    """
    molecules = []
    for candidate in Chem.ForwardSDMolSupplier(io.BytesIO(dataset)):
        if candidate:
            molecules.append(candidate)
    return molecules
Example #20
0
def rdkit_open(File_Tuple):
    """Read all molecules from a collection of SD / SMILES / mol2 files.

    The file type is chosen from the file name: ``.sdf`` (optionally
    ``.gz``/``.bz2`` compressed), ``.smi`` (a first line containing
    "smiles" is treated as a title line) or ``.mol2``. Records that fail to
    parse are dropped. Returns one flat list of molecules in file order.
    """
    List = []

    for f in File_Tuple:
        handle = file_handle(f)
        # Start empty for every file so an unrecognised extension cannot
        # re-add the molecules of the previous file.
        Mol = []

        if re.search(r'\.sdf', f):
            if re.search(r'\.gz$|\.bz2$', f):
                Mol = [
                    x
                    for x in Chem.ForwardSDMolSupplier(handle, removeHs=False)
                    if x is not None
                ]
            else:
                # NOTE(review): SDMolSupplier normally takes a file name;
                # confirm what file_handle returns for uncompressed files.
                Mol = [
                    x for x in Chem.SDMolSupplier(handle, removeHs=False)
                    if x is not None
                ]

        if re.search(r'\.smi', f):
            with handle as fi:
                first_line = fi.readline()
            has_title = bool(re.search(r'smiles', first_line, re.IGNORECASE))
            Mol = [
                x for x in Chem.SmilesMolSupplier(
                    f, titleLine=has_title, delimiter=' |\t|,')
                if x is not None
            ]

        ## not the official RDkit function, may fail
        if re.search(r'\.mol2', f):
            Mol = [
                x for x in Mol2MolSupplier(f, removeHs=False) if x is not None
            ]

        print("# Found mol in {0}: {1}".format(f, len(Mol)))
        List.extend(Mol)

    gc.collect()
    return List
def readAndCreateFingerprint(file_name, counts=False):
    """Compute radius-2 Morgan fingerprints for every molecule in an SD file.

    Hydrogens are kept when reading and records that fail to parse are
    skipped. With ``counts=False`` a 1024-bit vector is produced per
    molecule (``GetMorganFingerprintAsBitVect``); with ``counts=True`` the
    result of ``GetMorganFingerprint`` is returned instead.
    """
    supplier = Chem.ForwardSDMolSupplier(file_name, removeHs=False)
    mols = (m for m in supplier if m is not None)
    if not counts:
        return [AllChem.GetMorganFingerprintAsBitVect(m, 2, 1024) for m in mols]
    # The bitInfo dictionary the original filled here was never read, so it
    # is no longer requested.
    return [AllChem.GetMorganFingerprint(m, 2) for m in mols]
Example #22
0
def extractField(args):
    """Write a two-column, tab-separated table of molecule name and one SD field.

    Reads molecules from ``args.sdf`` and writes to ``args.out``: first a
    header (``Name`` and ``args.field``), then one line per parseable
    molecule. Molecules lacking the field get the value ``NA``.
    """
    header = '\t'.join(['Name', args.field])
    args.out.write('{}\n'.format(header))

    records = Chem.ForwardSDMolSupplier(args.sdf)
    for position, entry in enumerate(records, start=1):
        if entry is None:
            continue
        label = mh.getName(entry, position)
        field_value = entry.GetProp(args.field) if entry.HasProp(args.field) else 'NA'
        line = '\t'.join([label, field_value])
        args.out.write('{}\n'.format(line))
Example #23
0
def __read_sdf(fname, input_format, id_field_name=None, sanitize=True):
    """Yield ``(PropertyMol, title)`` for each parseable molecule of an SD file.

    ``input_format`` must be ``'sdf'`` or ``'sdf.gz'``; for any other value
    nothing is yielded. The title is the ``id_field_name`` property when
    given, otherwise the ``_Name`` property, falling back to the isomeric
    SMILES when the name is empty. The gzip handle is closed when the
    generator finishes or is closed.
    """
    gz = None
    if input_format == 'sdf':
        suppl = Chem.SDMolSupplier(fname, sanitize=sanitize)
    elif input_format == 'sdf.gz':
        gz = gzip.open(fname)
        suppl = Chem.ForwardSDMolSupplier(gz, sanitize=sanitize)
    else:
        return

    try:
        for mol in suppl:
            if mol is None:
                continue
            if id_field_name is not None:
                mol_title = mol.GetProp(id_field_name)
            else:
                mol_title = (mol.GetProp("_Name")
                             or Chem.MolToSmiles(mol, isomericSmiles=True))
            yield PropertyMol(mol), mol_title
    finally:
        if gz is not None:
            gz.close()
def main(directory: str, chebml_version: str):
    """Download the ChEBML data."""
    # NOTE(review): the docstring is left as-is because it may be shown as
    # command-line help (this function uses click) - confirm before rewording.
    #
    # Steps, each skipped when its output file already exists in `directory`:
    #   1. download the spreadsheet behind `bradley_url` (best effort),
    #   2. download the ChEMBL SD file for `chebml_version`,
    #   3. build and pickle a substructure library from molecules with at
    #      most 50 atoms.
    os.makedirs(directory, exist_ok=True)

    bradley_path = os.path.join(directory, 'jm020472j_s2.xls')
    if not os.path.exists(bradley_path):
        try:
            wget.download(bradley_url, out=directory)
        except Exception:
            # Best effort only; Ctrl-C and SystemExit are no longer swallowed.
            click.echo('There goes ACS stopping science')

    chembl_url = (
        f'ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/'
        f'chembl_{chebml_version}/chembl_{chebml_version}.sdf.gz')

    sdf_path = os.path.join(directory, f'chembl_{chebml_version}.sdf.gz')
    if not os.path.exists(sdf_path):
        wget.download(chembl_url, out=directory)

    sss_path = os.path.join(directory, f'chembl{chebml_version}_sssdata.pkl')
    if not os.path.exists(sss_path):
        click.echo(f'RDKit Version: {rdBase.rdkitVersion}')
        data = []

        with gzip.GzipFile(sdf_path) as gz:
            suppl = Chem.ForwardSDMolSupplier(gz)
            for mol in tqdm(suppl,
                            desc=f'Processing ChEBML {chebml_version}',
                            unit_scale=True):
                # Skip unparseable records and molecules above 50 atoms.
                if mol is None or mol.GetNumAtoms() > 50:
                    continue
                fp = Chem.PatternFingerprint(mol)
                smi = Chem.MolToSmiles(mol)
                data.append((smi, fp))

        click.echo(f'Outputting to {sss_path}')
        with open(sss_path, 'wb') as file:
            mols = rdSubstructLibrary.CachedTrustedSmilesMolHolder()
            fps = rdSubstructLibrary.PatternHolder()
            for smi, fp in data:
                mols.AddSmiles(smi)
                fps.AddFingerprint(fp)
            library = rdSubstructLibrary.SubstructLibrary(mols, fps)
            pickle.dump(library, file, protocol=pickle.HIGHEST_PROTOCOL)

    click.echo('Done ;)')
Example #25
0
def default_open_input_sdf(inputDef):
    """Return the input stream and an RDKit ForwardSDMolSupplier reading from it.

    :param inputDef: Name of the SD file, opened with ``utils.open_file``. When falsy, STDIN is
        read instead (and assumed not to be gzipped).
    """
    if inputDef:
        source = utils.open_file(inputDef)
    elif sys.version_info[0] >= 3:
        # On Python 3 RDKit is given the binary view of stdin.
        source = sys.stdin.buffer
    else:
        source = sys.stdin
    return source, Chem.ForwardSDMolSupplier(source)
Example #26
0
def __read_stdin_sdf(sanitize=True, removeHs=True):
    """Yield ``(mol, title)`` for each SD record read from standard input.

    Lines are accumulated until a ``$$$$`` record terminator, then the
    record is parsed with RDKit using the given ``sanitize`` and
    ``removeHs`` flags. The title is the first line of the record; when
    that line is empty, the isomeric SMILES of the molecule is used.
    """
    lines = []
    for line in iter(sys.stdin.readline, ''):
        lines.append(line)
        # Compare without the line ending so a final record whose terminator
        # lacks a trailing newline is still emitted.
        if line.rstrip('\r\n') != '$$$$':
            continue
        molblock = ''.join(lines)
        lines = []
        supplier = Chem.ForwardSDMolSupplier(
            BytesIO(molblock.encode('utf-8')),
            sanitize=sanitize,
            removeHs=removeHs)
        mol = list(supplier)[0]
        mol_title = molblock.split('\n', 1)[0]
        if not mol_title:
            mol_title = Chem.MolToSmiles(mol, isomericSmiles=True)
        yield mol, mol_title
Example #27
0
def zinc(first=-1, *args, **kwargs):
    """ ZINC collection.

    Downloads ``parm_at_Frosst.tgz`` into the working directory when it is
    not already there, reads ``parm_at_Frosst/zinc.sdf`` from it and builds
    one ``esp.Graph`` per molecule. Molecules that cannot be converted are
    skipped. When ``first`` is not -1, reading stops once that many graphs
    have been collected. Extra arguments go to ``GraphDataset``.

    ..[1] Irwin, John J, and Brian K Shoichet.
    "ZINC
    --a free database of commercially available compounds for virtual screening."
    Journal of chemical information and modeling
    vol. 45,1 (2005): 177-82. doi:10.1021/ci049714+
    """
    import tarfile
    from os.path import exists
    from openff.toolkit.topology import Molecule
    from rdkit import Chem

    fname = "parm_at_Frosst.tgz"
    url = "http://www.ccl.net/cca/data/parm_at_Frosst/parm_at_Frosst.tgz"

    if not exists(fname):
        import urllib.request

        urllib.request.urlretrieve(url, fname)

    count = 0
    gs = []

    # The archive must stay open while the supplier reads from its member.
    with tarfile.open(fname) as archive:
        zinc_file = archive.extractfile("parm_at_Frosst/zinc.sdf")
        _mols = Chem.ForwardSDMolSupplier(zinc_file, removeHs=False)

        for mol in _mols:
            try:
                gs.append(
                    esp.Graph(Molecule.from_rdkit(mol,
                                                  allow_undefined_stereo=True)))
                count += 1
            except Exception:
                # Best effort: skip molecules that cannot be converted.
                pass

            if first != -1 and count >= first:
                break

    return esp.data.dataset.GraphDataset(gs, *args, **kwargs)
Example #28
0
def processSDF(cursor, filename, split, storeMolblock):
    """Load every record of an SD file into the database behind ``cursor``.

    Each record (including ones that failed to parse) is registered with
    ``addMol``; every property of a parsed molecule is stored with
    ``addProp``, split on commas into separate values when ``split`` is
    true. When ``storeMolblock`` is true and the supplier offers
    ``GetItemText`` (not the case for gzipped input), the mol block up to
    and including ``M  END`` is passed to ``addMol``.

    Returns the number of records read.
    """
    nmol = 0
    gz_handle = None

    if filename.endswith(".gz"):
        import gzip
        gz_handle = gzip.open(filename)
        suppl = Chem.ForwardSDMolSupplier(gz_handle)
        if storeMolblock:
            print("Warning: cannot store molblock from gzip file",
                  file=sys.stderr)
    else:
        suppl = Chem.SDMolSupplier(filename)

    # Whether the raw text is wanted and available does not change per record.
    keep_molblock = storeMolblock and hasattr(suppl, "GetItemText")

    try:
        for mol in suppl:
            molstore = None
            if keep_molblock:
                (molblock, sep,
                 moldata) = suppl.GetItemText(nmol).partition('M  END')
                molstore = molblock + sep
            nmol += 1
            imol = addMol(cursor, mol, molstore, nmol)
            if not mol:
                print("Error adding molecule #%d" % nmol, file=sys.stderr)
                continue
            for p in mol.GetPropNames():
                if split:
                    for sp in mol.GetProp(p).split(","):
                        addProp(cursor, imol, p, sp)
                else:
                    addProp(cursor, imol, p, mol.GetProp(p))
    finally:
        if gz_handle is not None:
            gz_handle.close()

    return nmol
Example #29
0
def usrcat_write_binary(sdf_file_path: Path, gzip_output_binary: bool = False):
    """Write USRCAT descriptors of an SD file to a packed binary plus a SMILES index.

    Outputs are created next to the input: ``<sdf>.usrcatsl.bin`` (with
    ``.gz`` appended when ``gzip_output_binary`` is true) and
    ``<sdf>.usrcatsl.smi``. Only parseable molecules with more than two
    heavy atoms are written. Each binary record holds the 1-based running
    number of the molecule followed by its descriptors; the SMILES index
    gets the matching ``"<smiles> <name>"`` line.

    Raises AssertionError when the input is missing or the binary output
    already exists (note that asserts are skipped under ``python -O``).
    """
    from contextlib import closing

    assert sdf_file_path.exists(), "SDF file not found"
    binary_file_path = Path(str(sdf_file_path) + ".usrcatsl.bin")
    if gzip_output_binary:
        binary_file_path = Path(str(binary_file_path) + ".gz")
    assert not Path(binary_file_path).exists(), "Output binary exists"

    num_gets = 0
    num_good_mols = 0
    gz_compressed_file = None
    try:
        # closing() releases both outputs even if descriptor generation fails.
        with closing(open_file_may_be_gzipped(binary_file_path, "wb")) as output_binary, \
                closing(open_file_may_be_gzipped(
                    Path(str(sdf_file_path) + ".usrcatsl.smi"), "w")) as output_smiles_index:
            bar = progressbar.ProgressBar(prefix="Generating binary")
            # One reusable buffer; every record is packed into it in place.
            pos_and_desc_bytes = bytearray(
                struct.calcsize(usrcat_binary_struct_format_string))

            if str(sdf_file_path).endswith(".gz"):
                gz_compressed_file = gzip.open(str(sdf_file_path))
                sdf_reader = Chem.ForwardSDMolSupplier(gz_compressed_file)
            else:
                sdf_reader = Chem.SDMolSupplier(str(sdf_file_path))

            for mol in sdf_reader:
                num_gets += 1
                if mol is None:
                    continue
                if mol.GetNumHeavyAtoms() > 2:
                    num_good_mols += 1
                    usrcat_descriptos = GetUSRCAT(mol)
                    # num_good_mols is the record number, so the first
                    # record is #1 - the SMILES lines are not zero-indexed.
                    struct.pack_into(usrcat_binary_struct_format_string,
                                     pos_and_desc_bytes, 0, num_good_mols,
                                     *usrcat_descriptos)
                    output_binary.write(pos_and_desc_bytes)
                    output_smiles_index.write(
                        Chem.MolToSmiles(mol) + " " + mol.GetProp("_Name") + "\n")
                if num_good_mols % 1000 == 0:
                    bar.update(num_gets)
            bar.update(num_gets)
    finally:
        if gz_compressed_file is not None:
            gz_compressed_file.close()

    print("Num gets", num_gets)
    print("Num good mols", num_good_mols)
Example #30
0
def LoadSDF(filename, smilesName='SMILES', idName='ID',molColName = 'ROMol',includeFingerprints=False):
  """ Read file in SDF format and return as Panda data frame.

  ``filename`` is a path or an open binary file object; in both cases the
  file is closed once it has been read. Each parseable record becomes a row
  holding its properties, its ``_Name`` (column ``idName``) and its SMILES
  (column ``smilesName``); rows are indexed by record position. A molecule
  column is then added with ``AddMoleculeColumnToFrame``.
  """
  if isinstance(filename, str):
    f = open(filename, 'rb') #'rU')
  else:
    f = filename

  # Collect plain dicts and build the frame once: appending a one-row frame
  # per molecule is quadratic and DataFrame.append no longer exists in
  # pandas 2.x.
  records = []
  indices = []
  try:
    for i, mol in enumerate(Chem.ForwardSDMolSupplier(f)):
      if mol is None: continue
      row = dict((k, mol.GetProp(k)) for k in mol.GetPropNames())
      if mol.HasProp('_Name'): row[idName] = mol.GetProp('_Name')
      row[smilesName] = Chem.MolToSmiles(mol)
      records.append(row)
      indices.append(i)
  finally:
    f.close()

  # Keep the historical behaviour of passing None on when nothing was read.
  df = pd.DataFrame(records, index=indices) if records else None
  AddMoleculeColumnToFrame(df, smilesCol=smilesName, molCol = molColName,includeFingerprints=includeFingerprints)
  return df