Python Chem.ForwardSDMolSupplier Examples

Programming Language: Python

Namespace/Package Name: rdkit

Class/Type: Chem

Method/Function: ForwardSDMolSupplier

Examples at hotexamples.com: 30

ForwardSDMolSupplier is a class in the Chem module of RDKit's Python API that reads and parses a stream of molecules stored in the SD file format (SDF), a common format for storing chemical structures together with their associated data. It exposes an iterator-like interface, so the molecules in an SD file can be read efficiently one at a time, which makes it a convenient tool for processing large datasets of chemical compounds stored in SD files.

Python Chem.ForwardSDMolSupplier - 30 examples found. These are the top rated real world Python examples of rdkit.Chem.ForwardSDMolSupplier extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

AddHs(30)

CanonSmiles(30)

GetAdjacencyMatrix(30)

FragmentOnBonds(30)

ForwardSDMolSupplier(30)

FindMolChiralCenters(30)

FindAtomEnvironmentOfRadiusN(30)

DeleteSubstructs(30)

Conformer(30)

CombineMols(30)

CanonicalRankAtoms(30)

GetDistanceMatrix(30)

Atom(30)

AssignStereochemistry(30)

Get3DDistanceMatrix(29)

FindAllPathsOfLengthN(29)

EditableMol(24)

AssignAtomChiralTagsFromStructure(24)

AtomPDBResidueInfo(21)

FastFindRings(14)

AdjustQueryProperties(13)

DetectBondStereochemistry(10)

BondType(9)

FindPotentialStereoBonds(9)

Cleanup(8)

AdjustQueryParameters(8)

AtomFromSmiles(5)

AssignStereochemistryFrom3D(5)

EmbedMolecule(5)

FragmentOnBRICSBonds(4)

DetectChemistryProblems(4)

ClearMolSubstanceGroups(4)

CreateAtomDoublePropertyList(3)

ETKDG(3)

CreateMolSubstanceGroup(2)

FindAllSubgraphsOfLengthN(2)

AssignRadicals(2)

FindUniqueSubgraphsOfLengthN(2)

CalcPMI3(1)

GetBondBetweenAtoms(1)

GetBestRMS(1)

GetAtomPairFingerPrint(1)

CalcNumSpiroAtoms(1)

CalcPMI1(1)

FragmentOnSomeBonds(1)

CalcPMI2(1)

ClearMolSGroups(1)

AtomMonomerInfo(1)

ForwardSDMOLSupplier(1)

DetectBondStereoChemistry(1)

Example #1

Show file

def read_rd_mols_from_sdf_file(sdf_file, sanitize=True):
    """Load every record of an SD file and wrap each one in ``Molecule``.

    Paths ending in ``.gz`` are opened with ``gzip`` and read through
    ``Chem.ForwardSDMolSupplier``; every other path is handed directly to
    ``Chem.SDMolSupplier``.

    :param sdf_file: path of the SD file, optionally gzip-compressed.
    :param sanitize: forwarded unchanged to the RDKit supplier.
    :return: list with one ``Molecule`` per record, in file order.
    """
    if not sdf_file.endswith('.gz'):
        records = Chem.SDMolSupplier(sdf_file, sanitize=sanitize)
        return [Molecule(entry) for entry in records]

    # The list has to be built while the gzip stream is still open.
    with gzip.open(sdf_file) as stream:
        records = Chem.ForwardSDMolSupplier(stream, sanitize=sanitize)
        return [Molecule(entry) for entry in records]

Example #2

Show file

def ExtractMol(List, score_best, prefix):
    """Collect the molecules named in ``List`` from their SD files and write
    them, ordered by score, to ``<prefix>.sdf``.

    :param List: iterable of indexable entries where ``[0]`` is the value the
        output is sorted by, ``[1]`` the molecule name looked up in the SD
        file, ``[2]`` the identifier of the result file the entry came from
        and ``[3]`` an extra value stored alongside the saved molecule.
    :param score_best: currently unused; kept so existing callers still work.
    :param prefix: output file name without the ``.sdf`` extension.

    The SD file belonging to a result file is located by cutting the
    identifier at the first ``'txt'`` and globbing for ``<stem>sdf*``.
    The interpreter is terminated through ``sys.exit`` when no such file
    exists. Names missing from the SD file are reported and skipped.
    """
    # cluster molecules based on which SDF file they belong to
    ConsScore = {}
    for Mol in sorted(List, key=lambda tup: tup[2]):
        ConsScore.setdefault(Mol[2], []).append(Mol)

    Saved_Mol = []

    # from each SDF file, extract the docked pose
    for file_id in tqdm(ConsScore, total=len(ConsScore)):
        file_prefix = file_id.split('txt')[0]
        SDF = glob.glob(file_prefix + 'sdf*')
        if not SDF:
            sys.exit('{0} or related SD file not found.'.format(file_prefix +
                                                                'sdf*'))
        sdf_file = SDF[0]

        # NOTE(review): assumes file_handle() returns a file-like object with
        # close(); the handle was previously left open for every SD file.
        handle = file_handle(sdf_file)
        try:
            SDMol = {}
            for mol in Chem.ForwardSDMolSupplier(handle, removeHs=False):
                if mol is None:
                    continue
                name = mol.GetProp('_Name')
                # When the SD file is processed from docking the title looks
                # like "<name>:<suffix>"; only the part before the first
                # colon is the key. Titles without a colon are kept whole.
                SDMol[name.split(':')[0]] = mol
        finally:
            handle.close()

        for Mol in ConsScore[file_id]:
            if Mol[1] not in SDMol:
                print('{0} is not registered in database. Skip.'.format(
                    Mol[1]))
                continue
            Saved_Mol.append([Mol[0], Mol[1], SDMol[Mol[1]], Mol[3]])
        del SDMol

    # Sort all mol based on score and write out
    saved_sdf = Chem.SDWriter(prefix + '.sdf')
    try:
        for M in sorted(Saved_Mol, key=lambda tup: tup[0]):
            saved_sdf.write(M[2])
        saved_sdf.flush()
    finally:
        saved_sdf.close()

Example #3

Show file

File: PandasTools.py Project: zhijianx8421/rdkit

def LoadSDF(filename,
            idName='ID',
            molColName='ROMol',
            includeFingerprints=False,
            isomericSmiles=True,
            smilesName=None,
            embedProps=False,
            removeHs=True,
            strictParsing=True):
    '''Read file in SDF format and return as Pandas data frame.

    If embedProps=True all properties also get embedded in Mol objects in the molecule column.
    If molColName=None molecules would not be present in resulting DataFrame (only properties
    would be read).

    ``filename`` may be a path (a ``.gz`` suffix, in any letter case, selects
    gzip decompression) or an already opened binary file object. A file
    opened here is always closed again, even when parsing raises; a file
    object passed in by the caller is left open.

    Records the supplier cannot parse are skipped; the DataFrame index holds
    the zero-based position of each kept record in the file. When
    ``smilesName`` is given, a SMILES column of that name is added and set to
    ``None`` (with a logged warning) for molecules that cannot be written as
    SMILES.
    '''
    if isinstance(filename, str):
        if filename.lower().endswith(".gz"):
            import gzip
            f = gzip.open(filename, "rb")
        else:
            f = open(filename, 'rb')
        close = f.close
    else:
        f = filename
        close = None  # don't close an open file that was passed in
    records = []
    indices = []
    try:
        supplier = Chem.ForwardSDMolSupplier(
            f,
            sanitize=(molColName is not None),
            removeHs=removeHs,
            strictParsing=strictParsing)
        for i, mol in enumerate(supplier):
            if mol is None:
                continue
            row = {k: mol.GetProp(k) for k in mol.GetPropNames()}
            if molColName is not None and not embedProps:
                # The properties live in the row already; strip them from
                # the molecule unless the caller asked to keep them embedded.
                for prop in mol.GetPropNames():
                    mol.ClearProp(prop)
            if mol.HasProp('_Name'):
                row[idName] = mol.GetProp('_Name')
            if smilesName is not None:
                try:
                    row[smilesName] = Chem.MolToSmiles(
                        mol, isomericSmiles=isomericSmiles)
                except Exception:
                    # Best effort: keep the record, but do not swallow
                    # KeyboardInterrupt/SystemExit like a bare except would.
                    log.warning(
                        'No valid smiles could be generated for molecule %s',
                        i)
                    row[smilesName] = None
            if molColName is not None and not includeFingerprints:
                row[molColName] = mol
            elif molColName is not None:
                row[molColName] = _MolPlusFingerprint(mol)
            records.append(row)
            indices.append(i)
    finally:
        if close is not None:
            close()
    RenderImagesInAllDataFrames(images=True)
    return pd.DataFrame(records, index=indices)

Example #4

Show file

File: desc_finder.py Project: scidatasoft/ml-services

def get_dataframe_from_library(file_path, library, value, index='VPC ID'):
    """Concatenate the per-molecule frames of every parsable SD record.

    All molecules the supplier returns as non-``None`` are collected first;
    each is then passed to ``get_molstring_from_library`` together with
    ``library``, ``value`` and ``index``, and the results are stacked
    row-wise with ``pd.concat``.
    """
    molecules = [
        m for m in Chem.ForwardSDMolSupplier(file_path) if m is not None
    ]
    frames = [
        get_molstring_from_library(m, library, value, index=index)
        for m in molecules
    ]
    return pd.concat(frames, axis=0)

Example #5

Show file

File: split_dataset.py Project: Matt-HJ-Bailey/ChemBL-Topology

def load_from_gzip(input_filepath, filename):
    """
    Read the gzipped .sd file ``filename`` located in ``input_filepath``
    and return its molecules as a plain python list for later pickling.
    Entries the supplier yields as None are left out.
    """
    archive_path = os.path.join(input_filepath, filename)
    with gzip.open(archive_path) as stream:
        molecules = []
        for entry in Chem.ForwardSDMolSupplier(stream):
            if entry is not None:
                molecules.append(entry)
        return molecules

Example #6

Show file

File: sucos.py Project: qiagu/galaxytools

def process(
    refmol_filename,
    inputs_filename,
    outputs_filename,
    refmol_index=None,
    refmol_format=None,
    tani=False,
    score_mode=FeatMaps.FeatMapScoreMode.All,
):
    """Score every molecule of an SD input against a reference molecule and
    write the scored molecules to an SD output.

    :param refmol_filename: file holding the reference molecule.
    :param inputs_filename: SD input, opened via ``utils.open_file_for_reading``.
    :param outputs_filename: SD output, opened via ``utils.open_file_for_writing``.
    :param refmol_index: forwarded to ``utils.read_single_molecule`` as ``index``.
    :param refmol_format: forwarded to ``utils.read_single_molecule`` as ``format``.
    :param tani: forwarded to ``get_SucosScore``; also decides whether the
        third score is stored as ``SuCOS_Tanimoto_Score`` or
        ``SuCOS_Protrude_Score``.
    :param score_mode: forwarded to ``get_SucosScore``.

    Unparsable records are skipped. A ``ValueError`` raised while scoring is
    logged and counted, and processing continues with the next record.
    """
    ref_mol = utils.read_single_molecule(refmol_filename,
                                         index=refmol_index,
                                         format=refmol_format)
    ref_features = getRawFeatures(ref_mol)

    input_file = utils.open_file_for_reading(inputs_filename)
    suppl = Chem.ForwardSDMolSupplier(input_file)
    output_file = utils.open_file_for_writing(outputs_filename)
    writer = Chem.SDWriter(output_file)

    count = 0  # records read from the input, including unparsable ones
    total = 0  # molecules scored and written
    errors = 0  # molecules whose scoring raised ValueError
    try:
        for mol in suppl:
            count += 1
            if mol is None:
                continue
            try:
                sucos_score, fm_score, val3 = get_SucosScore(
                    ref_mol,
                    mol,
                    tani=tani,
                    ref_features=ref_features,
                    score_mode=score_mode,
                )
                mol.SetDoubleProp("SuCOS_Score", sucos_score)
                mol.SetDoubleProp("SuCOS_FeatureMap_Score", fm_score)
                if tani:
                    mol.SetDoubleProp("SuCOS_Tanimoto_Score", val3)
                else:
                    mol.SetDoubleProp("SuCOS_Protrude_Score", val3)
                utils.log("Scores:", sucos_score, fm_score, val3)
                writer.write(mol)
                total += 1
            except ValueError as e:
                errors += 1
                # Exceptions have no ``.message`` attribute on Python 3;
                # str(e) works on both major versions.
                utils.log("Molecule", count, "failed to score:", str(e))
        writer.flush()
    finally:
        # Release everything even if an unexpected error aborts the loop.
        input_file.close()
        writer.close()
        output_file.close()

    # ``count`` is what was processed, ``total`` what succeeded (the two
    # values were previously reported the wrong way round).
    utils.log("Completed.", count, "processed, ", total, "succeeded, ", errors,
              "errors")

Example #7

Show file

def read_sdf(
    urlpath: Union[str, os.PathLike, TextIO],
    as_df: bool = False,
    smiles_column: Optional[str] = "smiles",
    mol_column: Optional[str] = None,
    include_private: bool = False,
    include_computed: bool = False,
) -> Union[List[Chem.rdchem.Mol], pd.DataFrame]:
    """Read an SDF file.

    Records that RDKit cannot parse are dropped from the result.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
            Paths ending in ``.gz`` or ``.gzip`` are decompressed on the fly.
        as_df: Whether to return a list mol or a pandas DataFrame.
        smiles_column: Name of the SMILES column. Only relevant if `as_df` is True.
        mol_column: Name of the mol column. Only relevant if `as_df` is True.
        include_private: Include private properties in the columns.  Only relevant if
            `as_df` is True.
        include_computed: Include computed properties in the columns.  Only relevant if
            `as_df` is True.

    Returns:
        The list of molecules, or the result of `dm.to_df` when `as_df` is True.
    """

    # File-like object
    if isinstance(urlpath, io.IOBase):
        supplier = Chem.ForwardSDMolSupplier(urlpath)
        mols = [mol for mol in supplier if mol is not None]

    # Regular local or remote paths
    else:
        with fsspec.open(urlpath) as f:
            if str(urlpath).endswith((".gz", ".gzip")):
                # Close the decompressor explicitly; it does not close the
                # underlying file, which the fsspec context still owns.
                with gzip.open(f) as gz:
                    supplier = Chem.ForwardSDMolSupplier(gz)
                    mols = [mol for mol in supplier if mol is not None]
            else:
                supplier = Chem.ForwardSDMolSupplier(f)
                mols = [mol for mol in supplier if mol is not None]

    if as_df:
        return dm.to_df(
            mols,
            smiles_column=smiles_column,
            mol_column=mol_column,
            include_private=include_private,
            include_computed=include_computed,
        )  # type: ignore

    return mols

Example #8

Show file

File: processor.py Project: ArqiSoft/ml-services

def read_molecules(infile=None, stream=None, molecules=None):
    """Return a list of RDKit molecules from one of three sources.

    The sources are tried in this order:

    1. ``stream`` - a truthy file-like object holding SD data;
    2. ``molecules`` - an existing list, returned as is;
    3. ``infile`` - a path ending in ``.sdf.gz`` or ``.sdf``.

    Records the supplier cannot parse are dropped when reading SD data.

    :return: the list of molecules, or the integer ``1`` (after printing
        ``Wrong Format!``) when ``infile`` has an unsupported extension.
        The error code is kept for existing callers.
    """
    if stream:
        suppl = Chem.ForwardSDMolSupplier(stream)
        mols = [x for x in suppl if x is not None]
    elif isinstance(molecules, list):
        mols = molecules
    elif infile.endswith('.sdf.gz'):
        # Read everything while the gzip handle is open, then release it
        # (it used to stay open until garbage collection).
        with gzip.open(infile) as gz_file:
            suppl = Chem.ForwardSDMolSupplier(gz_file)
            mols = [x for x in suppl if x is not None]
    elif infile.endswith('.sdf'):
        suppl = Chem.SDMolSupplier(infile)
        mols = [x for x in suppl if x is not None]
    else:
        print('Wrong Format!')
        return 1

    LOGGER.info('{} valid molecules in {} dataset'.format(len(mols), infile))

    return mols

Example #9

Show file

File: process_vs_datasets.py Project: swayam01/schrodingerdeepchem

def load_shard(shard, shards_dir, id_prefix):
  """Read one gzipped SDF shard and turn its molecules into a dict.

  Returns None, without printing or opening anything, when the shard name
  does not contain "sdf.gz". Otherwise the shard is read from shards_dir,
  unparsable records are dropped and the result of mols_to_dict is returned.
  """
  if "sdf.gz" not in shard:
    return None
  print("Processing shard %s" % shard)
  shard_path = os.path.join(shards_dir, shard)
  parsed = []
  with gzip.open(shard_path) as stream:
    for entry in Chem.ForwardSDMolSupplier(stream):
      if entry is not None:
        parsed.append(entry)
  return mols_to_dict(parsed, id_prefix)

Example #10

Show file

def makePrints(s):
    """Compute fingerprints for every parsable molecule of a gzipped SD file.

    :param s: path of the gzip-compressed SD file.
    :return: a DataFrame built from ``finger(mol)`` for each molecule with
        rows containing missing values dropped, or ``None`` (after printing
        ``Unable to open...``) when anything goes wrong.
    """
    try:
        # The context manager releases the gzip handle, which was
        # previously never closed.
        with gzip.open(s) as inf:
            gzsuppl = Chem.ForwardSDMolSupplier(inf)
            mols = [x for x in gzsuppl if x is not None]
        prints = [finger(mol) for mol in mols]
        return pd.DataFrame(prints).dropna()
    except Exception:
        # Deliberately best effort, but no longer swallows
        # KeyboardInterrupt/SystemExit as the bare ``except`` did.
        print('Unable to open...')
        return None

Example #11

Show file

def default_open_input_sdf(inputDef):
    """Open the input as a SD file (possibly gzipped if ending with .gz) according to RDKit's ForwardSDMolSupplier

    :param inputDef: The name of the file. If None then STDIN is used (and assumed not to be gzipped)
    :return: tuple of the opened input object and the supplier reading from it
    """
    if inputDef:
        source = open_file(inputDef)
    else:
        # RDKit's stream-based supplier needs a binary stream. On Python 3
        # sys.stdin is text and its bytes are available as sys.stdin.buffer;
        # Python 2's stdin has no ``buffer`` and is used directly.
        source = getattr(sys.stdin, 'buffer', sys.stdin)
    suppl = Chem.ForwardSDMolSupplier(source)
    return source, suppl

Example #12

Show file

File: generate_ligand_pdbs.py Project: evanfeinberg/cluster_utils

def generate_pdbs(ligand_file, out_dir, out_template):
    """Generate pdb files for ligands.

    :param ligand_file: gzip-compressed SD file holding the ligands.
    :param out_dir: directory the PDB files are written to.
    :param out_template: ``%``-style template that receives the zero-based
        position of the ligand among the parsable records and yields the
        PDB file name.
    """
    with gzip.open(ligand_file) as inf:
        gzsuppl = Chem.ForwardSDMolSupplier(inf)
        mols = [x for x in gzsuppl if x is not None]

    # print() with one parenthesised argument behaves the same on Python 2
    # and 3; the former print statements were a syntax error on Python 3.
    print("Number molecules: " + str(len(mols)))
    for index, mol in enumerate(mols):
        ligand_pdb = os.path.join(out_dir, out_template % index)
        print("writing " + ligand_pdb)
        w = Chem.PDBWriter(ligand_pdb)
        try:
            w.write(mol)
        finally:
            # Close explicitly so the file is flushed and released now
            # rather than whenever the writer is garbage collected.
            w.close()

Example #13

Show file

File: chem.py Project: stjordanis/FPSim2

def sdf_mol_supplier(
    filename: str, gen_ids: bool, **kwargs
) -> IterableType[Tuple[int, Chem.Mol]]:
    """Generator function that reads from a .sdf file.

    Files whose name ends in ``.gz`` are decompressed on the fly; the gzip
    handle is closed when the generator is exhausted or closed. Records
    that RDKit cannot parse are skipped, but still consume an id when ids
    are generated.

    Parameters
    ----------
    filename : str
        .sdf filename.

    gen_ids: bool
        generate ids or not. When False, the id is read from the molecule
        property named by the ``mol_id_prop`` keyword argument.

    Yields
    -------
    tuple
        id and rdkit mol. A generated id is the 1-based position of the
        record in the file. An id read from a property is yielded as read,
        after checking that ``int()`` accepts it.

    Raises
    ------
    Exception
        if an id cannot be interpreted as an integer.
    """
    gzf = None
    if filename.endswith('.gz'):
        import gzip
        gzf = gzip.open(filename)
        suppl = Chem.ForwardSDMolSupplier(gzf)
    else:
        suppl = Chem.ForwardSDMolSupplier(filename)
    try:
        for new_mol_id, rdmol in enumerate(suppl, 1):
            if not rdmol:
                continue
            if gen_ids:
                mol_id = new_mol_id
            else:
                mol_id = rdmol.GetProp(kwargs["mol_id_prop"])
            try:
                int(mol_id)
            except ValueError:
                raise Exception(
                    "FPSim only supports integer ids for molecules, "
                    "cosinder setting gen_ids=True when running "
                    "create_db_file to autogenerate them."
                )
            yield mol_id, rdmol
    finally:
        # Previously the gzip handle was never closed.
        if gzf is not None:
            gzf.close()

Example #14

Show file

File: calculate_inertia.py Project: charnley/inertia

def parse_sdfgz(filename):
    """Yield ``parse_molobj(mol)`` for every parsable molecule of a gzipped
    SD file.

    Hydrogens are kept and molecules are sanitized while reading. Records
    the supplier returns as ``None`` are skipped. The gzip handle is closed
    once the generator is exhausted or closed (it used to be left open).
    """
    with gzip.open(filename) as f:
        suppl = Chem.ForwardSDMolSupplier(f, removeHs=False, sanitize=True)

        for molobj in suppl:
            if molobj is None:
                continue

            yield parse_molobj(molobj)

Example #15

Show file

def save_sdf(mol_paths, mol_names, out_name=''):
    """Write the first molecule of each input file into one combined SD file.

    :param mol_paths: paths of the input files. Paths containing ``.sdfgz``
        or ``.sdf.gz`` are read through gzip, other paths containing
        ``.sdf`` are read directly, and anything else is ignored.
    :param mol_names: titles assigned (as ``_Name``) to the molecules,
        paired with ``mol_paths`` by position.
    :param out_name: base name of the output file, which is created as
        ``<out_name>.sdf`` inside the directory given by ``sys.argv[1]``.
    """
    # Setup writer
    out_file = os.path.join(os.path.abspath(sys.argv[1]), f'{out_name}.sdf')
    writer = AllChem.SDWriter(out_file)

    try:
        for path, name in zip(mol_paths, mol_names):
            if ('.sdfgz' in path) or ('.sdf.gz' in path):
                opener = gzip.open
            elif '.sdf' in path:
                opener = open
            else:
                continue
            # The supplier needs a binary stream; the plain-file branch used
            # to open the file in text mode.
            with opener(path, 'rb') as rf:
                suppl = Chem.ForwardSDMolSupplier(rf, removeHs=False)
                mol = next(suppl)  # Grab first mol
                mol.SetProp('_Name', name)
                writer.write(mol)
        writer.flush()
    finally:
        # Close the writer even when one of the inputs fails.
        writer.close()
    return

Example #16

Show file

def __read_stdin_sdf(sanitize=True):
    """Yield ``(mol, title)`` pairs for SD records read from standard input.

    Lines are accumulated until a line equal to ``'$$$$\\n'`` closes the
    record. The accumulated text is then parsed and the first molecule the
    supplier returns is taken. The title is the first line of the record;
    when that line is empty, ``__get_smi_as_molname(mol)`` is used instead.
    """
    pending = []
    for line in iter(sys.stdin.readline, ''):
        pending.append(line)
        if line != '$$$$\n':
            continue
        molblock = ''.join(pending)
        pending = []
        payload = BytesIO(molblock.encode('utf-8'))
        mol = list(Chem.ForwardSDMolSupplier(payload, sanitize=sanitize))[0]
        mol_title = molblock.split('\n', 1)[0]
        if not mol_title:
            mol_title = __get_smi_as_molname(mol)
        yield mol, mol_title

Example #17

Show file

File: dataset.py Project: XuhanLiu/DrugEx

def corpus(input, output, suffix='sdf'):
    """Build a SMILES vocabulary and token corpus from a molecule collection.

    :param input: path of the source file. With ``suffix == 'sdf'`` it is
        read as a gzip-compressed SD file; otherwise it is read with
        ``pd.read_table`` and its ``Smiles`` column is used.
    :param output: prefix of the two output files, ``<output>_voc.txt``
        (sorted tokens, one per line) and ``<output>_corpus.txt``
        (tab-separated ``Smiles``/``Token`` table).
    :param suffix: input format selector, see ``input``.

    Each molecule is standardised (metal disconnection, normalisation,
    largest fragment, uncharging) and its canonical SMILES is collected.
    Molecules that fail any step are reported with ``Parsing Error:`` and
    skipped. A SMILES is kept only if it contains ``C`` or ``c``, contains
    neither ``[Na]`` nor ``[Zn]``, and has more than 10 and at most 100
    tokens including the ``EOS`` marker.
    """
    inf = None
    if suffix == 'sdf':
        inf = gzip.open(input)
        mols = Chem.ForwardSDMolSupplier(inf)
    else:
        df = pd.read_table(input).Smiles.dropna()
        mols = [Chem.MolFromSmiles(s) for s in df]

    smiles = set()
    try:
        voc = Voc('data/voc_smiles.txt')
        charger = rdMolStandardize.Uncharger()
        chooser = rdMolStandardize.LargestFragmentChooser()
        disconnector = rdMolStandardize.MetalDisconnector()
        normalizer = rdMolStandardize.Normalizer()
        for mol in tqdm(mols):
            try:
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                mol = chooser.choose(mol)
                mol = charger.uncharge(mol)
                mol = disconnector.Disconnect(mol)
                mol = normalizer.normalize(mol)
                smileR = Chem.MolToSmiles(mol, 0)
                smiles.add(Chem.CanonSmiles(smileR))
            except Exception:
                # Best effort per molecule, without trapping
                # KeyboardInterrupt/SystemExit like a bare except.
                print('Parsing Error:')
    finally:
        # The supplier reads lazily, so the gzip handle may only be closed
        # after the loop; it used to be left open.
        if inf is not None:
            inf.close()

    words = set()
    canons = []
    tokens = []
    for smile in tqdm(smiles):
        token = voc.split(smile) + ['EOS']
        if {'C', 'c'}.isdisjoint(token):
            print('Warning:', smile)
            continue
        if not {'[Na]', '[Zn]'}.isdisjoint(token):
            print('Redudent', smile)
            continue
        if 10 < len(token) <= 100:
            words.update(token)
            canons.append(smile)
            tokens.append(' '.join(token))

    with open(output + '_voc.txt', 'w') as voc_file:
        voc_file.write('\n'.join(sorted(words)))

    log = pd.DataFrame()
    log['Smiles'] = canons
    log['Token'] = tokens
    # drop_duplicates returns a new frame; the result used to be discarded.
    log = log.drop_duplicates(subset='Smiles')
    log.to_csv(output + '_corpus.txt', sep='\t', index=False)
Example #18

Show file

File: moleculeHelper.py Project: bet-gregori/phitools

def getFPdict_sdf(fh, molID=None, fpType='ecfp', radius=4):
    """Map each molecule name in an SD stream to ``[fingerprint, SMILES]``.

    Molecules are read from ``fh`` with hydrogens kept. The name comes from
    ``getName(mol, position, molID)``, where ``position`` is the 1-based
    index of the record in the stream and unparsable records still advance
    it. The fingerprint is computed by ``getFP`` on a copy with hydrogens
    added; the SMILES is generated from the molecule as read.
    """
    fpD = {}
    records = Chem.ForwardSDMolSupplier(fh, removeHs=False)
    for position, mol in enumerate(records, start=1):
        if mol is None:
            continue
        name = getName(mol, position, molID)
        mol.UpdatePropertyCache(strict=False)
        with_hs = Chem.AddHs(mol, addCoords=True)
        fingerprint = getFP(with_hs, fpType, radius)
        fpD[name] = [fingerprint, Chem.MolToSmiles(mol)]
    return fpD

Example #19

Show file

def get_molecules_from_sdf_bytes(dataset):
    """
    Build RDKit molecules from SD data held in memory.

    Entries the supplier yields as falsy (unparsable records) are left out.

    :param dataset: bytearray with molecules
    :return: list of RDKit molecules
    :type dataset: bytearray
    :rtype: list
    """
    supplier = Chem.ForwardSDMolSupplier(io.BytesIO(dataset))
    molecules = []
    for candidate in supplier:
        if candidate:
            molecules.append(candidate)
    return molecules

Example #20

Show file

def rdkit_open(File_Tuple):
    """Read all molecules from a collection of structure files.

    The file type is detected from the file name: ``.sdf`` (read with the
    forward-only supplier when the name ends in ``.gz`` or ``.bz2``),
    ``.smi`` (the first line decides whether the file has a title line) and
    ``.mol2``. Entries a supplier returns as ``None`` are dropped.

    :param File_Tuple: iterable of file names.
    :return: one flat list with the molecules of all files, in input order.
    """
    List = []

    for f in File_Tuple:
        handle = file_handle(f)
        # Start every file with an empty result. A name matching none of the
        # patterns used to re-append the previous file's molecules, or
        # raise NameError for the first file.
        Mol = []

        # Dots are escaped so that only real extensions match; the bare
        # '.smi' pattern also matched names such as "my_smiles.sdf".
        if re.search(r'\.sdf', f):
            if re.search(r'\.gz$|\.bz2$', f):
                Mol = [
                    x
                    for x in Chem.ForwardSDMolSupplier(handle, removeHs=False)
                    if x is not None
                ]
            else:
                Mol = [
                    x for x in Chem.SDMolSupplier(handle, removeHs=False)
                    if x is not None
                ]

        if re.search(r'\.smi', f):
            with handle as fi:
                first_line = fi.readline()

            # A first line mentioning "smiles" is taken to be a header.
            has_title = bool(re.search(r'smiles', first_line, re.IGNORECASE))
            Mol = [
                x for x in Chem.SmilesMolSupplier(
                    f, titleLine=has_title, delimiter=' |\t|,')
                if x is not None
            ]

        ## not the official RDkit function, may fail
        if re.search(r'\.mol2', f):
            Mol = [
                x for x in Mol2MolSupplier(f, removeHs=False) if x is not None
            ]

        print("# Found mol in {0}: {1}".format(f, len(Mol)))
        List.extend(Mol)

    gc.collect()
    return List

Example #21

Show file

File: MolecularRepresentations.py Project: wxlsummer/chemical_clustering

def readAndCreateFingerprint(file_name, counts=False):
    """Compute radius-2 Morgan fingerprints for every parsable molecule.

    :param file_name: SD input handed to ``Chem.ForwardSDMolSupplier``;
        hydrogens are kept.
    :param counts: when False, 1024-bit bit-vector fingerprints are
        returned; when True, ``AllChem.GetMorganFingerprint`` is used.
    :return: list of fingerprints, one per molecule that could be parsed.
    """
    if not counts:
        return [
            AllChem.GetMorganFingerprintAsBitVect(m, 2, 1024)
            for m in Chem.ForwardSDMolSupplier(file_name, removeHs=False)
            if m is not None
        ]

    # The same dict is passed as bitInfo for every molecule and is not
    # returned, exactly as before.
    info = {}
    # The class is ``ForwardSDMolSupplier``; the former spelling
    # ``ForwardSDMOLSupplier`` raised AttributeError whenever counts=True.
    return [
        AllChem.GetMorganFingerprint(m, 2, bitInfo=info)
        for m in Chem.ForwardSDMolSupplier(file_name, removeHs=False)
        if m is not None
    ]

Example #22

Show file

File: extractData.py Project: bet-gregori/phitools

def extractField(args):
    """Write a tab-separated table of molecule names and one SD field.

    Reads ``args.sdf``, writes to ``args.out`` and reports the property
    named ``args.field``. Molecules lacking the property get ``NA``.
    Unparsable records are skipped but still advance the 1-based position
    passed to ``mh.getName``.
    """
    # Header line first.
    header = '\t'.join(['Name', args.field])
    args.out.write('{}\n'.format(header))

    # One output line per parsable molecule.
    for position, m in enumerate(Chem.ForwardSDMolSupplier(args.sdf), start=1):
        if m is None:
            continue
        name = mh.getName(m, position)
        value = m.GetProp(args.field) if m.HasProp(args.field) else 'NA'
        args.out.write('{}\n'.format('\t'.join([name, value])))

Example #23

Show file

File: read_input.py Project: meddwl/rdkit-scripts

def __read_sdf(fname, input_format, id_field_name=None, sanitize=True):
    """Yield ``(PropertyMol, title)`` pairs for the molecules of an SD file.

    :param fname: path of the input file.
    :param input_format: ``'sdf'`` or ``'sdf.gz'``; any other value makes
        the generator finish without yielding anything.
    :param id_field_name: name of the property to use as title. When None,
        the ``_Name`` property is used, or the isomeric SMILES if that
        property is empty.
    :param sanitize: forwarded to the RDKit supplier.

    Unparsable records are skipped. The gzip handle opened for ``'sdf.gz'``
    input is closed when the generator is exhausted or closed (it used to
    be left open).
    """
    gz_file = None
    if input_format == 'sdf':
        suppl = Chem.SDMolSupplier(fname, sanitize=sanitize)
    elif input_format == 'sdf.gz':
        gz_file = gzip.open(fname)
        suppl = Chem.ForwardSDMolSupplier(gz_file, sanitize=sanitize)
    else:
        return
    try:
        for mol in suppl:
            if mol is None:
                continue
            if id_field_name is not None:
                mol_title = mol.GetProp(id_field_name)
            else:
                mol_title = mol.GetProp("_Name")
                if not mol_title:
                    mol_title = Chem.MolToSmiles(mol, isomericSmiles=True)
            yield PropertyMol(mol), mol_title
    finally:
        if gz_file is not None:
            gz_file.close()

Example #24

Show file

File: dataprep.py Project: pk-organics/AutomatedSeriesClassification

def main(directory: str, chebml_version: str):
    """Download the ChEBML data.

    Creates ``directory`` if needed and then, skipping every step whose
    result already exists there:

    1. downloads the file from ``bradley_url`` (a failure is reported and
       otherwise ignored);
    2. downloads the SD file of the requested ChEMBL release;
    3. builds a pickled ``SubstructLibrary`` from all molecules of that SD
       file that parse and have at most 50 atoms.
    """
    os.makedirs(directory, exist_ok=True)

    bradley_path = os.path.join(directory, 'jm020472j_s2.xls')
    if not os.path.exists(bradley_path):
        try:
            wget.download(bradley_url, out=directory)
        except Exception:
            # Still best effort, but Ctrl-C during the download is no
            # longer swallowed as it was by the bare ``except``.
            click.echo('There goes ACS stopping science')

    chembl_url = (
        f'ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/'
        f'chembl_{chebml_version}/chembl_{chebml_version}.sdf.gz')

    sdf_path = os.path.join(directory, f'chembl_{chebml_version}.sdf.gz')
    if not os.path.exists(sdf_path):
        wget.download(chembl_url, out=directory)

    sss_path = os.path.join(directory, f'chembl{chebml_version}_sssdata.pkl')
    if not os.path.exists(sss_path):
        click.echo(f'RDKit Version: {rdBase.rdkitVersion}')
        data = []

        with gzip.GzipFile(sdf_path) as gz:
            suppl = Chem.ForwardSDMolSupplier(gz)
            for mol in tqdm(suppl,
                            desc=f'Processing ChEBML {chebml_version}',
                            unit_scale=True):
                # Skip unparsable records and molecules above 50 atoms.
                if mol is None or mol.GetNumAtoms() > 50:
                    continue
                fp = Chem.PatternFingerprint(mol)
                smi = Chem.MolToSmiles(mol)
                data.append((smi, fp))

        click.echo(f'Outputting to {sss_path}')
        with open(sss_path, 'wb') as file:
            mols = rdSubstructLibrary.CachedTrustedSmilesMolHolder()
            fps = rdSubstructLibrary.PatternHolder()
            for smi, fp in data:
                mols.AddSmiles(smi)
                fps.AddFingerprint(fp)
            library = rdSubstructLibrary.SubstructLibrary(mols, fps)
            pickle.dump(library, file, protocol=pickle.HIGHEST_PROTOCOL)

    click.echo('Done ;)')

Example #25

Show file

def default_open_input_sdf(inputDef):
    """Open an SD input for RDKit's ForwardSDMolSupplier.

    :param inputDef: The name of the file (opened through utils.open_file). If None then STDIN is used (and assumed not to be gzipped)
    :return: tuple of the opened input object and the supplier reading from it
    """
    if inputDef:
        source = utils.open_file(inputDef)
    elif sys.version_info[0] >= 3:
        # On Python 3 RDKit needs the binary representation of the input
        # stream, which is exposed as stdin.buffer.
        source = sys.stdin.buffer
    else:
        source = sys.stdin
    return source, Chem.ForwardSDMolSupplier(source)

Example #26

Show file

def __read_stdin_sdf(sanitize=True, removeHs=True):
    """Yield ``(mol, title)`` pairs for SD records read from standard input.

    Lines are accumulated until a line equal to ``'$$$$\\n'`` closes the
    record. The accumulated text is then parsed with the given ``sanitize``
    and ``removeHs`` settings and the first molecule the supplier returns
    is taken. The title is the first line of the record; when that line is
    empty, the isomeric SMILES of the molecule is used instead.
    """
    pending = []
    for line in iter(sys.stdin.readline, ''):
        pending.append(line)
        if line != '$$$$\n':
            continue
        molblock = ''.join(pending)
        pending = []
        payload = BytesIO(molblock.encode('utf-8'))
        supplier = Chem.ForwardSDMolSupplier(payload,
                                             sanitize=sanitize,
                                             removeHs=removeHs)
        mol = list(supplier)[0]
        mol_title = molblock.split('\n', 1)[0]
        if not mol_title:
            mol_title = Chem.MolToSmiles(mol, isomericSmiles=True)
        yield mol, mol_title

Example #27

Show file

File: collection.py Project: choderalab/espaloma

def zinc(first=-1, *args, **kwargs):
    """ ZINC collection.

    Downloads ``parm_at_Frosst.tgz`` into the working directory if it is not
    there yet, reads ``parm_at_Frosst/zinc.sdf`` from the archive (hydrogens
    kept) and converts every molecule into an ``esp.Graph``. Molecules that
    cannot be converted are skipped.

    Parameters
    ----------
    first : int
        Stop once this many molecules have been converted; ``-1`` (default)
        reads the whole file.
    *args, **kwargs
        Forwarded to ``esp.data.dataset.GraphDataset``.

    ..[1] Irwin, John J, and Brian K Shoichet.
    “ZINC
    --a free database of commercially available compounds for virtual screening.”
    Journal of chemical information and modeling
    vol. 45,1 (2005): 177-82. doi:10.1021/ci049714+
    """
    import tarfile
    from os.path import exists
    from openff.toolkit.topology import Molecule
    from rdkit import Chem

    fname = "parm_at_Frosst.tgz"
    url = "http://www.ccl.net/cca/data/parm_at_Frosst/parm_at_Frosst.tgz"

    if not exists(fname):
        import urllib.request

        urllib.request.urlretrieve(url, fname)

    count = 0
    gs = []

    # The archive has to stay open while the supplier reads from it; the
    # context manager closes it afterwards (it used to be left open).
    with tarfile.open(fname) as archive:
        zinc_file = archive.extractfile("parm_at_Frosst/zinc.sdf")
        _mols = Chem.ForwardSDMolSupplier(zinc_file, removeHs=False)

        for mol in _mols:
            try:
                gs.append(
                    esp.Graph(
                        Molecule.from_rdkit(mol,
                                            allow_undefined_stereo=True)))
                count += 1
            except Exception:
                # Best effort: skip molecules that cannot be converted, but
                # let KeyboardInterrupt/SystemExit through (the bare
                # ``except`` swallowed those too).
                pass

            if first != -1 and count >= first:
                break

    return esp.data.dataset.GraphDataset(gs, *args, **kwargs)

Example #28

Show file

File: sdf2sql.py Project: tjod/fragments

def processSDF(cursor, filename, split, storeMolblock):
    """Load every record of an SD file into the database behind ``cursor``.

    :param cursor: database cursor passed on to ``addMol`` and ``addProp``.
    :param filename: SD file; a name ending in ``.gz`` is read through gzip.
    :param split: when true, each property value is split on ``,`` and every
        piece is stored as its own property row.
    :param storeMolblock: when true, the text of each record up to and
        including ``M  END`` is stored with the molecule. This is not
        possible for gzip input, where a warning is printed instead.
    :return: number of records read, including unparsable ones.

    Unparsable records are still passed to ``addMol`` (as a falsy molecule)
    and reported on stderr.
    """
    nmol = 0
    inf = None

    if filename.endswith(".gz"):
        import gzip
        inf = gzip.open(filename)
        suppl = Chem.ForwardSDMolSupplier(inf)
        if storeMolblock:
            print("Warning: cannot store molblock from gzip file",
                  file=sys.stderr)
    else:
        suppl = Chem.SDMolSupplier(filename)

    # The supplier type is fixed for the whole run, so test once whether it
    # can return the raw text of a record.
    can_fetch_text = hasattr(suppl, "GetItemText")

    try:
        for mol in suppl:
            molstore = None
            if can_fetch_text:
                molblock, sep, _ = suppl.GetItemText(nmol).partition('M  END')
                if storeMolblock:
                    molstore = molblock + sep
            nmol += 1
            imol = addMol(cursor, mol, molstore, nmol)
            if mol:
                for p in mol.GetPropNames():
                    if split:
                        for sp in mol.GetProp(p).split(","):
                            addProp(cursor, imol, p, sp)
                    else:
                        addProp(cursor, imol, p, mol.GetProp(p))
            else:
                print("Error adding molecule #%d" % nmol, file=sys.stderr)
    finally:
        # The gzip handle used to be left open.
        if inf is not None:
            inf.close()

    return nmol

Example #29

Show file

def usrcat_write_binary(sdf_file_path: Path, gzip_output_binary: bool = False):
    """Write USRCAT descriptors of an SD file to a binary file plus a SMILES
    index.

    For every parsable molecule with more than two heavy atoms one record
    (running number followed by the ``GetUSRCAT`` descriptors, packed with
    ``usrcat_binary_struct_format_string``) is appended to
    ``<sdf>.usrcatsl.bin`` (``.gz`` appended when ``gzip_output_binary``),
    and one ``"<smiles> <name>"`` line is appended to ``<sdf>.usrcatsl.smi``.

    :param sdf_file_path: SD input; a name ending in ``.gz`` is read through
        gzip.
    :param gzip_output_binary: append ``.gz`` to the binary file name.
    :raises AssertionError: if the input does not exist or the output
        binary already exists.
    """
    assert sdf_file_path.exists(), "SDF file not found"
    binary_file_path = Path(str(sdf_file_path) + ".usrcatsl.bin")
    if gzip_output_binary:
        binary_file_path = Path(str(binary_file_path) + ".gz")
    assert not Path(binary_file_path).exists(), "Output binary exists"

    num_gets = 0
    num_good_mols = 0
    output_binary = None
    output_smiles_index = None
    gz_compressed_file = None
    try:
        output_binary = open_file_may_be_gzipped(binary_file_path, "wb")
        output_smiles_index = open_file_may_be_gzipped(
            Path(str(sdf_file_path) + ".usrcatsl.smi"), "w")
        bar = progressbar.ProgressBar(prefix="Generating binary")
        # One reusable buffer for the packed record of each molecule.
        pos_and_desc_bytes = bytearray(
            struct.calcsize(usrcat_binary_struct_format_string))
        if str(sdf_file_path).endswith(".gz"):
            gz_compressed_file = gzip.open(str(sdf_file_path))
            sdf_reader = Chem.ForwardSDMolSupplier(gz_compressed_file)
        else:
            sdf_reader = Chem.SDMolSupplier(str(sdf_file_path))

        for mol in sdf_reader:
            num_gets += 1
            if mol is not None:
                if mol.GetNumHeavyAtoms() > 2:
                    num_good_mols += 1
                    usrcat_descriptos = GetUSRCAT(mol)
                    struct.pack_into(usrcat_binary_struct_format_string,
                                     pos_and_desc_bytes, 0, num_good_mols,
                                     *usrcat_descriptos)
                    # Note we use num_good mols, this means that the first line is #1, not 0 - the smiles lines are not zero-indexed.
                    output_binary.write(pos_and_desc_bytes)
                    output_smiles_index.write(
                        Chem.MolToSmiles(mol) + " " + mol.GetProp("_Name") +
                        "\n")
                if num_good_mols % 1000 == 0:
                    bar.update(num_gets)
        bar.update(num_gets)
    finally:
        # Close whatever was opened, also when processing fails; the gzip
        # input handle was previously never closed.
        for opened in (gz_compressed_file, output_smiles_index,
                       output_binary):
            if opened is not None:
                opened.close()
    print("Num gets", num_gets)
    print("Num good mols", num_good_mols)

Example #30

Show file

def LoadSDF(filename, smilesName='SMILES', idName='ID',molColName = 'ROMol',includeFingerprints=False):
  """ Read file in SDF format and return as Panda data frame

  filename may be a path or an already opened binary file object; in both
  cases the file is closed before returning, as before.

  Each parsable record becomes one row holding its SD properties, its title
  under idName (when present) and its SMILES under smilesName. The index is
  the zero-based position of the record in the file. The molecule column is
  added by AddMoleculeColumnToFrame. When no record can be parsed an empty
  DataFrame is returned.
  """
  if isinstance(filename, str):
    f = open(filename, 'rb') #'rU')
  else:
    f = filename
  records = []
  indices = []
  try:
    for i, mol in enumerate(Chem.ForwardSDMolSupplier(f)):
      if mol is None: continue
      row = {k: mol.GetProp(k) for k in mol.GetPropNames()}
      if mol.HasProp('_Name'): row[idName] = mol.GetProp('_Name')
      row[smilesName] = Chem.MolToSmiles(mol)
      records.append(row)
      indices.append(i)
  finally:
    f.close()
  # Build the frame once: DataFrame.append copied the whole frame for every
  # row and no longer exists in pandas 2.
  df = pd.DataFrame(records, index=indices)
  if records:
    AddMoleculeColumnToFrame(df, smilesCol=smilesName, molCol = molColName,includeFingerprints=includeFingerprints)
  return df