def write_sdf_file(scaffold_graph, output_file):
    """Write an SDF file from a scaffoldgraph

    Scaffold nodes are sorted by their ``hierarchy`` node attribute and
    numbered from zero in that order. Each scaffold whose SMILES can be
    parsed is written with its number as the title (``_Name``) and the
    properties ``HIERARCHY``, ``SMILES`` and ``SUBSCAFFOLDS`` (the numbers of
    the node's predecessors in the graph). Scaffolds whose SMILES cannot be
    parsed are skipped.

    Parameters
    ----------
    scaffold_graph (sg.ScaffoldGraph): graph to be converted
    output_file (str): path to output file
    """
    sorted_scaffolds = sorted(
        scaffold_graph.get_scaffold_nodes(data=True),
        key=lambda x: x[1]['hierarchy'],
    )
    # Scaffold SMILES -> position in the hierarchy-sorted order.
    mapping = {
        smiles: index for index, (smiles, _) in enumerate(sorted_scaffolds)
    }
    writer = SDWriter(output_file)
    try:
        for scaffold, data in sorted_scaffolds:
            molecule = MolFromSmiles(scaffold)
            if molecule is None:
                continue
            # SetProp only accepts string values, so the numeric id is converted.
            molecule.SetProp('_Name', str(mapping[scaffold]))
            # Read the same lower-case key the sort above relies on.
            molecule.SetIntProp('HIERARCHY', int(data['hierarchy']))
            molecule.SetProp('SMILES', scaffold)
            molecule.SetProp(
                'SUBSCAFFOLDS',
                ', '.join(
                    str(mapping[s])
                    for s in scaffold_graph.predecessors(scaffold)
                ),
            )
            writer.write(molecule)
    finally:
        # Always flush and release the output file, even on failure.
        writer.close()
def load_smiles_file(it):
    """Yield named molecules parsed from lines of ``<smiles> <identifier>``.

    Parameters
    ----------
    it : iterable
        Lines to parse. Each non-blank line must hold at least two
        whitespace-separated fields: the SMILES string and the identifier.
        Any further fields are ignored. ``bytes`` lines are decoded as UTF-8.

    Yields
    ------
    Molecule built by ``MolFromSmiles`` with ``_Name`` set to the identifier.
    Lines whose SMILES cannot be parsed are skipped, as are blank lines.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than two fields.
    """
    for line in it:
        # str(b'...') would produce the literal "b'...'" text, so decode instead.
        text = line.decode('utf-8') if isinstance(line, bytes) else str(line)
        fields = text.split()
        if not fields:
            # Blank (e.g. trailing) lines carry no record.
            continue
        if len(fields) < 2:
            raise ValueError(
                'Expected "<smiles> <identifier>" but got: {!r}'.format(text))
        smiles, cid = fields[:2]
        mol = MolFromSmiles(smiles)
        if mol is not None:
            mol.SetProp('_Name', cid)
            yield mol
def filled_fragmentsdb(fragmentsdb, myshelve):
    """Fill ``fragmentsdb`` with sample data and return it.

    Loads the fragments from ``myshelve``, adds one molecule named
    ``1muu_GDX_frag7`` and registers two PDB records (``1muu`` and ``2n2k``).

    Parameters
    ----------
    fragmentsdb
        Database object providing ``add_fragments_from_shelve``,
        ``add_molecule`` and ``add_pdbs``.
    myshelve
        Shelve passed straight to ``add_fragments_from_shelve``.

    Returns
    -------
    The same ``fragmentsdb`` object that was passed in.
    """
    fragmentsdb.add_fragments_from_shelve(myshelve)

    fragment_mol = MolFromSmiles(
        '[*]COP(=O)([O-])OP(=O)([O-])OC1OC(C(=O)[O-])C(O)C(O)C1O')
    fragment_mol.SetProp('_Name', '1muu_GDX_frag7')
    fragmentsdb.add_molecule(fragment_mol)

    pdb_1muu = dict(
        chainId='A',
        structureId='1muu',
        structureTitle='2.0 A crystal structure of GDP-mannose dehydrogenase',
        ecNo='1.1.1.132',
        uniprotAcc='P11759',
        compound='GDP-mannose 6-dehydrogenase',
        uniprotRecommendedName='GDP-mannose 6-dehydrogenase',
    )
    # pdbs which have no fragment should be skipped
    pdb_2n2k = dict(
        chainId='A',
        structureId='2n2k',
        structureTitle='Ensemble structure of the closed state of Lys63-linked diubiquitin in the absence of a ligand',
        ecNo=None,
        uniprotAcc='P0CG48',
        compound='ubiquitin',
        uniprotRecommendedName='Polyubiquitin-C',
    )
    fragmentsdb.add_pdbs([pdb_1muu, pdb_2n2k])
    return fragmentsdb
def __next__(self):
    """Return the next molecule from the supplier, or ``None`` if unparseable.

    Each supplier record is indexed as ``values[0]`` (SMILES), ``values[1]``
    (name) and, when ``self.data_cols`` is not ``None``, ``values[2]`` (data
    values paired positionally with ``self.data_cols``). The name and data
    values are stored on the molecule as string properties.

    ``self.cursor`` is advanced once for every record taken from the
    supplier. ``StopIteration`` from the supplier propagates unchanged.
    """
    values = next(self.supplier)
    # Remember this record's position for the log message, then count it as
    # consumed regardless of whether parsing succeeds.
    position = self.cursor
    self.cursor += 1

    mol = MolFromSmiles(values[0])
    if mol is None:
        # Explicit check instead of catching AttributeError, which would also
        # hide unrelated attribute errors behind a "could not be parsed" log.
        logger.warning('Molecule {} : {} could not be parsed'.format(
            position, values[0]))
        return None

    mol.SetProp('_Name', str(values[1]))
    if self.data_cols is not None:
        for key, value in zip(self.data_cols, values[2]):
            mol.SetProp(str(key), str(value))
    return mol
def parse_smi(self, smi):
    """Parse ``smi`` into a molecule, using ``self.mol`` as a cache.

    ``smi`` is split on whitespace: the first field is the SMILES, the
    optional second field is a name. If the SMILES is already a key of
    ``self.mol`` the stored molecule is returned as-is (the name field is
    ignored in that case). Otherwise the SMILES is parsed, stored in
    ``self.mol`` and, if a name was given, ``_Name`` is set on it.

    Once the cache holds ``self.maxsmi`` entries, one entry is evicted with
    ``popitem()`` before the new molecule is stored.

    Returns the molecule, or ``None`` when ``smi`` is blank or cannot be
    parsed.
    """
    fields = smi.split()
    if not fields:
        # Blank input has no SMILES field; report it like a parse failure.
        return None
    asmi = fields[0]
    aname = fields[1] if len(fields) > 1 else None

    if asmi in self.mol:
        # NOTE: the cached object itself is returned, not a copy; returning a
        # copy would be slower but would protect the cache from mutation.
        return self.mol[asmi]

    newmol = MolFromSmiles(asmi)
    if not newmol:
        return None

    # Make room when the cache is full; the emptiness check avoids a KeyError
    # from popitem() when maxsmi is zero or negative.
    if self.mol and len(self.mol) >= self.maxsmi:
        self.mol.popitem()
    self.mol[asmi] = newmol
    if aname:
        newmol.SetProp("_Name", aname)
    return newmol
def write_scaffold(self, scaffold):
    """Write one scaffold to ``self.output``.

    When ``self.args.sdf`` is false, a tab-separated text row of id,
    hierarchy, SMILES and sub-scaffold ids is written. Otherwise the scaffold
    SMILES is parsed and the molecule is written with ``_Name``,
    ``HIERARCHY``, ``SMILES`` and ``SUBSCAFFOLDS`` properties; a SMILES that
    cannot be parsed is logged as a warning and nothing is written.
    """
    # Comma-separated ids of the scaffold's sub-scaffolds.
    child_ids = ', '.join(str(child.id) for child in scaffold.subscaffolds)

    if not self.args.sdf:
        row = '{0}\t{1}\t{2}\t{3}\n'.format(
            scaffold.id, scaffold.hierarchy, scaffold.smiles, child_ids)
        self.output.write(row)
        return

    mol = MolFromSmiles(scaffold.smiles)
    if mol is None:
        logger.warning(f'Failed to parse scaffold: {scaffold.smiles}')
        return

    mol.SetProp('_Name', str(scaffold.id))
    mol.SetIntProp('HIERARCHY', scaffold.hierarchy)
    mol.SetProp('SMILES', scaffold.smiles)
    mol.SetProp('SUBSCAFFOLDS', child_ids)
    self.output.write(mol)
def classify(sdf, label, lambdas):
    """Assign a class to every molecule of an SD file and write a new SD file.

    For each molecule the property ``label`` is converted with ``floatify``
    and tested against the predicates in ``lambdas``; the key of the first
    predicate that returns a truthy value is stored in the property
    ``<label>_class``. Molecules are written to ``<sdf stem>_class.sdf``.

    A molecule is skipped, with a message on stderr, when it cannot be read,
    lacks the ``label`` property, has a value ``floatify`` maps to ``None``,
    or matches no predicate. If writing a molecule fails, it is rebuilt from
    its ``SMILES`` property with 2D coordinates and written again; if that
    also fails the molecule is skipped.

    Parameters
    ----------
    sdf : str
        Path of the input SD file.
    label : str
        Name of the property holding the value to classify.
    lambdas : dict
        Maps class key -> predicate taking the converted property value.

    Notes
    -----
    All diagnostics use the same zero-based record index that is echoed to
    stdout as progress output.
    """
    new_filename = "%s_class.sdf" % sdf.split('.sdf')[0]
    new_label = label + "_class"
    sdm = ForwardSDMolSupplier(sdf, strictParsing=False, removeHs=False,
                               sanitize=False)
    sdw = SDWriter(new_filename)
    try:
        for counter, mol in enumerate(sdm):
            print(counter)
            sys.stdout.flush()
            if mol is None:
                print("%d rdkit couldn't read molecule" % counter,
                      file=sys.stderr)
                sys.stderr.flush()
                continue

            if not mol.HasProp(label):
                # GetProp would raise KeyError and abort the whole run.
                print("%d molecule has no property '%s' ..skip"
                      % (counter, label), file=sys.stderr)
                sys.stderr.flush()
                continue

            raw_value = mol.GetProp(label)
            prop = floatify(raw_value)
            if prop is None:
                print("couldn't convert %s to float or int...skip" % raw_value,
                      file=sys.stderr)
                sys.stderr.flush()
                continue

            c = None
            for k, l in lambdas.items():
                if l(prop):
                    c = k
                    print("hit %s" % k)
                    sys.stdout.flush()
                    break
            if c is None:
                print("%d no prop range matched '%s' ..skip"
                      % (counter, raw_value), prop, type(prop),
                      file=sys.stderr)
                sys.stderr.flush()
                sys.stdout.flush()
                continue

            # SetProp only accepts strings; str() is a no-op for string keys.
            class_value = str(c)
            mol.SetProp(new_label, class_value)
            try:
                sdw.write(mol)
            except Exception:
                print("couldn't write mol %d to file, try to build mol from smiles"
                      % counter, file=sys.stderr)
                try:
                    rebuilt = MolFromSmiles(mol.GetProp("SMILES"))
                    if rebuilt is None:
                        raise ValueError("SMILES could not be parsed")
                    AllChem.Compute2DCoords(rebuilt)
                    rebuilt.SetProp(new_label, class_value)
                    sdw.write(rebuilt)
                except Exception:
                    # Covers a missing SMILES property, an unparseable SMILES
                    # and a second write failure.
                    print("couldn't write mol %d to file...skip" % counter,
                          file=sys.stderr)
    finally:
        # Close the writer even when an unexpected error aborts the loop.
        sdw.close()
def write_sdf_file(scaffold_graph, output_file):
    """Write an SDF file from a ScaffoldGraph.

    All scaffolds in the scaffoldgraph are written to the SDF, while
    molecules are ignored. Scaffolds are sorted in ascending order according
    to their hierarchy level. Scaffolds whose SMILES cannot be parsed are
    skipped. The output follows the standard SDF specification with the added
    property fields:

        TITLE field: scaffold ID
        SUBSCAFFOLDS field: list of sub-scaffold IDs
        HIERARCHY field: hierarchy level of scaffold
        SMILES field: scaffold canonical SMILES

    Scaffold IDs are the zero-based positions in the hierarchy-sorted order.

    Parameters
    ----------
    scaffold_graph : scaffoldgraph.core.ScaffoldGraph
        ScaffoldGraph to be written to an SDF.
    output_file : str
        Filepath to an output file.

    """
    sorted_scaffolds = sorted(
        scaffold_graph.get_scaffold_nodes(data=True),
        key=lambda x: x[1]['hierarchy'],
    )
    # Scaffold SMILES -> scaffold ID.
    mapping = {
        smiles: index for index, (smiles, _) in enumerate(sorted_scaffolds)
    }
    writer = SDWriter(output_file)
    try:
        for scaffold, data in sorted_scaffolds:
            molecule = MolFromSmiles(scaffold)
            if molecule is None:
                continue
            # SetProp only accepts string values, so the numeric ID is converted.
            molecule.SetProp('_Name', str(mapping[scaffold]))
            # Read the same lower-case key the sort above relies on.
            molecule.SetIntProp('HIERARCHY', int(data['hierarchy']))
            molecule.SetProp('SMILES', scaffold)
            molecule.SetProp(
                'SUBSCAFFOLDS',
                ', '.join(
                    str(mapping[s])
                    for s in scaffold_graph.predecessors(scaffold)
                ),
            )
            writer.write(molecule)
    finally:
        # Always flush and release the output file, even on failure.
        writer.close()
def testCreateRd(self):
    """Check the pharmacophore built from ``self.string`` by the RDKit toolkit.

    ``self.string`` is split into a SMILES and a name; the resulting molecule
    is converted with ``rd.phar_from_mol`` and its node count, edge count
    (number of positive ``edges`` entries divided by two) and per-type node
    counts are compared with ``self.numnodes``, ``self.numedges`` and
    ``self.types``.
    """
    from rdkit.Chem import MolFromSmiles
    import decaf.toolkits.rd as rd

    smiles, title = self.string.split()
    molecule = MolFromSmiles(smiles)
    molecule.SetProp("_Name", title)
    pharmacophore = rd.phar_from_mol(molecule)

    self.assertEqual(pharmacophore.numnodes, self.numnodes)
    positive_entries = np.sum(pharmacophore.edges > 0)
    self.assertEqual(positive_entries / 2.0, self.numedges)

    # Tally how many nodes carry each of the expected types.
    type_counts = dict.fromkeys(self.types, 0)
    for node_index in range(pharmacophore.numnodes):
        node_types = pharmacophore.nodes[node_index]["type"]
        for type_name in node_types.keys():
            type_counts[type_name] += 1
    self.assertEqual(type_counts, self.types)
def __next__(self):
    """Return the next molecule from the supplier, or ``None`` if unparseable.

    Each supplier record is unpacked into ``(smiles, name)``; the name is
    stored on the molecule as the ``_Name`` property.

    ``self.cursor`` is advanced once for every record taken from the
    supplier. ``StopIteration`` from the supplier propagates unchanged.
    """
    smiles, name = next(self.supplier)
    # Remember this record's position for the log message, then count it as
    # consumed regardless of whether parsing succeeds.
    position = self.cursor
    self.cursor += 1

    mol = MolFromSmiles(smiles)
    if mol is None:
        # Explicit check instead of catching AttributeError, which would also
        # hide unrelated attribute errors behind a "could not be parsed" log.
        logger.warning('Molecule {} : {} could not be parsed'.format(
            position, smiles))
        return None

    mol.SetProp('_Name', str(name))
    return mol
def csv_to_sdf(csv_file, sdf_file, smiles_col, class_col, delim=','):
    """Convert a delimited text file with SMILES and class labels to an SD file.

    The first line of ``csv_file`` is treated as a header and skipped. Every
    other non-blank line is split on ``delim``; double quotes are removed
    from the SMILES and class fields. The class label is translated through
    the module-level ``activity_label_to_id_map`` and stored on the molecule
    as property ``TL``.

    Rows whose SMILES cannot be parsed are skipped with a warning.

    Parameters
    ----------
    csv_file : str
        Path of the input file.
    sdf_file : str
        Path of the SD file to write.
    smiles_col : int
        Index of the column holding the SMILES.
    class_col : int
        Index of the column holding the class label.
    delim : str
        Field separator (plain string split; quoted separators are not
        handled).

    Raises
    ------
    KeyError
        If a class label is not present in ``activity_label_to_id_map``.
    IndexError
        If a row has fewer columns than ``smiles_col`` / ``class_col`` need.
    """
    import warnings

    sdw = SDWriter(sdf_file)
    try:
        with open(csv_file) as fh:
            # Stream the file instead of loading every line into memory.
            for line_no, line in enumerate(fh, start=1):
                if line_no == 1:
                    continue  # header row
                stripped = line.strip()
                if not stripped:
                    continue  # blank (e.g. trailing) line
                line_split = stripped.split(delim)
                smiles = line_split[smiles_col].replace('"', '')
                act_class = line_split[class_col].replace('"', '')
                act_newLabel = activity_label_to_id_map[act_class]
                mol = MolFromSmiles(smiles)
                if mol is None:
                    warnings.warn(
                        'line {}: could not parse SMILES {!r}, row skipped'
                        .format(line_no, smiles))
                    continue
                # SetProp only accepts strings; str() is a no-op for strings.
                mol.SetProp("TL", str(act_newLabel))
                sdw.write(mol)
    finally:
        # Close the writer even if a row raises.
        sdw.close()
def get_mols_from_smiles(smiles: List[str], **kwargs) -> List[Mol]:
    """
    Converts a list of smiles to a list of mol objects and adds the provided
    properties.

    The result has one entry per input SMILES, in the same order. A SMILES
    that cannot be parsed yields ``None`` at its position (no properties are
    set for it), so the result stays aligned with the input and with the
    property lists.

    Parameters
    ----------
    smiles : List[str]
        Iterable containing smiles strings
    kwargs : Dict[str, List[Any]]
        Every provided keyword argument will be added as a property to the
        resulting molecules. The key is the property name and the value has
        to be a list of values, which are converted with ``str``. The list
        for each keyword argument must have the same length as the list of
        smiles.
    """
    mols = []
    for ix, smi in enumerate(smiles):
        mol = MolFromSmiles(smi)
        if mol is not None:
            # Guarded so an unparseable SMILES behaves the same with and
            # without properties instead of raising AttributeError.
            for prop, values in kwargs.items():
                mol.SetProp(prop, str(values[ix]))
        mols.append(mol)
    return mols
def main(args, output=sys.stdout, log=logging):
    """Command-line entry point: generate conformers and write them as SDF.

    Parameters
    ----------
    args : list of str
        Command-line arguments (without the program name).
    output
        Destination passed to ``dump_conformers_sdf``.
    log
        Object providing ``info`` and ``critical`` logging methods.

    Returns
    -------
    int
        ``0`` on success, ``1`` when no conformers were selected.

    Notes
    -----
    Exits the process with status 2 (``sys.exit``) when the input molecule
    cannot be parsed or sanitized; argparse itself exits on invalid
    arguments.
    """
    # The text must be passed as ``description``: the first positional
    # parameter of ArgumentParser is ``prog`` (the program name in usage).
    parser = argparse.ArgumentParser(
        description="RDKit-based conformer generation proof-of-concept. "
                    "This program accepts either a mol2 file or a SMILES "
                    "string and produces an SD file")
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument('-m', '--mol2', type=str,
                             help="Mol2 file to generate conformers for")
    input_group.add_argument('-s', '--smiles', type=str,
                             help="SMILES string of molecule")
    parser.add_argument('-N', '--name', type=str, default=None,
                        help="Molecule name")
    parser.add_argument('-H', '--no-hydrogens', action='store_true',
                        default=False,
                        help="Do NOT explicitly add implicit Hydrogens to conformers [default: %(default)s]")
    parser.add_argument('-r', '--rmsd-threshold', type=float, default=2.0,
                        help="Only accept conformers that have an RMSD of at least this value from previously seen conformers [default: %(default)s]")
    parser.add_argument('-n', '--num-conformers', type=int, default=None,
                        help="Number of conformers to initially generate [default: auto]")
    parser.add_argument('-F', '--forcefield', type=str,
                        default=DEFAULT_FORCEFIELD,
                        choices=FORCEFIELDS.keys(),
                        help="Forcefield to use for optimization [default: %(default)s]")
    parser.add_argument('-P', '--parallelism', type=int, default=None,
                        help="Number of processes to use [default: 1]")
    params = parser.parse_args(args)

    # Load input molecule (argparse always defines both attributes; exactly
    # one of them is set because the group is required and exclusive).
    if params.mol2 is not None:
        mol = MolFromMol2File(params.mol2, sanitize=False)
    else:
        mol = MolFromSmiles(params.smiles, sanitize=False)

    if mol is None:
        log.critical("Could not parse molecule!")
        sys.exit(2)

    try:
        SanitizeMol(mol)
    except ValueError as e:
        log.critical("Could not sanitize molecule: {0}:".format(str(e)))
        sys.exit(2)
    except Exception:  # This is `Boost.Python.ArgumentError`
        log.critical("Could not parse molecule!")
        sys.exit(2)

    # Assign user-provided name if applicable
    if params.name is not None:
        mol.SetProp(RD_NAME, params.name)
    elif not mol.HasProp(RD_NAME):
        mol.SetProp(RD_NAME, 'Ligand')

    # Generate 3D conformers
    embedded, selected = generate_conformers(
        mol,
        add_hydrogens=not params.no_hydrogens,
        rmsd_threshold=params.rmsd_threshold,
        num_conformers=params.num_conformers,
        parallelism=params.parallelism,
        forcefield=params.forcefield,
        log=log)

    log.info("Conformers selected: {0}".format(len(selected)))
    if len(selected) == 0:
        # Nothing to report or write; indexing below would raise IndexError.
        log.critical("No conformers were generated!")
        return 1
    log.info("Energy: min={0:.4f} kcal/mol max={1:.4f} kcal/mol".format(
        selected[0][1], selected[-1][1]))

    # Conformer ids in the order given by `selected`
    # (presumably sorted by ascending energy -- confirm in generate_conformers)
    sorted_by_energy = [item[0] for item in selected]

    # Render SDF file
    names = dump_conformers_sdf(embedded, output, conf_ids=sorted_by_energy,
                                renumber=True)

    for name, (conf_id, energy) in zip(names, selected):
        log.info("\t{0}: {1:0.4f} kcal/mol".format(name, energy))

    return 0