def process():
    """Copy an SD-file while leaving out molecules listed in a second SD-file.

    Command line (read from sys.argv):
        [input.sdf] [exclude-molecules.sdf] [output.sdf]

    Every molecule of the exclusion file is passed through setupMolecule()
    and its Chem.calcHashCode() value is remembered. Each input molecule is
    prepared and hashed the same way; it is written to the output only if
    its hash code is not among the remembered ones. Dropped molecules,
    progress (every 10000 records) and a final summary are reported on
    stderr. Exits with status 2 when fewer than three arguments are given.
    """
    args = sys.argv

    if len(args) < 4:
        print('Usage:', args[0], '[input.sdf] [exclude-molecules.sdf] [output.sdf]', file=sys.stderr)
        sys.exit(2)

    in_stream = Base.FileIOStream(args[1], 'r')
    excl_stream = Base.FileIOStream(args[2], 'r')
    out_stream = Base.FileIOStream(args[3], 'w')

    mol_reader = Chem.SDFMoleculeReader(in_stream)
    excl_reader = Chem.SDFMoleculeReader(excl_stream)
    mol_writer = Chem.SDFMolecularGraphWriter(out_stream)

    # A single molecule object is reused for every record of both files.
    curr_mol = Chem.BasicMolecule()

    Chem.setMultiConfImportParameter(mol_reader, False)
    Chem.setMultiConfImportParameter(excl_reader, False)
    Chem.setMultiConfExportParameter(mol_writer, False)

    counters = Stats()
    counters.read = 0
    counters.dropped = 0

    def prepared_hash(molecule):
        # Both files must be prepared identically before hashing so that
        # their hash codes are comparable.
        setupMolecule(molecule)
        return Chem.calcHashCode(molecule)

    excluded_hashes = set()

    while excl_reader.read(curr_mol):
        excluded_hashes.add(prepared_hash(curr_mol))

    while mol_reader.read(curr_mol):
        if prepared_hash(curr_mol) not in excluded_hashes:
            mol_writer.write(curr_mol)
        else:
            counters.dropped += 1
            # The reported index is the zero-based position in the input file.
            print('Dropped Molecule ' + str(counters.read) + ': ' + Chem.generateSMILES(curr_mol) + ' ' + Chem.getName(curr_mol), file=sys.stderr)

        counters.read += 1

        if counters.read % 10000 == 0:
            print('Processed ' + str(counters.read) + ' Molecules...', file=sys.stderr)

    summary_lines = (
        '',
        '-- Summary --',
        'Molecules processed: ' + str(counters.read),
        'Molecules dropped: ' + str(counters.dropped),
    )

    for line in summary_lines:
        print(line, file=sys.stderr)
def CDPLmolFromSdf(sdf_path, conformation):
    '''
    Reads the first record of an sdf-file into a single CDPL Molecule.

    Input:
    sdf_path (string): path to the sdf file
    conformation (boolean): if true, the molecule is handed to
        _CDPLgenerateConformation() and that function's result is returned
        (originally documented as generating one random 3d conformation
        according to MMFF94)

    Return:
    (CDPL BasicMolecule): the molecule that was read, or the result of
        _CDPLgenerateConformation() when conformation is true.
        False is returned (after logging an error) if no record could be
        read from the file.
    '''
    mol = Chem.BasicMolecule()
    ifs = Base.FileIOStream(sdf_path, 'r')
    sdf_reader = Chem.SDFMoleculeReader(ifs)

    if not sdf_reader.read(mol):
        # The path is made part of the message itself. Passing it as an extra
        # positional argument without a format placeholder makes a stdlib-style
        # logger fail while formatting, so the path would never be reported.
        log.error("COULD NOT READ SDF: " + str(sdf_path))
        return False

    if conformation:
        return _CDPLgenerateConformation(mol)

    return mol
def process(sdf_file, psd_file_path):
    """Feed every molecule of an SD-file into a PSD screening database.

    Parameters:
        sdf_file: path of the SD-file to read (multi-conformer import is
            switched on for the reader).
        psd_file_path: path handed to Pharm.PSDScreeningDBCreator, which is
            opened in CREATE mode.
            NOTE(review): the meaning of the third constructor argument
            (True) is not visible here - confirm against the CDPL API.

    Each molecule is prepared with setupMolecule() and passed to the
    database creator. After every 100 molecules a progress line with the
    time spent on that batch is printed, and a summary of the creator's
    counters is printed at the end.

    The database creator is closed even if processing raises.
    """
    ifs = Base.FileIOStream(sdf_file, 'r')
    reader = Chem.SDFMoleculeReader(ifs)
    mol = Chem.BasicMolecule()

    Chem.setMultiConfImportParameter(reader, True)

    psd_creator = Pharm.PSDScreeningDBCreator(
        psd_file_path, Pharm.PSDScreeningDBCreator.CREATE, True)

    # time.clock() was removed in Python 3.8; prefer perf_counter() where it
    # exists and fall back to clock() on interpreters that lack it.
    timer = getattr(time, 'perf_counter', None) or time.clock

    i = 0
    t0 = timer()

    try:
        while reader.read(mol):
            setupMolecule(mol)
            psd_creator.process(mol)
            i += 1

            if i % 100 == 0:
                # Single-argument print(...) produces the same output under
                # both the Python 2 print statement and the Python 3 function.
                print('Processed ' + str(i) + ' molecules (' + str(timer() - t0) + ' s elapsed)...')
                # Restart the timer so the next report covers only its own batch.
                t0 = timer()

            mol.clear()

        print('')
        print('-- Summary --')
        print('Molecules processed: ' + str(psd_creator.numProcessed))
        print('Molecules rejected: ' + str(psd_creator.numRejected))
        print('Molecules deleted: ' + str(psd_creator.numDeleted))
        print('Molecules inserted: ' + str(psd_creator.numInserted))
    finally:
        # Always release the database, also when reading/processing fails.
        psd_creator.close()
import CDPL.Base as Base
import CDPL.Chem as Chem
import CDPL.Math as Math


def process():
    """Read a training set and experimental logP values for an XLogP regression.

    Command line (read from sys.argv):
        training-set.sdf logP-data regression-coeff-file

    Exits with status 2 when fewer than three arguments are given.

    NOTE(review): in the portion of the function visible here, exp_logp,
    mlr_model, xlogp_calc, histo and coeff_os are created but never used;
    the body presumably continues beyond this excerpt - confirm against
    the complete file.
    """
    if len(sys.argv) < 4:
        print('Usage:', sys.argv[0], 'training-set.sdf logP-data regression-coeff-file', file=sys.stderr)
        sys.exit(2)

    # Input structures, input logP values and the output coefficient file.
    struct_is = Base.FileIOStream(sys.argv[1], 'r')
    exp_logp_is = Base.FileIOStream(sys.argv[2], 'r')
    coeff_os = Base.FileIOStream(sys.argv[3], 'w')

    mlr_model = Math.DMLRModel()
    sdf_reader = Chem.SDFMoleculeReader(struct_is)
    mol = Chem.BasicMolecule()
    xlogp_calc = Chem.XLogPCalculator()
    histo = Math.DVector()

    # Size the vector to the calculator's feature vector length.
    histo.resize(Chem.XLogPCalculator.FEATURE_VECTOR_SIZE)

    Chem.setMultiConfImportParameter(sdf_reader, False)

    while sdf_reader.read(mol):
        # One line of the logP data file is consumed per molecule read, so the
        # two files are expected to list their entries in the same order.
        # float() raises ValueError if the line is not a valid number.
        exp_logp = float(exp_logp_is.readline())

        # Structure perception steps applied to every molecule.
        # NOTE(review): the second argument (False) is presumably an
        # "overwrite existing values" flag - confirm against the CDPL API.
        Chem.perceiveComponents(mol, False)
        Chem.perceiveSSSR(mol, False)
        Chem.setRingFlags(mol, False)
        Chem.calcImplicitHydrogenCounts(mol, False)
def cleanStructures():
    """Run processMolecule() over an SD-file and split kept/dropped molecules.

    Command line (read from sys.argv):
        [input.sdf] [output.sdf] [dropped.sdf] [start_index] [[count]]

    start_index: record index to start at; values > 0 make the reader skip
        ahead, negative values are treated as 0.
    count (optional): maximum number of molecules to process; 0 or a
        negative value means no limit.

    For every molecule read, processMolecule(mol, stats) is called. A truthy
    result is written to the output file; otherwise the original molecule is
    written to the dropped file and reported on stderr. Progress (every
    10000 records) and a final summary are printed to stderr.

    Exits with status 2 if too few arguments are given or if start_index /
    count are not integers. The numeric arguments are validated before any
    file is opened, so a bad argument can no longer truncate existing
    output files.
    """
    def exit_with_usage(error=None):
        # Report an optional error plus the usage line, then terminate.
        if error:
            print('Error: ' + error, file=sys.stderr)
        print('Usage:', sys.argv[0], '[input.sdf] [output.sdf] [dropped.sdf] [start_index] [[count]]', file=sys.stderr)
        sys.exit(2)

    if len(sys.argv) < 5:
        exit_with_usage()

    # Parse the numeric arguments first: opening the output streams in 'w'
    # mode creates/truncates the files, which must not happen for a command
    # line that is going to be rejected anyway.
    try:
        offset = int(sys.argv[4])
        count = int(sys.argv[5]) if len(sys.argv) > 5 else 0
    except ValueError:
        exit_with_usage('[start_index] and [count] must be integers')

    # A negative start index never triggered skipping, but it was still
    # subtracted in the statistics and the count check below, inflating the
    # "processed" numbers and ending the run too early. Treat it as 0.
    offset = max(offset, 0)

    ifs = Base.FileIOStream(sys.argv[1], 'r')
    ofs = Base.FileIOStream(sys.argv[2], 'w')
    dofs = Base.FileIOStream(sys.argv[3], 'w')

    reader = Chem.SDFMoleculeReader(ifs)
    writer = Chem.SDFMolecularGraphWriter(ofs)
    dwriter = Chem.SDFMolecularGraphWriter(dofs)
    mol = Chem.BasicMolecule()

    stats = Stats()
    stats.read = 0
    stats.dropped = 0
    # NOTE(review): stats.modified is not changed in this function; it is
    # presumably updated by processMolecule() - confirm.
    stats.modified = 0

    Chem.setMultiConfImportParameter(reader, False)
    Chem.setMultiConfExportParameter(writer, False)
    Chem.setMultiConfExportParameter(dwriter, False)

    if offset > 0:
        print('Skipping Molecules to Start Index ' + str(offset), file=sys.stderr)
        reader.setRecordIndex(offset)
        # stats.read tracks the absolute record index within the input file.
        stats.read = offset

    while reader.read(mol):
        proc_mol = processMolecule(mol, stats)

        if proc_mol:
            writer.write(proc_mol)
        else:
            stats.dropped += 1
            dwriter.write(mol)
            print('Dropped Molecule ' + str(stats.read) + ': ' + generateSMILES(mol) + ' ' + Chem.getName(mol), file=sys.stderr)

        stats.read += 1

        if stats.read % 10000 == 0:
            print('Processed ' + str(stats.read - offset) + ' Molecules...', file=sys.stderr)

        # Stop once the requested number of molecules has been handled.
        if count > 0 and (stats.read - offset) >= count:
            break

    print('', file=sys.stderr)
    print('-- Summary --', file=sys.stderr)
    print('Molecules processed: ' + str(stats.read - offset), file=sys.stderr)
    print('Molecules dropped: ' + str(stats.dropped), file=sys.stderr)
    print('Molecules modified: ' + str(stats.modified), file=sys.stderr)