def process():
    """Copy an SD-file while leaving out molecules found in an exclusion SD-file.

    Command line arguments (taken from ``sys.argv``):

    1. input SD-file to filter
    2. SD-file holding the molecules to exclude
    3. output SD-file receiving every input molecule that was not excluded

    Two molecules are treated as the same when ``Chem.calcHashCode`` yields
    equal values after ``setupMolecule`` has been applied to each of them.
    Dropped molecules, a progress line every 10000 records and a final
    summary are printed to stderr. If fewer than three arguments are given,
    a usage line is printed and the process exits with status 2.
    """
    argv = sys.argv

    if len(argv) < 4:
        print('Usage:', argv[0], '[input.sdf] [exclude-molecules.sdf] [output.sdf]', file=sys.stderr)
        sys.exit(2)

    in_stream = Base.FileIOStream(argv[1], 'r')
    excl_stream = Base.FileIOStream(argv[2], 'r')
    out_stream = Base.FileIOStream(argv[3], 'w')

    in_reader = Chem.SDFMoleculeReader(in_stream)
    excl_reader = Chem.SDFMoleculeReader(excl_stream)
    out_writer = Chem.SDFMolecularGraphWriter(out_stream)

    # Multi-conformer handling is switched off for all readers and the writer.
    Chem.setMultiConfImportParameter(in_reader, False)
    Chem.setMultiConfImportParameter(excl_reader, False)
    Chem.setMultiConfExportParameter(out_writer, False)

    # A single molecule instance is reused as the read buffer for both files.
    mol = Chem.BasicMolecule()

    stats = Stats()
    stats.read = 0
    stats.dropped = 0

    def hash_of(molecule):
        # Prepare the molecule, then compute the hash code used for matching.
        setupMolecule(molecule)
        return Chem.calcHashCode(molecule)

    # Pass 1: collect the hash codes of all molecules that must be excluded.
    excluded_hashes = set()

    while excl_reader.read(mol):
        excluded_hashes.add(hash_of(mol))

    # Pass 2: stream the input and forward only molecules with an unknown hash.
    while in_reader.read(mol):
        if hash_of(mol) not in excluded_hashes:
            out_writer.write(mol)
        else:
            stats.dropped += 1
            print('Dropped Molecule ' + str(stats.read) + ': ' + Chem.generateSMILES(mol) + ' ' + Chem.getName(mol), file=sys.stderr)

        stats.read += 1

        if stats.read % 10000 == 0:
            print('Processed ' + str(stats.read) + ' Molecules...', file=sys.stderr)

    summary_lines = (
        '',
        '-- Summary --',
        'Molecules processed: ' + str(stats.read),
        'Molecules dropped: ' + str(stats.dropped),
    )

    for line in summary_lines:
        print(line, file=sys.stderr)
def cleanStructures():
    """Run ``processMolecule`` over an SD-file and split the results into two files.

    Command line arguments (taken from ``sys.argv``):

    1. input SD-file
    2. output SD-file for the molecules returned by ``processMolecule``
    3. output SD-file for the molecules that were dropped
    4. start index; when greater than zero the reader is positioned on that
       record via ``setRecordIndex`` before reading starts
    5. optional number of molecules to process; omitted or ``0`` means that
       reading continues until the input is exhausted

    For every molecule read, ``processMolecule(mol, stats)`` is called. A
    truthy result is written to the regular output; otherwise the molecule as
    read is written to the dropped file and reported on stderr. A progress
    line every 10000 records and a final summary also go to stderr. If fewer
    than four arguments are given, a usage line is printed and the process
    exits with status 2.
    """
    argv = sys.argv

    if len(argv) < 5:
        print('Usage:', argv[0], '[input.sdf] [output.sdf] [dropped.sdf] [start_index] [[count]]', file=sys.stderr)
        sys.exit(2)

    in_stream = Base.FileIOStream(argv[1], 'r')
    out_stream = Base.FileIOStream(argv[2], 'w')
    dropped_stream = Base.FileIOStream(argv[3], 'w')

    start_index = int(argv[4])
    max_count = int(argv[5]) if len(argv) > 5 else 0

    in_reader = Chem.SDFMoleculeReader(in_stream)
    out_writer = Chem.SDFMolecularGraphWriter(out_stream)
    dropped_writer = Chem.SDFMolecularGraphWriter(dropped_stream)

    # A single molecule instance is reused as the read buffer.
    mol = Chem.BasicMolecule()

    # 'modified' is only initialised and reported in this function;
    # presumably processMolecule() updates it -- TODO confirm.
    stats = Stats()
    stats.read = 0
    stats.dropped = 0
    stats.modified = 0

    # Multi-conformer handling is switched off for the reader and both writers.
    Chem.setMultiConfImportParameter(in_reader, False)
    Chem.setMultiConfExportParameter(out_writer, False)
    Chem.setMultiConfExportParameter(dropped_writer, False)

    if start_index > 0:
        print('Skipping Molecules to Start Index ' + str(start_index), file=sys.stderr)
        in_reader.setRecordIndex(start_index)
        # stats.read holds the absolute record index, so it starts at the
        # requested offset; for start_index <= 0 it simply stays at 0.
        stats.read = start_index

    while in_reader.read(mol):
        cleaned_mol = processMolecule(mol, stats)

        if not cleaned_mol:
            stats.dropped += 1
            dropped_writer.write(mol)
            print('Dropped Molecule ' + str(stats.read) + ': ' + generateSMILES(mol) + ' ' + Chem.getName(mol), file=sys.stderr)
        else:
            out_writer.write(cleaned_mol)

        stats.read += 1
        num_processed = stats.read - start_index

        if stats.read % 10000 == 0:
            print('Processed ' + str(num_processed) + ' Molecules...', file=sys.stderr)

        # Stop once the requested number of molecules has been handled.
        if 0 < max_count <= num_processed:
            break

    summary_lines = (
        '',
        '-- Summary --',
        'Molecules processed: ' + str(stats.read - start_index),
        'Molecules dropped: ' + str(stats.dropped),
        'Molecules modified: ' + str(stats.modified),
    )

    for line in summary_lines:
        print(line, file=sys.stderr)