def process():
    """Copy an SD-file to a new SD-file, leaving out excluded molecules.

    Command line (taken from ``sys.argv``)::

        <script> [input.sdf] [exclude-molecules.sdf] [output.sdf]

    Each molecule of the exclude file is passed through ``setupMolecule``
    and hashed with ``Chem.calcHashCode``; the hash codes are collected in
    a set.  Each input molecule is treated the same way.  If its hash code
    is in the set, the molecule is reported on stderr and not written;
    otherwise it is written to the output file.  A progress message is
    printed every 10000 input molecules and a summary at the end, both on
    stderr.

    Exits with status 2 (after printing a usage line to stderr) when fewer
    than three command line arguments are supplied.
    """
    if len(sys.argv) < 4:
        print(f'Usage: {sys.argv[0]} [input.sdf] [exclude-molecules.sdf] [output.sdf]',
              file=sys.stderr)
        sys.exit(2)

    # Streams are opened in argument order: input, exclusion list, output.
    in_stream = Base.FileIOStream(sys.argv[1], 'r')
    excl_stream = Base.FileIOStream(sys.argv[2], 'r')
    out_stream = Base.FileIOStream(sys.argv[3], 'w')

    in_reader = Chem.SDFMoleculeReader(in_stream)
    excl_reader = Chem.SDFMoleculeReader(excl_stream)
    out_writer = Chem.SDFMolecularGraphWriter(out_stream)

    # A single molecule object is reused as the read target for both files.
    molecule = Chem.BasicMolecule()

    # Multi-conformer handling is switched off for both readers and the
    # writer (presumably so every SD record counts as its own molecule --
    # confirm against the CDPL documentation).
    Chem.setMultiConfImportParameter(in_reader, False)
    Chem.setMultiConfImportParameter(excl_reader, False)
    Chem.setMultiConfExportParameter(out_writer, False)

    stats = Stats()
    stats.read = 0
    stats.dropped = 0

    # Pass 1: collect the hash code of every molecule in the exclude file.
    excluded_hashes = set()

    while excl_reader.read(molecule):
        setupMolecule(molecule)
        excluded_hashes.add(Chem.calcHashCode(molecule))

    # Pass 2: copy every input molecule whose hash code was not collected.
    while in_reader.read(molecule):
        setupMolecule(molecule)

        if Chem.calcHashCode(molecule) not in excluded_hashes:
            out_writer.write(molecule)
        else:
            stats.dropped += 1
            # The index shown is the number of molecules read before this
            # one, i.e. it is zero-based.
            smiles = Chem.generateSMILES(molecule)
            name = Chem.getName(molecule)
            print(f'Dropped Molecule {stats.read}: {smiles} {name}', file=sys.stderr)

        stats.read += 1

        if stats.read % 10000 == 0:
            print(f'Processed {stats.read} Molecules...', file=sys.stderr)

    summary_lines = (
        '',
        '-- Summary --',
        f'Molecules processed: {stats.read}',
        f'Molecules dropped: {stats.dropped}',
    )

    for line in summary_lines:
        print(line, file=sys.stderr)
def calculate_molecule_hashcode(mol, stereo=True):
    """Return the ``Chem.calcHashCode`` value of *mol* after preparing it.

    The molecule is modified in place: ``Chem.makeHydrogenDeplete`` and
    ``Chem.calcImplicitHydrogenCounts`` are always applied first.

    Args:
        mol: The molecule to hash (passed straight to the ``Chem`` functions).
        stereo: If true, atom/bond stereo descriptors, CIP priorities and
            atom/bond CIP configurations are calculated and the hash code is
            computed with ``Chem.calcHashCode``'s default flags.  If false,
            none of these are calculated and the hash code is restricted to
            the atom flags TYPE, H_COUNT, FORMAL_CHARGE and AROMATICITY and
            the bond flags ORDER, TOPOLOGY and AROMATICITY.

    Returns:
        Whatever ``Chem.calcHashCode`` returns for the prepared molecule.
    """
    Chem.makeHydrogenDeplete(mol)
    Chem.calcImplicitHydrogenCounts(mol, True)

    if not stereo:
        # Explicit flag sets for the stereo-agnostic hash; no stereo
        # property appears in either mask.
        atom_mask = (
            Chem.AtomPropertyFlag.TYPE
            | Chem.AtomPropertyFlag.H_COUNT
            | Chem.AtomPropertyFlag.FORMAL_CHARGE
            | Chem.AtomPropertyFlag.AROMATICITY
        )
        bond_mask = (
            Chem.BondPropertyFlag.ORDER
            | Chem.BondPropertyFlag.TOPOLOGY
            | Chem.BondPropertyFlag.AROMATICITY
        )
        return Chem.calcHashCode(mol, atom_flags=atom_mask, bond_flags=bond_mask)

    # Stereo-aware hash: run the perception steps in this fixed order, each
    # called with True as its second argument (presumably "overwrite
    # existing values" -- confirm against the CDPL documentation).
    stereo_steps = (
        Chem.calcAtomStereoDescriptors,
        Chem.calcBondStereoDescriptors,
        Chem.calcCIPPriorities,
        Chem.calcAtomCIPConfigurations,
        Chem.calcBondCIPConfigurations,
    )

    for step in stereo_steps:
        step(mol, True)

    return Chem.calcHashCode(mol)