def _unfolded_ecfps_nodupes(self):
    """Return the "no dupes" unfolded ECFPs, caching them in memory and on disk.

    The result is memoized in ``self._ecfps_no_dupes``. On a cache miss the
    fingerprints are loaded from
    ``<manysources_dataset_root(name)>/03-ecfps-nodupes/<name>.ecfps.h5``.
    If that file does not exist yet, it is first derived from
    ``self._unfolded_ecfps()`` by passing its CSR matrix through
    ``zero_dupes(..., by_rows=False)`` and saved, together with a
    ``config.txt`` note written next to it.

    This cache is kept separate from ``self._ecfps`` (the one used by
    ``_unfolded_ecfps``) so that asking for the no-dupes variant never
    replaces the full fingerprints held in memory.
    """
    if self._ecfps_no_dupes is None:
        ecfp_nodupes_file = op.join(manysources_dataset_root(self.name),
                                    '03-ecfps-nodupes',
                                    '%s.ecfps.h5' % self.name)
        if not op.exists(ecfp_nodupes_file):
            # Build the on-disk cache from the full unfolded fingerprints.
            ecfps = self._unfolded_ecfps()
            ufp = UnfoldedFingerprints(ecfps.molids,
                                       ecfps.i2s,
                                       zero_dupes(ecfps.csr, by_rows=False),  # Hack 1, this should be Configurable
                                       failed_molids=ecfps.failed_molids)
            ensure_dir(op.dirname(ecfp_nodupes_file))
            with open(op.join(op.dirname(ecfp_nodupes_file), 'config.txt'), 'w') as writer:
                writer.write('Same as 02-ecfps, but removed columns that have the same value accross all rows.')
            ufp.save(ecfp_nodupes_file)
        # Store in the dedicated attribute (the one the guard above checks),
        # not in self._ecfps.
        self._ecfps_no_dupes = UnfoldedFingerprints.load(ecfp_nodupes_file)
    return self._ecfps_no_dupes
def _unfolded_ecfps(self):
    """Return the unfolded ECFPs for this dataset, caching them in memory and on disk.

    The result is memoized in ``self._ecfps``. On a cache miss the
    fingerprints are loaded from
    ``<manysources_dataset_root(name)>/02-ecfps/<name>.ecfps.h5``. If that
    file does not exist yet, every ``(molid, mol)`` pair yielded by
    ``self.mols()`` is fed to an ``RDKMorganFingerprinter``, the
    fingerprinter's ``what().id()`` string is written to ``config.json``
    in the same directory, and the fingerprints are saved to the file.
    """
    if self._ecfps is None:
        ecfp_file = op.join(manysources_dataset_root(self.name),
                            '02-ecfps',
                            '%s.ecfps.h5' % self.name)
        if not op.exists(ecfp_file):
            fingerprinter = RDKMorganFingerprinter()
            for molid, mol in self.mols():
                # Progress output; the call form is valid on Python 2 and 3
                # and prints the same thing as the old print statement.
                print(molid)
                fingerprinter.add_mol(molid, mol)
            ensure_dir(op.dirname(ecfp_file))
            with open(op.join(op.dirname(ecfp_file), 'config.json'), 'w') as writer:
                writer.write(fingerprinter.what().id())
            fingerprinter.fingerprints().save(ecfp_file)
        self._ecfps = UnfoldedFingerprints.load(ecfp_file)
    return self._ecfps