def test_create_from_array(self):
    from e3fp.fingerprint.fprint import Fingerprint, CountFingerprint
    from e3fp.fingerprint.db import FingerprintDatabase

    array = (np.random.uniform(0, 1, size=(10, 1024)) > 0.9).astype(
        np.uint16
    )
    fprints = [Fingerprint.from_vector(array[i, :]) for i in range(10)]
    fp_names = []
    for i, fp in enumerate(fprints):
        name = str(i)
        fp.name = name
        fp.level = 5
        fp_names.append(name)
    db1 = FingerprintDatabase(
        fp_type=CountFingerprint, level=5, name="Test"
    )
    db1.add_fingerprints(fprints)
    db2 = FingerprintDatabase.from_array(
        array, fp_names, level=5, name="Test"
    )
    self.assertEqual(db1.fp_type, db2.fp_type)
    np.testing.assert_array_equal(
        db1.array.todense().getA(), db2.array.todense().getA()
    )

def test_add_fingerprints(self):
    from e3fp.fingerprint.fprint import CountFingerprint
    from e3fp.fingerprint.db import FingerprintDatabase

    array = (np.random.uniform(0, 1, size=(10, 1024)) > 0.9).astype(
        np.double
    )
    fprints = [
        CountFingerprint.from_vector(array[i, :]) for i in range(10)
    ]
    for i, fp in enumerate(fprints):
        fp.name = str(i)
    db = FingerprintDatabase(fp_type=CountFingerprint)
    db.add_fingerprints(fprints)
    self.assertIs(db.fp_type, CountFingerprint)
    self.assertTrue(np.issubdtype(db.array.dtype, np.uint16))
    self.assertEqual(db.fp_num, 10)
    self.assertEqual(db.bits, 1024)
    self.assertEqual(len(db.fp_names_to_indices), 10)
    for k, v in db.fp_names_to_indices.items():
        k = int(k)
        self.assertEqual(k, v[0])
        np.testing.assert_almost_equal(
            array[k, :], db.array[k, :].todense().getA().flatten()
        )

def test_update_props(self):
    from e3fp.fingerprint.fprint import CountFingerprint
    from e3fp.fingerprint.db import FingerprintDatabase

    array = (np.random.uniform(0, 1, size=(10, 1024)) > 0.9).astype(
        np.double
    )
    fprints = [
        CountFingerprint.from_vector(array[i, :]) for i in range(10)
    ]
    for i, fp in enumerate(fprints):
        fp.name = str(i)
        fp.set_prop("index", i)
    db = FingerprintDatabase(fp_type=CountFingerprint)
    db.add_fingerprints(fprints)
    fprints2 = [
        CountFingerprint.from_vector(array[i, :]) for i in range(10)
    ]
    for i, fp in enumerate(fprints2):
        fp.name = str(i + len(fprints))
        fp.set_prop("index", i)
    db.add_fingerprints(fprints2)
    indices = db.get_prop("index")
    self.assertEqual(indices.shape[0], 20)
    self.assertListEqual(
        indices.tolist(), list(range(10)) + list(range(10))
    )

def test_fold_db(self):
    from e3fp.fingerprint.fprint import Fingerprint, CountFingerprint
    from e3fp.fingerprint.db import FingerprintDatabase

    fold_len = 32
    for fp_type in (Fingerprint, CountFingerprint):
        array = (np.random.uniform(0, 1, size=(20, 4096)) > 0.9).astype(
            np.double
        )
        fprints = [fp_type.from_vector(array[i, :]) for i in range(2)]
        folded_fprints = []
        for i, fp in enumerate(fprints):
            fp.name = str(i)
            folded_fprints.append(fp.fold(fold_len))
        db_fold1 = FingerprintDatabase(fp_type=fp_type)
        db_fold1.add_fingerprints(folded_fprints)
        db_unfold = FingerprintDatabase(fp_type=fp_type)
        db_unfold.add_fingerprints(fprints)
        db_fold2 = db_unfold.fold(fold_len)
        np.testing.assert_array_equal(
            db_fold2.array.todense().getA(),
            db_fold1.array.todense().getA(),
        )

def test_roundtrip2(self):
    """Ensure DB is the same after saving and loading."""
    from e3fp.fingerprint.fprint import Fingerprint
    from e3fp.fingerprint.db import FingerprintDatabase

    fprints = []
    for i in range(10):
        fprints.append(
            Fingerprint.from_indices(
                np.random.uniform(0, 2 ** 32, size=30),
                bits=2 ** 32,
                level=5,
            )
        )
        fprints[-1].name = "fp" + str(i)
        fprints[-1].set_prop("index", float(i))
    db = FingerprintDatabase(fp_type=Fingerprint, level=5)
    db.add_fingerprints(fprints)
    desc, db_file = tempfile.mkstemp(suffix=".fps.bz2")
    os.close(desc)
    db.save(db_file)
    db2 = db.load(db_file)
    os.unlink(db_file)
    self.assertEqual(db, db2)
    self.assertListEqual(db2.get_prop("index").tolist(), list(range(10)))

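# The round-trip exercised by test_roundtrip2 can also be used outside of a
# test harness. This is a minimal, illustrative sketch that reuses only calls
# shown in the tests above; the helper name and the output path
# "example_db.fps.bz2" are hypothetical.
def _example_db_roundtrip():
    import numpy as np
    from e3fp.fingerprint.fprint import Fingerprint
    from e3fp.fingerprint.db import FingerprintDatabase

    fprints = []
    for i in range(3):
        fp = Fingerprint.from_indices(
            np.random.uniform(0, 2 ** 32, size=30), bits=2 ** 32, level=5
        )
        fp.name = "fp" + str(i)
        fprints.append(fp)
    db = FingerprintDatabase(fp_type=Fingerprint, level=5)
    db.add_fingerprints(fprints)
    db.save("example_db.fps.bz2")        # write the compressed database
    db2 = db.load("example_db.fps.bz2")  # reload it, as the test does above
    assert db == db2
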
def molecules_to_array(molecules, mol_list, dense=False, processor=None):
    """Convert molecules to array or sparse matrix.

    Parameters
    ----------
    molecules : dict or string
        Molecules file or mol_list_dict.
    mol_list : list
        List of molecules, used to determine order of array.
    dense : bool, optional
        Return dense array.
    processor : InputProcessor, optional
        Object that processes fingerprints before building the database.

    Returns
    -------
    fp_array : ndarray or csr_matrix
        Row-based sparse matrix or ndarray containing fingerprints.
    mol_indices : dict
        Map from index in `mol_list` to list of row indices of fingerprints.
    """
    if isinstance(molecules, dict):
        mol_list_dict = molecules
    else:
        _, mol_list_dict, _ = molecules_to_lists_dicts(molecules)
    assert set(mol_list_dict.keys()) == set(mol_list)

    # Convert native tuples to Fingerprint objects (items() replaces the
    # Python 2-only iteritems()).
    fprint_dict = {
        k: [native_tuple_to_fprint(v) for v in vs]
        for k, vs in mol_list_dict.items()
    }
    del mol_list_dict

    try:
        fprint_dict = processor.process_fingerprints(fprint_dict)
    except AttributeError:
        pass

    mol_indices_dict = {}
    fprints_list = []
    max_ind = 0
    for k, mol_name in enumerate(mol_list):
        fprints = fprint_dict[mol_name]
        fp_num = len(fprints)
        row_inds = list(range(max_ind, max_ind + fp_num))
        mol_indices_dict[k] = row_inds
        max_ind += fp_num
        fprints_list += fprints

    db = DB(fp_type=fprints_list[0].__class__, level=fprints_list[0].level)
    db.add_fingerprints(fprints_list)
    all_fps = db.array
    if dense:
        all_fps = all_fps.toarray().astype(all_fps.dtype)
    return all_fps, mol_indices_dict

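# Hypothetical usage of molecules_to_array, assuming a native E3FP fingerprint
# file ("mols.fp.bz2") or an equivalent mol_list_dict is available; the file
# path and molecule names below are placeholders, not part of the original
# code.
#
#   fp_array, mol_indices = molecules_to_array(
#       "mols.fp.bz2",            # molecules file, parsed via molecules_to_lists_dicts
#       ["CHEMBL1", "CHEMBL2"],   # desired row order of molecules
#       dense=True,               # return an ndarray instead of a csr_matrix
#   )
#   # mol_indices[0] lists the rows of fp_array holding the conformer
#   # fingerprints of the first molecule in mol_list.
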
def gen_e3fp_features(smiles_list, rank, size, bits):
    config = read_params(str(bits) + '_params.cfg')
    c_params, f_params = params_to_dicts(config)
    fprint_list = []
    for name, smiles in enumerate(smiles_list):
        fprint_list.append(
            fprints_from_smiles(
                smiles,
                smiles,
                confgen_params=c_params,
                fprint_params=f_params,
            )[0]
        )
    db = FingerprintDatabase(fp_type=Fingerprint, level=5)
    db.add_fingerprints(fprint_list)
    return db

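# Illustrative call of gen_e3fp_features, assuming a parameter file named
# "1024_params.cfg" exists in the working directory (the function builds that
# name from `bits`); the SMILES strings and output path are placeholders.
#
#   smiles_list = ["CCO", "c1ccccc1"]
#   db = gen_e3fp_features(smiles_list, rank=0, size=1, bits=1024)
#   db.save("e3fp_features.fps.bz2")  # FingerprintDatabase.save, as used above
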
def test_db_equality(self):
    from e3fp.fingerprint.fprint import Fingerprint, CountFingerprint
    from e3fp.fingerprint.db import FingerprintDatabase

    db1 = FingerprintDatabase(name="TestDB")
    db2 = FingerprintDatabase(name="TestDB2")
    self.assertEqual(db1, db2)
    db2 = FingerprintDatabase(fp_type=CountFingerprint, name="TestDB2")
    self.assertNotEqual(db1, db2)
    db2 = FingerprintDatabase(level=5, name="TestDB2")
    self.assertNotEqual(db1, db2)
    db2 = FingerprintDatabase(name="TestDB2")
    array = (np.random.uniform(0, 1, size=(10, 1024)) > 0.9).astype(
        np.uint16
    )
    fprints = [Fingerprint.from_vector(array[i, :]) for i in range(10)]
    for i, fp in enumerate(fprints):
        name = str(i)
        fp.name = name
    db1.add_fingerprints(fprints)
    db2.add_fingerprints(fprints)
    self.assertEqual(db1, db2)
    db2.add_fingerprints([fprints[0]])
    self.assertNotEqual(db1, db2)

def run(sdf_files, bits=BITS, first=FIRST_DEF, level=LEVEL_DEF,
        radius_multiplier=RADIUS_MULTIPLIER_DEF, counts=COUNTS_DEF,
        stereo=STEREO_DEF, include_disconnected=INCLUDE_DISCONNECTED_DEF,
        rdkit_invariants=RDKIT_INVARIANTS_DEF,
        exclude_floating=EXCLUDE_FLOATING_DEF, params=None,
        out_dir_base=None, out_ext=OUT_EXT_DEF, db_file=None,
        overwrite=False, all_iters=False, log=None, num_proc=None,
        parallel_mode=None, verbose=False):
    """Generate E3FP fingerprints from SDF files."""
    setup_logging(log, verbose=verbose)

    if params is not None:
        params = read_params(params, fill_defaults=True)
        bits = get_value(params, "fingerprinting", "bits", int)
        first = get_value(params, "fingerprinting", "first", int)
        level = get_value(params, "fingerprinting", "level", int)
        radius_multiplier = get_value(
            params, "fingerprinting", "radius_multiplier", float
        )
        counts = get_value(params, "fingerprinting", "counts", bool)
        stereo = get_value(params, "fingerprinting", "stereo", bool)
        include_disconnected = get_value(
            params, "fingerprinting", "include_disconnected", bool
        )
        rdkit_invariants = get_value(
            params, "fingerprinting", "rdkit_invariants", bool
        )
        exclude_floating = get_value(
            params, "fingerprinting", "exclude_floating", bool
        )

    para = Parallelizer(num_proc=num_proc, parallel_mode=parallel_mode)

    if para.rank == 0:
        logging.info("Initializing E3FP generation.")
        logging.info("Getting SDF files")

        if len(sdf_files) == 1 and os.path.isdir(sdf_files[0]):
            from glob import glob
            sdf_files = glob("{:s}/*sdf*".format(sdf_files[0]))

        data_iterator = make_data_iterator(sdf_files)

        logging.info("SDF File Number: {:d}".format(len(sdf_files)))
        if out_dir_base is not None:
            logging.info("Out Directory Basename: {:s}".format(out_dir_base))
            logging.info("Out Extension: {:s}".format(out_ext))
        if db_file is not None:
            logging.info("Database File: {:s}".format(db_file))
        if db_file is None and out_dir_base is None:
            sys.exit("Either `db_file` or `out_dir_base` must be specified.")
        logging.info("Max First Conformers: {:d}".format(first))
        logging.info("Bits: {:d}".format(bits))
        logging.info("Level/Max Iterations: {:d}".format(level))
        logging.info(
            "Shell Radius Multiplier: {:.4g}".format(radius_multiplier)
        )
        logging.info("Stereo Mode: {!s}".format(stereo))
        if not include_disconnected:
            logging.info("Connected-only mode: on")
        if rdkit_invariants:
            logging.info("Invariant type: RDKit")
        else:
            logging.info("Invariant type: Daylight")
        logging.info("Parallel Mode: {!s}".format(para.parallel_mode))
        logging.info("Starting")
    else:
        data_iterator = iter([])

    fp_kwargs = {
        "first": first,
        "bits": bits,
        "level": level,
        "radius_multiplier": radius_multiplier,
        "stereo": stereo,
        "counts": counts,
        "include_disconnected": include_disconnected,
        "rdkit_invariants": rdkit_invariants,
        "exclude_floating": exclude_floating,
        "out_dir_base": out_dir_base,
        "out_ext": out_ext,
        "all_iters": all_iters,
        "overwrite": overwrite,
        "save": False,
    }
    if out_dir_base is not None:
        fp_kwargs["save"] = True

    run_kwargs = {"kwargs": fp_kwargs}

    results_iter = para.run_gen(
        fprints_dict_from_sdf, data_iterator, **run_kwargs
    )

    if db_file is not None:
        fprints = []
        for result, data in results_iter:
            try:
                fprints.extend(result.get(level, result[max(result.keys())]))
            except (AttributeError, ValueError):
                # fingerprinting failed; assume the error was logged in the method
                continue
        if len(fprints) > 0:
            db = FingerprintDatabase(fp_type=type(fprints[0]), level=level)
            db.add_fingerprints(fprints)
            db.save(db_file)
            logging.info("Saved fingerprints to {:s}".format(db_file))
    else:
        list(results_iter)

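# Example invocation of run(), based only on the signature above; the SDF
# directory and the output database path are hypothetical.
#
#   run(
#       ["conformers/"],                       # a single directory is globbed for *sdf*
#       bits=1024,
#       level=5,
#       db_file="e3fp_fingerprints.fps.bz2",   # collect all fingerprints into one database
#       num_proc=4,
#   )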