Esempio n. 1
0
    def test_create_from_array(self):
        """A DB built with `from_array` equals one filled from fingerprints."""
        from e3fp.fingerprint.fprint import Fingerprint, CountFingerprint
        from e3fp.fingerprint.db import FingerprintDatabase

        num_fps, num_bits, level = 10, 1024, 5
        random_bits = np.random.uniform(0, 1, size=(num_fps, num_bits)) > 0.9
        array = random_bits.astype(np.uint16)

        # One named fingerprint per row of the random array.
        fp_names = [str(row) for row in range(num_fps)]
        fprints = []
        for row, name in enumerate(fp_names):
            fprint = Fingerprint.from_vector(array[row, :])
            fprint.name = name
            fprint.level = level
            fprints.append(fprint)

        db_from_fprints = FingerprintDatabase(
            fp_type=CountFingerprint, level=level, name="Test"
        )
        db_from_fprints.add_fingerprints(fprints)
        db_from_array = FingerprintDatabase.from_array(
            array, fp_names, level=level, name="Test"
        )

        # Both construction routes must agree on type and stored contents.
        self.assertEqual(db_from_fprints.fp_type, db_from_array.fp_type)
        dense_from_fprints = db_from_fprints.array.todense().getA()
        dense_from_array = db_from_array.array.todense().getA()
        np.testing.assert_array_equal(dense_from_fprints, dense_from_array)
Esempio n. 2
0
    def test_add_fingerprints(self):
        """Adding count fingerprints sets type, dtype, size and name map."""
        from e3fp.fingerprint.fprint import CountFingerprint
        from e3fp.fingerprint.db import FingerprintDatabase

        num_fps, num_bits = 10, 1024
        random_bits = np.random.uniform(0, 1, size=(num_fps, num_bits)) > 0.9
        array = random_bits.astype(np.double)

        # Each fingerprint is named after the row it was built from.
        fprints = []
        for row in range(num_fps):
            fprint = CountFingerprint.from_vector(array[row, :])
            fprint.name = str(row)
            fprints.append(fprint)

        db = FingerprintDatabase(fp_type=CountFingerprint)
        db.add_fingerprints(fprints)

        self.assertIs(db.fp_type, CountFingerprint)
        self.assertTrue(np.issubdtype(db.array.dtype, np.uint16))
        self.assertEqual(db.fp_num, num_fps)
        self.assertEqual(db.bits, num_bits)
        self.assertEqual(len(db.fp_names_to_indices), num_fps)

        # The name doubles as the expected row index in the database.
        for name, row_indices in db.fp_names_to_indices.items():
            row = int(name)
            self.assertEqual(row, row_indices[0])
            stored_row = db.array[row, :].todense().getA().flatten()
            np.testing.assert_almost_equal(array[row, :], stored_row)
Esempio n. 3
0
    def test_update_props(self):
        """Props of fingerprints added in two batches are kept in order."""
        from e3fp.fingerprint.fprint import CountFingerprint
        from e3fp.fingerprint.db import FingerprintDatabase

        batch_size = 10
        random_bits = np.random.uniform(0, 1, size=(batch_size, 1024)) > 0.9
        array = random_bits.astype(np.double)

        db = FingerprintDatabase(fp_type=CountFingerprint)
        # Two batches built from the same rows; only the names differ, so
        # the "index" property repeats 0..9 in the second batch.
        for name_offset in (0, batch_size):
            batch = []
            for row in range(batch_size):
                fprint = CountFingerprint.from_vector(array[row, :])
                fprint.name = str(row + name_offset)
                fprint.set_prop("index", row)
                batch.append(fprint)
            db.add_fingerprints(batch)

        indices = db.get_prop("index")
        self.assertEqual(indices.shape[0], 2 * batch_size)
        self.assertListEqual(indices.tolist(), list(range(batch_size)) * 2)
Esempio n. 4
0
    def test_fold_db(self):
        """Folding a DB matches a DB built from individually folded fprints."""
        from e3fp.fingerprint.fprint import Fingerprint, CountFingerprint
        from e3fp.fingerprint.db import FingerprintDatabase

        fold_len = 32
        for fp_type in (Fingerprint, CountFingerprint):
            random_bits = np.random.uniform(0, 1, size=(20, 4096)) > 0.9
            array = random_bits.astype(np.double)

            # Only the first two rows are turned into fingerprints.
            fprints = []
            for row in range(2):
                fprint = fp_type.from_vector(array[row, :])
                fprint.name = str(row)
                fprints.append(fprint)
            folded_fprints = [fprint.fold(fold_len) for fprint in fprints]

            # Route 1: fold each fingerprint, then build the database.
            db_fold1 = FingerprintDatabase(fp_type=fp_type)
            db_fold1.add_fingerprints(folded_fprints)

            # Route 2: build the unfolded database, then fold it as a whole.
            db_unfold = FingerprintDatabase(fp_type=fp_type)
            db_unfold.add_fingerprints(fprints)
            db_fold2 = db_unfold.fold(fold_len)

            folded_whole = db_fold2.array.todense().getA()
            folded_each = db_fold1.array.todense().getA()
            np.testing.assert_array_equal(folded_whole, folded_each)
Esempio n. 5
0
    def test_roundtrip2(self):
        """Ensure DB is the same after saving and loading.

        The database is written to a temporary ``.fps.bz2`` file and read
        back; both the database itself and its float-valued "index"
        property must survive the round trip.
        """
        from e3fp.fingerprint.fprint import Fingerprint
        from e3fp.fingerprint.db import FingerprintDatabase

        fprints = []
        for i in range(10):
            fprint = Fingerprint.from_indices(
                np.random.uniform(0, 2 ** 32, size=30),
                bits=2 ** 32,
                level=5,
            )
            fprint.name = "fp" + str(i)
            fprint.set_prop("index", float(i))
            fprints.append(fprint)
        db = FingerprintDatabase(fp_type=Fingerprint, level=5)
        db.add_fingerprints(fprints)

        # mkstemp returns an open descriptor; close it so only the path is
        # used from here on.
        desc, db_file = tempfile.mkstemp(suffix=".fps.bz2")
        os.close(desc)
        try:
            db.save(db_file)
            db2 = db.load(db_file)
        finally:
            # Remove the temp file even when saving or loading fails, so a
            # failing test does not leave files behind.
            os.unlink(db_file)

        self.assertEqual(db, db2)
        self.assertListEqual(db2.get_prop("index").tolist(), list(range(10)))
Esempio n. 6
0
def molecules_to_array(molecules, mol_list, dense=False, processor=None):
    """Convert molecules to array or sparse matrix.

    Parameters
    ----------
    molecules : dict or string
        Molecules file or mol_list_dict. Anything that is not a dict is
        passed to `molecules_to_lists_dicts` to obtain the dict.
    mol_list : list
        List of molecules, used to determine order of array. Must contain
        exactly the keys of the molecules dict.
    dense : bool, optional
        Return dense array.
    processor : InputProcessor, optional
        Object that processes fingerprints before building the database.
        Its `process_fingerprints` method, when present, is called with the
        dict of fingerprints and must return the dict to use instead.
        `None`, or an object without that method, leaves the fingerprints
        untouched.

    Returns
    -------
    fp_array : ndarray or csr_matrix
        Row-based sparse matrix or ndarray containing fingerprints.
    mol_indices : dict
        Map from index in `mol_list` to list of row indices of fingerprints.

    Raises
    ------
    AssertionError
        If the names in `mol_list` do not match the molecules dict keys.
    IndexError
        If there are no fingerprints at all, since the database type and
        level are taken from the first fingerprint.
    """
    if isinstance(molecules, dict):
        mol_list_dict = molecules
    else:
        _, mol_list_dict, _ = molecules_to_lists_dicts(molecules)

    assert set(mol_list_dict.keys()) == set(mol_list), (
        "`mol_list` must contain exactly the molecule names in `molecules`"
    )

    # `.items()` exists on both Python 2 and 3 (`.iteritems()` is 2-only).
    fprint_dict = {
        mol_name: [native_tuple_to_fprint(v) for v in native_tuples]
        for mol_name, native_tuples in mol_list_dict.items()
    }
    del mol_list_dict

    # Look the hook up explicitly instead of catching AttributeError around
    # the call, so an AttributeError raised *inside* the processor is not
    # silently swallowed.
    process_fingerprints = getattr(processor, "process_fingerprints", None)
    if process_fingerprints is not None:
        fprint_dict = process_fingerprints(fprint_dict)

    mol_indices_dict = {}
    fprints_list = []
    next_row = 0
    for mol_index, mol_name in enumerate(mol_list):
        fprints = fprint_dict[mol_name]
        fp_num = len(fprints)
        # Materialise the rows so callers get a real list on Python 3 too.
        mol_indices_dict[mol_index] = list(range(next_row, next_row + fp_num))
        next_row += fp_num
        fprints_list.extend(fprints)

    # The first fingerprint determines the type and level of the database.
    first_fprint = fprints_list[0]
    db = DB(fp_type=first_fprint.__class__, level=first_fprint.level)
    db.add_fingerprints(fprints_list)

    all_fps = db.array
    if dense:
        all_fps = all_fps.toarray().astype(all_fps.dtype)

    return all_fps, mol_indices_dict
Esempio n. 7
0
def gen_e3fp_features(smiles_list, rank, size, bits):
    """Build a fingerprint database holding one fingerprint per SMILES.

    Conformer-generation and fingerprinting parameters are read from the
    file ``<bits>_params.cfg``. For every SMILES string only the first
    fingerprint returned by `fprints_from_smiles` is kept.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings; each is passed as both the first and the second
        positional argument of `fprints_from_smiles` (presumably the SMILES
        and the molecule name -- confirm against that function).
    rank, size
        Unused by this function.
    bits : int or str
        Prefix of the parameter file name.

    Returns
    -------
    FingerprintDatabase
        Database of `Fingerprint` objects created with level 5.
    """
    params_file = str(bits) + '_params.cfg'
    confgen_params, fprint_params = params_to_dicts(read_params(params_file))

    first_fprints = [
        fprints_from_smiles(
            smiles,
            smiles,
            confgen_params=confgen_params,
            fprint_params=fprint_params
        )[0]
        for smiles in smiles_list
    ]

    database = FingerprintDatabase(fp_type=Fingerprint, level=5)
    database.add_fingerprints(first_fprints)
    return database
Esempio n. 8
0
 def test_db_equality(self):
     """DB equality ignores the name but not type, level or contents."""
     from e3fp.fingerprint.fprint import Fingerprint, CountFingerprint
     from e3fp.fingerprint.db import FingerprintDatabase

     reference = FingerprintDatabase(name="TestDB")

     # Empty databases: a different name alone does not break equality,
     # while a different fingerprint type or level does.
     self.assertEqual(reference, FingerprintDatabase(name="TestDB2"))
     self.assertNotEqual(
         reference,
         FingerprintDatabase(fp_type=CountFingerprint, name="TestDB2"))
     self.assertNotEqual(
         reference, FingerprintDatabase(level=5, name="TestDB2"))

     other = FingerprintDatabase(name="TestDB2")
     random_bits = np.random.uniform(0, 1, size=(10, 1024)) > .9
     array = random_bits.astype(np.uint16)
     fprints = []
     for row in range(10):
         fprint = Fingerprint.from_vector(array[row, :])
         fprint.name = str(row)
         fprints.append(fprint)

     # Same fingerprints in both databases: still equal.
     reference.add_fingerprints(fprints)
     other.add_fingerprints(fprints)
     self.assertEqual(reference, other)

     # One extra fingerprint in only one database: no longer equal.
     other.add_fingerprints(fprints[:1])
     self.assertNotEqual(reference, other)
Esempio n. 9
0
def run(sdf_files,
        bits=BITS,
        first=FIRST_DEF,
        level=LEVEL_DEF,
        radius_multiplier=RADIUS_MULTIPLIER_DEF,
        counts=COUNTS_DEF,
        stereo=STEREO_DEF,
        include_disconnected=INCLUDE_DISCONNECTED_DEF,
        rdkit_invariants=RDKIT_INVARIANTS_DEF,
        exclude_floating=EXCLUDE_FLOATING_DEF,
        params=None,
        out_dir_base=None,
        out_ext=OUT_EXT_DEF,
        db_file=None,
        overwrite=False,
        all_iters=False,
        log=None,
        num_proc=None,
        parallel_mode=None,
        verbose=False):
    """Generate E3FP fingerprints from SDF files.

    Each SDF file is handed to `fprints_dict_from_sdf` through a
    `Parallelizer`. Results are saved per molecule by that function when
    `out_dir_base` is given, and/or collected into a single
    `FingerprintDatabase` written to `db_file`.

    Parameters
    ----------
    sdf_files : list of str
        SDF files to fingerprint. If the list holds a single directory, all
        entries in it matching ``*sdf*`` are used instead.
    bits, first, level : int, optional
    radius_multiplier : float, optional
    counts, stereo, include_disconnected, rdkit_invariants, \
exclude_floating : bool, optional
        Fingerprinting options forwarded to `fprints_dict_from_sdf`. They
        are replaced by the values from `params` when it is given.
    params : optional
        Passed to `read_params` (with defaults filled in). When not None,
        the values of its "fingerprinting" section override the
        fingerprinting options above.
    out_dir_base : str, optional
        Forwarded to `fprints_dict_from_sdf`; when not None the "save"
        option is switched on as well.
    out_ext : str, optional
        Forwarded to `fprints_dict_from_sdf`.
    db_file : str, optional
        If given, the fingerprints of all successful results are added to a
        database which is saved to this path.
    overwrite, all_iters : bool, optional
        Forwarded to `fprints_dict_from_sdf`.
    log, verbose
        Passed to `setup_logging`.
    num_proc, parallel_mode
        Passed to `Parallelizer`.

    Raises
    ------
    SystemExit
        If neither `db_file` nor `out_dir_base` is specified.
    """
    setup_logging(log, verbose=verbose)

    # Validate up front and on every rank; previously this check ran only
    # on rank 0, after logging had already started.
    if db_file is None and out_dir_base is None:
        sys.exit('Either `db_file` or `out_dir_base` must be specified.')

    if params is not None:
        params = read_params(params, fill_defaults=True)
        bits = get_value(params, "fingerprinting", "bits", int)
        first = get_value(params, "fingerprinting", "first", int)
        level = get_value(params, "fingerprinting", "level", int)
        radius_multiplier = get_value(params, "fingerprinting",
                                      "radius_multiplier", float)
        counts = get_value(params, "fingerprinting", "counts", bool)
        stereo = get_value(params, "fingerprinting", "stereo", bool)
        include_disconnected = get_value(params, "fingerprinting",
                                         "include_disconnected", bool)
        rdkit_invariants = get_value(params, "fingerprinting",
                                     "rdkit_invariants", bool)
        exclude_floating = get_value(params, "fingerprinting",
                                     "exclude_floating", bool)

    para = Parallelizer(num_proc=num_proc, parallel_mode=parallel_mode)

    if para.rank == 0:
        # Only rank 0 resolves the inputs and reports the configuration.
        logging.info("Initializing E3FP generation.")
        logging.info("Getting SDF files")

        if len(sdf_files) == 1 and os.path.isdir(sdf_files[0]):
            from glob import glob
            sdf_files = glob("{:s}/*sdf*".format(sdf_files[0]))

        data_iterator = make_data_iterator(sdf_files)

        logging.info("SDF File Number: {:d}".format(len(sdf_files)))
        if out_dir_base is not None:
            logging.info("Out Directory Basename: {:s}".format(out_dir_base))
            logging.info("Out Extension: {:s}".format(out_ext))
        if db_file is not None:
            logging.info("Database File: {:s}".format(db_file))
        logging.info("Max First Conformers: {:d}".format(first))
        logging.info("Bits: {:d}".format(bits))
        logging.info("Level/Max Iterations: {:d}".format(level))
        logging.info(
            "Shell Radius Multiplier: {:.4g}".format(radius_multiplier))
        logging.info("Stereo Mode: {!s}".format(stereo))
        # Connected-only mode is what you get when disconnected atoms are
        # NOT included; the condition used to be inverted.
        if not include_disconnected:
            logging.info("Connected-only mode: on")
        if rdkit_invariants:
            logging.info("Invariant type: RDKit")
        else:
            logging.info("Invariant type: Daylight")
        logging.info("Parallel Mode: {!s}".format(para.parallel_mode))
        logging.info("Starting")
    else:
        # Other ranks contribute no input items of their own.
        data_iterator = iter([])

    fp_kwargs = {
        "first": first,
        "bits": bits,
        "level": level,
        "radius_multiplier": radius_multiplier,
        "stereo": stereo,
        "counts": counts,
        "include_disconnected": include_disconnected,
        "rdkit_invariants": rdkit_invariants,
        "exclude_floating": exclude_floating,
        "out_dir_base": out_dir_base,
        "out_ext": out_ext,
        "all_iters": all_iters,
        "overwrite": overwrite,
        # Per-molecule files are only written when an output dir is given.
        "save": out_dir_base is not None
    }

    run_kwargs = {"kwargs": fp_kwargs}

    results_iter = para.run_gen(fprints_dict_from_sdf, data_iterator,
                                **run_kwargs)

    if db_file is not None:
        fprints = []
        for result, _ in results_iter:
            try:
                # Prefer the fingerprints at the requested level; otherwise
                # fall back to those stored under the highest key.
                fprints.extend(result.get(level, result[max(result.keys())]))
            except (AttributeError, ValueError):
                # fprinting failed, assume logged in method
                continue
        if fprints:
            db = FingerprintDatabase(fp_type=type(fprints[0]), level=level)
            db.add_fingerprints(fprints)
            db.save(db_file)
            logging.info("Saved fingerprints to {:s}".format(db_file))
        else:
            logging.warning(
                "No fingerprints were generated; {:s} was not written".format(
                    db_file))
    else:
        # Consume the results so every item is processed, without keeping
        # them in memory.
        for _ in results_iter:
            pass