Code Example #1
File: test_fingerprints.py  Project: oddt/oddt
import numpy as np
import pytest
from numpy.testing import assert_array_equal
from scipy.sparse import vstack as sparse_vstack

# Sparse-fingerprint helpers under test; in ODDT they are assumed to live in
# oddt.fingerprints.
from oddt.fingerprints import (sparse_to_dense, dense_to_sparse,
                               sparse_to_csr_matrix, csr_matrix_to_sparse)


def test_sparse_densify():
    """FP densify"""
    sparse_fp = [0, 33, 49, 53, 107, 156, 161, 203, 215, 230, 251, 269, 299,
                 323, 331, 376, 389, 410, 427, 430, 450, 484, 538, 592, 593,
                 636, 646, 658, 698, 699, 702, 741, 753, 807, 850, 861, 882,
                 915, 915, 915, 969, 969, 1023]

    # count vectors
    dense = sparse_to_dense(sparse_fp, size=1024, count_bits=True)
    csr = sparse_to_csr_matrix(sparse_fp, size=1024, count_bits=True)
    assert_array_equal(dense.reshape(1, -1), csr.toarray())
    resparsed = dense_to_sparse(dense)
    resparsed_csr = csr_matrix_to_sparse(csr)
    assert_array_equal(sparse_fp, resparsed)
    assert_array_equal(sparse_fp, resparsed_csr)

    # bool vectors
    dense = sparse_to_dense(sparse_fp, size=1024, count_bits=False)
    csr = sparse_to_csr_matrix(sparse_fp, size=1024, count_bits=False)
    assert_array_equal(dense.reshape(1, -1), csr.toarray())
    resparsed = dense_to_sparse(dense)
    resparsed_csr = csr_matrix_to_sparse(csr)
    assert_array_equal(np.unique(sparse_fp), resparsed)
    assert_array_equal(np.unique(sparse_fp), resparsed_csr)

    # test stacking
    np.random.seed(0)
    sparse_fps = np.random.randint(0, 1024, size=(20, 100))
    # np.vstack needs a sequence; a bare generator raises TypeError in modern NumPy
    dense = np.vstack([sparse_to_dense(fp, size=1024) for fp in sparse_fps])
    csr = sparse_vstack(sparse_to_csr_matrix(fp, size=1024) for fp in sparse_fps)
    assert_array_equal(dense, csr.toarray())

    # test exceptions
    with pytest.raises(ValueError):
        csr_matrix_to_sparse(np.array([1, 2, 3]))
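For reference, here is a minimal sketch of the round-trip these assertions exercise. It assumes the same helpers from oddt.fingerprints as imported above; the fingerprint values are made up for illustration.

import numpy as np
from oddt.fingerprints import sparse_to_dense, dense_to_sparse

fp = [3, 7, 7, 42]                                 # hypothetical sparse FP: indices of set bits
dense = sparse_to_dense(fp, size=64, count_bits=True)
assert dense[7] == 2                               # with count_bits=True, duplicates are counted
assert np.array_equal(dense_to_sparse(dense), fp)  # round-trip restores the sorted index list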
Code Example #2
File: test_fingerprints.py  Project: zchwang/oddt
import numpy as np
import pytest
from numpy.testing import assert_array_equal
from scipy.sparse import vstack as sparse_vstack

# Same helpers as in Example #1, assumed to come from oddt.fingerprints.
from oddt.fingerprints import (sparse_to_dense, dense_to_sparse,
                               sparse_to_csr_matrix, csr_matrix_to_sparse)


def test_sparse_densify():
    """FP densify"""
    sparse_fp = [
        0, 33, 49, 53, 107, 156, 161, 203, 215, 230, 251, 269, 299, 323, 331,
        376, 389, 410, 427, 430, 450, 484, 538, 592, 593, 636, 646, 658, 698,
        699, 702, 741, 753, 807, 850, 861, 882, 915, 915, 915, 969, 969, 1023
    ]

    # count vectors
    dense = sparse_to_dense(sparse_fp, size=1024, count_bits=True)
    csr = sparse_to_csr_matrix(sparse_fp, size=1024, count_bits=True)
    assert_array_equal(dense.reshape(1, -1), csr.toarray())
    resparsed = dense_to_sparse(dense)
    resparsed_csr = csr_matrix_to_sparse(csr)
    assert_array_equal(sparse_fp, resparsed)
    assert_array_equal(sparse_fp, resparsed_csr)

    # bool vectors
    dense = sparse_to_dense(sparse_fp, size=1024, count_bits=False)
    csr = sparse_to_csr_matrix(sparse_fp, size=1024, count_bits=False)
    assert_array_equal(dense.reshape(1, -1), csr.toarray())
    resparsed = dense_to_sparse(dense)
    resparsed_csr = csr_matrix_to_sparse(csr)
    assert_array_equal(np.unique(sparse_fp), resparsed)
    assert_array_equal(np.unique(sparse_fp), resparsed_csr)

    # test stacking
    np.random.seed(0)
    sparse_fps = np.random.randint(0, 1024, size=(20, 100))
    dense = np.vstack([sparse_to_dense(fp, size=1024) for fp in sparse_fps])
    csr = sparse_vstack(
        sparse_to_csr_matrix(fp, size=1024) for fp in sparse_fps)
    assert_array_equal(dense, csr.toarray())

    # test exceptions
    with pytest.raises(ValueError):
        csr_matrix_to_sparse(np.array([1, 2, 3]))
Code Example #3
File: __init__.py  Project: xianqiangsun/oddt
    def _gen_pdbbind_desc(self,
                          pdbbind_dir,
                          pdbbind_versions=(2007, 2012, 2013, 2014, 2015, 2016),
                          desc_path=None,
                          include_general_set=False,
                          use_proteins=False,
                          **kwargs):
        pdbbind_versions = sorted(pdbbind_versions)
        opt = kwargs.get('opt', {})

        # generate metadata
        df = None
        for pdbbind_version in pdbbind_versions:
            p = pdbbind('%s/v%i/' % (pdbbind_dir, pdbbind_version),
                        version=pdbbind_version,
                        opt=opt)
            # iterate over every set shipped with this PDBbind release
            for set_name in p.pdbbind_sets:
                if set_name == 'general_PL':
                    dataset_key = '%i_general' % pdbbind_version
                else:
                    dataset_key = '%i_%s' % (pdbbind_version, set_name)

                tmp_df = pd.DataFrame({
                    'pdbid': list(p.sets[set_name].keys()),
                    dataset_key: list(p.sets[set_name].values())
                })
                if df is not None:
                    df = pd.merge(tmp_df, df, how='outer', on='pdbid')
                else:
                    df = tmp_df

        df.sort_values('pdbid', inplace=True)
        tmp_act = df['%i_general' % pdbbind_versions[-1]].values
        df = df.set_index('pdbid').notnull()
        df['act'] = tmp_act
        # keep entries with an activity value that belong to a refined or core
        # set (or keep everything when include_general_set is True)
        df = df[df['act'].notnull() &
                (df.filter(regex='.*_(refined|core)').any(axis=1) |
                 include_general_set)]

        # build descriptors
        pdbbind_db = pdbbind('%s/v%i/' % (pdbbind_dir, pdbbind_versions[-1]),
                             version=pdbbind_versions[-1])
        if not desc_path:
            desc_path = path_join(dirname(__file__), 'descs.csv')

        if self.n_jobs is None:
            n_jobs = -1
        else:
            n_jobs = self.n_jobs

        blacklist = []
        if use_proteins:
            # list of protein files known to segfault OpenBabel 2.4.1
            blacklist = pdbbind_db.protein_blacklist[oddt.toolkit.backend]

        # keep PDBIDs that are not blacklisted and have a usable protein/pocket
        desc_idx = [pid for pid in df.index.values
                    if (pid not in blacklist and
                        getattr(pdbbind_db[pid], 'protein'
                                if use_proteins
                                else 'pocket') is not None)]

        result = Parallel(n_jobs=n_jobs, verbose=1)(
            delayed(method_caller)(
                self.descriptor_generator,
                'build',
                [pdbbind_db[pid].ligand],
                protein=getattr(pdbbind_db[pid], 'protein' if use_proteins
                                else 'pocket'))
            for pid in desc_idx)

        # sparse descriptors may differ in shape; dense ones are stacked into one np.array
        sparse = (hasattr(self.descriptor_generator, 'sparse') and
                  self.descriptor_generator.sparse)

        if not sparse:
            result = np.vstack(result)

        # create a DataFrame of descriptors indexed by pdbid
        df_desc = pd.DataFrame(result, index=desc_idx)
        df_desc.index.rename('pdbid', inplace=True)

        # for sparse features leave one column and cast explicitly to list
        if sparse:
            if len(df_desc.columns) > 1:
                raise Exception('There is more than one column in the '
                                'sparse descriptor table.')
            df_desc.columns = ['sparse']
            df_desc['sparse'] = df_desc['sparse'].map(
                lambda x: csr_matrix_to_sparse(x).tolist())

        compression = None
        if desc_path.endswith('.gz'):
            compression = 'gzip'
        # DataFrames are joined on the index (pdbid) since some entries may be missing
        df.join(df_desc, how='inner').to_csv(desc_path,
                                             float_format='%.5g',
                                             compression=compression)
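For completeness, a hypothetical invocation of the method above, assuming `scorer` is an instance of a class that exposes _gen_pdbbind_desc (e.g. one of ODDT's scoring functions) and that local PDBbind releases are unpacked under /data/pdbbind/v2013/, /data/pdbbind/v2016/, and so on; the object name and paths are illustrative, not part of the source.

# Hypothetical call; `scorer` and the paths are assumptions for illustration.
scorer._gen_pdbbind_desc('/data/pdbbind',
                         pdbbind_versions=(2013, 2016),
                         desc_path='descs.csv.gz',  # '.gz' suffix enables gzip compression
                         include_general_set=False,
                         use_proteins=False)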