import numpy as np
import pytest
from numpy.testing import assert_array_equal
from scipy.sparse import vstack as sparse_vstack

from oddt.fingerprints import (sparse_to_dense, dense_to_sparse,
                               sparse_to_csr_matrix, csr_matrix_to_sparse)


def test_sparse_densify():
    """FP densify"""
    sparse_fp = [
        0, 33, 49, 53, 107, 156, 161, 203, 215, 230, 251, 269, 299, 323, 331,
        376, 389, 410, 427, 430, 450, 484, 538, 592, 593, 636, 646, 658, 698,
        699, 702, 741, 753, 807, 850, 861, 882, 915, 915, 915, 969, 969, 1023
    ]

    # count vectors
    dense = sparse_to_dense(sparse_fp, size=1024, count_bits=True)
    csr = sparse_to_csr_matrix(sparse_fp, size=1024, count_bits=True)
    assert_array_equal(dense.reshape(1, -1), csr.toarray())
    resparsed = dense_to_sparse(dense)
    resparsed_csr = csr_matrix_to_sparse(csr)
    assert_array_equal(sparse_fp, resparsed)
    assert_array_equal(sparse_fp, resparsed_csr)

    # bool vectors
    dense = sparse_to_dense(sparse_fp, size=1024, count_bits=False)
    csr = sparse_to_csr_matrix(sparse_fp, size=1024, count_bits=False)
    assert_array_equal(dense.reshape(1, -1), csr.toarray())
    resparsed = dense_to_sparse(dense)
    resparsed_csr = csr_matrix_to_sparse(csr)
    # boolean vectors collapse duplicated bits, hence np.unique
    assert_array_equal(np.unique(sparse_fp), resparsed)
    assert_array_equal(np.unique(sparse_fp), resparsed_csr)

    # test stacking
    np.random.seed(0)
    sparse_fps = np.random.randint(0, 1024, size=(20, 100))
    dense = np.vstack([sparse_to_dense(fp, size=1024) for fp in sparse_fps])
    csr = sparse_vstack(sparse_to_csr_matrix(fp, size=1024)
                        for fp in sparse_fps)
    assert_array_equal(dense, csr.toarray())

    # test exceptions
    with pytest.raises(ValueError):
        csr_matrix_to_sparse(np.array([1, 2, 3]))
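# A minimal illustrative sketch (not part of the test suite) of the round-trip
# semantics exercised above; the tiny fingerprint below is hypothetical. With
# count_bits=True, a bit index repeated N times densifies to the count N, so
# dense_to_sparse can recover the duplicates; with count_bits=False duplicates
# collapse to a single "on" bit.
def _sparse_roundtrip_sketch():
    fp = [5, 5, 9]  # hypothetical fingerprint with a duplicated bit
    counts = sparse_to_dense(fp, size=16, count_bits=True)
    assert counts[5] == 2 and counts[9] == 1
    assert list(dense_to_sparse(counts)) == [5, 5, 9]  # duplicates kept
    bits = sparse_to_dense(fp, size=16, count_bits=False)
    assert bits[5] == 1  # duplicated bit collapsed to a single "on" bit
    assert list(dense_to_sparse(bits)) == [5, 9]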
def _gen_pdbbind_desc(self,
                      pdbbind_dir,
                      pdbbind_versions=(2007, 2012, 2013, 2014, 2015, 2016),
                      desc_path=None,
                      include_general_set=False,
                      use_proteins=False,
                      **kwargs):
    pdbbind_versions = sorted(pdbbind_versions)
    opt = kwargs.get('opt', {})

    # generate metadata
    df = None
    for pdbbind_version in pdbbind_versions:
        p = pdbbind('%s/v%i/' % (pdbbind_dir, pdbbind_version),
                    version=pdbbind_version, opt=opt)
        # merge every PDBbind set (core/refined/general) into one frame,
        # one column per (version, set) pair
        for set_name in p.pdbbind_sets:
            if set_name == 'general_PL':
                dataset_key = '%i_general' % pdbbind_version
            else:
                dataset_key = '%i_%s' % (pdbbind_version, set_name)
            tmp_df = pd.DataFrame({
                'pdbid': list(p.sets[set_name].keys()),
                dataset_key: list(p.sets[set_name].values())})
            if df is not None:
                df = pd.merge(tmp_df, df, how='outer', on='pdbid')
            else:
                df = tmp_df

    df.sort_values('pdbid', inplace=True)
    tmp_act = df['%i_general' % pdbbind_versions[-1]].values
    df = df.set_index('pdbid').notnull()
    df['act'] = tmp_act

    # take non-empty activities from the core + refined sets
    # (plus the general set, if requested)
    df = df[df['act'].notnull() &
            (df.filter(regex='.*_(refined|core)').any(axis=1) |
             include_general_set)]

    # build descriptors
    pdbbind_db = pdbbind('%s/v%i/' % (pdbbind_dir, pdbbind_versions[-1]),
                         version=pdbbind_versions[-1])

    if not desc_path:
        desc_path = path_join(dirname(__file__), 'descs.csv')

    if self.n_jobs is None:
        n_jobs = -1
    else:
        n_jobs = self.n_jobs

    blacklist = []
    if use_proteins:
        # list of protein files that segfault OB 2.4.1
        blacklist = pdbbind_db.protein_blacklist[oddt.toolkit.backend]

    # check if PDBID exists or is blacklisted
    desc_idx = [pid for pid in df.index.values
                if (pid not in blacklist and
                    getattr(pdbbind_db[pid],
                            'protein' if use_proteins else 'pocket')
                    is not None)]

    result = Parallel(n_jobs=n_jobs, verbose=1)(
        delayed(method_caller)(
            self.descriptor_generator,
            'build',
            [pdbbind_db[pid].ligand],
            protein=getattr(pdbbind_db[pid],
                            'protein' if use_proteins else 'pocket'))
        for pid in desc_idx)

    # sparse descs may have different shapes; dense ones are stacked
    # into a single np.array
    sparse = (hasattr(self.descriptor_generator, 'sparse') and
              self.descriptor_generator.sparse)
    if not sparse:
        result = np.vstack(result)

    # create dataframe with descriptors, indexed by pdbid
    df_desc = pd.DataFrame(result, index=desc_idx)
    df_desc.index.rename('pdbid', inplace=True)

    # for sparse features leave one column and cast explicitly to list
    if sparse:
        if len(df_desc.columns) > 1:
            raise ValueError('There is more than one column in the '
                             'sparse descriptor table.')
        df_desc.columns = ['sparse']
        df_desc['sparse'] = df_desc['sparse'].map(
            lambda x: csr_matrix_to_sparse(x).tolist())

    compression = None
    if desc_path[-3:] == '.gz':
        compression = 'gzip'

    # DataFrames are joined on the index (pdbid), since some PDBIDs
    # might be missing descriptors
    df.join(df_desc, how='inner').to_csv(desc_path,
                                         float_format='%.5g',
                                         compression=compression)
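# A minimal, self-contained sketch (toy data, hypothetical set contents) of
# the metadata-merge step in _gen_pdbbind_desc above: each PDBbind set becomes
# one column, an outer merge aligns the sets on pdbid, notnull() turns the
# frame into a boolean membership table, and the newest general set supplies
# the activities. The sketch aligns activities by index for brevity, where the
# method above assigns them positionally after sorting.
def _pdbbind_metadata_sketch():
    import pandas as pd

    sets = {'2016_core': {'1abc': 6.2},
            '2016_refined': {'1abc': 6.2, '2xyz': 4.8},
            '2016_general': {'1abc': 6.2, '2xyz': 4.8, '3def': 7.1}}

    df = None
    for key, data in sets.items():
        tmp = pd.DataFrame({'pdbid': list(data.keys()),
                            key: list(data.values())})
        df = tmp if df is None else pd.merge(tmp, df, how='outer', on='pdbid')

    act = df.set_index('pdbid')['2016_general']
    membership = df.set_index('pdbid').notnull()
    membership['act'] = act
    # keep entries that belong to the core or refined sets
    return membership[membership.filter(regex='_(refined|core)').any(axis=1)]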