def test_folding():
    """FP Folding: check both ends of the hash range and a sampled sweep."""
    # Each entry pairs a fold size with the bin expected for MAX_HASH_VALUE.
    size_and_top_bin = [
        (1024, 1023),
        (1234567890, 1234567889),
        (MAX_HASH_VALUE / 2, MAX_HASH_VALUE / 2 - 1),
        (MAX_HASH_VALUE - 1, MAX_HASH_VALUE - 2),
    ]

    # Upper bound: the largest hash must land in the last bin (size - 1).
    for size, top_bin in size_and_top_bin:
        assert_array_equal(fold([MAX_HASH_VALUE], size), [top_bin])

    # Lower bound: the smallest hash must land in bin 0 for every size.
    for size, _ in size_and_top_bin:
        assert_array_equal(fold([MIN_HASH_VALUE], size), [0])

    # Range check: folding to MAX_HASH_VALUE bins shifts each hash down by one.
    hashes = np.arange(1, MAX_HASH_VALUE, 1e6, dtype=int)
    assert_array_equal(fold(hashes, MAX_HASH_VALUE), hashes - 1)
def _load_pdbbind_desc(self, desc_path, pdbbind_version=2016,
                       train_set='refined', test_set='core',
                       train_blacklist=None, fold_size=None):
    """Load descriptors from a CSV file and split them into train/test sets.

    The CSV is read with ``pdbid`` as the index. It must contain an ``act``
    column (used as the target) and one column per set named
    ``'<pdbbind_version>_<set name>'`` (e.g. ``'2016_core'``) that is used
    as a row mask. Descriptors are taken either from a single ``sparse``
    column or, when that column is absent, from dense columns named
    ``'0'`` .. ``str(len(self.descriptor_generator) - 1)``.

    Parameters
    ----------
    desc_path : str
        Path of the CSV file, passed to ``pandas.read_csv``.
    pdbbind_version : int (default=2016)
        Version used as the prefix of the set columns.
    train_set : str or iterable of str (default='refined')
        Name of the training set column, or several names; with several
        names a row is used for training if it belongs to any of them.
    test_set : str (default='core')
        Name of the test set column. Rows of the test set are always
        excluded from the training set.
    train_blacklist : list-like or None (default=None)
        Index values (``pdbid``) to exclude from the training set.
    fold_size : int or None (default=None)
        If truthy, every parsed sparse descriptor is passed through
        ``fold(x, fold_size)`` before conversion. Ignored when the CSV has
        no ``sparse`` column.

    Returns
    -------
    None
        Results are stored on the instance as ``train_descs``,
        ``train_target``, ``test_descs`` and ``test_target``.
    """
    df = pd.read_csv(desc_path, index_col='pdbid')

    def set_column(set_name):
        # Name of the column flagging membership in a given set.
        return '%i_%s' % (pdbbind_version, set_name)

    n_features = len(self.descriptor_generator)
    is_sparse = 'sparse' in df.columns
    if is_sparse:
        cols = 'sparse'  # sparse array will have one column
        # Each cell is a string; drop its first and last character and
        # parse the rest as comma-separated unsigned integers.
        sparse = df['sparse'].map(
            lambda x: np.fromstring(x[1:-1], dtype=np.uint64, sep=','))
        # fold only if necessary
        if fold_size:
            sparse = sparse.map(lambda x: fold(x, fold_size))
        # convert to sparse csr_matrix
        df['sparse'] = sparse.map(
            partial(sparse_to_csr_matrix, size=n_features))
    else:
        # dense representation: one CSV column per descriptor
        cols = list(map(str, range(n_features)))

    def select_descs(row_mask):
        # Stack the per-row sparse matrices into one CSR matrix (training
        # is usually faster on them); dense descriptors are returned as is.
        values = df.loc[row_mask, cols].values
        if is_sparse:
            return sparse_vstack(values, format='csr')
        return values

    if isinstance(train_set, six.string_types):
        train_idx = df[set_column(train_set)]
    else:
        train_idx = df[[set_column(s) for s in train_set]].any(axis=1)

    # The mask is combined with non-in-place operators on purpose: an
    # in-place ``&=`` on a Series obtained from ``df[col]`` can write
    # through to the DataFrame column and corrupt later reads of it.
    if train_blacklist:
        train_idx = train_idx & ~df.index.isin(train_blacklist)
    test_idx = df[set_column(test_set)]
    train_idx = train_idx & ~test_idx

    self.train_descs = select_descs(train_idx)
    self.train_target = df.loc[train_idx, 'act'].values

    self.test_descs = select_descs(test_idx)
    self.test_target = df.loc[test_idx, 'act'].values