コード例 #1
0
ファイル: test_fingerprints.py プロジェクト: oddt/oddt
def test_folding():
    """FP Folding"""
    # Folding the maximum hash must land on the last index of each fold size.
    for size in (1024, 1234567890, MAX_HASH_VALUE / 2, MAX_HASH_VALUE - 1):
        assert_array_equal(fold([MAX_HASH_VALUE], size), [size - 1])

    # Folding the minimum hash must always map to index 0.
    for size in (1024, 1234567890, MAX_HASH_VALUE / 2, MAX_HASH_VALUE - 1):
        assert_array_equal(fold([MIN_HASH_VALUE], size), [0])

    # Range check: with size == MAX_HASH_VALUE every hash shifts down by one.
    fp = np.arange(1, MAX_HASH_VALUE, 1e6, dtype=int)
    assert_array_equal(fold(fp, MAX_HASH_VALUE), fp - 1)
コード例 #2
0
ファイル: test_fingerprints.py プロジェクト: ravila4/oddt
def test_folding():
    """FP Folding"""
    # Upper bound: the max hash folds onto the highest available index.
    upper_cases = [
        (1024, 1023),
        (1234567890, 1234567889),
        (MAX_HASH_VALUE / 2, MAX_HASH_VALUE / 2 - 1),
        (MAX_HASH_VALUE - 1, MAX_HASH_VALUE - 2),
    ]
    for size, expected in upper_cases:
        assert_array_equal(fold([MAX_HASH_VALUE], size), [expected])

    # Lower bound: the min hash folds onto index 0 regardless of size.
    for size in (1024, 1234567890, MAX_HASH_VALUE / 2, MAX_HASH_VALUE - 1):
        assert_array_equal(fold([MIN_HASH_VALUE], size), [0])

    # Range check across the whole hash span: size == MAX_HASH_VALUE
    # should map each hash h to h - 1.
    fp = np.arange(1, MAX_HASH_VALUE, 1e6, dtype=int)
    assert_array_equal(fold(fp, MAX_HASH_VALUE), fp - 1)
コード例 #3
0
ファイル: __init__.py プロジェクト: xianqiangsun/oddt
    def _load_pdbbind_desc(self, desc_path, pdbbind_version=2016,
                           train_set='refined', test_set='core',
                           train_blacklist=None, fold_size=None):
        """Load precomputed PDBbind descriptors from CSV and build
        train/test splits.

        Populates ``self.train_descs``, ``self.train_target``,
        ``self.test_descs`` and ``self.test_target``.

        Parameters
        ----------
        desc_path : str
            Path to a CSV file indexed by the 'pdbid' column. It holds
            either dense per-feature columns named '0'..'N-1' or a single
            'sparse' column with stringified index arrays, plus an 'act'
            activity column and boolean '<version>_<set>' membership
            columns (e.g. '2016_refined').
        pdbbind_version : int (default=2016)
            PDBbind release used to select the membership columns.
        train_set : str or iterable of str (default='refined')
            PDBbind subset name(s) whose union forms the training set.
        test_set : str (default='core')
            PDBbind subset used for testing; always removed from training.
        train_blacklist : iterable of str, optional
            PDBIDs to exclude from the training set.
        fold_size : int, optional
            If given, sparse descriptors are folded to this size before
            being converted to CSR matrices.
        """

        df = pd.read_csv(desc_path, index_col='pdbid')

        # generate dense representation of sparse descriptor in CSV
        # (dense CSVs store one column per feature, named '0'..'N-1')
        cols = list(map(str, range(len(self.descriptor_generator))))
        if 'sparse' in df.columns:
            # convert strings to np.arrays; x[1:-1] strips the surrounding
            # brackets of the serialized array literal
            df['sparse'] = df['sparse'].map(
                lambda x: np.fromstring(x[1:-1], dtype=np.uint64, sep=','))
            cols = 'sparse'  # sparse array will have one column
            # fold only if necessary
            if fold_size:
                df['sparse'] = df['sparse'].map(lambda x: fold(x, fold_size))
            # convert to sparse csr_matrix
            df['sparse'] = df['sparse'].map(
                partial(sparse_to_csr_matrix,
                        size=len(self.descriptor_generator)))

        # Training mask: a single subset column, or the union (row-wise
        # any) of several subsets.
        if isinstance(train_set, six.string_types):
            train_idx = df['%i_%s' % (pdbbind_version, train_set)]
        else:
            train_idx = df[['%i_%s' % (pdbbind_version, s)
                            for s in train_set]].any(axis=1)
        if train_blacklist:
            train_idx &= ~df.index.isin(train_blacklist)
        # never train on entries that belong to the test subset
        train_idx &= ~df['%i_%s' % (pdbbind_version, test_set)]

        # load sparse matrices as training is usually faster on them
        if 'sparse' in df.columns:
            self.train_descs = sparse_vstack(df.loc[train_idx, cols].values,
                                             format='csr')
        else:
            self.train_descs = df.loc[train_idx, cols].values
        self.train_target = df.loc[train_idx, 'act'].values

        test_idx = df['%i_%s' % (pdbbind_version, test_set)]
        if 'sparse' in df.columns:
            self.test_descs = sparse_vstack(df.loc[test_idx, cols].values,
                                            format='csr')
        else:
            self.test_descs = df.loc[test_idx, cols].values
        self.test_target = df.loc[test_idx, 'act'].values