Ejemplo n.º 1
0
    def train(self, sf_pickle=''):
        """Train an ensemble of neural networks and save the scoring function.

        Precomputed descriptors and targets are read from CSV files in the
        ``NNScore`` directory located next to this module.  ``n`` networks are
        fitted in parallel on the training set, the 20 with the highest
        ``score`` on the test set are kept, and they are combined with
        ``ensemble_model`` into ``self.model``.  The ensemble's score on both
        sets is printed to standard output.

        Parameters
        ----------
        sf_pickle : str, optional
            File name passed to ``self.save``.  When empty (the default),
            ``'NNScore.pickle'`` is used.

        Returns
        -------
        The value returned by ``self.save``.
        """
        data_dir = dirname(__file__) + '/NNScore'

        def load_csv(file_name):
            # All four data files share the same CSV layout.
            return np.loadtxt(data_dir + '/' + file_name,
                              delimiter=',',
                              dtype=float)

        # load precomputed descriptors and target values
        self.train_descs = load_csv('train_descs.csv')
        self.train_target = load_csv('train_target.csv')
        self.test_descs = load_csv('test_descs.csv')
        self.test_target = load_csv('test_target.csv')

        # Count descriptor columns that are neither all-zero nor constant;
        # this number is used as the size of the network's input layer.
        all_zero = (self.train_descs == 0).all(axis=0)
        constant = self.train_descs.min(axis=0) == self.train_descs.max(axis=0)
        n_dim = (~(all_zero | constant)).sum()

        # Number of networks to sample; the original implementation did 1000
        # (100 reportedly give results good enough).
        n = 1000
        # `range` instead of `xrange` keeps this runnable on Python 2 and 3.
        trained_nets = Parallel(n_jobs=self.n_jobs)(
            delayed(_parallel_helper)(neuralnetwork([n_dim, 5, 1]),
                                      'fit',
                                      self.train_descs,
                                      self.train_target,
                                      train_alg='tnc',
                                      maxfun=1000)
            for _ in range(n))

        # get 20 best: indices of the highest test-set scores, best first
        test_scores = np.array([
            net.score(self.test_descs, self.test_target.flatten())
            for net in trained_nets
        ])
        best_idx = test_scores.argsort()[::-1][:20]
        self.model = ensemble_model([trained_nets[i] for i in best_idx])

        # A single pre-formatted string prints identically under the Python 2
        # print statement and the Python 3 print function.
        r2 = self.model.score(self.test_descs, self.test_target)
        r = np.sqrt(r2)
        print('Test set: R**2: %s  R: %s' % (r2, r))

        r2 = self.model.score(self.train_descs, self.train_target)
        r = np.sqrt(r2)
        print('Train set: R**2: %s  R: %s' % (r2, r))

        return self.save(sf_pickle or 'NNScore.pickle')
Ejemplo n.º 2
0
def test_ensemble_model():
    """Check ``ensemble_model`` predictions and scores against its members.

    Asserts that the ensemble prediction equals the mean of the member
    predictions, that its score matches ``r2_score`` of that prediction, and
    that an ensemble holding a single model predicts and scores like that
    model.
    """
    first_column = np.arange(30, 10, -2, dtype='float64')
    second_column = np.arange(100, 90, -1, dtype='float64')
    features = np.vstack((first_column, second_column)).T

    targets = np.arange(10, dtype='float64')

    rf = regressors.randomforest(random_state=42)
    nn = regressors.neuralnetwork(solver='lbfgs', random_state=42)
    ensemble = ensemble_model((rf, nn))

    # The member models are not fitted individually: fitting the ensemble
    # is expected to update them, since they are used for prediction below.
    ensemble.fit(features, targets)

    ensemble_pred = ensemble.predict(features)
    member_preds = (rf.predict(features), nn.predict(features))
    expected_pred = np.vstack(member_preds).mean(axis=0)
    assert_array_almost_equal(ensemble_pred, expected_pred)
    assert_almost_equal(ensemble.score(features, targets),
                        r2_score(targets, ensemble_pred))

    # An ensemble of a single model should behave exactly like that model.
    # NOTE(review): this uses the bare `neuralnetwork` name, unlike
    # `regressors.neuralnetwork` above - confirm both are imported.
    nn = neuralnetwork(solver='lbfgs', random_state=42)
    ensemble = ensemble_model((nn, ))
    ensemble.fit(features, targets)
    assert_array_almost_equal(ensemble.predict(features),
                              nn.predict(features))
    assert_almost_equal(ensemble.score(features, targets),
                        nn.score(features, targets))
Ejemplo n.º 3
0
def test_regressors():
    """Fit each regressor on a small linear data set and check it.

    For every model the test asserts that each prediction is within 1 of its
    target, that ``score`` exceeds 0.9, and that a pickle round trip yields
    the same predictions.
    """
    first_column = np.arange(30, 10, -2, dtype='float64')
    second_column = np.arange(100, 90, -1, dtype='float64')
    features = np.vstack((first_column, second_column)).T

    targets = np.arange(10, dtype='float64')

    np.random.seed(42)

    # Models are created after seeding, in the same order as they are tested.
    models = (
        regressors.svm(C=10),
        regressors.randomforest(random_state=42),
        regressors.neuralnetwork(solver='lbfgs',
                                 random_state=42,
                                 hidden_layer_sizes=(20, 20)),
        regressors.mlr(),
    )

    for model in models:
        model.fit(features, targets)

        predicted = model.predict(features)
        abs_error = np.abs(predicted.flatten() - targets)
        assert_true((abs_error < 1).all())
        assert_greater(model.score(features, targets), 0.9)

        # A model restored from its pickle must predict the same values.
        restored = pickle.loads(pickle.dumps(model))
        assert_array_almost_equal(predicted, restored.predict(features))
Ejemplo n.º 4
0
    def train(self, home_dir=None, sf_pickle='', pdbbind_version=2007):
        """Train an ensemble of neural networks and save the scoring function.

        Descriptor and target CSV files for the requested PDBbind version are
        read from ``home_dir``.  ``n`` networks with seeded random states are
        fitted in parallel, the 20 scoring highest on the test set are wrapped
        in ``ensemble_model`` as ``self.model``, and the ensemble's score on
        both sets is written to standard error.

        Parameters
        ----------
        home_dir : str, optional
            Directory holding the CSV files.  When falsy, the ``NNScore``
            directory next to this module is used.
        sf_pickle : str, optional
            File name passed to ``self.save``.  When empty,
            ``'NNScore_pdbbind<version>.pickle'`` is used.
        pdbbind_version : int, optional
            Version number interpolated into the CSV and pickle file names.

        Returns
        -------
        The value returned by ``self.save``.
        """
        if not home_dir:
            home_dir = dirname(__file__) + '/NNScore'

        def load_csv(name_template):
            # `name_template` holds one `%i` placeholder for the version.
            return np.loadtxt(home_dir + name_template % (pdbbind_version),
                              delimiter=',',
                              dtype=float)

        # load precomputed descriptors and target values
        self.train_descs = load_csv('/train_descs_pdbbind%i.csv')
        self.train_target = load_csv('/train_target_pdbbind%i.csv')
        self.test_descs = load_csv('/test_descs_pdbbind%i.csv')
        self.test_target = load_csv('/test_target_pdbbind%i.csv')

        # Number of descriptor columns that are neither all-zero nor constant.
        # NOTE(review): the value is not used anywhere else in this method.
        all_zero = (self.train_descs == 0).all(axis=0)
        constant = self.train_descs.min(axis=0) == self.train_descs.max(axis=0)
        n_dim = (~(all_zero | constant)).sum()

        # Number of networks to sample; the original implementation did 1000
        # (100 reportedly give results good enough).
        n = 1000
        # Make nets reproducible: draw one seed per network after seeding.
        # Presumably `random_seed` seeds NumPy's global generator - confirm.
        random_seed(1)
        seeds = np.random.randint(123456789, size=n)
        fit_jobs = (
            delayed(_parallel_helper)(
                neuralnetwork(
                    (5, ),
                    random_state=seed,
                    activation='logistic',
                    solver='lbfgs',
                    max_iter=10000,
                ), 'fit', self.train_descs, self.train_target)
            for seed in seeds)
        trained_nets = Parallel(n_jobs=self.n_jobs, verbose=10)(fit_jobs)

        # get 20 best: indices of the highest test-set scores, best first
        test_scores = np.array([
            net.score(self.test_descs, self.test_target.flatten())
            for net in trained_nets
        ])
        best_idx = test_scores.argsort()[::-1][:20]
        self.model = ensemble_model([trained_nets[i] for i in best_idx])

        test_r2 = self.model.score(self.test_descs, self.test_target)
        print('Test set: R**2:', test_r2, ' R:', np.sqrt(test_r2),
              file=sys.stderr)

        train_r2 = self.model.score(self.train_descs, self.train_target)
        print('Train set: R**2:', train_r2, ' R:', np.sqrt(train_r2),
              file=sys.stderr)

        return self.save(sf_pickle
                         or 'NNScore_pdbbind%i.pickle' % (pdbbind_version))
Ejemplo n.º 5
0
    def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
        """Train an ensemble of neural networks and save the scoring function.

        Descriptors are loaded through ``_load_pdbbind_desc`` from
        ``nnscore_descs.csv`` in ``home_dir`` for the requested PDBbind
        version.  ``n`` networks with seeded random states are fitted in
        parallel, the 20 scoring highest on the test set are wrapped in
        ``ensemble_model`` as ``self.model``, and R**2, R and RMSE for both
        sets are written to standard error.

        Parameters
        ----------
        home_dir : str, optional
            Directory holding ``nnscore_descs.csv``.  When falsy, the
            ``NNScore`` directory next to this module is used.
        sf_pickle : str, optional
            File name passed to ``self.save``.  When ``None``,
            ``'NNScore_pdbbind<version>.pickle'`` is used.
        pdbbind_version : int, optional
            PDBbind version used both to select the descriptors and to build
            the default pickle name.

        Returns
        -------
        The value returned by ``self.save``.
        """
        if not home_dir:
            home_dir = dirname(__file__) + '/NNScore'

        desc_path = path_join(home_dir, 'nnscore_descs.csv')

        # Forward the caller's version instead of a hard-coded 2016, so the
        # loaded data matches the version used in the default pickle name.
        # Presumably this call populates self.train_descs / self.train_target
        # / self.test_descs / self.test_target, which are read below - confirm.
        super(nnscore, self)._load_pdbbind_desc(
            desc_path, pdbbind_version=pdbbind_version)

        # Number of networks to sample; the original implementation did 1000
        # (100 reportedly give results good enough).
        # TODO: allow user to specify number of nets?
        n = 1000
        # Make nets reproducible: draw one seed per network after seeding.
        random_seed(1)
        seeds = np.random.randint(123456789, size=n)
        trained_nets = (Parallel(
            n_jobs=self.n_jobs, verbose=10,
            pre_dispatch='all')(delayed(method_caller)(neuralnetwork(
                (5, ),
                random_state=seeds[i],
                activation='logistic',
                solver='lbfgs',
                max_iter=10000), 'fit', self.train_descs, self.train_target)
                                for i in range(n)))

        # get 20 best: ascending sort by test-set score, then take the tail
        trained_nets.sort(
            key=lambda net: net.score(self.test_descs,
                                      self.test_target.flatten()))
        self.model = ensemble_model(trained_nets[-20:])

        error = rmse(self.model.predict(self.test_descs), self.test_target)
        r2 = self.model.score(self.test_descs, self.test_target)
        r = np.sqrt(r2)
        print('Test set:',
              'R**2: %.4f' % r2,
              'R: %.4f' % r,
              'RMSE: %.4f' % error,
              sep='\t',
              file=sys.stderr)

        error = rmse(self.model.predict(self.train_descs), self.train_target)
        r2 = self.model.score(self.train_descs, self.train_target)
        r = np.sqrt(r2)
        print('Train set:',
              'R**2: %.4f' % r2,
              'R: %.4f' % r,
              'RMSE: %.4f' % error,
              sep='\t',
              file=sys.stderr)

        if sf_pickle is None:
            return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
        else:
            return self.save(sf_pickle)
Ejemplo n.º 6
0
    def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
        """Train an ensemble of neural networks and save the scoring function.

        Descriptors are loaded through ``_load_pdbbind_desc`` from
        ``nnscore_descs.csv`` in ``home_dir``.  ``n`` seeded networks are
        fitted in parallel, the 20 scoring highest on the test set form
        ``self.model``, and R2, Pearson R, RMSE and SD for the test and train
        sets are written to standard error.  A set whose target has fewer
        than 3 values is skipped with a message.

        Parameters
        ----------
        home_dir : str, optional
            Directory holding ``nnscore_descs.csv``.  When falsy, the
            ``NNScore`` directory next to this module is used.
        sf_pickle : str, optional
            File name passed to ``self.save``.  When ``None``,
            ``'NNScore_pdbbind<version>.pickle'`` is used.
        pdbbind_version : int, optional
            PDBbind version passed to the loader and used in the default
            pickle name.

        Returns
        -------
        The value returned by ``self.save``.
        """
        if not home_dir:
            home_dir = dirname(__file__) + '/NNScore'

        desc_path = path_join(home_dir, 'nnscore_descs.csv')

        # Presumably populates the train/test descriptor and target
        # attributes read below - confirm against the base class.
        super(nnscore, self)._load_pdbbind_desc(
            desc_path, pdbbind_version=pdbbind_version)

        # Number of networks to sample; the original implementation did 1000
        # (100 reportedly give results good enough).
        # TODO: allow user to specify number of nets?
        n = 1000
        # Make nets reproducible: draw one seed per network after seeding.
        random_seed(1)
        seeds = np.random.randint(123456789, size=n)
        fit_jobs = (
            delayed(method_caller)(
                neuralnetwork((5,),
                              random_state=seed,
                              activation='logistic',
                              solver='lbfgs',
                              max_iter=10000),
                'fit',
                self.train_descs,
                self.train_target)
            for seed in seeds)
        run_parallel = Parallel(n_jobs=self.n_jobs, verbose=10,
                                pre_dispatch='all')
        trained_nets = run_parallel(fit_jobs)

        def test_score(net):
            # Sort key: the network's score on the test set.
            return net.score(self.test_descs, self.test_target.flatten())

        # get 20 best: ascending sort by test-set score, then take the tail
        trained_nets.sort(key=test_score)
        self.model = ensemble_model(trained_nets[-20:])

        # Both predictions are computed up front, before anything is printed.
        test_pred = self.model.predict(self.test_descs)
        train_pred = self.model.predict(self.train_descs)
        report = (('Test', test_pred, self.test_target),
                  ('Train', train_pred, self.train_target))

        for label, predicted, observed in report:
            if len(observed) >= 3:
                print('%s set:' % label,
                      'R2_score: %.4f' % r2_score(observed, predicted),
                      'Rp: %.4f' % pearsonr(observed, predicted)[0],
                      'RMSE: %.4f' % rmse(observed, predicted),
                      'SD: %.4f' % standard_deviation_error(observed,
                                                            predicted),
                      sep='\t', file=sys.stderr)
            else:
                print('There are less than 3 values to predict, skipping.', file=sys.stderr)

        if sf_pickle is not None:
            return self.save(sf_pickle)
        return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
Ejemplo n.º 7
0
    def train(self, sf_pickle=''):
        """Train an ensemble of neural networks and save the scoring function.

        Reads the precomputed train/test descriptor and target CSV files from
        the ``NNScore`` directory next to this module, fits ``n`` networks in
        parallel, keeps the 20 with the highest ``score`` on the test set as
        ``self.model`` (an ``ensemble_model``), and prints the ensemble's
        score on both sets to standard output.

        Parameters
        ----------
        sf_pickle : str, optional
            File name passed to ``self.save``.  When empty (the default),
            ``'NNScore.pickle'`` is used.

        Returns
        -------
        The value returned by ``self.save``.
        """
        csv_prefix = dirname(__file__) + '/NNScore/'
        csv_options = dict(delimiter=',', dtype=float)

        # load precomputed descriptors and target values
        self.train_descs = np.loadtxt(csv_prefix + 'train_descs.csv',
                                      **csv_options)
        self.train_target = np.loadtxt(csv_prefix + 'train_target.csv',
                                       **csv_options)
        self.test_descs = np.loadtxt(csv_prefix + 'test_descs.csv',
                                     **csv_options)
        self.test_target = np.loadtxt(csv_prefix + 'test_target.csv',
                                      **csv_options)

        # Input layer size: the number of descriptor columns that are neither
        # all-zero nor constant across the training set.
        col_min = self.train_descs.min(axis=0)
        col_max = self.train_descs.max(axis=0)
        uninformative = (self.train_descs == 0).all(axis=0) | (col_min
                                                               == col_max)
        n_dim = (~uninformative).sum()

        # Number of networks to sample; the original implementation did 1000
        # (100 reportedly give results good enough).
        n = 1000
        # `range` replaces the Python 2-only `xrange`; the index is unused.
        fit_jobs = (delayed(_parallel_helper)(neuralnetwork([n_dim, 5, 1]),
                                              'fit',
                                              self.train_descs,
                                              self.train_target,
                                              train_alg='tnc',
                                              maxfun=1000)
                    for _ in range(n))
        trained_nets = Parallel(n_jobs=self.n_jobs)(fit_jobs)

        # get 20 best: indices of the highest test-set scores, best first
        best_idx = np.array([
            net.score(self.test_descs, self.test_target.flatten())
            for net in trained_nets
        ]).argsort()[::-1][:20]
        self.model = ensemble_model([trained_nets[i] for i in best_idx])

        # Formatting into one string first gives the same output as the old
        # Python 2 print statement and also works as a Python 3 call.
        r2 = self.model.score(self.test_descs, self.test_target)
        print('Test set: R**2: %s  R: %s' % (r2, np.sqrt(r2)))

        r2 = self.model.score(self.train_descs, self.train_target)
        print('Train set: R**2: %s  R: %s' % (r2, np.sqrt(r2)))

        if sf_pickle:
            return self.save(sf_pickle)
        return self.save('NNScore.pickle')
Ejemplo n.º 8
0
    def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
        """Train an ensemble of neural networks and save the scoring function.

        Descriptors are loaded through ``_load_pdbbind_desc`` from
        ``nnscore_descs.csv`` in ``home_dir``.  ``n`` seeded networks are
        fitted in parallel, the 20 scoring highest on the test set form
        ``self.model``, and R2, Pearson R, RMSE and SD for the test and train
        sets are written to standard error.

        Parameters
        ----------
        home_dir : str, optional
            Directory holding ``nnscore_descs.csv``.  When falsy, the
            ``NNScore`` directory next to this module is used.
        sf_pickle : str, optional
            File name passed to ``self.save``.  When ``None``,
            ``'NNScore_pdbbind<version>.pickle'`` is used.
        pdbbind_version : int, optional
            PDBbind version passed to the loader and used in the default
            pickle name.

        Returns
        -------
        The value returned by ``self.save``.
        """
        if not home_dir:
            home_dir = dirname(__file__) + '/NNScore'

        desc_path = path_join(home_dir, 'nnscore_descs.csv')

        # Presumably populates the train/test descriptor and target
        # attributes read below - confirm against the base class.
        super(nnscore, self)._load_pdbbind_desc(
            desc_path, pdbbind_version=pdbbind_version)

        # Number of networks to sample; the original implementation did 1000
        # (100 reportedly give results good enough).
        # TODO: allow user to specify number of nets?
        n = 1000
        # Make nets reproducible: draw one seed per network after seeding.
        random_seed(1)
        seeds = np.random.randint(123456789, size=n)

        def fit_job(seed):
            # One deferred `fit` call on a freshly built, seeded network.
            net = neuralnetwork((5,),
                                random_state=seed,
                                activation='logistic',
                                solver='lbfgs',
                                max_iter=10000)
            return delayed(method_caller)(net,
                                          'fit',
                                          self.train_descs,
                                          self.train_target)

        trained_nets = (
            Parallel(n_jobs=self.n_jobs, verbose=10, pre_dispatch='all')(
                fit_job(seed) for seed in seeds))

        def test_score(net):
            # Sort key: the network's score on the test set.
            return net.score(self.test_descs, self.test_target.flatten())

        # get 20 best: ascending sort by test-set score, then take the tail
        trained_nets.sort(key=test_score)
        self.model = ensemble_model(trained_nets[-20:])

        # Both predictions are computed up front, before anything is printed.
        test_pred = self.model.predict(self.test_descs)
        train_pred = self.model.predict(self.train_descs)

        for label, predicted, observed in (
                ('Test', test_pred, self.test_target),
                ('Train', train_pred, self.train_target)):
            print('%s set:' % label,
                  'R2_score: %.4f' % r2_score(observed, predicted),
                  'Rp: %.4f' % pearsonr(observed, predicted)[0],
                  'RMSE: %.4f' % rmse(observed, predicted),
                  'SD: %.4f' % standard_deviation_error(observed, predicted),
                  sep='\t', file=sys.stderr)

        if sf_pickle is not None:
            return self.save(sf_pickle)
        return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
Ejemplo n.º 9
0
    def train(self, home_dir=None, sf_pickle='', pdbbind_version=2016):
        """Train an ensemble of neural networks and save the scoring function.

        Descriptors are read from ``nnscore_descs.csv`` in ``home_dir``.  Rows
        flagged as ``refined`` but not ``core`` for the given PDBbind version
        form the training set; rows flagged as ``core`` form the test set.
        ``n`` seeded networks are fitted in parallel, the 20 scoring highest
        on the test set form ``self.model``, and R**2, R and RMSE for both
        sets are written to standard error.

        Parameters
        ----------
        home_dir : str, optional
            Directory holding ``nnscore_descs.csv``.  When falsy, the
            ``NNScore`` directory next to this module is used.
        sf_pickle : str, optional
            File name passed to ``self.save``.  When empty,
            ``'NNScore_pdbbind<version>.pickle'`` is used.
        pdbbind_version : int, optional
            PDBbind version used to pick the set-membership columns and to
            build the default pickle name.

        Returns
        -------
        The value returned by ``self.save``.
        """
        if not home_dir:
            home_dir = dirname(__file__) + '/NNScore'

        # load precomputed descriptors and target values
        df = pd.read_csv(home_dir + '/nnscore_descs.csv', index_col='pdbid')

        train_set = 'refined'
        test_set = 'core'
        # Descriptor columns are named '0', '1', ... one per descriptor.
        cols = [str(i) for i in range(len(self.descriptor_generator))]

        in_train = df['%i_%s' % (pdbbind_version, train_set)]
        in_test = df['%i_%s' % (pdbbind_version, test_set)]

        # Training rows: in the train set and not also in the test set.
        train_rows = df[in_train & ~in_test]
        test_rows = df[in_test]

        self.train_descs = train_rows[cols].values
        self.train_target = train_rows['act'].values
        self.test_descs = test_rows[cols].values
        self.test_target = test_rows['act'].values

        # Number of networks to sample; the original implementation did 1000
        # (100 reportedly give results good enough).
        n = 1000
        # Make nets reproducible: draw one seed per network after seeding.
        random_seed(1)
        seeds = np.random.randint(123456789, size=n)
        fit_jobs = (
            delayed(_parallel_helper)(
                neuralnetwork(
                    (5, ),
                    random_state=seed,
                    activation='logistic',
                    solver='lbfgs',
                    max_iter=10000,
                ), 'fit', self.train_descs, self.train_target)
            for seed in seeds)
        trained_nets = Parallel(n_jobs=self.n_jobs, verbose=10)(fit_jobs)

        # get 20 best: indices of the highest test-set scores, best first
        test_scores = np.array([
            net.score(self.test_descs, self.test_target.flatten())
            for net in trained_nets
        ])
        best_idx = test_scores.argsort()[::-1][:20]
        self.model = ensemble_model([trained_nets[i] for i in best_idx])

        test_rmse = rmse(self.model.predict(self.test_descs),
                         self.test_target)
        test_r2 = self.model.score(self.test_descs, self.test_target)
        print('Test set:',
              'R**2: %.4f' % test_r2,
              'R: %.4f' % np.sqrt(test_r2),
              'RMSE: %.4f' % test_rmse,
              sep='\t',
              file=sys.stderr)

        train_rmse = rmse(self.model.predict(self.train_descs),
                          self.train_target)
        train_r2 = self.model.score(self.train_descs, self.train_target)
        print('Train set:',
              'R**2: %.4f' % train_r2,
              'R: %.4f' % np.sqrt(train_r2),
              'RMSE: %.4f' % train_rmse,
              sep='\t',
              file=sys.stderr)

        return self.save(sf_pickle
                         or 'NNScore_pdbbind%i.pickle' % (pdbbind_version))
Ejemplo n.º 10
0
    prob = cls.predict_proba(X)
    assert_array_almost_equal(prob, [[0, 1]] * 5 + [[1, 0]] * 5, decimal=1)
    log_prob = cls.predict_log_proba(X)
    assert_array_almost_equal(np.log(prob), log_prob)

    pickled = pickle.dumps(cls)
    reloaded = pickle.loads(pickled)
    prob_reloaded = reloaded.predict_proba(X)
    assert_array_almost_equal(prob, prob_reloaded)


@pytest.mark.parametrize('reg', [
    regressors.svm(C=10),
    regressors.randomforest(random_state=42),
    regressors.neuralnetwork(
        solver='lbfgs', random_state=42, hidden_layer_sizes=(20, 20)),
    regressors.mlr()
])
def test_regressors(reg):
    X = np.vstack(
        (np.arange(30, 10,
                   -2, dtype='float64'), np.arange(100,
                                                   90,
                                                   -1,
                                                   dtype='float64'))).T

    Y = np.arange(10, dtype='float64')

    np.random.seed(42)

    reg.fit(X, Y)