Esempio n. 1
0
    def train(self, sf_pickle=''):
        """Train an ensemble of neural networks on precomputed NNScore
        descriptors and save the resulting scoring function.

        Parameters
        ----------
        sf_pickle : str
            Pickle filename for the trained scoring function; when empty,
            'NNScore.pickle' is used.

        Returns
        -------
        The result of ``self.save`` (the pickle file written).
        """
        # load precomputed descriptors and target values
        self.train_descs = np.loadtxt(dirname(__file__) + '/NNScore/train_descs.csv', delimiter=',', dtype=float)
        self.train_target = np.loadtxt(dirname(__file__) + '/NNScore/train_target.csv', delimiter=',', dtype=float)
        self.test_descs = np.loadtxt(dirname(__file__) + '/NNScore/test_descs.csv', delimiter=',', dtype=float)
        self.test_target = np.loadtxt(dirname(__file__) + '/NNScore/test_target.csv', delimiter=',', dtype=float)

        # input layer size: skip all-zero and constant descriptor columns
        n_dim = (~((self.train_descs == 0).all(axis=0) | (self.train_descs.min(axis=0) == self.train_descs.max(axis=0)))).sum()

        # number of networks to sample; the original implementation did 1000,
        # but 100 gives results that are good enough.
        n = 1000
        # make nets reproducible
        random_seed(1)
        seeds = np.random.randint(123456789, size=n)
        # ported to Python 3: range() replaces the legacy xrange()
        trained_nets = Parallel(n_jobs=self.n_jobs, verbose=10)(
            delayed(_parallel_helper)(neuralnetwork([n_dim, 5, 1],
                                                    random_state=seeds[i]),
                                      'fit', self.train_descs,
                                      self.train_target,
                                      neural_network__train_alg='tnc',
                                      neural_network__maxfun=10000)
            for i in range(n))
        # keep the 20 nets that score best on the test set
        best_idx = np.array([net.score(self.test_descs, self.test_target.flatten()) for net in trained_nets]).argsort()[::-1][:20]
        self.model = ensemble_model([trained_nets[i] for i in best_idx])

        # ported to Python 3: print() function replaces the print statement
        r2 = self.model.score(self.test_descs, self.test_target)
        r = np.sqrt(r2)
        print('Test set: R**2:', r2, ' R:', r)

        r2 = self.model.score(self.train_descs, self.train_target)
        r = np.sqrt(r2)
        print('Train set: R**2:', r2, ' R:', r)

        if sf_pickle:
            return self.save(sf_pickle)
        else:
            return self.save('NNScore.pickle')
Esempio n. 2
0
    def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
        """Fit the random forest on PDBBind descriptors, report accuracy on
        the test/train/OOB sets, optionally compile the trees, and pickle
        the trained scoring function.

        Parameters
        ----------
        home_dir : str or None
            Directory holding the descriptor CSV; defaults to the bundled
            'RFScore' directory.
        sf_pickle : str or None
            Output pickle path; a versioned default name is used when None.
        pdbbind_version : int
            PDBBind release used to select train/test rows.
        """
        if not home_dir:
            home_dir = dirname(__file__) + '/RFScore'

        descs_file = path_join(home_dir, 'rfscore_descs_v%i.csv' % self.version)

        super(rfscore,
              self)._load_pdbbind_desc(descs_file,
                                       pdbbind_version=pdbbind_version)

        # drop descriptor columns that never exceed the sparsity threshold
        if self.spr > 0:
            self.mask = (self.train_descs > self.spr).any(axis=0)
            if self.mask.sum() > 0:
                self.train_descs = self.train_descs[:, self.mask]
                self.test_descs = self.test_descs[:, self.mask]

        # fixed seed keeps the fitted model reproducible
        random_seed(1)
        self.model.fit(self.train_descs, self.train_target)

        print('Training RFScore v%i on PDBBind v%i' %
              (self.version, pdbbind_version),
              file=sys.stderr)

        evaluations = (
            ('Test', self.model.predict(self.test_descs), self.test_target),
            ('Train', self.model.predict(self.train_descs), self.train_target),
            ('OOB', self.model.oob_prediction_, self.train_target),
        )

        for label, predicted, observed in evaluations:
            print('%s set:' % label,
                  'R2_score: %.4f' % r2_score(observed, predicted),
                  'Rp: %.4f' % pearsonr(observed, predicted)[0],
                  'RMSE: %.4f' % rmse(observed, predicted),
                  'SD: %.4f' % standard_deviation_error(observed, predicted),
                  sep='\t',
                  file=sys.stderr)

        # compile the forest for faster prediction if the package is present
        if compiledtrees is not None:
            try:
                print('Compiling Random Forest using sklearn-compiledtrees',
                      file=sys.stderr)
                self.model = compiledtrees.CompiledRegressionPredictor(
                    self.model, n_jobs=self.n_jobs)
            except Exception as e:
                print('Failed to compile Random Forest with exception: %s' % e,
                      file=sys.stderr)
                print('Continuing without compiled RF.', file=sys.stderr)

        if sf_pickle is None:
            return self.save('RFScore_v%i_pdbbind%i.pickle' %
                             (self.version, pdbbind_version))
        return self.save(sf_pickle)
Esempio n. 3
0
    def train(self, home_dir=None, sf_pickle='', pdbbind_version=2007):
        """Train an ensemble of neural networks on precomputed NNScore
        descriptors for a given PDBBind release and save the result.

        Parameters
        ----------
        home_dir : str or None
            Directory with the descriptor CSV files; defaults to the
            bundled 'NNScore' directory.
        sf_pickle : str
            Output pickle path; a versioned default name is used when empty.
        pdbbind_version : int
            PDBBind release suffix of the CSV files to load.

        Returns
        -------
        The result of ``self.save`` (the pickle file written).
        """
        if not home_dir:
            home_dir = dirname(__file__) + '/NNScore'
        # load precomputed descriptors and target values
        self.train_descs = np.loadtxt(home_dir + '/train_descs_pdbbind%i.csv' %
                                      (pdbbind_version),
                                      delimiter=',',
                                      dtype=float)
        self.train_target = np.loadtxt(
            home_dir + '/train_target_pdbbind%i.csv' % (pdbbind_version),
            delimiter=',',
            dtype=float)
        self.test_descs = np.loadtxt(home_dir + '/test_descs_pdbbind%i.csv' %
                                     (pdbbind_version),
                                     delimiter=',',
                                     dtype=float)
        self.test_target = np.loadtxt(home_dir + '/test_target_pdbbind%i.csv' %
                                      (pdbbind_version),
                                      delimiter=',',
                                      dtype=float)

        # NOTE: a legacy computation of the informative-column count (n_dim)
        # was removed here; it was dead code, since the network topology
        # below is fixed to a single hidden layer of 5 units.

        # number of networks to sample; the original implementation did 1000,
        # but 100 gives results that are good enough.
        n = 1000
        # make nets reproducible
        random_seed(1)
        seeds = np.random.randint(123456789, size=n)
        trained_nets = Parallel(n_jobs=self.n_jobs, verbose=10)(
            delayed(_parallel_helper)(neuralnetwork(
                (5, ),
                random_state=seeds[i],
                activation='logistic',
                solver='lbfgs',
                max_iter=10000,
            ), 'fit', self.train_descs, self.train_target) for i in range(n))
        # keep the 20 nets that score best on the test set
        best_idx = np.array([
            net.score(self.test_descs, self.test_target.flatten())
            for net in trained_nets
        ]).argsort()[::-1][:20]
        self.model = ensemble_model([trained_nets[i] for i in best_idx])

        r2 = self.model.score(self.test_descs, self.test_target)
        r = np.sqrt(r2)
        print('Test set: R**2:', r2, ' R:', r, file=sys.stderr)

        r2 = self.model.score(self.train_descs, self.train_target)
        r = np.sqrt(r2)
        print('Train set: R**2:', r2, ' R:', r, file=sys.stderr)

        if sf_pickle:
            return self.save(sf_pickle)
        else:
            return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
Esempio n. 4
0
    def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
        """Train the NNScore neural-network ensemble on PDBBind descriptors
        and save the scoring function.

        Parameters
        ----------
        home_dir : str or None
            Directory with 'nnscore_descs.csv'; defaults to the bundled
            'NNScore' directory.
        sf_pickle : str or None
            Output pickle path; a versioned default name is used when None.
        pdbbind_version : int
            PDBBind release used to select train/test rows.

        Returns
        -------
        The result of ``self.save`` (the pickle file written).
        """
        if not home_dir:
            home_dir = dirname(__file__) + '/NNScore'

        desc_path = path_join(home_dir, 'nnscore_descs.csv')

        # BUG FIX: forward the caller's pdbbind_version (it was previously
        # hard-coded to 2016, silently ignoring the parameter)
        super(nnscore, self)._load_pdbbind_desc(desc_path,
                                                pdbbind_version=pdbbind_version)

        # number of networks to sample; original implementation did 1000, but
        # 100 give results good enough.
        # TODO: allow user to specify number of nets?
        n = 1000
        # make nets reproducible
        random_seed(1)
        seeds = np.random.randint(123456789, size=n)
        trained_nets = (Parallel(
            n_jobs=self.n_jobs, verbose=10,
            pre_dispatch='all')(delayed(method_caller)(neuralnetwork(
                (5, ),
                random_state=seeds[i],
                activation='logistic',
                solver='lbfgs',
                max_iter=10000), 'fit', self.train_descs, self.train_target)
                                for i in range(n)))
        # get 20 best (lambda arg renamed so it does not shadow `n` above)
        trained_nets.sort(
            key=lambda net: net.score(self.test_descs,
                                      self.test_target.flatten()))
        self.model = ensemble_model(trained_nets[-20:])

        error = rmse(self.model.predict(self.test_descs), self.test_target)
        r2 = self.model.score(self.test_descs, self.test_target)
        r = np.sqrt(r2)
        print('Test set:',
              'R**2: %.4f' % r2,
              'R: %.4f' % r,
              'RMSE: %.4f' % error,
              sep='\t',
              file=sys.stderr)

        error = rmse(self.model.predict(self.train_descs), self.train_target)
        r2 = self.model.score(self.train_descs, self.train_target)
        r = np.sqrt(r2)
        print('Train set:',
              'R**2: %.4f' % r2,
              'R: %.4f' % r,
              'RMSE: %.4f' % error,
              sep='\t',
              file=sys.stderr)

        if sf_pickle is None:
            return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
        else:
            return self.save(sf_pickle)
Esempio n. 5
0
    def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
        """Train the RFScore random forest on PDBBind descriptors, print
        evaluation metrics to stderr, optionally compile the trees, and
        pickle the scoring function."""
        if not home_dir:
            home_dir = dirname(__file__) + '/RFScore'

        descriptors_csv = path_join(home_dir,
                                    'rfscore_descs_v%i.csv' % self.version)

        super(rfscore,
              self)._load_pdbbind_desc(descriptors_csv,
                                       pdbbind_version=pdbbind_version)

        # prune descriptor columns that stay at or below the sparsity cutoff
        if self.spr > 0:
            self.mask = (self.train_descs > self.spr).any(axis=0)
            if self.mask.sum() > 0:
                self.train_descs = self.train_descs[:, self.mask]
                self.test_descs = self.test_descs[:, self.mask]

        # seed the RNG so repeated training runs give the same forest
        random_seed(1)
        self.model.fit(self.train_descs, self.train_target)

        print('Training RFScore v%i on PDBBind v%i'
              % (self.version, pdbbind_version), file=sys.stderr)

        for subset, predicted, observed in (
                ('Test', self.model.predict(self.test_descs),
                 self.test_target),
                ('Train', self.model.predict(self.train_descs),
                 self.train_target),
                ('OOB', self.model.oob_prediction_, self.train_target)):
            print('%s set:' % subset,
                  'R2_score: %.4f' % r2_score(observed, predicted),
                  'Rp: %.4f' % pearsonr(observed, predicted)[0],
                  'RMSE: %.4f' % rmse(observed, predicted),
                  'SD: %.4f' % standard_deviation_error(observed, predicted),
                  sep='\t', file=sys.stderr)

        # speed up prediction by compiling the trees, when available
        if compiledtrees is not None:
            try:
                print('Compiling Random Forest using sklearn-compiledtrees',
                      file=sys.stderr)
                self.model = compiledtrees.CompiledRegressionPredictor(
                    self.model, n_jobs=self.n_jobs)
            except Exception as e:
                print('Failed to compile Random Forest with exception: %s' % e,
                      file=sys.stderr)
                print('Continuing without compiled RF.', file=sys.stderr)

        if sf_pickle is None:
            return self.save('RFScore_v%i_pdbbind%i.pickle'
                             % (self.version, pdbbind_version))
        return self.save(sf_pickle)
Esempio n. 6
0
    def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
        """Sample many small neural networks in parallel, keep the 20 best
        as an ensemble, report metrics, and pickle the scoring function."""
        if not home_dir:
            home_dir = dirname(__file__) + '/NNScore'

        desc_path = path_join(home_dir, 'nnscore_descs.csv')

        super(nnscore, self)._load_pdbbind_desc(desc_path,
                                                pdbbind_version=pdbbind_version)

        # how many candidate networks to sample; the reference
        # implementation used 1000, though ~100 is usually sufficient.
        # TODO: allow user to specify number of nets?
        num_nets = 1000
        # fixed seed keeps the sampled networks reproducible
        random_seed(1)
        seeds = np.random.randint(123456789, size=num_nets)
        candidates = (
            Parallel(n_jobs=self.n_jobs, verbose=10, pre_dispatch='all')(
                delayed(method_caller)(
                    neuralnetwork((5,),
                                  random_state=seeds[i],
                                  activation='logistic',
                                  solver='lbfgs',
                                  max_iter=10000),
                    'fit',
                    self.train_descs,
                    self.train_target)
                for i in range(num_nets)))
        # rank by test-set score and keep the top 20 as the ensemble
        candidates.sort(key=lambda net: net.score(self.test_descs,
                                                  self.test_target.flatten()))
        self.model = ensemble_model(candidates[-20:])

        evaluations = [
            ('Test', self.model.predict(self.test_descs), self.test_target),
            ('Train', self.model.predict(self.train_descs), self.train_target)]

        for label, predicted, observed in evaluations:
            # the metrics below need at least 3 samples to be meaningful
            if len(observed) < 3:
                print('There are less than 3 values to predict, skipping.', file=sys.stderr)
                continue
            print('%s set:' % label,
                  'R2_score: %.4f' % r2_score(observed, predicted),
                  'Rp: %.4f' % pearsonr(observed, predicted)[0],
                  'RMSE: %.4f' % rmse(observed, predicted),
                  'SD: %.4f' % standard_deviation_error(observed, predicted),
                  sep='\t', file=sys.stderr)

        if sf_pickle is None:
            return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
        return self.save(sf_pickle)
Esempio n. 7
0
 def fit(self, descs, target_values, train_alg='tnc', **kwargs):
     """Build the ffnet network and train it on the given data.

     Parameters
     ----------
     descs : array-like
         Input descriptors (one row per sample).
     target_values : array-like
         Target values to fit.
     train_alg : str
         Suffix of the ffnet training method to call (e.g. 'tnc' calls
         ``train_tnc``).
     **kwargs
         Extra keyword arguments forwarded to the training method.

     Returns
     -------
     self
     """
     # set up the network topology: fully connected or layered
     if self.full_conn:
         conec = tmlgraph(self.shape, self.biases)
     else:
         conec = mlgraph(self.shape, self.biases)
     self.model = ffnet(conec)
     if self.random_weights:
         # idiom fix: `x is not None` instead of `not x is None`
         if self.random_state is not None:
             random_seed(self.random_state)
         self.model.randomweights()
     # dispatch to the requested ffnet training algorithm;
     # nproc='ncpu' lets ffnet use all cores when n_jobs < 1
     getattr(self.model, 'train_' + train_alg)(
         descs,
         target_values,
         nproc='ncpu' if self.n_jobs < 1 else self.n_jobs,
         **kwargs)
     return self
Esempio n. 8
0
    def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
        """Train the NNScore ensemble: sample many candidate networks in
        parallel, retain the 20 with the best test-set score, print
        evaluation metrics, and pickle the result."""
        if not home_dir:
            home_dir = dirname(__file__) + '/NNScore'

        descriptors_csv = path_join(home_dir, 'nnscore_descs.csv')

        super(nnscore,
              self)._load_pdbbind_desc(descriptors_csv,
                                       pdbbind_version=pdbbind_version)

        # candidate networks to sample; the reference implementation used
        # 1000, though ~100 already gives comparable results.
        # TODO: allow user to specify number of nets?
        sample_count = 1000
        # seed once so the per-net random states are reproducible
        random_seed(1)
        seeds = np.random.randint(123456789, size=sample_count)
        nets = (
            Parallel(n_jobs=self.n_jobs, verbose=10, pre_dispatch='all')(
                delayed(method_caller)(
                    neuralnetwork((5,),
                                  random_state=seeds[i],
                                  activation='logistic',
                                  solver='lbfgs',
                                  max_iter=10000),
                    'fit',
                    self.train_descs,
                    self.train_target)
                for i in range(sample_count)))
        # order by test-set score; the last 20 are the best performers
        nets.sort(key=lambda net: net.score(self.test_descs,
                                            self.test_target.flatten()))
        self.model = ensemble_model(nets[-20:])

        for subset, predicted, observed in (
                ('Test', self.model.predict(self.test_descs),
                 self.test_target),
                ('Train', self.model.predict(self.train_descs),
                 self.train_target)):
            print('%s set:' % subset,
                  'R2_score: %.4f' % r2_score(observed, predicted),
                  'Rp: %.4f' % pearsonr(observed, predicted)[0],
                  'RMSE: %.4f' % rmse(observed, predicted),
                  'SD: %.4f' % standard_deviation_error(observed, predicted),
                  sep='\t', file=sys.stderr)

        if sf_pickle is None:
            return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
        return self.save(sf_pickle)
Esempio n. 9
0
    def train(self, home_dir=None, sf_pickle=''):
        """Train the random forest on precomputed RFScore descriptors and
        save the resulting scoring function.

        Parameters
        ----------
        home_dir : str or None
            Directory with the descriptor CSV files; defaults to the
            bundled 'RFScore' directory.
        sf_pickle : str
            Output pickle path; a versioned default name is used when empty.

        Returns
        -------
        The result of ``self.save`` (the pickle file written).
        """
        if not home_dir:
            home_dir = dirname(__file__) + '/RFScore'
        # load precomputed descriptors and target values
        self.train_descs = np.loadtxt(home_dir + '/train_descs_v%i.csv' %
                                      (self.version),
                                      delimiter=',',
                                      dtype=float)
        self.train_target = np.loadtxt(home_dir + '/train_target.csv',
                                       delimiter=',',
                                       dtype=float)

        self.test_descs = np.loadtxt(home_dir + '/test_descs_v%i.csv' %
                                     (self.version),
                                     delimiter=',',
                                     dtype=float)
        self.test_target = np.loadtxt(home_dir + '/test_target.csv',
                                      delimiter=',',
                                      dtype=float)

        # remove sparse dimensions
        if self.spr > 0:
            self.mask = (self.train_descs > self.spr).any(axis=0)
            if self.mask.sum() > 0:
                self.train_descs = self.train_descs[:, self.mask]
                self.test_descs = self.test_descs[:, self.mask]

        # make the forest reproducible
        random_seed(1)
        self.model.fit(self.train_descs, self.train_target)

        # ported to Python 3: print() function replaces the print statement
        print("Training RFScore v%i" % self.version)

        r2 = self.model.score(self.test_descs, self.test_target)
        r = np.sqrt(r2)
        print('Test set: R**2:', r2, ' R:', r)

        # renamed from `rmse` to avoid shadowing a module-level rmse helper
        oob_rmse = np.sqrt(
            np.mean(np.square(self.model.oob_prediction_ - self.train_target)))
        r2 = self.model.score(self.train_descs, self.train_target)
        r = np.sqrt(r2)
        print('Train set: R**2:', r2, ' R:', r, 'RMSE:', oob_rmse)

        if sf_pickle:
            return self.save(sf_pickle)
        else:
            return self.save('RFScore_v%i.pickle' % self.version)
Esempio n. 10
0
 def fit(self, descs, target_values, train_alg='tnc', **kwargs):
     """Build the ffnet network and train it on the given data.

     Parameters
     ----------
     descs : array-like
         Input descriptors (one row per sample).
     target_values : array-like
         Target values to fit.
     train_alg : str
         Suffix of the ffnet training method to call (e.g. 'tnc' calls
         ``train_tnc``).
     **kwargs
         Extra keyword arguments forwarded to the training method.

     Returns
     -------
     self
     """
     # set up the network topology: fully connected or layered
     if self.full_conn:
         conec = tmlgraph(self.shape, self.biases)
     else:
         conec = mlgraph(self.shape, self.biases)
     self.model = ffnet(conec)
     if self.random_weights:
         # idiom fix: `x is not None` instead of `not x is None`
         if self.random_state is not None:
             random_seed(self.random_state)
         self.model.randomweights()
     # dispatch to the requested ffnet training algorithm;
     # nproc='ncpu' lets ffnet use all cores when n_jobs < 1
     getattr(self.model, 'train_' + train_alg)(
         descs,
         target_values,
         nproc='ncpu' if self.n_jobs < 1 else self.n_jobs,
         **kwargs)
     return self
Esempio n. 11
0
    def train(self, home_dir=None, sf_pickle=''):
        """Train the random forest on precomputed RFScore descriptors and
        save the resulting scoring function.

        Parameters
        ----------
        home_dir : str or None
            Directory with the descriptor CSV files; defaults to the
            bundled 'RFScore' directory.
        sf_pickle : str
            Output pickle path; a versioned default name is used when empty.

        Returns
        -------
        The result of ``self.save`` (the pickle file written).
        """
        if not home_dir:
            home_dir = dirname(__file__) + '/RFScore'
        # load precomputed descriptors and target values
        self.train_descs = np.loadtxt(home_dir + '/train_descs_v%i.csv' % (self.version), delimiter=',', dtype=float)
        self.train_target = np.loadtxt(home_dir + '/train_target.csv', delimiter=',', dtype=float)

        self.test_descs = np.loadtxt(home_dir + '/test_descs_v%i.csv' % (self.version), delimiter=',', dtype=float)
        self.test_target = np.loadtxt(home_dir + '/test_target.csv', delimiter=',', dtype=float)

        # remove sparse dimensions
        if self.spr > 0:
            self.mask = (self.train_descs > self.spr).any(axis=0)
            if self.mask.sum() > 0:
                self.train_descs = self.train_descs[:, self.mask]
                self.test_descs = self.test_descs[:, self.mask]

        # make the forest reproducible
        random_seed(1)
        self.model.fit(self.train_descs, self.train_target)

        # ported to Python 3: print() function replaces the print statement
        print("Training RFScore v%i" % self.version)

        r2 = self.model.score(self.test_descs, self.test_target)
        r = np.sqrt(r2)
        print('Test set: R**2:', r2, ' R:', r)

        # renamed from `rmse` to avoid shadowing a module-level rmse helper
        oob_rmse = np.sqrt(np.mean(np.square(self.model.oob_prediction_ - self.train_target)))
        r2 = self.model.score(self.train_descs, self.train_target)
        r = np.sqrt(r2)
        print('Train set: R**2:', r2, ' R:', r, 'RMSE:', oob_rmse)

        if sf_pickle:
            return self.save(sf_pickle)
        else:
            return self.save('RFScore_v%i.pickle' % self.version)
Esempio n. 12
0
    def train(self, home_dir=None, sf_pickle='', pdbbind_version=2016):
        """Train the NNScore neural-network ensemble on descriptors loaded
        from a combined per-pdbid CSV and save the scoring function.

        Parameters
        ----------
        home_dir : str or None
            Directory with 'nnscore_descs.csv'; defaults to the bundled
            'NNScore' directory.
        sf_pickle : str
            Output pickle path; a versioned default name is used when empty.
        pdbbind_version : int
            PDBBind release used to select train (refined minus core) and
            test (core) rows.

        Returns
        -------
        The result of ``self.save`` (the pickle file written).
        """
        if not home_dir:
            home_dir = dirname(__file__) + '/NNScore'

        # load precomputed descriptors and target values
        df = pd.read_csv(home_dir + '/nnscore_descs.csv', index_col='pdbid')

        train_set = 'refined'
        test_set = 'core'
        cols = list(map(str, range(len(self.descriptor_generator))))
        # compute the row masks once instead of re-evaluating the same
        # boolean expression for every selection
        test_mask = df['%i_%s' % (pdbbind_version, test_set)]
        train_mask = df['%i_%s' % (pdbbind_version, train_set)] & ~test_mask
        self.train_descs = df[train_mask][cols].values
        self.train_target = df[train_mask]['act'].values
        self.test_descs = df[test_mask][cols].values
        self.test_target = df[test_mask]['act'].values

        # number of networks to sample; the original implementation did 1000,
        # but 100 gives results that are good enough.
        n = 1000
        # make nets reproducible
        random_seed(1)
        seeds = np.random.randint(123456789, size=n)
        trained_nets = (Parallel(n_jobs=self.n_jobs, verbose=10)(
            delayed(_parallel_helper)(neuralnetwork(
                (5, ),
                random_state=seeds[i],
                activation='logistic',
                solver='lbfgs',
                max_iter=10000,
            ), 'fit', self.train_descs, self.train_target) for i in range(n)))
        # keep the 20 nets that score best on the test set
        best_idx = np.array([
            net.score(self.test_descs, self.test_target.flatten())
            for net in trained_nets
        ]).argsort()[::-1][:20]
        self.model = ensemble_model([trained_nets[i] for i in best_idx])

        error = rmse(self.model.predict(self.test_descs), self.test_target)
        r2 = self.model.score(self.test_descs, self.test_target)
        r = np.sqrt(r2)
        print('Test set:',
              'R**2: %.4f' % r2,
              'R: %.4f' % r,
              'RMSE: %.4f' % error,
              sep='\t',
              file=sys.stderr)

        error = rmse(self.model.predict(self.train_descs), self.train_target)
        r2 = self.model.score(self.train_descs, self.train_target)
        r = np.sqrt(r2)
        print('Train set:',
              'R**2: %.4f' % r2,
              'R: %.4f' % r,
              'RMSE: %.4f' % error,
              sep='\t',
              file=sys.stderr)

        if sf_pickle:
            return self.save(sf_pickle)
        else:
            return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
Esempio n. 13
0
    def train(self, home_dir=None, sf_pickle='', pdbbind_version=2007):
        """Fit the random forest on PDBBind descriptor CSVs, report its
        accuracy on test and train sets, optionally compile the trees, and
        pickle the trained scoring function."""
        if not home_dir:
            home_dir = dirname(__file__) + '/RFScore'

        # precomputed descriptors / targets for the requested PDBBind release
        def _load_csv(basename):
            # all descriptor files share delimiter and dtype
            return np.loadtxt(home_dir + '/' + basename,
                              delimiter=',', dtype=float)

        self.train_descs = _load_csv('train_descs_v%i_pdbbind%i.csv' %
                                     (self.version, pdbbind_version))
        self.train_target = _load_csv('train_target_pdbbind%i.csv' %
                                      (pdbbind_version))
        self.test_descs = _load_csv('test_descs_v%i_pdbbind%i.csv' %
                                    (self.version, pdbbind_version))
        self.test_target = _load_csv('test_target_pdbbind%i.csv' %
                                     (pdbbind_version))

        # drop descriptor columns that never exceed the sparsity threshold
        if self.spr > 0:
            self.mask = (self.train_descs > self.spr).any(axis=0)
            if self.mask.sum() > 0:
                self.train_descs = self.train_descs[:, self.mask]
                self.test_descs = self.test_descs[:, self.mask]

        # fixed seed keeps the fitted forest reproducible
        random_seed(1)
        self.model.fit(self.train_descs, self.train_target)

        print("Training RFScore v%i on PDBBind v%i" %
              (self.version, pdbbind_version),
              file=sys.stderr)

        for subset, descs, target in (
                ('Test', self.test_descs, self.test_target),
                ('Train', self.train_descs, self.train_target)):
            error = rmse(self.model.predict(descs), target)
            r2 = self.model.score(descs, target)
            r = np.sqrt(r2)
            print('%s set: R**2:' % subset, r2, ' R:', r, 'RMSE:', error,
                  file=sys.stderr)

        # compile the forest for faster prediction when available
        if compiledtrees is not None:
            try:
                print("Compiling Random Forest using sklearn-compiledtrees",
                      file=sys.stderr)
                self.model = compiledtrees.CompiledRegressionPredictor(
                    self.model, n_jobs=self.n_jobs)
            except Exception as e:
                print("Failed to compile Random Forest with exception: %s" % e,
                      file=sys.stderr)
                print("Continuing without compiled RF.", file=sys.stderr)

        if sf_pickle:
            return self.save(sf_pickle)
        return self.save('RFScore_v%i_pdbbind%i.pickle' %
                         (self.version, pdbbind_version))
Esempio n. 14
0
    def train(self, home_dir=None, sf_pickle='', pdbbind_version=2016):
        """Train the RFScore random forest on descriptors loaded from a
        combined per-pdbid CSV and save the scoring function.

        Parameters
        ----------
        home_dir : str or None
            Directory with the descriptor CSV; defaults to the bundled
            'RFScore' directory.
        sf_pickle : str
            Output pickle path; a versioned default name is used when empty.
        pdbbind_version : int
            PDBBind release used to select train (refined minus core) and
            test (core) rows.

        Returns
        -------
        The result of ``self.save`` (the pickle file written).
        """
        if not home_dir:
            home_dir = dirname(__file__) + '/RFScore'

        # load precomputed descriptors and target values
        df = pd.read_csv(home_dir + '/rfscore_descs_v%i.csv' % self.version,
                         index_col='pdbid')

        train_set = 'refined'
        test_set = 'core'
        cols = list(map(str, range(len(self.descriptor_generator))))
        # compute the row masks once instead of re-evaluating the same
        # boolean expression for every selection
        test_mask = df['%i_%s' % (pdbbind_version, test_set)]
        train_mask = df['%i_%s' % (pdbbind_version, train_set)] & ~test_mask
        self.train_descs = df[train_mask][cols].values
        self.train_target = df[train_mask]['act'].values
        self.test_descs = df[test_mask][cols].values
        self.test_target = df[test_mask]['act'].values

        # remove sparse dimensions
        if self.spr > 0:
            self.mask = (self.train_descs > self.spr).any(axis=0)
            if self.mask.sum() > 0:
                self.train_descs = self.train_descs[:, self.mask]
                self.test_descs = self.test_descs[:, self.mask]

        # make the forest reproducible
        random_seed(1)
        self.model.fit(self.train_descs, self.train_target)

        print("Training RFScore v%i on PDBBind v%i" %
              (self.version, pdbbind_version),
              file=sys.stderr)

        error = rmse(self.model.predict(self.test_descs), self.test_target)
        r2 = self.model.score(self.test_descs, self.test_target)
        r = np.sqrt(r2)
        print('Test set:',
              'R**2: %.4f' % r2,
              'R: %.4f' % r,
              'RMSE: %.4f' % error,
              sep='\t',
              file=sys.stderr)

        error = rmse(self.model.predict(self.train_descs), self.train_target)
        oob_error = rmse(self.model.oob_prediction_, self.train_target)
        r2 = self.model.score(self.train_descs, self.train_target)
        r = np.sqrt(r2)
        print('Train set:',
              'R**2: %.4f' % r2,
              'R: %.4f' % r,
              'RMSE: %.4f' % error,
              'OOB RMSE: %.4f' % oob_error,
              sep='\t',
              file=sys.stderr)

        # compile trees for faster prediction when the package is present
        if compiledtrees is not None:
            try:
                print("Compiling Random Forest using sklearn-compiledtrees",
                      file=sys.stderr)
                self.model = compiledtrees.CompiledRegressionPredictor(
                    self.model, n_jobs=self.n_jobs)
            except Exception as e:
                print("Failed to compile Random Forest with exception: %s" % e,
                      file=sys.stderr)
                print("Continuing without compiled RF.", file=sys.stderr)

        if sf_pickle:
            return self.save(sf_pickle)
        else:
            return self.save('RFScore_v%i_pdbbind%i.pickle' %
                             (self.version, pdbbind_version))