def test_ensemble_model():
    """Check that ``ensemble_model`` averages the predictions of its members.

    Two cases are exercised: an ensemble of a random forest and a neural
    network, and an ensemble wrapping a single neural network, which must
    predict and score exactly like that network.
    """
    first_feature = np.arange(30, 10, -2, dtype='float64')
    second_feature = np.arange(100, 90, -1, dtype='float64')
    X = np.vstack((first_feature, second_feature)).T
    Y = np.arange(10, dtype='float64')

    rf = regressors.randomforest(random_state=42)
    nn = regressors.neuralnetwork(solver='lbfgs', random_state=42)
    ensemble = ensemble_model((rf, nn))
    # The underlying models are not fitted separately: they change when the
    # ensemble is fitted.
    ensemble.fit(X, Y)

    pred = ensemble.predict(X)
    member_preds = np.vstack((rf.predict(X), nn.predict(X)))
    assert_array_almost_equal(pred, member_preds.mean(axis=0))
    assert_almost_equal(ensemble.score(X, Y), r2_score(Y, pred))

    # An ensemble of a single model should behave exactly like that model.
    nn = neuralnetwork(solver='lbfgs', random_state=42)
    ensemble = ensemble_model((nn, ))
    ensemble.fit(X, Y)
    assert_array_almost_equal(ensemble.predict(X), nn.predict(X))
    assert_almost_equal(ensemble.score(X, Y), nn.score(X, Y))
def train(self, sf_pickle=''):
    """Train an ensemble of neural networks and save the scoring function.

    Precomputed descriptors and targets are read from the ``NNScore``
    directory next to this module, ``n`` networks are fitted in parallel,
    and the 20 networks scoring best on the test set form the final
    ensemble stored in ``self.model``.

    Parameters
    ----------
    sf_pickle : string (default='')
        File name handed to ``self.save``. When empty, 'NNScore.pickle'
        is used instead.

    Returns
    -------
    The value returned by ``self.save``.
    """
    # load precomputed descriptors and target values
    data_dir = dirname(__file__) + '/NNScore/'
    self.train_descs = np.loadtxt(data_dir + 'train_descs.csv',
                                  delimiter=',', dtype=float)
    self.train_target = np.loadtxt(data_dir + 'train_target.csv',
                                   delimiter=',', dtype=float)
    self.test_descs = np.loadtxt(data_dir + 'test_descs.csv',
                                 delimiter=',', dtype=float)
    self.test_target = np.loadtxt(data_dir + 'test_target.csv',
                                  delimiter=',', dtype=float)

    # input layer size: descriptor columns that are neither all-zero nor
    # constant across the training set
    all_zero = (self.train_descs == 0).all(axis=0)
    constant = self.train_descs.min(axis=0) == self.train_descs.max(axis=0)
    n_dim = (~(all_zero | constant)).sum()

    # number of networks to sample; the original implementation did 1000
    # (an earlier note suggested 100 may already be good enough).
    n = 1000

    # `range` instead of the Python 2 only `xrange`; the index is unused.
    trained_nets = Parallel(n_jobs=self.n_jobs)(
        delayed(_parallel_helper)(neuralnetwork([n_dim, 5, 1]),
                                  'fit',
                                  self.train_descs,
                                  self.train_target,
                                  train_alg='tnc',
                                  maxfun=1000)
        for _ in range(n))

    # get 20 best networks according to their score on the test set
    test_scores = np.array([
        net.score(self.test_descs, self.test_target.flatten())
        for net in trained_nets
    ])
    best_idx = test_scores.argsort()[::-1][:20]
    self.model = ensemble_model([trained_nets[i] for i in best_idx])

    # A single pre-formatted string prints the same text under both the
    # Python 2 print statement and the print function.
    # NOTE: np.sqrt yields NaN for r when the score is negative.
    r2 = self.model.score(self.test_descs, self.test_target)
    r = np.sqrt(r2)
    print('Test set: R**2: %s  R: %s' % (r2, r))

    r2 = self.model.score(self.train_descs, self.train_target)
    r = np.sqrt(r2)
    print('Train set: R**2: %s  R: %s' % (r2, r))

    return self.save(sf_pickle or 'NNScore.pickle')
def train(self, sf_pickle=''):
    """Train a reproducible ensemble of neural networks and save it.

    Precomputed descriptors and targets are read from the ``NNScore``
    directory next to this module. ``n`` networks, each with its own seed
    drawn after seeding the generator with 1, are fitted in parallel, and
    the 20 networks scoring best on the test set form the final ensemble
    stored in ``self.model``.

    Parameters
    ----------
    sf_pickle : string (default='')
        File name handed to ``self.save``. When empty, 'NNScore.pickle'
        is used instead.

    Returns
    -------
    The value returned by ``self.save``.
    """
    # load precomputed descriptors and target values
    data_dir = dirname(__file__) + '/NNScore/'
    self.train_descs = np.loadtxt(data_dir + 'train_descs.csv',
                                  delimiter=',', dtype=float)
    self.train_target = np.loadtxt(data_dir + 'train_target.csv',
                                   delimiter=',', dtype=float)
    self.test_descs = np.loadtxt(data_dir + 'test_descs.csv',
                                 delimiter=',', dtype=float)
    self.test_target = np.loadtxt(data_dir + 'test_target.csv',
                                  delimiter=',', dtype=float)

    # input layer size: descriptor columns that are neither all-zero nor
    # constant across the training set
    all_zero = (self.train_descs == 0).all(axis=0)
    constant = self.train_descs.min(axis=0) == self.train_descs.max(axis=0)
    n_dim = (~(all_zero | constant)).sum()

    # number of networks to sample; the original implementation did 1000
    # (an earlier note suggested 100 may already be good enough).
    n = 1000

    # make nets reproducible
    random_seed(1)
    seeds = np.random.randint(123456789, size=n)

    # Iterating over the seeds replaces the Python 2 only `xrange` index loop.
    trained_nets = Parallel(n_jobs=self.n_jobs, verbose=10)(
        delayed(_parallel_helper)(
            neuralnetwork([n_dim, 5, 1], random_state=seed),
            'fit',
            self.train_descs,
            self.train_target,
            neural_network__train_alg='tnc',
            neural_network__maxfun=10000)
        for seed in seeds)

    # get 20 best networks according to their score on the test set
    test_scores = np.array([
        net.score(self.test_descs, self.test_target.flatten())
        for net in trained_nets
    ])
    best_idx = test_scores.argsort()[::-1][:20]
    self.model = ensemble_model([trained_nets[i] for i in best_idx])

    # A single pre-formatted string prints the same text under both the
    # Python 2 print statement and the print function.
    # NOTE: np.sqrt yields NaN for r when the score is negative.
    r2 = self.model.score(self.test_descs, self.test_target)
    r = np.sqrt(r2)
    print('Test set: R**2: %s  R: %s' % (r2, r))

    r2 = self.model.score(self.train_descs, self.train_target)
    r = np.sqrt(r2)
    print('Train set: R**2: %s  R: %s' % (r2, r))

    return self.save(sf_pickle or 'NNScore.pickle')
def train(self, home_dir=None, sf_pickle='', pdbbind_version=2007):
    """Train the neural network ensemble on precomputed PDBbind descriptors.

    ``n`` seeded networks with a single hidden layer of 5 units are fitted
    in parallel; the 20 networks scoring best on the test set form the
    ensemble stored in ``self.model``. Summary statistics are written to
    stderr.

    Parameters
    ----------
    home_dir : string or None (default=None)
        Directory holding the ``*_pdbbind<version>.csv`` files. When falsy,
        the ``NNScore`` directory next to this module is used.
    sf_pickle : string (default='')
        File name handed to ``self.save``. When empty,
        'NNScore_pdbbind<version>.pickle' is used instead.
    pdbbind_version : int (default=2007)
        Version number interpolated into the CSV and pickle file names.

    Returns
    -------
    The value returned by ``self.save``.
    """
    if not home_dir:
        home_dir = dirname(__file__) + '/NNScore'

    def load_csv(stem):
        # Read '<home_dir>/<stem>_pdbbind<version>.csv' as a float array.
        return np.loadtxt(
            home_dir + '/%s_pdbbind%i.csv' % (stem, pdbbind_version),
            delimiter=',',
            dtype=float)

    # load precomputed descriptors and target values
    self.train_descs = load_csv('train_descs')
    self.train_target = load_csv('train_target')
    self.test_descs = load_csv('test_descs')
    self.test_target = load_csv('test_target')

    # The former `n_dim` computation was dropped: its result was never used,
    # because the network below is built from the hidden layer size only.

    # number of networks to sample; the original implementation did 1000
    # (an earlier note suggested 100 may already be good enough).
    n = 1000

    # make nets reproducible
    random_seed(1)
    seeds = np.random.randint(123456789, size=n)

    trained_nets = Parallel(n_jobs=self.n_jobs, verbose=10)(
        delayed(_parallel_helper)(
            neuralnetwork((5, ),
                          random_state=seed,
                          activation='logistic',
                          solver='lbfgs',
                          max_iter=10000),
            'fit',
            self.train_descs,
            self.train_target)
        for seed in seeds)

    # get 20 best networks according to their score on the test set
    test_scores = np.array([
        net.score(self.test_descs, self.test_target.flatten())
        for net in trained_nets
    ])
    best_idx = test_scores.argsort()[::-1][:20]
    self.model = ensemble_model([trained_nets[i] for i in best_idx])

    # NOTE: np.sqrt yields NaN for r when the score is negative.
    r2 = self.model.score(self.test_descs, self.test_target)
    r = np.sqrt(r2)
    print('Test set: R**2:', r2, ' R:', r, file=sys.stderr)

    r2 = self.model.score(self.train_descs, self.train_target)
    r = np.sqrt(r2)
    print('Train set: R**2:', r2, ' R:', r, file=sys.stderr)

    if sf_pickle:
        return self.save(sf_pickle)
    return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
    """Train the neural network ensemble and save the scoring function.

    Descriptors are loaded through the parent class'
    ``_load_pdbbind_desc`` (expected to provide ``self.train_descs``,
    ``self.train_target``, ``self.test_descs`` and ``self.test_target``,
    which are read below). ``n`` seeded networks are fitted in parallel and
    the 20 networks scoring best on the test set form the ensemble stored in
    ``self.model``. Summary statistics are written to stderr.

    Parameters
    ----------
    home_dir : string or None (default=None)
        Directory containing 'nnscore_descs.csv'. When falsy, the
        ``NNScore`` directory next to this module is used.
    sf_pickle : string or None (default=None)
        File name handed to ``self.save``. When None,
        'NNScore_pdbbind<version>.pickle' is used instead.
    pdbbind_version : int (default=2016)
        PDBbind version used both for loading the descriptors and for the
        default pickle name.

    Returns
    -------
    The value returned by ``self.save``.
    """
    if not home_dir:
        home_dir = dirname(__file__) + '/NNScore'

    desc_path = path_join(home_dir, 'nnscore_descs.csv')

    # Forward the requested version; it used to be hard-coded to 2016, so
    # any other `pdbbind_version` trained on 2016 data while the pickle was
    # still named after the requested version.
    super(nnscore, self)._load_pdbbind_desc(desc_path,
                                            pdbbind_version=pdbbind_version)

    # number of networks to sample; the original implementation did 1000
    # (an earlier note suggested 100 may already be good enough).
    # TODO: allow user to specify number of nets?
    n = 1000

    # make nets reproducible
    random_seed(1)
    seeds = np.random.randint(123456789, size=n)

    trained_nets = (
        Parallel(n_jobs=self.n_jobs, verbose=10, pre_dispatch='all')(
            delayed(method_caller)(
                neuralnetwork((5, ),
                              random_state=seeds[i],
                              activation='logistic',
                              solver='lbfgs',
                              max_iter=10000),
                'fit',
                self.train_descs,
                self.train_target)
            for i in range(n)))

    # get 20 best: sort ascending by test-set score and keep the tail.
    # The key argument is named `net` so it no longer shadows `n`.
    trained_nets.sort(
        key=lambda net: net.score(self.test_descs,
                                  self.test_target.flatten()))
    self.model = ensemble_model(trained_nets[-20:])

    # NOTE: np.sqrt yields NaN for r when the score is negative.
    error = rmse(self.model.predict(self.test_descs), self.test_target)
    r2 = self.model.score(self.test_descs, self.test_target)
    r = np.sqrt(r2)
    print('Test set:',
          'R**2: %.4f' % r2,
          'R: %.4f' % r,
          'RMSE: %.4f' % error,
          sep='\t',
          file=sys.stderr)

    error = rmse(self.model.predict(self.train_descs), self.train_target)
    r2 = self.model.score(self.train_descs, self.train_target)
    r = np.sqrt(r2)
    print('Train set:',
          'R**2: %.4f' % r2,
          'R: %.4f' % r,
          'RMSE: %.4f' % error,
          sep='\t',
          file=sys.stderr)

    if sf_pickle is None:
        return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
    return self.save(sf_pickle)
def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
    """Fit the network ensemble, report its quality and save it.

    Descriptors are loaded through the parent class'
    ``_load_pdbbind_desc`` for the requested PDBbind version. 1000 seeded
    networks are fitted in parallel, and the 20 with the highest test-set
    score become ``self.model``. For the test and train sets, R2, Pearson
    correlation, RMSE and SD are written to stderr; a set with fewer than
    3 targets is skipped with a message.

    Parameters
    ----------
    home_dir : string or None (default=None)
        Directory containing 'nnscore_descs.csv'. When falsy, the
        ``NNScore`` directory next to this module is used.
    sf_pickle : string or None (default=None)
        File name handed to ``self.save``. When None,
        'NNScore_pdbbind<version>.pickle' is used instead.
    pdbbind_version : int (default=2016)
        PDBbind version passed to the loader and used in the default
        pickle name.

    Returns
    -------
    The value returned by ``self.save``.
    """
    if not home_dir:
        home_dir = dirname(__file__) + '/NNScore'

    super(nnscore, self)._load_pdbbind_desc(
        path_join(home_dir, 'nnscore_descs.csv'),
        pdbbind_version=pdbbind_version)

    # Number of networks to sample; the original implementation did 1000
    # (an earlier note suggested 100 may already be good enough).
    # TODO: allow user to specify number of nets?
    n_nets = 1000

    # Fixed seeding keeps the sampled networks reproducible.
    random_seed(1)
    seeds = np.random.randint(123456789, size=n_nets)

    fit_jobs = (
        delayed(method_caller)(
            neuralnetwork((5,),
                          random_state=seed,
                          activation='logistic',
                          solver='lbfgs',
                          max_iter=10000),
            'fit',
            self.train_descs,
            self.train_target)
        for seed in seeds)
    run_parallel = Parallel(n_jobs=self.n_jobs,
                            verbose=10,
                            pre_dispatch='all')
    trained_nets = run_parallel(fit_jobs)

    def test_score(net):
        # Score of a single network on the test set.
        return net.score(self.test_descs, self.test_target.flatten())

    # Ascending sort, so the 20 best networks are at the end of the list.
    trained_nets.sort(key=test_score)
    self.model = ensemble_model(trained_nets[-20:])

    # Both predictions are computed before anything is reported.
    test_pred = self.model.predict(self.test_descs)
    train_pred = self.model.predict(self.train_descs)
    reports = (('Test', test_pred, self.test_target),
               ('Train', train_pred, self.train_target))

    for name, pred, target in reports:
        if len(target) >= 3:
            r2_text = 'R2_score: %.4f' % r2_score(target, pred)
            rp_text = 'Rp: %.4f' % pearsonr(target, pred)[0]
            rmse_text = 'RMSE: %.4f' % rmse(target, pred)
            sd_text = 'SD: %.4f' % standard_deviation_error(target, pred)
            print('%s set:' % name,
                  r2_text,
                  rp_text,
                  rmse_text,
                  sd_text,
                  sep='\t',
                  file=sys.stderr)
        else:
            print('There are less than 3 values to predict, skipping.',
                  file=sys.stderr)

    if sf_pickle is not None:
        pickle_name = sf_pickle
    else:
        pickle_name = 'NNScore_pdbbind%i.pickle' % (pdbbind_version)
    return self.save(pickle_name)
def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
    """Train the neural network ensemble, report statistics and save it.

    Descriptors are loaded through the parent class'
    ``_load_pdbbind_desc`` for the requested PDBbind version. ``n`` seeded
    networks are fitted in parallel and the 20 with the highest test-set
    score become ``self.model``. For the test and train sets, R2, Pearson
    correlation, RMSE and SD are written to stderr.

    Parameters
    ----------
    home_dir : string or None (default=None)
        Directory containing 'nnscore_descs.csv'. When falsy, the
        ``NNScore`` directory next to this module is used.
    sf_pickle : string or None (default=None)
        File name handed to ``self.save``. When None,
        'NNScore_pdbbind<version>.pickle' is used instead.
    pdbbind_version : int (default=2016)
        PDBbind version passed to the loader and used in the default
        pickle name.

    Returns
    -------
    The value returned by ``self.save``.
    """
    if not home_dir:
        home_dir = dirname(__file__) + '/NNScore'

    desc_path = path_join(home_dir, 'nnscore_descs.csv')

    super(nnscore, self)._load_pdbbind_desc(desc_path,
                                            pdbbind_version=pdbbind_version)

    # number of networks to sample; the original implementation did 1000
    # (an earlier note suggested 100 may already be good enough).
    # TODO: allow user to specify number of nets?
    n = 1000

    # make nets reproducible
    random_seed(1)
    seeds = np.random.randint(123456789, size=n)

    trained_nets = (
        Parallel(n_jobs=self.n_jobs, verbose=10, pre_dispatch='all')(
            delayed(method_caller)(
                neuralnetwork((5,),
                              random_state=seeds[i],
                              activation='logistic',
                              solver='lbfgs',
                              max_iter=10000),
                'fit',
                self.train_descs,
                self.train_target)
            for i in range(n)))

    # get 20 best: sort ascending by test-set score and keep the tail.
    # The key argument is named `net` so it does not shadow `n`.
    trained_nets.sort(
        key=lambda net: net.score(self.test_descs,
                                  self.test_target.flatten()))
    self.model = ensemble_model(trained_nets[-20:])

    sets = [
        ('Test', self.model.predict(self.test_descs), self.test_target),
        ('Train', self.model.predict(self.train_descs), self.train_target)]

    for name, pred, target in sets:
        # Correlation and SD are not meaningful for fewer than 3 values,
        # so such a set is skipped instead of failing or reporting noise.
        if len(target) < 3:
            print('There are less than 3 values to predict, skipping.',
                  file=sys.stderr)
            continue
        print('%s set:' % name,
              'R2_score: %.4f' % r2_score(target, pred),
              'Rp: %.4f' % pearsonr(target, pred)[0],
              'RMSE: %.4f' % rmse(target, pred),
              'SD: %.4f' % standard_deviation_error(target, pred),
              sep='\t',
              file=sys.stderr)

    if sf_pickle is None:
        return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
    return self.save(sf_pickle)
def train(self, home_dir=None, sf_pickle='', pdbbind_version=2016):
    """Fit the network ensemble on the descriptor table and save it.

    Rows of 'nnscore_descs.csv' flagged '<version>_refined' but not
    '<version>_core' form the training set; rows flagged '<version>_core'
    form the test set. 1000 seeded networks are fitted in parallel and the
    20 scoring best on the test set become ``self.model``. R**2, R and RMSE
    for both sets are written to stderr.

    Parameters
    ----------
    home_dir : string or None (default=None)
        Directory containing 'nnscore_descs.csv'. When falsy, the
        ``NNScore`` directory next to this module is used.
    sf_pickle : string (default='')
        File name handed to ``self.save``. When empty,
        'NNScore_pdbbind<version>.pickle' is used instead.
    pdbbind_version : int (default=2016)
        Version used to select the set columns and in the default pickle
        name.

    Returns
    -------
    The value returned by ``self.save``.
    """
    if not home_dir:
        home_dir = dirname(__file__) + '/NNScore'

    # Precomputed descriptors and target values, indexed by PDB id.
    df = pd.read_csv(home_dir + '/nnscore_descs.csv', index_col='pdbid')

    train_set = 'refined'
    test_set = 'core'
    # Descriptor columns are named '0', '1', ... one per descriptor.
    cols = [str(i) for i in range(len(self.descriptor_generator))]

    in_train = df['%i_%s' % (pdbbind_version, train_set)]
    in_test = df['%i_%s' % (pdbbind_version, test_set)]
    # Test-set rows are excluded from the training data.
    train_rows = df[in_train & ~in_test]
    test_rows = df[in_test]

    self.train_descs = train_rows[cols].values
    self.train_target = train_rows['act'].values
    self.test_descs = test_rows[cols].values
    self.test_target = test_rows['act'].values

    # Number of networks to sample; the original implementation did 1000
    # (an earlier note suggested 100 may already be good enough).
    n_nets = 1000

    # Fixed seeding keeps the sampled networks reproducible.
    random_seed(1)
    seeds = np.random.randint(123456789, size=n_nets)

    fit_jobs = (
        delayed(_parallel_helper)(
            neuralnetwork((5, ),
                          random_state=seed,
                          activation='logistic',
                          solver='lbfgs',
                          max_iter=10000),
            'fit',
            self.train_descs,
            self.train_target)
        for seed in seeds)
    trained_nets = Parallel(n_jobs=self.n_jobs, verbose=10)(fit_jobs)

    # Keep the 20 networks with the highest score on the test set.
    flat_test_target = self.test_target.flatten()
    test_scores = np.array(
        [net.score(self.test_descs, flat_test_target)
         for net in trained_nets])
    best_idx = test_scores.argsort()[::-1][:20]
    self.model = ensemble_model([trained_nets[i] for i in best_idx])

    reports = (('Test set:', self.test_descs, self.test_target),
               ('Train set:', self.train_descs, self.train_target))
    for label, descs, target in reports:
        error = rmse(self.model.predict(descs), target)
        r2 = self.model.score(descs, target)
        r = np.sqrt(r2)
        print(label,
              'R**2: %.4f' % r2,
              'R: %.4f' % r,
              'RMSE: %.4f' % error,
              sep='\t',
              file=sys.stderr)

    if not sf_pickle:
        return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
    return self.save(sf_pickle)