def train(self, sf_pickle=''):
    """Train the NNScore ensemble on precomputed descriptors.

    Loads descriptor/target CSVs shipped next to this module, trains `n`
    ffnet neural networks in parallel, keeps the 20 best (ranked by
    test-set score) as an ensemble and pickles the scoring function.

    Parameters
    ----------
    sf_pickle : str (default='')
        Output pickle filename; falls back to 'NNScore.pickle' when empty.

    Returns
    -------
    Whatever ``self.save`` returns (presumably the pickle path — confirm
    against the base class).
    """
    # load precomputed descriptors and target values
    self.train_descs = np.loadtxt(dirname(__file__) + '/NNScore/train_descs.csv',
                                  delimiter=',', dtype=float)
    self.train_target = np.loadtxt(dirname(__file__) + '/NNScore/train_target.csv',
                                   delimiter=',', dtype=float)
    self.test_descs = np.loadtxt(dirname(__file__) + '/NNScore/test_descs.csv',
                                 delimiter=',', dtype=float)
    self.test_target = np.loadtxt(dirname(__file__) + '/NNScore/test_target.csv',
                                  delimiter=',', dtype=float)

    # input dimensionality: drop columns that are all-zero or constant
    n_dim = (~((self.train_descs == 0).all(axis=0) |
               (self.train_descs.min(axis=0) ==
                self.train_descs.max(axis=0)))).sum()

    # number of networks to sample; original implementation did 1000,
    # but 100 give results good enough.
    n = 1000
    # make nets reproducible
    random_seed(1)
    seeds = np.random.randint(123456789, size=n)
    # FIX: range() replaces the Python-2-only xrange()
    trained_nets = Parallel(n_jobs=self.n_jobs, verbose=10)(
        delayed(_parallel_helper)(
            neuralnetwork([n_dim, 5, 1], random_state=seeds[i]),
            'fit', self.train_descs, self.train_target,
            neural_network__train_alg='tnc', neural_network__maxfun=10000)
        for i in range(n))

    # get 20 best (argsort ascending, reversed, take first 20)
    best_idx = np.array([net.score(self.test_descs,
                                   self.test_target.flatten())
                         for net in trained_nets]).argsort()[::-1][:20]
    self.model = ensemble_model([trained_nets[i] for i in best_idx])

    # FIX: print() function instead of Python-2 print statements; output
    # is byte-identical to the old statements.
    r2 = self.model.score(self.test_descs, self.test_target)
    r = np.sqrt(r2)
    print('Test set: R**2:', r2, ' R:', r)
    r2 = self.model.score(self.train_descs, self.train_target)
    r = np.sqrt(r2)
    print('Train set: R**2:', r2, ' R:', r)

    if sf_pickle:
        return self.save(sf_pickle)
    else:
        return self.save('NNScore.pickle')
def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
    """Train the RFScore random forest on precomputed PDBBind descriptors.

    Parameters
    ----------
    home_dir : str or None (default=None)
        Directory holding the descriptor CSV; defaults to the bundled
        'RFScore' directory next to this module.
    sf_pickle : str or None (default=None)
        Output pickle filename; a versioned default name is used if None.
    pdbbind_version : int (default=2016)
        PDBBind release used to select train/test splits in the CSV.

    Returns
    -------
    Result of ``self.save`` (presumably the pickle path — confirm against
    the base class).
    """
    if not home_dir:
        home_dir = dirname(__file__) + '/RFScore'
    desc_path = path_join(home_dir, 'rfscore_descs_v%i.csv' % self.version)
    # populates self.train_descs/train_target/test_descs/test_target
    super(rfscore, self)._load_pdbbind_desc(desc_path,
                                            pdbbind_version=pdbbind_version)

    # remove sparse dimensions: keep only columns where at least one
    # training value exceeds the sparsity threshold self.spr
    if self.spr > 0:
        self.mask = (self.train_descs > self.spr).any(axis=0)
        if self.mask.sum() > 0:
            self.train_descs = self.train_descs[:, self.mask]
            self.test_descs = self.test_descs[:, self.mask]

    # make nets reproducible (seed before fitting the forest)
    random_seed(1)
    self.model.fit(self.train_descs, self.train_target)

    print('Training RFScore v%i on PDBBind v%i' %
          (self.version, pdbbind_version), file=sys.stderr)

    # evaluate on test, train, and out-of-bag predictions
    sets = [
        ('Test', self.model.predict(self.test_descs), self.test_target),
        ('Train', self.model.predict(self.train_descs), self.train_target),
        ('OOB', self.model.oob_prediction_, self.train_target)
    ]

    for name, pred, target in sets:
        print('%s set:' % name,
              'R2_score: %.4f' % r2_score(target, pred),
              'Rp: %.4f' % pearsonr(target, pred)[0],
              'RMSE: %.4f' % rmse(target, pred),
              'SD: %.4f' % standard_deviation_error(target, pred),
              sep='\t', file=sys.stderr)

    # compile trees for faster prediction; best-effort — fall back to the
    # plain sklearn model on any failure
    if compiledtrees is not None:
        try:
            print('Compiling Random Forest using sklearn-compiledtrees',
                  file=sys.stderr)
            self.model = compiledtrees.CompiledRegressionPredictor(
                self.model, n_jobs=self.n_jobs)
        except Exception as e:
            print('Failed to compile Random Forest with exception: %s' % e,
                  file=sys.stderr)
            print('Continuing without compiled RF.', file=sys.stderr)

    if sf_pickle is None:
        return self.save('RFScore_v%i_pdbbind%i.pickle' %
                         (self.version, pdbbind_version))
    else:
        return self.save(sf_pickle)
def train(self, home_dir=None, sf_pickle='', pdbbind_version=2007):
    """Train the NNScore ensemble on PDBBind descriptor CSVs.

    Trains `n` sklearn-style neural networks in parallel, keeps the 20
    best (by test-set score) as an ensemble, prints R**2/R diagnostics to
    stderr and pickles the scoring function.

    Parameters
    ----------
    home_dir : str or None (default=None)
        Directory holding the descriptor CSVs; defaults to the bundled
        'NNScore' directory next to this module.
    sf_pickle : str (default='')
        Output pickle filename; a versioned default name is used if empty.
    pdbbind_version : int (default=2007)
        PDBBind release suffix of the CSV files to load.

    Returns
    -------
    Result of ``self.save`` (presumably the pickle path — confirm against
    the base class).
    """
    if not home_dir:
        home_dir = dirname(__file__) + '/NNScore'
    # load precomputed descriptors and target values
    self.train_descs = np.loadtxt(home_dir + '/train_descs_pdbbind%i.csv'
                                  % (pdbbind_version),
                                  delimiter=',', dtype=float)
    self.train_target = np.loadtxt(
        home_dir + '/train_target_pdbbind%i.csv' % (pdbbind_version),
        delimiter=',', dtype=float)
    self.test_descs = np.loadtxt(home_dir + '/test_descs_pdbbind%i.csv'
                                 % (pdbbind_version),
                                 delimiter=',', dtype=float)
    self.test_target = np.loadtxt(home_dir + '/test_target_pdbbind%i.csv'
                                  % (pdbbind_version),
                                  delimiter=',', dtype=float)

    # FIX: removed unused local `n_dim` — the network topology is
    # hard-coded to a single hidden layer of 5 units below, so the input
    # dimensionality was computed but never consumed.

    # number of networks to sample; original implementation did 1000,
    # but 100 give results good enough.
    n = 1000
    # make nets reproducible
    random_seed(1)
    seeds = np.random.randint(123456789, size=n)
    trained_nets = Parallel(n_jobs=self.n_jobs, verbose=10)(
        delayed(_parallel_helper)(neuralnetwork(
            (5, ),
            random_state=seeds[i],
            activation='logistic',
            solver='lbfgs',
            max_iter=10000,
        ), 'fit', self.train_descs, self.train_target)
        for i in range(n))

    # get 20 best nets ranked by held-out test-set score
    best_idx = np.array([
        net.score(self.test_descs, self.test_target.flatten())
        for net in trained_nets
    ]).argsort()[::-1][:20]
    self.model = ensemble_model([trained_nets[i] for i in best_idx])

    r2 = self.model.score(self.test_descs, self.test_target)
    r = np.sqrt(r2)
    print('Test set: R**2:', r2, ' R:', r, file=sys.stderr)
    r2 = self.model.score(self.train_descs, self.train_target)
    r = np.sqrt(r2)
    print('Train set: R**2:', r2, ' R:', r, file=sys.stderr)

    if sf_pickle:
        return self.save(sf_pickle)
    else:
        return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
    """Train the NNScore ensemble on precomputed PDBBind descriptors.

    Trains `n` networks in parallel, keeps the 20 best (by test-set
    score) as an ensemble, prints diagnostics to stderr and pickles the
    scoring function.

    Parameters
    ----------
    home_dir : str or None (default=None)
        Directory holding the descriptor CSV; defaults to the bundled
        'NNScore' directory next to this module.
    sf_pickle : str or None (default=None)
        Output pickle filename; a versioned default name is used if None.
    pdbbind_version : int (default=2016)
        PDBBind release used to select train/test splits in the CSV.

    Returns
    -------
    Result of ``self.save`` (presumably the pickle path — confirm against
    the base class).
    """
    if not home_dir:
        home_dir = dirname(__file__) + '/NNScore'
    desc_path = path_join(home_dir, 'nnscore_descs.csv')
    # BUG FIX: previously hard-coded pdbbind_version=2016 here, silently
    # ignoring the method's pdbbind_version argument.
    super(nnscore, self)._load_pdbbind_desc(desc_path,
                                            pdbbind_version=pdbbind_version)

    # number of networks to sample; original implementation did 1000, but
    # 100 give results good enough.
    # TODO: allow user to specify number of nets?
    n = 1000
    # make nets reproducible
    random_seed(1)
    seeds = np.random.randint(123456789, size=n)
    trained_nets = (Parallel(
        n_jobs=self.n_jobs,
        verbose=10,
        pre_dispatch='all')(delayed(method_caller)(neuralnetwork(
            (5, ),
            random_state=seeds[i],
            activation='logistic',
            solver='lbfgs',
            max_iter=10000), 'fit', self.train_descs, self.train_target)
                            for i in range(n)))
    # get 20 best: sort ascending by test score, keep the tail
    # (lambda arg renamed so it no longer shadows the outer `n`)
    trained_nets.sort(
        key=lambda net: net.score(self.test_descs,
                                  self.test_target.flatten()))
    self.model = ensemble_model(trained_nets[-20:])

    error = rmse(self.model.predict(self.test_descs), self.test_target)
    r2 = self.model.score(self.test_descs, self.test_target)
    r = np.sqrt(r2)
    print('Test set:',
          'R**2: %.4f' % r2,
          'R: %.4f' % r,
          'RMSE: %.4f' % error,
          sep='\t', file=sys.stderr)
    error = rmse(self.model.predict(self.train_descs), self.train_target)
    r2 = self.model.score(self.train_descs, self.train_target)
    r = np.sqrt(r2)
    print('Train set:',
          'R**2: %.4f' % r2,
          'R: %.4f' % r,
          'RMSE: %.4f' % error,
          sep='\t', file=sys.stderr)

    if sf_pickle is None:
        return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
    else:
        return self.save(sf_pickle)
def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
    """Fit the RFScore random forest on PDBBind descriptors.

    Loads the versioned descriptor CSV, optionally drops sparse
    descriptor columns, fits the forest reproducibly, reports
    test/train/OOB metrics to stderr, optionally compiles the forest with
    sklearn-compiledtrees, and pickles the scoring function.
    """
    home_dir = home_dir or dirname(__file__) + '/RFScore'
    desc_path = path_join(home_dir, 'rfscore_descs_v%i.csv' % self.version)
    super(rfscore, self)._load_pdbbind_desc(desc_path,
                                            pdbbind_version=pdbbind_version)

    # Drop descriptor columns that never exceed the sparsity threshold.
    if self.spr > 0:
        self.mask = (self.train_descs > self.spr).any(axis=0)
        if self.mask.sum() > 0:
            self.train_descs = self.train_descs[:, self.mask]
            self.test_descs = self.test_descs[:, self.mask]

    # Seed right before fitting so the forest is reproducible.
    random_seed(1)
    self.model.fit(self.train_descs, self.train_target)

    print('Training RFScore v%i on PDBBind v%i'
          % (self.version, pdbbind_version), file=sys.stderr)

    evaluations = (
        ('Test', self.model.predict(self.test_descs), self.test_target),
        ('Train', self.model.predict(self.train_descs), self.train_target),
        ('OOB', self.model.oob_prediction_, self.train_target),
    )
    for label, predicted, observed in evaluations:
        stats = ('%s set:' % label,
                 'R2_score: %.4f' % r2_score(observed, predicted),
                 'Rp: %.4f' % pearsonr(observed, predicted)[0],
                 'RMSE: %.4f' % rmse(observed, predicted),
                 'SD: %.4f' % standard_deviation_error(observed, predicted))
        print(*stats, sep='\t', file=sys.stderr)

    # Best effort: keep the plain sklearn forest if compilation fails.
    if compiledtrees is not None:
        try:
            print('Compiling Random Forest using sklearn-compiledtrees',
                  file=sys.stderr)
            self.model = compiledtrees.CompiledRegressionPredictor(
                self.model, n_jobs=self.n_jobs)
        except Exception as e:
            print('Failed to compile Random Forest with exception: %s' % e,
                  file=sys.stderr)
            print('Continuing without compiled RF.', file=sys.stderr)

    if sf_pickle is not None:
        return self.save(sf_pickle)
    return self.save('RFScore_v%i_pdbbind%i.pickle'
                     % (self.version, pdbbind_version))
def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
    """Train the NNScore ensemble on precomputed PDBBind descriptors.

    Trains `n` networks in parallel, keeps the 20 best (by test-set
    score) as an ensemble, prints diagnostics to stderr (skipping sets
    too small for the statistics) and pickles the scoring function.

    Parameters
    ----------
    home_dir : str or None (default=None)
        Directory holding the descriptor CSV; defaults to the bundled
        'NNScore' directory next to this module.
    sf_pickle : str or None (default=None)
        Output pickle filename; a versioned default name is used if None.
    pdbbind_version : int (default=2016)
        PDBBind release used to select train/test splits in the CSV.

    Returns
    -------
    Result of ``self.save`` (presumably the pickle path — confirm against
    the base class).
    """
    if not home_dir:
        home_dir = dirname(__file__) + '/NNScore'
    desc_path = path_join(home_dir, 'nnscore_descs.csv')
    # populates self.train_descs/train_target/test_descs/test_target
    super(nnscore, self)._load_pdbbind_desc(desc_path,
                                            pdbbind_version=pdbbind_version)

    # number of network to sample; original implementation did 1000, but
    # 100 give results good enough.
    # TODO: allow user to specify number of nets?
    n = 1000
    # make nets reproducible
    random_seed(1)
    seeds = np.random.randint(123456789, size=n)
    trained_nets = (
        Parallel(n_jobs=self.n_jobs, verbose=10, pre_dispatch='all')(
            delayed(method_caller)(
                neuralnetwork((5,),
                              random_state=seeds[i],
                              activation='logistic',
                              solver='lbfgs',
                              max_iter=10000),
                'fit', self.train_descs, self.train_target)
            for i in range(n)))
    # get 20 best: sort ascending by test-set score, keep the tail
    trained_nets.sort(key=lambda n: n.score(self.test_descs,
                                            self.test_target.flatten()))
    self.model = ensemble_model(trained_nets[-20:])

    sets = [
        ('Test', self.model.predict(self.test_descs), self.test_target),
        ('Train', self.model.predict(self.train_descs), self.train_target)]

    for name, pred, target in sets:
        # pearsonr/SD need a minimal sample size; skip degenerate sets
        if len(target) < 3:
            print('There are less than 3 values to predict, skipping.',
                  file=sys.stderr)
            continue
        print('%s set:' % name,
              'R2_score: %.4f' % r2_score(target, pred),
              'Rp: %.4f' % pearsonr(target, pred)[0],
              'RMSE: %.4f' % rmse(target, pred),
              'SD: %.4f' % standard_deviation_error(target, pred),
              sep='\t', file=sys.stderr)

    if sf_pickle is None:
        return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
    else:
        return self.save(sf_pickle)
def fit(self, descs, target_values, train_alg='tnc', **kwargs):
    """Build the ffnet network and train it on the given data.

    Parameters
    ----------
    descs : array-like
        Training descriptors, one row per sample.
    target_values : array-like
        Target values to fit.
    train_alg : str (default='tnc')
        Suffix of the ffnet training method to call ('train_' + train_alg).
    **kwargs
        Extra keyword arguments forwarded to the ffnet training method.

    Returns
    -------
    self
    """
    # setup neural network: fully connected (tmlgraph) or layered
    # (mlgraph) topology
    if self.full_conn:
        conec = tmlgraph(self.shape, self.biases)
    else:
        conec = mlgraph(self.shape, self.biases)
    self.model = ffnet(conec)
    if self.random_weights:
        # FIX: idiomatic `is not None` instead of `not ... is None`
        if self.random_state is not None:
            random_seed(self.random_state)
        self.model.randomweights()
    # train; n_jobs < 1 requests all CPUs ('ncpu' in ffnet terms)
    getattr(self.model, 'train_' + train_alg)(
        descs, target_values,
        nproc='ncpu' if self.n_jobs < 1 else self.n_jobs,
        **kwargs)
    return self
def train(self, home_dir=None, sf_pickle=None, pdbbind_version=2016):
    """Train the NNScore ensemble on precomputed PDBBind descriptors.

    Samples many independently seeded networks in parallel, keeps the 20
    best by test-set score as an ensemble, reports metrics to stderr and
    pickles the resulting scoring function.
    """
    home_dir = home_dir or dirname(__file__) + '/NNScore'
    super(nnscore, self)._load_pdbbind_desc(
        path_join(home_dir, 'nnscore_descs.csv'),
        pdbbind_version=pdbbind_version)

    # Number of networks to sample; the original implementation used
    # 1000, though 100 already gives good results.
    # TODO: allow user to specify number of nets?
    num_nets = 1000
    random_seed(1)  # reproducible per-net seeds
    net_seeds = np.random.randint(123456789, size=num_nets)

    training_jobs = (
        delayed(method_caller)(
            neuralnetwork((5,),
                          random_state=seed,
                          activation='logistic',
                          solver='lbfgs',
                          max_iter=10000),
            'fit', self.train_descs, self.train_target)
        for seed in net_seeds)
    candidates = Parallel(n_jobs=self.n_jobs, verbose=10,
                          pre_dispatch='all')(training_jobs)

    # Keep the 20 best scorers on the held-out test set.
    candidates.sort(key=lambda net: net.score(self.test_descs,
                                              self.test_target.flatten()))
    self.model = ensemble_model(candidates[-20:])

    for name, pred, target in (
            ('Test', self.model.predict(self.test_descs), self.test_target),
            ('Train', self.model.predict(self.train_descs),
             self.train_target)):
        print('%s set:' % name,
              'R2_score: %.4f' % r2_score(target, pred),
              'Rp: %.4f' % pearsonr(target, pred)[0],
              'RMSE: %.4f' % rmse(target, pred),
              'SD: %.4f' % standard_deviation_error(target, pred),
              sep='\t', file=sys.stderr)

    if sf_pickle is None:
        sf_pickle = 'NNScore_pdbbind%i.pickle' % (pdbbind_version)
    return self.save(sf_pickle)
def train(self, home_dir=None, sf_pickle=''):
    """Train the RFScore random forest on precomputed descriptors.

    Parameters
    ----------
    home_dir : str or None (default=None)
        Directory holding the descriptor CSVs; defaults to the bundled
        'RFScore' directory next to this module.
    sf_pickle : str (default='')
        Output pickle filename; a versioned default name is used if empty.

    Returns
    -------
    Result of ``self.save`` (presumably the pickle path — confirm against
    the base class).
    """
    if not home_dir:
        home_dir = dirname(__file__) + '/RFScore'
    # load precomputed descriptors and target values
    self.train_descs = np.loadtxt(home_dir + '/train_descs_v%i.csv'
                                  % (self.version),
                                  delimiter=',', dtype=float)
    self.train_target = np.loadtxt(home_dir + '/train_target.csv',
                                   delimiter=',', dtype=float)
    self.test_descs = np.loadtxt(home_dir + '/test_descs_v%i.csv'
                                 % (self.version),
                                 delimiter=',', dtype=float)
    self.test_target = np.loadtxt(home_dir + '/test_target.csv',
                                  delimiter=',', dtype=float)

    # remove sparse dimensions: keep only columns where some training
    # value exceeds the sparsity threshold self.spr
    if self.spr > 0:
        self.mask = (self.train_descs > self.spr).any(axis=0)
        if self.mask.sum() > 0:
            self.train_descs = self.train_descs[:, self.mask]
            self.test_descs = self.test_descs[:, self.mask]

    # make nets reproducible
    random_seed(1)
    self.model.fit(self.train_descs, self.train_target)

    # FIX: print() function instead of Python-2-only print statements;
    # output is byte-identical to the old statements.
    print("Training RFScore v%i" % self.version)

    r2 = self.model.score(self.test_descs, self.test_target)
    r = np.sqrt(r2)
    print('Test set: R**2:', r2, ' R:', r)

    # out-of-bag RMSE on the training set
    rmse = np.sqrt(
        np.mean(np.square(self.model.oob_prediction_ - self.train_target)))
    r2 = self.model.score(self.train_descs, self.train_target)
    r = np.sqrt(r2)
    print('Train set: R**2:', r2, ' R:', r, 'RMSE:', rmse)

    if sf_pickle:
        return self.save(sf_pickle)
    else:
        return self.save('RFScore_v%i.pickle' % self.version)
def fit(self, descs, target_values, train_alg='tnc', **kwargs):
    """Construct the ffnet network topology and fit it to the data.

    Parameters
    ----------
    descs : array-like
        Training descriptors, one row per sample.
    target_values : array-like
        Target values to fit.
    train_alg : str (default='tnc')
        Suffix of the ffnet training method to call ('train_' + train_alg).
    **kwargs
        Extra keyword arguments forwarded to the ffnet training method.

    Returns
    -------
    self
    """
    # setup neural network: fully connected (tmlgraph) vs layered
    # (mlgraph) connectivity graph
    if self.full_conn:
        conec = tmlgraph(self.shape, self.biases)
    else:
        conec = mlgraph(self.shape, self.biases)
    self.model = ffnet(conec)
    if self.random_weights:
        # FIX: idiomatic `is not None` instead of `not ... is None`
        if self.random_state is not None:
            random_seed(self.random_state)
        self.model.randomweights()
    # train; n_jobs < 1 requests all CPUs ('ncpu' in ffnet terms)
    getattr(self.model, 'train_' + train_alg)(
        descs,
        target_values,
        nproc='ncpu' if self.n_jobs < 1 else self.n_jobs,
        **kwargs)
    return self
def train(self, home_dir=None, sf_pickle=''):
    """Train the RFScore random forest on precomputed descriptors.

    Parameters
    ----------
    home_dir : str or None (default=None)
        Directory holding the descriptor CSVs; defaults to the bundled
        'RFScore' directory next to this module.
    sf_pickle : str (default='')
        Output pickle filename; a versioned default name is used if empty.

    Returns
    -------
    Result of ``self.save`` (presumably the pickle path — confirm against
    the base class).
    """
    if not home_dir:
        home_dir = dirname(__file__) + '/RFScore'
    # load precomputed descriptors and target values
    self.train_descs = np.loadtxt(home_dir + '/train_descs_v%i.csv'
                                  % (self.version),
                                  delimiter=',', dtype=float)
    self.train_target = np.loadtxt(home_dir + '/train_target.csv',
                                   delimiter=',', dtype=float)
    self.test_descs = np.loadtxt(home_dir + '/test_descs_v%i.csv'
                                 % (self.version),
                                 delimiter=',', dtype=float)
    self.test_target = np.loadtxt(home_dir + '/test_target.csv',
                                  delimiter=',', dtype=float)

    # remove sparse dimensions: keep only columns where some training
    # value exceeds the sparsity threshold self.spr
    if self.spr > 0:
        self.mask = (self.train_descs > self.spr).any(axis=0)
        if self.mask.sum() > 0:
            self.train_descs = self.train_descs[:, self.mask]
            self.test_descs = self.test_descs[:, self.mask]

    # make nets reproducible
    random_seed(1)
    self.model.fit(self.train_descs, self.train_target)

    # FIX: print() function instead of Python-2-only print statements;
    # output is byte-identical to the old statements.
    print("Training RFScore v%i" % self.version)

    r2 = self.model.score(self.test_descs, self.test_target)
    r = np.sqrt(r2)
    print('Test set: R**2:', r2, ' R:', r)

    # out-of-bag RMSE on the training set
    rmse = np.sqrt(
        np.mean(np.square(self.model.oob_prediction_ - self.train_target)))
    r2 = self.model.score(self.train_descs, self.train_target)
    r = np.sqrt(r2)
    print('Train set: R**2:', r2, ' R:', r, 'RMSE:', rmse)

    if sf_pickle:
        return self.save(sf_pickle)
    else:
        return self.save('RFScore_v%i.pickle' % self.version)
def train(self, home_dir=None, sf_pickle='', pdbbind_version=2016):
    """Train the NNScore ensemble from a pandas descriptor table.

    Reads the combined descriptor CSV (indexed by PDB id), splits it
    into train (refined minus core) and test (core) sets for the given
    PDBBind release, trains `n` networks in parallel, keeps the 20 best
    as an ensemble, prints diagnostics to stderr and pickles the model.

    Parameters
    ----------
    home_dir : str or None (default=None)
        Directory holding the descriptor CSV; defaults to the bundled
        'NNScore' directory next to this module.
    sf_pickle : str (default='')
        Output pickle filename; a versioned default name is used if empty.
    pdbbind_version : int (default=2016)
        PDBBind release used to select the boolean split columns.

    Returns
    -------
    Result of ``self.save`` (presumably the pickle path — confirm against
    the base class).
    """
    if not home_dir:
        home_dir = dirname(__file__) + '/NNScore'
    # load precomputed descriptors and target values
    df = pd.read_csv(home_dir + '/nnscore_descs.csv', index_col='pdbid')
    train_set = 'refined'
    test_set = 'core'
    # descriptor columns are named '0'..'N-1' in the CSV
    cols = list(map(str, range(len(self.descriptor_generator))))
    # train on the refined set minus the core (test) set; the CSV carries
    # boolean membership columns like '2016_refined' / '2016_core'
    self.train_descs = (
        df[(df['%i_%s' % (pdbbind_version, train_set)] &
            ~df['%i_%s' % (pdbbind_version, test_set)])][cols].values)
    self.train_target = (
        df[(df['%i_%s' % (pdbbind_version, train_set)] &
            ~df['%i_%s' % (pdbbind_version, test_set)])]['act'].values)
    self.test_descs = df[df['%i_%s' % (pdbbind_version,
                                       test_set)]][cols].values
    self.test_target = df[df['%i_%s' % (pdbbind_version,
                                        test_set)]]['act'].values

    # number of network to sample; original implementation did 1000, but
    # 100 give results good enough.
    n = 1000
    # make nets reproducible
    random_seed(1)
    seeds = np.random.randint(123456789, size=n)
    trained_nets = (Parallel(n_jobs=self.n_jobs, verbose=10)(
        delayed(_parallel_helper)(neuralnetwork(
            (5, ),
            random_state=seeds[i],
            activation='logistic',
            solver='lbfgs',
            max_iter=10000,
        ), 'fit', self.train_descs, self.train_target) for i in range(n)))
    # get 20 best nets ranked by held-out test-set score
    best_idx = np.array([
        net.score(self.test_descs, self.test_target.flatten())
        for net in trained_nets
    ]).argsort()[::-1][:20]
    self.model = ensemble_model([trained_nets[i] for i in best_idx])

    error = rmse(self.model.predict(self.test_descs), self.test_target)
    r2 = self.model.score(self.test_descs, self.test_target)
    r = np.sqrt(r2)
    print('Test set:',
          'R**2: %.4f' % r2,
          'R: %.4f' % r,
          'RMSE: %.4f' % error,
          sep='\t', file=sys.stderr)
    error = rmse(self.model.predict(self.train_descs), self.train_target)
    r2 = self.model.score(self.train_descs, self.train_target)
    r = np.sqrt(r2)
    print('Train set:',
          'R**2: %.4f' % r2,
          'R: %.4f' % r,
          'RMSE: %.4f' % error,
          sep='\t', file=sys.stderr)

    if sf_pickle:
        return self.save(sf_pickle)
    else:
        return self.save('NNScore_pdbbind%i.pickle' % (pdbbind_version))
def train(self, home_dir=None, sf_pickle='', pdbbind_version=2007):
    """Train the RFScore random forest on PDBBind descriptor CSVs.

    Loads versioned train/test descriptor and target files, optionally
    drops sparse descriptor columns, fits the forest reproducibly,
    reports metrics to stderr, optionally compiles the forest with
    sklearn-compiledtrees and pickles the scoring function.

    Parameters
    ----------
    home_dir : str or None (default=None)
        Directory holding the descriptor CSVs; defaults to the bundled
        'RFScore' directory next to this module.
    sf_pickle : str (default='')
        Output pickle filename; a versioned default name is used if empty.
    pdbbind_version : int (default=2007)
        PDBBind release suffix of the CSV files to load.

    Returns
    -------
    Result of ``self.save`` (presumably the pickle path — confirm against
    the base class).
    """
    if not home_dir:
        home_dir = dirname(__file__) + '/RFScore'
    # load precomputed descriptors and target values
    self.train_descs = np.loadtxt(home_dir +
                                  '/train_descs_v%i_pdbbind%i.csv'
                                  % (self.version, pdbbind_version),
                                  delimiter=',', dtype=float)
    self.train_target = np.loadtxt(
        home_dir + '/train_target_pdbbind%i.csv' % (pdbbind_version),
        delimiter=',', dtype=float)
    self.test_descs = np.loadtxt(home_dir +
                                 '/test_descs_v%i_pdbbind%i.csv'
                                 % (self.version, pdbbind_version),
                                 delimiter=',', dtype=float)
    self.test_target = np.loadtxt(home_dir + '/test_target_pdbbind%i.csv'
                                  % (pdbbind_version),
                                  delimiter=',', dtype=float)

    # remove sparse dimensions: keep only columns where some training
    # value exceeds the sparsity threshold self.spr
    if self.spr > 0:
        self.mask = (self.train_descs > self.spr).any(axis=0)
        if self.mask.sum() > 0:
            self.train_descs = self.train_descs[:, self.mask]
            self.test_descs = self.test_descs[:, self.mask]

    # make nets reproducible (seed before fitting the forest)
    random_seed(1)
    self.model.fit(self.train_descs, self.train_target)

    print("Training RFScore v%i on PDBBind v%i"
          % (self.version, pdbbind_version), file=sys.stderr)

    error = rmse(self.model.predict(self.test_descs), self.test_target)
    r2 = self.model.score(self.test_descs, self.test_target)
    r = np.sqrt(r2)
    print('Test set: R**2:', r2, ' R:', r, 'RMSE:', error,
          file=sys.stderr)
    error = rmse(self.model.predict(self.train_descs), self.train_target)
    r2 = self.model.score(self.train_descs, self.train_target)
    r = np.sqrt(r2)
    print('Train set: R**2:', r2, ' R:', r, 'RMSE:', error,
          file=sys.stderr)

    # compile trees for faster prediction; best-effort — fall back to the
    # plain sklearn model on any failure
    if compiledtrees is not None:
        try:
            print("Compiling Random Forest using sklearn-compiledtrees",
                  file=sys.stderr)
            self.model = compiledtrees.CompiledRegressionPredictor(
                self.model, n_jobs=self.n_jobs)
        except Exception as e:
            print("Failed to compile Random Forest with exception: %s" % e,
                  file=sys.stderr)
            print("Continuing without compiled RF.", file=sys.stderr)

    if sf_pickle:
        return self.save(sf_pickle)
    else:
        return self.save('RFScore_v%i_pdbbind%i.pickle'
                         % (self.version, pdbbind_version))
def train(self, home_dir=None, sf_pickle='', pdbbind_version=2016):
    """Train the RFScore random forest from a pandas descriptor table.

    Reads the combined descriptor CSV (indexed by PDB id), splits it
    into train (refined minus core) and test (core) sets for the given
    PDBBind release, optionally drops sparse descriptor columns, fits
    the forest reproducibly, reports metrics (including OOB RMSE) to
    stderr, optionally compiles the forest with sklearn-compiledtrees
    and pickles the scoring function.

    Parameters
    ----------
    home_dir : str or None (default=None)
        Directory holding the descriptor CSV; defaults to the bundled
        'RFScore' directory next to this module.
    sf_pickle : str (default='')
        Output pickle filename; a versioned default name is used if empty.
    pdbbind_version : int (default=2016)
        PDBBind release used to select the boolean split columns.

    Returns
    -------
    Result of ``self.save`` (presumably the pickle path — confirm against
    the base class).
    """
    if not home_dir:
        home_dir = dirname(__file__) + '/RFScore'
    # load precomputed descriptors and target values; descriptor columns
    # are named '0'..'N-1', split membership in e.g. '2016_refined'
    df = pd.read_csv(home_dir + '/rfscore_descs_v%i.csv' % self.version,
                     index_col='pdbid')
    train_set = 'refined'
    test_set = 'core'
    # train on the refined set minus the core (test) set
    self.train_descs = df[
        df['%i_%s' % (pdbbind_version, train_set)] &
        ~df['%i_%s' % (pdbbind_version, test_set)]][list(
            map(str, range(len(self.descriptor_generator))))].values
    self.train_target = df[
        df['%i_%s' % (pdbbind_version, train_set)] &
        ~df['%i_%s' % (pdbbind_version, test_set)]]['act'].values
    self.test_descs = df[df['%i_%s' % (pdbbind_version, test_set)]][list(
        map(str, range(len(self.descriptor_generator))))].values
    self.test_target = df[df['%i_%s' % (pdbbind_version,
                                        test_set)]]['act'].values

    # remove sparse dimensions: keep only columns where some training
    # value exceeds the sparsity threshold self.spr
    if self.spr > 0:
        self.mask = (self.train_descs > self.spr).any(axis=0)
        if self.mask.sum() > 0:
            self.train_descs = self.train_descs[:, self.mask]
            self.test_descs = self.test_descs[:, self.mask]

    # make nets reproducible (seed before fitting the forest)
    random_seed(1)
    self.model.fit(self.train_descs, self.train_target)

    print("Training RFScore v%i on PDBBind v%i"
          % (self.version, pdbbind_version), file=sys.stderr)

    error = rmse(self.model.predict(self.test_descs), self.test_target)
    r2 = self.model.score(self.test_descs, self.test_target)
    r = np.sqrt(r2)
    print('Test set:',
          'R**2: %.4f' % r2,
          'R: %.4f' % r,
          'RMSE: %.4f' % error,
          sep='\t', file=sys.stderr)
    error = rmse(self.model.predict(self.train_descs), self.train_target)
    # out-of-bag error as an extra generalization estimate
    oob_error = rmse(self.model.oob_prediction_, self.train_target)
    r2 = self.model.score(self.train_descs, self.train_target)
    r = np.sqrt(r2)
    print('Train set:',
          'R**2: %.4f' % r2,
          'R: %.4f' % r,
          'RMSE: %.4f' % error,
          'OOB RMSE: %.4f' % oob_error,
          sep='\t', file=sys.stderr)

    # compile trees for faster prediction; best-effort — fall back to the
    # plain sklearn model on any failure
    if compiledtrees is not None:
        try:
            print("Compiling Random Forest using sklearn-compiledtrees",
                  file=sys.stderr)
            self.model = compiledtrees.CompiledRegressionPredictor(
                self.model, n_jobs=self.n_jobs)
        except Exception as e:
            print("Failed to compile Random Forest with exception: %s" % e,
                  file=sys.stderr)
            print("Continuing without compiled RF.", file=sys.stderr)

    if sf_pickle:
        return self.save(sf_pickle)
    else:
        return self.save('RFScore_v%i_pdbbind%i.pickle'
                         % (self.version, pdbbind_version))