def train(validate, dset=None):
    """Train models on the cetane number database and report set errors.

    Args:
        validate (bool): if True, uses a validation set during training
        dset (str): data subset used for selection, prediction, and errors
    """
    server = Server()
    server.load_data('cn_model_v1.0.csv')
    server.train(validate=validate, selection_set=dset)
    server.use(dset=dset)
    server.errors('rmse', 'med_abs_error', 'mean_abs_error', 'r2', dset=dset)
def create_project():
    """Build, train, and save a small project from the CN database."""
    server = Server()
    server.load_data('cn_model_v1.0.csv')
    server.create_project('use_project', num_pools=2, num_candidates=2)
    server.train()
    server.save_project()
def predict(input_txt, results_file, prj_file, form='name',
            temp_db='_new_mols.csv', clean_up=True):
    '''Predicts values for new data using pre-existing .prj file

    Args:
        input_txt (str): path to .txt file of molecule names or SMILES
        results_file (str): path where this function writes its results
        prj_file (str): path to pre-existing .prj file
        form (str): `name` for molecule names, `SMILES` for SMILES strings
        temp_db (str): path for the temporary database this function creates
        clean_up (bool): if True, removes all intermediate files created here
            (input/results files are kept)
    '''
    logger.stream_level = 'disable'
    server = Server(prj_file=prj_file)
    feature_names = server._df.input_names
    # Build a QSPR database for the new molecules, then swap it in as the
    # server's active DataFrame so the loaded project predicts on it
    create_db(input_txt, temp_db, form=form, clean_up=clean_up)
    prediction_data = DataFrame(temp_db)
    prediction_data.set_inputs(feature_names)
    server._df = prediction_data
    server.use(output_filename=results_file)
    if clean_up:
        remove(temp_db)
        rmtree(prj_file.replace('.prj', ''))
def test_load_data(self):
    # Loading the test database should populate the DataFrame (482 rows)
    # and package the data sets
    print('\nUNIT TEST: Server.load_data')
    server = Server()
    server.load_data(DB_LOC)
    self.assertEqual(len(server._df), 482)
    self.assertEqual(type(server._sets), PackagedData)
    remove('config.yml')
def main():
    """Train and save a 5-pool kinematic viscosity project."""
    logger.stream_level = 'debug'
    server = Server(num_processes=4)
    server.load_data('../kv_model_v1.0.csv')
    server.create_project('kinetic_viscosity', num_pools=5,
                          num_candidates=25)
    server.train(shuffle='train', split=[0.7, 0.2, 0.1], validate=True,
                 selection_set='test')
    server.save_project(del_candidates=True)
def test_use_project(self):
    # use() on a trained project should yield one prediction per DB entry
    print('\nUNIT TEST: Server.use')
    server = Server()
    server.load_data(DB_LOC, random=True, split=[0.7, 0.2, 0.1])
    server.create_project('test_project', 2, 2)
    server._vars['epochs'] = 100
    server.train()
    predictions = server.use()
    self.assertEqual(len(predictions), len(server._df))
    remove('config.yml')
    rmtree('test_project')
def test_create_project(self):
    # Every pool/candidate combination should get its own directory
    print('\nUNIT TEST: Server.create_project')
    server = Server()
    server.create_project('test_project', 3, 5)
    for pool_idx in range(3):
        pool_dir = 'pool_{}'.format(pool_idx)
        for cand_idx in range(5):
            cand_dir = 'candidate_{}'.format(cand_idx)
            self.assertTrue(isdir(join('test_project', pool_dir, cand_dir)))
    remove('config.yml')
    rmtree('test_project')
def test_init(self):
    # Constructing a Server should write config.yml with default settings
    print('\nUNIT TEST: Server init')
    server = Server()
    self.assertTrue(exists('config.yml'))
    self.assertEqual(server._vars, default_config())
    remove('config.yml')
def test_server_limit(self):
    # Limiting inputs to 2 should hold both in memory and after reloading
    # the limited database written to disk
    print('\nUNIT TEST: limit_rforest (Server)')
    server = Server()
    server.load_data(DB_LOC)
    server.limit_inputs(2, output_filename='cn_limited.csv')

    def _assert_two_inputs():
        # Both the DataFrame metadata and the packaged learn set rows
        # should reflect exactly two input variables
        self.assertEqual(len(server._df._input_names), 2)
        self.assertEqual(len(server._sets.learn_x[0]), 2)

    _assert_two_inputs()
    server.load_data('cn_limited.csv')
    _assert_two_inputs()
    remove('cn_limited.csv')
    remove('config.yml')
def main():
    """Create a parity plot for a saved kinematic viscosity project."""
    logger.stream_level = 'info'
    server = Server(prj_file='kinetic_viscosity.prj')
    # Training-set experimental values = learning + validation targets
    train_exp = [*server._sets.learn_y, *server._sets.valid_y]
    train_pred = server.use(dset='train')
    train_errors = server.errors('rmse', 'r2', dset='train')
    test_exp = server._sets.test_y
    test_pred = server.use(dset='test')
    test_errors = server.errors('rmse', 'r2', dset='test')
    kv_plot = ParityPlot(
        title='Predicted vs. Experimental Kinematic Viscosity',
        x_label='Experimental KV',
        y_label='Predicted KV'
    )
    kv_plot.add_series(train_exp, train_pred, name='Training Set',
                       color='blue')
    kv_plot.add_series(test_exp, test_pred, name='Test Set', color='red')
    kv_plot.add_error_bars(test_errors['rmse'], label='Test RMSE')
    kv_plot._add_label('Test R-Squared', test_errors['r2'])
    kv_plot._add_label('Train RMSE', train_errors['rmse'])
    kv_plot._add_label('Train R-Squared', train_errors['r2'])
    kv_plot.save('../kv_parity_plot.png')
def test_train_project(self):
    # Training a 2x2 project should leave a model.h5 in each pool and in
    # each candidate directory
    print('\nUNIT TEST: Server.train')
    server = Server()
    server.load_data(DB_LOC, random=True, split=[0.7, 0.2, 0.1])
    server.create_project('test_project', 2, 2)
    server._vars['epochs'] = 100
    server.train()
    for pool_idx in range(2):
        pool_dir = join('test_project', 'pool_{}'.format(pool_idx))
        self.assertTrue(exists(join(pool_dir, 'model.h5')))
        for cand_idx in range(2):
            cand_dir = join(pool_dir, 'candidate_{}'.format(cand_idx))
            self.assertTrue(exists(join(cand_dir, 'model.h5')))
    remove('config.yml')
    rmtree('test_project')
def test_multiprocessing_train(self):
    # Multiprocess training (8 workers) should still produce a model.h5
    # for every pool and candidate
    print('\nUNIT TEST: multiprocessing training')
    server = Server(num_processes=8)
    server.load_data(DB_LOC)
    server.create_project('test_project', 2, 4)
    server._vars['epochs'] = 100
    server.train()
    for pool_idx in range(2):
        pool_dir = join('test_project', 'pool_{}'.format(pool_idx))
        self.assertTrue(exists(join(pool_dir, 'model.h5')))
        for cand_idx in range(4):
            cand_dir = join(pool_dir, 'candidate_{}'.format(cand_idx))
            self.assertTrue(exists(join(cand_dir, 'model.h5')))
    remove('config.yml')
    rmtree('test_project')
def main():
    """Tune hyperparameters for the KV model (20 bees, 20 cycles)."""
    logger.stream_level = 'debug'
    server = Server(num_processes=4)
    server.load_data('../kv_model_v1.0.csv')
    server.tune_hyperparameters(20, 20, shuffle='train',
                                split=[0.7, 0.2, 0.1], eval_set='test')
def tune(num_processes, shuffle=None, split=None, validate=True,
         eval_set=None, eval_fn='rmse'):
    """Tune ANN hyperparameters using the cetane number database.

    Args:
        num_processes (int): number of concurrent tuning processes
        shuffle (str): set shuffling behavior forwarded to tuning
        split (list): [learn %, valid %, test %]; defaults to
            [0.7, 0.2, 0.1]
        validate (bool): if True, uses a validation set during tuning
        eval_set (str): data subset used to evaluate candidate solutions
        eval_fn (str): error function used to evaluate candidate solutions
    """
    # Avoid a shared mutable default argument; the effective default split
    # is unchanged for existing callers
    if split is None:
        split = [0.7, 0.2, 0.1]
    logger.stream_level = 'debug'
    sv = Server(num_processes=num_processes)
    sv.load_data('cn_model_v1.0.csv', random=True, split=split)
    sv.tune_hyperparameters(2, 2, shuffle=shuffle, split=split,
                            validate=validate, eval_set=eval_set,
                            eval_fn=eval_fn)
def retrain():
    """Reload a saved project, retrain it on the CN database, re-save."""
    server = Server(prj_file='_training_test.prj')
    server.load_data('cn_model_v1.0.csv')
    server.train(retrain=True)
    server.save_project(del_candidates=True)
def train_project(validate, shuffle, split=None, num_processes=1, dset=None,
                  sel_fn='rmse', output_filename=None):
    """Create, train, evaluate, and save a 2x2 CN project.

    Args:
        validate (bool): if True, uses a validation set during training
        shuffle (str): set shuffling behavior forwarded to training
        split (list): [learn %, valid %, test %]; defaults to
            [0.7, 0.2, 0.1]
        num_processes (int): number of concurrent training processes
        dset (str): data subset used for candidate selection and prediction
        sel_fn (str): error function used for candidate selection
        output_filename (str): if given, predictions are written here
    """
    # Avoid a shared mutable default argument; the effective default split
    # is unchanged for existing callers
    if split is None:
        split = [0.7, 0.2, 0.1]
    sv = Server(num_processes=num_processes)
    sv.load_data('cn_model_v1.0.csv')
    sv.create_project('_training_test', num_pools=2, num_candidates=2)
    sv.train(shuffle=shuffle, split=split, selection_set=dset,
             selection_fn=sel_fn, validate=validate)
    sv.use(dset=dset, output_filename=output_filename)
    # NOTE(review): unlike use(), errors() is not passed dset here — confirm
    # whether evaluating on the default subset is intentional
    sv.errors('rmse', 'med_abs_error', 'mean_abs_error', 'r2')
    sv.save_project()
def create_model(prop_abvr: str, smiles: list = None, targets: list = None,
                 db_name: str = None, qspr_backend: str = 'padel',
                 create_plots: bool = True, data_split: list = None,
                 log_level: str = 'info', log_to_file: bool = True,
                 num_processes: int = 1):
    ''' create_model: ECRL's database/model creation workflow for all
    publications

    Args:
        prop_abvr (str): abbreviation for the property name (e.g. CN)
        smiles (list): if supplied with targets, creates a new database
        targets (list): if supplied with smiles, creates a new database
        db_name (str): you may supply an existing ECNet-formatted database
        qspr_backend (str): if creating new database, generation software to
            use (`padel`, `alvadesc`)
        create_plots (bool): if True, creates plots for median absolute error
            vs. number of descriptors as inputs, parity plot for all sets
        data_split (list): [learn %, valid %, test %] for all supplied data;
            defaults to [0.7, 0.2, 0.1]
        log_level (str): `debug`, `info`, `warn`, `error`, `crit`
        log_to_file (bool): if True, saves workflow logs to a file in `logs`
            directory
        num_processes (int): number of concurrent processes to use for
            various tasks
    '''

    # Avoid a shared mutable default argument; the effective default split
    # is unchanged for existing callers
    if data_split is None:
        data_split = [0.7, 0.2, 0.1]

    # Initialize logging
    logger.stream_level = log_level
    if log_to_file:
        logger.file_level = log_level

    # If database not supplied, create database from supplied SMILES, targets
    if db_name is None:
        if smiles is None or targets is None:
            raise ValueError('Must supply SMILES and target values')
        db_name = datetime.now().strftime('{}_model_%Y%m%d.csv'.format(
            prop_abvr
        ))
        logger.log('info', 'Creating database {}...'.format(db_name),
                   'WORKFLOW')
        create_db(smiles, db_name, targets, prop_abvr, backend=qspr_backend)
        logger.log('info', 'Created database {}'.format(db_name), 'WORKFLOW')

    # Create database split, each subset has proportionally equal number of
    # compounds based on range of experimental/target values
    logger.log('info', 'Creating optimal data split...', 'WORKFLOW')
    prop_range_from_split(db_name, data_split)
    logger.log('info', 'Created optimal data split', 'WORKFLOW')
    df = DataFrame(db_name)
    df.create_sets()
    logger.log('info', '\tLearning set: {}'.format(len(df.learn_set)),
               'WORKFLOW')
    logger.log('info', '\tValidation set: {}'.format(len(df.valid_set)),
               'WORKFLOW')
    logger.log('info', '\tTest set: {}'.format(len(df.test_set)), 'WORKFLOW')

    # Find optimal number of QSPR input variables
    logger.log('info', 'Finding optimal number of inputs...', 'WORKFLOW')
    errors, desc = find_optimal_num_inputs(db_name, 'valid', num_processes)
    df = DataFrame(db_name)
    df.set_inputs(desc)
    df.save(db_name.replace('.csv', '_opt.csv'))
    logger.log('info', 'Found optimal number of inputs', 'WORKFLOW')
    logger.log('info', '\tNumber of inputs: {}'.format(len(df._input_names)),
               'WORKFLOW')

    # Plot the curve of MAE vs. num. desc. added, if desired
    if create_plots:
        logger.log('info', 'Creating plot of MAE vs. descriptors...',
                   'WORKFLOW')
        num_add = [e[0] for e in errors]
        maes = [e[1] for e in errors]
        opt_num = len(desc)
        plt.clf()
        plt.rcParams['font.family'] = 'Times New Roman'
        plt.plot(num_add, maes, c='blue')
        # Vertical marker at the chosen (optimal) descriptor count
        plt.axvline(x=opt_num, c='red', linestyle='--')
        plt.xlabel('Number of Descriptors as ANN Input Variables')
        plt.ylabel('Median Absolute Error of {} Predictions'.format(
            prop_abvr))
        plt.savefig(db_name.replace('.csv', '_desc_curve.png'))
        logger.log('info', 'Created plot of MAE vs. descriptors', 'WORKFLOW')

    # Tune ANN hyperparameters according to validation set performance
    logger.log('info', 'Tuning ANN hyperparameters...', 'WORKFLOW')
    config = default_config()
    config = tune_hyperparameters(df, config, 25, 10, num_processes,
                                  shuffle='train', split=[0.7, 0.2, 0.1],
                                  validate=True, eval_set='valid',
                                  eval_fn='med_abs_error', epochs=300)
    # Tuning uses a reduced epoch count; restore the default for training
    config['epochs'] = default_config()['epochs']
    config_filename = db_name.replace('.csv', '.yml')
    save_config(config, config_filename)
    logger.log('info', 'Tuned ANN hyperparameters', 'WORKFLOW')
    logger.log('info', '\tLearning rate: {}'.format(config['learning_rate']),
               'WORKFLOW')
    logger.log('info', '\tLR decay: {}'.format(config['decay']), 'WORKFLOW')
    logger.log('info', '\tBatch size: {}'.format(config['batch_size']),
               'WORKFLOW')
    logger.log('info', '\tPatience: {}'.format(config['patience']),
               'WORKFLOW')
    logger.log('info', '\tHidden layers: {}'.format(config['hidden_layers']),
               'WORKFLOW')

    # Create Model
    logger.log('info', 'Generating ANN...', 'WORKFLOW')
    sv = Server(db_name.replace('.csv', '.yml'),
                num_processes=num_processes)
    sv.load_data(db_name.replace('.csv', '_opt.csv'))
    sv.create_project(db_name.replace('.csv', ''), 5, 75)
    sv.train(validate=True, selection_set='valid', shuffle='train',
             split=[0.7, 0.2, 0.1], selection_fn='med_abs_error')
    logger.log('info', 'ANN Generated', 'WORKFLOW')
    logger.log('info', 'Measuring ANN performance...', 'WORKFLOW')
    preds_test = sv.use(dset='test')
    preds_train = sv.use(dset='train')
    test_errors = sv.errors('r2', 'med_abs_error', dset='test')
    train_errors = sv.errors('r2', 'med_abs_error', dset='train')
    logger.log('info', 'Measured ANN performance', 'WORKFLOW')
    logger.log('info', '\tTraining set:\t R2: {}\t MAE: {}'.format(
        train_errors['r2'], train_errors['med_abs_error']), 'WORKFLOW')
    logger.log('info', '\tTesting set:\t R2: {}\t MAE: {}'.format(
        test_errors['r2'], test_errors['med_abs_error']), 'WORKFLOW')
    sv.save_project(del_candidates=True)

    # Create a parity plot of all sets, if desired
    if create_plots:
        logger.log('info', 'Creating parity plot...', 'WORKFLOW')
        plt.clf()
        parity_plot = ParityPlot(
            '',
            'Experimental {} Value'.format(prop_abvr),
            'Predicted {} Value'.format(prop_abvr)
        )
        # Training series combines learning + validation targets
        parity_plot.add_series(concatenate(
            (sv._sets.learn_y, sv._sets.valid_y)
        ), preds_train, 'Training Set', 'blue')
        parity_plot.add_series(sv._sets.test_y, preds_test, 'Test Set',
                               'red')
        parity_plot.add_error_bars(test_errors['med_abs_error'], 'Test MAE')
        parity_plot._add_label('Test $R^2$', test_errors['r2'])
        parity_plot._add_label('Training MAE',
                               train_errors['med_abs_error'])
        parity_plot._add_label('Training $R^2$', train_errors['r2'])
        parity_plot.save(db_name.replace('.csv', '_parity.png'))
        logger.log('info', 'Created parity plot', 'WORKFLOW')
def main():
    """Remove outliers from the CN database, writing a cleaned copy."""
    logger.stream_level = 'debug'
    server = Server()
    server.load_data('cn_model_v1.0.csv')
    server.remove_outliers(output_filename='cn_no_outliers.csv')
def main():
    """Evaluate a saved KV project on its test set and report errors."""
    logger.stream_level = 'debug'
    server = Server(prj_file='kinetic_viscosity.prj')
    server.use(dset='test', output_filename='../kv_test_results.csv')
    server.errors('rmse', 'mean_abs_error', 'med_abs_error', 'r2',
                  dset='test')
def test_predict(self):
    # Train a minimal 1x1 project, save it, then predict two molecules
    # NOTE(review): argument order here is (smiles, prj_file, results_file);
    # confirm it matches the signature of the predict() under test
    print('\nUNIT TEST: project.predict')
    server = Server()
    server.load_data('cn_model_v2.0.csv')
    server.create_project('test_project', 1, 1)
    server._vars['epochs'] = 100
    server.train()
    server.save_project()
    predictions = predict(['CCC', 'CCCC'], 'test_project.prj', 'results.csv')
    self.assertEqual(len(predictions), 2)
    remove('test_project.prj')
    remove('config.yml')
def test_save_project(self):
    # Saving should produce a .prj archive and remove the project dir
    print('\nUNIT TEST: Server.save_project')
    server = Server()
    server.load_data(DB_LOC, random=True, split=[0.7, 0.2, 0.1])
    server.create_project('test_project', 2, 2)
    server._vars['epochs'] = 100
    server.train()
    server.save_project()
    self.assertTrue(exists('test_project.prj'))
    self.assertFalse(isdir('test_project'))
    remove('test_project.prj')
    remove('config.yml')
def main():
    """Limit the full KV database to its 15 most important inputs."""
    logger.stream_level = 'debug'
    server = Server(num_processes=4)
    server.load_data('../kv_model_v1.0_full.csv')
    server.limit_inputs(15, output_filename='../kv_model_v1.0.csv')
def main(db_name: str):
    """Run the full model-creation workflow for the given database."""
    # Stream at info level; write full debug logs to a per-database directory
    logger.stream_level = 'info'
    logger.log_dir = db_name.replace('.csv', '') + '_logs'
    logger.file_level = 'debug'

    # 70/20/10 learn/valid/test split, proportional across the property range
    prop_range_from_split(db_name, [0.7, 0.2, 0.1])

    # Optimal descriptor count, evaluated on the train (learn + valid) set
    num_desc = len(find_optimal_num_inputs(db_name, 'train', _NUM_PROC)[1])
    logger.log('info',
               'Optimal number of input variables: {}'.format(num_desc))

    # Server with a per-database base configuration
    server = Server(model_config=db_name.replace('.csv', '.yml'),
                    num_processes=_NUM_PROC)
    server.load_data(db_name)

    # Keep only the top `num_desc` inputs; save the reduced database
    server.limit_inputs(
        num_desc,
        eval_set='train',
        output_filename=db_name.replace('.csv', '.{}.csv'.format(num_desc))
    )

    # ABC tuning: 20 employer bees, 10 cycles, valid-set MAE objective
    server.tune_hyperparameters(20, 10, eval_set='valid',
                                eval_fn='med_abs_error')

    # 5 pools x 75 candidates; best candidates chosen by valid-set MAE
    server.create_project(db_name.replace('.csv', ''), 5, 75)
    server.train(validate=True, selection_set='valid',
                 selection_fn='med_abs_error')

    # Report MAE and r-squared for each subset
    err_learn = server.errors('med_abs_error', 'r2', dset='learn')
    err_valid = server.errors('med_abs_error', 'r2', dset='valid')
    err_test = server.errors('med_abs_error', 'r2', dset='test')
    logger.log('info', 'Learning set performance: {}'.format(err_learn))
    logger.log('info', 'Validation set performance: {}'.format(err_valid))
    logger.log('info', 'Testing set performance: {}'.format(err_test))

    # Archive to a .prj file, discarding un-chosen candidates
    server.save_project(del_candidates=True)
def limit(num_processes, output_filename=None):
    """Limit the CN database to its 3 most important input variables.

    Args:
        num_processes (int): number of concurrent processes to use
        output_filename (str): if given, the limited database is saved here
    """
    logger.stream_level = 'info'
    server = Server(num_processes=num_processes)
    server.load_data('cn_model_v1.0.csv')
    server.limit_inputs(3, output_filename=output_filename)
def use_project():
    """Predict train and test subsets with a saved project, saving both."""
    server = Server(prj_file='use_project.prj')
    server.use('train', output_filename='use_project_train.csv')
    server.use('test', output_filename='use_project_test.csv')