def train(validate, dset=None):
    """Train a default Server on the CN database, then predict and score.

    Args:
        validate: forwarded to `Server.train` as `validate`
        dset: forwarded as `selection_set` to `Server.train` and as `dset`
            to `Server.use` and `Server.errors`

    The values returned by `use` and `errors` are discarded; this function
    only exercises the calls.
    """
    metrics = ('rmse', 'med_abs_error', 'mean_abs_error', 'r2')

    server = Server()
    server.load_data('cn_model_v1.0.csv')
    server.train(validate=validate, selection_set=dset)
    server.use(dset=dset)
    server.errors(*metrics, dset=dset)
def main():
    """Build a parity plot for the saved kinematic viscosity project.

    Opens 'kinetic_viscosity.prj', collects experimental and predicted
    values for the training (learn + valid) and test sets, and saves a
    predicted-vs-experimental plot to '../kv_parity_plot.png'.
    """
    logger.stream_level = 'info'
    server = Server(prj_file='kinetic_viscosity.prj')

    # Training data = learning targets followed by validation targets
    train_exp = list(server._sets.learn_y) + list(server._sets.valid_y)
    train_pred = server.use(dset='train')
    train_errors = server.errors('rmse', 'r2', dset='train')

    test_exp = server._sets.test_y
    test_pred = server.use(dset='test')
    test_errors = server.errors('rmse', 'r2', dset='test')

    plot = ParityPlot(
        title='Predicted vs. Experimental Kinematic Viscosity',
        x_label='Experimental KV',
        y_label='Predicted KV'
    )
    plot.add_series(train_exp, train_pred, name='Training Set', color='blue')
    plot.add_series(test_exp, test_pred, name='Test Set', color='red')
    plot.add_error_bars(test_errors['rmse'], label='Test RMSE')

    # Remaining statistics are attached as text labels, in this order
    annotations = (
        ('Test R-Squared', test_errors['r2']),
        ('Train RMSE', train_errors['rmse']),
        ('Train R-Squared', train_errors['r2']),
    )
    for caption, value in annotations:
        plot._add_label(caption, value)

    plot.save('../kv_parity_plot.png')
def main(db_name: str):
    """Run the complete model-building pipeline for one database.

    Steps, in order: configure logging, split the database, find the
    optimal number of input variables, limit inputs, tune hyperparameters,
    create and train a project, report errors, and save the project.

    Args:
        db_name (str): database file name; every derived file/directory
            name is produced by replacing '.csv' in this string
    """
    # Set up logging
    logger.stream_level = 'info'
    project_name = db_name.replace('.csv', '')
    logger.log_dir = project_name + '_logs'
    logger.file_level = 'debug'

    # Proportional split on property value: 70% learn, 20% validate, 10% test
    prop_range_from_split(db_name, [0.7, 0.2, 0.1])

    # Optimal number of input variables, evaluated on the train
    # (learn + valid) set; the second returned item is measured for length
    search_result = find_optimal_num_inputs(db_name, 'train', _NUM_PROC)
    n_desc = len(search_result[1])
    logger.log('info', 'Optimal number of input variables: {}'.format(n_desc))

    # Server with the base config, then load the data
    config_file = db_name.replace('.csv', '.yml')
    sv = Server(model_config=config_file, num_processes=_NUM_PROC)
    sv.load_data(db_name)

    # Keep only `n_desc` input variables (train set used for evaluation) and
    # write the reduced database next to the original
    limited_db = db_name.replace('.csv', '.{}.csv'.format(n_desc))
    sv.limit_inputs(n_desc, eval_set='train', output_filename=limited_db)

    # Hyperparameter tuning (architecture and ADAM): 20 employer bees,
    # 10 search cycles, scored by validation set median absolute error
    sv.tune_hyperparameters(20, 10, eval_set='valid', eval_fn='med_abs_error')

    # ECNet project: 5 pools, 75 trials per pool
    sv.create_project(project_name, 5, 75)

    # Train; best candidates chosen by validation set median absolute error
    sv.train(validate=True, selection_set='valid',
             selection_fn='med_abs_error')

    # Median absolute error and r-squared for each set; all three are
    # computed before any of them is logged
    reports = [
        ('Learning set performance: {}',
         sv.errors('med_abs_error', 'r2', dset='learn')),
        ('Validation set performance: {}',
         sv.errors('med_abs_error', 'r2', dset='valid')),
        ('Testing set performance: {}',
         sv.errors('med_abs_error', 'r2', dset='test')),
    ]
    for template, result in reports:
        logger.log('info', template.format(result))

    # Save the project (.prj file), removing un-chosen candidates
    sv.save_project(del_candidates=True)
def train_project(validate, shuffle, split=[0.7, 0.2, 0.1], num_processes=1,
                  dset=None, sel_fn='rmse', output_filename=None):
    """Create, train, evaluate and save a small '_training_test' project.

    Args:
        validate: forwarded to `Server.train` as `validate`
        shuffle: forwarded to `Server.train` as `shuffle`
        split: forwarded to `Server.train` as `split`
        num_processes: forwarded to the `Server` constructor
        dset: used as `selection_set` for training and `dset` for `use`
        sel_fn: forwarded to `Server.train` as `selection_fn`
        output_filename: forwarded to `Server.use`

    Return values of `use` and `errors` are discarded.
    """
    server = Server(num_processes=num_processes)
    server.load_data('cn_model_v1.0.csv')
    server.create_project('_training_test', num_pools=2, num_candidates=2)

    server.train(
        shuffle=shuffle,
        split=split,
        selection_set=dset,
        selection_fn=sel_fn,
        validate=validate
    )
    server.use(dset=dset, output_filename=output_filename)

    # Note: errors are requested without a `dset` argument
    metrics = ('rmse', 'med_abs_error', 'mean_abs_error', 'r2')
    server.errors(*metrics)

    server.save_project()
def create_model(prop_abvr: str, smiles: list = None, targets: list = None,
                 db_name: str = None, qspr_backend: str = 'padel',
                 create_plots: bool = True,
                 data_split: list = [0.7, 0.2, 0.1], log_level: str = 'info',
                 log_to_file: bool = True, num_processes: int = 1):
    ''' create_model: database/model creation workflow; optionally builds a
    database, splits it, selects input variables, tunes hyperparameters,
    trains and saves a project, and optionally writes plots

    Args:
        prop_abvr (str): abbreviation for the property name (e.g. CN); used
            in generated file names and axis labels
        smiles (list): with `targets`, used to create a new database when
            `db_name` is not given
        targets (list): with `smiles`, used to create a new database when
            `db_name` is not given
        db_name (str): existing ECNet-formatted database; if None, a name
            of the form `<prop_abvr>_model_<YYYYMMDD>.csv` is generated
        qspr_backend (str): backend passed to `create_db` when creating a
            new database (`padel`, `alvadesc`)
        create_plots (bool): if True, saves the error-vs-number-of-
            descriptors curve and a parity plot
        data_split (list): [learn %, valid %, test %] passed to
            `prop_range_from_split`
        log_level (str): `debug`, `info`, `warn`, `error`, `crit`
        log_to_file (bool): if True, the logger's file level is also set to
            `log_level`
        num_processes (int): number of concurrent processes handed to input
            selection, tuning and the Server

    Raises:
        ValueError: `db_name` is None and `smiles` or `targets` is None
    '''

    def _info(message):
        # Every workflow message shares the same level and call location
        logger.log('info', message, 'WORKFLOW')

    # Initialize logging
    logger.stream_level = log_level
    if log_to_file:
        logger.file_level = log_level

    # No database supplied: build one from the supplied SMILES and targets
    if db_name is None:
        if smiles is None or targets is None:
            raise ValueError('Must supply SMILES and target values')
        name_pattern = '{}_model_%Y%m%d.csv'.format(prop_abvr)
        db_name = datetime.now().strftime(name_pattern)
        _info('Creating database {}...'.format(db_name))
        create_db(smiles, db_name, targets, prop_abvr, backend=qspr_backend)
        _info('Created database {}'.format(db_name))

    # Names of every file derived from the (now final) database name
    opt_db_name = db_name.replace('.csv', '_opt.csv')
    curve_filename = db_name.replace('.csv', '_desc_curve.png')
    config_filename = db_name.replace('.csv', '.yml')
    project_name = db_name.replace('.csv', '')
    parity_filename = db_name.replace('.csv', '_parity.png')

    # Split the database so each subset covers the range of target values
    # proportionally
    _info('Creating optimal data split...')
    prop_range_from_split(db_name, data_split)
    _info('Created optimal data split')
    df = DataFrame(db_name)
    df.create_sets()
    _info('\tLearning set: {}'.format(len(df.learn_set)))
    _info('\tValidation set: {}'.format(len(df.valid_set)))
    _info('\tTest set: {}'.format(len(df.test_set)))

    # Find optimal number of QSPR input variables, save reduced database
    _info('Finding optimal number of inputs...')
    errors, desc = find_optimal_num_inputs(db_name, 'valid', num_processes)
    df = DataFrame(db_name)
    df.set_inputs(desc)
    df.save(opt_db_name)
    _info('Found optimal number of inputs')
    _info('\tNumber of inputs: {}'.format(len(df._input_names)))

    # Curve of MAE vs. number of descriptors added, if desired
    if create_plots:
        _info('Creating plot of MAE vs. descriptors...')
        n_added = [entry[0] for entry in errors]
        mae_values = [entry[1] for entry in errors]
        n_optimal = len(desc)
        plt.clf()
        plt.rcParams['font.family'] = 'Times New Roman'
        plt.plot(n_added, mae_values, c='blue')
        plt.axvline(x=n_optimal, c='red', linestyle='--')
        plt.xlabel('Number of Descriptors as ANN Input Variables')
        plt.ylabel('Median Absolute Error of {} Predictions'.format(prop_abvr))
        plt.savefig(curve_filename)
        _info('Created plot of MAE vs. descriptors')

    # Tune ANN hyperparameters according to validation set performance
    _info('Tuning ANN hyperparameters...')
    config = default_config()
    config = tune_hyperparameters(
        df, config, 25, 10, num_processes,
        shuffle='train', split=[0.7, 0.2, 0.1], validate=True,
        eval_set='valid', eval_fn='med_abs_error', epochs=300
    )
    # Tuning ran with epochs=300; restore the default epoch count for the
    # saved configuration
    config['epochs'] = default_config()['epochs']
    save_config(config, config_filename)
    _info('Tuned ANN hyperparameters')
    tuned_report = (
        ('\tLearning rate: {}', 'learning_rate'),
        ('\tLR decay: {}', 'decay'),
        ('\tBatch size: {}', 'batch_size'),
        ('\tPatience: {}', 'patience'),
        ('\tHidden layers: {}', 'hidden_layers'),
    )
    for template, key in tuned_report:
        _info(template.format(config[key]))

    # Create model
    _info('Generating ANN...')
    sv = Server(config_filename, num_processes=num_processes)
    sv.load_data(opt_db_name)
    sv.create_project(project_name, 5, 75)
    sv.train(validate=True, selection_set='valid', shuffle='train',
             split=[0.7, 0.2, 0.1], selection_fn='med_abs_error')
    _info('ANN Generated')

    # Measure performance on the test and training sets
    _info('Measuring ANN performance...')
    preds_test = sv.use(dset='test')
    preds_train = sv.use(dset='train')
    test_errors = sv.errors('r2', 'med_abs_error', dset='test')
    train_errors = sv.errors('r2', 'med_abs_error', dset='train')
    _info('Measured ANN performance')
    _info('\tTraining set:\t R2: {}\t MAE: {}'.format(
        train_errors['r2'], train_errors['med_abs_error']
    ))
    _info('\tTesting set:\t R2: {}\t MAE: {}'.format(
        test_errors['r2'], test_errors['med_abs_error']
    ))
    sv.save_project(del_candidates=True)

    # Parity plot for all sets, if desired
    if create_plots:
        _info('Creating parity plot...')
        plt.clf()
        parity_plot = ParityPlot(
            '',
            'Experimental {} Value'.format(prop_abvr),
            'Predicted {} Value'.format(prop_abvr)
        )
        train_exp = concatenate((sv._sets.learn_y, sv._sets.valid_y))
        parity_plot.add_series(train_exp, preds_train, 'Training Set', 'blue')
        parity_plot.add_series(sv._sets.test_y, preds_test, 'Test Set', 'red')
        parity_plot.add_error_bars(test_errors['med_abs_error'], 'Test MAE')
        parity_plot._add_label('Test $R^2$', test_errors['r2'])
        parity_plot._add_label('Training MAE', train_errors['med_abs_error'])
        parity_plot._add_label('Training $R^2$', train_errors['r2'])
        parity_plot.save(parity_filename)
        _info('Created parity plot')
def main():
    """Evaluate the saved kinematic viscosity project on its test set.

    Opens 'kinetic_viscosity.prj', writes test-set predictions to
    '../kv_test_results.csv', and requests the test-set error metrics.
    The return values of both calls are discarded.
    """
    logger.stream_level = 'debug'

    results_path = '../kv_test_results.csv'
    metrics = ('rmse', 'mean_abs_error', 'med_abs_error', 'r2')

    server = Server(prj_file='kinetic_viscosity.prj')
    server.use(dset='test', output_filename=results_path)
    server.errors(*metrics, dset='test')