def create_project():
    ''' create_project: builds, trains and saves the `use_project` project

    Loads `cn_model_v1.0.csv` into a fresh `Server`, creates a project with
    2 pools of 2 candidates, trains it with `Server.train`'s default
    arguments and then saves the project.
    '''
    server = Server()
    server.load_data('cn_model_v1.0.csv')
    server.create_project('use_project', num_pools=2, num_candidates=2)
    server.train()
    server.save_project()
def train(validate, dset=None):
    ''' train: loads `cn_model_v1.0.csv`, trains, then runs predictions and
    error metrics for one data set

    Args:
        validate: forwarded to `Server.train` as `validate`
        dset: forwarded to `Server.train` as `selection_set`, and to
            `Server.use` and `Server.errors` as `dset`
    '''
    # Results of `use` and `errors` are discarded; only the calls matter here
    metrics = ('rmse', 'med_abs_error', 'mean_abs_error', 'r2')
    server = Server()
    server.load_data('cn_model_v1.0.csv')
    server.train(validate=validate, selection_set=dset)
    server.use(dset=dset)
    server.errors(*metrics, dset=dset)
Example #3
0
    def test_load_data(self):
        ''' test_load_data: after `Server.load_data(DB_LOC)` the server's
        data frame has 482 entries and its sets are a `PackagedData`
        '''
        print('\nUNIT TEST: Server.load_data')
        server = Server()
        server.load_data(DB_LOC)
        expected_entries = 482
        self.assertEqual(len(server._df), expected_entries)
        self.assertEqual(type(server._sets), PackagedData)
        # Clean up the `config.yml` left in the working directory
        remove('config.yml')
def main():
    ''' main: tunes hyperparameters for `../kv_model_v1.0.csv` using 4
    processes, with debug-level stream logging
    '''
    logger.stream_level = 'debug'
    server = Server(num_processes=4)
    server.load_data('../kv_model_v1.0.csv')
    # Keyword options gathered in one place; the two positional 20s are
    # passed through unchanged
    tuning_options = {
        'shuffle': 'train',
        'split': [0.7, 0.2, 0.1],
        'eval_set': 'test',
    }
    server.tune_hyperparameters(20, 20, **tuning_options)
Example #5
0
def main(db_name: str):
    ''' main: end-to-end workflow for one database - split it, choose the
    number of input variables, tune hyperparameters, then create, train,
    evaluate and save a project

    Args:
        db_name (str): database filename; `.csv` in the name is substituted
            to derive the log directory, config file, limited-input database
            and project names
    '''

    # Logging: `info` to the stream, `debug` to file, logs in `<db>_logs`
    logger.stream_level = 'info'
    project_name = db_name.replace('.csv', '')
    logger.log_dir = project_name + '_logs'
    logger.file_level = 'debug'

    # Proportional split on property value: 70% learn, 20% validate, 10% test
    proportions = [0.7, 0.2, 0.1]
    prop_range_from_split(db_name, proportions)

    # Optimal input count is the length of the second item returned by the
    # search, which is evaluated on the Train (learn + valid) set
    search_result = find_optimal_num_inputs(db_name, 'train', _NUM_PROC)
    n_desc = len(search_result[1])
    logger.log('info', 'Optimal number of input variables: {}'.format(n_desc))

    # Server built from the database's base config, then loaded with the data
    config_name = db_name.replace('.csv', '.yml')
    server = Server(model_config=config_name, num_processes=_NUM_PROC)
    server.load_data(db_name)

    # Keep `n_desc` input variables (Train set evaluation); the reduced
    # database is written next to the original as `<db>.<n_desc>.csv`
    limited_db = db_name.replace('.csv', '.{}.csv'.format(n_desc))
    server.limit_inputs(n_desc, eval_set='train', output_filename=limited_db)

    # Hyperparameter tuning (architecture and ADAM): 20 employer bees,
    # 10 search cycles, scored by validation set median absolute error
    server.tune_hyperparameters(
        20, 10, eval_set='valid', eval_fn='med_abs_error'
    )

    # ECNet project (saved and recalled later): 5 pools, 75 trials/pool,
    # best ANNs selected from each pool
    server.create_project(project_name, 5, 75)

    # Training; best candidates chosen by validation set median absolute error
    server.train(
        validate=True,
        selection_set='valid',
        selection_fn='med_abs_error'
    )

    # Median absolute error and r-squared for each set; all three are
    # computed (in learn, valid, test order) before any of them is logged
    performance = {
        dset: server.errors('med_abs_error', 'r2', dset=dset)
        for dset in ('learn', 'valid', 'test')
    }
    logger.log('info', 'Learning set performance: {}'.format(
        performance['learn']))
    logger.log('info', 'Validation set performance: {}'.format(
        performance['valid']))
    logger.log('info', 'Testing set performance: {}'.format(
        performance['test']))

    # Save the project, creating a .prj file and removing un-chosen candidates
    server.save_project(del_candidates=True)
Example #6
0
def main():
    ''' main: creates, trains and saves the `kinetic_viscosity` project from
    `../kv_model_v1.0.csv` using 4 processes, with debug-level stream logging
    '''
    logger.stream_level = 'debug'
    server = Server(num_processes=4)
    server.load_data('../kv_model_v1.0.csv')
    # 5 pools, 25 candidates per pool
    server.create_project(
        'kinetic_viscosity', num_pools=5, num_candidates=25
    )
    training_options = {
        'shuffle': 'train',
        'split': [0.7, 0.2, 0.1],
        'validate': True,
        'selection_set': 'test',
    }
    server.train(**training_options)
    server.save_project(del_candidates=True)
Example #7
0
    def test_use_project(self):
        ''' test_use_project: after training a 2x2 project, `Server.use()`
        returns one result per entry of the loaded data frame
        '''
        print('\nUNIT TEST: Server.use')
        server = Server()
        server.load_data(DB_LOC, random=True, split=[0.7, 0.2, 0.1])
        server.create_project('test_project', 2, 2)
        # Override the epoch count to 100 for this test
        server._vars['epochs'] = 100
        server.train()
        predictions = server.use()
        self.assertEqual(len(predictions), len(server._df))
        # Clean up the config file and the project directory
        remove('config.yml')
        rmtree('test_project')
Example #8
0
    def test_server_limit(self):
        ''' test_server_limit: `Server.limit_inputs(2, ...)` leaves two input
        names and two values per learning sample, both on the live server
        and after reloading the file it wrote
        '''
        print('\nUNIT TEST: limit_rforest (Server)')
        limited_db = 'cn_limited.csv'
        server = Server()
        server.load_data(DB_LOC)

        def check_two_inputs():
            # Same pair of checks is applied before and after the reload
            self.assertEqual(len(server._df._input_names), 2)
            self.assertEqual(len(server._sets.learn_x[0]), 2)

        server.limit_inputs(2, output_filename=limited_db)
        check_two_inputs()
        server.load_data(limited_db)
        check_two_inputs()
        remove(limited_db)
        remove('config.yml')
Example #9
0
    def test_save_project(self):
        ''' test_save_project: after `Server.save_project()`,
        `test_project.prj` exists and the `test_project` directory does not
        '''
        print('\nUNIT TEST: Server.save_project')
        project_name = 'test_project'
        project_file = 'test_project.prj'
        server = Server()
        server.load_data(DB_LOC, random=True, split=[0.7, 0.2, 0.1])
        server.create_project(project_name, 2, 2)
        # Override the epoch count to 100 for this test
        server._vars['epochs'] = 100
        server.train()
        server.save_project()
        self.assertTrue(exists(project_file))
        self.assertFalse(isdir(project_name))
        remove(project_file)
        remove('config.yml')
Example #10
0
    def test_predict(self):
        ''' test_predict: `predict` called with two SMILES strings and a
        saved 1x1 project returns two results
        '''
        print('\nUNIT TEST: project.predict')
        project_file = 'test_project.prj'
        server = Server()
        server.load_data('cn_model_v2.0.csv')
        server.create_project('test_project', 1, 1)
        # Override the epoch count to 100 for this test
        server._vars['epochs'] = 100
        server.train()
        server.save_project()

        molecules = ['CCC', 'CCCC']
        # NOTE(review): 'results.csv' is passed to `predict` but never
        # removed here - confirm whether it is an output file needing cleanup
        results = predict(molecules, project_file, 'results.csv')

        self.assertEqual(len(results), 2)

        remove(project_file)
        remove('config.yml')
def tune(num_processes,
         shuffle=None,
         split=[0.7, 0.2, 0.1],
         validate=True,
         eval_set=None,
         eval_fn='rmse'):
    ''' tune: loads `cn_model_v1.0.csv` with a random split and runs a small
    hyperparameter tuning pass, with debug-level stream logging

    Args:
        num_processes: forwarded to `Server` as `num_processes`
        shuffle: forwarded to `Server.tune_hyperparameters`
        split: set proportions, used both for the random split at load time
            and for `Server.tune_hyperparameters`
        validate: forwarded to `Server.tune_hyperparameters`
        eval_set: forwarded to `Server.tune_hyperparameters`
        eval_fn: forwarded to `Server.tune_hyperparameters`
    '''

    # The default `split` list is created once and shared by every call;
    # hand a private copy downstream so nothing can alter the default (or a
    # list owned by the caller). `None` is passed through untouched.
    if split is not None:
        split = list(split)

    logger.stream_level = 'debug'
    sv = Server(num_processes=num_processes)
    sv.load_data('cn_model_v1.0.csv', random=True, split=split)
    # The two positional 2s are passed through unchanged
    sv.tune_hyperparameters(2,
                            2,
                            shuffle=shuffle,
                            split=split,
                            validate=validate,
                            eval_set=eval_set,
                            eval_fn=eval_fn)
Example #12
0
def train_project(validate,
                  shuffle,
                  split=[0.7, 0.2, 0.1],
                  num_processes=1,
                  dset=None,
                  sel_fn='rmse',
                  output_filename=None):
    ''' train_project: creates, trains, uses, scores and saves the
    `_training_test` project (2 pools, 2 candidates) on `cn_model_v1.0.csv`

    Args:
        validate: forwarded to `Server.train` as `validate`
        shuffle: forwarded to `Server.train` as `shuffle`
        split: set proportions forwarded to `Server.train` as `split`
        num_processes: forwarded to `Server` as `num_processes`
        dset: forwarded to `Server.train` as `selection_set` and to
            `Server.use` as `dset`
        sel_fn: forwarded to `Server.train` as `selection_fn`
        output_filename: forwarded to `Server.use` as `output_filename`
    '''

    # The default `split` list is created once and shared by every call;
    # hand a private copy downstream so nothing can alter the default (or a
    # list owned by the caller). `None` is passed through untouched.
    if split is not None:
        split = list(split)

    sv = Server(num_processes=num_processes)
    sv.load_data('cn_model_v1.0.csv')
    sv.create_project('_training_test', num_pools=2, num_candidates=2)
    sv.train(shuffle=shuffle,
             split=split,
             selection_set=dset,
             selection_fn=sel_fn,
             validate=validate)
    sv.use(dset=dset, output_filename=output_filename)
    # Error metrics are requested without a `dset` argument
    sv.errors('rmse', 'med_abs_error', 'mean_abs_error', 'r2')
    sv.save_project()
Example #13
0
    def test_train_project(self):
        ''' test_train_project: after training a 2-pool, 2-candidate
        project, every pool directory and every candidate directory holds a
        `model.h5` file
        '''
        print('\nUNIT TEST: Server.train')
        server = Server()
        server.load_data(DB_LOC, random=True, split=[0.7, 0.2, 0.1])
        server.create_project('test_project', 2, 2)
        # Override the epoch count to 100 for this test
        server._vars['epochs'] = 100
        server.train()
        for pool_idx in range(2):
            pool_dir = join('test_project', 'pool_{}'.format(pool_idx))
            self.assertTrue(exists(join(pool_dir, 'model.h5')))
            for cand_idx in range(2):
                cand_dir = join(pool_dir, 'candidate_{}'.format(cand_idx))
                self.assertTrue(exists(join(cand_dir, 'model.h5')))
        # Clean up the config file and the project directory
        remove('config.yml')
        rmtree('test_project')
Example #14
0
    def test_multiprocessing_train(self):
        ''' test_multiprocessing_train: with `num_processes=8`, training a
        2-pool, 4-candidate project leaves a `model.h5` file in every pool
        directory and every candidate directory
        '''
        print('\nUNIT TEST: multiprocessing training')
        server = Server(num_processes=8)
        server.load_data(DB_LOC)
        server.create_project('test_project', 2, 4)
        # Override the epoch count to 100 for this test
        server._vars['epochs'] = 100
        server.train()
        for pool_idx in range(2):
            pool_dir = join('test_project', 'pool_{}'.format(pool_idx))
            self.assertTrue(exists(join(pool_dir, 'model.h5')))
            for cand_idx in range(4):
                cand_dir = join(pool_dir, 'candidate_{}'.format(cand_idx))
                self.assertTrue(exists(join(cand_dir, 'model.h5')))
        # Clean up the config file and the project directory
        remove('config.yml')
        rmtree('test_project')
Example #15
0
def retrain():
    ''' retrain: opens the saved `_training_test.prj` project, retrains it
    on `cn_model_v1.0.csv` and saves it again with `del_candidates=True`
    '''
    server = Server(prj_file='_training_test.prj')
    server.load_data('cn_model_v1.0.csv')
    server.train(retrain=True)
    server.save_project(del_candidates=True)
Example #16
0
def create_model(prop_abvr: str, smiles: list = None, targets: list = None,
                 db_name: str = None, qspr_backend: str = 'padel',
                 create_plots: bool = True, data_split: list = [0.7, 0.2, 0.1],
                 log_level: str = 'info', log_to_file: bool = True,
                 num_processes: int = 1):
    ''' create_model: ECRL's database/model creation workflow for all
    publications

    Args:
        prop_abvr (str): abbreviation for the property name (e.g. CN)
        smiles (list): if supplied with targets, creates a new database
        targets (list): if supplied with smiles, creates a new database
        db_name (str): you may supply an existing ECNet-formatted database
        qspr_backend (str): if creating new database, generation software to
            use (`padel`, `alvadesc`)
        create_plots (bool): if True, creates plots for median absolute error
            vs. number of descriptors as inputs, parity plot for all sets
        data_split (list): [learn %, valid %, test %] for all supplied data;
            used for the initial split and passed as `split` to
            hyperparameter tuning and to training
        log_level (str): `debug`, `info`, `warn`, `error`, `crit`
        log_to_file (bool): if True, saves workflow logs to a file in `logs`
            directory
        num_processes (int): number of concurrent processes to use for various
            tasks

    Raises:
        ValueError: if `db_name` is None and `smiles` or `targets` is None
    '''

    # The default `data_split` list is created once and shared by every
    # call; work on a private copy so downstream calls cannot alter the
    # default (or a list owned by the caller)
    if data_split is not None:
        data_split = list(data_split)

    # Initialize logging
    logger.stream_level = log_level
    if log_to_file:
        logger.file_level = log_level

    # If database not supplied, create database from supplied SMILES, targets
    if db_name is None:
        if smiles is None or targets is None:
            raise ValueError('Must supply SMILES and target values')
        # Format the date on its own so a `%` in `prop_abvr` is never
        # interpreted as a strftime directive
        db_name = '{}_model_{}.csv'.format(
            prop_abvr, datetime.now().strftime('%Y%m%d')
        )
        logger.log('info', 'Creating database {}...'.format(db_name),
                   'WORKFLOW')
        create_db(smiles, db_name, targets, prop_abvr, backend=qspr_backend)
        logger.log('info', 'Created database {}'.format(db_name), 'WORKFLOW')

    # Filenames derived from the database name, computed once and reused
    opt_db_name = db_name.replace('.csv', '_opt.csv')
    config_filename = db_name.replace('.csv', '.yml')
    project_name = db_name.replace('.csv', '')

    # Create database split, each subset has proportionally equal number of
    #   compounds based on range of experimental/target values
    logger.log('info', 'Creating optimal data split...', 'WORKFLOW')
    prop_range_from_split(db_name, data_split)
    logger.log('info', 'Created optimal data split', 'WORKFLOW')
    df = DataFrame(db_name)
    df.create_sets()
    logger.log('info', '\tLearning set: {}'.format(len(df.learn_set)),
               'WORKFLOW')
    logger.log('info', '\tValidation set: {}'.format(len(df.valid_set)),
               'WORKFLOW')
    logger.log('info', '\tTest set: {}'.format(len(df.test_set)), 'WORKFLOW')

    # Find optimal number of QSPR input variables
    logger.log('info', 'Finding optimal number of inputs...', 'WORKFLOW')
    errors, desc = find_optimal_num_inputs(db_name, 'valid', num_processes)
    # Reload the database, restrict it to the chosen inputs and save it as
    # the `_opt` database used for model creation below
    df = DataFrame(db_name)
    df.set_inputs(desc)
    df.save(opt_db_name)
    logger.log('info', 'Found optimal number of inputs', 'WORKFLOW')
    logger.log('info', '\tNumber of inputs: {}'.format(len(df._input_names)),
               'WORKFLOW')

    # Plot the curve of MAE vs. num. desc. added, if desired
    if create_plots:
        logger.log('info', 'Creating plot of MAE vs. descriptors...',
                   'WORKFLOW')
        num_add = [e[0] for e in errors]
        maes = [e[1] for e in errors]
        opt_num = len(desc)
        plt.clf()
        plt.rcParams['font.family'] = 'Times New Roman'
        plt.plot(num_add, maes, c='blue')
        # Dashed vertical line marks the chosen number of descriptors
        plt.axvline(x=opt_num, c='red', linestyle='--')
        plt.xlabel('Number of Descriptors as ANN Input Variables')
        plt.ylabel('Median Absolute Error of {} Predictions'.format(prop_abvr))
        plt.savefig(db_name.replace('.csv', '_desc_curve.png'))
        logger.log('info', 'Created plot of MAE vs. descriptors', 'WORKFLOW')

    # Tune ANN hyperparameters according to validation set performance
    logger.log('info', 'Tuning ANN hyperparameters...', 'WORKFLOW')
    config = default_config()
    # `data_split` (not a hard-coded list) keeps re-shuffles consistent with
    # the proportions used for the initial split
    config = tune_hyperparameters(df, config, 25, 10, num_processes,
                                  shuffle='train', split=data_split,
                                  validate=True, eval_set='valid',
                                  eval_fn='med_abs_error', epochs=300)
    # Tuning ran with epochs=300; restore the default epoch count before
    # saving the configuration
    config['epochs'] = default_config()['epochs']
    save_config(config, config_filename)
    logger.log('info', 'Tuned ANN hyperparameters', 'WORKFLOW')
    logger.log('info', '\tLearning rate: {}'.format(config['learning_rate']),
               'WORKFLOW')
    logger.log('info', '\tLR decay: {}'.format(config['decay']), 'WORKFLOW')
    logger.log('info', '\tBatch size: {}'.format(config['batch_size']),
               'WORKFLOW')
    logger.log('info', '\tPatience: {}'.format(config['patience']), 'WORKFLOW')
    logger.log('info', '\tHidden layers: {}'.format(config['hidden_layers']),
               'WORKFLOW')

    # Create Model
    logger.log('info', 'Generating ANN...', 'WORKFLOW')
    sv = Server(config_filename, num_processes=num_processes)
    sv.load_data(opt_db_name)
    sv.create_project(project_name, 5, 75)
    sv.train(validate=True, selection_set='valid', shuffle='train',
             split=data_split, selection_fn='med_abs_error')
    logger.log('info', 'ANN Generated', 'WORKFLOW')
    logger.log('info', 'Measuring ANN performance...', 'WORKFLOW')
    preds_test = sv.use(dset='test')
    preds_train = sv.use(dset='train')
    test_errors = sv.errors('r2', 'med_abs_error', dset='test')
    train_errors = sv.errors('r2', 'med_abs_error', dset='train')
    logger.log('info', 'Measured ANN performance', 'WORKFLOW')
    logger.log('info', '\tTraining set:\t R2: {}\t MAE: {}'.format(
        train_errors['r2'], train_errors['med_abs_error']), 'WORKFLOW')
    logger.log('info', '\tTesting set:\t R2: {}\t MAE: {}'.format(
        test_errors['r2'], test_errors['med_abs_error']), 'WORKFLOW')
    sv.save_project(del_candidates=True)

    if create_plots:
        logger.log('info', 'Creating parity plot...', 'WORKFLOW')
        plt.clf()
        parity_plot = ParityPlot(
            '',
            'Experimental {} Value'.format(prop_abvr),
            'Predicted {} Value'.format(prop_abvr)
        )
        # Training series pairs the learn + valid targets with the `train`
        # predictions
        parity_plot.add_series(concatenate(
            (sv._sets.learn_y, sv._sets.valid_y)
        ), preds_train, 'Training Set', 'blue')
        parity_plot.add_series(sv._sets.test_y, preds_test, 'Test Set', 'red')
        parity_plot.add_error_bars(test_errors['med_abs_error'], 'Test MAE')
        parity_plot._add_label('Test $R^2$', test_errors['r2'])
        parity_plot._add_label('Training MAE', train_errors['med_abs_error'])
        parity_plot._add_label('Training $R^2$', train_errors['r2'])
        parity_plot.save(db_name.replace('.csv', '_parity.png'))
        logger.log('info', 'Created parity plot', 'WORKFLOW')
def main():
    ''' main: loads `cn_model_v1.0.csv` and calls `Server.remove_outliers`,
    with `cn_no_outliers.csv` as the output filename and debug-level stream
    logging
    '''
    logger.stream_level = 'debug'
    server = Server()
    server.load_data('cn_model_v1.0.csv')
    server.remove_outliers(output_filename='cn_no_outliers.csv')
def limit(num_processes, output_filename=None):
    ''' limit: loads `cn_model_v1.0.csv` and limits it to 3 input variables,
    with info-level stream logging

    Args:
        num_processes: forwarded to `Server` as `num_processes`
        output_filename: forwarded to `Server.limit_inputs`
    '''
    logger.stream_level = 'info'
    num_inputs = 3
    server = Server(num_processes=num_processes)
    server.load_data('cn_model_v1.0.csv')
    server.limit_inputs(num_inputs, output_filename=output_filename)
def main():
    ''' main: limits `../kv_model_v1.0_full.csv` to 15 input variables using
    4 processes, with `../kv_model_v1.0.csv` as the output filename and
    debug-level stream logging
    '''
    logger.stream_level = 'debug'
    source_db = '../kv_model_v1.0_full.csv'
    limited_db = '../kv_model_v1.0.csv'
    server = Server(num_processes=4)
    server.load_data(source_db)
    server.limit_inputs(15, output_filename=limited_db)