Example #1
def tune_fitness_function(params, **kwargs):
    '''Fitness function used by the artificial bee colony (ABC)
    hyperparameter tuner

    Args:
        params (list): hyperparameter values supplied by a bee, ordered
            [beta_1, beta_2, decay, epsilon, learning_rate, hidden layer
            sizes...]
        kwargs (dict): additional arguments

    Returns:
        float: error of NN with supplied hyperparams
    '''

    vars = default_config()
    vars['beta_1'] = params[0]
    vars['beta_2'] = params[1]
    vars['decay'] = params[2]
    vars['epsilon'] = params[3]
    vars['learning_rate'] = params[4]
    vars['hidden_layers'] = kwargs['hidden_layers']
    for l_idx in range(len(vars['hidden_layers'])):
        vars['hidden_layers'][l_idx][0] = params[5 + l_idx]

    df = kwargs['df']
    if kwargs['shuffle'] is not None:
        df.shuffle(kwargs['shuffle'], kwargs['split'])
    sets = df.package_sets()

    return train_model(sets, vars, kwargs['eval_set'], kwargs['eval_fn'],
                       validate=kwargs['validate'], save=False)
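
For context, a minimal sketch of how this fitness function might be invoked directly; the database path, the DataFrame import path, and the hyperparameter values are illustrative assumptions, not taken from the snippet above:

from ecnet.utils.data_utils import DataFrame  # assumed import path

df = DataFrame('my_data.csv')  # hypothetical ECNet-formatted database
df.create_sets(random=True)
error = tune_fitness_function(
    [0.9, 0.999, 0.0, 1e-8, 0.01, 32, 32],  # beta_1, beta_2, decay,
                                            # epsilon, learning_rate,
                                            # hidden layer sizes
    df=df,
    hidden_layers=[[32, 'relu'], [32, 'relu']],
    shuffle=None,
    split=[0.7, 0.2, 0.1],
    eval_set='valid',
    eval_fn='rmse',
    validate=True
)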
Example #2
    def test_init(self):

        print('\nUNIT TEST: Server init')
        sv = Server()
        self.assertTrue(exists('config.yml'))
        self.assertEqual(sv._vars, default_config())
        remove('config.yml')
Example #3
    def test_use_model(self):

        print('\nUNIT TEST: use_model')
        df = data_utils.DataFrame(DB_LOC)
        df.create_sets(random=True)
        pd = df.package_sets()
        config = server_utils.default_config()
        config['epochs'] = 100
        _ = server_utils.train_model(pd,
                                     config,
                                     'test',
                                     'rmse',
                                     filename='test_use.h5')
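
        # use_model returns one prediction per entry in the requested
        # subset; 'train' spans learn + valid, and None spans all subsets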
        self.assertEqual(
            len(server_utils.use_model(pd, 'learn', 'test_use.h5')),
            len(pd.learn_y))
        self.assertEqual(
            len(server_utils.use_model(pd, 'valid', 'test_use.h5')),
            len(pd.valid_y))
        self.assertEqual(
            len(server_utils.use_model(pd, 'test', 'test_use.h5')),
            len(pd.test_y))
        self.assertEqual(
            len(server_utils.use_model(pd, 'train', 'test_use.h5')),
            len(pd.learn_y) + len(pd.valid_y))
        self.assertEqual(len(server_utils.use_model(pd, None, 'test_use.h5')),
                         len(pd.learn_y) + len(pd.valid_y) + len(pd.test_y))
        remove('test_use.h5')
Example #4
    def test_check_config(self):

        print('\nUNIT TEST: check_config')
        dc = server_utils.default_config()
        del dc['batch_size']
        self.assertNotIn('batch_size', dc)
        dc = server_utils.check_config(dc)
        self.assertIn('batch_size', dc)
        self.assertEqual(dc['batch_size'], 32)
Example #5
def optimize_ecnet(param_dict, args):
    '''Fitness function for hyperparameter tuning: builds a config from
    the supplied parameter objects (each exposing `.value`) and returns
    the trained model's test-set RMSE
    '''

    vars = default_config()
    vars['beta_1'] = param_dict['beta_1'].value
    vars['beta_2'] = param_dict['beta_2'].value
    vars['epsilon'] = param_dict['epsilon'].value
    vars['learning_rate'] = param_dict['learning_rate'].value
    vars['decay'] = param_dict['decay'].value
    vars['hidden_layers'][0][0] = param_dict['hidden_1'].value
    vars['hidden_layers'][1][0] = param_dict['hidden_2'].value

    dataframe = args['dataframe']
    sets = dataframe.package_sets()
    return train_model(sets, vars, 'test', 'rmse', validate=True, save=False)
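
To illustrate the parameter-object pattern, a minimal stand-in built on a namedtuple; the real tuner supplies its own objects exposing .value, so the Param class here is purely hypothetical:

from collections import namedtuple

# Hypothetical stand-in for the tuner's parameter objects
Param = namedtuple('Param', ['value'])

param_dict = {
    'beta_1': Param(0.9), 'beta_2': Param(0.999), 'epsilon': Param(1e-8),
    'learning_rate': Param(0.01), 'decay': Param(0.0),
    'hidden_1': Param(32), 'hidden_2': Param(32)
}
# df: an ECNet DataFrame with sets already created
error = optimize_ecnet(param_dict, {'dataframe': df})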
Example #6
    def test_train_model(self):

        print('\nUNIT TEST: train_model')
        df = data_utils.DataFrame(DB_LOC)
        df.create_sets(random=True)
        pd = df.package_sets()
        config = server_utils.default_config()
        config['epochs'] = 100
        _ = server_utils.train_model(pd,
                                     config,
                                     'test',
                                     'r2',
                                     filename='test_train.h5')
        self.assertTrue(exists('test_train.h5'))
        remove('test_train.h5')
Example #7
    def __init__(self,
                 model_config: str = 'config.yml',
                 prj_file: str = None,
                 num_processes: int = 1):
        '''Server object: handles data loading, model creation, data-to-model
        hand-off, data input parameter selection, hyperparameter tuning

        Args:
            model_config (str): path to multilayer perceptron .yml config file;
                if not found, default config is generated
            prj_file (str): path to pre-existing ECNet .prj file, if using for
                retraining/new predictions
            num_processes (int): number of parallel processes to utilize for
                training and tuning processes
        '''

        logger.log('debug',
                   'Arguments:\n\t| model_config:\t\t{}\n\t|'
                   ' prj_file:\t\t{}\n\t| num_processes:\t{}'.format(
                       model_config, prj_file, num_processes),
                   call_loc='INIT')

        self._num_processes = num_processes

        if prj_file is not None:
            self._prj_name, self._num_pools, self._num_candidates, self._df,\
                self._cf_file, self._vars = open_project(prj_file)
            check_config(self._vars)
            self._sets = self._df.package_sets()
            logger.log('info',
                       'Opened project {}'.format(prj_file),
                       call_loc='INIT')
            return

        self._cf_file = model_config
        self._prj_name = None

        self._vars = {}
        try:
            self._vars.update(open_config(self._cf_file))
            check_config(self._vars)
        except FileNotFoundError:
            logger.log(
                'warn',
                '{} not found, generating default config'.format(model_config),
                call_loc='INIT')
            self._vars = default_config()
            save_config(self._vars, self._cf_file)
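
A minimal usage sketch for this constructor; the top-level import and the project filename are assumptions:

from ecnet import Server  # assumed top-level import

# No config.yml on disk: a default config is generated and saved
sv = Server(model_config='config.yml', num_processes=4)

# Re-open an existing project for retraining or new predictions
sv_prj = Server(prj_file='my_project.prj')  # hypothetical .prj path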
Example #8
    def test_default_config(self):

        print('\nUNIT TEST: default_config')
        dc = server_utils.default_config()
        self.assertEqual(
            dc, {
                'epochs': 3000,
                'learning_rate': 0.01,
                'beta_1': 0.9,
                'beta_2': 0.999,
                'epsilon': 1e-8,
                'decay': 0.0,
                'hidden_layers': [[32, 'relu'], [32, 'relu']],
                'output_activation': 'linear',
                'batch_size': 32,
                'patience': 128
            })
Example #9
    def test_open_save_config(self):

        print('\nUNIT TEST: open/save config')
        config = server_utils.default_config()
        server_utils.save_config(config, 'config.yml')
        config = server_utils.open_config('config.yml')
        self.assertEqual(
            config, {
                'epochs': 3000,
                'learning_rate': 0.01,
                'beta_1': 0.9,
                'beta_2': 0.999,
                'epsilon': 1e-8,
                'decay': 0.0,
                'hidden_layers': [[32, 'relu'], [32, 'relu']],
                'output_activation': 'linear',
                'batch_size': 32,
                'patience': 128
            })
        remove('config.yml')
Example #10
    def test_th_multiprocess(self):

        print('\nUNIT TEST: tune_hyperparameters multiprocessed')
        df = DataFrame(DB_LOC)
        df.create_sets(random=True)
        config = default_config()
        new_hp = tune_hyperparameters(df, config, 2, 1, 2, epochs=100)
        self.assertGreaterEqual(new_hp['beta_1'], 0)
        self.assertLessEqual(new_hp['beta_1'], 1)
        self.assertGreaterEqual(new_hp['beta_2'], 0)
        self.assertLessEqual(new_hp['beta_2'], 1)
        self.assertGreaterEqual(new_hp['decay'], 0)
        self.assertLessEqual(new_hp['decay'], 1)
        self.assertGreaterEqual(new_hp['epsilon'], 0)
        self.assertLessEqual(new_hp['epsilon'], 1)
        self.assertGreaterEqual(new_hp['learning_rate'], 0)
        self.assertLessEqual(new_hp['learning_rate'], 1)
        self.assertGreaterEqual(new_hp['batch_size'], 1)
        self.assertLessEqual(new_hp['batch_size'], len(df.learn_set))
        self.assertGreaterEqual(new_hp['hidden_layers'][0][0], 1)
        self.assertLessEqual(new_hp['hidden_layers'][0][0], 600)
        self.assertGreaterEqual(new_hp['hidden_layers'][1][0], 1)
        self.assertLessEqual(new_hp['hidden_layers'][1][0], 600)
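
For readability, the same call with its positional arguments spelled out as keywords; num_employers and num_iterations are assumptions about the tuner's signature, not confirmed by this snippet:

# Assumed keyword names for the positional arguments used above
new_hp = tune_hyperparameters(df, config, num_employers=2,
                              num_iterations=1, num_processes=2,
                              epochs=100)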
Example #11
def create_model(prop_abvr: str, smiles: list = None, targets: list = None,
                 db_name: str = None, qspr_backend: str = 'padel',
                 create_plots: bool = True, data_split: list = [0.7, 0.2, 0.1],
                 log_level: str = 'info', log_to_file: bool = True,
                 num_processes: int = 1):
    ''' create_model: ECRL's database/model creation workflow, as used
    across its publications

    Args:
        prop_abvr (str): abbreviation for the property name (e.g. CN)
        smiles (list): if supplied with targets, creates a new database
        targets (list): if supplied with smiles, creates a new database
        db_name (str): path to an existing ECNet-formatted database; if
            None, one is created from the supplied SMILES/targets
        qspr_backend (str): if creating a new database, descriptor
            generation software to use (`padel`, `alvadesc`)
        create_plots (bool): if True, creates plots for median absolute error
            vs. number of descriptors as inputs, parity plot for all sets
        data_split (list): [learn %, valid %, test %] for all supplied data
        log_level (str): `debug`, `info`, `warn`, `error`, `crit`
        log_to_file (bool): if True, saves workflow logs to a file in `logs`
            directory
        num_processes (int): number of concurrent processes to use for various
            tasks
    '''

    # Initialize logging
    logger.stream_level = log_level
    if log_to_file:
        logger.file_level = log_level

    # If database not supplied, create database from supplied SMILES, targets
    if db_name is None:
        if smiles is None or targets is None:
            raise ValueError('Must supply SMILES and target values')
        db_name = datetime.now().strftime('{}_model_%Y%m%d.csv'.format(
            prop_abvr
        ))
        logger.log('info', 'Creating database {}...'.format(db_name),
                   'WORKFLOW')
        create_db(smiles, db_name, targets, prop_abvr, backend=qspr_backend)
        logger.log('info', 'Created database {}'.format(db_name), 'WORKFLOW')

    # Create database split, each subset has proportionally equal number of
    #   compounds based on range of experimental/target values
    logger.log('info', 'Creating optimal data split...', 'WORKFLOW')
    prop_range_from_split(db_name, data_split)
    logger.log('info', 'Created optimal data split', 'WORKFLOW')
    df = DataFrame(db_name)
    df.create_sets()
    logger.log('info', '\tLearning set: {}'.format(len(df.learn_set)),
               'WORKFLOW')
    logger.log('info', '\tValidation set: {}'.format(len(df.valid_set)),
               'WORKFLOW')
    logger.log('info', '\tTest set: {}'.format(len(df.test_set)), 'WORKFLOW')

    # Find optimal number of QSPR input variables
    logger.log('info', 'Finding optimal number of inputs...', 'WORKFLOW')
    errors, desc = find_optimal_num_inputs(db_name, 'valid', num_processes)
    df = DataFrame(db_name)
    df.set_inputs(desc)
    df.save(db_name.replace('.csv', '_opt.csv'))
    logger.log('info', 'Found optimal number of inputs', 'WORKFLOW')
    logger.log('info', '\tNumber of inputs: {}'.format(len(df._input_names)),
               'WORKFLOW')

    # Plot the curve of MAE vs. num. desc. added, if desired
    if create_plots:
        logger.log('info', 'Creating plot of MAE vs. descriptors...',
                   'WORKFLOW')
        num_add = [e[0] for e in errors]
        maes = [e[1] for e in errors]
        opt_num = len(desc)
        plt.clf()
        plt.rcParams['font.family'] = 'Times New Roman'
        plt.plot(num_add, maes, c='blue')
        plt.axvline(x=opt_num, c='red', linestyle='--')
        plt.xlabel('Number of Descriptors as ANN Input Variables')
        plt.ylabel('Median Absolute Error of {} Predictions'.format(prop_abvr))
        plt.savefig(db_name.replace('.csv', '_desc_curve.png'))
        logger.log('info', 'Created plot of MAE vs. descriptors', 'WORKFLOW')

    # Tune ANN hyperparameters according to validation set performance
    logger.log('info', 'Tuning ANN hyperparameters...', 'WORKFLOW')
    config = default_config()
    config = tune_hyperparameters(df, config, 25, 10, num_processes,
                                  shuffle='train', split=[0.7, 0.2, 0.1],
                                  validate=True, eval_set='valid',
                                  eval_fn='med_abs_error', epochs=300)
    config['epochs'] = default_config()['epochs']
    config_filename = db_name.replace('.csv', '.yml')
    save_config(config, config_filename)
    logger.log('info', 'Tuned ANN hyperparameters', 'WORKFLOW')
    logger.log('info', '\tLearning rate: {}'.format(config['learning_rate']),
               'WORKFLOW')
    logger.log('info', '\tLR decay: {}'.format(config['decay']), 'WORKFLOW')
    logger.log('info', '\tBatch size: {}'.format(config['batch_size']),
               'WORKFLOW')
    logger.log('info', '\tPatience: {}'.format(config['patience']), 'WORKFLOW')
    logger.log('info', '\tHidden layers: {}'.format(config['hidden_layers']),
               'WORKFLOW')

    # Create Model
    logger.log('info', 'Generating ANN...', 'WORKFLOW')
    sv = Server(db_name.replace('.csv', '.yml'), num_processes=num_processes)
    sv.load_data(db_name.replace('.csv', '_opt.csv'))
    sv.create_project(db_name.replace('.csv', ''), 5, 75)
    sv.train(validate=True, selection_set='valid', shuffle='train',
             split=[0.7, 0.2, 0.1], selection_fn='med_abs_error')
    logger.log('info', 'ANN Generated', 'WORKFLOW')
    logger.log('info', 'Measuring ANN performance...', 'WORKFLOW')
    preds_test = sv.use(dset='test')
    preds_train = sv.use(dset='train')
    test_errors = sv.errors('r2', 'med_abs_error', dset='test')
    train_errors = sv.errors('r2', 'med_abs_error', dset='train')
    logger.log('info', 'Measured ANN performance', 'WORKFLOW')
    logger.log('info', '\tTraining set:\t R2: {}\t MAE: {}'.format(
        train_errors['r2'], train_errors['med_abs_error']), 'WORKFLOW')
    logger.log('info', '\tTesting set:\t R2: {}\t MAE: {}'.format(
        test_errors['r2'], test_errors['med_abs_error']), 'WORKFLOW')
    sv.save_project(del_candidates=True)

    if create_plots:
        logger.log('info', 'Creating parity plot...', 'WORKFLOW')
        plt.clf()
        parity_plot = ParityPlot(
            '',
            'Experimental {} Value'.format(prop_abvr),
            'Predicted {} Value'.format(prop_abvr)
        )
        parity_plot.add_series(concatenate(
            (sv._sets.learn_y, sv._sets.valid_y)
        ), preds_train, 'Training Set', 'blue')
        parity_plot.add_series(sv._sets.test_y, preds_test, 'Test Set', 'red')
        parity_plot.add_error_bars(test_errors['med_abs_error'], 'Test MAE')
        parity_plot._add_label('Test $R^2$', test_errors['r2'])
        parity_plot._add_label('Training MAE', train_errors['med_abs_error'])
        parity_plot._add_label('Training $R^2$', train_errors['r2'])
        parity_plot.save(db_name.replace('.csv', '_parity.png'))
        logger.log('info', 'Created parity plot', 'WORKFLOW')
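
A minimal invocation sketch for this workflow; the property abbreviation, SMILES strings, and target values are hypothetical placeholders:

# Hypothetical inputs; a real workflow supplies experimental data
smiles = ['CCO', 'CCC', 'CCCC']
targets = [78.4, 97.8, 141.5]

create_model('BP', smiles=smiles, targets=targets, qspr_backend='padel',
             num_processes=4)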
Example #12
# Assumed stdlib imports for the multiprocessing path below; DataFrame,
# default_config, limit_rforest and train_model are assumed to be imported
# from the surrounding ECNet package
from multiprocessing import Pool, set_start_method
from os import name

def find_optimal_num_inputs(db_name: str, eval_set: str,
                            num_processes: int) -> tuple:
    ''' find_optimal_num_inputs: find the optimal number of input variables
    and return the names of those variables; the optimal number is the one
    that produces the lowest median absolute error; variables are added 10
    at a time, in order of random forest regression (RFR) importance score
    (most-to-least important)

    Args:
        db_name (str): name/location of ECNet-formatted database
        eval_set (str): set to evaluate (`learn`, `valid`, `train`, `test`,
            None (all))
        num_processes (int): number of concurrent processes to run for RFR,
            training

    Returns:
        tuple: ([[desc_idx_1, error_1], ..., [desc_idx_N, error_N]],
            opt_desc)
    '''

    conf = default_config()
    conf['epochs'] = 300
    df = DataFrame(db_name)
    df.create_sets()
    conf['batch_size'] = len(df.learn_set)
    desc = limit_rforest(df,
                         len(df._input_names),
                         num_processes=num_processes,
                         eval_set=eval_set)
    desc = [d[0] for d in desc]

    errors = []
    if num_processes > 1:
        if name != 'nt':
            set_start_method('spawn', force=True)
        train_pool = Pool(processes=num_processes)

    for d_idx in range(0, len(desc), 10):
        if d_idx >= len(desc) - 1:
            to_use = desc[:]
        else:
            to_use = desc[:d_idx + 1]
        df = DataFrame(db_name)
        df.set_inputs(to_use)
        df.create_sets()
        sets = df.package_sets()

        if num_processes > 1:
            errors.append([
                d_idx,
                train_pool.apply_async(train_model, [
                    sets, conf, eval_set, 'med_abs_error', False, '_.h5',
                    False, False
                ])
            ])
        else:
            errors.append([
                d_idx,
                train_model(sets, conf, eval_set, 'med_abs_error', False,
                            '_.h5', False, False)[0]
            ])

    if num_processes > 1:
        train_pool.close()
        train_pool.join()
        for idx, err in enumerate(errors):
            errors[idx][1] = err[1].get()[0]

    # The run recorded at index d_idx used desc[:d_idx + 1] descriptors,
    # so the optimal descriptor count is the winning index plus one
    min_error = errors[0][1]
    opt_num_desc = 1
    for err in errors[1:]:
        if err[1] < min_error:
            min_error = err[1]
            opt_num_desc = err[0] + 1

    return (errors, desc[:opt_num_desc])
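
A minimal call sketch; the database path is a hypothetical placeholder:

# 'my_data.csv' is a hypothetical ECNet-formatted database
errors, opt_desc = find_optimal_num_inputs('my_data.csv', 'valid', 4)
print('Optimal number of descriptors: {}'.format(len(opt_desc)))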