def remove_outliers(df, leaf_size=40, num_processes=1):
    '''Unsupervised outlier detection using local outlier factor

    Args:
        df (ecnet.utils.data_utils.DataFrame): loaded data
        leaf_size (int): used by nearest-neighbor algorithm as the number of
            points at which to switch to brute force
        num_processes (int): number of parallel jobs for LOF algorithm

    Returns:
        ecnet.utils.data_utils.DataFrame: data w/o outliers
    '''

    ditto_logger.stream_level = logger.stream_level
    if logger.file_level != 'disable':
        ditto_logger.log_dir = logger.log_dir
        ditto_logger.file_level = logger.file_level
    ditto_logger.default_call_loc('OUTLIERS')
    item_collection = ItemCollection(df._filename)
    for inp_name in df.input_names:
        item_collection.add_attribute(Attribute(inp_name))
    for pt in df.data_points:
        item_collection.add_item(pt.id, deepcopy(pt.inputs))
    item_collection.strip()
    outliers = local_outlier_factor(item_collection.dataframe,
                                    leaf_size=leaf_size,
                                    n_jobs=num_processes)
    logger.log('debug', 'Outliers: {}'.format(outliers),
               call_loc='OUTLIERS')
    for out in outliers:
        for idx, pt in enumerate(df.data_points):
            if out == pt.id:
                del df.data_points[idx]
                break
    return df
def errors(self, *args, dset: str = None,
           model_filename: str = 'model.h5') -> dict:
    '''Obtains various errors for specified set

    Args:
        *args (str): one or more error functions; `rmse`, `mean_abs_error`,
            `med_abs_error`, `r2`
        dset (str): set to obtain errors for; `learn`, `valid`, `train`,
            `test`, None (all sets)
        model_filename (str): if specified, uses .h5 model file for error
            calculations

    Returns:
        dict: {'error_fn': value, ...} with supplied errors
    '''

    for err in args:
        logger.log('debug', 'Calculating {} for {} set'.format(err, dset),
                   call_loc='ERRORS')
    preds = self.use(dset, model_filename=model_filename)
    y_vals = get_y(self._sets, dset)
    errors = {}
    for err in args:
        errors[err] = get_error(preds, y_vals, err)
    logger.log('debug', 'Errors: {}'.format(errors), call_loc='ERRORS')
    return errors
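# Usage sketch (illustrative, not part of the original ECNet source; the
# import path, config filename, and database filename are assumptions):
# after training, errors() returns a dict keyed by the requested error
# function names, e.g. {'rmse': ..., 'r2': ...}.
def _example_errors(db_path: str = 'my_data.csv') -> dict:
    from ecnet import Server  # assumed import path

    sv = Server(model_config='config.yml', num_processes=1)
    sv.load_data(db_path)
    sv.train(validate=True)
    return sv.errors('rmse', 'r2', dset='test')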
def use(self, dset: str = None, output_filename: str = None,
        model_filename: str = 'model.h5') -> list:
    '''Uses trained neural network(s) to predict for specified set; single
    NN if no project created, best pool candidates if created

    Args:
        dset (str): set to predict for; `learn`, `valid`, `train`, `test`,
            None (all sets)
        output_filename (str): if supplied, saves results to this CSV file
        model_filename (str): if supplied, use specified .h5 model file

    Returns:
        list: list of results for specified set
    '''

    if self._prj_name is None:
        results = use_model(self._sets, dset, model_filename)
    else:
        results = use_project(self._prj_name, self._num_pools, dset,
                              self._sets)
    if output_filename is not None:
        save_results(results, dset, self._df, output_filename)
        logger.log('info', 'Results saved to {}'.format(output_filename),
                   call_loc='USE')
    return results
def limit_inputs(self, limit_num: int, num_estimators: int = None,
                 eval_set: str = 'learn', output_filename: str = None,
                 **kwargs) -> list:
    '''Selects `limit_num` influential input parameters using random forest
    regression

    Args:
        limit_num (int): desired number of inputs
        num_estimators (int): number of trees in the RFR algorithm; defaults
            to the total number of inputs
        eval_set (str): set to perform RFR on (`learn`, `valid`, `train`,
            `test`, None (all)) (default: `learn`)
        output_filename (str): if not None, new limited database is saved
            here
        **kwargs: any argument accepted by
            sklearn.ensemble.RandomForestRegressor

    Returns:
        list: [(feature, importance), ..., (feature, importance)]
    '''

    result = limit_rforest(self._df, limit_num, num_estimators,
                           self._num_processes, eval_set, **kwargs)
    self._df.set_inputs([r[0] for r in result])
    self._sets = self._df.package_sets()
    if output_filename is not None:
        self._df.save(output_filename)
        logger.log(
            'info',
            'Resulting database saved to {}'.format(output_filename),
            call_loc='LIMIT')
    return result
def stream_logging(s_level):

    logger.stream_level = s_level
    logger.log('debug', 'Debug message')
    logger.log('info', 'Info message')
    logger.log('warn', 'Warning message')
    logger.log('error', 'Error message')
    logger.log('crit', 'Critical message')
def file_logging(f_level, log_dir='logs'):

    logger.file_level = f_level
    logger.stream_level = f_level
    logger.log_dir = log_dir
    logger.log('debug', 'Debug message')
    logger.log('info', 'Info message')
    logger.log('warn', 'Warning message')
    logger.log('error', 'Error message')
    logger.log('crit', 'Critical message')
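# Usage sketch (illustrative): exercising the logging helpers above. With
# the stream level set to `info`, the `debug` message is suppressed;
# file_logging() additionally writes each message to the supplied log
# directory.
if __name__ == '__main__':
    stream_logging('info')
    file_logging('debug', log_dir='logs')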
def train(self, shuffle: str = None, split: list = None,
          retrain: bool = False, validate: bool = False,
          selection_set: str = None, selection_fn: str = 'rmse',
          model_filename: str = 'model.h5', verbose: int = 0) -> tuple:
    '''Trains neural network(s) using currently-loaded data; single NN if
    no project is created, all candidates if created

    Args:
        shuffle (str): `all` to shuffle all sets for each candidate, `train`
            to shuffle learning/validation data for each candidate
        split (list): if shuffle is `all` or `train`,
            [learn%, valid%, test%]
        retrain (bool): if True, uses existing project models for
            additional training
        validate (bool): if True, uses a validation set to determine
            learning cutoff
        selection_set (str): best candidates/pool are selected using this
            set; `learn`, `valid`, `train`, `test`, None (all data)
        selection_fn (str): candidates are selected based on this error
            metric; `rmse`, `mean_abs_error`, `med_abs_error`
        model_filename (str): if project not created, saves `.h5` file here
        verbose (int): 1 to display loss at each epoch, 0 otherwise (single
            model only)

    Returns:
        tuple: if training single model, returns tuple of learn/valid
            losses, else None
    '''

    if self._prj_name is None:
        logger.log('info', 'Training single model', call_loc='TRAIN')
        _, losses = train_model(self._sets, self._vars, selection_set,
                                selection_fn, retrain, model_filename,
                                validate, verbose=verbose)
        return losses
    else:
        train_project(self._prj_name, self._num_pools,
                      self._num_candidates, self._df, self._sets,
                      self._vars, shuffle, split, retrain, validate,
                      selection_set, selection_fn, self._num_processes)
        return None
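# Usage sketch (illustrative; the import path and filenames are
# assumptions): without a project, train() returns the learn/valid loss
# tuple for a single model; after create_project(), it trains every
# candidate in every pool and returns None.
def _example_train(db_path: str = 'my_data.csv'):
    from ecnet import Server  # assumed import path

    sv = Server(model_config='config.yml')
    sv.load_data(db_path)
    losses = sv.train(validate=True, selection_set='valid',
                      selection_fn='med_abs_error')  # single model
    sv.create_project('example_project', num_pools=2, num_candidates=5)
    sv.train(validate=True, selection_set='valid',
             selection_fn='med_abs_error')  # all candidates
    return losses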
def load(self, filename=None):
    '''Loads neural network from .h5 file

    Args:
        filename (str): path to .h5 model file
    '''

    if filename is None:
        filename = self._filename
    self._model = load_model(filename)
    logger.log('debug', 'Model loaded from {}'.format(filename),
               call_loc='MLP')
def create_sorted_sets(self, sort_string, split=[0.65, 0.25, 0.1]):
    '''Creates random learn, validate and test sets, ensuring data points
    with the supplied sort string are split proportionally between the sets

    Args:
        sort_string (str): database STRING value used to sort data points
        split (list): [learn%, valid%, test%] for set assignments
    '''

    logger.log('debug', 'Creating sorted sets using {} STRING'.format(
        sort_string), call_loc='DF')
    try:
        string_idx = self.string_names.index(sort_string)
    except ValueError:
        raise Exception('{} not found in STRING names'.format(sort_string))
    self.data_points.sort(key=lambda x: x.strings[string_idx])
    string_vals = []
    string_groups = []
    for point in self.data_points:
        if point.strings[string_idx] not in string_vals:
            string_vals.append(point.strings[string_idx])
            string_groups.append([point])
        else:
            string_groups[-1].append(point)
    self.learn_set = []
    self.valid_set = []
    self.test_set = []
    for group in string_groups:
        split_locs = [
            int(len(group) * split[0]),
            int(len(group) * (split[0] + split[1])),
        ]
        for point in group[0:split_locs[0]]:
            point.assignment = 'L'
            self.learn_set.append(point)
        for point in group[split_locs[0]:split_locs[1]]:
            point.assignment = 'V'
            self.valid_set.append(point)
        for point in group[split_locs[1]:]:
            point.assignment = 'T'
            self.test_set.append(point)
    logger.log('debug', 'Number of entries in learn set: {}'.format(
        len(self.learn_set)), call_loc='DF')
    logger.log('debug', 'Number of entries in validation set: {}'.format(
        len(self.valid_set)), call_loc='DF')
    logger.log('debug', 'Number of entries in test set: {}'.format(
        len(self.test_set)), call_loc='DF')
def create_sorted_sets(self, sort_str: str, split: list = [0.7, 0.2, 0.1]):
    '''Creates random learn, validate and test sets, ensuring data points
    with the supplied sort string are split proportionally between the sets

    Args:
        sort_str (str): database STRING value used to sort data points
        split (list): [learn%, valid%, test%] for set assignments
    '''

    logger.log('debug', 'Creating sorted sets using {} STRING'.format(
        sort_str), call_loc='DF')
    if sort_str not in self._string_names:
        raise ValueError('{} not found in STRING names'.format(sort_str))
    string_vals = []
    string_groups = []
    for point in self.data_points:
        str_val = getattr(point, sort_str)
        if str_val not in string_vals:
            string_vals.append(str_val)
            string_groups.append([point])
        else:
            str_loc = string_vals.index(str_val)
            string_groups[str_loc].append(point)
    self.learn_set = []
    self.valid_set = []
    self.test_set = []
    for group in string_groups:
        split_locs = [
            int(len(group) * split[0]),
            int(len(group) * (split[0] + split[1])),
        ]
        for point in group[0:split_locs[0]]:
            point.assignment = 'L'
            self.learn_set.append(point)
        for point in group[split_locs[0]:split_locs[1]]:
            point.assignment = 'V'
            self.valid_set.append(point)
        for point in group[split_locs[1]:]:
            point.assignment = 'T'
            self.test_set.append(point)
    logger.log('debug', 'Number of entries in learn set: {}'.format(
        len(self.learn_set)), call_loc='DF')
    logger.log('debug', 'Number of entries in validation set: {}'.format(
        len(self.valid_set)), call_loc='DF')
    logger.log('debug', 'Number of entries in test set: {}'.format(
        len(self.test_set)), call_loc='DF')
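# Usage sketch (illustrative; the database path and STRING column name are
# assumptions): each group of points sharing a STRING value is split
# proportionally across the learn/validate/test sets before packaging.
def _example_sorted_sets(db_path: str = 'my_data.csv'):
    from ecnet.utils.data_utils import DataFrame  # module per docstrings above

    df = DataFrame(db_path)
    df.create_sorted_sets('Compound Name', split=[0.7, 0.2, 0.1])
    return df.package_sets()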
def limit_rforest(df, limit_num, num_estimators=1000, num_processes=1):
    '''Uses random forest regression to select input parameters

    Args:
        df (ecnet.utils.data_utils.DataFrame): loaded data
        limit_num (int): desired number of input parameters
        num_estimators (int): number of trees used by RFR algorithm
        num_processes (int): number of parallel jobs for RFR algorithm

    Returns:
        ecnet.utils.data_utils.DataFrame: limited data
    '''

    ditto_logger.stream_level = logger.stream_level
    if logger.file_level != 'disable':
        ditto_logger.log_dir = logger.log_dir
        ditto_logger.file_level = logger.file_level
    ditto_logger.default_call_loc('LIMIT')
    item_collection = ItemCollection(df._filename)
    for inp_name in df.input_names:
        item_collection.add_attribute(Attribute(inp_name))
    for pt in df.data_points:
        item_collection.add_item(pt.id, deepcopy(pt.inputs))
    for tar_name in df.target_names:
        item_collection.add_attribute(Attribute(tar_name,
                                                is_descriptor=False))
    for pt in df.data_points:
        for idx, tar in enumerate(pt.targets):
            item_collection.set_item_attribute(pt.id, tar,
                                               df.target_names[idx])
    item_collection.strip()
    params = [
        param[0] for param in random_forest_regressor(
            item_collection.dataframe,
            target_attribute=df.target_names[0],
            n_components=limit_num,
            n_estimators=num_estimators,
            n_jobs=num_processes)
    ]
    for idx, param in enumerate(params):
        for tn in df.target_names:
            if tn == param:
                del params[idx]
                break
    logger.log('debug', 'Selected parameters: {}'.format(params),
               call_loc='LIMIT')
    df.set_inputs(params)
    return df
def save(self, filename: str = None):
    '''Saves the model weights and architecture, either to the filename/
    path specified when the object was created or to a new, supplied
    filename/path

    Args:
        filename (str): new filepath if different than init filename/path
    '''

    if filename is None:
        filename = self._filename
    check_h5(filename)
    self._model.save(filename, include_optimizer=False)
    logger.log('debug', 'Model saved to {}'.format(filename),
               call_loc='MLP')
def load(self, filename: str = None):
    '''Loads a saved model, restoring the architecture/weights; loads from
    the filename/path specified during object initialization, unless a new
    filename/path is specified

    Args:
        filename (str): new filepath if different than init filename/path
    '''

    if filename is None:
        filename = self._filename
    self._model = load_model(filename, compile=False)
    logger.log('debug', 'Model loaded from {}'.format(filename),
               call_loc='MLP')
def save(self, filename: str = None):
    '''Saves the current state of the DataFrame to a new CSV database

    Args:
        filename (str): path to location where database is saved; if not
            supplied, saves to the CSV file the data was loaded from
    '''

    if filename is None:
        filename = self._filename
    if '.csv' not in filename:
        filename += '.csv'
    rows = []
    type_row = ['DATAID', 'ASSIGNMENT']
    type_row.extend(['STRING' for _ in range(len(self._string_names))])
    type_row.extend(['GROUP' for _ in range(len(self._group_names))])
    type_row.extend(['TARGET' for _ in range(len(self._target_names))])
    type_row.extend(['INPUT' for _ in range(len(self._input_names))])
    rows.append(type_row)
    title_row = ['DATAID', 'ASSIGNMENT']
    title_row.extend(self._string_names)
    title_row.extend(self._group_names)
    title_row.extend(self._target_names)
    title_row.extend(self._input_names)
    rows.append(title_row)
    data_rows = []
    for point in self.data_points:
        data_row = [point.id, point.assignment]
        data_row.extend([getattr(point, s) for s in self._string_names])
        data_row.extend([getattr(point, g) for g in self._group_names])
        data_row.extend([getattr(point, t) for t in self._target_names])
        data_row.extend([getattr(point, i) for i in self._input_names])
        data_rows.append(data_row)
    rows.extend(sorted(data_rows, key=lambda x: x[0]))
    with open(filename, 'w', encoding='utf8') as csv_file:
        wr = writer(csv_file, quoting=QUOTE_ALL, lineterminator='\n')
        for row in rows:
            wr.writerow(row)
    logger.log('debug', 'DataFrame saved to {}'.format(filename),
               call_loc='DF')
def save(self, filename=None):
    '''Saves neural network to .h5 file

    Args:
        filename (str): if None, uses MultilayerPerceptron._filename;
            otherwise, saves to this file
    '''

    if filename is None:
        filename = self._filename
    if H5_EXT.match(filename) is None:
        raise ValueError(
            'Invalid filename/extension, must be `.h5`: {}'.format(
                filename))
    self._model.save(filename)
    logger.log('debug', 'Model saved to {}'.format(filename),
               call_loc='MLP')
def set_inputs(self, inputs: list):
    '''Removes all input variables except those supplied, updates sets
    accordingly

    Args:
        inputs (list): input variable names, str
    '''

    logger.log('debug', 'Setting input parameters to {}'.format(inputs),
               call_loc='DF')
    for inp in inputs:
        if inp not in self._input_names:
            raise ValueError('{} not found in existing inputs'.format(inp))
    self._input_names = inputs
    self.create_sets()
def main(db_name: str):

    # Set up logging
    logger.stream_level = 'info'
    logger.log_dir = db_name.replace('.csv', '') + '_logs'
    logger.file_level = 'debug'

    # Split database proportionally based on property value
    # Proportions are 70% learn, 20% validate, 10% test
    prop_range_from_split(db_name, [0.7, 0.2, 0.1])

    # Find the optimal number of input variables
    # Train (learn + valid) set used for evaluation
    n_desc = len(find_optimal_num_inputs(db_name, 'train', _NUM_PROC)[1])
    logger.log('info', 'Optimal number of input variables: {}'.format(
        n_desc))

    # Create server object with base config
    sv = Server(model_config=db_name.replace('.csv', '.yml'),
                num_processes=_NUM_PROC)

    # Load data
    sv.load_data(db_name)

    # Limit input variables to `n_desc` using train set
    # Outputs to relevant database name
    sv.limit_inputs(
        n_desc,
        eval_set='train',
        output_filename=db_name.replace('.csv', '.{}.csv'.format(n_desc))
    )

    # Tune hyperparameters (architecture and ADAM)
    # 20 employer bees, 10 search cycles
    # Evaluation of solutions based on validation set median absolute error
    sv.tune_hyperparameters(20, 10, eval_set='valid',
                            eval_fn='med_abs_error')

    # Create an ECNet project (saved and recalled later)
    # 5 pools with 75 trials/pool, best ANNs selected from each pool
    sv.create_project(db_name.replace('.csv', ''), 5, 75)

    # Train project
    # Select best candidates based on validation set median absolute error
    sv.train(validate=True, selection_set='valid',
             selection_fn='med_abs_error')

    # Obtain learning, validation, testing set median absolute error,
    # r-squared
    err_l = sv.errors('med_abs_error', 'r2', dset='learn')
    err_v = sv.errors('med_abs_error', 'r2', dset='valid')
    err_t = sv.errors('med_abs_error', 'r2', dset='test')
    logger.log('info', 'Learning set performance: {}'.format(err_l))
    logger.log('info', 'Validation set performance: {}'.format(err_v))
    logger.log('info', 'Testing set performance: {}'.format(err_t))

    # Save the project, creating a .prj file and removing un-chosen
    # candidates
    sv.save_project(del_candidates=True)
def save(self, filename):
    '''Saves the current state of the DataFrame to a new CSV database

    Args:
        filename (str): path to location where database is saved
    '''

    if '.csv' not in filename:
        filename += '.csv'
    rows = []
    type_row = ['DATAID', 'ASSIGNMENT']
    type_row.extend(['STRING' for _ in range(self.num_strings)])
    type_row.extend(['GROUP' for _ in range(self.num_groups)])
    type_row.extend(['TARGET' for _ in range(self.num_targets)])
    type_row.extend(['INPUT' for _ in range(self.num_inputs)])
    rows.append(type_row)
    title_row = ['DATAID', 'ASSIGNMENT']
    title_row.extend(self.string_names)
    title_row.extend(self.group_names)
    title_row.extend(self.target_names)
    title_row.extend(self.input_names)
    rows.append(title_row)
    data_rows = []
    for point in self.data_points:
        data_row = [point.id, point.assignment]
        data_row.extend(point.strings)
        data_row.extend(point.groups)
        data_row.extend(point.targets)
        data_row.extend(point.inputs)
        data_rows.append(data_row)
    rows.extend(sorted(data_rows, key=lambda x: x[0]))
    with open(filename, 'w') as csv_file:
        wr = writer(csv_file, quoting=QUOTE_ALL, lineterminator='\n')
        for row in rows:
            wr.writerow(row)
    logger.log('debug', 'DataFrame saved to {}'.format(filename),
               call_loc='DF')
def create_project(self, project_name: str, num_pools: int = 1,
                   num_candidates: int = 1):
    '''Creates folder hierarchy for a new project

    Args:
        project_name (str): name of the project, and top-level dir name
        num_pools (int): number of candidate pools for the project
        num_candidates (int): number of candidates per pool
    '''

    self._prj_name = project_name
    self._num_pools = num_pools
    self._num_candidates = num_candidates
    create_project(project_name, num_pools, num_candidates)
    logger.log('info', 'Created project: {}'.format(project_name),
               call_loc='PROJECT')
    logger.log('debug', 'Number of pools: {}'.format(num_pools),
               call_loc='PROJECT')
    logger.log('debug', 'Number of candidates/pool: {}'.format(
        num_candidates), call_loc='PROJECT')
def save_project(self, filename: str = None, clean_up: bool = True,
                 del_candidates: bool = False):
    '''Saves current state of project to a .prj file

    Args:
        filename (str): if None, uses name supplied in project creation;
            else, saves the project here
        clean_up (bool): if True, removes project folder structure after
            .prj file created
        del_candidates (bool): if True, deletes all non-chosen candidate
            neural networks
    '''

    if self._prj_name is None:
        raise RuntimeError('A project has not been created')
    save_path = save_project(self._prj_name, filename, self._cf_file,
                             self._df, self._vars, clean_up,
                             del_candidates)
    logger.log('info', 'Project saved to {}'.format(save_path),
               call_loc='PROJECT')
def load_data(self, filename: str, random: bool = False,
              split: list = None, normalize: bool = False):
    '''Loads data from an ECNet-formatted CSV database

    Args:
        filename (str): path to CSV database
        random (bool): if True, random set assignments (learn, validate,
            test); if False, uses DB-specified assignments
        split (list): if random is True, [learn%, valid%, test%]
        normalize (bool): if True, uses min-max normalization to normalize
            input parameters between 0 and 1
    '''

    logger.log('info', 'Loading data from {}'.format(filename),
               call_loc='LOAD')
    self._df = DataFrame(filename)
    if normalize:
        self._df.normalize()
    self._df.create_sets(random, split)
    self._sets = self._df.package_sets()
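# Usage sketch (illustrative; filenames are assumptions): load with random
# set assignments and min-max normalization rather than the ASSIGNMENT
# column stored in the database.
def _example_load_data(db_path: str = 'my_data.csv'):
    from ecnet import Server  # assumed import path

    sv = Server(model_config='config.yml')
    sv.load_data(db_path, random=True, split=[0.7, 0.2, 0.1],
                 normalize=True)
    return sv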
def set_inputs(self, inputs):
    '''Removes all input variables except those supplied

    Args:
        inputs (list): input variable names, str
    '''

    logger.log('debug', 'Setting input parameters to {}'.format(inputs),
               call_loc='DF')
    idxs = []
    for inp in inputs:
        for cidx, current_input in enumerate(self.input_names):
            if inp == current_input:
                idxs.append(cidx)
    for point in self.data_points:
        new_inputs = []
        for i in idxs:
            new_inputs.append(point.inputs[i])
        point.inputs = new_inputs
    self.input_names = inputs
    self.num_inputs = len(inputs)
    self.create_sets()
def limit_rforest(df: DataFrame, limit_num: int, num_estimators: int = None,
                  num_processes: int = 1, eval_set: str = 'learn',
                  **kwargs) -> list:
    '''Uses random forest regression to select input parameters

    Args:
        df (ecnet.utils.data_utils.DataFrame): loaded data
        limit_num (int): desired number of input parameters
        num_estimators (int): number of trees used by RFR algorithm
        num_processes (int): number of parallel jobs for RFR algorithm
        eval_set (str): set to perform RFR on (`learn`, `valid`, `train`,
            `test`, None (all)) (default: `learn`)
        **kwargs: any argument accepted by
            sklearn.ensemble.RandomForestRegressor

    Returns:
        list: [(feature, importance), ..., (feature, importance)]
    '''

    logger.log(
        'info',
        'Finding {} most influential input parameters'.format(limit_num),
        call_loc='LIMIT')
    pd = df.package_sets()
    X = get_x(pd, eval_set)
    y = ravel(get_y(pd, eval_set))
    if num_estimators is None:
        num_estimators = len(X[0])
    logger.log('debug', 'Number of estimators: {}'.format(num_estimators),
               call_loc='LIMIT')
    regr = RandomForestRegressor(n_jobs=num_processes,
                                 n_estimators=num_estimators, **kwargs)
    regr.fit(X, y)
    importances = regr.feature_importances_
    result = []
    for idx, name in enumerate(df._input_names):
        result.append((name, importances[idx]))
    result = sorted(result, key=lambda t: t[1], reverse=True)[:limit_num]
    logger.log('debug',
               'Selected parameters: {}'.format([r[0] for r in result]),
               call_loc='LIMIT')
    return result
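# Usage sketch (illustrative; the database path and RFR keyword argument
# are assumptions): run the feature selection directly on a DataFrame,
# keep the ten most important inputs, and pass any extra keyword arguments
# through to sklearn's RandomForestRegressor.
def _example_limit_rforest(db_path: str = 'my_data.csv') -> list:
    from ecnet.utils.data_utils import DataFrame  # module per docstrings above

    df = DataFrame(db_path)
    df.create_sets()
    result = limit_rforest(df, 10, eval_set='learn', max_depth=16)
    df.set_inputs([r[0] for r in result])
    return result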
def __init__(self, model_config: str = 'config.yml', prj_file: str = None,
             num_processes: int = 1):
    '''Server object: handles data loading, model creation, data-to-model
    hand-off, data input parameter selection, hyperparameter tuning

    Args:
        model_config (str): path to multilayer perceptron .yml config file;
            if not found, default config is generated
        prj_file (str): path to pre-existing ECNet .prj file, if using for
            retraining/new predictions
        num_processes (int): number of parallel processes to utilize for
            training and tuning processes
    '''

    logger.log('debug', 'Arguments:\n\t| model_config:\t\t{}\n\t|'
               ' prj_file:\t\t{}\n\t| num_processes:\t{}'.format(
                   model_config, prj_file, num_processes), call_loc='INIT')
    self._num_processes = num_processes

    if prj_file is not None:
        self._prj_name, self._num_pools, self._num_candidates, self._df,\
            self._cf_file, self._vars = open_project(prj_file)
        check_config(self._vars)
        self._sets = self._df.package_sets()
        logger.log('info', 'Opened project {}'.format(prj_file),
                   call_loc='INIT')
        return

    self._cf_file = model_config
    self._prj_name = None
    self._vars = {}
    try:
        self._vars.update(open_config(self._cf_file))
        check_config(self._vars)
    except FileNotFoundError:
        logger.log(
            'warn',
            '{} not found, generating default config'.format(model_config),
            call_loc='INIT')
        self._vars = default_config()
        save_config(self._vars, self._cf_file)
def shuffle(self, sets: str = 'all', split: list = [0.7, 0.2, 0.1]):
    '''Shuffles learning, validation and test sets or learning and
    validation sets

    Args:
        sets (str): `all` or `train` (learning + validation)
        split (list): [learn%, valid%, test%] used for new assignments
    '''

    logger.log('debug', 'Shuffling {} sets'.format(sets), call_loc='DF')
    if sets == 'all':
        self.create_sets(random=True, split=split)
    elif sets == 'train':
        lv_set = []
        lv_set.extend([p for p in self.learn_set])
        lv_set.extend([p for p in self.valid_set])
        rand_index = sample(
            range(len(self.learn_set) + len(self.valid_set)),
            (len(self.learn_set) + len(self.valid_set)))
        self.learn_set = lv_set[
            0:int(len(rand_index) * (split[0] / (1 - split[2]))) + 1]
        self.valid_set = lv_set[
            int(len(rand_index) * (split[0] / (1 - split[2]))) + 1:]
        logger.log('debug', 'Number of entries in learn set: {}'.format(
            len(self.learn_set)), call_loc='DF')
        logger.log('debug',
                   'Number of entries in validation set: {}'.format(
                       len(self.valid_set)), call_loc='DF')
        logger.log('debug', 'Number of entries in test set: {}'.format(
            len(self.test_set)), call_loc='DF')
    else:
        raise ValueError('Invalid sets argument: {}'.format(sets))
def tune_hyperparameters(df, vars, num_employers, num_iterations,
                         num_processes=1, shuffle=None, split=None,
                         validate=True, eval_set=None, eval_fn='rmse'):
    '''Tunes neural network learning/architecture hyperparameters

    Args:
        df (ecnet.utils.data_utils.DataFrame): currently loaded data
        vars (dict): ecnet.Server._vars variables
        num_employers (int): number of employer bees
        num_iterations (int): number of search cycles for the colony
        num_processes (int): number of parallel processes to utilize
        shuffle (bool): if True, shuffles L/V/T data for all evals
        split (list): if shuffle is True, [learn%, valid%, test%]
        validate (bool): if True, uses periodic validation; otherwise, no
        eval_set (str): set used to evaluate bee performance; `learn`,
            `valid`, `train`, `test`, None (all sets)
        eval_fn (str): error function used to evaluate bee performance;
            `rmse`, `mean_abs_error`, `med_abs_error`

    Returns:
        dict: tuned hyperparameters
    '''

    fit_fn_args = {
        'df': df,
        'shuffle': shuffle,
        'num_processes': num_processes,
        'split': split,
        'validate': validate,
        'eval_set': eval_set,
        'eval_fn': eval_fn,
        'hidden_layers': vars['hidden_layers']
    }

    value_ranges = [
        ('float', (0.0, 1.0)),
        ('float', (0.0, 1.0)),
        ('float', (0.0, 1.0)),
        ('float', (0.0, 1.0)),
        ('float', (0.0, 1.0))
    ]
    for _ in range(len(vars['hidden_layers'])):
        value_ranges.append(('int', (1, 50)))

    abc = ABC(
        tune_fitness_function,
        num_employers=num_employers,
        value_ranges=value_ranges,
        args=fit_fn_args,
        processes=num_processes
    )
    abc._logger.stream_level = logger.stream_level
    if logger.file_level != 'disable':
        abc._logger.log_dir = logger.log_dir
        abc._logger.file_level = logger.file_level
    abc._logger.default_call_loc('TUNE')
    abc.create_employers()
    for i in range(num_iterations):
        logger.log('info', 'Iteration {}'.format(i + 1), call_loc='TUNE')
        abc.run_iteration()

    logger.log('info', 'Best Performer: {}, {}'.format(
        abc.best_performer[2], {
            'beta_1': abc.best_performer[1][0],
            'beta_2': abc.best_performer[1][1],
            'decay': abc.best_performer[1][2],
            'epsilon': abc.best_performer[1][3],
            'learning_rate': abc.best_performer[1][4],
            'hidden_layers': abc.best_performer[1][5:]
        }
    ), call_loc='TUNE')
    params = abc.best_performer[1]
    vars['beta_1'] = params[0]
    vars['beta_2'] = params[1]
    vars['decay'] = params[2]
    vars['epsilon'] = params[3]
    vars['learning_rate'] = params[4]
    for l_idx in range(len(vars['hidden_layers'])):
        vars['hidden_layers'][l_idx][0] = params[5 + l_idx]
    return vars
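# Usage sketch (illustrative; filenames are assumptions, and
# default_config()/save_config() are assumed to be in scope as in the
# create_model workflow below): tune the Adam parameters and hidden-layer
# sizes with a small colony, then persist the tuned configuration.
def _example_tune(db_path: str = 'my_data.csv') -> dict:
    from ecnet.utils.data_utils import DataFrame  # module per docstrings above

    df = DataFrame(db_path)
    df.create_sets()
    config = default_config()
    config = tune_hyperparameters(df, config, 10, 5, num_processes=1,
                                  validate=True, eval_set='valid',
                                  eval_fn='med_abs_error')
    save_config(config, 'tuned_config.yml')
    return config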
def create_model(prop_abvr: str, smiles: list = None, targets: list = None,
                 db_name: str = None, qspr_backend: str = 'padel',
                 create_plots: bool = True,
                 data_split: list = [0.7, 0.2, 0.1],
                 log_level: str = 'info', log_to_file: bool = True,
                 num_processes: int = 1):
    '''create_model: ECRL's database/model creation workflow for all
    publications

    Args:
        prop_abvr (str): abbreviation for the property name (e.g. CN)
        smiles (list): if supplied with targets, creates a new database
        targets (list): if supplied with smiles, creates a new database
        db_name (str): you may supply an existing ECNet-formatted database
        qspr_backend (str): if creating new database, generation software to
            use (`padel`, `alvadesc`)
        create_plots (bool): if True, creates plots for median absolute
            error vs. number of descriptors as inputs, parity plot for all
            sets
        data_split (list): [learn %, valid %, test %] for all supplied data
        log_level (str): `debug`, `info`, `warn`, `error`, `crit`
        log_to_file (bool): if True, saves workflow logs to a file in `logs`
            directory
        num_processes (int): number of concurrent processes to use for
            various tasks
    '''

    # Initialize logging
    logger.stream_level = log_level
    if log_to_file:
        logger.file_level = log_level

    # If database not supplied, create database from supplied SMILES,
    # targets
    if db_name is None:
        if smiles is None or targets is None:
            raise ValueError('Must supply SMILES and target values')
        db_name = datetime.now().strftime('{}_model_%Y%m%d.csv'.format(
            prop_abvr))
        logger.log('info', 'Creating database {}...'.format(db_name),
                   'WORKFLOW')
        create_db(smiles, db_name, targets, prop_abvr, backend=qspr_backend)
        logger.log('info', 'Created database {}'.format(db_name),
                   'WORKFLOW')

    # Create database split, each subset has proportionally equal number of
    # compounds based on range of experimental/target values
    logger.log('info', 'Creating optimal data split...', 'WORKFLOW')
    prop_range_from_split(db_name, data_split)
    logger.log('info', 'Created optimal data split', 'WORKFLOW')
    df = DataFrame(db_name)
    df.create_sets()
    logger.log('info', '\tLearning set: {}'.format(len(df.learn_set)),
               'WORKFLOW')
    logger.log('info', '\tValidation set: {}'.format(len(df.valid_set)),
               'WORKFLOW')
    logger.log('info', '\tTest set: {}'.format(len(df.test_set)),
               'WORKFLOW')

    # Find optimal number of QSPR input variables
    logger.log('info', 'Finding optimal number of inputs...', 'WORKFLOW')
    errors, desc = find_optimal_num_inputs(db_name, 'valid', num_processes)
    df = DataFrame(db_name)
    df.set_inputs(desc)
    df.save(db_name.replace('.csv', '_opt.csv'))
    logger.log('info', 'Found optimal number of inputs', 'WORKFLOW')
    logger.log('info', '\tNumber of inputs: {}'.format(
        len(df._input_names)), 'WORKFLOW')

    # Plot the curve of MAE vs. num. desc. added, if desired
    if create_plots:
        logger.log('info', 'Creating plot of MAE vs. descriptors...',
                   'WORKFLOW')
        num_add = [e[0] for e in errors]
        maes = [e[1] for e in errors]
        opt_num = len(desc)
        plt.clf()
        plt.rcParams['font.family'] = 'Times New Roman'
        plt.plot(num_add, maes, c='blue')
        plt.axvline(x=opt_num, c='red', linestyle='--')
        plt.xlabel('Number of Descriptors as ANN Input Variables')
        plt.ylabel('Median Absolute Error of {} Predictions'.format(
            prop_abvr))
        plt.savefig(db_name.replace('.csv', '_desc_curve.png'))
        logger.log('info', 'Created plot of MAE vs. descriptors',
                   'WORKFLOW')

    # Tune ANN hyperparameters according to validation set performance
    logger.log('info', 'Tuning ANN hyperparameters...', 'WORKFLOW')
    config = default_config()
    config = tune_hyperparameters(df, config, 25, 10, num_processes,
                                  shuffle='train', split=[0.7, 0.2, 0.1],
                                  validate=True, eval_set='valid',
                                  eval_fn='med_abs_error', epochs=300)
    config['epochs'] = default_config()['epochs']
    config_filename = db_name.replace('.csv', '.yml')
    save_config(config, config_filename)
    logger.log('info', 'Tuned ANN hyperparameters', 'WORKFLOW')
    logger.log('info', '\tLearning rate: {}'.format(
        config['learning_rate']), 'WORKFLOW')
    logger.log('info', '\tLR decay: {}'.format(config['decay']), 'WORKFLOW')
    logger.log('info', '\tBatch size: {}'.format(config['batch_size']),
               'WORKFLOW')
    logger.log('info', '\tPatience: {}'.format(config['patience']),
               'WORKFLOW')
    logger.log('info', '\tHidden layers: {}'.format(
        config['hidden_layers']), 'WORKFLOW')

    # Create Model
    logger.log('info', 'Generating ANN...', 'WORKFLOW')
    sv = Server(db_name.replace('.csv', '.yml'),
                num_processes=num_processes)
    sv.load_data(db_name.replace('.csv', '_opt.csv'))
    sv.create_project(db_name.replace('.csv', ''), 5, 75)
    sv.train(validate=True, selection_set='valid', shuffle='train',
             split=[0.7, 0.2, 0.1], selection_fn='med_abs_error')
    logger.log('info', 'ANN Generated', 'WORKFLOW')

    # Measure model performance on the training and test sets
    logger.log('info', 'Measuring ANN performance...', 'WORKFLOW')
    preds_test = sv.use(dset='test')
    preds_train = sv.use(dset='train')
    test_errors = sv.errors('r2', 'med_abs_error', dset='test')
    train_errors = sv.errors('r2', 'med_abs_error', dset='train')
    logger.log('info', 'Measured ANN performance', 'WORKFLOW')
    logger.log('info', '\tTraining set:\t R2: {}\t MAE: {}'.format(
        train_errors['r2'], train_errors['med_abs_error']), 'WORKFLOW')
    logger.log('info', '\tTesting set:\t R2: {}\t MAE: {}'.format(
        test_errors['r2'], test_errors['med_abs_error']), 'WORKFLOW')
    sv.save_project(del_candidates=True)

    # Create parity plot, if desired
    if create_plots:
        logger.log('info', 'Creating parity plot...', 'WORKFLOW')
        plt.clf()
        parity_plot = ParityPlot(
            '',
            'Experimental {} Value'.format(prop_abvr),
            'Predicted {} Value'.format(prop_abvr)
        )
        parity_plot.add_series(concatenate(
            (sv._sets.learn_y, sv._sets.valid_y)
        ), preds_train, 'Training Set', 'blue')
        parity_plot.add_series(sv._sets.test_y, preds_test, 'Test Set',
                               'red')
        parity_plot.add_error_bars(test_errors['med_abs_error'],
                                   'Test MAE')
        parity_plot._add_label('Test $R^2$', test_errors['r2'])
        parity_plot._add_label('Training MAE',
                               train_errors['med_abs_error'])
        parity_plot._add_label('Training $R^2$', train_errors['r2'])
        parity_plot.save(db_name.replace('.csv', '_parity.png'))
        logger.log('info', 'Created parity plot', 'WORKFLOW')
def __init__(self, filename: str):
    '''DataFrame object: handles data importing, set splitting, shuffling,
    packaging

    Args:
        filename (str): path to ECNet-formatted CSV database
    '''

    if '.csv' not in filename:
        filename += '.csv'
    try:
        with open(filename, newline='', encoding='utf8') as file:
            rows = list(reader(file))
    except FileNotFoundError:
        raise Exception('CSV database not found: {}'.format(filename))

    self._filename = filename
    self.data_points = []
    self._string_names = []
    self._group_names = []
    self._target_names = []
    self._input_names = []

    for p_idx, row in enumerate(rows[2:]):
        new_point = DataPoint()
        for h_idx, header in enumerate(rows[0]):
            if header == 'DATAID':
                new_point.id = row[h_idx]
            elif header == 'ASSIGNMENT':
                new_point.assignment = row[h_idx]
            elif header == 'STRING':
                if p_idx == 0:
                    self._string_names.append(rows[1][h_idx])
                setattr(new_point, rows[1][h_idx], row[h_idx])
            elif header == 'GROUP':
                if p_idx == 0:
                    self._group_names.append(rows[1][h_idx])
                setattr(new_point, rows[1][h_idx], row[h_idx])
            elif header == 'TARGET':
                if p_idx == 0:
                    self._target_names.append(rows[1][h_idx])
                setattr(new_point, rows[1][h_idx], row[h_idx])
            elif header == 'INPUT':
                if p_idx == 0:
                    self._input_names.append(rows[1][h_idx])
                setattr(new_point, rows[1][h_idx], row[h_idx])
        self.data_points.append(new_point)

    logger.log('debug',
               'Found {} data entries'.format(len(self.data_points)),
               call_loc='DF')
    logger.log('debug',
               'Input parameters/entry: {}'.format(len(self._input_names)),
               call_loc='DF')
    logger.log('debug',
               'Target values/entry: {}'.format(len(self._target_names)),
               call_loc='DF')
def create_sets(self, random: bool = False, split: list = [0.7, 0.2, 0.1]):
    '''Creates learning, validation and test sets

    Args:
        random (bool): if True, use random assignments for learn, validate,
            test sets
        split (list): [learn%, valid%, test%] if random == True
    '''

    self.learn_set = []
    self.valid_set = []
    self.test_set = []

    if random is True:
        logger.log('debug', 'Assigning entries to random sets',
                   call_loc='DF')
        rand_index = sample(range(len(self)), len(self))
        split_locs = [
            int(len(rand_index) * split[0]),
            int(len(rand_index) * (split[0] + split[1])),
        ]
        learn_index = rand_index[0:split_locs[0]]
        valid_index = rand_index[split_locs[0]:split_locs[1]]
        test_index = rand_index[split_locs[1]:]
        for idx in learn_index:
            self.data_points[idx].assignment = 'L'
            self.learn_set.append(self.data_points[idx])
        for idx in valid_index:
            self.data_points[idx].assignment = 'V'
            self.valid_set.append(self.data_points[idx])
        for idx in test_index:
            self.data_points[idx].assignment = 'T'
            self.test_set.append(self.data_points[idx])
    elif random is False:
        logger.log('debug', 'Assigning entries to explicit sets',
                   call_loc='DF')
        for point in self.data_points:
            if point.assignment == 'L':
                self.learn_set.append(point)
            elif point.assignment == 'V':
                self.valid_set.append(point)
            elif point.assignment == 'T':
                self.test_set.append(point)
    else:
        raise ValueError('Unknown random boolean: {}'.format(random))

    logger.log('debug', 'Number of entries in learn set: {}'.format(
        len(self.learn_set)), call_loc='DF')
    logger.log('debug', 'Number of entries in validation set: {}'.format(
        len(self.valid_set)), call_loc='DF')
    logger.log('debug', 'Number of entries in test set: {}'.format(
        len(self.test_set)), call_loc='DF')
def fit(self, l_x, l_y, v_x=None, v_y=None, epochs=1500, lr=0.001,
        beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, v=0):
    '''Fits neural network to supplied inputs and targets

    Args:
        l_x (numpy.array): learning input data
        l_y (numpy.array): learning target data
        v_x (numpy.array): if not None, periodic validation is performed w/
            these inputs
        v_y (numpy.array): if not None, periodic validation is performed w/
            these targets
        epochs (int): number of learning epochs if not validating, maximum
            number of learning epochs if performing periodic validation
        lr (float): learning rate for Adam optimizer
        beta_1 (float): beta_1 value for Adam optimizer
        beta_2 (float): beta_2 value for Adam optimizer
        epsilon (float): epsilon value for Adam optimizer
        decay (float): learning rate decay for Adam optimizer
        v (int): verbose training, `0` for no printing, `1` for printing
    '''

    self._model.compile(
        loss=mean_squared_error,
        optimizer=Adam(lr=lr, beta_1=beta_1, beta_2=beta_2,
                       epsilon=epsilon, decay=decay),
        metrics=[mae]
    )

    if v_x is not None and v_y is not None:
        # Periodic validation: train in 250-epoch steps, stopping early if
        # validation MAE rises more than 5% above its best observed value
        valid_mae_lowest = self._model.evaluate(v_x, v_y, verbose=v)[1]
        steps = int(epochs / 250)
        for e in range(steps):
            h = self._model.fit(l_x, l_y, validation_data=(v_x, v_y),
                                epochs=250, verbose=v)
            valid_mae = h.history['val_mean_absolute_error'][-1]
            if valid_mae < valid_mae_lowest:
                valid_mae_lowest = valid_mae
            elif valid_mae > (valid_mae_lowest + 0.05 * valid_mae_lowest):
                logger.log('debug',
                           'Validation cutoff after {} epochs'.format(
                               e * 250), call_loc='MLP')
                return
    else:
        self._model.fit(l_x, l_y, epochs=epochs, verbose=v)
    logger.log('debug', 'Training complete after {} epochs'.format(epochs),
               call_loc='MLP')