def chemprop_train() -> None: """Runs chemprop training.""" args = TrainArgs().parse_args() logger = create_logger(name='train', save_dir=args.save_dir, quiet=args.quiet) cross_validate(args, logger)
def setUp(self):
    parser = ArgumentParser()
    add_train_args(parser)
    args = parser.parse_args([])
    args.data_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'delaney_toy.csv')
    args.dataset_type = 'regression'
    args.batch_size = 2
    args.hidden_size = 5
    args.epochs = 1
    args.quiet = True
    self.temp_dir = TemporaryDirectory()
    args.save_dir = self.temp_dir.name
    logger = create_logger(name='train', save_dir=args.save_dir, quiet=args.quiet)
    modify_train_args(args)
    cross_validate(args, logger)
    clear_cache()

    parser = ArgumentParser()
    add_predict_args(parser)
    args = parser.parse_args([])
    args.batch_size = 2
    args.checkpoint_dir = self.temp_dir.name
    args.preds_path = NamedTemporaryFile().name
    args.test_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'delaney_toy_smiles.csv')
    self.args = args
def sklearn_train() -> None: """Runs sklearn training.""" args = SklearnTrainArgs().parse_args() logger = create_logger(name='sklearn-train', save_dir=args.save_dir, quiet=args.quiet) cross_validate_sklearn(args, logger)
def main():
    """Parses training arguments and runs cross-validation."""
    args = parse_train_args()
    logger = create_logger(name='train', save_dir=args.save_dir, quiet=args.quiet)
    cross_validate(args, logger)
def sklearn_train() -> None: """Parses scikit-learn training arguments and trains a scikit-learn model. This is the entry point for the command line command :code:`sklearn_train`. """ args = SklearnTrainArgs().parse_args() logger = create_logger(name='sklearn-train', save_dir=args.save_dir, quiet=args.quiet) cross_validate_sklearn(args, logger)
def chemprop_train() -> None: """Parses Chemprop training arguments and trains (cross-validates) a Chemprop model. This is the entry point for the command line command :code:`chemprop_train`. """ args = TrainArgs().parse_args() logger = create_logger(name='train', save_dir=args.save_dir, quiet=args.quiet) cross_validate(args, logger)
def train_outside(args_dict):
    """
    Used for calling this script from another Python script.

    :param args_dict: Dict of args to use.
    """
    sys.argv = create_args(args_dict, 'train.py')
    args = TrainArgs().parse_args()
    logger = create_logger(name='train', save_dir=args.save_dir, quiet=args.quiet)
    cross_validate(args, logger)
def setUp(self):
    parser = ArgumentParser()
    add_train_args(parser)
    args = parser.parse_args([])
    args.data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'delaney_toy.csv')
    args.dataset_type = 'regression'
    args.batch_size = 2
    args.hidden_size = 5
    args.epochs = 1
    args.quiet = True
    self.args = args
    logger = create_logger(name='train', save_dir=args.save_dir, quiet=args.quiet)
    self.logger = logger
def cross_validate_sklearn(args: SklearnTrainArgs) -> Tuple[float, float]:
    """
    Runs k-fold cross-validation for a scikit-learn model.

    For each of k splits (folds) of the data, trains and tests a model on that split
    and aggregates the performance across folds.

    :param args: A :class:`~chemprop.args.SklearnTrainArgs` object containing arguments for
                 loading data and training the scikit-learn model.
    :return: A tuple containing the mean and standard deviation performance across folds.
    """
    logger = create_logger(name=SKLEARN_TRAIN_LOGGER_NAME, save_dir=args.save_dir, quiet=args.quiet)
    info = logger.info if logger is not None else print

    init_seed = args.seed
    save_dir = args.save_dir

    # Run training on different random seeds for each fold
    all_scores = []
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores = run_sklearn(args, logger)
        all_scores.append(model_scores)
    all_scores = np.array(all_scores)

    # Report scores for each fold
    for fold_num, scores in enumerate(all_scores):
        info(f'Seed {init_seed + fold_num} ==> test {args.metric} = {np.nanmean(scores):.6f}')

    # Report scores across folds
    avg_scores = np.nanmean(all_scores, axis=1)  # average score for each model across tasks
    mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
    info(f'Overall test {args.metric} = {mean_score:.6f} +/- {std_score:.6f}')

    return mean_score, std_score
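# --- Hedged usage sketch (not from the original sources) ---
# Shows how the cross_validate_sklearn variant above, which builds its own logger,
# might be driven programmatically by passing an argument list to SklearnTrainArgs.
# The flag names below (notably --model_type) are assumptions about SklearnTrainArgs
# and may differ between Chemprop versions; the paths are illustrative.
from chemprop.args import SklearnTrainArgs

sklearn_args = SklearnTrainArgs().parse_args([
    '--data_path', 'chemprop_in.csv',    # CSV with SMILES and target columns
    '--dataset_type', 'regression',
    '--save_dir', 'sklearn_models',
    '--model_type', 'random_forest',     # assumed flag for the scikit-learn model choice
])
mean_score, std_score = cross_validate_sklearn(sklearn_args)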
def train(): """Renders the train page and performs training if request method is POST.""" global PROGRESS, TRAINING warnings, errors = [], [] if request.method == 'GET': return render_train() # Get arguments data_name, epochs, ensemble_size, checkpoint_name = \ request.form['dataName'], int(request.form['epochs']), \ int(request.form['ensembleSize']), request.form['checkpointName'] gpu = request.form.get('gpu') data_path = os.path.join(app.config['DATA_FOLDER'], f'{data_name}.csv') dataset_type = request.form.get('datasetType', 'regression') # Create and modify args args = TrainArgs().parse_args([ '--data_path', data_path, '--dataset_type', dataset_type, '--epochs', str(epochs), '--ensemble_size', str(ensemble_size) ]) # Check if regression/classification selection matches data data = get_data(path=data_path) targets = data.targets() unique_targets = { target for row in targets for target in row if target is not None } if dataset_type == 'classification' and len(unique_targets - {0, 1}) > 0: errors.append( 'Selected classification dataset but not all labels are 0 or 1. Select regression instead.' ) return render_train(warnings=warnings, errors=errors) if dataset_type == 'regression' and unique_targets <= {0, 1}: errors.append( 'Selected regression dataset but all labels are 0 or 1. Select classification instead.' ) return render_train(warnings=warnings, errors=errors) if gpu is not None: if gpu == 'None': args.cuda = False else: args.gpu = int(gpu) current_user = request.cookies.get('currentUser') if not current_user: # Use DEFAULT as current user if the client's cookie is not set. current_user = app.config['DEFAULT_USER_ID'] ckpt_id, ckpt_name = db.insert_ckpt(checkpoint_name, current_user, args.dataset_type, args.epochs, args.ensemble_size, len(targets)) with TemporaryDirectory() as temp_dir: args.save_dir = temp_dir process = mp.Process(target=progress_bar, args=(args, PROGRESS)) process.start() TRAINING = 1 # Run training logger = create_logger(name='train', save_dir=args.save_dir, quiet=args.quiet) task_scores = run_training(args, logger) process.join() # Reset globals TRAINING = 0 PROGRESS = mp.Value('d', 0.0) # Check if name overlap if checkpoint_name != ckpt_name: warnings.append( name_already_exists_message('Checkpoint', checkpoint_name, ckpt_name)) # Move models for root, _, files in os.walk(args.save_dir): for fname in files: if fname.endswith('.pt'): model_id = db.insert_model(ckpt_id) save_path = os.path.join(app.config['CHECKPOINT_FOLDER'], f'{model_id}.pt') shutil.move(os.path.join(args.save_dir, root, fname), save_path) return render_train(trained=True, metric=args.metric, num_tasks=len(args.task_names), task_names=args.task_names, task_scores=format_float_list(task_scores), mean_score=format_float(np.mean(task_scores)), warnings=warnings, errors=errors)
def cross_validate(args: TrainArgs) -> Tuple[float, float]:
    """
    Runs k-fold cross-validation for a Chemprop model.

    For each of k splits (folds) of the data, trains and tests a model on that split
    and aggregates the performance across folds.

    :param args: A :class:`~chemprop.args.TrainArgs` object containing arguments for
                 loading data and training the Chemprop model.
    :return: A tuple containing the mean and standard deviation performance across folds.
    """
    logger = create_logger(name=TRAIN_LOGGER_NAME, save_dir=args.save_dir, quiet=args.quiet)
    info = logger.info if logger is not None else print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir
    args.task_names = get_task_names(path=args.data_path,
                                     smiles_column=args.smiles_column,
                                     target_columns=args.target_columns,
                                     ignore_columns=args.ignore_columns)

    # Run training on different random seeds for each fold
    all_scores = []
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores = run_training(args, logger)
        all_scores.append(model_scores)
    all_scores = np.array(all_scores)

    # Report results
    info(f'{args.num_folds}-fold cross validation')

    # Report scores for each fold
    for fold_num, scores in enumerate(all_scores):
        info(f'\tSeed {init_seed + fold_num} ==> test {args.metric} = {np.nanmean(scores):.6f}')

        if args.show_individual_scores:
            for task_name, score in zip(args.task_names, scores):
                info(f'\t\tSeed {init_seed + fold_num} ==> test {task_name} {args.metric} = {score:.6f}')

    # Report scores across models
    avg_scores = np.nanmean(all_scores, axis=1)  # average score for each model across tasks
    mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
    info(f'Overall test {args.metric} = {mean_score:.6f} +/- {std_score:.6f}')

    if args.show_individual_scores:
        for task_num, task_name in enumerate(args.task_names):
            info(f'\tOverall test {task_name} {args.metric} = '
                 f'{np.nanmean(all_scores[:, task_num]):.6f} +/- {np.nanstd(all_scores[:, task_num]):.6f}')

    # Save scores
    with open(os.path.join(save_dir, TEST_SCORES_FILE_NAME), 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['Task', f'Mean {args.metric}', f'Standard deviation {args.metric}'] +
                        [f'Fold {i} {args.metric}' for i in range(args.num_folds)])

        for task_num, task_name in enumerate(args.task_names):
            task_scores = all_scores[:, task_num]
            mean, std = np.nanmean(task_scores), np.nanstd(task_scores)
            writer.writerow([task_name, mean, std] + task_scores.tolist())

    return mean_score, std_score
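# --- Hedged usage sketch (not from the original sources) ---
# Illustrates the single-argument cross_validate(args) variant above, which creates its
# own logger internally. The argument values mirror the standalone script later in this
# file and are illustrative only.
from chemprop.args import TrainArgs

train_args = TrainArgs().parse_args([
    '--data_path', 'chemprop_in.csv',
    '--dataset_type', 'regression',
    '--save_dir', 'models',
])
mean_score, std_score = cross_validate(train_args)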
def cross_validate(args: TrainArgs,
                   train_func: Callable[[TrainArgs, MoleculeDataset, Logger], Dict[str, List[float]]]
                   ) -> Tuple[float, float]:
    """
    Runs k-fold cross-validation.

    For each of k splits (folds) of the data, trains and tests a model on that split
    and aggregates the performance across folds.

    :param args: A :class:`~chemprop.args.TrainArgs` object containing arguments for
                 loading data and training the Chemprop model.
    :param train_func: Function which runs training.
    :return: A tuple containing the mean and standard deviation performance across folds.
    """
    logger = create_logger(name=TRAIN_LOGGER_NAME, save_dir=args.save_dir, quiet=args.quiet)
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir
    args.task_names = get_task_names(path=args.data_path,
                                     smiles_columns=args.smiles_columns,
                                     target_columns=args.target_columns,
                                     ignore_columns=args.ignore_columns)

    # Print command line
    debug('Command line')
    debug(f'python {" ".join(sys.argv)}')

    # Print args
    debug('Args')
    debug(args)

    # Save args
    makedirs(args.save_dir)
    try:
        args.save(os.path.join(args.save_dir, 'args.json'))
    except subprocess.CalledProcessError:
        debug('Could not write the reproducibility section of the arguments to file, thus omitting this section.')
        args.save(os.path.join(args.save_dir, 'args.json'), with_reproducibility=False)

    # set explicit H option and reaction option
    reset_featurization_parameters(logger=logger)
    set_explicit_h(args.explicit_h)
    set_adding_hs(args.adding_h)
    if args.reaction:
        set_reaction(args.reaction, args.reaction_mode)
    elif args.reaction_solvent:
        set_reaction(True, args.reaction_mode)

    # Get data
    debug('Loading data')
    data = get_data(
        path=args.data_path,
        args=args,
        logger=logger,
        skip_none_targets=True,
        data_weights_path=args.data_weights_path
    )
    validate_dataset_type(data, dataset_type=args.dataset_type)
    args.features_size = data.features_size()

    if args.atom_descriptors == 'descriptor':
        args.atom_descriptors_size = data.atom_descriptors_size()
        args.ffn_hidden_size += args.atom_descriptors_size
    elif args.atom_descriptors == 'feature':
        args.atom_features_size = data.atom_features_size()
        set_extra_atom_fdim(args.atom_features_size)
    if args.bond_features_path is not None:
        args.bond_features_size = data.bond_features_size()
        set_extra_bond_fdim(args.bond_features_size)

    debug(f'Number of tasks = {args.num_tasks}')

    if args.target_weights is not None and len(args.target_weights) != args.num_tasks:
        raise ValueError('The number of provided target weights must match the number and order of the prediction tasks')

    # Run training on different random seeds for each fold
    all_scores = defaultdict(list)
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        data.reset_features_and_targets()

        # If resuming experiment, load results from trained models
        test_scores_path = os.path.join(args.save_dir, 'test_scores.json')
        if args.resume_experiment and os.path.exists(test_scores_path):
            print('Loading scores')
            with open(test_scores_path) as f:
                model_scores = json.load(f)
        # Otherwise, train the models
        else:
            model_scores = train_func(args, data, logger)

        for metric, scores in model_scores.items():
            all_scores[metric].append(scores)
    all_scores = dict(all_scores)

    # Convert scores to numpy arrays
    for metric, scores in all_scores.items():
        all_scores[metric] = np.array(scores)

    # Report results
    info(f'{args.num_folds}-fold cross validation')

    # Report scores for each fold
    contains_nan_scores = False
    for fold_num in range(args.num_folds):
        for metric, scores in all_scores.items():
            info(f'\tSeed {init_seed + fold_num} ==> test {metric} = {multitask_mean(scores[fold_num], metric):.6f}')

            if args.show_individual_scores:
                for task_name, score in zip(args.task_names, scores[fold_num]):
                    info(f'\t\tSeed {init_seed + fold_num} ==> test {task_name} {metric} = {score:.6f}')
                    if np.isnan(score):
                        contains_nan_scores = True

    # Report scores across folds
    for metric, scores in all_scores.items():
        avg_scores = multitask_mean(scores, axis=1, metric=metric)  # average score for each model across tasks
        mean_score, std_score = np.mean(avg_scores), np.std(avg_scores)
        info(f'Overall test {metric} = {mean_score:.6f} +/- {std_score:.6f}')

        if args.show_individual_scores:
            for task_num, task_name in enumerate(args.task_names):
                info(f'\tOverall test {task_name} {metric} = '
                     f'{np.mean(scores[:, task_num]):.6f} +/- {np.std(scores[:, task_num]):.6f}')

    if contains_nan_scores:
        info("The metric scores observed for some fold test splits contain 'nan' values. \
            This can occur when the test set does not meet the requirements \
            for a particular metric, such as having no valid instances of one \
            task in the test set or not having positive examples for some classification metrics. \
            Before v1.5.1, the default behavior was to ignore nan values in individual folds or tasks \
            and still return an overall average for the remaining folds or tasks. The behavior now \
            is to include them in the average, converting overall average metrics to 'nan' as well.")

    # Save scores
    with open(os.path.join(save_dir, TEST_SCORES_FILE_NAME), 'w') as f:
        writer = csv.writer(f)

        header = ['Task']
        for metric in args.metrics:
            header += [f'Mean {metric}', f'Standard deviation {metric}'] + \
                      [f'Fold {i} {metric}' for i in range(args.num_folds)]
        writer.writerow(header)

        if args.dataset_type == 'spectra':  # spectra data type has only one score to report
            row = ['spectra']
            for metric, scores in all_scores.items():
                task_scores = scores[:, 0]
                mean, std = np.mean(task_scores), np.std(task_scores)
                row += [mean, std] + task_scores.tolist()
            writer.writerow(row)
        else:  # all other data types, separate scores by task
            for task_num, task_name in enumerate(args.task_names):
                row = [task_name]
                for metric, scores in all_scores.items():
                    task_scores = scores[:, task_num]
                    mean, std = np.mean(task_scores), np.std(task_scores)
                    row += [mean, std] + task_scores.tolist()
                writer.writerow(row)

    # Determine mean and std score of main metric
    avg_scores = multitask_mean(all_scores[args.metric], metric=args.metric, axis=1)
    mean_score, std_score = np.mean(avg_scores), np.std(avg_scores)

    # Optionally merge and save test preds
    if args.save_preds:
        all_preds = pd.concat([pd.read_csv(os.path.join(save_dir, f'fold_{fold_num}', 'test_preds.csv'))
                               for fold_num in range(args.num_folds)])
        all_preds.to_csv(os.path.join(save_dir, 'test_preds.csv'), index=False)

    return mean_score, std_score
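# --- Hedged usage sketch (not from the original sources) ---
# Illustrates the cross_validate(args, train_func) variant above, mirroring the call made
# inside the hyperopt functions below (cross_validate(args=..., train_func=run_training)).
# The import path for run_training is an assumption; paths and flags are illustrative.
from chemprop.args import TrainArgs
from chemprop.train import cross_validate, run_training  # assumed import path

train_args = TrainArgs().parse_args([
    '--data_path', 'chemprop_in.csv',
    '--dataset_type', 'regression',
    '--save_dir', 'models',
])
mean_score, std_score = cross_validate(args=train_args, train_func=run_training)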
import pandas as pd
import glob
import os

from chemprop.args import TrainArgs
from chemprop.train import cross_validate
from chemprop.utils import create_logger

csvs = glob.glob(os.path.join('../data/tmprss2_meyer_et_al/', '*.csv'))
raw_data = pd.concat((pd.read_csv(f) for f in csvs))
chemprop_data = raw_data[['SMILES', 'Activity']]
chemprop_data.to_csv('chemprop_in.csv', index=False)

# Argument parsing is set up for the command line, so build the argument list by hand
args = TrainArgs().parse_args(['--data_path', 'chemprop_in.csv',
                               '--dataset_type', 'regression',
                               '--save_dir', 'models'])
logger = create_logger(name='train', save_dir=args.save_dir, quiet=args.quiet)
cross_validate(args, logger)
def grid_search(args: Namespace):
    # Create loggers
    logger = create_logger(name='hyperparameter_optimization', save_dir=args.log_dir, quiet=True)
    train_logger = create_logger(name='train', save_dir=args.save_dir, quiet=args.quiet)

    # Run grid search
    results = []

    # Define hyperparameter optimization
    def objective(hyperparams: Dict[str, Union[int, float]]) -> float:
        # Convert hyperparams from float to int when necessary
        for key in INT_KEYS:
            hyperparams[key] = int(hyperparams[key])

        # Update args with hyperparams
        hyper_args = deepcopy(args)
        if args.save_dir is not None:
            folder_name = '_'.join(f'{key}_{value}' for key, value in hyperparams.items())
            hyper_args.save_dir = os.path.join(hyper_args.save_dir, folder_name)
        for key, value in hyperparams.items():
            setattr(hyper_args, key, value)

        # Record hyperparameters
        logger.info(hyperparams)

        # Cross validate
        mean_score, std_score = cross_validate(hyper_args, train_logger)

        # Record results
        temp_model = build_model(hyper_args)
        num_params = param_count(temp_model)
        logger.info(f'num params: {num_params:,}')
        logger.info(f'{mean_score} +/- {std_score} {hyper_args.metric}')

        results.append({
            'mean_score': mean_score,
            'std_score': std_score,
            'hyperparams': hyperparams,
            'num_params': num_params
        })

        # Deal with nan
        if np.isnan(mean_score):
            if hyper_args.dataset_type == 'classification':
                mean_score = 0
            else:
                raise ValueError('Can\'t handle nan score for non-classification dataset.')

        return (1 if hyper_args.minimize_score else -1) * mean_score

    fmin(objective, SPACE, algo=tpe.suggest, max_evals=args.num_iters)

    # Report best result
    results = [result for result in results if not np.isnan(result['mean_score'])]
    best_result = min(results, key=lambda result: (1 if args.minimize_score else -1) * result['mean_score'])
    logger.info('best')
    logger.info(best_result['hyperparams'])
    logger.info(f'num params: {best_result["num_params"]:,}')
    logger.info(f'{best_result["mean_score"]} +/- {best_result["std_score"]} {args.metric}')

    # Save best hyperparameter settings as JSON config file
    makedirs(args.config_save_path, isfile=True)

    with open(args.config_save_path, 'w') as f:
        json.dump(best_result['hyperparams'], f, indent=4, sort_keys=True)
def hyperopt(args: HyperoptArgs) -> None:
    """
    Runs hyperparameter optimization on a Chemprop model.

    Hyperparameter optimization optimizes the following parameters:

    * :code:`hidden_size`: The hidden size of the neural network layers is selected from {300, 400, ..., 2400}
    * :code:`depth`: The number of message passing iterations is selected from {2, 3, 4, 5, 6}
    * :code:`dropout`: The dropout probability is selected from {0.0, 0.05, ..., 0.4}
    * :code:`ffn_num_layers`: The number of feed-forward layers after message passing is selected from {1, 2, 3}

    The best set of hyperparameters is saved as a JSON file to :code:`args.config_save_path`.

    :param args: A :class:`~chemprop.args.HyperoptArgs` object containing arguments for hyperparameter
                 optimization in addition to all arguments needed for training.
    """
    # Create logger
    logger = create_logger(name=HYPEROPT_LOGGER_NAME, save_dir=args.log_dir, quiet=True)

    # Run grid search
    results = []

    # Define hyperparameter optimization
    def objective(hyperparams: Dict[str, Union[int, float]]) -> float:
        # Convert hyperparams from float to int when necessary
        for key in INT_KEYS:
            hyperparams[key] = int(hyperparams[key])

        # Copy args
        hyper_args = deepcopy(args)

        # Update args with hyperparams
        if args.save_dir is not None:
            folder_name = '_'.join(f'{key}_{value}' for key, value in hyperparams.items())
            hyper_args.save_dir = os.path.join(hyper_args.save_dir, folder_name)

        for key, value in hyperparams.items():
            setattr(hyper_args, key, value)

        hyper_args.ffn_hidden_size = hyper_args.hidden_size

        # Record hyperparameters
        logger.info(hyperparams)

        # Cross validate
        mean_score, std_score = cross_validate(args=hyper_args, train_func=run_training)

        # Record results
        temp_model = MoleculeModel(hyper_args)
        num_params = param_count(temp_model)
        logger.info(f'num params: {num_params:,}')
        logger.info(f'{mean_score} +/- {std_score} {hyper_args.metric}')

        results.append({
            'mean_score': mean_score,
            'std_score': std_score,
            'hyperparams': hyperparams,
            'num_params': num_params
        })

        # Deal with nan
        if np.isnan(mean_score):
            if hyper_args.dataset_type == 'classification':
                mean_score = 0
            else:
                raise ValueError('Can\'t handle nan score for non-classification dataset.')

        return (1 if hyper_args.minimize_score else -1) * mean_score

    fmin(objective, SPACE, algo=tpe.suggest, max_evals=args.num_iters, rstate=np.random.RandomState(args.seed))

    # Report best result
    results = [result for result in results if not np.isnan(result['mean_score'])]
    best_result = min(results, key=lambda result: (1 if args.minimize_score else -1) * result['mean_score'])
    logger.info('best')
    logger.info(best_result['hyperparams'])
    logger.info(f'num params: {best_result["num_params"]:,}')
    logger.info(f'{best_result["mean_score"]} +/- {best_result["std_score"]} {args.metric}')

    # Save best hyperparameter settings as JSON config file
    makedirs(args.config_save_path, isfile=True)

    with open(args.config_save_path, 'w') as f:
        json.dump(best_result['hyperparams'], f, indent=4, sort_keys=True)
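# --- Hedged usage sketch (not from the original sources) ---
# Shows how the hyperopt function above might be invoked directly. The flag spellings are
# inferred from the HyperoptArgs attributes referenced in the function (num_iters,
# config_save_path) and are assumptions that may differ between Chemprop versions.
from chemprop.args import HyperoptArgs

hyperopt_args = HyperoptArgs().parse_args([
    '--data_path', 'chemprop_in.csv',              # illustrative dataset
    '--dataset_type', 'regression',
    '--num_iters', '20',                           # number of hyperparameter trials
    '--config_save_path', 'best_hyperparams.json',
])
hyperopt(hyperopt_args)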
def cross_validate(args: TrainArgs,
                   train_func: Callable[[TrainArgs, MoleculeDataset, Logger], Dict[str, List[float]]]
                   ) -> Tuple[float, float]:
    """
    Runs k-fold cross-validation.

    For each of k splits (folds) of the data, trains and tests a model on that split
    and aggregates the performance across folds.

    :param args: A :class:`~chemprop.args.TrainArgs` object containing arguments for
                 loading data and training the Chemprop model.
    :param train_func: Function which runs training.
    :return: A tuple containing the mean and standard deviation performance across folds.
    """
    logger = create_logger(name=TRAIN_LOGGER_NAME, save_dir=args.save_dir, quiet=args.quiet)
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir
    args.task_names = get_task_names(path=args.data_path,
                                     smiles_column=args.smiles_column,
                                     target_columns=args.target_columns,
                                     ignore_columns=args.ignore_columns)

    # Print command line
    debug('Command line')
    debug(f'python {" ".join(sys.argv)}')

    # Print args
    debug('Args')
    debug(args)

    # Save args
    args.save(os.path.join(args.save_dir, 'args.json'))

    # Get data
    debug('Loading data')
    data = get_data(path=args.data_path, args=args, logger=logger, skip_none_targets=True)
    validate_dataset_type(data, dataset_type=args.dataset_type)
    args.features_size = data.features_size()
    debug(f'Number of tasks = {args.num_tasks}')

    # Run training on different random seeds for each fold
    all_scores = defaultdict(list)
    for fold_num in range(args.num_folds):
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores = train_func(args, deepcopy(data), logger)  # deepcopy since data may be modified

        for metric, scores in model_scores.items():
            all_scores[metric].append(scores)
    all_scores = dict(all_scores)

    # Convert scores to numpy arrays
    for metric, scores in all_scores.items():
        all_scores[metric] = np.array(scores)

    # Report results
    info(f'{args.num_folds}-fold cross validation')

    # Report scores for each fold
    for fold_num in range(args.num_folds):
        for metric, scores in all_scores.items():
            info(f'\tSeed {init_seed + fold_num} ==> test {metric} = {np.nanmean(scores[fold_num]):.6f}')

            if args.show_individual_scores:
                for task_name, score in zip(args.task_names, scores[fold_num]):
                    info(f'\t\tSeed {init_seed + fold_num} ==> test {task_name} {metric} = {score:.6f}')

    # Report scores across folds
    for metric, scores in all_scores.items():
        avg_scores = np.nanmean(scores, axis=1)  # average score for each model across tasks
        mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)
        info(f'Overall test {metric} = {mean_score:.6f} +/- {std_score:.6f}')

        if args.show_individual_scores:
            for task_num, task_name in enumerate(args.task_names):
                info(f'\tOverall test {task_name} {metric} = '
                     f'{np.nanmean(scores[:, task_num]):.6f} +/- {np.nanstd(scores[:, task_num]):.6f}')

    # Save scores
    with open(os.path.join(save_dir, TEST_SCORES_FILE_NAME), 'w') as f:
        writer = csv.writer(f)

        header = ['Task']
        for metric in args.metrics:
            header += [f'Mean {metric}', f'Standard deviation {metric}'] + \
                      [f'Fold {i} {metric}' for i in range(args.num_folds)]
        writer.writerow(header)

        for task_num, task_name in enumerate(args.task_names):
            row = [task_name]
            for metric, scores in all_scores.items():
                task_scores = scores[:, task_num]
                mean, std = np.nanmean(task_scores), np.nanstd(task_scores)
                row += [mean, std] + task_scores.tolist()
            writer.writerow(row)

    # Determine mean and std score of main metric
    avg_scores = np.nanmean(all_scores[args.metric], axis=1)
    mean_score, std_score = np.nanmean(avg_scores), np.nanstd(avg_scores)

    # Optionally merge and save test preds
    if args.save_preds:
        all_preds = pd.concat([pd.read_csv(os.path.join(save_dir, f'fold_{fold_num}', 'test_preds.csv'))
                               for fold_num in range(args.num_folds)])
        all_preds.to_csv(os.path.join(save_dir, 'test_preds.csv'), index=False)

    return mean_score, std_score
def hyperopt(args: HyperoptArgs) -> None:
    """
    Runs hyperparameter optimization on a Chemprop model.

    Hyperparameter optimization optimizes the following parameters:

    * :code:`hidden_size`: The hidden size of the neural network layers is selected from {300, 400, ..., 2400}
    * :code:`depth`: The number of message passing iterations is selected from {2, 3, 4, 5, 6}
    * :code:`dropout`: The dropout probability is selected from {0.0, 0.05, ..., 0.4}
    * :code:`ffn_num_layers`: The number of feed-forward layers after message passing is selected from {1, 2, 3}

    The best set of hyperparameters is saved as a JSON file to :code:`args.config_save_path`.

    :param args: A :class:`~chemprop.args.HyperoptArgs` object containing arguments for hyperparameter
                 optimization in addition to all arguments needed for training.
    """
    # Create logger
    logger = create_logger(name=HYPEROPT_LOGGER_NAME, save_dir=args.log_dir, quiet=True)

    # Load in manual trials
    if args.manual_trial_dirs is not None:
        manual_trials = load_manual_trials(args.manual_trial_dirs, SPACE.keys(), args)
        logger.info(f'{len(manual_trials)} manual trials included in hyperparameter search.')
    else:
        manual_trials = None
        logger.info('No manual trials loaded as part of hyperparameter search')

    makedirs(args.hyperopt_checkpoint_dir)

    # Define hyperparameter optimization
    def objective(hyperparams: Dict[str, Union[int, float]], seed: int) -> Dict:
        # Convert hyperparams from float to int when necessary
        for key in INT_KEYS:
            hyperparams[key] = int(hyperparams[key])

        # Copy args
        hyper_args = deepcopy(args)

        # Update args with hyperparams
        if args.save_dir is not None:
            folder_name = '_'.join(f'{key}_{value}' for key, value in hyperparams.items())
            hyper_args.save_dir = os.path.join(hyper_args.save_dir, folder_name)

        for key, value in hyperparams.items():
            setattr(hyper_args, key, value)

        hyper_args.ffn_hidden_size = hyper_args.hidden_size

        # Cross validate
        mean_score, std_score = cross_validate(args=hyper_args, train_func=run_training)

        # Record results
        temp_model = MoleculeModel(hyper_args)
        num_params = param_count(temp_model)
        logger.info(f'Trial results with seed {seed}')
        logger.info(hyperparams)
        logger.info(f'num params: {num_params:,}')
        logger.info(f'{mean_score} +/- {std_score} {hyper_args.metric}')

        # Deal with nan
        if np.isnan(mean_score):
            if hyper_args.dataset_type == 'classification':
                mean_score = 0
            else:
                raise ValueError('Can\'t handle nan score for non-classification dataset.')

        loss = (1 if hyper_args.minimize_score else -1) * mean_score

        return {
            'loss': loss,
            'status': 'ok',
            'mean_score': mean_score,
            'std_score': std_score,
            'hyperparams': hyperparams,
            'num_params': num_params,
            'seed': seed,
        }

    # Iterate over a number of trials
    for i in range(args.num_iters):
        # run fmin and load trials in single steps to allow for parallel operation
        trials = load_trials(dir_path=args.hyperopt_checkpoint_dir, previous_trials=manual_trials)
        if len(trials) >= args.num_iters:
            break

        # Set a unique random seed for each trial. Pass it into objective function for logging purposes.
        hyperopt_seed = get_hyperopt_seed(seed=args.seed, dir_path=args.hyperopt_checkpoint_dir)
        fmin_objective = partial(objective, seed=hyperopt_seed)
        os.environ['HYPEROPT_FMIN_SEED'] = str(hyperopt_seed)  # this environment variable changes the seed in fmin

        # Log the start of the trial
        logger.info(f'Initiating trial with seed {hyperopt_seed}')
        logger.info(f'Loaded {len(trials)} previous trials')
        if len(trials) < args.startup_random_iters:
            random_remaining = args.startup_random_iters - len(trials)
            logger.info(f'Parameters assigned with random search, {random_remaining} random trials remaining')
        else:
            logger.info('Parameters assigned with TPE directed search')

        fmin(
            fmin_objective,
            SPACE,
            algo=partial(tpe.suggest, n_startup_jobs=args.startup_random_iters),
            max_evals=len(trials) + 1,
            trials=trials,
        )

        # Create a trials object with only the last instance by merging the last data with an empty trials object
        last_trial = merge_trials(Trials(), [trials.trials[-1]])
        save_trials(args.hyperopt_checkpoint_dir, last_trial, hyperopt_seed)

    # Report best result
    all_trials = load_trials(dir_path=args.hyperopt_checkpoint_dir, previous_trials=manual_trials)
    results = all_trials.results
    results = [result for result in results if not np.isnan(result['mean_score'])]
    best_result = min(results, key=lambda result: (1 if args.minimize_score else -1) * result['mean_score'])
    logger.info(f'Best trial, with seed {best_result["seed"]}')
    logger.info(best_result['hyperparams'])
    logger.info(f'num params: {best_result["num_params"]:,}')
    logger.info(f'{best_result["mean_score"]} +/- {best_result["std_score"]} {args.metric}')

    # Save best hyperparameter settings as JSON config file
    makedirs(args.config_save_path, isfile=True)

    with open(args.config_save_path, 'w') as f:
        json.dump(best_result['hyperparams'], f, indent=4, sort_keys=True)
if __name__ == '__main__':
    parser = ArgumentParser()
    add_train_args(parser)
    parser.add_argument('--class_weight', type=str, choices=['balanced'],
                        help='How to weight classes (None means no class balance)')
    parser.add_argument('--single_task', action='store_true', default=False,
                        help='Whether to run each task separately (needed when dataset has null entries)')
    parser.add_argument('--radius', type=int, default=2,
                        help='Morgan fingerprint radius')
    parser.add_argument('--num_bits', type=int, default=2048,
                        help='Number of bits in Morgan fingerprint')
    parser.add_argument('--num_trees', type=int, default=500,
                        help='Number of random forest trees')
    args = parser.parse_args()

    modify_train_args(args)

    logger = create_logger(name='random_forest', save_dir=args.save_dir, quiet=args.quiet)

    if args.metric is None:
        if args.dataset_type == 'regression':
            args.metric = 'rmse'
        elif args.dataset_type == 'classification':
            args.metric = 'auc'
        else:
            raise ValueError(f'Default metric not supported for dataset_type "{args.dataset_type}"')

    cross_validate_random_forest(args, logger)