def test_save_smiles_splits(self):
    """Train with --save_smiles_splits enabled; any exception fails the test."""
    try:
        self.args.save_smiles_splits = True
        modify_train_args(self.args)
        cross_validate(self.args, self.logger)
    except Exception as e:
        # Catch Exception, not bare `except:`, so SystemExit/KeyboardInterrupt
        # still propagate; include the cause for easier debugging.
        self.fail(f'save smiles splits: {e}')
def test_bias(self):
    """Train with bias terms enabled in linear layers; any exception fails the test."""
    try:
        self.args.bias = True
        modify_train_args(self.args)
        cross_validate(self.args, self.logger)
    except Exception as e:
        # Narrowed from bare `except:` so interrupts propagate; report the cause.
        self.fail(f'bias: {e}')
def test_show_individual_scores(self):
    """Train with per-task score reporting enabled; any exception fails the test."""
    try:
        self.args.show_individual_scores = True
        modify_train_args(self.args)
        cross_validate(self.args, self.logger)
    except Exception as e:
        # Narrowed from bare `except:` so interrupts propagate; report the cause.
        self.fail(f'show_individual_scores: {e}')
def test_rdkit_2d_features_unnormalized(self):
    """Train with unnormalized RDKit 2D features; any exception fails the test."""
    try:
        self.args.features_generator = ['rdkit_2d']
        modify_train_args(self.args)
        cross_validate(self.args, self.logger)
    except Exception as e:
        # Narrowed from bare `except:` so interrupts propagate; report the cause.
        self.fail(f'rdkit_2d_features_unnormalized: {e}')
def test_no_cache(self):
    """Train with molecule caching disabled; any exception fails the test."""
    try:
        self.args.no_cache = True
        modify_train_args(self.args)
        cross_validate(self.args, self.logger)
    except Exception as e:
        # Narrowed from bare `except:` so interrupts propagate; report the cause.
        self.fail(f'no_cache: {e}')
def test_undirected_messages(self):
    """Train with undirected message passing; any exception fails the test."""
    try:
        self.args.undirected = True
        modify_train_args(self.args)
        cross_validate(self.args, self.logger)
    except Exception as e:
        # Narrowed from bare `except:` so interrupts propagate; report the cause.
        self.fail(f'undirected_messages: {e}')
def test_activation_prelu(self):
    """Train with the PReLU activation function; any exception fails the test."""
    try:
        self.args.activation = 'PReLU'
        modify_train_args(self.args)
        cross_validate(self.args, self.logger)
    except Exception as e:
        # Narrowed from bare `except:` so interrupts propagate; report the cause.
        self.fail(f'activation_prelu: {e}')
def test_hyperopt(self):
    """Smoke-test hyperparameter grid search on the toy regression dataset.

    Builds a fresh set of training args (plus the extra hyperopt-specific
    flags), points the config output at a temporary file, and runs a short
    3-iteration search. Any exception fails the test.
    """
    try:
        parser = ArgumentParser()
        add_train_args(parser)
        parser.add_argument('--num_iters', type=int, default=20,
                            help='Number of hyperparameter choices to try')
        parser.add_argument('--config_save_path', type=str,
                            help='Path to .json file where best hyperparameter settings will be written')
        parser.add_argument('--log_dir', type=str,
                            help='(Optional) Path to a directory where all results of the hyperparameter optimization will be written')
        args = parser.parse_args([])
        args.data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'delaney_toy.csv')
        args.dataset_type = 'regression'
        args.batch_size = 2
        args.hidden_size = 5
        args.epochs = 1
        args.quiet = True
        args.num_iters = 3
        # Context manager guarantees the temp file is closed and deleted,
        # unlike the previous bare NamedTemporaryFile() reference.
        with NamedTemporaryFile() as temp_file:
            args.config_save_path = temp_file.name
            modify_train_args(args)
            grid_search(args)
        clear_cache()
    except Exception as e:
        # Narrowed from bare `except:` so interrupts propagate; report the cause.
        self.fail(f'hyperopt: {e}')
def test_atom_messages(self):
    """Train with atom-centered (rather than bond-centered) messages; any exception fails the test."""
    try:
        self.args.atom_messages = True
        modify_train_args(self.args)
        cross_validate(self.args, self.logger)
    except Exception as e:
        # Narrowed from bare `except:` so interrupts propagate; report the cause.
        self.fail(f'atom_messages: {e}')
def test_config(self):
    """Train with hyperparameters loaded from a JSON config file; any exception fails the test."""
    try:
        self.args.config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.json')
        modify_train_args(self.args)
        cross_validate(self.args, self.logger)
    except Exception as e:
        # Narrowed from bare `except:` so interrupts propagate; report the cause.
        self.fail(f'config: {e}')
def setUp(self):
    """Train a tiny model on the toy regression dataset, then build
    prediction args (checkpoint dir, preds path, test smiles) for the tests.
    """
    here = os.path.dirname(os.path.abspath(__file__))

    # --- Train a quick model so checkpoints exist for prediction ---
    train_parser = ArgumentParser()
    add_train_args(train_parser)
    train_args = train_parser.parse_args([])
    train_args.data_path = os.path.join(here, 'delaney_toy.csv')
    train_args.dataset_type = 'regression'
    train_args.batch_size = 2
    train_args.hidden_size = 5
    train_args.epochs = 1
    train_args.quiet = True
    self.temp_dir = TemporaryDirectory()
    train_args.save_dir = self.temp_dir.name
    logger = create_logger(name='train', save_dir=train_args.save_dir, quiet=train_args.quiet)
    modify_train_args(train_args)
    cross_validate(train_args, logger)
    clear_cache()

    # --- Build the prediction args the test methods will use ---
    predict_parser = ArgumentParser()
    add_predict_args(predict_parser)
    predict_args = predict_parser.parse_args([])
    predict_args.batch_size = 2
    predict_args.checkpoint_dir = self.temp_dir.name
    predict_args.preds_path = NamedTemporaryFile().name
    predict_args.test_path = os.path.join(here, 'delaney_toy_smiles.csv')
    self.args = predict_args
def test_scaffold(self):
    """Train with a scaffold-balanced data split; any exception fails the test."""
    try:
        self.args.split_type = 'scaffold_balanced'
        modify_train_args(self.args)
        cross_validate(self.args, self.logger)
    except Exception as e:
        # Narrowed from bare `except:` so interrupts propagate; report the cause.
        self.fail(f'scaffold: {e}')
def run_comparison(experiment_args: Namespace, logger: logging.Logger, features_dir: str = None):
    """Cross-validate on every dataset named in ``experiment_args.datasets``.

    For each dataset, the shared args are cloned and specialized (data path,
    dataset type, save dir, fold count, metric, optional precomputed
    features), training logs are routed to a per-dataset file, and the mean
    +/- std score and parameter count are reported via ``logger``.
    """
    for dataset_name in experiment_args.datasets:
        dataset_type, dataset_path, num_folds, metric = DATASETS[dataset_name]
        logger.info(dataset_name)

        # Clone the shared args and specialize them for this dataset.
        args = deepcopy(experiment_args)
        args.data_path = dataset_path
        args.dataset_type = dataset_type
        args.save_dir = os.path.join(args.save_dir, dataset_name)
        args.num_folds = num_folds
        args.metric = metric
        if features_dir is not None:
            args.features_path = [os.path.join(features_dir, dataset_name + '.pckl')]
        modify_train_args(args)

        # Route DEBUG-level training output to a per-dataset log file.
        os.makedirs(args.save_dir, exist_ok=True)
        file_handler = logging.FileHandler(os.path.join(args.save_dir, args.log_name))
        file_handler.setLevel(logging.DEBUG)

        # Attach the handler only for the duration of this dataset's run.
        TRAIN_LOGGER.addHandler(file_handler)
        mean_score, std_score = cross_validate(args, TRAIN_LOGGER)
        TRAIN_LOGGER.removeHandler(file_handler)

        # Record results and model size.
        logger.info(f'{mean_score} +/- {std_score} {metric}')
        throwaway_model = build_model(args)
        logger.info(f'num params: {param_count(throwaway_model):,}')
def test_classification_multiclass_default(self):
    """Train a default classification model on the toy Tox21 data; any exception fails the test."""
    try:
        self.args.data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tox21_toy.csv')
        self.args.dataset_type = 'classification'
        modify_train_args(self.args)
        cross_validate(self.args, self.logger)
    except Exception as e:
        # Narrowed from bare `except:`; failure label now matches the method
        # name (was the misleading 'classification_default').
        self.fail(f'classification_multiclass_default: {e}')
def test_rdkit_2d_features(self):
    """Train with normalized RDKit 2D features (feature scaling off); any exception fails the test."""
    try:
        self.args.features_generator = ['rdkit_2d_normalized']
        self.args.no_features_scaling = True
        modify_train_args(self.args)
        cross_validate(self.args, self.logger)
    except Exception as e:
        # Narrowed from bare `except:` so interrupts propagate; report the cause.
        self.fail(f'rdkit_2d_features: {e}')
def test_features_path(self):
    """Train with precomputed features loaded from an .npz file; any exception fails the test."""
    try:
        self.args.features_path = [os.path.join(os.path.dirname(os.path.abspath(__file__)), 'delaney_toy_features.npz')]
        self.args.no_features_scaling = True
        modify_train_args(self.args)
        cross_validate(self.args, self.logger)
    except Exception as e:
        # Narrowed from bare `except:` so interrupts propagate; report the cause.
        self.fail(f'features_path: {e}')
def test_features_only(self):
    """Train using Morgan fingerprint features only (no message passing); any exception fails the test."""
    try:
        self.args.features_generator = ['morgan']
        # BUG FIX: was `self.features_only = True`, which set an unused
        # attribute on the TestCase and silently never enabled the option.
        self.args.features_only = True
        modify_train_args(self.args)
        cross_validate(self.args, self.logger)
    except Exception as e:
        # Narrowed from bare `except:` so interrupts propagate; report the cause.
        self.fail(f'features_only: {e}')
def test_num_folds_ensemble(self):
    """Train with 2 cross-validation folds and an ensemble of 2 models; any exception fails the test."""
    try:
        self.args.num_folds = 2
        self.args.ensemble_size = 2
        modify_train_args(self.args)
        cross_validate(self.args, self.logger)
    except Exception as e:
        # Narrowed from bare `except:` so interrupts propagate; report the cause.
        self.fail(f'num_folds_ensemble: {e}')
def test_predetermined_split(self):
    """Train with a predetermined fold split loaded from a pickle file; any exception fails the test."""
    try:
        self.args.split_type = 'predetermined'
        self.args.folds_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'delaney_toy_folds.pkl')
        self.args.val_fold_index = 1
        self.args.test_fold_index = 2
        modify_train_args(self.args)
        cross_validate(self.args, self.logger)
    except Exception as e:
        # Narrowed from bare `except:` so interrupts propagate; report the cause.
        self.fail(f'predetermined_split: {e}')
def test_checkpoint(self):
    """Train, then re-run in test mode from the saved checkpoints; any exception fails the test."""
    try:
        args_copy = deepcopy(self.args)
        temp_dir = TemporaryDirectory()
        # First run: train and save checkpoints into the temp dir.
        self.args.save_dir = temp_dir.name
        modify_train_args(self.args)
        cross_validate(self.args, self.logger)
        # Second run: evaluate from those checkpoints.
        args_copy.checkpoint_dir = temp_dir.name
        args_copy.test = True
        modify_train_args(args_copy)
        cross_validate(args_copy, self.logger)
    except Exception as e:
        # Narrowed from bare `except:` so interrupts propagate; report the cause.
        self.fail(f'checkpoint: {e}')
# NOTE(review): the lines up to `return avg_test_score` are the tail of a
# training function whose `def` is outside this view (presumably a
# run_training(args, save_dir)-style function) — confirm against the full file.

# Evaluate the best saved model on the held-out test set.
model = load_checkpoint(os.path.join(save_dir, 'model.pt'), cuda=args.cuda)
test_smiles, test_targets = test_data.smiles(), test_data.targets()
test_preds = predict(model, test_data, args.batch_size)
test_scores = evaluate_predictions(test_preds, test_targets, args.num_tasks, metric_func, args.dataset_type)
# Average across tasks, ignoring NaN scores from tasks with no valid labels.
avg_test_score = np.nanmean(test_scores)
print(f'Test {args.metric} = {avg_test_score:.4f}')
return avg_test_score


if __name__ == "__main__":
    parser = ArgumentParser()
    # Extra flags beyond the standard training args (lambda_e presumably a
    # domain-adaptation weight — confirm with the training code that reads it).
    parser.add_argument('--source_data_path', required=True)
    parser.add_argument('--src_batch_size', type=int, default=100)
    parser.add_argument('--lambda_e', type=float, default=0.1)
    add_train_args(parser)
    args = parser.parse_args()
    modify_train_args(args)
    # Train once per fold, collecting the test score from each run.
    all_test_score = np.zeros((args.num_folds, ))
    for i in range(args.num_folds):
        fold_dir = os.path.join(args.save_dir, f'fold_{i}')
        makedirs(fold_dir)
        all_test_score[i] = run_training(args, fold_dir)
    mean, std = np.mean(all_test_score), np.std(all_test_score)
    print(f'{args.num_folds} fold average: {mean:.4f} +/- {std:.4f}')
def train():
    """Render the train page (GET) or run model training (POST).

    On POST: reads dataset name, epoch count, checkpoint name, GPU choice and
    dataset type from the form, validates the training labels, trains in a
    temporary directory while a separate process reports progress, then moves
    the resulting checkpoint into the app's checkpoint folder. Accumulated
    warnings are passed back to the template via the module-level
    ``training_message`` global.
    """
    global training_message
    if request.method == 'GET':
        return render_template('train.html',
                               datasets=get_datasets(),
                               started=False,
                               cuda=app.config['CUDA'],
                               gpus=app.config['GPUS'])

    # Get arguments
    data_name, epochs, checkpoint_name = \
        request.form['dataName'], int(request.form['epochs']), request.form['checkpointName']
    gpu = request.form.get('gpu', None)
    dataset_type = request.form.get('datasetType', 'regression')

    # Normalize the checkpoint file name to a .pt extension.
    if not checkpoint_name.endswith('.pt'):
        checkpoint_name += '.pt'

    # Create and modify args
    parser = ArgumentParser()
    add_train_args(parser)
    args = parser.parse_args()
    args.data_path = os.path.join(app.config['DATA_FOLDER'], data_name)
    args.dataset_type = dataset_type
    args.epochs = epochs

    # Validate labels: reject empty/invalid label sets and a dataset-type
    # choice that does not match the label values.
    target_set, all_targets_have_labels, has_invalid_targets = get_target_set(args.data_path)
    if len(target_set) == 0:
        return render_template('train.html',
                               datasets=get_datasets(),
                               started=False,
                               cuda=app.config['CUDA'],
                               gpus=app.config['GPUS'],
                               error="No training labels provided")
    if has_invalid_targets:
        return render_template('train.html',
                               datasets=get_datasets(),
                               started=False,
                               cuda=app.config['CUDA'],
                               gpus=app.config['GPUS'],
                               error="Training data contains invalid labels")
    classification_on_regression_dataset = ((not target_set <= set([0, 1])) and args.dataset_type == 'classification')
    if classification_on_regression_dataset:
        return render_template('train.html',
                               datasets=get_datasets(),
                               started=False,
                               cuda=app.config['CUDA'],
                               gpus=app.config['GPUS'],
                               error='Selected classification dataset, but not all labels are 0 or 1')
    regression_on_classification_dataset = (target_set <= set([0, 1]) and args.dataset_type == 'regression')
    if not all_targets_have_labels:
        training_message += 'One or more targets have no labels. \n'
    # TODO could have separate warning messages for each?
    if regression_on_classification_dataset:
        training_message += 'All labels are 0 or 1; did you mean to train classification instead of regression?\n'

    # 'None' (the string) means CPU; otherwise select the given GPU index.
    if gpu is not None:
        if gpu == 'None':
            args.no_cuda = True
        else:
            args.gpu = int(gpu)

    with TemporaryDirectory() as temp_dir:
        args.save_dir = temp_dir
        modify_train_args(args)
        if os.path.isdir(args.save_dir):
            training_message += 'Overwriting preexisting checkpoint with the same name.'
        logger = logging.getLogger('train')
        logger.setLevel(logging.DEBUG)
        logger.propagate = False
        set_logger(logger, args.save_dir, args.quiet)

        # Progress is reported from a separate process via shared globals.
        global progress
        process = mp.Process(target=progress_bar, args=(args, progress))
        process.start()
        global started
        started = 1

        # Run training
        run_training(args, logger)
        process.join()

        # reset globals
        started = 0
        progress = mp.Value('d', 0.0)

        # Move checkpoint
        shutil.move(
            os.path.join(args.save_dir, 'model_0', 'model.pt'),
            os.path.join(app.config['CHECKPOINT_FOLDER'], checkpoint_name))

    # Surface accumulated warnings once, then clear them for the next request.
    warning = training_message if len(training_message) > 0 else None
    training_message = ""

    return render_template('train.html',
                           datasets=get_datasets(),
                           cuda=app.config['CUDA'],
                           gpus=app.config['GPUS'],
                           trained=True,
                           warning=warning)
def train():
    """Render the train page (GET) or run model training (POST).

    On POST: reads the form arguments, checks that the chosen dataset type
    (regression vs. classification) matches the label values, trains in a
    temporary directory with a separate progress-reporting process, then moves
    the resulting checkpoint to a collision-free path in the checkpoint folder.
    Warnings/errors are collected per-request and passed to the template.
    """
    global progress, training

    warnings, errors = [], []

    if request.method == 'GET':
        return render_train()

    # Get arguments
    data_name, epochs, checkpoint_name = \
        request.form['dataName'], int(request.form['epochs']), request.form['checkpointName']
    gpu = request.form.get('gpu')
    data_path = os.path.join(app.config['DATA_FOLDER'], data_name)
    dataset_type = request.form.get('datasetType', 'regression')

    # Normalize the checkpoint file name to a .pt extension.
    if not checkpoint_name.endswith('.pt'):
        checkpoint_name += '.pt'

    # Create and modify args
    parser = ArgumentParser()
    add_train_args(parser)
    args = parser.parse_args()
    args.data_path = data_path
    args.dataset_type = dataset_type
    args.epochs = epochs

    # Check if regression/classification selection matches data
    data = get_data(path=data_path)
    targets = data.targets()
    unique_targets = set(np.unique(targets))

    if dataset_type == 'classification' and len(unique_targets - {0, 1}) > 0:
        errors.append(
            'Selected classification dataset but not all labels are 0 or 1. Select regression instead.'
        )
        return render_train(warnings=warnings, errors=errors)

    if dataset_type == 'regression' and unique_targets <= {0, 1}:
        errors.append(
            'Selected regression dataset but all labels are 0 or 1. Select classification instead.'
        )
        return render_train(warnings=warnings, errors=errors)

    # 'None' (the string) means CPU; otherwise select the given GPU index.
    if gpu is not None:
        if gpu == 'None':
            args.no_cuda = True
        else:
            args.gpu = int(gpu)

    with TemporaryDirectory() as temp_dir:
        args.save_dir = temp_dir
        modify_train_args(args)

        logger = logging.getLogger('train')
        logger.setLevel(logging.DEBUG)
        logger.propagate = False
        set_logger(logger, args.save_dir, args.quiet)

        # Progress is reported from a separate process via shared globals.
        process = mp.Process(target=progress_bar, args=(args, progress))
        process.start()
        training = 1

        # Run training
        task_scores = run_training(args, logger)
        process.join()

        # Reset globals
        training = 0
        progress = mp.Value('d', 0.0)

        # Check if name overlap
        original_save_path = os.path.join(app.config['CHECKPOINT_FOLDER'], checkpoint_name)
        save_path = find_unique_path(original_save_path)
        if save_path != original_save_path:
            warnings.append(name_already_exists_message('Checkpoint', original_save_path, save_path))

        # Move checkpoint
        shutil.move(os.path.join(args.save_dir, 'model_0', 'model.pt'), save_path)

    return render_train(trained=True,
                        metric=args.metric,
                        num_tasks=len(args.task_names),
                        task_names=args.task_names,
                        task_scores=format_float_list(task_scores),
                        mean_score=format_float(np.mean(task_scores)),
                        warnings=warnings,
                        errors=errors)
def grid_search(args: Namespace):
    """Run hyperopt-based hyperparameter search over each named dataset.

    For every dataset in ``args.datasets``: builds dataset-specific training
    args, runs ``fmin`` with a closure (``objective``) that cross-validates
    each sampled hyperparameter setting, accumulates all results in a local
    list, and finally logs the best (non-NaN) result for that dataset.
    """
    for dataset_name in args.datasets:
        # Get dataset
        dataset_type, dataset_path, _, metric = DATASETS[dataset_name]

        # Create logger for dataset
        logger = create_logger(name=dataset_name,
                               save_dir=args.save_dir,
                               save_name='{}_{}.log'.format(dataset_name, args.split_type))

        # Set up args for dataset
        dataset_args = deepcopy(args)
        dataset_args.data_path = dataset_path
        dataset_args.dataset_type = dataset_type
        dataset_args.save_dir = None
        dataset_args.metric = metric
        modify_train_args(dataset_args)

        # Run grid search
        results = []

        # Define hyperparameter optimization
        def objective(hyperparams: Dict[str, Union[int, float]]) -> float:
            # Convert hyperparms from float to int when necessary
            # (hyperopt samples all values as floats).
            for key in INT_KEYS:
                hyperparams[key] = int(hyperparams[key])

            # Copy args so each trial starts from the clean dataset args.
            gs_args = deepcopy(dataset_args)
            for key, value in hyperparams.items():
                setattr(gs_args, key, value)

            # Record hyperparameters
            logger.info(hyperparams)

            # Cross validate
            mean_score, std_score = cross_validate(gs_args, TRAIN_LOGGER)

            # Record results
            temp_model = build_model(gs_args)
            num_params = param_count(temp_model)
            logger.info('num params: {:,}'.format(num_params))
            logger.info('{} +/- {} {}'.format(mean_score, std_score, metric))

            results.append({
                'mean_score': mean_score,
                'std_score': std_score,
                'hyperparams': hyperparams,
                'num_params': num_params
            })

            # Deal with nan: a NaN classification score is treated as the
            # worst possible value; elsewhere it's an unrecoverable error.
            if np.isnan(mean_score):
                if gs_args.dataset_type == 'classification':
                    mean_score = 0
                else:
                    raise ValueError(
                        'Can\'t handle nan score for non-classification dataset.'
                    )

            # fmin minimizes, so flip the sign for metrics to be maximized.
            return (1 if gs_args.minimize_score else -1) * mean_score

        fmin(objective, SPACE, algo=tpe.suggest, max_evals=args.num_runs_per_dataset)

        # Report best result (NaN trials are excluded).
        results = [
            result for result in results if not np.isnan(result['mean_score'])
        ]
        best_result = min(
            results,
            key=lambda result: (1 if dataset_args.minimize_score else -1) * result['mean_score'])
        logger.info('best')
        logger.info(best_result['hyperparams'])
        logger.info('num params: {:,}'.format(best_result['num_params']))
        logger.info('{} +/- {} {}'.format(best_result['mean_score'],
                                          best_result['std_score'],
                                          metric))
# Report best result results = [result for result in results if not np.isnan(result['mean_score'])] best_result = min(results, key=lambda result: (1 if args.minimize_score else -1) * result['mean_score']) logger.info('best') logger.info(best_result['hyperparams']) logger.info(f'num params: {best_result["num_params"]:,}') logger.info(f'{best_result["mean_score"]} +/- {best_result["std_score"]} {args.metric}') # Save best hyperparameter settings as JSON config file makedirs(args.config_save_path, isfile=True) with open(args.config_save_path, 'w') as f: json.dump(best_result['hyperparams'], f, indent=4, sort_keys=True) if __name__ == '__main__': parser = ArgumentParser() add_train_args(parser) parser.add_argument('--num_iters', type=int, default=20, help='Number of hyperparameter choices to try') parser.add_argument('--config_save_path', type=str, required=True, help='Path to .json file where best hyperparameter settings will be written') parser.add_argument('--log_dir', type=str, help='(Optional) Path to a directory where all results of the hyperparameter optimization will be written') temp_input = '--data_path data/bbbp.csv --dataset_type classification --save_dir log/bbbp/model --gpu 0 --num_folds 10 --features_generator rdkit_2d_normalized --no_features_scaling --config_save_path log/bbbp/config --log_dir log/bbbp/temp' #'--data_path data/tox21.csv --dataset_type classification --save_dir log/tox21_checkpoints --gpu 0 --num_iters 20 --config_save_path log/best_json --log_dir log/temp' args = parser.parse_args(temp_input.split()) modify_train_args(args) # 对输入的参数进行调整 grid_search(args)
def train():
    """Renders the train page and performs training if request method is POST.

    On POST: reads form arguments (including ensemble size), checks that the
    chosen dataset type matches the label values, records the checkpoint in
    the database under the current user, trains in a temporary directory with
    a separate progress-reporting process, then moves every produced ``.pt``
    model into the checkpoint folder keyed by its database model id.
    """
    global PROGRESS, TRAINING

    warnings, errors = [], []

    if request.method == 'GET':
        return render_train()

    # Get arguments
    data_name, epochs, ensemble_size, checkpoint_name = \
        request.form['dataName'], int(request.form['epochs']), \
        int(request.form['ensembleSize']), request.form['checkpointName']
    gpu = request.form.get('gpu')
    data_path = os.path.join(app.config['DATA_FOLDER'], f'{data_name}.csv')
    dataset_type = request.form.get('datasetType', 'regression')

    # Create and modify args
    parser = ArgumentParser()
    add_train_args(parser)
    args = parser.parse_args([])
    args.data_path = data_path
    args.dataset_type = dataset_type
    args.epochs = epochs
    args.ensemble_size = ensemble_size

    # Check if regression/classification selection matches data
    data = get_data(path=data_path)
    targets = data.targets()
    # Flatten the per-molecule target rows, skipping missing values.
    unique_targets = {target for row in targets for target in row if target is not None}

    if dataset_type == 'classification' and len(unique_targets - {0, 1}) > 0:
        errors.append('Selected classification dataset but not all labels are 0 or 1. Select regression instead.')

        return render_train(warnings=warnings, errors=errors)

    if dataset_type == 'regression' and unique_targets <= {0, 1}:
        errors.append('Selected regression dataset but all labels are 0 or 1. Select classification instead.')

        return render_train(warnings=warnings, errors=errors)

    # 'None' (the string) means CPU; otherwise select the given GPU index.
    if gpu is not None:
        if gpu == 'None':
            args.no_cuda = True
        else:
            args.gpu = int(gpu)

    current_user = request.cookies.get('currentUser')

    if not current_user:
        # Use DEFAULT as current user if the client's cookie is not set.
        current_user = app.config['DEFAULT_USER_ID']

    # Register the checkpoint in the DB; a deduplicated name may come back.
    ckpt_id, ckpt_name = db.insert_ckpt(checkpoint_name,
                                        current_user,
                                        args.dataset_type,
                                        args.epochs,
                                        args.ensemble_size,
                                        len(targets))

    with TemporaryDirectory() as temp_dir:
        args.save_dir = temp_dir
        modify_train_args(args)

        # Progress is reported from a separate process via shared globals.
        process = mp.Process(target=progress_bar, args=(args, PROGRESS))
        process.start()
        TRAINING = 1

        # Run training
        logger = create_logger(name='train', save_dir=args.save_dir, quiet=args.quiet)
        task_scores = run_training(args, logger)
        process.join()

        # Reset globals
        TRAINING = 0
        PROGRESS = mp.Value('d', 0.0)

        # Check if name overlap
        if checkpoint_name != ckpt_name:
            warnings.append(name_already_exists_message('Checkpoint', checkpoint_name, ckpt_name))

        # Move models: each saved .pt gets its own DB model id as file name.
        for root, _, files in os.walk(args.save_dir):
            for fname in files:
                if fname.endswith('.pt'):
                    model_id = db.insert_model(ckpt_id)
                    save_path = os.path.join(app.config['CHECKPOINT_FOLDER'], f'{model_id}.pt')
                    shutil.move(os.path.join(args.save_dir, root, fname), save_path)

    return render_train(trained=True,
                        metric=args.metric,
                        num_tasks=len(args.task_names),
                        task_names=args.task_names,
                        task_scores=format_float_list(task_scores),
                        mean_score=format_float(np.mean(task_scores)),
                        warnings=warnings,
                        errors=errors)
def test_regression_default(self):
    """Train a regression model with all-default settings; any exception fails the test."""
    try:
        modify_train_args(self.args)
        cross_validate(self.args, self.logger)
    except Exception as e:
        # Narrowed from bare `except:` so interrupts propagate; report the cause.
        self.fail(f'regression_default: {e}')