def get_cli_reporter(self, extra_metric_cols: Optional[List[str]] = None):
    if extra_metric_cols is None:
        extra_metric_cols = []
    default_cli_reporter_metric_cols = [
        metrics_mod.PARAM_COUNT_NAME,
        metrics_mod.CURRENT_EPOCH_NAME,
        'time_this_iter_s',
        'time_total_s',
        'train_loss_total',
        'valid_loss_total',
        self.search_params.opt.search_metric,
    ]
    # Deduplicate while preserving order.
    metric_cols = list(dict.fromkeys(default_cli_reporter_metric_cols))
    return tune.CLIReporter(
        max_progress_rows=self.search_params.tune.num_hp_samples,
        parameter_columns=self.search_params.get_samplable_param_names(),
        metric_columns=metric_cols + extra_metric_cols,
    )
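# A minimal, self-contained sketch of how a reporter like the one above is
# consumed by Tune (assumes `ray[tune]` is installed; the trainable, search
# space, and column values below are illustrative, not from this project).
from ray import tune

def _demo_trainable(config):
    for step in range(3):
        tune.report(train_loss_total=config['lr'] / (step + 1),
                    valid_loss_total=1.0)

demo_reporter = tune.CLIReporter(
    parameter_columns=['lr'],
    metric_columns=['train_loss_total', 'valid_loss_total'],
)
tune.run(_demo_trainable,
         config={'lr': tune.grid_search([1e-3, 1e-2])},
         progress_reporter=demo_reporter)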
def run_exp(design_or_test, config, n_samples, p_early, p_scheduler, exp_dir,
            chk_score_attr, log_params, gpus=None, gpu_threshold=None):
    # Avoid a mutable default argument.
    if gpus is None:
        gpus = []
    os.makedirs(exp_dir, exist_ok=True)
    config['wdir'] = os.getcwd()
    config['gpu_ids'] = gpus
    config['gpu_threshold'] = gpu_threshold

    early_stopping = TrialNoImprovementStopper(
        metric=p_early['metric'],
        mode=p_early['mode'],
        patience_threshold=p_early['patience'],
    )
    if p_scheduler is not None:
        scheduler = ASHAScheduler(
            metric=p_scheduler['metric'],
            mode=p_scheduler['mode'],
            max_t=p_scheduler['max_t'],
            grace_period=p_scheduler['grace'],
            reduction_factor=p_scheduler['reduction'],
        )
    else:
        scheduler = None

    # Request a tiny GPU fraction so many trials can share a device.
    resources = {'cpu': 1, 'gpu': 0.0001}

    reporter = tune.CLIReporter(
        metric_columns={
            'training_iteration': '#Iter',
            'tr_loss': 'TR-Loss',
            'tr_score': 'TR-Score',
            'vl_loss': 'VL-Loss',
            'vl_score': 'VL-Score',
            'rank_score': 'Rank Score',
        },
        parameter_columns=log_params,
        infer_limit=3,
        metric='rank_score',
        mode='max',
    )

    return tune.run(
        GPUTrainable,
        name=design_or_test,
        stop=early_stopping,
        local_dir=exp_dir,
        config=config,
        num_samples=n_samples,
        resources_per_trial=resources,
        keep_checkpoints_num=1,
        checkpoint_score_attr=chk_score_attr,
        checkpoint_freq=1,
        max_failures=5,
        progress_reporter=reporter,
        scheduler=scheduler,
        verbose=1,
    )
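# `TrialNoImprovementStopper` above is project-specific. A minimal sketch of a
# comparable stopper built on Ray's public `tune.Stopper` interface might look
# like this (an assumption for illustration, not the project's implementation).
from ray.tune import Stopper

class NoImprovementStopper(Stopper):
    def __init__(self, metric, mode='min', patience_threshold=10):
        self.metric = metric
        self.mode = mode
        self.patience = patience_threshold
        self._best = {}    # trial_id -> best metric value seen so far
        self._stale = {}   # trial_id -> iterations without improvement

    def __call__(self, trial_id, result):
        value = result[self.metric]
        best = self._best.get(trial_id)
        improved = best is None or (
            value < best if self.mode == 'min' else value > best)
        if improved:
            self._best[trial_id] = value
            self._stale[trial_id] = 0
        else:
            self._stale[trial_id] += 1
        return self._stale[trial_id] >= self.patience

    def stop_all(self):
        return False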
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--config',
        help='Path to configuration file (default: %(default)s). '
             'Please specify a config with all arguments in '
             'LibMultiLabel/main.py::get_config.')
    parser.add_argument('--cpu_count', type=int, default=4,
                        help='Number of CPUs per trial (default: %(default)s)')
    parser.add_argument('--gpu_count', type=int, default=1,
                        help='Number of GPUs per trial (default: %(default)s)')
    parser.add_argument(
        '--num_samples', type=int, default=50,
        help='Number of running trials. If the search space is `grid_search`, '
             'the same grid will be repeated `num_samples` times. '
             '(default: %(default)s)')
    parser.add_argument(
        '--mode', default='max', choices=['min', 'max'],
        help='Determines whether the objective is minimizing or maximizing '
             'the metric attribute. (default: %(default)s)')
    parser.add_argument('--search_alg', default=None,
                        choices=['basic_variant', 'bayesopt', 'optuna'],
                        help='Search algorithm (default: %(default)s)')
    parser.add_argument(
        '--merge_train_val', action='store_true',
        help='Merge the training and validation data after parameter search.')
    parser.add_argument('--retrain_best', action='store_true',
                        help='Retrain the best model.')
    args, _ = parser.parse_known_args()

    # Load config from the config file and overwrite values specified in CLI.
    parameter_columns = dict()  # parameters to include in the CLIReporter progress table
    config = load_config_from_file(args.config)
    config = init_search_params_spaces(config, parameter_columns, prefix='')
    parser.set_defaults(**config)
    config = AttributeDict(vars(parser.parse_args()))

    # Check that a validation set is provided.
    val_path = config.val_path or os.path.join(config.data_dir, 'valid.txt')
    assert config.val_size > 0 or os.path.exists(val_path), \
        "You should specify either a positive `val_size` or a `val_path` (defaults to `data_dir/valid.txt`) for parameter search."

    """Run tune analysis.
    - If no search algorithm is specified, the default search algorithm is
      BasicVariantGenerator.
      https://docs.ray.io/en/master/tune/api_docs/suggestion.html#tune-basicvariant
    - Arguments without search spaces are ignored by `tune.run`
      (https://github.com/ray-project/ray/blob/34d3d9294c50aea4005b7367404f6a5d9e0c2698/python/ray/tune/suggest/variant_generator.py#L333),
      so we pass the whole config to `tune.run` here for simplicity.
    """
    data = load_static_data(config)
    reporter = tune.CLIReporter(
        metric_columns=[f'val_{metric}' for metric in config.monitor_metrics],
        parameter_columns=parameter_columns,
        metric=f'val_{config.val_metric}',
        mode=args.mode,
        sort_by_metric=True)

    if config.scheduler is not None:
        scheduler = ASHAScheduler(metric=f'val_{config.val_metric}',
                                  mode=args.mode,
                                  **config.scheduler)
    else:
        scheduler = None

    exp_name = '{}_{}_{}'.format(
        config.data_name,
        Path(config.config).stem if config.config else config.model_name,
        datetime.now().strftime('%Y%m%d%H%M%S'),
    )
    analysis = tune.run(
        tune.with_parameters(train_libmultilabel_tune, **data),
        search_alg=init_search_algorithm(
            config.search_alg, metric=config.val_metric, mode=args.mode),
        scheduler=scheduler,
        local_dir=config.result_dir,
        num_samples=config.num_samples,
        resources_per_trial={
            'cpu': args.cpu_count,
            'gpu': args.gpu_count
        },
        progress_reporter=reporter,
        config=config,
        name=exp_name,
    )

    # Save the best model after parameter search.
    best_config = analysis.get_best_config(f'val_{config.val_metric}',
                                           args.mode, scope='all')
    retrain_best_model(exp_name, best_config, config.result_dir,
                       args.merge_train_val)
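# `init_search_algorithm` is project code. A plausible minimal sketch under the
# same contract (name -> Ray search algorithm, with None falling back to Tune's
# default BasicVariantGenerator) could look like this; it is an assumption, not
# the project's actual implementation.
def init_search_algorithm(search_alg, metric=None, mode=None):
    if search_alg == 'optuna':
        from ray.tune.suggest.optuna import OptunaSearch
        return OptunaSearch(metric=metric, mode=mode)
    elif search_alg == 'bayesopt':
        from ray.tune.suggest.bayesopt import BayesOptSearch
        return BayesOptSearch(metric=metric, mode=mode)
    # 'basic_variant' or None: let Tune use its default BasicVariantGenerator.
    return None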
        l += np.mean((y_true - y_pred) ** 2)
        r += recall_score(y_true, y_pred)
        p += precision_score(y_true, y_pred)
    except Exception:  # a bare `except:` would also trap KeyboardInterrupt
        # Penalize configurations that fail to evaluate.
        p, r, l = 9999, 9999, 9999
    scores = {'p': p, 'r': r, 'l': l}
    tune.report(**scores)


### TUNE RAY
analysis = tune.run(eval_func,
                    num_samples=NUM_SAMPLES,
                    # keep only the first element of each config entry
                    config={k: v[0] for k, v in config.items()},
                    progress_reporter=tune.CLIReporter(max_progress_rows=50),
                    local_dir='.',
                    resume=False)

df = analysis.results_df
df.sort_values(by=['l'], inplace=True)
# Drop Tune's bookkeeping columns before saving.
df = df.drop(columns=[
    'done', 'time_this_iter_s', 'time_since_restore', 'pid',
    'timesteps_total', 'episodes_total', 'training_iteration',
    'experiment_id', 'date', 'timestamp', 'hostname', 'node_ip',
    'timesteps_since_restore', 'iterations_since_restore', 'experiment_tag'
])
df.to_csv('hyperparameter_results.csv')
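# A possible follow-up to the run above: the best configuration can also be
# read directly off the analysis object by the reported 'l' (MSE) metric,
# without going through the DataFrame.
best_config = analysis.get_best_config(metric='l', mode='min')
print('Best config by MSE:', best_config)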
target_dqn.train()
dqn_trainable = get_trainable(main_dqn, target_dqn, env, n_iterations=100)

config = DEFAULT_CONFIG.copy()
# Note: tune.run below is given its own local_dir; this entry is config data.
config['local_dir'] = '/home/rudy/PycharmProjects/playground/results/CartPole'
config['verbose'] = False
config['steps_per_iter'] = 1000  # this shouldn't affect learning
config['steps_to_update_main_model'] = 1
config['steps_to_update_target_model'] = 250
config['learning_rate'] = 0.001
config['gamma'] = 0.9
config['e_decay'] = 0.995
print(config)

reporter = tune.CLIReporter()
reporter.add_metric_column('min_reward')
reporter.add_metric_column('mean_reward')
reporter.add_metric_column('max_reward')

ray.init()
analysis = tune.run(
    dqn_trainable,
    name="CartPole",
    config=config,
    verbose=1,
    progress_reporter=reporter,  # without this, the reporter above is unused
    local_dir='/home/rudy/PycharmProjects/playground/results/CartPoleMyDQN'
    # resources_per_trial={'gpu': 1}
)
print(analysis)
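# `add_metric_column` also accepts an optional short display name for the
# table header (an illustrative use of the same reporter API as above):
compact_reporter = tune.CLIReporter()
compact_reporter.add_metric_column('mean_reward', 'MeanR')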
def tuner(trainable, tunable_params, num_samples=20):
    scheduler = tune.schedulers.ASHAScheduler(metric="loss",
                                              mode="min",
                                              max_t=10,
                                              grace_period=1,
                                              reduction_factor=2)
    reporter = tune.CLIReporter(parameter_columns=list(tunable_params.keys()),
                                metric_columns=["loss", "training_iteration"])
    # Share each GPU across five trials when CUDA is available.
    resources = ({"cpu": 1, "gpu": 0.2} if torch.cuda.is_available()
                 else {"cpu": 1})
    result = tune.run(trainable,
                      resources_per_trial=resources,
                      config=tunable_params,
                      num_samples=num_samples,
                      scheduler=scheduler,
                      progress_reporter=reporter,
                      raise_on_failed_trial=False)

    best_trial = result.get_best_trial("loss", "min", "last")
    print(f"Best trial config: {best_trial.config}")
    print(f"Best trial final validation loss: {best_trial.last_result['loss']}")
    # print(f"Best trial final validation accuracy: {best_trial.last_result['accuracy']}")

    # best_trained_model = NN(best_trial.config["n_layers"])
    # device = "cpu"
    # if torch.cuda.is_available():
    #     device = "cuda:0"
    #     if gpus_per_trial > 1:
    #         best_trained_model = nn.DataParallel(best_trained_model)
    # best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    best_checkpoint_save = os.path.join(best_checkpoint_dir, "checkpoint")
    print(f'best checkpoint found at {best_checkpoint_save}')
    # model_state, optimizer_state = torch.load(best_checkpoint_save)
    # best_trained_model.load_state_dict(model_state)
    return result


# def test_accuracy(net, device="cpu"):
#     correct = 0
#     total = 0
#     with torch.no_grad():
#         for data in test_loader:
#             images, labels = data
#             images, labels = images.to(device), labels.to(device)
#             outputs = net(images)
#             _, predicted = torch.max(outputs.data, 1)
#             total += labels.size(0)
#             correct += (predicted == labels).sum().item()
#     return correct / total

# test_acc = test_accuracy(best_trained_model, device)
# print(f"Best trial test set accuracy: ({test_acc*100}%) achieved with {best_trial.config['n_layers']} layers")
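# A minimal function trainable compatible with `tuner` above, reporting "loss"
# and writing checkpoints so `best_trial.checkpoint.value` resolves (a sketch;
# the real trainable and search space live elsewhere in the project).
import os
import torch
from ray import tune

def demo_trainable(config):
    weight = torch.tensor(0.0, requires_grad=True)
    optimizer = torch.optim.SGD([weight], lr=config["lr"])
    for step in range(10):
        loss = (weight - 3.0) ** 2
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        with tune.checkpoint_dir(step=step) as checkpoint_dir:
            torch.save(weight.detach(), os.path.join(checkpoint_dir, "checkpoint"))
        tune.report(loss=loss.item())

# tuner(demo_trainable, {"lr": tune.loguniform(1e-3, 1e-1)})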
def main(targets, exp_dir, policy, save_dir, num_cpus, num_gpus, memory, time,
         port, ip_head, redis_password, **kwargs):
    # Setup
    exp_dir = os.path.abspath(os.path.expanduser(exp_dir))
    if policy is not None:
        policy = os.path.abspath(os.path.expanduser(policy))

    for target in targets:
        if "rename:" in target:
            print("\n\n=================================================")
            print("Renaming the config to params_renamed.json!")
            print("=================================================")
            # Parameters to include in / exclude from the generated config name.
            inc_params = []        # CHANGE this!
            exc_params = ["seed"]  # CHANGE this!
            # TODO: add checking
            config_file = target.replace("rename:", "")
            params_config = import_param_config(config_file)
            dir_param_dict = generate_params(exp_dir, params_config)
            for k, v in dir_param_dict.items():
                if not os.path.exists(k):
                    continue
                # copy params.json file, rename the config entry
                varied_params = k[len(exp_dir) + 1:].split("/")
                if inc_params:
                    config_name = [
                        x for x in varied_params
                        if any([x.startswith(y) for y in inc_params])
                    ]
                else:
                    config_name = [
                        x for x in varied_params
                        if not any([x.startswith(y) for y in exc_params])
                    ]
                print(config_name)
                config_name = transform_config_name(config_name)
                print("||||---> ", config_name)
                v["config"] = "-".join(config_name)
                with open(os.path.join(k, "params_renamed.json"), "w") as f:
                    json.dump(v, f)

        elif "slurm:" in target:
            print("\n\n=================================================")
            print("Generating the slurm launch script!")
            print("=================================================")
            max_cpus_per_node = 16  # for graham and beluga
            max_gpus_per_node = 2
            nodes = max(np.ceil(num_cpus / max_cpus_per_node),
                        np.ceil(num_gpus / max_gpus_per_node))
            cpus_per_node = np.ceil(num_cpus / nodes)
            gpus_per_node = np.ceil(num_gpus / nodes)
            config_file = target.replace("slurm:", "")
            with open(osp.join(osp.dirname(osp.realpath(__file__)),
                               "scripts/slurm_launch.sh")) as f:
                slurm = f.read()
            slurm = slurm.replace('%%NAME%%', config_file.replace(".py", ""))
            slurm = slurm.replace('%%NODES%%', str(int(nodes)))
            slurm = slurm.replace('%%CPUS_PER_NODE%%', str(int(cpus_per_node)))
            slurm = slurm.replace('%%GPUS_PER_NODE%%', str(int(gpus_per_node)))
            slurm = slurm.replace('%%MEM_PER_CPU%%', str(int(memory)))
            slurm = slurm.replace('%%CONFIG_FILE%%', str(config_file))
            slurm = slurm.replace('%%TIME%%', str(int(time)))
            slurm = slurm.replace('%%PORT%%', str(int(port)))
            save_file = osp.join(os.getcwd(), config_file.replace(".py", ".sh"))
            with open(save_file, "w") as f:
                f.write(slurm)
            print("The slurm script has been stored to {}".format(save_file))
            print("If this is the same directory as {}, use `sbatch {}` to "
                  "rerun with the same configuration."
                  .format(config_file, save_file))
            os.system("echo '=> sbatch {}' && sbatch {}".format(
                save_file, save_file))

        elif "train:" in target:
            print("\n\n=================================================")
            print("Launching the training experiment!")
            print("=================================================")
            # TODO: add checking
            config_file = target.replace("train:", "")
            params_config = import_param_config(config_file)
            dir_param_dict = generate_params(exp_dir, params_config)
            config_name = params_config.pop("config")
            config_name = (config_name[0] if isinstance(config_name, tuple)
                           else config_name)
            search_params_list, search_params_dict = get_search_params(
                params_config)

            # store the json config file to the target directory
            overwrite_all = remove_all = False
            for k, v in dir_param_dict.items():
                if os.path.exists(k) and (not overwrite_all and not remove_all):
                    resp = input(
                        "Directory {} exists! (R)emove/(O)verwrite?".format(k))
                    if resp.lower() == "r":
                        remove_all = True
                    elif resp.lower() == "o":
                        overwrite_all = True
                    else:
                        exit(1)
                if remove_all:
                    try:
                        shutil.rmtree(k)
                    except FileNotFoundError:
                        pass
                os.makedirs(k, exist_ok=True)
                # copy params.json file
                with open(os.path.join(k, "params.json"), "w") as f:
                    json.dump(v, f)
                # copy demo data files if they exist
                # (.pkl -> pretrained policies, .npz -> demonstrations)
                for f in glob.glob("*.pkl") + glob.glob("*.npz"):
                    source = os.path.join(exp_dir, f)
                    destination = os.path.join(k, f)
                    if os.path.isfile(source):
                        shutil.copyfile(source, destination)

            ray.init(num_cpus=num_cpus if not ip_head else None,
                     num_gpus=num_gpus if not ip_head else None,
                     temp_dir=(osp.join(osp.expanduser("~"), ".ray")
                               if not ip_head else None),
                     address=ip_head,
                     redis_password=redis_password)
            tune.run(train.main,
                     verbose=1,
                     local_dir=os.path.join(exp_dir, "config_" + config_name),
                     resources_per_trial={
                         "cpu": 1,
                         "gpu": num_gpus / num_cpus,
                     },
                     config=dict(root_dir=exp_dir,
                                 config=config_name,
                                 search_params_list=search_params_list,
                                 **search_params_dict),
                     progress_reporter=tune.CLIReporter(
                         metric_columns=["mode", "epoch", "time_total_s"]))

        elif target == "demo":
            assert policy is not None
            print("\n\n=================================================")
            print("Using policy file from {} to generate demo data.".format(
                policy))
            print("=================================================")
            generate_demo.main(policy=policy, root_dir=exp_dir)

        elif target == "evaluate":
            assert policy is not None
            print("\n\n=================================================")
            print("Evaluating using policy file from {}.".format(policy))
            print("=================================================")
            evaluate.main(policy=policy)

        elif "plot" in target:
            print("\n\n=================================================")
            print("Plotting.")
            print("=================================================")
            save_name = target.replace("plot:", "") if "plot:" in target else ""
            plot.main(
                dirs=[exp_dir],
                save_dir=save_dir,
                save_name=save_name,
                xys=[
                    "OnlineTesting/AverageReturn vs EnvironmentSteps",
                    # "OfflineTesting/AverageReturn",
                ],
                smooth=True,
            )
        else:
            raise ValueError("Unknown target: {}".format(target))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--config',
        help='Path to configuration file (default: %(default)s). '
             'Please specify a config with all arguments in '
             'LibMultiLabel/main.py::get_config.')
    parser.add_argument('--cpu_count', type=int, default=4,
                        help='Number of CPUs per trial (default: %(default)s)')
    parser.add_argument('--gpu_count', type=int, default=1,
                        help='Number of GPUs per trial (default: %(default)s)')
    parser.add_argument(
        '--local_dir', default=os.getcwd(),
        help='Directory to save training results of tune (default: %(default)s)')
    parser.add_argument(
        '--num_samples', type=int, default=50,
        help='Number of running trials. If the search space is `grid_search`, '
             'the same grid will be repeated `num_samples` times. '
             '(default: %(default)s)')
    parser.add_argument(
        '--mode', default='max', choices=['min', 'max'],
        help='Determines whether the objective is minimizing or maximizing '
             'the metric attribute. (default: %(default)s)')
    parser.add_argument('--search_alg', default=None,
                        choices=['basic_variant', 'bayesopt', 'optuna'],
                        help='Search algorithm (default: %(default)s)')
    args = parser.parse_args()

    """Other args in the model config are viewed as resolved values that are
    ignored by tune.
    https://github.com/ray-project/ray/blob/34d3d9294c50aea4005b7367404f6a5d9e0c2698/python/ray/tune/suggest/variant_generator.py#L333
    """
    config = init_model_config(args.config)
    search_alg = args.search_alg if args.search_alg else config.search_alg
    num_samples = (config['num_samples']
                   if config.get('num_samples', None) else args.num_samples)

    parameter_columns = dict()
    config = init_search_params_spaces(config, parameter_columns, prefix='')
    data = load_static_data(config)

    """Run tune analysis. If no search algorithm is specified, the default
    search algorithm is BasicVariantGenerator.
    https://docs.ray.io/en/master/tune/api_docs/suggestion.html#tune-basicvariant
    """
    all_monitor_metrics = [
        f'{split}_{metric}' for split, metric in itertools.product(
            ['val', 'test'], config.monitor_metrics)
    ]
    reporter = tune.CLIReporter(metric_columns=all_monitor_metrics,
                                parameter_columns=parameter_columns)
    analysis = tune.run(
        tune.with_parameters(Trainable, data=data),
        # run one step of "libmultilabel.model.train"
        stop={"training_iteration": 1},
        search_alg=init_search_algorithm(
            search_alg, metric=config.val_metric, mode=args.mode),
        local_dir=args.local_dir,
        metric=f'val_{config.val_metric}',
        mode=args.mode,
        num_samples=num_samples,
        resources_per_trial={
            'cpu': args.cpu_count,
            'gpu': args.gpu_count
        },
        progress_reporter=reporter,
        config=config)

    results_df = analysis.results_df.sort_values(by=f'val_{config.val_metric}',
                                                 ascending=False)
    results_df = results_df.rename(columns=lambda x: x.split('.')[-1])
    # Note: `_metric_columns` is a private CLIReporter attribute.
    columns = reporter._metric_columns + [
        parameter_columns[x]
        for x in analysis.best_trial.evaluated_params.keys()
    ]
    print(f'\n{results_df[columns].to_markdown()}\n')
def run(self, args: AttributeDict):
    """Run hyperparameter search using the `tune.schedulers.ASHAScheduler`

    Args:
        args (AttributeDict): Arguments

    Side-effects:
        Saves logs to `TUNE_LOGS_PATH / args.id`
    """
    try:
        from ray import tune
        from ray.tune.integration.pytorch_lightning import (
            TuneReportCheckpointCallback,
        )
    except ModuleNotFoundError as e:  # pragma: no cover
        logger.error(
            "To use hyperparameter search, first install Ray Tune via "
            "`pip install 'ray[tune]'` or `pip install 'ride[extras]'`"
        )
        raise e

    if not hasattr(args, "id"):
        args.id = "hparamsearch"

    module_config = (
        Configs.from_file(args.from_hparam_space_file)
        if args.from_hparam_space_file
        else self.Module.configs()
    ).tune_config()

    config = {
        **dict(args),
        **module_config,
        # pl.Trainer args:
        "gpus": args.gpus_per_trial,
        "logger": False,
        "accumulate_grad_batches": (
            (8 // args.gpus_per_trial) * args.accumulate_grad_batches
            if args.gpus_per_trial
            else args.accumulate_grad_batches
        ),
    }
    scheduler = tune.schedulers.ASHAScheduler(
        metric=f"val/{args.optimization_metric}",
        mode=self.Module.metrics()[args.optimization_metric].value,
        max_t=args.max_epochs,
        grace_period=1,
        reduction_factor=2,
    )
    metric_names = [f"val/{m}" for m in self.Module.metrics().keys()]
    reporter = tune.CLIReporter(
        metric_columns=[*metric_names, "training_iteration"],
    )
    tune_callbacks = [
        TuneReportCheckpointCallback(
            metrics=metric_names,
            filename="checkpoint",
            on="validation_end",
        )
    ]
    cpus_per_trial = max(
        1,
        (
            min(10 * args.gpus_per_trial, NUM_CPU - 10)
            if args.gpus_per_trial
            else min(10, NUM_CPU - 2)
        ),
    )
    analysis = tune.run(
        partial(
            Runner.static_train_and_val,
            self.Module,
            trainer_callbacks=tune_callbacks,
        ),
        name=args.id,
        local_dir=str(TUNE_LOGS_PATH),
        resources_per_trial={"cpu": cpus_per_trial, "gpu": args.gpus_per_trial},
        config=config,
        num_samples=args.trials,
        scheduler=scheduler,
        progress_reporter=reporter,
        raise_on_failed_trial=False,
    )
    best_hparams = analysis.get_best_config(
        metric=f"val/{args.optimization_metric}",
        mode=self.Module.metrics()[args.optimization_metric].value,
        scope="all",
    )
    # Select only model parameters
    if best_hparams:
        best_hparams = {
            k: best_hparams[k]
            for k in [
                *self.Module.configs().names,
                # Trainer parameters that influence model hparams:
                "accumulate_grad_batches",
                "batch_size",
                "gpus",
            ]
        }
    return best_hparams
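# A sketch of the kind of train-and-validate function that
# `Runner.static_train_and_val` is assumed to wrap: a PyTorch Lightning
# Trainer wired up with the Tune callbacks built above. All names here are
# illustrative, not ride's actual implementation.
import pytorch_lightning as pl

def train_and_val(module_cls, config, trainer_callbacks=None):
    model = module_cls(config)
    trainer = pl.Trainer(
        max_epochs=config["max_epochs"],
        gpus=config["gpus"],
        logger=False,  # Tune handles logging via the callbacks
        accumulate_grad_batches=config["accumulate_grad_batches"],
        callbacks=trainer_callbacks or [],
    )
    trainer.fit(model)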
flat_cfg_dict = flatten(yaml.full_load(open(CFG_PATH)))
flat_cfg_dict.update(dataset_info)
flat_cfg_dict.update(init_space)

# Set the number of epochs per generation
tuneLFADS = create_trainable_class(EPOCHS_PER_GENERATION)

# connect to the Ray cluster, or start Ray on a single machine
address = None if SINGLE_MACHINE else 'localhost:10000'
ray.init(address=address)

# create the PBT scheduler
scheduler = MultiStrategyPBT(HYPERPARAM_SPACE, metric=PBT_METRIC)
# create the trial executor
executor = SoftPauseExecutor(reuse_actors=True)
# create the command-line display table
reporter = tune.CLIReporter(metric_columns=['epoch', PBT_METRIC])

try:
    # run the tune job, catching errors
    tune.run(
        tuneLFADS,
        name=RUN_NAME,
        local_dir=PBT_HOME,
        config=flat_cfg_dict,
        resources_per_trial=RESOURCES_PER_TRIAL,
        num_samples=NUM_WORKERS,
        sync_to_driver='# {source} {target}',  # prevents rsync
        scheduler=scheduler,
        progress_reporter=reporter,
        trial_executor=executor,
        verbose=1,
        reuse_actors=True,
    )
except tune.TuneError:
    # The source snippet is truncated here; this handler is an assumption
    # based on the "excepting errors" comment in the original.
    pass
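# For reference, Ray's built-in PBT scheduler covers the common case that a
# custom scheduler like `MultiStrategyPBT` extends (a sketch; the mutation
# space, mode, and interval are illustrative assumptions, not this project's
# settings):
import numpy as np
from ray.tune.schedulers import PopulationBasedTraining

pbt = PopulationBasedTraining(
    time_attr='training_iteration',
    metric=PBT_METRIC,
    mode='min',
    perturbation_interval=5,
    hyperparam_mutations={'lr': lambda: 10 ** np.random.uniform(-5, -2)},
)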