def tune_example(num_workers=1, use_gpu=False, use_fp16=False, test_mode=False):
    TorchTrainable = TorchTrainer.as_trainable(
        model_creator=ResNet18,
        data_creator=cifar_creator,
        optimizer_creator=optimizer_creator,
        loss_creator=nn.CrossEntropyLoss,
        scheduler_creator=scheduler_creator,
        initialization_hook=initialization_hook,
        num_workers=num_workers,
        config={
            "test_mode": test_mode,  # user-defined param to subset the data
            BATCH_SIZE: 128 * num_workers,
        },
        use_gpu=use_gpu,
        scheduler_step_freq="epoch",
        use_fp16=use_fp16)

    pbt_scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="val_loss",
        mode="min",
        perturbation_interval=1,
        hyperparam_mutations={
            # distribution for resampling
            "lr": lambda: np.random.uniform(0.001, 1),
            # allow perturbations within this set of categorical values
            "momentum": [0.8, 0.9, 0.99],
        })

    reporter = CLIReporter()
    reporter.add_metric_column("val_loss", "loss")
    reporter.add_metric_column("val_accuracy", "acc")

    analysis = tune.run(
        TorchTrainable,
        num_samples=4,
        config={
            "lr": tune.choice([0.001, 0.01, 0.1]),
            "momentum": 0.8
        },
        stop={"training_iteration": 2 if test_mode else 100},
        max_failures=3,  # used for fault tolerance
        checkpoint_freq=3,  # used for fault tolerance
        keep_checkpoints_num=1,  # used for fault tolerance
        verbose=2,
        progress_reporter=reporter,
        scheduler=pbt_scheduler)

    return analysis.get_best_config(metric="val_loss", mode="min")
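
# Usage sketch (an assumption, not part of the original snippet): running the
# PBT search end-to-end and printing the winning hyperparameters. `ray.init()`
# is assumed to be safe to call at this point.
if __name__ == "__main__":
    import ray
    ray.init()
    best_config = tune_example(num_workers=2, test_mode=True)
    print("Best config:", best_config)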
def main():
    config = Config(config_file="lunar/config/config_lunar.yaml").config
    ray.init(address="auto")

    space = {
        "agent_learn_every_x_steps": hp.choice("agent_learn_every_x_steps", [10, 20]),
        "replay_buffer_batch_size": hp.choice("replay_buffer_batch_size", [128, 512, 1024]),
        # "memory_learning_start": hp.choice("memory_learning_start", [50000]),
        # "agent_gamma": hp.uniform("agent_gamma", 0.95, 0.999),
        "agent_gamma": hp.choice("agent_gamma", [0.95, 0.995]),
    }

    reporter = CLIReporter()
    reporter.add_metric_column("mean_rewards")
    reporter.add_metric_column("reward")

    ahb = AsyncHyperBandScheduler(time_attr="training_iteration",
                                  metric="mean_rewards",
                                  mode="max",
                                  grace_period=500,
                                  max_t=3600)

    tune.run(
        LunarTrainer,
        name="asynHyber-lunar-ddpg",
        scheduler=ahb,
        config=config,
        queue_trials=True,
        num_samples=10,
        progress_reporter=reporter,
        resources_per_trial={
            "cpu": 3,
            "gpu": 0.2
        },
        search_alg=HyperOptSearch(space=space,
                                  max_concurrent=4,
                                  metric="mean_rewards",
                                  mode="max"),
        checkpoint_freq=20,
        checkpoint_at_end=True,
        verbose=1,
    )
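
# Sketch (assumption): `LunarTrainer` is defined elsewhere in the project. A
# tune.Trainable that consumes the hyperparameters sampled from `space` would
# read them from `config` and report the metric the scheduler tracks, roughly:
from ray import tune

class LunarTrainerSketch(tune.Trainable):
    def setup(self, config):
        self.gamma = config["agent_gamma"]
        self.batch_size = config["replay_buffer_batch_size"]
        self.learn_every = config["agent_learn_every_x_steps"]

    def step(self):
        # train for one iteration; the zeroed values are placeholders
        return {"mean_rewards": 0.0, "reward": 0.0}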
        use_fp16=args.fp16)

    pbt_scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="val_loss",
        mode="min",
        perturbation_interval=1,
        hyperparam_mutations={
            # distribution for resampling
            "lr": lambda: np.random.uniform(0.001, 1),
            # allow perturbations within this set of categorical values
            "momentum": [0.8, 0.9, 0.99],
        })

    reporter = CLIReporter()
    reporter.add_metric_column("val_loss", "loss")
    reporter.add_metric_column("val_accuracy", "acc")

    analysis = tune.run(
        TorchTrainable,
        num_samples=4,
        config={
            "lr": tune.choice([0.001, 0.01, 0.1]),
            "momentum": 0.8
        },
        stop={"training_iteration": 2 if args.smoke_test else 100},
        max_failures=3,  # used for fault tolerance
        checkpoint_freq=3,  # used for fault tolerance
        keep_checkpoints_num=1,  # used for fault tolerance
        verbose=2,
        progress_reporter=reporter,
        scheduler=pbt_scheduler)
    Trainable = trainer.to_tune_trainable(train_func)

    pbt_scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="loss",
        mode="min",
        perturbation_interval=1,
        hyperparam_mutations={
            # distribution for resampling
            "lr": lambda: np.random.uniform(0.001, 1),
            # allow perturbations within this set of categorical values
            "momentum": [0.8, 0.9, 0.99],
        },
    )

    reporter = CLIReporter()
    reporter.add_metric_column("loss", "loss")

    analysis = tune.run(
        Trainable,
        num_samples=4,
        config={
            "lr": tune.choice([0.001, 0.01, 0.1]),
            "momentum": 0.8,
            "batch_size": 128 * args.num_workers,
            "epochs": args.num_epochs,
            "test_mode": args.smoke_test,  # whether to subset the data
        },
        stop={"training_iteration": 2 if args.smoke_test else 100},
        max_failures=3,  # used for fault tolerance
        checkpoint_freq=3,  # used for fault tolerance
        keep_checkpoints_num=1,  # used for fault tolerance
        progress_reporter=reporter,
        scheduler=pbt_scheduler)
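
# Sketch (assumption): `train_func` is not shown in the snippet. Assuming the
# Ray 1.x `ray.train` API (where `Trainer.to_tune_trainable` lives), a
# compatible training function reads hyperparameters from `config` and reports
# the "loss" metric that PBT mutates on. `build_model` and `run_one_epoch` are
# hypothetical helpers standing in for real training code.
from ray import train

def train_func(config):
    model, optimizer = build_model(config["lr"], config["momentum"])
    for _ in range(config["epochs"]):
        loss = run_one_epoch(model, optimizer, config["batch_size"])
        train.report(loss=loss)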
def main(args):
    utils.init_random()

    exp_configs, tune_configs = utils.get_tune_configs(args.logdir)

    hparams = {}
    parameters = []
    for param_subset, params in tune_configs.items():
        hparams[param_subset] = []
        for param, options in params.items():
            parameters.append({'name': param, **options})
            hparams[param_subset].append(param)

    exp_configs['hparams'] = hparams
    exp_configs['data_params']['subset'] = args.subset
    exp_configs['data_params']['workers'] = args.ds_workers
    max_epochs = 2 if args.smoke else args.max_epochs
    num_samples = 2 if args.smoke else args.num_samples
    exp_configs.update({'num_gpus': 1})

    # ray.init()
    ray.init(memory=2000 * 1024 * 1024,
             object_store_memory=200 * 1024 * 1024,
             driver_object_store_memory=100 * 1024 * 1024)

    scheduler = AsyncHyperBandScheduler(time_attr="training_iteration",
                                        metric="val_accuracy",
                                        mode="max",
                                        grace_period=5,
                                        max_t=max(max_epochs, 5))

    client = AxClient(enforce_sequential_optimization=True)
    client.create_experiment(parameters=parameters,
                             objective_name='val_accuracy')
    search_alg = AxSearch(client, max_concurrent=1, mode='max')
    # search_alg = ConcurrencyLimiter(search_alg, max_concurrent=2)

    reporter = CLIReporter()
    reporter.add_metric_column("val_accuracy")
    reporter.add_metric_column("train_loss")

    trainable = TorchTrainer.as_trainable(
        model_creator=utils.model_creator,
        data_creator=utils.data_creator,
        optimizer_creator=utils.optimizer_creator,
        loss_creator=utils.loss_creator,
        scheduler_creator=utils.scheduler_creator,
        scheduler_step_freq="epoch",
        use_gpu=True,
        config={BATCH_SIZE: exp_configs['batch_size']},
        num_workers=args.workers)

    analysis = tune.run(trainable,
                        num_samples=num_samples,
                        config=exp_configs,
                        trial_name_creator=utils.trial_str_creator,
                        progress_reporter=reporter,
                        scheduler=scheduler,
                        search_alg=search_alg,
                        stop={"training_iteration": max_epochs},
                        local_dir=args.logdir,
                        checkpoint_freq=10,
                        checkpoint_at_end=True,
                        keep_checkpoints_num=3,
                        resume=args.resume,
                        checkpoint_score_attr='val_accuracy',
                        max_failures=2,
                        verbose=1)
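
# Sketch (assumption): `utils.trial_str_creator` is defined elsewhere in the
# project. A trial_name_creator passed to tune.run simply maps a Trial object
# to a display string, for example:
def trial_str_creator(trial):
    return "{}_{}".format(trial.trainable_name, trial.trial_id)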
def tune_learner_on_stream(learner, learner_name, task_level_tuning, stream,
                           redis_address, local_mode, num_hp_samplings,
                           vis_params, exp_dir, seed, **training_params):
    """
    Returns two dataframes:
      - The first contains information about the best trajectory, with as many
        rows as there are tasks; each row corresponds to the model trained on
        that task in the best trajectory.
      - The second contains one row per hyper-parameter combination, with
        information about the results on all tasks for that combination.

    Note that, *in the task-level hp optimization setting*, the second
    dataframe is useful for investigating the behavior of specific trainings,
    but its rows *DO NOT* correspond to actual trajectories.
    """
    exp_name = os.path.basename(exp_dir)
    init_path = path.join(exp_dir, 'model_initializations', learner_name)
    torch.save(learner, init_path)

    config = {**learner.get_search_space(),
              'training-params': training_params,
              'tasks': stream,
              'vis_params': vis_params,
              # 'learner': learner,
              'learner_path': init_path,
              'task_level_tuning': task_level_tuning,
              # 'env': learner_name
              'seed': seed
              }

    def trial_name_creator(trial):
        return learner_name
        # return '{}_{}'.format(learner_name, trial.trial_id)

    reporter = CLIReporter(max_progress_rows=10)
    # reporter.add_metric_column('avg_acc_val')
    reporter.add_metric_column('avg_acc_val_so_far', 'avg_val')
    reporter.add_metric_column('avg_acc_test_so_far', 'avg_test')
    reporter.add_metric_column('total_params')
    # reporter.add_metric_column('fw_t')
    # reporter.add_metric_column('data_t')
    # reporter.add_metric_column('eval_t')
    # reporter.add_metric_column('epoch_t')
    reporter.add_metric_column('duration_model_creation', 'creat_t')
    reporter.add_metric_column('duration_training', 'train_t')
    reporter.add_metric_column('duration_postproc', 'pp_t')
    reporter.add_metric_column('duration_finish', 'fin_t')
    reporter.add_metric_column('duration_eval', 'ev_t')
    reporter.add_metric_column('duration_sum', 'sum_t')
    reporter.add_metric_column('duration_seconds', 'step_t')
    reporter.add_metric_column('total_t')
    reporter.add_metric_column('t')

    ray_params = dict(
        loggers=[JsonLogger, CSVLogger],
        name=learner_name,
        resources_per_trial=learner.ray_resources,
        num_samples=num_hp_samplings,
        local_dir=exp_dir,
        verbose=1,
        progress_reporter=reporter,
        trial_name_creator=trial_name_creator,
        max_failures=3,
    )

    envs = []
    all_val_accs = defaultdict(list)
    all_test_accs = defaultdict(list)
    if task_level_tuning:
        best_trials_df = []
        config['ray_params'] = ray_params
        config['local_mode'] = local_mode
        config['redis_address'] = redis_address
        analysis, selected = train_on_tasks(config)
        for t_id, (task, task_an) in enumerate(zip(stream, analysis)):
            # envs.append([])
            for trial_n, t in enumerate(task_an.trials):
                if len(envs) <= trial_n:
                    envs.append([])
                env = '{}_Trial_{}_{}_{}'.format(exp_name, t, t.experiment_tag,
                                                 task['descriptor'])
                envs[trial_n].append(env)
                if selected[t_id] == t.experiment_tag:
                    all_val_accs[t.experiment_tag].append(
                        '<span style="font-weight:bold">{}</span>'.format(
                            t.last_result[f'Val_T{t_id}']))
                else:
                    all_val_accs[t.experiment_tag].append(
                        t.last_result[f'Val_T{t_id}'])
                all_test_accs[t.experiment_tag].append(
                    t.last_result[f'Test_T{t_id}'])

            best_trial = max(
                task_an.trials,
                key=lambda trial: trial.last_result['avg_acc_val_so_far'])
            df = task_an.trial_dataframes[best_trial.logdir]
            best_trials_df.append(df)
        return_df = pandas.concat(best_trials_df, ignore_index=True)

        analysis = analysis[-1]
        results = sorted(
            analysis.trials, reverse=True,
            key=lambda trial: trial.last_result['avg_acc_val_so_far'])
    else:
        if not ray.is_initialized():
            if local_mode:
                ray.init(local_mode=local_mode)
            else:
                ray.init(redis_address)
                # logging_level=logging.DEBUG)
        ray_params['config'] = config
        analysis = tune.run(train_on_tasks, **ray_params)
        results = sorted(
            analysis.trials, reverse=True,
            key=lambda trial: trial.last_result['avg_acc_val_so_far'])
        for t in results:
            envs.append([])
            for task in stream:
                env = '{}_Trial_{}_{}_{}'.format(exp_name, t, t.experiment_tag,
                                                 task['descriptor'])
                envs[-1].append(env)
        return_df = analysis.trial_dataframes[results[0].logdir]

    summary = {
        'model': [t.experiment_tag for t in results],
        'Avg acc Val': [t.last_result['avg_acc_val'] for t in results],
        'Acc Val': [all_val_accs[t.experiment_tag] for t in results],
        'Avg acc Test': [t.last_result['avg_acc_test'] for t in results],
        'Acc Test': [all_test_accs[t.experiment_tag] for t in results],
        'Params': [t.last_result['total_params'] for t in results],
        'Steps': [t.last_result['total_steps'] for t in results],
        'paths': [t.logdir for t in results],
        'evaluated_params': [t.evaluated_params for t in results],
        'envs': envs
    }
    summary = pandas.DataFrame(summary)
    return return_df, summary
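
# Usage sketch (assumption): `results` is sorted by validation accuracy in
# descending order, so the best hyper-parameter combination sits in the first
# row of the returned summary. Kept as a comment because it needs live
# objects (learner, stream, ...) from the surrounding experiment:
#
#     best_df, summary = tune_learner_on_stream(
#         learner, learner_name, True, stream, None, False,
#         num_hp_samplings, vis_params, exp_dir, seed)
#     print(summary.loc[0, ['model', 'Avg acc Val', 'Avg acc Test']])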
import os

from ray import tune
from ray.tune.schedulers import MedianStoppingRule
from hyperopt import hp
from ray.tune.suggest.hyperopt import HyperOptSearch
from ray.tune import CLIReporter

from some_model_to_train import SomeModelToTrain

MODEL_FILENAME = "checkpoint.pth"
TUNE_RESULTS_FOLDER = './ray_results/'
MAX_TRAINING_ITERATION = 2000

reporter = CLIReporter(max_progress_rows=10)
reporter.add_metric_column("mean_reward")


class Trainable(tune.Trainable):
    def setup(self, hyperparameter):
        self.someModelToTrain = SomeModelToTrain(hyperparameter)

    def step(self):
        mean_reward = self.someModelToTrain.train_one_episode()
        return {'mean_reward': mean_reward}

    def save_checkpoint(self, tmp_checkpoint_dir):
        checkpoint_path = os.path.join(tmp_checkpoint_dir, MODEL_FILENAME)
        self.someModelToTrain.save(checkpoint_path)
        return tmp_checkpoint_dir
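    # A restore hook sketched as an assumption: `SomeModelToTrain` is presumed
    # to expose a `load` method mirroring `save`. With this counterpart to
    # save_checkpoint on the class, Tune can resume a trial from its latest
    # checkpoint after a failure.
    def load_checkpoint(self, tmp_checkpoint_dir):
        checkpoint_path = os.path.join(tmp_checkpoint_dir, MODEL_FILENAME)
        self.someModelToTrain.load(checkpoint_path)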
def train(
    trainer,
    config,
    stop,
    exp_name,
    num_seeds=1,
    num_gpus=0,
    test_mode=False,
    suffix="",
    checkpoint_freq=10,
    keep_checkpoints_num=None,
    start_seed=0,
    local_mode=False,
    save_pkl=True,
    custom_callback=None,
    **kwargs
):
    # initialize ray
    if not os.environ.get("redis_password"):
        initialize_ray(test_mode=test_mode, local_mode=local_mode,
                       num_gpus=num_gpus)
    else:
        password = os.environ.get("redis_password")
        assert os.environ.get("ip_head")
        print(
            "Detected redis_password ({}) in the environment, so a Ray "
            "cluster will be started!".format(password)
        )
        if num_gpus:
            print(
                "We are in cluster mode, so the GPU specification here is "
                "disabled and should be done when submitting the task to the "
                "cluster! You are requesting {} GPU(s) for each "
                "machine!".format(num_gpus)
            )
        initialize_ray(address=os.environ["ip_head"], test_mode=test_mode,
                       redis_password=password)

    # prepare config
    used_config = {
        "seed": tune.grid_search(
            [i * 100 + start_seed for i in range(num_seeds)]),
        "log_level": "DEBUG" if test_mode else "INFO",
        "callbacks": custom_callback if custom_callback else DrivingCallbacks,  # Must have!
    }
    if config:
        used_config.update(config)
    config = copy.deepcopy(used_config)

    trainer_name = trainer if isinstance(trainer, str) else trainer._name

    if not isinstance(stop, dict) and stop is not None:
        assert np.isscalar(stop)
        stop = {"timesteps_total": int(stop)}

    if keep_checkpoints_num is not None and not test_mode:
        assert isinstance(keep_checkpoints_num, int)
        kwargs["keep_checkpoints_num"] = keep_checkpoints_num
        kwargs["checkpoint_score_attr"] = "episode_reward_mean"

    if "verbose" not in kwargs:
        kwargs["verbose"] = 1 if not test_mode else 2

    # This functionality is not supported yet!
    metric_columns = CLIReporter.DEFAULT_COLUMNS.copy()
    progress_reporter = CLIReporter(metric_columns)
    progress_reporter.add_metric_column("success")
    progress_reporter.add_metric_column("crash")
    progress_reporter.add_metric_column("out")
    progress_reporter.add_metric_column("max_step")
    progress_reporter.add_metric_column("length")
    kwargs["progress_reporter"] = progress_reporter

    # start training
    analysis = tune.run(
        trainer,
        name=exp_name,
        checkpoint_freq=checkpoint_freq,
        checkpoint_at_end=True,
        stop=stop,
        config=config,
        max_failures=20 if not test_mode else 1,
        reuse_actors=False,
        local_dir="data",
        **kwargs
    )

    # save training progress as insurance
    if save_pkl:
        pkl_path = "{}-{}{}.pkl".format(
            exp_name, trainer_name, "" if not suffix else "-" + suffix)
        with open(pkl_path, "wb") as f:
            data = analysis.fetch_trial_dataframes()
            pickle.dump(data, f)
        print("Result is saved at: <{}>".format(pkl_path))
    return analysis
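
# Usage sketch (assumption): the pickle saved above maps each trial's logdir
# to its progress dataframe, so it can be reloaded for offline analysis. The
# file name is hypothetical; `episode_reward_mean` is the column RLlib trials
# normally log.
import pickle

with open("my_exp-PPO.pkl", "rb") as f:
    trial_dataframes = pickle.load(f)
for logdir, df in trial_dataframes.items():
    print(logdir, df["episode_reward_mean"].iloc[-1])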
    # Search algorithm
    # search_alg = HyperOptSearch()

    # Restore previous search state checkpoint
    # search_alg_state = os.path.join(local_dir, exp_name)
    # if os.path.isdir(search_alg_state):
    #     print('Restore search state:', search_alg_state)
    #     search_alg.restore_from_dir(search_alg_state)

    # Repeat each trial 3 times; not recommended to use with TrialSchedulers
    # search_alg = Repeater(searcher=search_alg, repeat=3)
    # search_alg = ConcurrencyLimiter(search_alg, max_concurrent=max(num_cpus, num_gpus))

    # Progress reporter
    reporter = CLIReporter()
    reporter.add_metric_column(metric='train_loss')
    reporter.add_metric_column(metric='train_acc')
    reporter.add_metric_column(metric='valid_loss')
    reporter.add_metric_column(metric='valid_acc')
    for i in range(len(test_files_list)):
        reporter.add_metric_column(metric='test{}_loss'.format(i))
        reporter.add_metric_column(metric='test{}_acc'.format(i))

    # Ray tune - local_dir/exp_name/trial_name_x
    analysis = tune.run(
        partial(train_audioset,
                train_files=train_files,
                valid_files=valid_files,
                test_files_list=test_files_list),
        metric='valid_acc',
        mode='max',
        name=exp_name,
        stop={'training_iteration': 1 if smoke_test else max_num_epochs},
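
# Sketch (assumption): `train_audioset` is not shown in the snippet above,
# which is itself truncated. A function trainable matching the reporter
# columns would report per-epoch metrics through tune.report; the zeroed
# values below are placeholders for real computations.
def train_audioset(config, train_files=None, valid_files=None,
                   test_files_list=None):
    for epoch in range(config.get("max_num_epochs", 10)):
        # ... train one epoch and evaluate on the train/valid splits here ...
        tune.report(train_loss=0.0, train_acc=0.0,
                    valid_loss=0.0, valid_acc=0.0)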