def aug_opt(
    name,
    train_spec,
    scheduler,
    inner_config_dir,
    resume=None,
    fresh=False,
    search_alg=None,
):
    from ray.tune import run
    from ray.tune import Experiment

    train_spec['config']['config_dir'] = inner_config_dir
    if resume is None:
        resume = not fresh

    experiment = Experiment.from_json(name=name, spec=train_spec)
    if fresh and os.path.exists(experiment.local_dir):
        import shutil
        shutil.rmtree(experiment.local_dir)

    run(experiment,
        name=name,
        scheduler=scheduler,
        reuse_actors=True,
        verbose=True,
        resume=resume,
        search_alg=search_alg,
        trial_name_creator=tune.function(trial_str_creator))
def create_expe_spec(config, n_cpu, n_gpu, exp_dir):
    def _trial_name_creator(trial):
        return "{}_{}_123".format(trial.trainable_name, trial.trial_id)

    # Create env and register it, so ray and rllib can use it
    register_env(config["env_config"]["env"],
                 lambda env_config: env_basic_creator(env_config))

    expe_config = merge_env_algo_config(config)

    # Shouldn't be useful now, automatic in RLlib
    # trial_resources = {"cpu": expe_config["num_workers"] + 3, "gpu": expe_config["num_gpus"]}

    # expe_config["lr"] = grid_search([1e-3, 1e-4, 5e-4, 1e-5, 5e-5])
    # expe_config["target_network_update_freq"] = grid_search([20000, 40000])

    experiment = Experiment(
        name=config["name_expe"],
        run=config["algo"],
        stop=config["stop"],
        config=expe_config,
        num_samples=config.get("num_samples", 1),
        checkpoint_freq=10,
        max_failures=2,
        local_dir=exp_dir,
        # trial_name_creator=tune.function(_trial_name_creator)  # todo: add when available
    )
    return experiment
def _setup_create_experiment_checkpoint_dir(
        self, run_config: Optional[RunConfig]) -> str:
    """Sets up the experiment checkpoint dir before actually running the experiment."""
    path = Experiment.get_experiment_checkpoint_dir(
        self._convert_trainable(self._trainable),
        run_config.local_dir,
        run_config.name,
    )
    if not os.path.exists(path):
        os.makedirs(path)
    return path
def create_experiment(args):
    """
    Create a single experiment from arguments.

    :param args: The parsed arguments.
    :return: A new experiment with its own trainer.
    """
    experiment_name = get_experiment_name(args)
    config = build_experiment_config_dict(args)
    trainer = get_trainer(args=args, config=config)
    experiment_dict = build_experiment_dict(args, experiment_name, trainer, config)
    return Experiment(**experiment_dict)
def main():
    args = parse_args()
    save_path = args.save_path = os.path.join(args.save_folder, args.arch)
    os.makedirs(save_path)
    # os.makedirs(save_path, exist_ok=True)

    # config
    args.logger_file = os.path.join(save_path, 'log_{}.txt'.format(args.cmd))
    handlers = [
        logging.FileHandler(args.logger_file, mode='w'),
        logging.StreamHandler()
    ]
    logging.basicConfig(level=logging.INFO,
                        datefmt='%m-%d-%y %H:%M',
                        format='%(asctime)s:%(message)s',
                        handlers=handlers)

    if args.cmd == 'train':
        logging.info('start training {}'.format(args.arch))
        run_training(args)
    elif args.cmd == 'test':
        logging.info('start evaluating {} with checkpoints from {}'.format(
            args.arch, args.resume))
        test_model(args)
    elif args.cmd == 'tune':
        import ray
        import ray.tune as tune
        from ray.tune import Experiment
        from ray.tune.median_stopping_rule import MedianStoppingRule

        ray.init()
        sched = MedianStoppingRule(time_attr="timesteps_total",
                                   reward_attr="neg_mean_loss")
        tune.register_trainable(
            "run_training",
            lambda cfg, reporter: run_training(args, cfg, reporter))
        experiment = Experiment(
            "train_rl",
            "run_training",
            trial_resources={"gpu": 1},
            config={"alpha": tune.grid_search([0.1, 0.01, 0.001])})
        tune.run_experiments(experiment, scheduler=sched, verbose=False)
def to_ray_experiment(self) -> Experiment:
    with open(self._parameter_file, "r") as parameter_f:
        parameter_file_snippet = parameter_f.read()

    trainable_name = f"{self._experiment_name}_train_func_{self._run_id or '0'}"
    logger.info(f"Register run parameters: {self._run_parameters}")
    register_trainable(
        trainable_name,
        train_func(
            run_parameters=self._run_parameters,
            parameter_file_snippet=parameter_file_snippet,
            current_working_dir=self._current_working_dir,
            serialization_dir=self._serialization_dir,
            include_packages=self._include_packages,
            gpus_available=self._gpus_available,
        ),
    )

    def trial_name_creator(trial):
        params = ",".join([
            f"{k}={v}" for k, v in sorted(
                self._run_parameters.items(), key=lambda kv: kv[0])
        ])
        return f"{trial}_{params}"

    config = self._hyperparameters or {}

    return Experiment(
        name=self._experiment_name,
        run=trainable_name,
        config=config,
        resources_per_trial=self._resources_per_trial,
        local_dir=self._log_dir,
        trial_name_creator=tune.function(trial_name_creator)
        if self._run_parameters else None,
    )
def train_model_on_task(self, task, task_viz, exp_dir, use_ray,
                        use_ray_logging, grace_period, num_hp_samplings,
                        local_mode, redis_address, lca_n, **training_params):
    logger.info("Training dashboard: {}".format(get_env_url(task_viz)))

    t_id = task['id']

    trainable = self.get_trainable(use_ray_logging=use_ray_logging)
    past_tasks = training_params.pop('past_tasks')
    normalize = training_params.pop('normalize')
    augment_data = training_params.pop('augment_data')

    transformations = []
    if augment_data:
        transformations.extend([
            transforms.ToPILImage(),
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(32, 4),
            transforms.ToTensor()
        ])
    t_trans = [[] for _ in range(len(task['split_names']))]
    t_trans[0] = transformations

    datasets = trainable._load_datasets(task, task['loss_fn'], past_tasks,
                                        t_trans, normalize)
    train_loader, eval_loaders = get_classic_dataloaders(
        datasets, training_params.pop('batch_sizes'))
    model = self.get_model(task_id=t_id, x_dim=task['x_dim'],
                           n_classes=task['n_classes'],
                           descriptor=task['descriptor'],
                           dataset=eval_loaders[:2])

    if use_ray:
        if not ray.is_initialized():
            ray.init(address=redis_address)

        scheduler = None

        training_params['loss_fn'] = tune.function(
            training_params['loss_fn'])
        training_params['optim_func'] = tune.function(self.optim_func)

        init_model_path = os.path.join(exp_dir, 'model_initializations')
        model_file_name = '{}_init.pth'.format(training_params['name'])
        model_path = os.path.join(init_model_path, model_file_name)
        torch.save(model, model_path)

        training_params['model_path'] = model_path
        config = {**self.get_search_space(),
                  'training-params': training_params}
        if use_ray_logging:
            stop_condition = {'training_iteration': training_params['n_it_max']}
            checkpoint_at_end = False
            keep_checkpoints_num = 1
            checkpoint_score_attr = 'min-Val nll'
        else:
            stop_condition = None
            # loggers = [JsonLogger, MyCSVLogger]
            checkpoint_at_end = False
            keep_checkpoints_num = None
            checkpoint_score_attr = None

        trainable = rename_class(trainable, training_params['name'])
        experiment = Experiment(
            name=training_params['name'],
            run=trainable,
            stop=stop_condition,
            config=config,
            resources_per_trial=self.ray_resources,
            num_samples=num_hp_samplings,
            local_dir=exp_dir,
            loggers=(JsonLogger, CSVLogger),
            checkpoint_at_end=checkpoint_at_end,
            keep_checkpoints_num=keep_checkpoints_num,
            checkpoint_score_attr=checkpoint_score_attr)

        analysis = tune.run(experiment,
                            scheduler=scheduler,
                            verbose=1,
                            raise_on_failed_trial=True,
                            # max_failures=-1,
                            # with_server=True,
                            # server_port=4321
                            )
        os.remove(model_path)
        logger.info("Training dashboard: {}".format(get_env_url(task_viz)))

        all_trials = {t.logdir: t for t in analysis.trials}
        best_logdir = analysis.get_best_logdir('Val nll', 'min')
        best_trial = all_trials[best_logdir]

        # picked_metric = 'accuracy_0'
        # metric_names = {s: '{} {}'.format(s, picked_metric) for s in
        #                 ['Train', 'Val', 'Test']}

        logger.info('Best trial: {}'.format(best_trial))
        best_res = best_trial.checkpoint.result
        best_point = (best_res['training_iteration'], best_res['Val nll'])

        # y_keys = ['mean_loss' if use_ray_logging else 'Val nll', 'train_loss']
        y_keys = ['Val nll', 'Train nll']

        epoch_key = 'training_epoch'
        it_key = 'training_iteration'
        plot_res_dataframe(analysis, training_params['name'], best_point,
                           task_viz, epoch_key, it_key, y_keys)
        if 'entropy' in next(iter(analysis.trial_dataframes.values())):
            plot_res_dataframe(analysis, training_params['name'], None,
                               task_viz, epoch_key, it_key, ['entropy'])
        best_model = self.get_model(task_id=t_id)
        best_model.load_state_dict(torch.load(best_trial.checkpoint.value))

        train_accs = analysis.trial_dataframes[best_logdir]['Train accuracy_0']
        best_t = best_res['training_iteration']
        t = best_trial.last_result['training_iteration']
    else:
        search_space = self.get_search_space()
        rand_config = list(generate_variants(search_space))[0][1]
        learner_params = rand_config.pop('learner-params', {})
        optim_params = rand_config.pop('optim')
        split_optims = training_params.pop('split_optims')

        if hasattr(model, 'set_h_params'):
            model.set_h_params(**learner_params)
        if hasattr(model, 'train_loader_wrapper'):
            train_loader = model.train_loader_wrapper(train_loader)

        loss_fn = task['loss_fn']
        if hasattr(model, 'loss_wrapper'):
            loss_fn = model.loss_wrapper(task['loss_fn'])

        prepare_batch = _prepare_batch
        if hasattr(model, 'prepare_batch_wrapper'):
            prepare_batch = model.prepare_batch_wrapper(prepare_batch, t_id)

        optim_fact = partial(set_optim_params,
                             optim_func=self.optim_func,
                             optim_params=optim_params,
                             split_optims=split_optims)
        if hasattr(model, 'train_func'):
            f = model.train_func
            t, metrics, b_state_dict = f(train_loader=train_loader,
                                         eval_loaders=eval_loaders,
                                         optim_fact=optim_fact,
                                         loss_fn=loss_fn,
                                         split_names=task['split_names'],
                                         viz=task_viz,
                                         prepare_batch=prepare_batch,
                                         **training_params)
        else:
            optim = optim_fact(model=model)
            t, metrics, b_state_dict = train(model=model,
                                             train_loader=train_loader,
                                             eval_loaders=eval_loaders,
                                             optimizer=optim,
                                             loss_fn=loss_fn,
                                             split_names=task['split_names'],
                                             viz=task_viz,
                                             prepare_batch=prepare_batch,
                                             **training_params)
        train_accs = metrics['Train accuracy_0']
        best_t = b_state_dict['iter']
        if 'training_archs' in metrics:
            plot_trajectory(model.ssn.graph, metrics['training_archs'],
                            model.ssn.stochastic_node_ids, task_viz)
            weights = model.arch_sampler().squeeze()
            archs = model.ssn.get_top_archs(weights, 5)
            list_top_archs(archs, task_viz)
            list_arch_scores(self.arch_scores[t_id], task_viz)
            update_summary(self.arch_scores[t_id], task_viz, 'scores')

    if len(train_accs) > lca_n:
        lca_accs = []
        for i in range(lca_n + 1):
            if i in train_accs:
                lca_accs.append(train_accs[i])
            else:
                logger.warning('Missing step for {}/{} for lca computation'
                               .format(i, lca_n))
        lca = np.mean(lca_accs)
    else:
        lca = np.float('nan')

    stats = {}
    start = time.time()
    # train_idx = task['split_names'].index('Train')
    # train_path = task['data_path'][train_idx]
    # train_dataset = _load_datasets([train_path])[0]
    train_dataset = _load_datasets(task, 'Train')[0]
    stats.update(self.finish_task(train_dataset, t_id, task_viz,
                                  path='drawings'))
    stats['duration'] = {'iterations': t,
                         'finish': time.time() - start,
                         'best_iterations': best_t}
    stats['params'] = {'total': self.n_params(t_id),
                       'new': self.new_params(t_id)}
    stats['lca'] = lca
    return stats
    # Restore from a given checkpoint (e.g. after a training failure)
    def _restore(self, checkpoint_path):
        with open(checkpoint_path) as f:
            self.timestep = json.loads(f.read())["timestep"]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--smoke-test", action="store_true", help="Finish quickly for testing")
    args, _ = parser.parse_known_args()
    ray.init()

    exp = Experiment(
        name="hyperband_test",
        run=MyTrainableClass,
        # num_samples: how many configs to sample from the hyperparameter
        # space, i.e. how many times the trainable MyTrainableClass is run
        num_samples=10,
        # Give trials custom names
        # trial_name_creator=tune.function(trial_str_creator),
        # Log the status of every trial during training
        # loggers=[TestLogger],
        stop={"training_iteration": 1 if args.smoke_test else 999},
        # random.random() returns a random float between 0 and 1, so
        # "width" is sampled from [10, 100) and "height" from [0, 100)
        config={
            "width": tune.sample_from(
                lambda spec: 10 + int(90 * random.random())),
            "height": tune.sample_from(lambda spec: int(100 * random.random()))
        })

    trials = run(exp)
def testPointsToEvaluateBasicVariantAdvanced(self):
    config = {
        "grid_1": tune.grid_search(["a", "b", "c", "d"]),
        "grid_2": tune.grid_search(["x", "y", "z"]),
        "nested": {
            "random": tune.uniform(2., 10.),
            "dependent": tune.sample_from(
                lambda spec: -1. * spec.config.nested.random)
        }
    }

    points = [
        {"grid_1": "b"},
        {"grid_2": "z"},
        {"grid_1": "a", "grid_2": "y"},
        {"nested": {"random": 8.0}},
    ]

    from ray.tune.suggest.basic_variant import BasicVariantGenerator

    # grid_1 * grid_2 gives 4 * 3 = 12 variants per complete grid search.
    # However, if a grid variable is set by the preset points, that run
    # is excluded from the grid search.

    # Point 1 overwrites grid_1, so the first trial only grid searches
    # over grid_2 (3 trials).
    # The remaining 5 trials search over the whole space (5 * 12 trials).
    searcher = BasicVariantGenerator(points_to_evaluate=[points[0]])
    exp = Experiment(
        run=_mock_objective, name="test", config=config, num_samples=6)
    searcher.add_configurations(exp)
    self.assertEqual(searcher.total_samples, 1 * 3 + 5 * 12)

    # Point 2 overwrites grid_2, so the first trial only grid searches
    # over grid_1 (4 trials).
    # The remaining 5 trials search over the whole space (5 * 12 trials).
    searcher = BasicVariantGenerator(points_to_evaluate=[points[1]])
    exp = Experiment(
        run=_mock_objective, name="test", config=config, num_samples=6)
    searcher.add_configurations(exp)
    self.assertEqual(searcher.total_samples, 1 * 4 + 5 * 12)

    # Point 3 overwrites grid_1 and grid_2, so the first trial does not
    # grid search at all.
    # The remaining 5 trials search over the whole space (5 * 12 trials).
    searcher = BasicVariantGenerator(points_to_evaluate=[points[2]])
    exp = Experiment(
        run=_mock_objective, name="test", config=config, num_samples=6)
    searcher.add_configurations(exp)
    self.assertEqual(searcher.total_samples, 1 + 5 * 12)

    # When initialized with all points, the first three trials are
    # defined by the logic above. Only 3 trials are grid searched
    # completely.
    searcher = BasicVariantGenerator(points_to_evaluate=points)
    exp = Experiment(
        run=_mock_objective, name="test", config=config, num_samples=6)
    searcher.add_configurations(exp)
    self.assertEqual(searcher.total_samples, 1 * 3 + 1 * 4 + 1 + 3 * 12)

    # Run this and confirm the results.
    analysis = tune.run(exp, search_alg=searcher)
    configs = [trial.config for trial in analysis.trials]

    self.assertEqual(len(configs), searcher.total_samples)
    self.assertTrue(all(config["grid_1"] == "b" for config in configs[0:3]))
    self.assertTrue(all(config["grid_2"] == "z" for config in configs[3:7]))
    self.assertTrue(configs[7]["grid_1"] == "a"
                    and configs[7]["grid_2"] == "y")
    self.assertTrue(configs[8]["nested"]["random"] == 8.0)
    self.assertTrue(configs[8]["nested"]["dependent"] == -8.0)
            'optimizer': self.optimizer.state_dict()
        }
        torch.save(cpd, checkpoint_dir + "/save")

    def _restore(self, path):
        cpd = torch.load(path)
        self.iteration = cpd['iteration']
        self.sc.load_state_dict(cpd['state_dict'])
        self.optimizer.load_state_dict(cpd['optimizer'])


if __name__ == "__main__":
    ray.init()
    dset = TensorDataset(
        torch.randn(100, 64, 1024), torch.randn(100, 1024),
        torch.randint(100, size=(100, )).type(torch.LongTensor))
    dset_id = pin_in_object_store(dset)
    tune.register_trainable('train_sc', Trainer)
    exp = Experiment(
        name="speaker classification",
        run='train_sc',
        stop={"timesteps_total": 1},
        config={
            "lr": 1e-3,
            "dset_id": dset_id,
            "nspeakers": 100,
            "batch_size": 1,
        })
    tune.run_experiments(exp)
def train_model_on_task(self, task, task_viz, exp_dir, use_ray,
                        use_ray_logging, smoke_test, n_it_max, grace_period,
                        num_hp_samplings, local_mode, tune_register_lock,
                        resources, **training_params):
    logger.info("Training dashboard: {}".format(get_env_url(task_viz)))

    model = self.get_model(task_id=task.id)
    trainable = self.get_trainable(use_ray_logging=use_ray_logging)

    self.prepare_task(task, training_params)

    if use_ray:
        # Required to avoid collisions in Tune's global Registry:
        # https://github.com/ray-project/ray/blob/master/python/ray/tune/registry.py
        trainable = rename_class(trainable, training_params['name'])

        scheduler = None

        training_params['loss_fn'] = tune.function(
            training_params['loss_fn'])
        training_params['optim_func'] = tune.function(self.optim_func)
        training_params['n_it_max'] = n_it_max

        init_model_path = os.path.join(exp_dir, 'model_initializations')
        model_file_name = '{}_init.pth'.format(training_params['name'])
        model_path = os.path.join(init_model_path, model_file_name)
        torch.save(model, model_path)

        training_params['model_path'] = model_path
        config = {'hyper-params': self.get_search_space(smoke_test),
                  'tp': training_params}
        if use_ray_logging:
            stop_condition = {'training_iteration': n_it_max}
            loggers = None
        else:
            stop_condition = None
            loggers = [JsonLogger, MyCSVLogger]

        # We need to create the experiment using a lock here to avoid issues
        # with Tune's global registry, more specifically with the
        # `_to_flush` dict that may change during the iteration over it.
        # https://github.com/ray-project/ray/blob/e3c9f7e83a6007ded7ae7e99fcbe9fcaa371bad3/python/ray/tune/registry.py#L91-L93
        tune_register_lock.acquire()
        experiment = Experiment(
            name=training_params['name'],
            run=trainable,
            stop=stop_condition,
            config=config,
            resources_per_trial=resources,
            num_samples=num_hp_samplings,
            local_dir=exp_dir,
            loggers=loggers,
            keep_checkpoints_num=1,
            checkpoint_score_attr='min-mean_loss')
        tune_register_lock.release()

        analysis = tune.run(experiment,
                            scheduler=scheduler,
                            verbose=1,
                            raise_on_failed_trial=True,
                            # max_failures=-1,
                            # with_server=True,
                            # server_port=4321
                            )
        os.remove(model_path)
        logger.info("Training dashboard: {}".format(get_env_url(task_viz)))

        all_trials = {t.logdir: t for t in analysis.trials}
        best_logdir = analysis.get_best_logdir('mean_loss', 'min')
        best_trial = all_trials[best_logdir]

        # picked_metric = 'accuracy_0'
        # metric_names = {s: '{} {}'.format(s, picked_metric) for s in
        #                 ['Train', 'Val', 'Test']}

        logger.info('Best trial: {}'.format(best_trial))
        best_res = best_trial._checkpoint.last_result
        best_point = (best_res['training_iteration'], best_res['mean_loss'])

        y_keys = ['mean_loss' if use_ray_logging else 'Val nll', 'train_loss']
        epoch_key = 'training_epoch'
        it_key = 'training_iteration' if use_ray_logging else 'training_iterations'
        plot_res_dataframe(analysis, training_params['name'], best_point,
                           task_viz, epoch_key, it_key, y_keys)

        best_model = self.get_model(task_id=task.id)
        best_model.load_state_dict(torch.load(best_trial._checkpoint.value))

        t = best_trial._checkpoint.last_result['training_iteration']
    else:
        data_path = training_params.pop('data_path')
        past_tasks = training_params.pop('past_tasks')
        datasets = trainable._load_datasets(data_path,
                                            training_params['loss_fn'],
                                            past_tasks)
        train_loader, eval_loaders = get_classic_dataloaders(
            datasets, training_params.pop('batch_sizes'))
        optim = self.optim_func(model.parameters())

        t, accs, best_state_dict = train(model, train_loader, eval_loaders,
                                         optimizer=optim, viz=task_viz,
                                         n_it_max=n_it_max, **training_params)

    logger.info('Finishing task ...')
    t1 = time.time()
    self.finish_task(task.datasets[0])
    logger.info('done in {}s'.format(time.time() - t1))

    return t
class FCNetSliceLocalizationTrainable(AbstractFCNetTrainable):
    BENCHMARK_CLASS = FCNetSliceLocalizationBenchmark


class FCNetNavalPropulsionTrainable(AbstractFCNetTrainable):
    BENCHMARK_CLASS = FCNetNavalPropulsionBenchmark


class FCNetParkinsonsTelemonitoringTrainable(AbstractFCNetTrainable):
    BENCHMARK_CLASS = FCNetParkinsonsTelemonitoringBenchmark


if __name__ == "__main__":
    """Example with FCNetProteinStructure and Hyperband."""
    download_fcnet()
    hyperband = HyperBandScheduler(
        time_attr="training_iteration",
        metric="episode_reward_mean",
        mode="max",
        max_t=100)
    exp = Experiment(
        name="hyperband_fcnet_protein_test",
        run=FCNetProteinStructureTrainable,
        num_samples=20,
        stop={"training_iteration": 1},
        config=FCNetProteinStructureTrainable().get_configuration_space())
    run(exp, scheduler=hyperband)
            self.timestep = json.loads(f.read())["timestep"]


register_trainable("my_class", MyTrainableClass)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--smoke-test", action="store_true",
                        help="Finish quickly for testing")
    args, _ = parser.parse_known_args()
    ray.init()

    # Hyperband early stopping, configured with `episode_reward_mean` as the
    # objective and `timesteps_total` as the time unit.
    hyperband = HyperBandScheduler(time_attr="timesteps_total",
                                   reward_attr="episode_reward_mean",
                                   max_t=100)

    exp = Experiment(
        name="hyperband_test",
        run="my_class",
        repeat=20,
        stop={"training_iteration": 1 if args.smoke_test else 99999},
        config={
            "width": lambda spec: 10 + int(90 * random.random()),
            "height": lambda spec: int(100 * random.random())
        })

    run_experiments(exp, scheduler=hyperband)
register_trainable(
    name,
    lambda augs, reporter: eval_tta(copy.deepcopy(copied_c), augs, reporter))
algo = HyperOptSearch(space, metric=reward_attr, mode="max")
algo = ConcurrencyLimiter(
    algo, max_concurrent=num_process_per_gpu * torch.cuda.device_count())

experiment_spec = Experiment(
    name,
    run=name,
    num_samples=args.num_search,  # if r == args.repeat-1 else 25,
    resources_per_trial={'gpu': 1. / num_process_per_gpu},
    stop={'training_iteration': args.iter},
    config={
        'dataroot': args.dataroot,
        'save_path': paths[cv_fold],
        'cv_ratio_test': args.cv_ratio,
        'cv_fold': cv_fold,
        'num_op': args.num_op,
        'num_policy': args.num_policy
    },
    local_dir=os.path.join(base_path, "ray_results"),
)
analysis = run(experiment_spec,
               search_alg=algo,
               scheduler=None,
               verbose=0,
               queue_trials=True,
               resume=args.resume,
               raise_on_failed_trial=False,
               global_checkpoint_period=np.inf)
        tune.grid_search(['grad_inp', 'grad_latent', 'no_reg', 'grad_mc'])
}

config = {}
for d in (_config, ukn_args):
    config.update(d)

tune.register_trainable("train_dnn", train_with_dic)
ray.init()

exp = Experiment(
    name=args.name,
    run="train_dnn",
    trial_resources={
        "cpu": 6,
        "gpu": 1
    },
    config=config,
    local_dir="./ray_results",
    repeat=args.repeat,
    max_failures=args.max_failure,
    stop={"training_iteration": 1 if args.smoke_test else 99999})

ahb = AsyncHyperBandScheduler(time_attr="timesteps_total",
                              reward_attr="mean_accuracy",
                              grace_period=40,
                              max_t=200)

# run_experiments(exp, verbose=args.verbose, scheduler=ahb)
run_experiments(exp, verbose=args.verbose)
for _ in range(args.repeat):  # run multiple times
    for gr_id in range(gr_num):
        for cv_id in range(cv_num):
            final_policy_set = []
            name = "search_%s_%s_group%d_%d_cv%d_ratio%.1f" % (
                C.get()['dataset'], C.get()['model']['type'], gr_id, gr_num,
                cv_id, args.cv_ratio)
            print(name)
            bo_log_file = open(
                os.path.join(base_path, name + "_bo_result.csv"), "w",
                newline="")
            wr = csv.writer(bo_log_file)
            wr.writerow(result_to_save)
            register_trainable(
                name,
                lambda augs, reporter: eval_tta2(
                    copy.deepcopy(copied_c), augs, reporter))
            algo = HyperOptSearch(space, metric=reward_attr, mode="max")
            algo = ConcurrencyLimiter(
                algo,
                max_concurrent=num_process_per_gpu * torch.cuda.device_count())

            experiment_spec = Experiment(
                name,
                run=name,
                num_samples=args.num_search,  # if r == args.repeat-1 else 25,
                resources_per_trial={'gpu': 1. / num_process_per_gpu},
                stop={'training_iteration': args.iter},
                config={
                    'dataroot': args.dataroot,
                    'save_path': paths[cv_id],
                    'cv_ratio_test': args.cv_ratio,
                    'cv_id': cv_id,
                    'num_op': args.num_op,
                    'num_policy': args.num_policy,
                    "gr_assign": gr_assign,
                    "gr_id": gr_id
                },
                local_dir=os.path.join(base_path, "ray_results"),
            )
            analysis = run(experiment_spec,
                           search_alg=algo,
                           scheduler=None,
                           verbose=0,
                           queue_trials=True,
                           resume=args.resume,
                           raise_on_failed_trial=False,
                           global_checkpoint_period=np.inf)

            results = analysis.trials
            print()
            results = [x for x in results if x.last_result]
            self.timestep = json.loads(f.read())["timestep"]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--smoke-test", action="store_true",
                        help="Finish quickly for testing")
    args, _ = parser.parse_known_args()
    ray.init()

    # Hyperband early stopping, configured with `episode_reward_mean` as the
    # objective and `training_iteration` as the time unit,
    # which is automatically filled by Tune.
    hyperband = HyperBandScheduler(time_attr="training_iteration",
                                   metric="episode_reward_mean",
                                   mode="max",
                                   max_t=100)

    exp = Experiment(
        name="hyperband_test",
        run=MyTrainableClass,
        num_samples=20,
        stop={"training_iteration": 1 if args.smoke_test else 99999},
        config={
            "width": sample_from(lambda spec: 10 + int(90 * random.random())),
            "height": sample_from(lambda spec: int(100 * random.random()))
        })

    run(exp, scheduler=hyperband)
# Load dev set
dev_set = load_dev_set(args)
dev_set_id = pin_in_object_store(dev_set)
print("Loaded dev. pinned={}".format(True))

tune.register_trainable('train_sc', train.Trainer)
exp = Experiment(
    name="speaker classification",
    run='train_sc',
    config={
        "stop": {
            'training_iteration': 500
        },
        # "lr": ray.tune.grid_search([1e-1, 1e-2]),
        "lr": 1e-2,
        "alpha": 0.9,
        "train_set_id": train_set_id,
        "dev_set_id": dev_set_id,
        "nspeakers": nspeakers,
        # "batch_size": ray.tune.grid_search([128, 64, 32]),
        "batch_size": 32,
        "data_parallel": args.data_parallel
    },
    trial_resources={
        "cpu": 20,
        "gpu": 1
    },
)
tune.run_experiments(exp, with_server=True, server_port=4321)
def ray_train(cfg, pl_module_cls):
    # We need Munch to hold tune functions. DictConfig can only hold static config.
    cfg = munchconfig_to_tune_munchconfig(dictconfig_to_munch(cfg))
    ray_config = {
        'model': cfg.model,
        'dataset': cfg.dataset,
        'train': cfg.train,
        'seed': cfg.seed,
        'wandb': cfg.wandb,
        'gpu': cfg.runner.gpu_per_trial != 0.0,
    }
    dataset_str = cfg.dataset._target_.split('.')[-1]
    model_str = cfg.model._target_.split('.')[-1]
    args_str = '_'
    # If we're writing to dfs or efs already, no need to sync explicitly.
    # This needs to be a noop function, not just False. If False, ray won't
    # restore failed spot instances.
    sync_to_driver = None if not cfg.runner.nfs else lambda source, target: None
    experiment = Experiment(
        name=f'{dataset_str}_{model_str}',
        run=partial(pl_train_with_tune, pl_module_cls=pl_module_cls),
        local_dir=cfg.runner.result_dir,
        num_samples=cfg.runner.ntrials if not cfg.smoke_test else 1,
        resources_per_trial={
            'cpu': 1 + cfg.dataset.num_workers,
            'gpu': cfg.runner.gpu_per_trial
        },
        # epochs + 1 because calling trainer.test(model) counts as one epoch
        stop={"training_iteration": 1 if cfg.smoke_test else cfg.train.epochs + 1},
        config=ray_config,
        loggers=[WandbLogger],
        keep_checkpoints_num=1,  # Save disk space, just need 1 for recovery
        # checkpoint_at_end=True,
        # checkpoint_freq=1000,
        # Just to enable recovery with @max_failures
        max_failures=-1,
        sync_to_driver=sync_to_driver,  # As of Ray 1.0.0, still need this here
    )

    if cfg.smoke_test or cfg.runner.local:
        ray.init(num_gpus=torch.cuda.device_count())
    else:
        try:
            ray.init(address='auto')
        except:
            try:
                with open(project_root / 'ray_config/redis_address', 'r') as f:
                    address = f.read().strip()
                with open(project_root / 'ray_config/redis_password', 'r') as f:
                    password = f.read().strip()
                ray.init(address=address, _redis_password=password)
            except:
                ray.init(num_gpus=torch.cuda.device_count())
                import warnings
                warnings.warn("Running Ray with just one node")

    if cfg.runner.hyperband:
        scheduler = AsyncHyperBandScheduler(
            metric='mean_accuracy',
            mode='max',
            max_t=cfg.train.epochs + 1,
            grace_period=cfg.runner.grace_period)
    else:
        scheduler = None

    trials = ray.tune.run(
        experiment,
        scheduler=scheduler,
        # sync_config=SyncConfig(sync_to_driver=sync_to_driver),
        raise_on_failed_trial=False,
        queue_trials=True)
    return trials
ray.init()
register_env(env_name, lambda config: CollectMineralsAndGas())

experiment_spec = Experiment(
    experiment_name,      # experiment name used for logging
    "DQN",                # algorithm to be used
    checkpoint_freq=100,  # save the model every 100 iterations
    stop={
        "training_iteration": 300,  # stop training after 300 iterations
    },
    config={
        "env": env_name,
        "framework": "tensorflow",  # framework to use
        "buffer_size": 50000,
        "timesteps_per_iteration": 1000,
        "n_step": 3,
        "prioritized_replay": True,
        "grad_clip": None,
        "num_workers": 1,
        "num_gpus": 1,  # use a GPU
        "exploration_config": {
            "type": "EpsilonGreedy",  # use EpsilonGreedy for exploration
            "initial_epsilon": 1.0,
            "final_epsilon": 0.02,
            "epsilon_timesteps": 1000
        }
    },
)

run_experiments(experiment_spec)
        with open(path, "w") as f:
            f.write(json.dumps({"timestep": self.timestep}))
        return path

    def _restore(self, checkpoint_path):
        with open(checkpoint_path) as f:
            self.timestep = json.loads(f.read())["timestep"]


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--smoke-test", action="store_true",
                        help="Finish quickly for testing")
    args, _ = parser.parse_known_args()
    ray.init()

    exp = Experiment(
        name="hyperband_test",
        run=MyTrainableClass,
        num_samples=1,
        trial_name_creator=tune.function(trial_str_creator),
        loggers=[TestLogger],
        stop={"training_iteration": 1 if args.smoke_test else 99999},
        config={
            "width": tune.sample_from(
                lambda spec: 10 + int(90 * random.random())),
            "height": tune.sample_from(lambda spec: int(100 * random.random()))
        })

    trials = run_experiments(exp)
config["multiagent"] = { 'policy_graphs': policy_graph, 'policy_mapping_fn': tune.function(lambda agent_id: POLICY_ID), 'policies_to_train': [POLICY_ID] } env_name_list.append(env_name) config_list.append(config) # Register as rllib env register_env(env_name, create_env) exp_list = [] for config, env_name in zip(config_list, env_name_list): exp_tag = { "run": alg_run, "env": env_name, "config": { **config }, "checkpoint_freq": 10, "max_failures": 999, "stop": { "training_iteration": 50 }, "num_samples": 6, } exp_list.append(Experiment.from_json(args.exp_tag, exp_tag)) trials = run_experiments(experiments=exp_list)
def run_experiment(args, parser):
    # args.ray_object_store_memory = int(1e10)
    args.ray_redis_max_memory = int(2e9)

    if args.config_file:
        with open(args.config_file) as f:
            exp = yaml.load(f)
    else:
        raise Exception('No config file!')

    exp = merge_dicts(exp, args.config)
    log.info('Num workers: %d, num_envs_per_worker: %d',
             exp['config']['num_workers'], exp['config']['num_envs_per_worker'])

    if args.cfg_mixins is not None:
        for cfg_mixin_file in args.cfg_mixins:
            with open(cfg_mixin_file, 'r') as f:
                override_cfg = yaml.load(f)
                log.info('Overriding parameters from %s: %r',
                         cfg_mixin_file, override_cfg)
                exp = merge_dicts(exp, override_cfg)

    if not exp.get("run"):
        parser.error("the following arguments are required: --run")
    if not exp.get("env") and not exp.get("config", {}).get("env"):
        parser.error("the following arguments are required: --env")

    if args.ray_num_nodes:
        cluster = Cluster()
        for _ in range(args.ray_num_nodes):
            cluster.add_node(
                num_cpus=args.ray_num_cpus or 1,
                num_gpus=args.ray_num_gpus or 0,
                object_store_memory=args.ray_object_store_memory,
                redis_max_memory=args.ray_redis_max_memory,
            )
        ray.init(redis_address=cluster.redis_address,
                 local_mode=args.local_mode)
    else:
        ray.init(
            redis_address=args.redis_address,
            object_store_memory=args.ray_object_store_memory,
            redis_max_memory=args.ray_redis_max_memory,
            num_cpus=args.ray_num_cpus,
            num_gpus=args.ray_num_gpus,
            local_mode=args.local_mode,
        )

    exp = Experiment.from_json(args.experiment_name, exp)
    exp.spec['checkpoint_freq'] = 20
    if args.pbt:
        exp.spec['checkpoint_freq'] = 3

    exp.spec['checkpoint_at_end'] = True
    # exp.spec['checkpoint_score_attr'] = 'episode_reward_mean'
    exp.spec['keep_checkpoints_num'] = 5

    if args.stop_seconds > 0:
        exp.spec['stop'] = {'time_total_s': args.stop_seconds}

    # if 'multiagent' in exp.spec['config']:
    #     # noinspection PyProtectedMember
    #     make_env = ray.tune.registry._global_registry.get(ENV_CREATOR, exp.spec['config']['env'])
    #     temp_env = make_env(None)
    #     obs_space, action_space = temp_env.observation_space, temp_env.action_space
    #     temp_env.close()
    #     del temp_env
    #
    #     policies = dict(
    #         main=(None, obs_space, action_space, {}),
    #         dummy=(None, obs_space, action_space, {}),
    #     )
    #
    #     exp.spec['config']['multiagent'] = {
    #         'policies': policies,
    #         'policy_mapping_fn': function(lambda agent_id: 'main'),
    #         'policies_to_train': ['main'],
    #     }
    #
    # if args.dbg:
    #     exp.spec['config']['num_workers'] = 1
    #     exp.spec['config']['num_gpus'] = 1
    #     exp.spec['config']['num_envs_per_worker'] = 1
    #
    # if 'callbacks' not in exp.spec['config']:
    #     exp.spec['config']['callbacks'] = {}
    #
    # fps_helper = FpsHelper()
    #
    # def on_train_result(info):
    #     if 'APPO' in exp.spec['run']:
    #         samples = info['result']['info']['num_steps_sampled']
    #     else:
    #         samples = info['trainer'].optimizer.num_steps_trained
    #     fps_helper.record(samples)
    #     fps = fps_helper.get_fps()
    #     info['result']['custom_metrics']['fps'] = fps
    #
    #     # remove this as currently
    #     skip_frames = exp.spec['config']['env_config']['skip_frames']
    #     info['result']['custom_metrics']['fps_frameskip'] = fps * skip_frames
    #
    # exp.spec['config']['callbacks']['on_train_result'] = function(on_train_result)
    #
    # def on_episode_end(info):
    #     episode = info['episode']
    #     stats = {
    #         'DEATHCOUNT': 0,
    #         'FRAGCOUNT': 0,
    #         'HITCOUNT': 0,
    #         'DAMAGECOUNT': 0,
    #         'KDR': 0,
    #         'FINAL_PLACE': 0,
    #         'LEADER_GAP': 0,
    #         'PLAYER_COUNT': 0,
    #         'BOT_DIFFICULTY': 0,
    #     }
    #
    #     # noinspection PyProtectedMember
    #     agent_to_last_info = episode._agent_to_last_info
    #     for agent in agent_to_last_info.keys():
    #         agent_info = agent_to_last_info[agent]
    #         for stats_key in stats.keys():
    #             stats[stats_key] += agent_info.get(stats_key, 0.0)
    #
    #     for stats_key in stats.keys():
    #         stats[stats_key] /= len(agent_to_last_info.keys())
    #
    #     episode.custom_metrics.update(stats)
    #
    # exp.spec['config']['callbacks']['on_episode_end'] = function(on_episode_end)

    extra_kwargs = {}
    if args.pbt:
        extra_kwargs['reuse_actors'] = False

    run(exp,
        name=args.experiment_name,
        scheduler=make_custom_scheduler(args),
        resume=args.resume,
        queue_trials=args.queue_trials,
        **extra_kwargs)
def main(args):
    ray.init(num_cpus=args.rayNumCpu, num_gpus=args.rayNumGpu)

    t_loader, v_loader = get_loaders(
        train_batch_size=16,
        num_workers=1,
        data_folder=args.dataFolder,
        cuda_available=torch.cuda.is_available())
    pinned_obj_dict['data_loader_train'] = pin_in_object_store(t_loader)
    pinned_obj_dict['data_loader_valid'] = pin_in_object_store(v_loader)
    pinned_obj_dict['args'] = pin_in_object_store(args)

    trainable_name = 'hyp_search_train'
    register_trainable(trainable_name, TrainerClass)

    reward_attr = "acc"

    #############################
    # Define hyperband scheduler
    #############################
    hpb = AsyncHyperBandScheduler(time_attr="training_iteration",
                                  reward_attr=reward_attr,
                                  grace_period=40,
                                  max_t=300)

    ##############################
    # Define hyperopt search algo
    ##############################
    space = {
        'lr': hp.uniform('lr', 0.001, 0.1),
        # Adadelta gets the worst results, so it is left out
        'optimizer': hp.choice("optimizer", ['SGD', 'Adam']),  # , 'Adadelta'
        'batch_accumulation': hp.choice("batch_accumulation", [4, 8, 16])
    }
    hos = HyperOptSearch(space, max_concurrent=4, reward_attr=reward_attr)

    #####################
    # Define experiments
    #####################
    exp_name = "resnet152_hyp_search_hyperband_hyperopt_{}".format(
        time.strftime("%Y-%m-%d_%H.%M.%S"))
    exp = Experiment(
        name=exp_name,
        run=trainable_name,
        num_samples=args.numSamples,  # the number of experiments
        resources_per_trial={
            "cpu": args.trialNumCpu,
            "gpu": args.trialNumGpu
        },
        checkpoint_freq=args.checkpointFreq,
        checkpoint_at_end=True,
        stop={
            reward_attr: 0.95,
            # how many times a specific config will be trained
            "training_iteration": args.trainingIteration,
        })

    ##################
    # Run tensorboard
    ##################
    if args.runTensorBoard:
        thread = threading.Thread(target=launch_tensorboard, args=[exp_name])
        thread.start()
        launch_tensorboard(exp_name)

    ##################
    # Run experiments
    ##################
    run_experiments(exp, search_alg=hos, scheduler=hpb, verbose=False)
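##############################################################################
# The snippets above all follow the same register-a-trainable / build an
# `Experiment` / run pattern, under several different Tune API generations.
# Below is a minimal, self-contained sketch of that common pattern, assuming
# a Ray 1.x-era Tune API (where an `Experiment` object can be passed directly
# to `tune.run`). The trainable `toy_objective` and the experiment name are
# illustrative only and do not come from any snippet above.
##############################################################################
import random

import ray
from ray import tune
from ray.tune import Experiment


def toy_objective(config):
    # Illustrative function trainable: report a fake score derived from the
    # sampled config; each tune.report() call counts as one training iteration.
    for _ in range(10):
        tune.report(mean_accuracy=config["width"] * random.random() / 100.0)


if __name__ == "__main__":
    ray.init()
    exp = Experiment(
        name="minimal_experiment_example",  # illustrative name
        run=toy_objective,
        num_samples=4,  # draw 4 configs from the search space
        stop={"training_iteration": 10},
        config={"width": tune.uniform(10, 100)},
    )
    tune.run(exp)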