def _create_sampler(self, sampler_method: str) -> BaseSampler:
    # n_warmup_steps: Disable pruner until the trial reaches the given number of steps.
    if sampler_method == "random":
        sampler = RandomSampler(seed=self.seed)
    elif sampler_method == "tpe":
        # TODO: try with multivariate=True
        sampler = TPESampler(n_startup_trials=self.n_startup_trials, seed=self.seed)
    elif sampler_method == "skopt":
        # cf https://scikit-optimize.github.io/#skopt.Optimizer
        # GP: gaussian process
        # Gradient boosted regression: GBRT
        sampler = SkoptSampler(skopt_kwargs={"base_estimator": "GP", "acq_func": "gp_hedge"})
    else:
        raise ValueError(f"Unknown sampler: {sampler_method}")
    return sampler
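
# A pruner counterpart to _create_sampler above. This is a sketch, not part of the original
# snippet: it mirrors the pruner construction used verbatim in the standalone
# hyperparam_optimization variants below, and assumes the surrounding class exposes
# self.n_startup_trials, self.n_trials and self.n_evaluations (BasePruner is optuna.pruners.BasePruner).
def _create_pruner(self, pruner_method: str) -> BasePruner:
    if pruner_method == "halving":
        pruner = SuccessiveHalvingPruner(min_resource=1, reduction_factor=4, min_early_stopping_rate=0)
    elif pruner_method == "median":
        pruner = MedianPruner(n_startup_trials=self.n_startup_trials, n_warmup_steps=self.n_evaluations // 3)
    elif pruner_method == "none":
        # Do not prune
        pruner = MedianPruner(n_startup_trials=self.n_trials, n_warmup_steps=self.n_evaluations)
    else:
        raise ValueError(f"Unknown pruner: {pruner_method}")
    return pruner
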
def hyperparam_optimization(algo, model_fn, env_fn, n_trials=10, n_timesteps=5000, hyperparams=None,
                            n_jobs=1, sampler_method='random', pruner_method='halving',
                            seed=0, verbose=1):
    """
    :param algo: (str)
    :param model_fn: (func) function that is used to instantiate the model
    :param env_fn: (func) function that is used to instantiate the env
    :param n_trials: (int) maximum number of trials for finding the best hyperparams
    :param n_timesteps: (int) maximum number of timesteps per trial
    :param hyperparams: (dict)
    :param n_jobs: (int) number of parallel jobs
    :param sampler_method: (str)
    :param pruner_method: (str)
    :param seed: (int)
    :param verbose: (int)
    :return: (pd.Dataframe) detailed result of the optimization
    """
    # TODO: eval each hyperparams several times to account for noisy evaluation
    # TODO: take into account the normalization (also for the test env -> sync obs_rms)
    if hyperparams is None:
        hyperparams = {}

    n_startup_trials = 10
    # test during 5 episodes
    n_eval_episodes = 5
    # evaluate every 20th of the maximum budget per iteration
    n_evaluations = 20
    eval_freq = int(n_timesteps / n_evaluations)

    # n_warmup_steps: Disable pruner until the trial reaches the given number of steps.
    if sampler_method == 'random':
        sampler = RandomSampler(seed=seed)
    elif sampler_method == 'tpe':
        sampler = TPESampler(n_startup_trials=n_startup_trials, seed=seed)
    elif sampler_method == 'skopt':
        # cf https://scikit-optimize.github.io/#skopt.Optimizer
        # GP: gaussian process
        # Gradient boosted regression: GBRT
        sampler = SkoptSampler(skopt_kwargs={'base_estimator': "GP", 'acq_func': 'gp_hedge'})
    else:
        raise ValueError('Unknown sampler: {}'.format(sampler_method))

    if pruner_method == 'halving':
        pruner = SuccessiveHalvingPruner(min_resource=1, reduction_factor=4, min_early_stopping_rate=0)
    elif pruner_method == 'median':
        pruner = MedianPruner(n_startup_trials=n_startup_trials, n_warmup_steps=n_evaluations // 3)
    elif pruner_method == 'none':
        # Do not prune
        pruner = MedianPruner(n_startup_trials=n_trials, n_warmup_steps=n_evaluations)
    else:
        raise ValueError('Unknown pruner: {}'.format(pruner_method))

    if verbose > 0:
        print("Sampler: {} - Pruner: {}".format(sampler_method, pruner_method))

    study = optuna.create_study(sampler=sampler, pruner=pruner)
    algo_sampler = HYPERPARAMS_SAMPLER[algo]

    def objective(trial):
        kwargs = hyperparams.copy()

        trial.model_class = None
        if algo == 'her':
            trial.model_class = hyperparams['model_class']

        # Hack to use DDPG/TD3 noise sampler
        # if algo in ['ddpg', 'td3'] or trial.model_class in ['ddpg', 'td3']:
        if algo in ['ddpg', 'td3'] or trial.model_class in [DDPG, TD3]:  # bug to report: changed by Pierre
            trial.n_actions = env_fn(n_envs=1).action_space.shape[0]
        kwargs.update(algo_sampler(trial))

        model = model_fn(**kwargs)

        eval_env = env_fn(n_envs=1, eval_env=True)
        # Account for parallel envs
        eval_freq_ = eval_freq
        if isinstance(model.get_env(), VecEnv):
            eval_freq_ = max(eval_freq // model.get_env().num_envs, 1)
        # TODO: use non-deterministic eval for Atari?
        eval_callback = TrialEvalCallback(eval_env, trial, n_eval_episodes=n_eval_episodes,
                                          eval_freq=eval_freq_, deterministic=True)

        if algo == 'her':
            # Wrap the env if needed to flatten the dict obs
            if isinstance(eval_env, VecEnv):
                print("UNVECTORIZE ENV")
                eval_env = _UnvecWrapper(eval_env)
            # eval_env = HERGoalEnvWrapper(eval_env)  # commented by Pierre

        try:
            model.learn(n_timesteps, callback=eval_callback)
            # Free memory
            model.env.close()
            eval_env.close()
        except AssertionError:
            # Sometimes, random hyperparams can generate NaN
            # Free memory
            model.env.close()
            eval_env.close()
            raise optuna.exceptions.TrialPruned()
        is_pruned = eval_callback.is_pruned
        cost = -1 * eval_callback.last_mean_reward

        del model.env, eval_env
        del model

        if is_pruned:
            raise optuna.exceptions.TrialPruned()

        return cost

    try:
        study.optimize(objective, n_trials=n_trials, n_jobs=n_jobs)
    except KeyboardInterrupt:
        pass

    print('Number of finished trials: ', len(study.trials))

    print('Best trial:')
    trial = study.best_trial

    print('Value: ', trial.value)

    print('Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

    ######## added by Pierre
    best_params = trial.params
    print("best params: ", best_params)
    # print("best value: ", study.best_value)
    # print("best best trial: ", study.best_trial)
    # with open('hyperparameter.yml', 'w') as outfile:
    #     yaml.dump(best_params, outfile)
    ########

    return study.trials_dataframe(), best_params
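
# TrialEvalCallback is used above (and in several variants below) but is never defined in this
# snippet. A minimal sketch of what it might look like, assuming stable_baselines >= 2.10 so that
# EvalCallback is available. Since this variant minimizes cost = -reward (the study uses the
# default "minimize" direction), the callback reports the negated mean reward; the "maximize"
# variant further below would report last_mean_reward directly. With an older Optuna release,
# should_prune() may additionally take the step index as argument.
from stable_baselines.common.callbacks import EvalCallback


class TrialEvalCallback(EvalCallback):
    """Evaluate the agent periodically and report the result to Optuna for pruning."""

    def __init__(self, eval_env, trial, n_eval_episodes=5, eval_freq=10000,
                 deterministic=True, verbose=0):
        super().__init__(eval_env=eval_env, n_eval_episodes=n_eval_episodes,
                         eval_freq=eval_freq, deterministic=deterministic, verbose=verbose)
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Run the regular evaluation, then report the (negated) mean reward to Optuna
            super()._on_step()
            self.eval_idx += 1
            self.trial.report(-1 * self.last_mean_reward, self.eval_idx)
            # Stop training early if Optuna decides to prune this trial
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True
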
def hyperparam_optimization(
    algo,
    model_fn,
    env_fn,
    n_trials=10,
    n_timesteps=5000,
    hyperparams=None,  # noqa: C901
    n_jobs=1,
    sampler_method="tpe",
    pruner_method="median",
    n_startup_trials=10,
    n_evaluations=20,
    n_eval_episodes=5,
    storage=None,
    study_name=None,
    seed=0,
    verbose=1,
    deterministic_eval=True,
):
    """
    :param algo: (str)
    :param model_fn: (func) function that is used to instantiate the model
    :param env_fn: (func) function that is used to instantiate the env
    :param n_trials: (int) maximum number of trials for finding the best hyperparams
    :param n_timesteps: (int) maximum number of timesteps per trial
    :param hyperparams: (dict)
    :param n_jobs: (int) number of parallel jobs
    :param sampler_method: (str)
    :param pruner_method: (str)
    :param n_startup_trials: (int)
    :param n_evaluations: (int) number of evaluations spread over the training budget
    :param n_eval_episodes: (int) number of episodes per evaluation
    :param storage: (Optional[str])
    :param study_name: (Optional[str])
    :param seed: (int)
    :param verbose: (int)
    :param deterministic_eval: (bool)
    :return: (pd.Dataframe) detailed result of the optimization
    """
    # TODO: eval each hyperparams several times to account for noisy evaluation
    if hyperparams is None:
        hyperparams = {}

    eval_freq = int(n_timesteps / n_evaluations)

    # n_warmup_steps: Disable pruner until the trial reaches the given number of steps.
    if sampler_method == "random":
        sampler = RandomSampler(seed=seed)
    elif sampler_method == "tpe":
        # TODO: try with multivariate=True
        sampler = TPESampler(n_startup_trials=n_startup_trials, seed=seed)
    elif sampler_method == "skopt":
        # cf https://scikit-optimize.github.io/#skopt.Optimizer
        # GP: gaussian process
        # Gradient boosted regression: GBRT
        sampler = SkoptSampler(skopt_kwargs={"base_estimator": "GP", "acq_func": "gp_hedge"})
    else:
        raise ValueError(f"Unknown sampler: {sampler_method}")

    if pruner_method == "halving":
        pruner = SuccessiveHalvingPruner(min_resource=1, reduction_factor=4, min_early_stopping_rate=0)
    elif pruner_method == "median":
        pruner = MedianPruner(n_startup_trials=n_startup_trials, n_warmup_steps=n_evaluations // 3)
    elif pruner_method == "none":
        # Do not prune
        pruner = MedianPruner(n_startup_trials=n_trials, n_warmup_steps=n_evaluations)
    else:
        raise ValueError(f"Unknown pruner: {pruner_method}")

    if verbose > 0:
        print(f"Sampler: {sampler_method} - Pruner: {pruner_method}")

    study = optuna.create_study(
        sampler=sampler,
        pruner=pruner,
        storage=storage,
        study_name=study_name,
        load_if_exists=True,
        direction="maximize",
    )
    algo_sampler = HYPERPARAMS_SAMPLER[algo]

    def objective(trial):
        kwargs = hyperparams.copy()

        trial.model_class = None
        if algo == "her":
            trial.model_class = hyperparams["model_class"]

        # Hack to use DDPG/TD3 noise sampler
        if algo in ["ddpg", "td3"] or trial.model_class in ["ddpg", "td3"]:
            trial.n_actions = env_fn(n_envs=1).action_space.shape[0]
        kwargs.update(algo_sampler(trial))

        model = model_fn(**kwargs)
        model.trial = trial

        eval_env = env_fn(n_envs=1, eval_env=True)
        # Account for parallel envs
        eval_freq_ = max(eval_freq // model.get_env().num_envs, 1)
        # TODO: Use non-deterministic eval for Atari
        # or use a maximum number of steps to avoid an infinite loop
        eval_callback = TrialEvalCallback(eval_env, trial, n_eval_episodes=n_eval_episodes,
                                          eval_freq=eval_freq_, deterministic=deterministic_eval)

        try:
            model.learn(n_timesteps, callback=eval_callback)
            # Free memory
            model.env.close()
            eval_env.close()
        except AssertionError as e:
            # Sometimes, random hyperparams can generate NaN
            # Free memory
            model.env.close()
            eval_env.close()
            # Prune hyperparams that generate NaNs
            print(e)
            raise optuna.exceptions.TrialPruned()
        is_pruned = eval_callback.is_pruned
        reward = eval_callback.last_mean_reward

        del model.env, eval_env
        del model

        if is_pruned:
            raise optuna.exceptions.TrialPruned()

        return reward

    try:
        study.optimize(objective, n_trials=n_trials, n_jobs=n_jobs)
    except KeyboardInterrupt:
        pass

    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    trial = study.best_trial

    print("Value: ", trial.value)

    print("Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    return study.trials_dataframe()
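
# Hypothetical usage of the variant above: with storage and study_name set, the study is
# persisted to (and, thanks to load_if_exists=True, resumed from) a SQLite database, so several
# processes can contribute trials to the same study. model_fn and env_fn are assumed factories
# that are not defined in this snippet.
results_df = hyperparam_optimization("ppo2", model_fn, env_fn,
                                     n_trials=100, n_timesteps=int(1e5),
                                     sampler_method="tpe", pruner_method="median",
                                     storage="sqlite:///optuna.db", study_name="ppo2-study")
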
def hyperparam_optimization(algo, model_fn, env_fn, n_trials=10, n_timesteps=5000, hyperparams=None,
                            n_jobs=1, sampler_method='random', pruner_method='halving',
                            seed=0, verbose=1, timeout=None):
    """
    :param algo: (str)
    :param model_fn: (func) function that is used to instantiate the model
    :param env_fn: (func) function that is used to instantiate the env
    :param n_trials: (int) maximum number of trials for finding the best hyperparams
    :param n_timesteps: (int) maximum number of timesteps per trial
    :param hyperparams: (dict)
    :param n_jobs: (int) number of parallel jobs
    :param sampler_method: (str)
    :param pruner_method: (str)
    :param seed: (int)
    :param verbose: (int)
    :param timeout: (Optional[float]) maximum number of seconds for the whole optimization
        (passed to study.optimize)
    :return: (pd.Dataframe) detailed result of the optimization
    """
    # TODO: eval each hyperparams several times to account for noisy evaluation
    # TODO: take into account the normalization (also for the test env -> sync obs_rms)
    if hyperparams is None:
        hyperparams = {}

    # test during 5 episodes
    n_test_episodes = 5
    # evaluate every 20th of the maximum budget per iteration
    n_evaluations = 20
    evaluate_interval = int(n_timesteps / n_evaluations)

    # n_warmup_steps: Disable pruner until the trial reaches the given number of steps.
    if sampler_method == 'random':
        sampler = RandomSampler(seed=seed)
    elif sampler_method == 'tpe':
        sampler = TPESampler(n_startup_trials=5, seed=seed)
    elif sampler_method == 'skopt':
        # cf https://scikit-optimize.github.io/#skopt.Optimizer
        # GP: gaussian process
        # Gradient boosted regression: GBRT
        sampler = SkoptSampler(skopt_kwargs={'base_estimator': "GP", 'acq_func': 'gp_hedge'})
    else:
        raise ValueError('Unknown sampler: {}'.format(sampler_method))

    if pruner_method == 'halving':
        pruner = SuccessiveHalvingPruner(min_resource=1, reduction_factor=4, min_early_stopping_rate=0)
    elif pruner_method == 'median':
        pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=n_evaluations // 3)
    elif pruner_method == 'none':
        # Do not prune
        pruner = MedianPruner(n_startup_trials=n_trials, n_warmup_steps=n_evaluations)
    else:
        raise ValueError('Unknown pruner: {}'.format(pruner_method))

    if verbose > 0:
        print("Sampler: {} - Pruner: {}".format(sampler_method, pruner_method))

    study = optuna.create_study(sampler=sampler, pruner=pruner)
    algo_sampler = HYPERPARAMS_SAMPLER[algo]

    def objective(trial):
        kwargs = hyperparams.copy()

        trial.model_class = None
        if algo == 'her':
            trial.model_class = hyperparams['model_class']

        # Hack to use DDPG/TD3 noise sampler
        if algo in ['ddpg', 'td3'] or trial.model_class in ['ddpg', 'td3']:
            trial.n_actions = env_fn(n_envs=1).action_space.shape[0]
        kwargs.update(algo_sampler(trial))

        def callback(_locals, _globals):
            """
            Callback for monitoring learning progress.

            :param _locals: (dict)
            :param _globals: (dict)
            :return: (bool) If False: stop training
            """
            self_ = _locals['self']
            trial = self_.trial
            # Initialize variables
            if not hasattr(self_, 'is_pruned'):
                self_.is_pruned = False
                self_.last_mean_test_reward = -np.inf
                self_.last_time_evaluated = 0
                self_.eval_idx = 0

            if (self_.num_timesteps - self_.last_time_evaluated) < evaluate_interval:
                return True

            self_.last_time_evaluated = self_.num_timesteps

            # Evaluate the trained agent on the test env
            rewards = []
            n_episodes, reward_sum = 0, 0.0

            # Sync the obs rms if using VecNormalize
            # NOTE: this does not cover all the possible cases
            if isinstance(self_.test_env, VecNormalize):
                self_.test_env.obs_rms = deepcopy(self_.env.obs_rms)
                # Do not normalize reward
                self_.test_env.norm_reward = False

            obs = self_.test_env.reset()
            while n_episodes < n_test_episodes:
                # Use default value for deterministic
                action, _ = self_.predict(obs)
                obs, reward, done, _ = self_.test_env.step(action)
                reward_sum += reward

                if done:
                    rewards.append(reward_sum)
                    reward_sum = 0.0
                    n_episodes += 1
                    obs = self_.test_env.reset()

            mean_reward = np.mean(rewards)
            self_.last_mean_test_reward = mean_reward
            self_.eval_idx += 1

            # report best or report current?
            # report num_timesteps or elapsed time?
            trial.report(-1 * mean_reward, self_.eval_idx)
            # Prune trial if needed
            if trial.should_prune(self_.eval_idx):
                self_.is_pruned = True
                return False

            return True

        model = model_fn(**kwargs)
        model.test_env = env_fn(n_envs=1)
        model.trial = trial
        if algo == 'her':
            model.model.trial = trial
            # Wrap the env if needed to flatten the dict obs
            if isinstance(model.test_env, VecEnv):
                model.test_env = _UnvecWrapper(model.test_env)
            model.model.test_env = HERGoalEnvWrapper(model.test_env)

        try:
            model.learn(n_timesteps, callback=callback)
            # Free memory
            model.env.close()
            model.test_env.close()
        except AssertionError:
            # Sometimes, random hyperparams can generate NaN
            # Free memory
            model.env.close()
            model.test_env.close()
            raise
        is_pruned = False
        cost = np.inf
        if hasattr(model, 'is_pruned'):
            is_pruned = model.is_pruned
            cost = -1 * model.last_mean_test_reward
        del model.env, model.test_env
        del model

        if is_pruned:
            raise optuna.structs.TrialPruned()

        return cost

    try:
        study.optimize(objective, n_trials=n_trials, n_jobs=n_jobs, timeout=timeout,
                       catch=(ValueError, AssertionError))
    except KeyboardInterrupt:
        pass

    print('Number of finished trials: ', len(study.trials))

    print('Best trial:')
    trial = study.best_trial

    print('Value: ', trial.value)

    print('Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

    return study.trials_dataframe()
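
# Hypothetical usage of the variant above: the added timeout argument (in seconds, as in
# Optuna's study.optimize) caps the wall-clock time of the whole search. model_fn and env_fn
# are assumed factories that are not defined in this snippet.
results_df = hyperparam_optimization('sac', model_fn, env_fn,
                                     n_trials=100, n_timesteps=int(5e4),
                                     n_jobs=2, sampler_method='tpe', pruner_method='median',
                                     timeout=2 * 3600)  # stop after roughly 2 hours
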
def hyperparam_optimization(n_trials=20, n_timesteps=100000, hyperparams=None,
                            n_jobs=1, sampler_method='random', pruner_method='halving',
                            seed=1, verbose=1):
    """
    :param algo: (str)
    :param model_fn: (func) function that is used to instantiate the model
    :param env_fn: (func) function that is used to instantiate the env
    :param n_trials: (int) maximum number of trials for finding the best hyperparams
    :param n_timesteps: (int) maximum number of timesteps per trial
    :param hyperparams: (dict)
    :param n_jobs: (int) number of parallel jobs
    :param sampler_method: (str)
    :param pruner_method: (str)
    :param seed: (int)
    :param verbose: (int)
    :return: (pd.Dataframe) detailed result of the optimization
    """
    # TODO: eval each hyperparams several times to account for noisy evaluation
    # TODO: take into account the normalization (also for the test env -> sync obs_rms)
    if hyperparams is None:
        hyperparams = {}

    # test during 1500 steps
    n_test_steps = 1500
    # evaluate every 40th of the maximum budget per iteration
    n_evaluations = 40
    evaluate_interval = int(n_timesteps / n_evaluations)

    # n_warmup_steps: Disable pruner until the trial reaches the given number of steps.
    # sampler = RandomSampler(seed=seed)
    # sampler = TPESampler(n_startup_trials=5, seed=seed)
    sampler = SkoptSampler(skopt_kwargs={'base_estimator': "GP", 'acq_func': 'gp_hedge'})

    # pruner = SuccessiveHalvingPruner(min_resource=1, reduction_factor=4, min_early_stopping_rate=0)
    pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=n_evaluations // 3)

    study = optuna.create_study(study_name="optimisation_PPO2", sampler=sampler, pruner=pruner,
                                storage='sqlite:///optimizationSAC.db', load_if_exists=True)

    def objective(trial):
        kwargs = hyperparams.copy()

        trial.model_class = None
        kwargs.update(sample_td3_params(trial))

        def callback(_locals, _globals):
            """
            Callback for monitoring learning progress.

            :param _locals: (dict)
            :param _globals: (dict)
            :return: (bool) If False: stop training
            """
            self_ = _locals['self']
            trial = self_.trial
            # Initialize variables
            if not hasattr(self_, 'is_pruned'):
                self_.is_pruned = False
                self_.last_mean_test_reward = -np.inf
                self_.last_time_evaluated = 0
                self_.eval_idx = 0

            if (self_.num_timesteps - self_.last_time_evaluated) < evaluate_interval:
                return True

            self_.last_time_evaluated = self_.num_timesteps

            # Evaluate the trained agent on the test env
            rewards = []
            n_steps_done, reward_sum = 0, 0.0

            # Sync the obs rms if using VecNormalize
            # NOTE: this does not cover all the possible cases
            if isinstance(self_.test_env, VecNormalize):
                self_.test_env.obs_rms = deepcopy(self_.env.obs_rms)
                self_.test_env.ret_rms = deepcopy(self_.env.ret_rms)
                # Do not normalize reward
                self_.test_env.norm_reward = False

            obs = self_.test_env.reset()
            while n_steps_done < n_test_steps:
                # Use default value for deterministic
                action, _ = self_.predict(obs)
                obs, reward, done, _ = self_.test_env.step(action)
                reward_sum += reward
                n_steps_done += 1
                if done:
                    rewards.append(reward_sum)
                    reward_sum = 0.0
                    obs = self_.test_env.reset()
                    n_steps_done = n_test_steps
            rewards.append(reward_sum)
            mean_reward = np.mean(rewards)

            summary = tf.Summary(value=[tf.Summary.Value(tag='evaluation', simple_value=mean_reward)])
            _locals['writer'].add_summary(summary, self_.num_timesteps)

            self_.last_mean_test_reward = mean_reward
            self_.eval_idx += 1

            # report best or report current?
            # report num_timesteps or elapsed time?
            trial.report(-1 * mean_reward, self_.eval_idx)
            # Prune trial if needed
            if trial.should_prune(self_.eval_idx):
                self_.is_pruned = True
                return False

            return True

        commands = [[1, 0], [2, 0], [3, 0]]
        env = DummyVecEnv([lambda: e.AidaBulletEnv(commands,
                                                   render=True,
                                                   on_rack=False,
                                                   default_reward=2,
                                                   height_weight=5,
                                                   orientation_weight=3,
                                                   direction_weight=2,
                                                   speed_weight=4)])
        model = TD3(MlpPolicy, env,
                    gamma=kwargs['gamma'],
                    learning_rate=kwargs['learning_rate'],
                    batch_size=kwargs['batch_size'],
                    buffer_size=kwargs['buffer_size'],
                    train_freq=kwargs['train_freq'],
                    gradient_steps=kwargs['gradient_steps'],
                    action_noise=kwargs['action_noise'],
                    tensorboard_log="./optimisationSAC/logOPTI")
        model.test_env = DummyVecEnv([lambda: e.AidaBulletEnv(commands,
                                                              render=False,
                                                              on_rack=False,
                                                              default_reward=2,
                                                              height_weight=5,
                                                              orientation_weight=3,
                                                              direction_weight=2,
                                                              speed_weight=2)])
        model.trial = trial

        try:
            model.learn(n_timesteps, callback=callback)
            # Free memory
            model.env.close()
            model.test_env.close()
        except AssertionError:
            # Sometimes, random hyperparams can generate NaN
            # Free memory
            model.env.close()
            model.test_env.close()
            raise
        is_pruned = False
        cost = np.inf
        if hasattr(model, 'is_pruned'):
            is_pruned = model.is_pruned
            cost = -1 * model.last_mean_test_reward

        try:
            os.mkdir("./optimisationSAC/resultats/" + str(trial.number))
        except FileExistsError:
            print("Directory already exists")
        model.save("./optimisation/resultats/" + str(trial.number) + "/" + str(trial.number))

        del model.env, model.test_env
        del model

        if is_pruned:
            try:
                # Optuna >= 0.19.0
                raise optuna.exceptions.TrialPruned()
            except AttributeError:
                raise optuna.structs.TrialPruned()

        return cost

    try:
        study.optimize(objective, n_trials=n_trials, n_jobs=n_jobs)
    except KeyboardInterrupt:
        pass

    print('Number of finished trials: ', len(study.trials))

    print('Best trial:')
    trial = study.best_trial

    print('Value: ', trial.value)

    print('Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

    return study.trials_dataframe()
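
# sample_td3_params is called above but never defined in this snippet. A minimal sketch of what
# such a sampler might look like: the hyperparameter names match the kwargs consumed by the TD3
# constructor above, but the search ranges are hypothetical, and it assumes trial.n_actions holds
# the action dimension (as set in the other variants of the objective).
import numpy as np
from stable_baselines.common.noise import NormalActionNoise


def sample_td3_params(trial):
    gamma = trial.suggest_categorical('gamma', [0.95, 0.98, 0.99, 0.995])
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-3)
    batch_size = trial.suggest_categorical('batch_size', [64, 128, 256])
    buffer_size = trial.suggest_categorical('buffer_size', [int(1e5), int(1e6)])
    train_freq = trial.suggest_categorical('train_freq', [1, 100, 1000])
    noise_std = trial.suggest_uniform('noise_std', 0.0, 1.0)
    # Gaussian exploration noise scaled by the sampled std
    action_noise = NormalActionNoise(mean=np.zeros(trial.n_actions),
                                     sigma=noise_std * np.ones(trial.n_actions))
    return {
        'gamma': gamma,
        'learning_rate': learning_rate,
        'batch_size': batch_size,
        'buffer_size': buffer_size,
        'train_freq': train_freq,
        'gradient_steps': train_freq,
        'action_noise': action_noise,
    }
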
def hyperparam_optimization(algo, model_fn, env_fn, n_trials=10, n_timesteps=5000, hyperparams=None,
                            n_jobs=1, sampler_method='random', pruner_method='halving',
                            n_startup_trials=10, n_evaluations=20, n_eval_episodes=1,
                            seed=0, verbose=1):
    """
    :param algo: (str)
    :param model_fn: (func) function that is used to instantiate the model
    :param env_fn: (func) function that is used to instantiate the env
    :param n_trials: (int) maximum number of trials for finding the best hyperparams
    :param n_timesteps: (int) maximum number of timesteps per trial
    :param hyperparams: (dict)
    :param n_jobs: (int) number of parallel jobs
    :param sampler_method: (str)
    :param pruner_method: (str)
    :param n_startup_trials: (int)
    :param n_evaluations: (int) number of evaluations spread over the training budget
    :param n_eval_episodes: (int) number of episodes per evaluation
    :param seed: (int)
    :param verbose: (int)
    :return: (pd.Dataframe) detailed result of the optimization
    """
    # TODO: eval each hyperparams several times to account for noisy evaluation
    if hyperparams is None:
        hyperparams = {}

    eval_freq = int(n_timesteps / n_evaluations)

    # n_warmup_steps: Disable pruner until the trial reaches the given number of steps.
    if sampler_method == 'random':
        sampler = RandomSampler(seed=seed)
    elif sampler_method == 'tpe':
        sampler = TPESampler(n_startup_trials=n_startup_trials, seed=seed)
    elif sampler_method == 'skopt':
        # cf https://scikit-optimize.github.io/#skopt.Optimizer
        # GP: gaussian process
        # Gradient boosted regression: GBRT
        sampler = SkoptSampler(skopt_kwargs={'base_estimator': "GP", 'acq_func': 'gp_hedge'})
    else:
        raise ValueError(f'Unknown sampler: {sampler_method}')

    if pruner_method == 'halving':
        pruner = SuccessiveHalvingPruner(min_resource=1, reduction_factor=4, min_early_stopping_rate=0)
    elif pruner_method == 'median':
        pruner = MedianPruner(n_startup_trials=n_startup_trials, n_warmup_steps=n_evaluations // 3)
    elif pruner_method == 'none':
        # Do not prune
        pruner = MedianPruner(n_startup_trials=n_trials, n_warmup_steps=n_evaluations)
    else:
        raise ValueError(f'Unknown pruner: {pruner_method}')

    if verbose > 0:
        print(f"Sampler: {sampler_method} - Pruner: {pruner_method}")

    study = optuna.create_study(sampler=sampler, pruner=pruner)
    algo_sampler = HYPERPARAMS_SAMPLER[algo]

    def objective(trial):
        kwargs = hyperparams.copy()

        trial.model_class = None
        if algo == 'her':
            trial.model_class = hyperparams['model_class']

        # Hack to use DDPG/TD3 noise sampler
        if algo in ['ddpg', 'td3'] or trial.model_class in ['ddpg', 'td3']:
            trial.n_actions = env_fn(n_envs=1).action_space.shape[0]
        kwargs.update(algo_sampler(trial))

        model = model_fn(**kwargs)
        model.trial = trial

        eval_env = env_fn(n_envs=1, eval_env=True)
        # Account for parallel envs
        eval_freq_ = max(eval_freq // model.get_env().num_envs, 1)
        # TODO: use non-deterministic eval for Atari?
        eval_callback = TrialEvalCallback(eval_env, trial, n_eval_episodes=n_eval_episodes,
                                          eval_freq=eval_freq_, deterministic=True)

        try:
            model.learn(n_timesteps, callback=eval_callback)
            # Free memory
            model.env.close()
            eval_env.close()
        except AssertionError as e:
            # Sometimes, random hyperparams can generate NaN
            # Free memory
            model.env.close()
            eval_env.close()
            # Prune hyperparams that generate NaNs
            print(e)
            raise optuna.exceptions.TrialPruned()
        is_pruned = eval_callback.is_pruned
        cost = -1 * eval_callback.last_mean_reward

        del model.env, eval_env
        del model

        if is_pruned:
            raise optuna.exceptions.TrialPruned()

        return cost

    try:
        study.optimize(objective, n_trials=n_trials, n_jobs=n_jobs)
    except KeyboardInterrupt:
        pass

    print('Number of finished trials: ', len(study.trials))

    print('Best trial:')
    trial = study.best_trial

    print('Value: ', trial.value)

    print('Params: ')
    for key, value in trial.params.items():
        print(f'    {key}: {value}')

    return study.trials_dataframe()
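
# HYPERPARAMS_SAMPLER is referenced throughout these variants but never defined in this snippet.
# A minimal sketch of the expected structure, assuming per-algorithm sampler functions that map an
# Optuna trial to keyword arguments for the model constructor; sample_td3_params appears above,
# the other sampler names are assumptions for illustration.
HYPERPARAMS_SAMPLER = {
    'ppo2': sample_ppo2_params,
    'sac': sample_sac_params,
    'td3': sample_td3_params,
    'ddpg': sample_ddpg_params,
    'her': sample_her_params,
}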