def get_hyperparameters(args, policy):
    """Return the hyperparameters of a training algorithm from the parser."""
    algorithm_params = {
        "nb_train_steps": args.nb_train_steps,
        "nb_rollout_steps": args.nb_rollout_steps,
        "nb_eval_episodes": args.nb_eval_episodes,
        "actor_update_freq": args.actor_update_freq,
        "meta_update_freq": args.meta_update_freq,
        "reward_scale": args.reward_scale,
        "render": args.render,
        "render_eval": args.render_eval,
        "save_replay_buffer": args.save_replay_buffer,
        "verbose": args.verbose,
        "num_envs": args.num_envs,
        "_init_setup_model": True,
    }

    # add FeedForwardPolicy parameters
    policy_kwargs = {
        "l2_penalty": args.l2_penalty,
        "model_params": {
            "model_type": getattr(args, "model_params:model_type"),
            "layer_norm": getattr(args, "model_params:layer_norm"),
            "ignore_image": getattr(args, "model_params:ignore_image"),
            "image_height": getattr(args, "model_params:image_height"),
            "image_width": getattr(args, "model_params:image_width"),
            "image_channels": getattr(args, "model_params:image_channels"),
            "ignore_flat_channels":
                getattr(args, "model_params:ignore_flat_channels")
                or FEEDFORWARD_PARAMS["model_params"]["ignore_flat_channels"],
            "filters":
                getattr(args, "model_params:filters")
                or FEEDFORWARD_PARAMS["model_params"]["filters"],
            "kernel_sizes":
                getattr(args, "model_params:kernel_sizes")
                or FEEDFORWARD_PARAMS["model_params"]["kernel_sizes"],
            "strides":
                getattr(args, "model_params:strides")
                or FEEDFORWARD_PARAMS["model_params"]["strides"],
            "layers":
                getattr(args, "model_params:layers")
                or FEEDFORWARD_PARAMS["model_params"]["layers"],
        }
    }

    # add TD3 parameters
    if is_td3_policy(policy):
        policy_kwargs.update({
            "buffer_size": args.buffer_size,
            "batch_size": args.batch_size,
            "actor_lr": args.actor_lr,
            "critic_lr": args.critic_lr,
            "tau": args.tau,
            "gamma": args.gamma,
            "use_huber": args.use_huber,
            "noise": args.noise,
            "target_policy_noise": args.target_policy_noise,
            "target_noise_clip": args.target_noise_clip,
        })

    # add SAC parameters
    if is_sac_policy(policy):
        policy_kwargs.update({
            "buffer_size": args.buffer_size,
            "batch_size": args.batch_size,
            "actor_lr": args.actor_lr,
            "critic_lr": args.critic_lr,
            "tau": args.tau,
            "gamma": args.gamma,
            "use_huber": args.use_huber,
            "target_entropy": args.target_entropy,
        })

    # add PPO parameters
    if is_ppo_policy(policy):
        policy_kwargs.update({
            "learning_rate": args.learning_rate,
            "n_minibatches": args.n_minibatches,
            "n_opt_epochs": args.n_opt_epochs,
            "gamma": args.gamma,
            "lam": args.lam,
            "ent_coef": args.ent_coef,
            "vf_coef": args.vf_coef,
            "max_grad_norm": args.max_grad_norm,
            "cliprange": args.cliprange,
            "cliprange_vf": args.cliprange_vf,
        })

    # add GoalConditionedPolicy parameters
    if is_goal_conditioned_policy(policy):
        policy_kwargs.update({
            "num_levels": args.num_levels,
            "meta_period": args.meta_period,
            "intrinsic_reward_type": args.intrinsic_reward_type,
            "intrinsic_reward_scale": args.intrinsic_reward_scale,
            "relative_goals": args.relative_goals,
            "off_policy_corrections": args.off_policy_corrections,
            "hindsight": args.hindsight,
            "subgoal_testing_rate": args.subgoal_testing_rate,
            "cooperative_gradients": args.cooperative_gradients,
            "cg_weights": args.cg_weights,
            "cg_delta": args.cg_delta,
            "pretrain_worker": args.pretrain_worker,
            "pretrain_path": args.pretrain_path,
            "pretrain_ckpt": args.pretrain_ckpt,
        })

    # add MultiActorCriticPolicy parameters
    if is_multiagent_policy(policy):
        policy_kwargs.update({
            "shared": args.shared,
            "maddpg": args.maddpg,
            "n_agents": args.n_agents,
        })

    # add the policy_kwargs term to the algorithm parameters
    algorithm_params['policy_kwargs'] = policy_kwargs

    return algorithm_params
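
# Illustrative sketch (not library code): the parser is assumed to register
# the nested model options under flattened "model_params:<name>" destinations,
# which is why get_hyperparameters above reads them with
# getattr(args, "model_params:...") and re-nests them under "model_params".
# The attribute names and values below are hypothetical.
from argparse import Namespace

_args = Namespace()
setattr(_args, "model_params:model_type", "conv")
setattr(_args, "model_params:layer_norm", False)

# Colon-named attributes cannot be read with dotted access, hence getattr.
_model_params = {
    "model_type": getattr(_args, "model_params:model_type"),
    "layer_norm": getattr(_args, "model_params:layer_norm"),
}
assert _model_params == {"model_type": "conv", "layer_norm": False}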
def get_hyperparameters(args, policy):
    """Return the hyperparameters of a training algorithm from the parser."""
    algorithm_params = {
        "nb_train_steps": args.nb_train_steps,
        "nb_rollout_steps": args.nb_rollout_steps,
        "nb_eval_episodes": args.nb_eval_episodes,
        "actor_update_freq": args.actor_update_freq,
        "meta_update_freq": args.meta_update_freq,
        "reward_scale": args.reward_scale,
        "render": args.render,
        "render_eval": args.render_eval,
        "verbose": args.verbose,
        "num_envs": args.num_envs,
        "_init_setup_model": True,
    }

    # add FeedForwardPolicy parameters
    policy_kwargs = {
        "buffer_size": args.buffer_size,
        "batch_size": args.batch_size,
        "actor_lr": args.actor_lr,
        "critic_lr": args.critic_lr,
        "tau": args.tau,
        "gamma": args.gamma,
        "layer_norm": args.layer_norm,
        "use_huber": args.use_huber,
    }

    # add TD3 parameters
    if is_td3_policy(policy):
        policy_kwargs.update({
            "noise": args.noise,
            "target_policy_noise": args.target_policy_noise,
            "target_noise_clip": args.target_noise_clip,
        })

    # add SAC parameters
    if is_sac_policy(policy):
        policy_kwargs.update({
            "target_entropy": args.target_entropy,
        })

    # add GoalConditionedPolicy parameters
    if is_goal_conditioned_policy(policy):
        policy_kwargs.update({
            "num_levels": args.num_levels,
            "meta_period": args.meta_period,
            "intrinsic_reward_type": args.intrinsic_reward_type,
            "intrinsic_reward_scale": args.intrinsic_reward_scale,
            "relative_goals": args.relative_goals,
            "off_policy_corrections": args.off_policy_corrections,
            "hindsight": args.hindsight,
            "subgoal_testing_rate": args.subgoal_testing_rate,
            "connected_gradients": args.connected_gradients,
            "cg_weights": args.cg_weights,
            "use_fingerprints": args.use_fingerprints,
            "centralized_value_functions": args.centralized_value_functions,
        })

    # add MultiFeedForwardPolicy parameters
    if is_multiagent_policy(policy):
        policy_kwargs.update({
            "shared": args.shared,
            "maddpg": args.maddpg,
        })

    # add the policy_kwargs term to the algorithm parameters
    algorithm_params['policy_kwargs'] = policy_kwargs

    return algorithm_params
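
# Neither version of get_hyperparameters above defines the is_td3_policy,
# is_sac_policy, is_goal_conditioned_policy, or is_multiagent_policy helpers;
# they are assumed to be simple checks on the policy class. A self-contained
# sketch of that dispatch pattern, using placeholder base classes and sketch
# names rather than the library's real ones:
class _TD3PolicyBase:
    """Placeholder for the library's TD3 base policy class."""


class _SACPolicyBase:
    """Placeholder for the library's SAC base policy class."""


def _is_td3_policy_sketch(policy):
    # True when the passed policy class is (or derives from) the TD3 base.
    return isinstance(policy, type) and issubclass(policy, _TD3PolicyBase)


def _is_sac_policy_sketch(policy):
    # True when the passed policy class is (or derives from) the SAC base.
    return isinstance(policy, type) and issubclass(policy, _SACPolicyBase)


assert _is_td3_policy_sketch(_TD3PolicyBase)
assert not _is_sac_policy_sketch(_TD3PolicyBase)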
def __init__(self,
             policy,
             env,
             eval_env=None,
             nb_train_steps=1,
             nb_rollout_steps=1,
             nb_eval_episodes=50,
             actor_update_freq=2,
             meta_update_freq=10,
             reward_scale=1.,
             render=False,
             render_eval=False,
             eval_deterministic=True,
             verbose=0,
             policy_kwargs=None,
             _init_setup_model=True):
    """Instantiate the algorithm object.

    Parameters
    ----------
    policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
        the policy model to use
    env : gym.Env or str
        the environment to learn from (if registered in Gym, can be str)
    eval_env : gym.Env or str
        the environment to evaluate from (if registered in Gym, can be str)
    nb_train_steps : int
        the number of training steps
    nb_rollout_steps : int
        the number of rollout steps
    nb_eval_episodes : int
        the number of evaluation episodes
    actor_update_freq : int
        number of training steps per actor policy update step. The critic
        policy is updated every training step.
    meta_update_freq : int
        number of training steps per meta policy update step. The actor
        policy of the meta-policy is further updated at the frequency
        provided by the actor_update_freq variable. Note that this value is
        only relevant when using the GoalConditionedPolicy policy.
    reward_scale : float
        the value the reward should be scaled by
    render : bool
        enable rendering of the training environment
    render_eval : bool
        enable rendering of the evaluation environment
    eval_deterministic : bool
        if set to True, the policy provides deterministic actions to the
        evaluation environment. Otherwise, stochastic or noisy actions are
        returned.
    verbose : int
        the verbosity level: 0 none, 1 training information, 2 tensorflow
        debug
    policy_kwargs : dict
        policy-specific hyperparameters
    _init_setup_model : bool
        whether or not to build the network at the creation of the instance
    """
    shared = False if policy_kwargs is None else \
        policy_kwargs.get("shared", False)
    maddpg = False if policy_kwargs is None else \
        policy_kwargs.get("maddpg", False)

    self.policy = policy
    self.env_name = deepcopy(env) if isinstance(env, str) \
        else env.__str__()
    self.env = create_env(env, render, shared, maddpg, evaluate=False)
    self.eval_env = create_env(
        eval_env, render_eval, shared, maddpg, evaluate=True)
    self.nb_train_steps = nb_train_steps
    self.nb_rollout_steps = nb_rollout_steps
    self.nb_eval_episodes = nb_eval_episodes
    self.actor_update_freq = actor_update_freq
    self.meta_update_freq = meta_update_freq
    self.reward_scale = reward_scale
    self.render = render
    self.render_eval = render_eval
    self.eval_deterministic = eval_deterministic
    self.verbose = verbose
    self.action_space = self.env.action_space
    self.observation_space = self.env.observation_space
    self.context_space = getattr(self.env, "context_space", None)
    self.policy_kwargs = {'verbose': verbose}

    # add the default policy kwargs to the policy_kwargs term
    if is_feedforward_policy(policy):
        self.policy_kwargs.update(FEEDFORWARD_PARAMS.copy())
    elif is_goal_conditioned_policy(policy):
        self.policy_kwargs.update(GOAL_CONDITIONED_PARAMS.copy())
        self.policy_kwargs['env_name'] = self.env_name.__str__()
    elif is_multiagent_policy(policy):
        self.policy_kwargs.update(MULTI_FEEDFORWARD_PARAMS.copy())
        self.policy_kwargs["all_ob_space"] = getattr(
            self.env, "all_observation_space",
            Box(-1, 1, (1,), dtype=np.float32))

    if is_td3_policy(policy):
        self.policy_kwargs.update(TD3_PARAMS.copy())
    elif is_sac_policy(policy):
        self.policy_kwargs.update(SAC_PARAMS.copy())

    self.policy_kwargs.update(policy_kwargs or {})

    # Compute the time horizon, which is used to check if an environment
    # terminated early and to compute the done mask as per the TD3
    # implementation (see appendix A of their paper). If the horizon cannot
    # be found, a ValueError is raised.
    if hasattr(self.env, "horizon"):
        self.horizon = self.env.horizon
    elif hasattr(self.env, "_max_episode_steps"):
        self.horizon = self.env._max_episode_steps
    elif hasattr(self.env, "env_params"):
        # for Flow environments
        self.horizon = self.env.env_params.horizon
    else:
        raise ValueError("Horizon attribute not found.")

    # init
    self.graph = None
    self.policy_tf = None
    self.sess = None
    self.summary = None
    self.obs = None
    self.all_obs = None
    self.episode_step = 0
    self.episodes = 0
    self.total_steps = 0
    self.epoch_episode_steps = []
    self.epoch_episode_rewards = []
    self.epoch_episodes = 0
    self.epoch = 0
    self.episode_rew_history = deque(maxlen=100)
    self.episode_reward = 0
    self.rew_ph = None
    self.rew_history_ph = None
    self.eval_rew_ph = None
    self.eval_success_ph = None
    self.saver = None

    # Append the fingerprint dimension to the observation dimension, if
    # needed.
    if self.policy_kwargs.get("use_fingerprints", False):
        fingerprint_range = self.policy_kwargs["fingerprint_range"]
        low = np.concatenate(
            (self.observation_space.low, fingerprint_range[0]))
        high = np.concatenate(
            (self.observation_space.high, fingerprint_range[1]))
        self.observation_space = Box(low=low, high=high, dtype=np.float32)

    # Create the model variables and operations.
    if _init_setup_model:
        self.trainable_vars = self.setup_model()
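
# Sketch (not library code) of the merge order used in the constructor above:
# policy-class defaults are loaded first, then algorithm defaults, and finally
# the user-supplied policy_kwargs, so user values win. dict.update is shallow,
# which suffices here because these defaults are flat dictionaries. The names
# and values below are illustrative placeholders, not the library's constants.
_FEEDFORWARD_DEFAULTS = {"batch_size": 128, "tau": 0.005}
_TD3_DEFAULTS = {"target_policy_noise": 0.2}
_user_kwargs = {"batch_size": 256}

_merged = {"verbose": 0}
_merged.update(_FEEDFORWARD_DEFAULTS.copy())  # policy-class defaults
_merged.update(_TD3_DEFAULTS.copy())          # algorithm defaults
_merged.update(_user_kwargs or {})            # user overrides applied last
assert _merged["batch_size"] == 256
assert _merged["tau"] == 0.005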
def __init__(self,
             policy,
             env,
             eval_env=None,
             nb_train_steps=1,
             nb_rollout_steps=1,
             nb_eval_episodes=50,
             actor_update_freq=2,
             meta_update_freq=10,
             reward_scale=1.,
             render=False,
             render_eval=False,
             eval_deterministic=True,
             save_replay_buffer=False,
             num_envs=1,
             verbose=0,
             policy_kwargs=None,
             _init_setup_model=True):
    """Instantiate the algorithm object.

    Parameters
    ----------
    policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
        the policy model to use
    env : gym.Env or str
        the environment to learn from (if registered in Gym, can be str)
    eval_env : gym.Env or str
        the environment to evaluate from (if registered in Gym, can be str)
    nb_train_steps : int
        the number of training steps
    nb_rollout_steps : int
        the number of rollout steps
    nb_eval_episodes : int
        the number of evaluation episodes
    actor_update_freq : int
        number of training steps per actor policy update step. The critic
        policy is updated every training step.
    meta_update_freq : int
        number of training steps per meta policy update step. The actor
        policy of the meta-policy is further updated at the frequency
        provided by the actor_update_freq variable. Note that this value is
        only relevant when using the GoalConditionedPolicy policy.
    reward_scale : float
        the value the reward should be scaled by
    render : bool
        enable rendering of the training environment
    render_eval : bool
        enable rendering of the evaluation environment
    eval_deterministic : bool
        if set to True, the policy provides deterministic actions to the
        evaluation environment. Otherwise, stochastic or noisy actions are
        returned.
    save_replay_buffer : bool
        whether to save the data from the replay buffer, at the frequency
        that the model is saved. Only the most recent replay buffer is
        stored.
    num_envs : int
        number of environments used to run simulations in parallel. Each
        environment is run on a separate CPU and uses the same policy as the
        rest. Must be less than or equal to nb_rollout_steps.
    verbose : int
        the verbosity level: 0 none, 1 training information, 2 tensorflow
        debug
    policy_kwargs : dict
        policy-specific hyperparameters
    _init_setup_model : bool
        whether or not to build the network at the creation of the instance

    Raises
    ------
    AssertionError
        if num_envs > nb_rollout_steps
    """
    shared = False if policy_kwargs is None else \
        policy_kwargs.get("shared", False)
    maddpg = False if policy_kwargs is None else \
        policy_kwargs.get("maddpg", False)

    # Run assertions.
    assert num_envs <= nb_rollout_steps, \
        "num_envs must be less than or equal to nb_rollout_steps"

    # Instantiate the ray instance.
    if num_envs > 1:
        ray.init(num_cpus=num_envs + 1, ignore_reinit_error=True)

    self.policy = policy
    self.env_name = deepcopy(env) if isinstance(env, str) \
        else env.__str__()
    self.eval_env, _ = create_env(
        eval_env, render_eval, shared, maddpg, evaluate=True)
    self.nb_train_steps = nb_train_steps
    self.nb_rollout_steps = nb_rollout_steps
    self.nb_eval_episodes = nb_eval_episodes
    self.actor_update_freq = actor_update_freq
    self.meta_update_freq = meta_update_freq
    self.reward_scale = reward_scale
    self.render = render
    self.render_eval = render_eval
    self.eval_deterministic = eval_deterministic
    self.save_replay_buffer = save_replay_buffer
    self.num_envs = num_envs
    self.verbose = verbose
    self.policy_kwargs = {'verbose': verbose}

    # Create the environment and collect the initial observations.
    self.sampler, self.obs, self.all_obs = self.setup_sampler(
        env, render, shared, maddpg)

    # Collect the spaces of the environments.
    self.ac_space, self.ob_space, self.co_space, all_ob_space = \
        self.get_spaces()

    # Add the default policy kwargs to the policy_kwargs term.
    if is_feedforward_policy(policy):
        self.policy_kwargs.update(FEEDFORWARD_PARAMS.copy())
    if is_goal_conditioned_policy(policy):
        self.policy_kwargs.update(GOAL_CONDITIONED_PARAMS.copy())
        self.policy_kwargs['env_name'] = self.env_name.__str__()
        self.policy_kwargs['num_envs'] = num_envs
    if is_multiagent_policy(policy):
        self.policy_kwargs.update(MULTIAGENT_PARAMS.copy())
        self.policy_kwargs["all_ob_space"] = all_ob_space

    if is_td3_policy(policy):
        self.policy_kwargs.update(TD3_PARAMS.copy())
    elif is_sac_policy(policy):
        self.policy_kwargs.update(SAC_PARAMS.copy())

    self.policy_kwargs = recursive_update(
        self.policy_kwargs, policy_kwargs or {})

    # Compute the time horizon, which is used to check if an environment
    # terminated early and to compute the done mask for TD3.
    if self.num_envs > 1:
        self.horizon = ray.get(self.sampler[0].horizon.remote())
    else:
        self.horizon = self.sampler[0].horizon()

    # init
    self.graph = None
    self.policy_tf = None
    self.sess = None
    self.summary = None
    self.episode_step = [0 for _ in range(num_envs)]
    self.episodes = 0
    self.total_steps = 0
    self.epoch_episode_steps = []
    self.epoch_episode_rewards = []
    self.epoch_episodes = 0
    self.epoch = 0
    self.episode_rew_history = deque(maxlen=100)
    self.episode_reward = [0 for _ in range(num_envs)]
    self.rew_ph = None
    self.rew_history_ph = None
    self.eval_rew_ph = None
    self.eval_success_ph = None
    self.saver = None

    # Create the model variables and operations.
    if _init_setup_model:
        self.trainable_vars = self.setup_model()
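
# The newer constructor above merges user kwargs with recursive_update rather
# than dict.update, so nested defaults such as "model_params" are merged
# key-by-key instead of being replaced wholesale. The helper itself is not
# shown in this section; a minimal sketch of the assumed behavior follows
# (sketch name and example values are hypothetical).
def _recursive_update_sketch(base, updates):
    """Merge ``updates`` into ``base`` in place, descending into nested dicts."""
    for key, value in updates.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            _recursive_update_sketch(base[key], value)
        else:
            base[key] = value
    return base


# Example: only "layers" inside model_params is overridden; the other
# model_params defaults are preserved.
_defaults = {"model_params": {"model_type": "fcnet", "layers": [256, 256]}}
_merged_kwargs = _recursive_update_sketch(
    _defaults, {"model_params": {"layers": [64, 64]}})
assert _merged_kwargs["model_params"]["model_type"] == "fcnet"
assert _merged_kwargs["model_params"]["layers"] == [64, 64]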