Code example #1
def get_hyperparameters(args, policy):
    """Return the hyperparameters of a training algorithm from the parser."""
    algorithm_params = {
        "nb_train_steps": args.nb_train_steps,
        "nb_rollout_steps": args.nb_rollout_steps,
        "nb_eval_episodes": args.nb_eval_episodes,
        "actor_update_freq": args.actor_update_freq,
        "meta_update_freq": args.meta_update_freq,
        "reward_scale": args.reward_scale,
        "render": args.render,
        "render_eval": args.render_eval,
        "save_replay_buffer": args.save_replay_buffer,
        "verbose": args.verbose,
        "num_envs": args.num_envs,
        "_init_setup_model": True,
    }

    # add FeedForwardPolicy parameters
    policy_kwargs = {
        "l2_penalty": args.l2_penalty,
        "model_params": {
            "model_type":
            getattr(args, "model_params:model_type"),
            "layer_norm":
            getattr(args, "model_params:layer_norm"),
            "ignore_image":
            getattr(args, "model_params:ignore_image"),
            "image_height":
            getattr(args, "model_params:image_height"),
            "image_width":
            getattr(args, "model_params:image_width"),
            "image_channels":
            getattr(args, "model_params:image_channels"),
            "ignore_flat_channels":
            getattr(args, "model_params:ignore_flat_channels")
            or FEEDFORWARD_PARAMS["model_params"]["ignore_flat_channels"],
            "filters":
            getattr(args, "model_params:filters")
            or FEEDFORWARD_PARAMS["model_params"]["filters"],
            "kernel_sizes":
            getattr(args, "model_params:kernel_sizes")
            or FEEDFORWARD_PARAMS["model_params"]["kernel_sizes"],
            "strides":
            getattr(args, "model_params:strides")
            or FEEDFORWARD_PARAMS["model_params"]["strides"],
            "layers":
            getattr(args, "model_params:layers")
            or FEEDFORWARD_PARAMS["model_params"]["layers"],
        }
    }

    # add TD3 parameters
    if is_td3_policy(policy):
        policy_kwargs.update({
            "buffer_size": args.buffer_size,
            "batch_size": args.batch_size,
            "actor_lr": args.actor_lr,
            "critic_lr": args.critic_lr,
            "tau": args.tau,
            "gamma": args.gamma,
            "use_huber": args.use_huber,
            "noise": args.noise,
            "target_policy_noise": args.target_policy_noise,
            "target_noise_clip": args.target_noise_clip,
        })

    # add SAC parameters
    if is_sac_policy(policy):
        policy_kwargs.update({
            "buffer_size": args.buffer_size,
            "batch_size": args.batch_size,
            "actor_lr": args.actor_lr,
            "critic_lr": args.critic_lr,
            "tau": args.tau,
            "gamma": args.gamma,
            "use_huber": args.use_huber,
            "target_entropy": args.target_entropy,
        })

    # add PPO parameters
    if is_ppo_policy(policy):
        policy_kwargs.update({
            "learning_rate": args.learning_rate,
            "n_minibatches": args.n_minibatches,
            "n_opt_epochs": args.n_opt_epochs,
            "gamma": args.gamma,
            "lam": args.lam,
            "ent_coef": args.ent_coef,
            "vf_coef": args.vf_coef,
            "max_grad_norm": args.max_grad_norm,
            "cliprange": args.cliprange,
            "cliprange_vf": args.cliprange_vf,
        })

    # add GoalConditionedPolicy parameters
    if is_goal_conditioned_policy(policy):
        policy_kwargs.update({
            "num_levels": args.num_levels,
            "meta_period": args.meta_period,
            "intrinsic_reward_type": args.intrinsic_reward_type,
            "intrinsic_reward_scale": args.intrinsic_reward_scale,
            "relative_goals": args.relative_goals,
            "off_policy_corrections": args.off_policy_corrections,
            "hindsight": args.hindsight,
            "subgoal_testing_rate": args.subgoal_testing_rate,
            "cooperative_gradients": args.cooperative_gradients,
            "cg_weights": args.cg_weights,
            "cg_delta": args.cg_delta,
            "pretrain_worker": args.pretrain_worker,
            "pretrain_path": args.pretrain_path,
            "pretrain_ckpt": args.pretrain_ckpt,
        })

    # add MultiActorCriticPolicy parameters
    if is_multiagent_policy(policy):
        policy_kwargs.update({
            "shared": args.shared,
            "maddpg": args.maddpg,
            "n_agents": args.n_agents,
        })

    # add the policy_kwargs term to the algorithm parameters
    algorithm_params['policy_kwargs'] = policy_kwargs

    return algorithm_params
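
Below is a minimal usage sketch, not taken from the project, showing how get_hyperparameters could be driven from an argparse-style namespace for a TD3 feed-forward policy. The import paths and every concrete value are assumptions; note that attributes whose names contain a colon (e.g. "model_params:model_type") are not valid Python identifiers and therefore have to be attached with setattr.

import argparse

# Assumed import paths; the real module layout may differ.
from hbaselines.fcnet.td3 import FeedForwardPolicy
from hbaselines.utils.train import get_hyperparameters

# Mimic the namespace produced by the project's argument parser
# (all values below are placeholders, not recommended settings).
args = argparse.Namespace(
    nb_train_steps=1, nb_rollout_steps=1, nb_eval_episodes=50,
    actor_update_freq=2, meta_update_freq=10, reward_scale=1.0,
    render=False, render_eval=False, save_replay_buffer=False,
    verbose=2, num_envs=1, l2_penalty=0.0,
    buffer_size=200000, batch_size=128, actor_lr=3e-4, critic_lr=3e-4,
    tau=0.005, gamma=0.99, use_huber=False, noise=0.1,
    target_policy_noise=0.2, target_noise_clip=0.5,
)

# Attribute names containing a colon can only be set via setattr.
for name, value in [
        ("model_params:model_type", "fcnet"),
        ("model_params:layer_norm", False),
        ("model_params:ignore_image", False),
        ("model_params:image_height", 32),
        ("model_params:image_width", 32),
        ("model_params:image_channels", 3),
        ("model_params:ignore_flat_channels", None),
        ("model_params:filters", None),
        ("model_params:kernel_sizes", None),
        ("model_params:strides", None),
        ("model_params:layers", None)]:
    setattr(args, name, value)

hyperparams = get_hyperparameters(args, FeedForwardPolicy)
print(hyperparams["policy_kwargs"]["batch_size"])  # -> 128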
Code example #2
File: train.py  Project: RBZ-99/h-baselines
def get_hyperparameters(args, policy):
    """Return the hyperparameters of a training algorithm from the parser."""
    algorithm_params = {
        "nb_train_steps": args.nb_train_steps,
        "nb_rollout_steps": args.nb_rollout_steps,
        "nb_eval_episodes": args.nb_eval_episodes,
        "actor_update_freq": args.actor_update_freq,
        "meta_update_freq": args.meta_update_freq,
        "reward_scale": args.reward_scale,
        "render": args.render,
        "render_eval": args.render_eval,
        "verbose": args.verbose,
        "num_envs": args.num_envs,
        "_init_setup_model": True,
    }

    # add FeedForwardPolicy parameters
    policy_kwargs = {
        "buffer_size": args.buffer_size,
        "batch_size": args.batch_size,
        "actor_lr": args.actor_lr,
        "critic_lr": args.critic_lr,
        "tau": args.tau,
        "gamma": args.gamma,
        "layer_norm": args.layer_norm,
        "use_huber": args.use_huber,
    }

    # add TD3 parameters
    if is_td3_policy(policy):
        policy_kwargs.update({
            "noise": args.noise,
            "target_policy_noise": args.target_policy_noise,
            "target_noise_clip": args.target_noise_clip,
        })

    # add SAC parameters
    if is_sac_policy(policy):
        policy_kwargs.update({
            "target_entropy": args.target_entropy,
        })

    # add GoalConditionedPolicy parameters
    if is_goal_conditioned_policy(policy):
        policy_kwargs.update({
            "num_levels": args.num_levels,
            "meta_period": args.meta_period,
            "intrinsic_reward_type": args.intrinsic_reward_type,
            "intrinsic_reward_scale": args.intrinsic_reward_scale,
            "relative_goals": args.relative_goals,
            "off_policy_corrections": args.off_policy_corrections,
            "hindsight": args.hindsight,
            "subgoal_testing_rate": args.subgoal_testing_rate,
            "connected_gradients": args.connected_gradients,
            "cg_weights": args.cg_weights,
            "use_fingerprints": args.use_fingerprints,
            "centralized_value_functions": args.centralized_value_functions,
        })

    # add MultiFeedForwardPolicy parameters
    if is_multiagent_policy(policy):
        policy_kwargs.update({
            "shared": args.shared,
            "maddpg": args.maddpg,
        })

    # add the policy_kwargs term to the algorithm parameters
    algorithm_params['policy_kwargs'] = policy_kwargs

    return algorithm_params
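
Both versions branch on helper predicates (is_td3_policy, is_sac_policy, is_goal_conditioned_policy, is_multiagent_policy, plus is_ppo_policy in the first example) that are imported from elsewhere in the project and not shown here. The sketch below only illustrates one way such predicates can be written, matching policy classes by module and class name; it is an assumption, not the project's actual implementation.

def is_td3_policy(policy):
    """Illustrative check: the policy class is defined in a ``td3`` module."""
    return policy.__module__.split(".")[-1] == "td3"


def is_sac_policy(policy):
    """Illustrative check: the policy class is defined in a ``sac`` module."""
    return policy.__module__.split(".")[-1] == "sac"


def is_ppo_policy(policy):
    """Illustrative check: the policy class is defined in a ``ppo`` module."""
    return policy.__module__.split(".")[-1] == "ppo"


def is_goal_conditioned_policy(policy):
    """Illustrative check: some ancestor class is named GoalConditionedPolicy."""
    return any(c.__name__ == "GoalConditionedPolicy" for c in policy.__mro__)


def is_multiagent_policy(policy):
    """Illustrative check: some ancestor class name starts with ``Multi``."""
    return any(c.__name__.startswith("Multi") for c in policy.__mro__)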
Code example #3
    def __init__(self,
                 policy,
                 env,
                 eval_env=None,
                 nb_train_steps=1,
                 nb_rollout_steps=1,
                 nb_eval_episodes=50,
                 actor_update_freq=2,
                 meta_update_freq=10,
                 reward_scale=1.,
                 render=False,
                 render_eval=False,
                 eval_deterministic=True,
                 verbose=0,
                 policy_kwargs=None,
                 _init_setup_model=True):
        """Instantiate the algorithm object.

        Parameters
        ----------
        policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
            the policy model to use
        env : gym.Env or str
            the environment to learn from (if registered in Gym, can be str)
        eval_env : gym.Env or str
            the environment to evaluate from (if registered in Gym, can be str)
        nb_train_steps : int
            the number of training steps
        nb_rollout_steps : int
            the number of rollout steps
        nb_eval_episodes : int
            the number of evaluation episodes
        actor_update_freq : int
            number of training steps per actor policy update step. The critic
            policy is updated every training step.
        meta_update_freq : int
            number of training steps per meta policy update step. The actor
            policy of the meta-policy is further updated at the frequency
            provided by the actor_update_freq variable. Note that this value is
            only relevant when using the GoalConditionedPolicy policy.
        reward_scale : float
            the value the reward should be scaled by
        render : bool
            enable rendering of the training environment
        render_eval : bool
            enable rendering of the evaluation environment
        eval_deterministic : bool
            if set to True, the policy provides deterministic actions to the
            evaluation environment. Otherwise, stochastic or noisy actions are
            returned.
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        policy_kwargs : dict
            policy-specific hyperparameters
        _init_setup_model : bool
            Whether or not to build the network at the creation of the instance
        """
        shared = False if policy_kwargs is None else \
            policy_kwargs.get("shared", False)
        maddpg = False if policy_kwargs is None else \
            policy_kwargs.get("maddpg", False)

        self.policy = policy
        self.env_name = deepcopy(env) if isinstance(env, str) \
            else env.__str__()
        self.env = create_env(env, render, shared, maddpg, evaluate=False)
        self.eval_env = create_env(eval_env,
                                   render_eval,
                                   shared,
                                   maddpg,
                                   evaluate=True)
        self.nb_train_steps = nb_train_steps
        self.nb_rollout_steps = nb_rollout_steps
        self.nb_eval_episodes = nb_eval_episodes
        self.actor_update_freq = actor_update_freq
        self.meta_update_freq = meta_update_freq
        self.reward_scale = reward_scale
        self.render = render
        self.render_eval = render_eval
        self.eval_deterministic = eval_deterministic
        self.verbose = verbose
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
        self.context_space = getattr(self.env, "context_space", None)
        self.policy_kwargs = {'verbose': verbose}

        # add the default policy kwargs to the policy_kwargs term
        if is_feedforward_policy(policy):
            self.policy_kwargs.update(FEEDFORWARD_PARAMS.copy())
        elif is_goal_conditioned_policy(policy):
            self.policy_kwargs.update(GOAL_CONDITIONED_PARAMS.copy())
            self.policy_kwargs['env_name'] = self.env_name.__str__()
        elif is_multiagent_policy(policy):
            self.policy_kwargs.update(MULTI_FEEDFORWARD_PARAMS.copy())
            self.policy_kwargs["all_ob_space"] = getattr(
                self.env, "all_observation_space",
                Box(-1, 1, (1, ), dtype=np.float32))

        if is_td3_policy(policy):
            self.policy_kwargs.update(TD3_PARAMS.copy())
        elif is_sac_policy(policy):
            self.policy_kwargs.update(SAC_PARAMS.copy())

        self.policy_kwargs.update(policy_kwargs or {})

        # Compute the time horizon, which is used to check whether an
        # environment terminated early and to compute the done mask as per the
        # TD3 implementation (see appendix A of the TD3 paper). If the horizon
        # cannot be found under any of the attributes below, a ValueError is
        # raised.
        if hasattr(self.env, "horizon"):
            self.horizon = self.env.horizon
        elif hasattr(self.env, "_max_episode_steps"):
            self.horizon = self.env._max_episode_steps
        elif hasattr(self.env, "env_params"):
            # for Flow environments
            self.horizon = self.env.env_params.horizon
        else:
            raise ValueError("Horizon attribute not found.")

        # init
        self.graph = None
        self.policy_tf = None
        self.sess = None
        self.summary = None
        self.obs = None
        self.all_obs = None
        self.episode_step = 0
        self.episodes = 0
        self.total_steps = 0
        self.epoch_episode_steps = []
        self.epoch_episode_rewards = []
        self.epoch_episodes = 0
        self.epoch = 0
        self.episode_rew_history = deque(maxlen=100)
        self.episode_reward = 0
        self.rew_ph = None
        self.rew_history_ph = None
        self.eval_rew_ph = None
        self.eval_success_ph = None
        self.saver = None

        # Append the fingerprint dimension to the observation dimension, if
        # needed.
        if self.policy_kwargs.get("use_fingerprints", False):
            fingerprint_range = self.policy_kwargs["fingerprint_range"]
            low = np.concatenate(
                (self.observation_space.low, fingerprint_range[0]))
            high = np.concatenate(
                (self.observation_space.high, fingerprint_range[1]))
            self.observation_space = Box(low=low, high=high, dtype=np.float32)

        # Create the model variables and operations.
        if _init_setup_model:
            self.trainable_vars = self.setup_model()
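
A minimal sketch of how this constructor might be used, assuming it belongs to the project's off-policy algorithm class (referred to here as OffPolicyRLAlgorithm) and that the TD3 feed-forward policy lives at the path shown; both names are assumptions rather than confirmed API. Keys supplied in policy_kwargs override the defaults pulled in from FEEDFORWARD_PARAMS and TD3_PARAMS by the dict.update call above.

# Assumed import paths; the real module layout may differ.
from hbaselines.algorithms import OffPolicyRLAlgorithm
from hbaselines.fcnet.td3 import FeedForwardPolicy

alg = OffPolicyRLAlgorithm(
    policy=FeedForwardPolicy,
    env="HalfCheetah-v2",   # resolved via create_env; a Gym name works here
    nb_train_steps=1,
    nb_rollout_steps=1,
    reward_scale=1.0,
    verbose=2,
    # Only the keys given here override the defaults merged in from
    # FEEDFORWARD_PARAMS and TD3_PARAMS inside the constructor.
    policy_kwargs={"buffer_size": 200000, "batch_size": 128},
)
# Training itself is started through the class's learn method, which is not
# part of this excerpt.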
Code example #4
    def __init__(self,
                 policy,
                 env,
                 eval_env=None,
                 nb_train_steps=1,
                 nb_rollout_steps=1,
                 nb_eval_episodes=50,
                 actor_update_freq=2,
                 meta_update_freq=10,
                 reward_scale=1.,
                 render=False,
                 render_eval=False,
                 eval_deterministic=True,
                 save_replay_buffer=False,
                 num_envs=1,
                 verbose=0,
                 policy_kwargs=None,
                 _init_setup_model=True):
        """Instantiate the algorithm object.

        Parameters
        ----------
        policy : type [ hbaselines.base_policies.ActorCriticPolicy ]
            the policy model to use
        env : gym.Env or str
            the environment to learn from (if registered in Gym, can be str)
        eval_env : gym.Env or str
            the environment to evaluate from (if registered in Gym, can be str)
        nb_train_steps : int
            the number of training steps
        nb_rollout_steps : int
            the number of rollout steps
        nb_eval_episodes : int
            the number of evaluation episodes
        actor_update_freq : int
            number of training steps per actor policy update step. The critic
            policy is updated every training step.
        meta_update_freq : int
            number of training steps per meta policy update step. The actor
            policy of the meta-policy is further updated at the frequency
            provided by the actor_update_freq variable. Note that this value is
            only relevant when using the GoalConditionedPolicy policy.
        reward_scale : float
            the value the reward should be scaled by
        render : bool
            enable rendering of the training environment
        render_eval : bool
            enable rendering of the evaluation environment
        eval_deterministic : bool
            if set to True, the policy provides deterministic actions to the
            evaluation environment. Otherwise, stochastic or noisy actions are
            returned.
        save_replay_buffer : bool
            whether to save the data from the replay buffer, at the frequency
            that the model is saved. Only the most recent replay buffer is
            stored.
        num_envs : int
            number of environments used to run simulations in parallel. Each
            environment is run on a separate CPU and uses the same policy as
            the rest. Must be less than or equal to nb_rollout_steps.
        verbose : int
            the verbosity level: 0 none, 1 training information, 2 tensorflow
            debug
        policy_kwargs : dict
            policy-specific hyperparameters
        _init_setup_model : bool
            Whether or not to build the network at the creation of the instance

        Raises
        ------
        AssertionError
            if num_envs > nb_rollout_steps
        """
        shared = False if policy_kwargs is None else \
            policy_kwargs.get("shared", False)
        maddpg = False if policy_kwargs is None else \
            policy_kwargs.get("maddpg", False)

        # Run assertions.
        assert num_envs <= nb_rollout_steps, \
            "num_envs must be less than or equal to nb_rollout_steps"

        # Initialize ray when running multiple environments in parallel.
        if num_envs > 1:
            ray.init(num_cpus=num_envs + 1, ignore_reinit_error=True)

        self.policy = policy
        self.env_name = deepcopy(env) if isinstance(env, str) \
            else env.__str__()
        self.eval_env, _ = create_env(eval_env,
                                      render_eval,
                                      shared,
                                      maddpg,
                                      evaluate=True)
        self.nb_train_steps = nb_train_steps
        self.nb_rollout_steps = nb_rollout_steps
        self.nb_eval_episodes = nb_eval_episodes
        self.actor_update_freq = actor_update_freq
        self.meta_update_freq = meta_update_freq
        self.reward_scale = reward_scale
        self.render = render
        self.render_eval = render_eval
        self.eval_deterministic = eval_deterministic
        self.save_replay_buffer = save_replay_buffer
        self.num_envs = num_envs
        self.verbose = verbose
        self.policy_kwargs = {'verbose': verbose}

        # Create the environment and collect the initial observations.
        self.sampler, self.obs, self.all_obs = self.setup_sampler(
            env, render, shared, maddpg)

        # Collect the spaces of the environments.
        self.ac_space, self.ob_space, self.co_space, all_ob_space = \
            self.get_spaces()

        # Add the default policy kwargs to the policy_kwargs term.
        if is_feedforward_policy(policy):
            self.policy_kwargs.update(FEEDFORWARD_PARAMS.copy())

        if is_goal_conditioned_policy(policy):
            self.policy_kwargs.update(GOAL_CONDITIONED_PARAMS.copy())
            self.policy_kwargs['env_name'] = self.env_name.__str__()
            self.policy_kwargs['num_envs'] = num_envs

        if is_multiagent_policy(policy):
            self.policy_kwargs.update(MULTIAGENT_PARAMS.copy())
            self.policy_kwargs["all_ob_space"] = all_ob_space

        if is_td3_policy(policy):
            self.policy_kwargs.update(TD3_PARAMS.copy())
        elif is_sac_policy(policy):
            self.policy_kwargs.update(SAC_PARAMS.copy())

        self.policy_kwargs = recursive_update(self.policy_kwargs, policy_kwargs
                                              or {})

        # Compute the time horizon, which is used to check whether an
        # environment terminated early and to compute the done mask for TD3.
        if self.num_envs > 1:
            self.horizon = ray.get(self.sampler[0].horizon.remote())
        else:
            self.horizon = self.sampler[0].horizon()

        # init
        self.graph = None
        self.policy_tf = None
        self.sess = None
        self.summary = None
        self.episode_step = [0 for _ in range(num_envs)]
        self.episodes = 0
        self.total_steps = 0
        self.epoch_episode_steps = []
        self.epoch_episode_rewards = []
        self.epoch_episodes = 0
        self.epoch = 0
        self.episode_rew_history = deque(maxlen=100)
        self.episode_reward = [0 for _ in range(num_envs)]
        self.rew_ph = None
        self.rew_history_ph = None
        self.eval_rew_ph = None
        self.eval_success_ph = None
        self.saver = None

        # Create the model variables and operations.
        if _init_setup_model:
            self.trainable_vars = self.setup_model()
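
Unlike the constructor in code example #3, this version merges the user-supplied policy_kwargs with recursive_update rather than dict.update, so nested dictionaries such as model_params are merged key by key instead of being replaced wholesale. Below is a minimal sketch of such a helper, written for illustration only; it is not necessarily the project's implementation.

def recursive_update(base, overrides):
    """Recursively merge ``overrides`` into ``base`` and return ``base``.

    Nested dictionaries (e.g. a ``model_params`` entry) are merged key by
    key; any other value in ``overrides`` simply replaces the corresponding
    value in ``base``.
    """
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            recursive_update(base[key], value)
        else:
            base[key] = value
    return base


# With a plain dict.update, overriding one leaf of model_params would drop
# every other default; the recursive merge keeps them.
defaults = {"gamma": 0.99,
            "model_params": {"model_type": "fcnet", "layers": [256, 256]}}
recursive_update(defaults, {"model_params": {"layers": [64, 64]}})
# defaults == {"gamma": 0.99,
#              "model_params": {"model_type": "fcnet", "layers": [64, 64]}}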