Example #1
  def setUp(self):
    super(PpoTest, self).setUp()
    self.rng_key = trainer_lib.init_random_number_generators(0)
Example #2
  def setUp(self):
    super(PolicyBasedUtilsTest, self).setUp()
    self.rng_key = trainer_lib.init_random_number_generators(0)
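Both setUp snippets above seed the test fixture through trainer_lib.init_random_number_generators(0) and keep the result as self.rng_key. A minimal sketch of how a test body might consume that key, assuming it is a standard JAX PRNG key (the helper below is hypothetical, not part of the source):

import jax

def _sample_batch(rng_key, shape=(4, 8)):
  # Split first so the stored key stays reusable and the draw stays deterministic.
  rng_key, subkey = jax.random.split(rng_key)
  batch = jax.random.normal(subkey, shape)
  return rng_key, batch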
Example #3
    def __init__(
            self,
            train_env,
            eval_env,
            output_dir=None,
            random_seed=None,
            controller=None,
            # Policy and Value model arguments.
            policy_and_value_model=None,
            policy_and_value_optimizer=None,
            policy_and_value_two_towers=False,
            policy_and_value_vocab_size=None,
            init_policy_from_world_model_output_dir=None,
            # Trajectory collection arguments.
            boundary=20,
            max_timestep=100,
            max_timestep_eval=20000,
            len_history_for_policy=4,
            # Save / Restore arguments.
            should_save_checkpoints=True,
            should_write_summaries=True,
            eval_every_n=1000,
            save_every_n=1000,
            done_frac_for_policy_save=0.5,
            # Eval arguments.
            n_evals=1,
            eval_temperatures=(1.0, 0.5),
            separate_eval=True,
            # Optimization arguments.
            n_optimizer_steps=N_OPTIMIZER_STEPS,
            optimizer_batch_size=64,
            **kwargs):
        """Creates the PolicyBasedTrainer.

        Args:
          train_env: gym.Env to use for training.
          eval_env: gym.Env to use for evaluation.
          output_dir: Output dir.
          random_seed: Random seed.
          controller: Function history -> (step -> {'name': value}) controlling
            nontrainable parameters.
          policy_and_value_model: Function defining the policy and value network,
            without the policy and value heads.
          policy_and_value_optimizer: Function defining the optimizer.
          policy_and_value_two_towers: Whether to use two separate models as the
            policy and value networks. If False, share their parameters.
          policy_and_value_vocab_size: Vocabulary size of a policy and value network
            operating on serialized representation. If None, use raw continuous
            representation.
          init_policy_from_world_model_output_dir: Model output dir for initializing
            the policy. If None, initialize randomly.
          boundary: We pad trajectories at integer multiples of this number.
          max_timestep: If set to an integer, maximum number of time-steps in a
            trajectory. Used in the collect procedure.
          max_timestep_eval: If set to an integer, maximum number of time-steps in
            an evaluation trajectory. Used in the collect procedure.
          len_history_for_policy: How much of history to give to the policy.
          should_save_checkpoints: Whether to save policy checkpoints.
          should_write_summaries: Whether to save summaries.
          eval_every_n: How frequently to eval the policy.
          save_every_n: How frequently to save the policy.
          done_frac_for_policy_save: Fraction of the trajectories that should be
            done to checkpoint the policy.
          n_evals: Number of times to evaluate.
          eval_temperatures: Sequence of temperatures to try for categorical
            sampling during evaluation.
          separate_eval: Whether to run separate evaluation using a set of
            temperatures. If False, the training reward is reported as evaluation
            reward with temperature 1.0.
          n_optimizer_steps: Number of optimizer steps.
          optimizer_batch_size: Batch size of an optimizer step.
          **kwargs: Additional keyword arguments passed to the base class.
        """
        super(PolicyBasedTrainer, self).__init__(train_env, eval_env,
                                                 output_dir, **kwargs)

        self._rng = trainer_lib.init_random_number_generators(random_seed)
        self._controller = controller
        self._history = None
        self._epoch = 0

        # Trajectory collection arguments.
        self._boundary = boundary
        self._max_timestep = max_timestep
        self._max_timestep_eval = max_timestep_eval
        self._len_history_for_policy = len_history_for_policy

        # Save / Restore arguments.
        self._should_save_checkpoints = should_save_checkpoints
        self._should_write_summaries = should_write_summaries
        self._train_sw, self._eval_sw, self._timing_sw = None, None, None
        self._eval_every_n = eval_every_n
        self._save_every_n = save_every_n
        self._done_frac_for_policy_save = done_frac_for_policy_save
        self._n_trajectories_done_since_last_save = 0
        self._last_saved_at_epoch = self._epoch

        # Eval arguments.
        self._n_evals = n_evals
        self._eval_temperatures = eval_temperatures
        self._separate_eval = separate_eval

        # Optimization arguments.
        self._n_optimizer_steps = n_optimizer_steps
        self._optimizer_batch_size = optimizer_batch_size
        self._total_opt_step = 0

        # Policy and Value model arguments.
        self._policy_and_value_vocab_size = policy_and_value_vocab_size
        self._serialization_kwargs = {}
        if self._policy_and_value_vocab_size is not None:
            self._serialization_kwargs = ppo.init_serialization(
                vocab_size=self._policy_and_value_vocab_size,
                observation_space=train_env.observation_space,
                action_space=train_env.action_space,
                n_timesteps=(self._max_timestep + 1),
            )
        self._init_policy_from_world_model_output_dir = (
            init_policy_from_world_model_output_dir)

        self._rewards_to_actions = ppo.init_rewards_to_actions(
            self._policy_and_value_vocab_size,
            train_env.observation_space,
            train_env.action_space,
            n_timesteps=(self._max_timestep + 1),
        )

        (n_controls, n_actions) = serialization_utils.analyze_action_space(
            train_env.action_space)
        self._policy_and_value_net_fn = functools.partial(
            ppo.policy_and_value_net,
            n_actions=n_actions,
            n_controls=n_controls,
            vocab_size=self._policy_and_value_vocab_size,
            bottom_layers_fn=policy_and_value_model,
            two_towers=policy_and_value_two_towers,
        )
        self._policy_and_value_net_apply = jit(self._policy_and_value_net_fn())
        self._policy_and_value_optimizer = policy_and_value_optimizer()
        self._model_state = None
        self._policy_and_value_opt_state = None
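The controller argument above is described only abstractly in the docstring: a function history -> (step -> {'name': value}) that controls nontrainable parameters. A minimal sketch of a callable with that shape, assuming 'entropy_weight' is one of the controlled nontrainable parameters (it appears as such in Example #4); the annealing schedule itself is purely illustrative:

def linear_entropy_controller(history):
  # This toy controller ignores the training history.
  del history

  def schedule(step):
    # Linearly anneal the entropy bonus to zero over the first 10k steps.
    frac = min(step / 10000.0, 1.0)
    return {'entropy_weight': 0.01 * (1.0 - frac)}

  return schedule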
Example #4
  def __init__(self,
               train_env,
               eval_env,
               output_dir=None,
               policy_and_value_model=trax_models.FrameStackMLP,
               policy_and_value_optimizer=functools.partial(
                   trax_opt.Adam, learning_rate=1e-3),
               policy_and_value_two_towers=False,
               policy_and_value_vocab_size=None,
               n_optimizer_steps=N_OPTIMIZER_STEPS,
               optimizer_batch_size=64,
               print_every_optimizer_steps=PRINT_EVERY_OPTIMIZER_STEP,
               target_kl=0.01,
               boundary=20,
               max_timestep=100,
               max_timestep_eval=20000,
               random_seed=None,
               gamma=GAMMA,
               lambda_=LAMBDA,
               value_weight=1.0,
               entropy_weight=0.01,
               epsilon=0.1,
               eval_every_n=1000,
               save_every_n=1000,
               done_frac_for_policy_save=0.5,
               n_evals=1,
               len_history_for_policy=4,
               eval_temperatures=(1.0, 0.5),
               separate_eval=True,
               init_policy_from_world_model_output_dir=None,
               controller=None,
               should_save_checkpoints=True,
               should_write_summaries=True,
               **kwargs):
    """Creates the PPO trainer.

    Args:
      train_env: gym.Env to use for training.
      eval_env: gym.Env to use for evaluation.
      output_dir: Output dir.
      policy_and_value_model: Function defining the policy and value network,
        without the policy and value heads.
      policy_and_value_optimizer: Function defining the optimizer.
      policy_and_value_two_towers: Whether to use two separate models as the
        policy and value networks. If False, share their parameters.
      policy_and_value_vocab_size: Vocabulary size of a policy and value network
        operating on serialized representation. If None, use raw continuous
        representation.
      n_optimizer_steps: Number of optimizer steps.
      optimizer_batch_size: Batch size of an optimizer step.
      print_every_optimizer_steps: How often to log during the policy
        optimization process.
      target_kl: Policy iteration early stopping. Set to infinity to disable
        early stopping.
      boundary: We pad trajectories at integer multiples of this number.
      max_timestep: If set to an integer, maximum number of time-steps in a
        trajectory. Used in the collect procedure.
      max_timestep_eval: If set to an integer, maximum number of time-steps in
        an evaluation trajectory. Used in the collect procedure.
      random_seed: Random seed.
      gamma: Reward discount factor.
      lambda_: N-step TD-error discount factor in GAE.
      value_weight: Value loss coefficient.
      entropy_weight: Entropy loss coefficient.
      epsilon: Clipping coefficient.
      eval_every_n: How frequently to eval the policy.
      save_every_n: How frequently to save the policy.
      done_frac_for_policy_save: Fraction of the trajectories that should be
        done to checkpoint the policy.
      n_evals: Number of times to evaluate.
      len_history_for_policy: How much of history to give to the policy.
      eval_temperatures: Sequence of temperatures to try for categorical
        sampling during evaluation.
      separate_eval: Whether to run separate evaluation using a set of
        temperatures. If False, the training reward is reported as evaluation
        reward with temperature 1.0.
      init_policy_from_world_model_output_dir: Model output dir for initializing
        the policy. If None, initialize randomly.
      controller: Function history -> (step -> {'name': value}) controlling
        nontrainable parameters.
      should_save_checkpoints: Whether to save policy checkpoints.
      should_write_summaries: Whether to save summaries.
      **kwargs: Additional keyword arguments passed to the base class.
    """
    # Set in base class constructor.
    self._train_env = None
    self._should_reset = None

    self._n_optimizer_steps = n_optimizer_steps
    self._optimizer_batch_size = optimizer_batch_size
    self._print_every_optimizer_steps = print_every_optimizer_steps
    self._target_kl = target_kl
    self._boundary = boundary
    self._max_timestep = max_timestep
    self._max_timestep_eval = max_timestep_eval
    self._nontrainable_params = {
        'gamma': np.array(gamma),
        'lambda': np.array(lambda_),
        'value_weight': np.array(value_weight),
        'entropy_weight': np.array(entropy_weight),
        'epsilon': np.array(epsilon),
    }
    self._eval_every_n = eval_every_n
    self._save_every_n = save_every_n
    self._done_frac_for_policy_save = done_frac_for_policy_save
    self._n_evals = n_evals
    self._len_history_for_policy = len_history_for_policy
    self._eval_temperatures = eval_temperatures
    self._separate_eval = separate_eval
    self._controller = controller
    self._should_save_checkpoints = should_save_checkpoints
    self._should_write_summaries = should_write_summaries
    self._history = None

    (n_controls, n_actions) = ppo.analyze_action_space(train_env.action_space)

    self._rng = trainer_lib.init_random_number_generators(random_seed)

    self._policy_and_value_vocab_size = policy_and_value_vocab_size
    if self._policy_and_value_vocab_size is not None:
      self._serialization_kwargs = ppo.init_serialization(
          vocab_size=self._policy_and_value_vocab_size,
          observation_space=train_env.observation_space,
          action_space=train_env.action_space,
          n_timesteps=(self._max_timestep + 1),
      )
    else:
      self._serialization_kwargs = {}
    self.init_policy_from_world_model_output_dir = (
        init_policy_from_world_model_output_dir
    )

    self._rewards_to_actions = ppo.init_rewards_to_actions(
        self._policy_and_value_vocab_size,
        train_env.observation_space,
        train_env.action_space,
        n_timesteps=(self._max_timestep + 1),
    )

    self._policy_and_value_net_fn = functools.partial(
        ppo.policy_and_value_net,
        n_actions=n_actions,
        n_controls=n_controls,
        vocab_size=self._policy_and_value_vocab_size,
        bottom_layers_fn=policy_and_value_model,
        two_towers=policy_and_value_two_towers,
    )
    self._policy_and_value_net_apply = jit(self._policy_and_value_net_fn())
    self._policy_and_value_optimizer = policy_and_value_optimizer()

    # Super ctor calls reset(), which uses fields initialized above.
    super(PPO, self).__init__(train_env, eval_env, output_dir, **kwargs)
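For context, a hedged usage sketch of the constructor shown above. 'CartPole-v0' and the output directory are illustrative assumptions; the keyword arguments echo the signature and defaults from Example #4, and trax_models, trax_opt and functools are assumed to be imported as in that module:

import functools
import gym

train_env = gym.make('CartPole-v0')
eval_env = gym.make('CartPole-v0')

ppo_trainer = PPO(
    train_env,
    eval_env,
    output_dir='/tmp/ppo_example',  # hypothetical output directory
    policy_and_value_model=trax_models.FrameStackMLP,
    policy_and_value_optimizer=functools.partial(
        trax_opt.Adam, learning_rate=1e-3),
    max_timestep=200,
    eval_every_n=100,
    save_every_n=100,
)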