Example #1
    def __init__(self):

        observation_tensor_spec = tf.TensorSpec(shape=[1], dtype=tf.float32)
        action_tensor_spec = tensor_spec.BoundedTensorSpec([2], tf.float32, -1,
                                                           1)

        actor_net_builder = ppo_actor_network.PPOActorNetwork()
        actor_net = actor_net_builder.create_sequential_actor_net(
            fc_layer_units=(1, ), action_tensor_spec=action_tensor_spec)
        value_net = value_network.ValueNetwork(observation_tensor_spec,
                                               fc_layer_params=(1, ))

        super(FakePPOAgent, self).__init__(
            time_step_spec=ts.time_step_spec(observation_tensor_spec),
            action_spec=action_tensor_spec,
            actor_net=actor_net,
            value_net=value_net,
            # Ensures value_prediction, return, and advantage are included as part
            # of the training_data_spec.
            compute_value_and_advantage_in_train=False,
            update_normalizers_in_train=False,
        )
        # There is an artificial call to `_train` during initialization, which
        # ensures that the optimizer variables are created. That call is
        # excluded from the call count.
        self.train_called_times = -1
        self.experiences = []
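The counters above imply a `_train` override elsewhere in the fake agent that records each call. A minimal sketch of what that override might look like, assuming the class derives from a TF-Agents PPO agent and that `tf_agents.agents.tf_agent` is imported as `tf_agent` (both are assumptions, not part of the original example):

    def _train(self, experience, weights):
        # Hypothetical override: record every call so a test can assert on it.
        self.train_called_times += 1
        self.experiences.append(experience)
        # Return a zero loss so the fake agent stays inert during training.
        return tf_agent.LossInfo(loss=tf.constant(0.0), extra=())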
Example #2
    def test_same_actor_net_output(self):
        if not tf.executing_eagerly():
            self.skipTest(
                'Skipping test: sequential networks not supported in TF1')
        observation_tensor_spec = tf.TensorSpec(shape=[1], dtype=tf.float32)
        action_tensor_spec = tensor_spec.BoundedTensorSpec((8, ), tf.float32,
                                                           -1, 1)

        actor_net_lib = ppo_actor_network.PPOActorNetwork()
        actor_net_lib.seed_stream_class = DeterministicSeedStream
        actor_net_sequential = actor_net_lib.create_sequential_actor_net(
            fc_layer_units=(1, ),
            action_tensor_spec=action_tensor_spec,
            seed=1)

        actor_net_actor_dist = actor_distribution_network.ActorDistributionNetwork(
            observation_tensor_spec,
            action_tensor_spec,
            fc_layer_params=(1, ),
            activation_fn=tf.nn.tanh,
            kernel_initializer=tf.keras.initializers.Orthogonal(seed=1),
            seed_stream_class=DeterministicSeedStream,
            seed=1)

        sample_observation = tf.constant([[1], [2]], dtype=tf.float32)
        tf.random.set_seed(111)
        sequential_output_dist, _ = actor_net_sequential(
            sample_observation, step_type=ts.StepType.MID, network_state=())
        tf.random.set_seed(111)
        actor_dist_output_dist, _ = actor_net_actor_dist(
            sample_observation, step_type=ts.StepType.MID, network_state=())
        self.assertAllEqual(sequential_output_dist.mean(),
                            actor_dist_output_dist.mean())
        self.assertAllEqual(sequential_output_dist.stddev(),
                            actor_dist_output_dist.stddev())
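Examples #2 through #4 rely on a `DeterministicSeedStream` helper defined outside this excerpt. A minimal stand-in, assuming it only needs to mimic the `tfp.util.SeedStream` constructor (a seed plus a salt) while always returning the same fixed seed, could look like this:

class DeterministicSeedStream(object):
    """Seed stream stub that returns a constant seed on every call."""

    def __init__(self, seed, salt=''):
        del salt  # Unused: determinism is the whole point of this stub.
        self._seed = seed

    def __call__(self):
        return self._seed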
Example #3
    def test_same_policy_same_output(self):
        if not tf.executing_eagerly():
            self.skipTest(
                'Skipping test: sequential networks not supported in TF1')
        observation_tensor_spec = tf.TensorSpec(shape=[1], dtype=tf.float32)
        action_tensor_spec = tensor_spec.BoundedTensorSpec((8, ), tf.float32,
                                                           -1, 1)

        value_net = value_network.ValueNetwork(observation_tensor_spec,
                                               fc_layer_params=(1, ))

        actor_net_lib = ppo_actor_network.PPOActorNetwork()
        actor_net_lib.seed_stream_class = DeterministicSeedStream
        actor_net_sequential = actor_net_lib.create_sequential_actor_net(
            fc_layer_units=(1, ),
            action_tensor_spec=action_tensor_spec,
            seed=1)
        actor_net_actor_dist = actor_distribution_network.ActorDistributionNetwork(
            observation_tensor_spec,
            action_tensor_spec,
            fc_layer_params=(1, ),
            activation_fn=tf.nn.tanh,
            kernel_initializer=tf.keras.initializers.Orthogonal(seed=1),
            seed_stream_class=DeterministicSeedStream,
            seed=1)

        tf.random.set_seed(111)
        seq_policy = ppo_policy.PPOPolicy(
            ts.time_step_spec(observation_tensor_spec),
            action_tensor_spec,
            actor_net_sequential,
            value_net,
            collect=True)
        tf.random.set_seed(111)
        actor_dist_policy = ppo_policy.PPOPolicy(
            ts.time_step_spec(observation_tensor_spec),
            action_tensor_spec,
            actor_net_actor_dist,
            value_net,
            collect=True)

        sample_timestep = ts.TimeStep(step_type=tf.constant([1, 1],
                                                            dtype=tf.int32),
                                      reward=tf.constant([1, 1],
                                                         dtype=tf.float32),
                                      discount=tf.constant([1, 1],
                                                           dtype=tf.float32),
                                      observation=tf.constant(
                                          [[1], [2]], dtype=tf.float32))
        seq_policy_step = seq_policy._distribution(sample_timestep,
                                                   policy_state=())
        act_dist_policy_step = actor_dist_policy._distribution(sample_timestep,
                                                               policy_state=())

        seq_scale = seq_policy_step.info['dist_params']['scale_diag']
        act_dist_scale = act_dist_policy_step.info['dist_params']['scale']
        self.assertAllEqual(seq_scale, act_dist_scale)
        self.assertAllEqual(seq_policy_step.info['dist_params']['loc'],
                            act_dist_policy_step.info['dist_params']['loc'])
Example #4
    def test_no_mismatched_shape(self):
        if not tf.executing_eagerly():
            self.skipTest(
                'Skipping test: sequential networks not supported in TF1')
        observation_tensor_spec = tf.TensorSpec(shape=[1], dtype=tf.float32)
        action_tensor_spec = tensor_spec.BoundedTensorSpec((8, ), tf.float32,
                                                           -1, 1)

        actor_net_lib = ppo_actor_network.PPOActorNetwork()
        actor_net_lib.seed_stream_class = DeterministicSeedStream
        actor_net = actor_net_lib.create_sequential_actor_net(
            fc_layer_units=(1, ),
            action_tensor_spec=action_tensor_spec,
            seed=1)

        actor_output_spec = actor_net.create_variables(observation_tensor_spec)

        distribution_utils.assert_specs_are_compatible(
            actor_output_spec, action_tensor_spec,
            'actor_network output spec does not match action spec')
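The test snippets above assume TF-Agents imports roughly along the following lines; the exact module paths are an assumption and may differ slightly between TF-Agents versions:

import tensorflow as tf

from tf_agents.agents.ppo import ppo_actor_network
from tf_agents.agents.ppo import ppo_policy
from tf_agents.distributions import utils as distribution_utils
from tf_agents.networks import actor_distribution_network
from tf_agents.networks import value_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts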
Example #5
def train_eval(
    root_dir,
    env_name='HalfCheetah-v2',
    # Training params
    num_iterations=1600,
    actor_fc_layers=(64, 64),
    value_fc_layers=(64, 64),
    learning_rate=3e-4,
    collect_sequence_length=2048,
    minibatch_size=64,
    num_epochs=10,
    # Agent params
    importance_ratio_clipping=0.2,
    lambda_value=0.95,
    discount_factor=0.99,
    entropy_regularization=0.,
    value_pred_loss_coef=0.5,
    use_gae=True,
    use_td_lambda_return=True,
    gradient_clipping=0.5,
    value_clipping=None,
    # Replay params
    reverb_port=None,
    replay_capacity=10000,
    # Others
    policy_save_interval=5000,
    summary_interval=1000,
    eval_interval=10000,
    eval_episodes=100,
    debug_summaries=False,
    summarize_grads_and_vars=False):
  """Trains and evaluates PPO (Importance Ratio Clipping).

  Args:
    root_dir: Main directory path where checkpoints, saved_models, and summaries
      will be written to.
    env_name: Name for the Mujoco environment to load.
    num_iterations: The number of iterations to perform collection and training.
    actor_fc_layers: List of fully_connected parameters for the actor network,
      where each item is the number of units in the layer.
    value_fc_layers: List of fully_connected parameters for the value network,
      where each item is the number of units in the layer.
    learning_rate: Learning rate used on the Adam optimizer.
    collect_sequence_length: Number of steps to take in each collect run.
    minibatch_size: Number of elements in each mini batch. If `None`, the entire
      collected sequence will be treated as one batch.
    num_epochs: Number of iterations to repeat over all collected data per data
      collection step. (Schulman, 2017) sets this to 10 for Mujoco, 15 for
      Roboschool, and 3 for Atari.
    importance_ratio_clipping: Epsilon in clipped, surrogate PPO objective. For
      more detail, see explanation at the top of the doc.
    lambda_value: Lambda parameter for TD-lambda computation.
    discount_factor: Discount factor for return computation. Defaults to `0.99`,
      which is the value used for all environments in (Schulman, 2017).
    entropy_regularization: Coefficient for the entropy regularization loss term.
      Defaults to `0.0` because no entropy bonus was used in (Schulman, 2017).
    value_pred_loss_coef: Multiplier for value prediction loss to balance with
      policy gradient loss. Defaults to `0.5`, which was used for all
      environments in the OpenAI Baselines implementation. This parameter is
      irrelevant unless you are sharing part of actor_net and value_net. In that
      case, you would want to tune this coefficient, whose value depends on the
      network architecture of your choice.
    use_gae: If True (the default here), uses generalized advantage estimation
      for computing per-timestep advantage. Otherwise, just subtracts value
      predictions from the empirical return.
    use_td_lambda_return: If True (the default here), uses td_lambda_return for
      training the value function, where `td_lambda_return = gae_advantage +
      value_predictions`. `use_gae` must also be set to `True` to enable
      TD-lambda returns. If `use_td_lambda_return` is set to True while
      `use_gae` is False, the empirical return will be used and a warning will
      be logged.
    gradient_clipping: Norm length to clip gradients.
    value_clipping: Differences between new and old value predictions are
      clipped to this threshold. Value clipping could be helpful when training
      very deep networks. Default: no clipping.
    reverb_port: Port for the Reverb server. If None, a randomly chosen unused
      port is used.
    replay_capacity: The maximum number of elements for the replay buffer. Items
      will be wasted if this is smaller than collect_sequence_length.
    policy_save_interval: How often, in train_steps, the policy will be saved.
    summary_interval: How often to write data to TensorBoard.
    eval_interval: How often to run evaluation, in train_steps.
    eval_episodes: Number of episodes to evaluate over.
    debug_summaries: Boolean for whether to gather debug summaries.
    summarize_grads_and_vars: If true, gradient summaries will be written.
  """
  collect_env = suite_mujoco.load(env_name)
  eval_env = suite_mujoco.load(env_name)
  num_environments = 1

  observation_tensor_spec, action_tensor_spec, time_step_tensor_spec = (
      spec_utils.get_tensor_specs(collect_env))
  # TODO(b/172267869): Remove this conversion once TensorNormalizer stops
  # converting float64 inputs to float32.
  observation_tensor_spec = tf.TensorSpec(
      dtype=tf.float32, shape=observation_tensor_spec.shape)

  train_step = train_utils.create_train_step()
  actor_net_builder = ppo_actor_network.PPOActorNetwork()
  actor_net = actor_net_builder.create_sequential_actor_net(
      actor_fc_layers, action_tensor_spec)
  value_net = value_network.ValueNetwork(
      observation_tensor_spec,
      fc_layer_params=value_fc_layers,
      kernel_initializer=tf.keras.initializers.Orthogonal())

  current_iteration = tf.Variable(0, dtype=tf.int64)
  def learning_rate_fn():
    # Linearly decay the learning rate.
    return learning_rate * (1 - current_iteration / num_iterations)

  agent = ppo_clip_agent.PPOClipAgent(
      time_step_tensor_spec,
      action_tensor_spec,
      optimizer=tf.keras.optimizers.Adam(
          learning_rate=learning_rate_fn, epsilon=1e-5),
      actor_net=actor_net,
      value_net=value_net,
      importance_ratio_clipping=importance_ratio_clipping,
      lambda_value=lambda_value,
      discount_factor=discount_factor,
      entropy_regularization=entropy_regularization,
      value_pred_loss_coef=value_pred_loss_coef,
      # This is a legacy argument for the number of times the data is repeated
      # inside the train function; it is incompatible with minibatch learning.
      # The epoch count is set via the replay buffer and tf.data instead.
      num_epochs=1,
      use_gae=use_gae,
      use_td_lambda_return=use_td_lambda_return,
      gradient_clipping=gradient_clipping,
      value_clipping=value_clipping,
      # TODO(b/150244758): Default compute_value_and_advantage_in_train to False
      # after Reverb open source.
      compute_value_and_advantage_in_train=False,
      # Skips updating normalizers in the agent, as it's handled in the learner.
      update_normalizers_in_train=False,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars,
      train_step_counter=train_step)
  agent.initialize()

  reverb_server = reverb.Server(
      [
          reverb.Table(  # Replay buffer storing experience for training.
              name='training_table',
              sampler=reverb.selectors.Fifo(),
              remover=reverb.selectors.Fifo(),
              rate_limiter=reverb.rate_limiters.MinSize(1),
              max_size=replay_capacity,
              max_times_sampled=1,
          ),
          reverb.Table(  # Replay buffer storing experience for normalization.
              name='normalization_table',
              sampler=reverb.selectors.Fifo(),
              remover=reverb.selectors.Fifo(),
              rate_limiter=reverb.rate_limiters.MinSize(1),
              max_size=replay_capacity,
              max_times_sampled=1,
          )
      ],
      port=reverb_port)

  # Create the replay buffer.
  reverb_replay_train = reverb_replay_buffer.ReverbReplayBuffer(
      agent.collect_data_spec,
      sequence_length=collect_sequence_length,
      table_name='training_table',
      server_address='localhost:{}'.format(reverb_server.port),
      # The only collected sequence is used to populate the batches.
      max_cycle_length=1,
      rate_limiter_timeout_ms=1000)
  reverb_replay_normalization = reverb_replay_buffer.ReverbReplayBuffer(
      agent.collect_data_spec,
      sequence_length=collect_sequence_length,
      table_name='normalization_table',
      server_address='localhost:{}'.format(reverb_server.port),
      # The only collected sequence is used to populate the batches.
      max_cycle_length=1,
      rate_limiter_timeout_ms=1000)

  rb_observer = reverb_utils.ReverbTrajectorySequenceObserver(
      reverb_replay_train.py_client, ['training_table', 'normalization_table'],
      sequence_length=collect_sequence_length,
      stride_length=collect_sequence_length)

  saved_model_dir = os.path.join(root_dir, learner.POLICY_SAVED_MODEL_DIR)
  collect_env_step_metric = py_metrics.EnvironmentSteps()
  learning_triggers = [
      triggers.PolicySavedModelTrigger(
          saved_model_dir,
          agent,
          train_step,
          interval=policy_save_interval,
          metadata_metrics={
              triggers.ENV_STEP_METADATA_KEY: collect_env_step_metric
          }),
      triggers.StepPerSecondLogTrigger(train_step, interval=summary_interval),
  ]

  def training_dataset_fn():
    return reverb_replay_train.as_dataset(
        sample_batch_size=num_environments,
        sequence_preprocess_fn=agent.preprocess_sequence)

  def normalization_dataset_fn():
    return reverb_replay_normalization.as_dataset(
        sample_batch_size=num_environments,
        sequence_preprocess_fn=agent.preprocess_sequence)

  agent_learner = ppo_learner.PPOLearner(
      root_dir,
      train_step,
      agent,
      experience_dataset_fn=training_dataset_fn,
      normalization_dataset_fn=normalization_dataset_fn,
      num_samples=1,
      num_epochs=num_epochs,
      minibatch_size=minibatch_size,
      shuffle_buffer_size=collect_sequence_length,
      triggers=learning_triggers)

  tf_collect_policy = agent.collect_policy
  collect_policy = py_tf_eager_policy.PyTFEagerPolicy(
      tf_collect_policy, use_tf_function=True)

  collect_actor = actor.Actor(
      collect_env,
      collect_policy,
      train_step,
      steps_per_run=collect_sequence_length,
      observers=[rb_observer],
      metrics=actor.collect_metrics(buffer_size=10) + [collect_env_step_metric],
      reference_metrics=[collect_env_step_metric],
      summary_dir=os.path.join(root_dir, learner.TRAIN_DIR),
      summary_interval=summary_interval)

  eval_greedy_policy = py_tf_eager_policy.PyTFEagerPolicy(
      agent.policy, use_tf_function=True)

  if eval_interval:
    logging.info('Initial evaluation.')
    eval_actor = actor.Actor(
        eval_env,
        eval_greedy_policy,
        train_step,
        metrics=actor.eval_metrics(eval_episodes),
        reference_metrics=[collect_env_step_metric],
        summary_dir=os.path.join(root_dir, 'eval'),
        episodes_per_run=eval_episodes)

    eval_actor.run_and_log()

  logging.info('Training on %s', env_name)
  last_eval_step = 0
  for i in range(num_iterations):
    collect_actor.run()
    rb_observer.flush()
    agent_learner.run()
    reverb_replay_train.clear()
    reverb_replay_normalization.clear()
    current_iteration.assign_add(1)

    # Eval only if `eval_interval` has been set. Then, eval if the current train
    # step is equal to or greater than `last_eval_step` + `eval_interval`, or if
    # this is the last iteration. This logic exists because agent_learner.run()
    # does not return after every train step.
    if (eval_interval and
        (agent_learner.train_step_numpy >= eval_interval + last_eval_step
         or i == num_iterations - 1)):
      logging.info('Evaluating.')
      eval_actor.run_and_log()
      last_eval_step = agent_learner.train_step_numpy

  rb_observer.close()
  reverb_server.stop()
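A hypothetical entry point for running the `train_eval` function above. The flag values are illustrative only, and the run assumes a working MuJoCo installation plus the Reverb and TF-Agents dependencies referenced by the function:

if __name__ == '__main__':
  logging.set_verbosity(logging.INFO)
  train_eval(
      root_dir='/tmp/ppo_halfcheetah',  # hypothetical output directory
      env_name='HalfCheetah-v2',
      num_iterations=10,                # short smoke-test run
      eval_interval=None,               # skip evaluation entirely
      reverb_port=None)                 # let Reverb pick an unused port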