def get_agent(time_step_spec, action_spec, actor_net, value_net, num_epochs,
              step_counter, learning_rate):
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
    tf_agent = ppo_clip_agent.PPOClipAgent(time_step_spec,
                                           action_spec,
                                           optimizer,
                                           actor_net=actor_net,
                                           value_net=value_net,
                                           entropy_regularization=0.0,
                                           importance_ratio_clipping=0.2,
                                           normalize_observations=False,
                                           normalize_rewards=False,
                                           use_gae=True,
                                           num_epochs=num_epochs,
                                           debug_summaries=False,
                                           summarize_grads_and_vars=False,
                                           train_step_counter=step_counter)

    return tf_agent
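
A minimal usage sketch for the helper above; the environment and the actor/value networks are assumed to already exist (e.g. built as in the later examples), and the epoch count and learning rate are placeholder values.

# Hypothetical wiring of get_agent(); tf_env, actor_net and value_net are
# assumed to be defined elsewhere (see Example #2 for how they are built).
step_counter = tf.compat.v1.train.get_or_create_global_step()
tf_agent = get_agent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    actor_net,
    value_net,
    num_epochs=10,
    step_counter=step_counter,
    learning_rate=1e-3)
tf_agent.initialize()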
Example #2
def train_eval(
        root_dir,
        env_name='HalfCheetah-v2',
        env_load_fn=suite_mujoco.load,
        random_seed=None,
        # TODO(b/127576522): rename to policy_fc_layers.
        actor_fc_layers=(200, 100),
        value_fc_layers=(200, 100),
        use_rnns=False,
        lstm_size=(20, ),
        # Params for collect
        num_environment_steps=25000000,
        collect_episodes_per_iteration=30,
        num_parallel_environments=30,
        replay_buffer_capacity=1001,  # Per-environment
        # Params for train
        num_epochs=25,
        learning_rate=1e-3,
        # Params for eval
        num_eval_episodes=30,
        eval_interval=500,
        # Params for summaries and logging
        train_checkpoint_interval=500,
        policy_checkpoint_interval=500,
        log_interval=50,
        summary_interval=50,
        summaries_flush_secs=1,
        use_tf_functions=True,
        debug_summaries=False,
        summarize_grads_and_vars=False):
    """A simple train and eval for PPO."""
    if root_dir is None:
        raise AttributeError('train_eval requires a root_dir.')

    root_dir = os.path.expanduser(root_dir)
    train_dir = os.path.join(root_dir, 'train')
    eval_dir = os.path.join(root_dir, 'eval')
    saved_model_dir = os.path.join(root_dir, 'policy_saved_model')

    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs * 1000)
    train_summary_writer.set_as_default()

    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
        eval_dir, flush_millis=summaries_flush_secs * 1000)
    eval_metrics = [
        tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
        tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
    ]

    global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(
            lambda: tf.math.equal(global_step % summary_interval, 0)):
        if random_seed is not None:
            tf.compat.v1.set_random_seed(random_seed)
        eval_tf_env = tf_py_environment.TFPyEnvironment(env_load_fn(env_name))
        tf_env = tf_py_environment.TFPyEnvironment(
            parallel_py_environment.ParallelPyEnvironment(
                [lambda: env_load_fn(env_name)] * num_parallel_environments))
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate)

        if use_rnns:
            actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
                tf_env.observation_spec(),
                tf_env.action_spec(),
                input_fc_layer_params=actor_fc_layers,
                output_fc_layer_params=None,
                lstm_size=lstm_size)
            value_net = value_rnn_network.ValueRnnNetwork(
                tf_env.observation_spec(),
                input_fc_layer_params=value_fc_layers,
                output_fc_layer_params=None)
        else:
            actor_net = actor_distribution_network.ActorDistributionNetwork(
                tf_env.observation_spec(),
                tf_env.action_spec(),
                fc_layer_params=actor_fc_layers,
                activation_fn=tf.keras.activations.tanh)
            value_net = value_network.ValueNetwork(
                tf_env.observation_spec(),
                fc_layer_params=value_fc_layers,
                activation_fn=tf.keras.activations.tanh)

        tf_agent = ppo_clip_agent.PPOClipAgent(
            tf_env.time_step_spec(),
            tf_env.action_spec(),
            optimizer,
            actor_net=actor_net,
            value_net=value_net,
            entropy_regularization=0.0,
            importance_ratio_clipping=0.2,
            normalize_observations=False,
            normalize_rewards=False,
            use_gae=True,
            num_epochs=num_epochs,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=global_step)
        tf_agent.initialize()

        environment_steps_metric = tf_metrics.EnvironmentSteps()
        step_metrics = [
            tf_metrics.NumberOfEpisodes(),
            environment_steps_metric,
        ]

        train_metrics = step_metrics + [
            tf_metrics.AverageReturnMetric(
                batch_size=num_parallel_environments),
            tf_metrics.AverageEpisodeLengthMetric(
                batch_size=num_parallel_environments),
        ]

        eval_policy = tf_agent.policy
        collect_policy = tf_agent.collect_policy

        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            tf_agent.collect_data_spec,
            batch_size=num_parallel_environments,
            max_length=replay_buffer_capacity)

        train_checkpointer = common.Checkpointer(
            ckpt_dir=train_dir,
            agent=tf_agent,
            global_step=global_step,
            metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
        policy_checkpointer = common.Checkpointer(
            ckpt_dir=os.path.join(train_dir, 'policy'),
            policy=eval_policy,
            global_step=global_step)
        saved_model = policy_saver.PolicySaver(
            eval_policy, train_step=global_step)

        train_checkpointer.initialize_or_restore()

        collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
            tf_env,
            collect_policy,
            observers=[replay_buffer.add_batch] + train_metrics,
            num_episodes=collect_episodes_per_iteration)

        def train_step():
            trajectories = replay_buffer.gather_all()
            return tf_agent.train(experience=trajectories)

        if use_tf_functions:
            # TODO(b/123828980): Enable once the cause of the slowdown has been identified.
            collect_driver.run = common.function(collect_driver.run,
                                                 autograph=False)
            tf_agent.train = common.function(tf_agent.train, autograph=False)
            train_step = common.function(train_step)

        collect_time = 0
        train_time = 0
        timed_at_step = global_step.numpy()

        while environment_steps_metric.result() < num_environment_steps:
            global_step_val = global_step.numpy()
            if global_step_val % eval_interval == 0:
                metric_utils.eager_compute(
                    eval_metrics,
                    eval_tf_env,
                    eval_policy,
                    num_episodes=num_eval_episodes,
                    train_step=global_step,
                    summary_writer=eval_summary_writer,
                    summary_prefix='Metrics',
                )

            start_time = time.time()
            collect_driver.run()
            collect_time += time.time() - start_time

            start_time = time.time()
            total_loss, _ = train_step()
            replay_buffer.clear()
            train_time += time.time() - start_time

            for train_metric in train_metrics:
                train_metric.tf_summaries(train_step=global_step,
                                          step_metrics=step_metrics)

            if global_step_val % log_interval == 0:
                logging.info('step = %d, loss = %f', global_step_val,
                             total_loss)
                steps_per_sec = ((global_step_val - timed_at_step) /
                                 (collect_time + train_time))
                logging.info('%.3f steps/sec', steps_per_sec)
                logging.info('collect_time = %.3f, train_time = %.3f',
                             collect_time, train_time)
                with tf.compat.v2.summary.record_if(True):
                    tf.compat.v2.summary.scalar(name='global_steps_per_sec',
                                                data=steps_per_sec,
                                                step=global_step)

                if global_step_val % train_checkpoint_interval == 0:
                    train_checkpointer.save(global_step=global_step_val)

                if global_step_val % policy_checkpoint_interval == 0:
                    policy_checkpointer.save(global_step=global_step_val)
                    saved_model_path = os.path.join(
                        saved_model_dir,
                        'policy_' + ('%d' % global_step_val).zfill(9))
                    saved_model.save(saved_model_path)

                timed_at_step = global_step_val
                collect_time = 0
                train_time = 0

        # One final eval before exiting.
        metric_utils.eager_compute(
            eval_metrics,
            eval_tf_env,
            eval_policy,
            num_episodes=num_eval_episodes,
            train_step=global_step,
            summary_writer=eval_summary_writer,
            summary_prefix='Metrics',
        )
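
A hedged entry-point sketch for calling the train_eval above; the root directory and the shortened step counts are illustrative placeholders rather than values from the original script.

# Hypothetical smoke-test invocation of the eager-mode train_eval above.
if __name__ == '__main__':
    train_eval(
        root_dir='/tmp/ppo_halfcheetah',   # placeholder output directory
        env_name='HalfCheetah-v2',
        num_environment_steps=100000,      # shortened run for illustration
        num_parallel_environments=4,
        collect_episodes_per_iteration=4,
        eval_interval=500)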
Example #3
def train_eval(
    root_dir,
    tf_master='',
    env_name='HalfCheetah-v2',
    env_load_fn=suite_mujoco.load,
    random_seed=None,
    # TODO(b/127576522): rename to policy_fc_layers.
    actor_fc_layers=(200, 100),
    value_fc_layers=(200, 100),
    use_rnns=False,
    # Params for collect
    num_environment_steps=25000000,
    collect_episodes_per_iteration=30,
    num_parallel_environments=30,
    replay_buffer_capacity=1001,  # Per-environment
    # Params for train
    num_epochs=25,
    learning_rate=1e-3,
    # Params for eval
    num_eval_episodes=30,
    eval_interval=500,
    # Params for summaries and logging
    train_checkpoint_interval=500,
    policy_checkpoint_interval=500,
    log_interval=50,
    summary_interval=50,
    summaries_flush_secs=1,
    debug_summaries=False,
    summarize_grads_and_vars=False,
    eval_metrics_callback=None):
  """A simple train and eval for PPO."""
  if root_dir is None:
    raise AttributeError('train_eval requires a root_dir.')

  root_dir = os.path.expanduser(root_dir)
  train_dir = os.path.join(root_dir, 'train')
  eval_dir = os.path.join(root_dir, 'eval')

  train_summary_writer = tf.compat.v2.summary.create_file_writer(
      train_dir, flush_millis=summaries_flush_secs * 1000)
  train_summary_writer.set_as_default()

  eval_summary_writer = tf.compat.v2.summary.create_file_writer(
      eval_dir, flush_millis=summaries_flush_secs * 1000)
  eval_metrics = [
      batched_py_metric.BatchedPyMetric(
          AverageReturnMetric,
          metric_args={'buffer_size': num_eval_episodes},
          batch_size=num_parallel_environments),
      batched_py_metric.BatchedPyMetric(
          AverageEpisodeLengthMetric,
          metric_args={'buffer_size': num_eval_episodes},
          batch_size=num_parallel_environments),
  ]
  eval_summary_writer_flush_op = eval_summary_writer.flush()

  global_step = tf.compat.v1.train.get_or_create_global_step()
  with tf.compat.v2.summary.record_if(
      lambda: tf.math.equal(global_step % summary_interval, 0)):
    if random_seed is not None:
      tf.compat.v1.set_random_seed(random_seed)
    eval_py_env = parallel_py_environment.ParallelPyEnvironment(
        [lambda: env_load_fn(env_name)] * num_parallel_environments)
    tf_env = tf_py_environment.TFPyEnvironment(
        parallel_py_environment.ParallelPyEnvironment(
            [lambda: env_load_fn(env_name)] * num_parallel_environments))
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

    if use_rnns:
      actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
          tf_env.observation_spec(),
          tf_env.action_spec(),
          input_fc_layer_params=actor_fc_layers,
          output_fc_layer_params=None)
      value_net = value_rnn_network.ValueRnnNetwork(
          tf_env.observation_spec(),
          input_fc_layer_params=value_fc_layers,
          output_fc_layer_params=None)
    else:
      actor_net = actor_distribution_network.ActorDistributionNetwork(
          tf_env.observation_spec(),
          tf_env.action_spec(),
          fc_layer_params=actor_fc_layers,
          activation_fn=tf.keras.activations.tanh)
      value_net = value_network.ValueNetwork(
          tf_env.observation_spec(),
          fc_layer_params=value_fc_layers,
          activation_fn=tf.keras.activations.tanh)

    tf_agent = ppo_clip_agent.PPOClipAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        optimizer,
        actor_net=actor_net,
        value_net=value_net,
        entropy_regularization=0.0,
        importance_ratio_clipping=0.2,
        normalize_observations=False,
        normalize_rewards=False,
        use_gae=True,
        num_epochs=num_epochs,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=global_step)

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        tf_agent.collect_data_spec,
        batch_size=num_parallel_environments,
        max_length=replay_buffer_capacity)

    eval_py_policy = py_tf_policy.PyTFPolicy(tf_agent.policy)

    environment_steps_metric = tf_metrics.EnvironmentSteps()
    environment_steps_count = environment_steps_metric.result()
    step_metrics = [
        tf_metrics.NumberOfEpisodes(),
        environment_steps_metric,
    ]
    train_metrics = step_metrics + [
        tf_metrics.AverageReturnMetric(
            batch_size=num_parallel_environments),
        tf_metrics.AverageEpisodeLengthMetric(
            batch_size=num_parallel_environments),
    ]

    # Add to replay buffer and other agent specific observers.
    replay_buffer_observer = [replay_buffer.add_batch]

    collect_policy = tf_agent.collect_policy

    collect_op = dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        collect_policy,
        observers=replay_buffer_observer + train_metrics,
        num_episodes=collect_episodes_per_iteration).run()

    trajectories = replay_buffer.gather_all()

    train_op, _ = tf_agent.train(experience=trajectories)

    with tf.control_dependencies([train_op]):
      clear_replay_op = replay_buffer.clear()

    with tf.control_dependencies([clear_replay_op]):
      train_op = tf.identity(train_op)

    train_checkpointer = common.Checkpointer(
        ckpt_dir=train_dir,
        agent=tf_agent,
        global_step=global_step,
        metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
    policy_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(train_dir, 'policy'),
        policy=tf_agent.policy,
        global_step=global_step)

    summary_ops = []
    for train_metric in train_metrics:
      summary_ops.append(train_metric.tf_summaries(
          train_step=global_step, step_metrics=step_metrics))

    with eval_summary_writer.as_default(), \
         tf.compat.v2.summary.record_if(True):
      for eval_metric in eval_metrics:
        eval_metric.tf_summaries(
            train_step=global_step, step_metrics=step_metrics)

    init_agent_op = tf_agent.initialize()

    with tf.compat.v1.Session(tf_master) as sess:
      # Initialize graph.
      train_checkpointer.initialize_or_restore(sess)
      common.initialize_uninitialized_variables(sess)

      sess.run(init_agent_op)
      sess.run(train_summary_writer.init())
      sess.run(eval_summary_writer.init())

      collect_time = 0
      train_time = 0
      timed_at_step = sess.run(global_step)
      steps_per_second_ph = tf.compat.v1.placeholder(
          tf.float32, shape=(), name='steps_per_sec_ph')
      steps_per_second_summary = tf.compat.v2.summary.scalar(
          name='global_steps_per_sec', data=steps_per_second_ph,
          step=global_step)

      while sess.run(environment_steps_count) < num_environment_steps:
        global_step_val = sess.run(global_step)
        if global_step_val % eval_interval == 0:
          metric_utils.compute_summaries(
              eval_metrics,
              eval_py_env,
              eval_py_policy,
              num_episodes=num_eval_episodes,
              global_step=global_step_val,
              callback=eval_metrics_callback,
              log=True,
          )
          sess.run(eval_summary_writer_flush_op)

        start_time = time.time()
        sess.run(collect_op)
        collect_time += time.time() - start_time
        start_time = time.time()
        total_loss, _ = sess.run([train_op, summary_ops])
        train_time += time.time() - start_time

        global_step_val = sess.run(global_step)
        if global_step_val % log_interval == 0:
          logging.info('step = %d, loss = %f', global_step_val, total_loss)
          steps_per_sec = (
              (global_step_val - timed_at_step) / (collect_time + train_time))
          logging.info('%.3f steps/sec', steps_per_sec)
          sess.run(
              steps_per_second_summary,
              feed_dict={steps_per_second_ph: steps_per_sec})
          logging.info('%s', 'collect_time = {}, train_time = {}'.format(
              collect_time, train_time))
          timed_at_step = global_step_val
          collect_time = 0
          train_time = 0

        if global_step_val % train_checkpoint_interval == 0:
          train_checkpointer.save(global_step=global_step_val)

        if global_step_val % policy_checkpoint_interval == 0:
          policy_checkpointer.save(global_step=global_step_val)

      # One final eval before exiting.
      metric_utils.compute_summaries(
          eval_metrics,
          eval_py_env,
          eval_py_policy,
          num_episodes=num_eval_episodes,
          global_step=global_step_val,
          callback=eval_metrics_callback,
          log=True,
      )
      sess.run(eval_summary_writer_flush_op)
Example #4
def train_eval(
    root_dir,
    env_name='HalfCheetah-v2',
    # Training params
    num_iterations=1600,
    actor_fc_layers=(64, 64),
    value_fc_layers=(64, 64),
    learning_rate=3e-4,
    collect_sequence_length=2048,
    minibatch_size=64,
    num_epochs=10,
    # Agent params
    importance_ratio_clipping=0.2,
    lambda_value=0.95,
    discount_factor=0.99,
    entropy_regularization=0.,
    value_pred_loss_coef=0.5,
    use_gae=True,
    use_td_lambda_return=True,
    gradient_clipping=0.5,
    value_clipping=None,
    # Replay params
    reverb_port=None,
    replay_capacity=10000,
    # Others
    policy_save_interval=5000,
    summary_interval=1000,
    eval_interval=10000,
    eval_episodes=100,
    debug_summaries=False,
    summarize_grads_and_vars=False):
  """Trains and evaluates PPO (Importance Ratio Clipping).

  Args:
    root_dir: Main directory path where checkpoints, saved_models, and summaries
      will be written to.
    env_name: Name for the Mujoco environment to load.
    num_iterations: The number of iterations to perform collection and training.
    actor_fc_layers: List of fully_connected parameters for the actor network,
      where each item is the number of units in the layer.
    value_fc_layers: List of fully_connected parameters for the value network,
      where each item is the number of units in the layer.
    learning_rate: Learning rate used on the Adam optimizer.
    collect_sequence_length: Number of steps to take in each collect run.
    minibatch_size: Number of elements in each mini batch. If `None`, the entire
      collected sequence will be treated as one batch.
    num_epochs: Number of iterations to repeat over all collected data per data
      collection step. (Schulman, 2017) sets this to 10 for Mujoco, 15 for
      Roboschool and 3 for Atari.
    importance_ratio_clipping: Epsilon in clipped, surrogate PPO objective. For
      more detail, see explanation at the top of the doc.
    lambda_value: Lambda parameter for TD-lambda computation.
    discount_factor: Discount factor for return computation. Defaults to `0.99`,
      which is the value used for all environments from (Schulman, 2017).
    entropy_regularization: Coefficient for entropy regularization loss term.
      Defaults to `0.0` because no entropy bonus was used in (Schulman, 2017).
    value_pred_loss_coef: Multiplier for value prediction loss to balance with
      policy gradient loss. Defaults to `0.5`, which was used for all
      environments in the OpenAI baseline implementation. This parameter is
      irrelevant unless you are sharing part of actor_net and value_net. In that
      case, you would want to tune this coefficient, whose value depends on the
      network architecture of your choice.
    use_gae: If True (default False), uses generalized advantage estimation for
      computing per-timestep advantage. Else, just subtracts value predictions
      from empirical return.
    use_td_lambda_return: If True (default False), uses td_lambda_return for
      training the value function, where `td_lambda_return = gae_advantage +
      value_predictions`. `use_gae` must also be set to `True` to enable
      TD-lambda returns. If `use_td_lambda_return` is set to True while
      `use_gae` is False, the empirical return will be used and a warning will
      be logged.
    gradient_clipping: Norm length to clip gradients.
    value_clipping: Differences between new and old value predictions are
      clipped to this threshold. Value clipping could be helpful when training
      very deep networks. Default: no clipping.
    reverb_port: Port for the reverb server; if None, a randomly chosen unused
      port is used.
    replay_capacity: The maximum number of elements for the replay buffer. Items
      will be wasted if this is smaller than collect_sequence_length.
    policy_save_interval: How often, in train_steps, the policy will be saved.
    summary_interval: How often to write data into Tensorboard.
    eval_interval: How often to run evaluation, in train_steps.
    eval_episodes: Number of episodes to evaluate over.
    debug_summaries: Boolean for whether to gather debug summaries.
    summarize_grads_and_vars: If true, gradient summaries will be written.
  """
  collect_env = suite_mujoco.load(env_name)
  eval_env = suite_mujoco.load(env_name)
  num_environments = 1

  observation_tensor_spec, action_tensor_spec, time_step_tensor_spec = (
      spec_utils.get_tensor_specs(collect_env))
  # TODO(b/172267869): Remove this conversion once TensorNormalizer stops
  # converting float64 inputs to float32.
  observation_tensor_spec = tf.TensorSpec(
      dtype=tf.float32, shape=observation_tensor_spec.shape)

  train_step = train_utils.create_train_step()
  actor_net_builder = ppo_actor_network.PPOActorNetwork()
  actor_net = actor_net_builder.create_sequential_actor_net(
      actor_fc_layers, action_tensor_spec)
  value_net = value_network.ValueNetwork(
      observation_tensor_spec,
      fc_layer_params=value_fc_layers,
      kernel_initializer=tf.keras.initializers.Orthogonal())

  current_iteration = tf.Variable(0, dtype=tf.int64)
  def learning_rate_fn():
    # Linearly decay the learning rate.
    return learning_rate * (1 - current_iteration / num_iterations)
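  # With the defaults above (learning_rate=3e-4, num_iterations=1600), this
  # callable yields 3e-4 at iteration 0, roughly 1.5e-4 at iteration 800, and
  # approaches 0 near the final iteration. Passing the callable (rather than a
  # constant) to the Keras Adam optimizer below means the decayed value is
  # picked up as `current_iteration` is incremented in the training loop.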

  agent = ppo_clip_agent.PPOClipAgent(
      time_step_tensor_spec,
      action_tensor_spec,
      optimizer=tf.keras.optimizers.Adam(
          learning_rate=learning_rate_fn, epsilon=1e-5),
      actor_net=actor_net,
      value_net=value_net,
      importance_ratio_clipping=importance_ratio_clipping,
      lambda_value=lambda_value,
      discount_factor=discount_factor,
      entropy_regularization=entropy_regularization,
      value_pred_loss_coef=value_pred_loss_coef,
      # This is a legacy argument for the number of times we repeat the data
      # inside of the train function, incompatible with mini batch learning.
      # We set the epoch number from the replay buffer and tf.Data instead.
      num_epochs=1,
      use_gae=use_gae,
      use_td_lambda_return=use_td_lambda_return,
      gradient_clipping=gradient_clipping,
      value_clipping=value_clipping,
      # TODO(b/150244758): Default compute_value_and_advantage_in_train to False
      # after Reverb open source.
      compute_value_and_advantage_in_train=False,
      # Skips updating normalizers in the agent, as it's handled in the learner.
      update_normalizers_in_train=False,
      debug_summaries=debug_summaries,
      summarize_grads_and_vars=summarize_grads_and_vars,
      train_step_counter=train_step)
  agent.initialize()

  reverb_server = reverb.Server(
      [
          reverb.Table(  # Replay buffer storing experience for training.
              name='training_table',
              sampler=reverb.selectors.Fifo(),
              remover=reverb.selectors.Fifo(),
              rate_limiter=reverb.rate_limiters.MinSize(1),
              max_size=replay_capacity,
              max_times_sampled=1,
          ),
          reverb.Table(  # Replay buffer storing experience for normalization.
              name='normalization_table',
              sampler=reverb.selectors.Fifo(),
              remover=reverb.selectors.Fifo(),
              rate_limiter=reverb.rate_limiters.MinSize(1),
              max_size=replay_capacity,
              max_times_sampled=1,
          )
      ],
      port=reverb_port)

  # Create the replay buffer.
  reverb_replay_train = reverb_replay_buffer.ReverbReplayBuffer(
      agent.collect_data_spec,
      sequence_length=collect_sequence_length,
      table_name='training_table',
      server_address='localhost:{}'.format(reverb_server.port),
      # The only collected sequence is used to populate the batches.
      max_cycle_length=1,
      rate_limiter_timeout_ms=1000)
  reverb_replay_normalization = reverb_replay_buffer.ReverbReplayBuffer(
      agent.collect_data_spec,
      sequence_length=collect_sequence_length,
      table_name='normalization_table',
      server_address='localhost:{}'.format(reverb_server.port),
      # The only collected sequence is used to populate the batches.
      max_cycle_length=1,
      rate_limiter_timeout_ms=1000)

  rb_observer = reverb_utils.ReverbTrajectorySequenceObserver(
      reverb_replay_train.py_client, ['training_table', 'normalization_table'],
      sequence_length=collect_sequence_length,
      stride_length=collect_sequence_length)

  saved_model_dir = os.path.join(root_dir, learner.POLICY_SAVED_MODEL_DIR)
  collect_env_step_metric = py_metrics.EnvironmentSteps()
  learning_triggers = [
      triggers.PolicySavedModelTrigger(
          saved_model_dir,
          agent,
          train_step,
          interval=policy_save_interval,
          metadata_metrics={
              triggers.ENV_STEP_METADATA_KEY: collect_env_step_metric
          }),
      triggers.StepPerSecondLogTrigger(train_step, interval=summary_interval),
  ]

  def training_dataset_fn():
    return reverb_replay_train.as_dataset(
        sample_batch_size=num_environments,
        sequence_preprocess_fn=agent.preprocess_sequence)

  def normalization_dataset_fn():
    return reverb_replay_normalization.as_dataset(
        sample_batch_size=num_environments,
        sequence_preprocess_fn=agent.preprocess_sequence)

  agent_learner = ppo_learner.PPOLearner(
      root_dir,
      train_step,
      agent,
      experience_dataset_fn=training_dataset_fn,
      normalization_dataset_fn=normalization_dataset_fn,
      num_samples=1,
      num_epochs=num_epochs,
      minibatch_size=minibatch_size,
      shuffle_buffer_size=collect_sequence_length,
      triggers=learning_triggers)

  tf_collect_policy = agent.collect_policy
  collect_policy = py_tf_eager_policy.PyTFEagerPolicy(
      tf_collect_policy, use_tf_function=True)

  collect_actor = actor.Actor(
      collect_env,
      collect_policy,
      train_step,
      steps_per_run=collect_sequence_length,
      observers=[rb_observer],
      metrics=actor.collect_metrics(buffer_size=10) + [collect_env_step_metric],
      reference_metrics=[collect_env_step_metric],
      summary_dir=os.path.join(root_dir, learner.TRAIN_DIR),
      summary_interval=summary_interval)

  eval_greedy_policy = py_tf_eager_policy.PyTFEagerPolicy(
      agent.policy, use_tf_function=True)

  if eval_interval:
    logging.info('Initial evaluation.')
    eval_actor = actor.Actor(
        eval_env,
        eval_greedy_policy,
        train_step,
        metrics=actor.eval_metrics(eval_episodes),
        reference_metrics=[collect_env_step_metric],
        summary_dir=os.path.join(root_dir, 'eval'),
        episodes_per_run=eval_episodes)

    eval_actor.run_and_log()

  logging.info('Training on %s', env_name)
  last_eval_step = 0
  for i in range(num_iterations):
    collect_actor.run()
    rb_observer.flush()
    agent_learner.run()
    reverb_replay_train.clear()
    reverb_replay_normalization.clear()
    current_iteration.assign_add(1)

    # Eval only if `eval_interval` has been set. Then, eval if the current train
    # step is equal or greater than the `last_eval_step` + `eval_interval` or if
    # this is the last iteration. This logic exists because agent_learner.run()
    # does not return after every train step.
    if (eval_interval and
        (agent_learner.train_step_numpy >= eval_interval + last_eval_step
         or i == num_iterations - 1)):
      logging.info('Evaluating.')
      eval_actor.run_and_log()
      last_eval_step = agent_learner.train_step_numpy

  rb_observer.close()
  reverb_server.stop()
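
A hedged sketch of launching the Reverb-based train_eval above; the directory, iteration count, and port choice are placeholders for a quick local run, not settings taken from the source.

# Hypothetical short local run of the Reverb-backed train_eval above.
train_eval(
    root_dir='/tmp/ppo_reverb',     # placeholder output directory
    env_name='HalfCheetah-v2',
    num_iterations=10,              # shortened for a quick check
    collect_sequence_length=512,
    minibatch_size=64,
    eval_interval=0,                # falsy value skips the eval actor above
    reverb_port=None)               # None picks a random unused port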
Example #5
    def __init__(
            self,
            time_step_spec,
            action_spec,
            # Specific to multi-agent case
            n_agents,
            learning_rate=1e-4,
            # Specific to multi-grid agents
            actor_fc_layers=(32, 32),
            value_fc_layers=(32, 32),
            lstm_size=(128, ),
            conv_filters=8,
            conv_kernel=3,
            direction_fc=5,
            # Modifying agents
            inactive_agent_ids=tuple(),
            non_learning_agents=tuple(),
            # PPO Clip agent params
            importance_ratio_clipping=0.0,
            lambda_value=0.95,
            discount_factor=0.99,
            entropy_regularization=0.05,
            policy_l2_reg=0.0,
            value_function_l2_reg=0.0,
            shared_vars_l2_reg=0.0,
            value_pred_loss_coef=0.5,
            num_epochs=25,
            use_gae=False,
            use_td_lambda_return=False,
            normalize_rewards=True,
            reward_norm_clipping=10.0,
            normalize_observations=True,
            log_prob_clipping=0.0,
            gradient_clipping=None,
            check_numerics=False,
            debug_summaries=False,
            summarize_grads_and_vars=False,
            train_step_counter=None,
            use_attention_networks=False,
            name='MultiagentPPO'):
        """Creates a centralized controller agent that creates several PPO Agents.

    Note that all architecture params apply to each of the sub-agents created.

    Args:
      time_step_spec: A `TimeStep` spec of the expected time_steps.
      action_spec: A nest of BoundedTensorSpec representing the actions.
      n_agents: The number of agents in this environment.
      learning_rate: Initial learning rate for all agents.
      actor_fc_layers: Number and size of fully-connected layers in the actor.
      value_fc_layers: Number and size of fully-connected layers in the critic.
      lstm_size: Number of cells in the LSTM in the actor and critic.
      conv_filters: Number of convolutional filters.
      conv_kernel: Size of the convolutional kernel.
      direction_fc: Number of fully-connected neurons connecting the one-hot
        direction to the main LSTM.
      inactive_agent_ids: Integer IDs of agents who will not train or act in the
        environment, but will simply return a no-op action.
      non_learning_agents: Integer IDs of agents who will not train, but still
        act in the environment.
      importance_ratio_clipping: Epsilon in clipped, surrogate PPO objective.
        For more detail, see explanation at the top of the doc.
      lambda_value: Lambda parameter for TD-lambda computation.
      discount_factor: Discount factor for return computation.
      entropy_regularization: Coefficient for entropy regularization loss term.
      policy_l2_reg: Coefficient for l2 regularization of unshared policy
        weights.
      value_function_l2_reg: Coefficient for l2 regularization of unshared value
        function weights.
      shared_vars_l2_reg: Coefficient for l2 regularization of weights shared
        between the policy and value functions.
      value_pred_loss_coef: Multiplier for value prediction loss to balance with
        policy gradient loss.
      num_epochs: Number of epochs for computing policy updates.
      use_gae: If True (default False), uses generalized advantage estimation
        for computing per-timestep advantage. Else, just subtracts value
        predictions from empirical return.
      use_td_lambda_return: If True (default False), uses td_lambda_return for
        training value function. (td_lambda_return = gae_advantage +
        value_predictions)
      normalize_rewards: If true, keeps moving variance of rewards and
        normalizes incoming rewards.
      reward_norm_clipping: Value above and below to clip normalized reward.
      normalize_observations: If true, keeps moving mean and variance of
        observations and normalizes incoming observations.
      log_prob_clipping: +/- value for clipping log probs to prevent inf / NaN
        values.  Default: no clipping.
      gradient_clipping: Norm length to clip gradients.  Default: no clipping.
      check_numerics: If true, adds tf.debugging.check_numerics to help find NaN
        / Inf values. For debugging only.
      debug_summaries: A bool to gather debug summaries.
      summarize_grads_and_vars: If true, gradient summaries will be written.
      train_step_counter: An optional counter to increment every time the train
        op is run.  Defaults to the global_step.
      use_attention_networks: Option to use attention network architecture in
        the agent. This architecture requires observations from the previous
        time step.
      name: The name of this agent. All variables in this module will fall under
        that name. Defaults to the class name.

    Raises:
      ValueError: If the actor_net is not a DistributionNetwork.
    """
        self.n_agents = n_agents
        self.inactive_agent_ids = inactive_agent_ids
        self.non_learning_agents = non_learning_agents

        # Get single-agent specs
        (single_obs_spec, single_time_step_spec,
         single_action_spec) = self.get_single_agent_specs(
             time_step_spec, action_spec)

        # Make baby agents
        self.agents = [None] * self.n_agents
        self.optimizers = [None] * self.n_agents
        for agent_id in range(self.n_agents):
            with tf.name_scope('agent_' + str(agent_id)):
                self.optimizers[agent_id] = tf.compat.v1.train.AdamOptimizer(
                    learning_rate=learning_rate)

                if use_attention_networks:
                    network_build_fn = multigrid_networks.construct_attention_networks
                else:
                    network_build_fn = multigrid_networks.construct_multigrid_networks
                # Build actor and critic networks
                actor_net, value_net = network_build_fn(
                    single_obs_spec,
                    single_action_spec,
                    actor_fc_layers=actor_fc_layers,
                    value_fc_layers=value_fc_layers,
                    lstm_size=lstm_size,
                    conv_filters=conv_filters,
                    conv_kernel=conv_kernel,
                    scalar_fc=direction_fc)

                logging.info('Creating agent %d...', agent_id)
                self.agents[agent_id] = ppo_clip_agent.PPOClipAgent(
                    single_time_step_spec,
                    single_action_spec,
                    self.optimizers[agent_id],
                    actor_net=actor_net,
                    value_net=value_net,
                    entropy_regularization=entropy_regularization,
                    importance_ratio_clipping=0.2,
                    normalize_observations=False,
                    normalize_rewards=False,
                    use_gae=True,
                    num_epochs=num_epochs,
                    debug_summaries=debug_summaries,
                    summarize_grads_and_vars=summarize_grads_and_vars,
                    train_step_counter=train_step_counter,
                    compute_value_and_advantage_in_train=True)
                self.agents[agent_id].initialize()

        with tf.name_scope('meta_agent'):
            # Initialize policies
            self._policies = [
                self.agents[a].policy for a in range(self.n_agents)
            ]
            policy = multiagent_ppo_policy.MultiagentPPOPolicy(
                self._policies,
                time_step_spec=time_step_spec,
                action_spec=action_spec,
                clip=False,
                collect=False,
                inactive_agent_ids=inactive_agent_ids)

            self._collect_policies = [
                self.agents[a].collect_policy for a in range(self.n_agents)
            ]
            collect_policy = multiagent_ppo_policy.MultiagentPPOPolicy(
                self._collect_policies,
                time_step_spec=time_step_spec,
                action_spec=action_spec,
                clip=False,
                collect=True,
                inactive_agent_ids=inactive_agent_ids)

            super(MultiagentPPO, self).__init__(
                time_step_spec,
                action_spec,
                policy,
                collect_policy,
                train_sequence_length=None,
                debug_summaries=debug_summaries,
                summarize_grads_and_vars=summarize_grads_and_vars,
                train_step_counter=train_step_counter)

        print('Finished constructing multi-agent PPO')
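
A minimal construction sketch for the controller above, assuming a multi-agent `tf_env` whose nested time-step and action specs match what the class expects; the agent count and hyperparameters are placeholders.

# Hypothetical instantiation of the MultiagentPPO controller defined above.
global_step = tf.compat.v1.train.get_or_create_global_step()
multi_agent = MultiagentPPO(
    tf_env.time_step_spec(),       # nested spec covering all agents
    tf_env.action_spec(),
    n_agents=3,                    # placeholder agent count
    learning_rate=1e-4,
    num_epochs=25,
    train_step_counter=global_step)
multi_agent.initialize()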
Example #6
def create_ppo_agent_and_dataset_fn(action_spec, time_step_spec, train_step,
                                    batch_size):
    """Builds and returns a dummy PPO Agent, dataset and dataset function."""
    del action_spec  # Unused.
    del time_step_spec  # Unused.
    del batch_size  # Unused.

    # No arbitrary spec supported.
    obs_spec = tensor_spec.TensorSpec([2], tf.float32)
    ts_spec = ts.time_step_spec(obs_spec)
    act_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        obs_spec,
        act_spec,
        fc_layer_params=(100, ),
        activation_fn=tf.keras.activations.tanh)

    value_net = value_network.ValueNetwork(
        obs_spec,
        fc_layer_params=(100, ),
        activation_fn=tf.keras.activations.tanh)

    agent = ppo_clip_agent.PPOClipAgent(
        ts_spec,
        act_spec,
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        actor_net=actor_net,
        value_net=value_net,
        entropy_regularization=0.0,
        importance_ratio_clipping=0.2,
        normalize_observations=False,
        normalize_rewards=False,
        use_gae=False,
        use_td_lambda_return=False,
        num_epochs=1,
        debug_summaries=False,
        summarize_grads_and_vars=False,
        train_step_counter=train_step,
        compute_value_and_advantage_in_train=False)

    def _create_experience(_):
        observations = tf.constant(
            [
                [[1, 2], [3, 4], [5, 6]],
                [[1, 2], [3, 4], [5, 6]],
            ],
            dtype=tf.float32)
        mid_time_step_val = ts.StepType.MID.tolist()
        time_steps = ts.TimeStep(
            step_type=tf.constant([[mid_time_step_val] * 3] * 2,
                                  dtype=tf.int32),
            reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
            discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
            observation=observations)
        actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]],
                              dtype=tf.float32)

        action_distribution_parameters = {
            'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
            'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
        }
        value_preds = tf.constant([[9., 15., 21.], [9., 15., 21.]],
                                  dtype=tf.float32)

        policy_info = {
            'dist_params': action_distribution_parameters,
        }
        policy_info['value_prediction'] = value_preds
        experience = trajectory.Trajectory(time_steps.step_type, observations,
                                           actions, policy_info,
                                           time_steps.step_type,
                                           time_steps.reward,
                                           time_steps.discount)
        return agent._preprocess(experience)  # pylint: disable=protected-access

    dataset = tf.data.Dataset.from_tensor_slices(
        [[i] for i in range(100)]).map(_create_experience)
    dataset = tf.data.Dataset.zip((dataset, tf.data.experimental.Counter()))
    dataset_fn = lambda: dataset

    return agent, dataset, dataset_fn, agent.training_data_spec
Example #7
    def __init__(
        self,
        env,
        global_step,
        root_dir,
        step_metrics,
        name='Agent',
        is_environment=False,
        use_tf_functions=True,
        max_steps=250,
        replace_reward=True,
        non_negative_regret=False,
        id_num=0,
        block_budget_weight=0.,

        # Architecture hparams
        use_rnn=True,
        learning_rate=1e-4,
        actor_fc_layers=(32, 32),
        value_fc_layers=(32, 32),
        lstm_size=(128, ),
        conv_filters=8,
        conv_kernel=3,
        scalar_fc=5,
        entropy_regularization=0.,
        xy_dim=None,

        # Training & logging settings
        num_epochs=25,
        num_eval_episodes=5,
        num_parallel_envs=5,
        replay_buffer_capacity=1001,
        debug_summaries=True,
        summarize_grads_and_vars=True,
    ):
        """Initializes agent, replay buffer, metrics, and checkpointing.

    Args:
      env: An AdversarialTfPyEnvironment with specs and adversary specs.
      global_step: A tf variable tracking the global step.
      root_dir: Path to directory where metrics and checkpoints should be saved.
      step_metrics: A list of tf-agents metrics which represent the x-axis
        during training, such as the number of episodes or the number of
        environment steps.
      name: The name of this agent, e.g. 'Adversary'.
      is_environment: If True, will use the adversary specs from the environment
        and construct a network with additional inputs for the adversary.
      use_tf_functions: If True, will use tf.function to wrap the agent's train
        function.
      max_steps: The maximum number of steps the agent is allowed to interact
        with the environment in every data collection loop.
      replace_reward: If False, will not modify the reward stored in the agent's
        trajectories. This means the agent will be trained with the default
        environment reward rather than regret.
      non_negative_regret: If True, will ensure that the regret reward cannot
        be below 0.
      id_num: The ID number of this agent within the population of agents of the
        same type. I.e. this is adversary agent 3.
      block_budget_weight: Weight to place on the adversary's block budget
        reward. Default is 0 for no block budget.
      use_rnn: If True, will use an RNN within the network architecture.
      learning_rate: The learning rate used to initialize the optimizer for this
        agent.
      actor_fc_layers: The number and size of fully connected layers in the
        policy.
      value_fc_layers: The number and size of fully connected layers in the
        critic / value network.
      lstm_size: The number of LSTM cells in the RNN.
      conv_filters: The number of convolution filters.
      conv_kernel: The width of the convolution kernel.
      scalar_fc: The width of the fully-connected layer which inputs a scalar.
      entropy_regularization: Entropy regularization coefficient.
      xy_dim: Certain adversaries take in the current (x,y) position as a
        one-hot vector. In this case, the maximum value for x or y is required
        to create the one-hot representation.
      num_epochs: Number of epochs for computing PPO policy updates.
      num_eval_episodes: Number of evaluation episodes per eval step, used as
        the batch size to initialize eval metrics.
      num_parallel_envs: Number of parallel environments used in training, used
        as the batch size for training metrics and rewards.
      replay_buffer_capacity: Capacity of this agent's replay buffer.
      debug_summaries: Log additional summaries from the PPO agent.
      summarize_grads_and_vars: If True, logs gradient norms and variances in
        PPO agent.
    """
        self.name = name
        self.id = id_num
        self.max_steps = max_steps
        self.is_environment = is_environment
        self.replace_reward = replace_reward
        self.non_negative_regret = non_negative_regret
        self.block_budget_weight = block_budget_weight

        with tf.name_scope(self.name):
            self.optimizer = tf.compat.v1.train.AdamOptimizer(
                learning_rate=learning_rate)

            logging.info('\tCalculating specs and building networks...')
            if is_environment:
                self.time_step_spec = env.adversary_time_step_spec
                self.action_spec = env.adversary_action_spec
                self.observation_spec = env.adversary_observation_spec

                (self.actor_net, self.value_net
                 ) = multigrid_networks.construct_multigrid_networks(
                     self.observation_spec,
                     self.action_spec,
                     use_rnns=use_rnn,
                     actor_fc_layers=actor_fc_layers,
                     value_fc_layers=value_fc_layers,
                     lstm_size=lstm_size,
                     conv_filters=conv_filters,
                     conv_kernel=conv_kernel,
                     scalar_fc=scalar_fc,
                     scalar_name='time_step',
                     scalar_dim=self.observation_spec['time_step'].maximum + 1,
                     random_z=True,
                     xy_dim=xy_dim)
            else:
                self.time_step_spec = env.time_step_spec()
                self.action_spec = env.action_spec()
                self.observation_spec = env.observation_spec()

                (self.actor_net, self.value_net
                 ) = multigrid_networks.construct_multigrid_networks(
                     self.observation_spec,
                     self.action_spec,
                     use_rnns=use_rnn,
                     actor_fc_layers=actor_fc_layers,
                     value_fc_layers=value_fc_layers,
                     lstm_size=lstm_size,
                     conv_filters=conv_filters,
                     conv_kernel=conv_kernel,
                     scalar_fc=scalar_fc)

            self.tf_agent = ppo_clip_agent.PPOClipAgent(
                self.time_step_spec,
                self.action_spec,
                self.optimizer,
                actor_net=self.actor_net,
                value_net=self.value_net,
                entropy_regularization=entropy_regularization,
                importance_ratio_clipping=0.2,
                normalize_observations=False,
                normalize_rewards=False,
                use_gae=True,
                num_epochs=num_epochs,
                debug_summaries=debug_summaries,
                summarize_grads_and_vars=summarize_grads_and_vars,
                train_step_counter=global_step)
            self.tf_agent.initialize()
            self.eval_policy = self.tf_agent.policy
            self.collect_policy = self.tf_agent.collect_policy

            logging.info('\tAllocating replay buffer ...')
            self.replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
                self.tf_agent.collect_data_spec,
                batch_size=num_parallel_envs,
                max_length=replay_buffer_capacity)
            logging.info('\t\tRB capacity: %i', self.replay_buffer.capacity)
            self.final_reward = tf.zeros(shape=(num_parallel_envs),
                                         dtype=tf.float32)
            self.enemy_max = tf.zeros(shape=(num_parallel_envs),
                                      dtype=tf.float32)

            # Creates train metrics
            self.step_metrics = step_metrics
            self.train_metrics = step_metrics + [
                tf_metrics.AverageEpisodeLengthMetric(
                    batch_size=num_parallel_envs,
                    name=name + '_AverageEpisodeLength')
            ]
            self.eval_metrics = [
                tf_metrics.AverageEpisodeLengthMetric(
                    batch_size=num_eval_episodes,
                    name=name + '_AverageEpisodeLength')
            ]
            if is_environment:
                self.env_train_metric = adversarial_eval.AdversarialEnvironmentScalar(
                    batch_size=num_parallel_envs,
                    name=name + '_AdversaryReward')
                self.env_eval_metric = adversarial_eval.AdversarialEnvironmentScalar(
                    batch_size=num_eval_episodes,
                    name=name + '_AdversaryReward')
            else:
                self.train_metrics.append(
                    tf_metrics.AverageReturnMetric(
                        batch_size=num_parallel_envs,
                        name=name + '_AverageReturn'))
                self.eval_metrics.append(
                    tf_metrics.AverageReturnMetric(
                        batch_size=num_eval_episodes,
                        name=name + '_AverageReturn'))

            self.metrics_group = metric_utils.MetricsGroup(
                self.train_metrics, name + '_train_metrics')
            self.observers = self.train_metrics + [
                self.replay_buffer.add_batch
            ]

            self.train_dir = os.path.join(root_dir, 'train', name, str(id_num))
            self.eval_dir = os.path.join(root_dir, 'eval', name, str(id_num))
            self.train_checkpointer = common.Checkpointer(
                ckpt_dir=self.train_dir,
                agent=self.tf_agent,
                global_step=global_step,
                metrics=self.metrics_group,
            )
            self.policy_checkpointer = common.Checkpointer(
                ckpt_dir=os.path.join(self.train_dir, 'policy'),
                policy=self.eval_policy,
                global_step=global_step)
            self.saved_model = policy_saver.PolicySaver(self.eval_policy,
                                                        train_step=global_step)
            self.saved_model_dir = os.path.join(root_dir, 'policy_saved_model',
                                                name, str(id_num))

            self.train_checkpointer.initialize_or_restore()

            if use_tf_functions:
                self.tf_agent.train = common.function(self.tf_agent.train,
                                                      autograph=False)

            self.total_loss = None
            self.extra_loss = None
            self.loss_divergence_counter = 0
Example #8
def main():

    logging.set_verbosity(logging.INFO)
    tf.compat.v1.enable_v2_behavior()
    parser = argparse.ArgumentParser()

    ## Essential parameters
    parser.add_argument("--output_dir", default=None, type=str, required=True,help="The output directory where the model stats and checkpoints will be written.")
    parser.add_argument("--env", default=None, type=str, required=True,help="The environment to train the agent on")
    parser.add_argument("--max_horizon", default=4, type=int)
    parser.add_argument("--atari", default=False, type=bool, help = "Gets some data Types correctly")


    ##agent parameters
    parser.add_argument("--reward_scale_factor", default=1.0, type=float)
    parser.add_argument("--debug_summaries", default=False, type=bool)
    parser.add_argument("--summarize_grads_and_vars", default=False, type=bool)

    ##transformer parameters
    parser.add_argument("--d_model", default=64, type=int)
    parser.add_argument("--num_layers", default=3, type=int)
    parser.add_argument("--dff", default=256, type=int)

    ##Training parameters
    parser.add_argument('--num_iterations', type=int, default=100000,
                        help="Steps in the env")
    parser.add_argument('--num_parallel', type=int, default=30,
                        help="How many envs should run in parallel")
    parser.add_argument("--collect_episodes_per_iteration", default=1, type=int)
    parser.add_argument('--num_epochs', type=int, default=25,
                        help='Number of epochs for computing policy updates.')


    ## Other parameters
    parser.add_argument("--num_eval_episodes", default=10, type=int)
    parser.add_argument("--eval_interval", default=1000, type=int)
    parser.add_argument("--log_interval", default=10, type=int)
    parser.add_argument("--summary_interval", default=1000, type=int)
    parser.add_argument("--run_graph_mode", default=True, type=bool)
    parser.add_argument("--checkpoint_interval", default=1000, type=int)
    parser.add_argument("--summary_flush", default=10, type=int)   #what does this exactly do? 

    # HP opt params
    #parser.add_argument("--doubleQ", default=True, type=bool,help="Whether to use a  DoubleQ agent")
    parser.add_argument("--custom_last_layer", default=True, type=bool)
    parser.add_argument("--custom_layer_init", default=1.0,type=    float)
    parser.add_argument("--initial_collect_steps", default=5000, type=int)
    #parser.add_argument("--loss_function", default="element_wise_huber_loss", type=str)
    parser.add_argument("--num_heads", default=4, type=int)
    parser.add_argument("--normalize_env", default=False, type=bool)  
    parser.add_argument('--custom_lr_schedule', default="No", type=str,
                        help="Whether to use a custom LR schedule")
    #parser.add_argument("--epsilon_greedy", default=0.3, type=float)
    #parser.add_argument("--target_update_period", default=1000, type=int)
    parser.add_argument("--rate", default=0.1, type=float)  # dropout rate  (might be not used depending on the q network)  #Setting this to 0.0 somehow break the code. Not relevant tho just select a network without dropout
    parser.add_argument("--gradient_clipping", default=True, type=bool)
    parser.add_argument("--replay_buffer_max_length", default=1001, type=int)
    #parser.add_argument("--batch_size", default=32, type=int)
    parser.add_argument("--learning_rate", default=1e-4, type=float)
    parser.add_argument("--encoder_type", default=3, type=int,help="Which Type of encoder is used for the model")
    parser.add_argument("--layer_type", default=3, type=int,help="Which Type of layer is used for the encoder")
    #parser.add_argument("--target_update_tau", default=1, type=float)
    #parser.add_argument("--gamma", default=0.99, type=float)


    
    args = parser.parse_args()
    global_step = tf.compat.v1.train.get_or_create_global_step()
    
    baseEnv = gym.make(args.env)
    
    eval_tf_env = tf_py_environment.TFPyEnvironment(PyhistoryWrapper(suite_gym.load(args.env),args.max_horizon,args.atari))
        #[lambda: PyhistoryWrapper(suite_gym.load(args.env),args.max_horizon,args.atari)] * args.num_parallel)
    tf_env = tf_py_environment.TFPyEnvironment(
        parallel_py_environment.ParallelPyEnvironment(
            #[lambda: PyhistoryWrapper(suite_gym.load(args.env),args.max_horizon,args.atari)] * args.num_parallel))
            [lambda: PyhistoryWrapper(suite_gym.load(args.env),args.max_horizon,args.atari)] * args.num_parallel))
    
    
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        tf_env.observation_spec(),
        tf_env.action_spec(),
        fc_layer_params=(200, 100),
        activation_fn=tf.keras.activations.tanh)
    value_net = value_network.ValueNetwork(
        tf_env.observation_spec(),
        fc_layer_params=(200, 100),
        activation_fn=tf.keras.activations.tanh)
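    # Note: the actor_net built above is immediately replaced by the
    # QTransformer below, so this ActorDistributionNetwork is never used.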
    
    
    
    actor_net = QTransformer(
        tf_env.observation_spec(),
        baseEnv.action_space.n,
        num_layers=args.num_layers,
        d_model=args.d_model,
        num_heads=args.num_heads, 
        dff=args.dff,
        rate = args.rate,
        encoderType = args.encoder_type,
        enc_layer_type=args.layer_type,
        max_horizon=args.max_horizon,
        custom_layer = args.custom_layer_init, 
        custom_last_layer = args.custom_last_layer)

    if args.custom_lr_schedule == "Transformer":    # builds a lr schedule according to the original usage for the transformer
        learning_rate = CustomSchedule(args.d_model,int(args.num_iterations/10))
        optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

    elif args.custom_lr_schedule == "Transformer_low":    # builds a lr schedule according to the original usage for the transformer
        learning_rate = CustomSchedule(int(args.d_model/2),int(args.num_iterations/10)) # --> same schedule with lower general lr
        optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

    elif args.custom_lr_schedule == "Linear": 
        lrs = LinearCustomSchedule(args.learning_rate, args.num_iterations)
        optimizer = tf.keras.optimizers.Adam(lrs, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

    else:
        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=args.learning_rate)




    tf_agent = ppo_clip_agent.PPOClipAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        optimizer,
        actor_net=actor_net,
        value_net=value_net,
        entropy_regularization=0.0,
        importance_ratio_clipping=0.2,
        normalize_observations=False,
        normalize_rewards=False,
        use_gae=True,
        num_epochs=args.num_epochs,
        debug_summaries=args.debug_summaries,
        summarize_grads_and_vars=args.summarize_grads_and_vars,
        train_step_counter=global_step)
    tf_agent.initialize()


    
    train_eval(
    args.output_dir,
    0, # ??
    # TODO(b/127576522): rename to policy_fc_layers.
    tf_agent,
    eval_tf_env,
    tf_env,
    # Params for collect
    args.num_iterations,
    args.collect_episodes_per_iteration,
    args.num_parallel,
    args.replay_buffer_max_length,  # Per-environment
    # Params for train
    args.num_epochs,
    args.learning_rate,
    # Params for eval
    args.num_eval_episodes,
    args.eval_interval,
    # Params for summaries and logging
    args.checkpoint_interval,
    args.checkpoint_interval,
    args.checkpoint_interval,
    args.log_interval,
    args.summary_interval,
    args.summary_flush,
    args.debug_summaries,
    args.summarize_grads_and_vars,
    args.run_graph_mode,
    None)

    pickle.dump(args, open(args.output_dir + "/training_args.p", "wb"))
    print("Successfully trained and evaluated.")
Example #9
    actor_net = (actor_distribution_network.ActorDistributionNetwork(
        tf_env.observation_spec(),
        tf_env.action_spec(),
        fc_layer_params=actor_fc_layers,
        activation_fn=tf.keras.activations.tanh))
    value_net = (value_network.ValueNetwork(
        tf_env.observation_spec(),
        fc_layer_params=value_fc_layers,
        activation_fn=tf.keras.activations.tanh))

tf_agent = ppo_clip_agent.PPOClipAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    optimizer,
    actor_net=actor_net,
    value_net=value_net,
    entropy_regularization=0.0,
    importance_ratio_clipping=importance_ratio_clipping,
    normalize_observations=False,
    normalize_rewards=False,
    use_gae=True,
    num_epochs=num_epochs,
    debug_summaries=debug_summaries,
    summarize_grads_and_vars=summarize_grads_and_vars,
    train_step_counter=global_step)
tf_agent.initialize()
# -

# ### Replay buffer and initial data collection

eval_policy = tf_agent.policy
collect_policy = tf_agent.collect_policy
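
The snippet above is a notebook fragment; a plausible continuation for the replay-buffer cell, modeled on Example #2 and assuming `num_parallel_environments`, `replay_buffer_capacity`, and `collect_episodes_per_iteration` are defined in earlier cells.

# Sketch only: buffer and collect-driver wiring consistent with Example #2.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    tf_agent.collect_data_spec,
    batch_size=num_parallel_environments,
    max_length=replay_buffer_capacity)

collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
    tf_env,
    collect_policy,
    observers=[replay_buffer.add_batch],
    num_episodes=collect_episodes_per_iteration)
collect_driver.run()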