Esempio n. 1
0
    def _build_learner_with_strategy(self,
                                     create_agent_and_dataset_fn,
                                     strategy,
                                     sample_batch_size=2):
        """Builds a `Learner` for CartPole inside a distribution strategy scope.

        Args:
          create_agent_and_dataset_fn: Callable invoked with the observation
            spec, the action spec, the time step spec, the train step and
            `sample_batch_size`. It must return a 4-tuple whose first three
            items are the agent, the dataset and the dataset function; the
            fourth item is ignored.
          strategy: Distribution strategy whose scope wraps the construction.
            When `None`, `tf.distribute.get_strategy()` is used instead.
          sample_batch_size: Forwarded to `create_agent_and_dataset_fn`.

        Returns:
          A `(learner, dataset, collect_policy_variables, train_step)` tuple.
        """
        active_strategy = (tf.distribute.get_strategy()
                           if strategy is None else strategy)

        with active_strategy.scope():
            cartpole_env = tf_py_environment.TFPyEnvironment(
                suite_gym.load('CartPole-v0'))
            step_counter = train_utils.create_train_step()

            time_step_spec = cartpole_env.time_step_spec()
            agent, dataset, dataset_fn, _ = create_agent_and_dataset_fn(
                time_step_spec.observation,
                cartpole_env.action_spec(),
                time_step_spec,
                step_counter,
                sample_batch_size)

            # The learner writes below a fresh per-test temporary directory.
            learner_dir = os.path.join(
                self.create_tempdir().full_path, 'learner')
            test_learner = learner.Learner(
                root_dir=learner_dir,
                train_step=step_counter,
                agent=agent,
                experience_dataset_fn=dataset_fn)
            policy_variables = agent.collect_policy.variables()

        return test_learner, dataset, policy_variables, step_counter
Esempio n. 2
0
    def _build_components(self, rb_port):
        """Creates the CartPole env, a DQN agent and its Reverb buffer/observer.

        Args:
          rb_port: Passed as `port` to
            `replay_buffer_utils.get_reverb_buffer_and_observer`.

        Returns:
          A `(env, agent, train_step, replay_buffer, rb_observer)` tuple.
        """
        cartpole = suite_gym.load('CartPole-v0')
        obs_spec, act_spec, step_spec = spec_utils.get_tensor_specs(cartpole)
        step_counter = train_utils.create_train_step()

        # Single hidden layer of 100 units for the Q-network.
        network = q_network.QNetwork(
            obs_spec, act_spec, fc_layer_params=(100, ))
        adam = tf.compat.v1.train.AdamOptimizer(learning_rate=0.001)
        dqn = dqn_agent.DqnAgent(
            step_spec,
            act_spec,
            q_network=network,
            optimizer=adam,
            train_step_counter=step_counter)

        buffer_and_observer = (
            replay_buffer_utils.get_reverb_buffer_and_observer(
                dqn.collect_data_spec,
                sequence_length=2,
                replay_capacity=1000,
                port=rb_port))
        buffer, observer = buffer_and_observer

        return cartpole, dqn, step_counter, buffer, observer
    def test_after_train_step_fn_with_fresh_data_only(self,
                                                      create_strategy_fn):
        """Checks that all three staleness summaries are 0 for this batch.

        The train step is set to 225, every observation carries train step 200
        and the policy update period is 100 train steps; the test expects the
        three `staleness/*` scalars to be written with `data=0` at step 225.

        Args:
          create_strategy_fn: Zero-argument callable returning the distribution
            strategy under which the function is built and run.
        """
        strategy = create_strategy_fn()
        with strategy.scope():
            # Test context: current train step and policy update period.
            step_counter = train_utils.create_train_step()
            step_counter.assign(225)
            update_period = 100

            # Function under test.
            staleness_fn = (
                train_utils.create_staleness_metrics_after_train_step_fn(
                    step_counter,
                    train_steps_per_policy_update=update_period))

            # One train step value per sampled observation.
            observation_train_steps = np.array([[200], [200], [200]],
                                               dtype=np.int64)

            # Expected scalar summary calls, in emission order.
            # NOTE(review): the 'obserations' spelling presumably mirrors the
            # summary name emitted by `train_utils` -- confirm before editing.
            summary_names = (
                'staleness/max_train_step_delta_in_batch',
                'staleness/max_policy_update_delta_in_batch',
                'staleness/num_stale_obserations_in_batch',
            )
            expected_calls = [
                mock.call(name=summary_name, data=0, step=225)
                for summary_name in summary_names
            ]

            with mock.patch.object(tf.summary, 'scalar',
                                   autospec=True) as scalar_mock:
                # The observation train steps travel in the `priority` field
                # of the Reverb sample info.
                sample_info = reverb.replay_sample.SampleInfo(
                    key=None,
                    probability=None,
                    table_size=None,
                    priority=observation_train_steps)
                experience_and_info = (None, sample_info)
                strategy.run(staleness_fn,
                             args=(experience_and_info, None))

                # The summaries must have been written in the expected order.
                scalar_mock.assert_has_calls(expected_calls, any_order=False)
Esempio n. 4
0
  def build_and_run_actor():
    """Builds a DQN actor wired to the Reverb server and runs it.

    Reads `test_case`, `reverb_server_port` and `num_iterations` from the
    enclosing scope. The actor is run `num_iterations` times after the policy
    variables and train step have been refreshed from the variable container.
    """
    work_dir = test_case.create_tempdir().full_path
    env, action_tensor_spec, time_step_tensor_spec = (
        get_cartpole_env_and_specs())

    step_counter = train_utils.create_train_step()

    network = build_dummy_sequential_net(fc_layer_params=(100,),
                                         action_spec=action_tensor_spec)
    adam = tf.keras.optimizers.Adam(learning_rate=0.001)
    agent = dqn_agent.DqnAgent(
        time_step_tensor_spec,
        action_tensor_spec,
        q_network=network,
        optimizer=adam,
        train_step_counter=step_counter)

    # Both the replay buffer observer and the variable container talk to the
    # same local Reverb server.
    server_address = 'localhost:{}'.format(reverb_server_port)

    # Only the observer is needed here; the buffer itself is discarded.
    _, observer = replay_buffer_utils.get_reverb_buffer_and_observer(
        agent.collect_data_spec,
        table_name=reverb_replay_buffer.DEFAULT_TABLE,
        sequence_length=2,
        reverb_server_address=server_address)

    container = reverb_variable_container.ReverbVariableContainer(
        server_address=server_address,
        table_names=[reverb_variable_container.DEFAULT_TABLE])

    test_actor = build_actor(work_dir, env, agent, observer, step_counter)

    container.update({
        reverb_variable_container.POLICY_KEY:
            agent.collect_policy.variables(),
        reverb_variable_container.TRAIN_STEP_KEY: step_counter,
    })

    for _ in range(num_iterations):
      test_actor.run()
Esempio n. 5
0
def train_eval(
        root_dir,
        env_name='HalfCheetah-v2',
        # Training params
        num_iterations=20000,
        actor_fc_layers=(64, 64),
        value_fc_layers=(64, 64),
        learning_rate=3e-4,
        collect_sequence_length=2048,
        minibatch_size=64,
        num_epochs=10,
        # Agent params
        importance_ratio_clipping=0.2,
        lambda_value=0.95,
        discount_factor=0.99,
        entropy_regularization=0.,
        value_pred_loss_coef=0.5,
        use_gae=True,
        use_td_lambda_return=True,
        gradient_clipping=None,
        value_clipping=None,
        # Replay params
        reverb_port=None,
        replay_capacity=10000,
        # Others
        policy_save_interval=5000,
        summary_interval=1000,
        eval_interval=10000,
        eval_episodes=30,
        debug_summaries=False,
        summarize_grads_and_vars=False):
    """Trains and evaluates PPO (Importance Ratio Clipping).

    A local Reverb server is started for the duration of the call. The Reverb
    observer is closed and the server is stopped when the function exits,
    including when setup, evaluation or training raises.

    Args:
      root_dir: Main directory path where checkpoints, saved_models, and
        summaries will be written to.
      env_name: Name for the Mujoco environment to load.
      num_iterations: The number of iterations to perform collection and
        training.
      actor_fc_layers: List of fully_connected parameters for the actor
        network, where each item is the number of units in the layer.
      value_fc_layers: List of fully_connected parameters for the value
        network, where each item is the number of units in the layer.
      learning_rate: Initial learning rate used on the Adam optimizer. It is
        decayed linearly with the iteration count.
      collect_sequence_length: Number of steps to take in each collect run.
      minibatch_size: Number of elements in each mini batch. If `None`, the
        entire collected sequence will be treated as one batch.
      num_epochs: Number of iterations to repeat over all collected data per
        data collection step. (Schulman, 2017) sets this to 10 for Mujoco, 15
        for Roboschool and 3 for Atari.
      importance_ratio_clipping: Epsilon in clipped, surrogate PPO objective.
        For more detail, see explanation at the top of the doc.
      lambda_value: Lambda parameter for TD-lambda computation.
      discount_factor: Discount factor for return computation. Default to
        `0.99` which is the value used for all environments from (Schulman,
        2017).
      entropy_regularization: Coefficient for entropy regularization loss
        term. Default to `0.0` because no entropy bonus was used in (Schulman,
        2017).
      value_pred_loss_coef: Multiplier for value prediction loss to balance
        with policy gradient loss. Default to `0.5`, which was used for all
        environments in the OpenAI baseline implementation. This parameter is
        irrelevant unless you are sharing part of actor_net and value_net. In
        that case, you would want to tune this coefficient, whose value
        depends on the network architecture of your choice.
      use_gae: If True (the default here), uses generalized advantage
        estimation for computing per-timestep advantage. Else, just subtracts
        value predictions from empirical return.
      use_td_lambda_return: If True (the default here), uses td_lambda_return
        for training value function; here: `td_lambda_return = gae_advantage +
        value_predictions`. `use_gae` must be set to `True` as well to enable
        TD-lambda returns. If `use_td_lambda_return` is set to True while
        `use_gae` is False, the empirical return will be used and a warning
        will be logged.
      gradient_clipping: Norm length to clip gradients.
      value_clipping: Difference between new and old value predictions are
        clipped to this threshold. Value clipping could be helpful when
        training very deep networks. Default: no clipping.
      reverb_port: Port for reverb server, if None, use a randomly chosen
        unused port.
      replay_capacity: The maximum number of elements for the replay buffer.
        Items will be wasted if this is smaller than collect_sequence_length.
      policy_save_interval: How often, in train_steps, the policy will be
        saved.
      summary_interval: How often to write data into Tensorboard.
      eval_interval: How often to run evaluation, in train_steps. A falsy
        value disables evaluation entirely.
      eval_episodes: Number of episodes to evaluate over.
      debug_summaries: Boolean for whether to gather debug summaries.
      summarize_grads_and_vars: If true, gradient summaries will be written.
    """
    collect_env = suite_mujoco.load(env_name)
    eval_env = suite_mujoco.load(env_name)
    num_environments = 1

    observation_tensor_spec, action_tensor_spec, time_step_tensor_spec = (
        spec_utils.get_tensor_specs(collect_env))

    train_step = train_utils.create_train_step()

    actor_net = actor_distribution_network.ActorDistributionNetwork(
        observation_tensor_spec,
        action_tensor_spec,
        fc_layer_params=actor_fc_layers,
        activation_fn=tf.nn.tanh,
        kernel_initializer=tf.keras.initializers.Orthogonal())
    value_net = value_network.ValueNetwork(
        observation_tensor_spec,
        fc_layer_params=value_fc_layers,
        kernel_initializer=tf.keras.initializers.Orthogonal())

    current_iteration = tf.Variable(0, dtype=tf.int64)

    def learning_rate_fn():
        # Linearly decay the learning rate.
        return learning_rate * (1 - current_iteration / num_iterations)

    agent = ppo_clip_agent.PPOClipAgent(
        time_step_tensor_spec,
        action_tensor_spec,
        optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate_fn, epsilon=1e-5),
        actor_net=actor_net,
        value_net=value_net,
        importance_ratio_clipping=importance_ratio_clipping,
        lambda_value=lambda_value,
        discount_factor=discount_factor,
        entropy_regularization=entropy_regularization,
        value_pred_loss_coef=value_pred_loss_coef,
        # This is a legacy argument for the number of times we repeat the data
        # inside of the train function, incompatible with mini batch learning.
        # We set the epoch number from the replay buffer and tf.Data instead.
        num_epochs=1,
        use_gae=use_gae,
        use_td_lambda_return=use_td_lambda_return,
        gradient_clipping=gradient_clipping,
        value_clipping=value_clipping,
        # TODO(b/150244758): Default compute_value_and_advantage_in_train to
        # False after Reverb open source.
        compute_value_and_advantage_in_train=False,
        # Skips updating normalizers in the agent, as it's handled in the
        # learner.
        update_normalizers_in_train=False,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step)
    agent.initialize()

    reverb_server = reverb.Server(
        [
            reverb.Table(  # Replay buffer storing experience for training.
                name='training_table',
                sampler=reverb.selectors.Fifo(),
                remover=reverb.selectors.Fifo(),
                rate_limiter=reverb.rate_limiters.MinSize(1),
                max_size=replay_capacity,
                max_times_sampled=1,
            ),
            reverb.Table(  # Replay buffer storing experience for normalization.
                name='normalization_table',
                sampler=reverb.selectors.Fifo(),
                remover=reverb.selectors.Fifo(),
                rate_limiter=reverb.rate_limiters.MinSize(1),
                max_size=replay_capacity,
                max_times_sampled=1,
            )
        ],
        port=reverb_port)

    # From here on the server is live: everything below runs under
    # try/finally so the observer and the server are always released.
    rb_observer = None
    try:
        # Create the replay buffers.
        reverb_replay_train = reverb_replay_buffer.ReverbReplayBuffer(
            agent.collect_data_spec,
            sequence_length=collect_sequence_length,
            table_name='training_table',
            server_address='localhost:{}'.format(reverb_server.port),
            # The only collected sequence is used to populate the batches.
            max_cycle_length=1,
            rate_limiter_timeout_ms=1000)
        reverb_replay_normalization = reverb_replay_buffer.ReverbReplayBuffer(
            agent.collect_data_spec,
            sequence_length=collect_sequence_length,
            table_name='normalization_table',
            server_address='localhost:{}'.format(reverb_server.port),
            # The only collected sequence is used to populate the batches.
            max_cycle_length=1,
            rate_limiter_timeout_ms=1000)

        # A single observer writes every collected sequence to both tables.
        rb_observer = reverb_utils.ReverbTrajectorySequenceObserver(
            reverb_replay_train.py_client,
            ['training_table', 'normalization_table'],
            sequence_length=collect_sequence_length,
            stride_length=collect_sequence_length)

        saved_model_dir = os.path.join(root_dir,
                                       learner.POLICY_SAVED_MODEL_DIR)
        collect_env_step_metric = py_metrics.EnvironmentSteps()
        learning_triggers = [
            triggers.PolicySavedModelTrigger(
                saved_model_dir,
                agent,
                train_step,
                interval=policy_save_interval,
                metadata_metrics={
                    triggers.ENV_STEP_METADATA_KEY: collect_env_step_metric
                }),
            triggers.StepPerSecondLogTrigger(train_step,
                                             interval=summary_interval),
        ]

        def training_dataset_fn():
            return reverb_replay_train.as_dataset(
                sample_batch_size=num_environments,
                sequence_preprocess_fn=agent.preprocess_sequence)

        def normalization_dataset_fn():
            return reverb_replay_normalization.as_dataset(
                sample_batch_size=num_environments,
                sequence_preprocess_fn=agent.preprocess_sequence)

        agent_learner = ppo_learner.PPOLearner(
            root_dir,
            train_step,
            agent,
            experience_dataset_fn=training_dataset_fn,
            normalization_dataset_fn=normalization_dataset_fn,
            num_batches=1,
            num_epochs=num_epochs,
            minibatch_size=minibatch_size,
            shuffle_buffer_size=collect_sequence_length,
            triggers=learning_triggers)

        tf_collect_policy = agent.collect_policy
        collect_policy = py_tf_eager_policy.PyTFEagerPolicy(
            tf_collect_policy, use_tf_function=True)

        collect_actor = actor.Actor(
            collect_env,
            collect_policy,
            train_step,
            steps_per_run=collect_sequence_length,
            observers=[rb_observer],
            metrics=(actor.collect_metrics(buffer_size=10) +
                     [collect_env_step_metric]),
            reference_metrics=[collect_env_step_metric],
            summary_dir=os.path.join(root_dir, learner.TRAIN_DIR),
            summary_interval=summary_interval)

        tf_greedy_policy = agent.policy
        greedy_policy = py_tf_eager_policy.PyTFEagerPolicy(
            tf_greedy_policy, use_tf_function=True)

        # `eval_actor` only exists when evaluation is enabled; every later
        # use is guarded by the same `eval_interval` check.
        if eval_interval:
            logging.info('Intial evaluation.')
            eval_actor = actor.Actor(
                eval_env,
                greedy_policy,
                train_step,
                metrics=actor.eval_metrics(eval_episodes),
                summary_dir=os.path.join(root_dir, 'eval'),
                episodes_per_run=eval_episodes)

            eval_actor.run_and_log()

        logging.info('Training.')
        for _ in range(num_iterations):
            collect_actor.run()
            # TODO(b/159615593): Update to use observer.flush.
            # Reset the reverb observer to make sure the data collected is
            # flushed and written to the RB.
            rb_observer.reset()
            agent_learner.run()
            reverb_replay_train.clear()
            reverb_replay_normalization.clear()
            current_iteration.assign_add(1)

            if (eval_interval and
                    agent_learner.train_step_numpy % eval_interval == 0):
                logging.info('Evaluating.')
                eval_actor.run_and_log()
    finally:
        # Close the observer first, then stop the server; the nested
        # try/finally guarantees the server is stopped even if closing the
        # observer fails.
        try:
            if rb_observer is not None:
                rb_observer.close()
        finally:
            reverb_server.stop()
Esempio n. 6
0
        observation_fc_layer_params=None,
        action_fc_layer_params=None,
        joint_fc_layer_params=HyperParms.critic_joint_fc_layer_params,
        kernel_initializer='glorot_uniform',
        last_kernel_initializer='glorot_uniform')

# Build the actor network inside the distribution strategy scope.
# `TanhNormalProjectionNetwork` is passed as a class (not an instance) to be
# used as the projection network for continuous actions.
with objStrategy.scope():
    nnActor = actor_distribution_network.ActorDistributionNetwork(
        specObservation,
        specAction,
        fc_layer_params=HyperParms.actor_fc_layer_params,
        continuous_projection_net=(
            tanh_normal_projection_network.TanhNormalProjectionNetwork))

with objStrategy.scope():
    train_step = train_utils.create_train_step()

    tf_agent = sac_agent.SacAgent(
        specTimeStep,
        specAction,
        actor_network=nnActor,
        critic_network=nnCritic,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=HyperParms.actor_learning_rate),
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=HyperParms.critic_learning_rate),
        alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=HyperParms.alpha_learning_rate),
        target_update_tau=HyperParms.target_update_tau,
        target_update_period=HyperParms.target_update_period,
        td_errors_loss_fn=tf.math.squared_difference,
Esempio n. 7
0
def train(
    root_dir,
    strategy,
    replay_buffer_server_address,
    variable_container_server_address,
    create_agent_fn,
    create_env_fn,
    # Training params
    learning_rate=3e-4,
    batch_size=256,
    num_iterations=32000,
    learner_iterations_per_call=100):
  """Runs the learner side of distributed training (logged as SAC).

  The agent is built by `create_agent_fn` under `strategy`, experience is
  sampled from a remote Reverb replay buffer, and the policy variables plus
  the train step are pushed to a Reverb variable container after every
  learner call.

  Args:
    root_dir: Directory handed to the learner; the policy saved models are
      written below `learner.POLICY_SAVED_MODEL_DIR` inside it.
    strategy: Distribution strategy used to create the agent and the dataset,
      and passed to the learner.
    replay_buffer_server_address: Address of the Reverb replay buffer server.
    variable_container_server_address: Address of the Reverb server backing
      the variable container.
    create_agent_fn: Callable invoked with the train step, the observation,
      action and time step tensor specs and the learning rate; returns the
      agent.
    create_env_fn: Zero-argument callable returning the environment from
      which the tensor specs are derived.
    learning_rate: Forwarded to `create_agent_fn`.
    batch_size: Sample batch size of the experience dataset.
    num_iterations: Number of learner calls to perform.
    learner_iterations_per_call: Learner iterations executed per call.
  """
  logging.info('Training SAC with learning rate: %f', learning_rate)

  # The environment is only needed for its specs.
  environment = create_env_fn()
  observation_tensor_spec, action_tensor_spec, time_step_tensor_spec = (
      spec_utils.get_tensor_specs(environment))

  # Agent and train step must be created under the strategy.
  with strategy.scope():
    step_counter = train_utils.create_train_step()
    agent = create_agent_fn(step_counter, observation_tensor_spec,
                            action_tensor_spec, time_step_tensor_spec,
                            learning_rate)
    agent.initialize()

  # The policy saver stores the initial model right away and afterwards
  # periodically checkpoints the policy weights.
  policy_dir = os.path.join(root_dir, learner.POLICY_SAVED_MODEL_DIR)
  policy_saver = triggers.PolicySavedModelTrigger(
      policy_dir, agent, step_counter, interval=1000)

  # Publish the initial variables so that collectors can pick them up.
  shared_variables = {
      reverb_variable_container.POLICY_KEY: agent.collect_policy.variables(),
      reverb_variable_container.TRAIN_STEP_KEY: step_counter,
  }
  container = reverb_variable_container.ReverbVariableContainer(
      variable_container_server_address,
      table_names=[reverb_variable_container.DEFAULT_TABLE])
  container.push(shared_variables)

  # Remote replay buffer the experience is sampled from.
  replay_buffer = reverb_replay_buffer.ReverbReplayBuffer(
      agent.collect_data_spec,
      sequence_length=2,
      table_name=reverb_replay_buffer.DEFAULT_TABLE,
      server_address=replay_buffer_server_address)

  def experience_dataset_fn():
    # The dataset is created lazily, under the strategy scope.
    with strategy.scope():
      sampled = replay_buffer.as_dataset(
          sample_batch_size=batch_size, num_steps=2)
      return sampled.prefetch(3)

  throughput_logger = triggers.StepPerSecondLogTrigger(
      step_counter, interval=1000)
  agent_learner = learner.Learner(
      root_dir,
      step_counter,
      agent,
      experience_dataset_fn,
      triggers=[policy_saver, throughput_logger],
      strategy=strategy)

  # Training loop: train for a while, then publish the fresh variables.
  # TODO(b/162440911) change the loop use train_step to handle preemptions
  for _ in range(num_iterations):
    agent_learner.run(iterations=learner_iterations_per_call)
    container.push(shared_variables)
Esempio n. 8
0
def collect(task,
            root_dir,
            replay_buffer_server_address,
            variable_container_server_address,
            create_env_fn,
            initial_collect_steps=10000,
            num_iterations=10000000):
  """Collects experience using a policy whose variables are refreshed each step.

  Waits for the learner to export the collect policy, seeds the replay buffer
  with a random policy, then repeatedly runs one environment step with the
  collect policy and refreshes the variables from the variable container.
  The Reverb observer is closed when the function exits, including on error.

  Args:
    task: Identifier of this collect job; its string form names the summary
      sub-directory.
    root_dir: Directory below which the learner writes the policy saved models
      and where the collect summaries are written.
    replay_buffer_server_address: Address of the Reverb replay buffer server.
    variable_container_server_address: Address of the Reverb server backing
      the variable container.
    create_env_fn: Zero-argument callable returning the collect environment.
    initial_collect_steps: Number of steps collected with the random policy
      before the collect policy is used.
    num_iterations: Number of single-step collect runs.

  Raises:
    TimeoutError: If the serialized collect policy does not become available
      within the wait time.
  """
  # Create the environment. For now support only single environment collection.
  collect_env = create_env_fn()

  # Create the path for the serialized collect policy.
  collect_policy_saved_model_path = os.path.join(
      root_dir, learner.POLICY_SAVED_MODEL_DIR,
      learner.COLLECT_POLICY_SAVED_MODEL_DIR)
  saved_model_pb_path = os.path.join(collect_policy_saved_model_path,
                                     'saved_model.pb')
  try:
    # Wait for the collect policy to be output by the learner: 86400 retries
    # with 2 seconds of sleep each, i.e. a timeout of 2 days.
    train_utils.wait_for_file(
        saved_model_pb_path, sleep_time_secs=2, num_retries=86400)
  except TimeoutError:
    # If the collect policy does not become available during the wait time of
    # the call `wait_for_file`, that probably means the learner is not running.
    logging.error('Could not get the file %s. Exiting.', saved_model_pb_path)
    raise

  # Loading happens outside the `try` so that only a wait timeout is reported
  # as a missing file.
  collect_policy = py_tf_eager_policy.SavedModelPyTFEagerPolicy(
      collect_policy_saved_model_path, load_specs_from_pbtxt=True)

  # Create the variable container and fetch the current variables.
  train_step = train_utils.create_train_step()
  variables = {
      reverb_variable_container.POLICY_KEY: collect_policy.variables(),
      reverb_variable_container.TRAIN_STEP_KEY: train_step
  }
  variable_container = reverb_variable_container.ReverbVariableContainer(
      variable_container_server_address,
      table_names=[reverb_variable_container.DEFAULT_TABLE])
  variable_container.update(variables)

  # Create the replay buffer observer.
  rb_observer = reverb_utils.ReverbAddTrajectoryObserver(
      reverb.Client(replay_buffer_server_address),
      table_name=reverb_replay_buffer.DEFAULT_TABLE,
      sequence_length=2,
      stride_length=1)

  try:
    # Seed the replay buffer with experience from a random policy.
    random_policy = random_py_policy.RandomPyPolicy(
        collect_env.time_step_spec(), collect_env.action_spec())
    initial_collect_actor = actor.Actor(
        collect_env,
        random_policy,
        train_step,
        steps_per_run=initial_collect_steps,
        observers=[rb_observer])
    logging.info('Doing initial collect.')
    initial_collect_actor.run()

    env_step_metric = py_metrics.EnvironmentSteps()
    collect_actor = actor.Actor(
        collect_env,
        collect_policy,
        train_step,
        steps_per_run=1,
        metrics=actor.collect_metrics(10),
        summary_dir=os.path.join(root_dir, learner.TRAIN_DIR, str(task)),
        observers=[rb_observer, env_step_metric])

    # Run the experience collection loop.
    for _ in range(num_iterations):
      logging.info('Collecting with policy at step: %d', train_step.numpy())
      collect_actor.run()
      variable_container.update(variables)
  finally:
    # Always release the observer, whether collection finished or failed.
    rb_observer.close()
def train_eval(
        root_dir,
        env_name='HalfCheetah-v2',
        # Training params
        initial_collect_steps=10000,
        num_iterations=3200000,
        actor_fc_layers=(256, 256),
        critic_obs_fc_layers=None,
        critic_action_fc_layers=None,
        critic_joint_fc_layers=(256, 256),
        # Agent params
        batch_size=256,
        actor_learning_rate=3e-4,
        critic_learning_rate=3e-4,
        alpha_learning_rate=3e-4,
        gamma=0.99,
        target_update_tau=0.005,
        target_update_period=1,
        reward_scale_factor=0.1,
        # Replay params
        reverb_port=None,
        replay_capacity=1000000,
        # Others
        # Defaults to not checkpointing saved policy. If you wish to enable this,
        # please note the caveat explained in README.md.
        policy_save_interval=-1,
        eval_interval=10000,
        eval_episodes=30,
        debug_summaries=False,
        summarize_grads_and_vars=False):
    """Trains and evaluates SAC.

    A local Reverb server is started for the duration of the call. The Reverb
    observer is closed and the server is stopped when the function exits,
    including when setup, evaluation or training raises.

    Args:
      root_dir: Directory passed to the learner; policy saved models, train
        summaries and eval summaries are written below it.
      env_name: Name of the Mujoco environment loaded for both collection and
        evaluation.
      initial_collect_steps: Number of steps collected with a random policy
        before training starts.
      num_iterations: Number of loop iterations; each one performs a
        single-step collect run followed by one learner iteration.
      actor_fc_layers: `fc_layer_params` of the actor network.
      critic_obs_fc_layers: `observation_fc_layer_params` of the critic
        network.
      critic_action_fc_layers: `action_fc_layer_params` of the critic network.
      critic_joint_fc_layers: `joint_fc_layer_params` of the critic network.
      batch_size: Sample batch size of the experience dataset.
      actor_learning_rate: Learning rate of the actor Adam optimizer.
      critic_learning_rate: Learning rate of the critic Adam optimizer.
      alpha_learning_rate: Learning rate of the alpha Adam optimizer.
      gamma: Forwarded to `SacAgent`.
      target_update_tau: Forwarded to `SacAgent`.
      target_update_period: Forwarded to `SacAgent`.
      reward_scale_factor: Forwarded to `SacAgent`.
      reverb_port: Port for the Reverb server.
      replay_capacity: Maximum size of the Reverb table.
      policy_save_interval: Interval of the policy saved model trigger.
      eval_interval: Evaluation runs whenever the learner's train step is a
        multiple of this value. A falsy value disables evaluation.
      eval_episodes: Number of episodes per evaluation run.
      debug_summaries: Forwarded to `SacAgent`.
      summarize_grads_and_vars: Forwarded to `SacAgent`.
    """
    logging.info('Training SAC on: %s', env_name)
    collect_env = suite_mujoco.load(env_name)
    eval_env = suite_mujoco.load(env_name)

    observation_tensor_spec, action_tensor_spec, time_step_tensor_spec = (
        spec_utils.get_tensor_specs(collect_env))

    train_step = train_utils.create_train_step()

    actor_net = actor_distribution_network.ActorDistributionNetwork(
        observation_tensor_spec,
        action_tensor_spec,
        fc_layer_params=actor_fc_layers,
        continuous_projection_net=(
            tanh_normal_projection_network.TanhNormalProjectionNetwork))
    critic_net = critic_network.CriticNetwork(
        (observation_tensor_spec, action_tensor_spec),
        observation_fc_layer_params=critic_obs_fc_layers,
        action_fc_layer_params=critic_action_fc_layers,
        joint_fc_layer_params=critic_joint_fc_layers,
        kernel_initializer='glorot_uniform',
        last_kernel_initializer='glorot_uniform')

    agent = sac_agent.SacAgent(
        time_step_tensor_spec,
        action_tensor_spec,
        actor_network=actor_net,
        critic_network=critic_net,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=actor_learning_rate),
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=critic_learning_rate),
        alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=alpha_learning_rate),
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        td_errors_loss_fn=tf.math.squared_difference,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        gradient_clipping=None,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=train_step)
    agent.initialize()

    table_name = 'uniform_table'
    table = reverb.Table(table_name,
                         max_size=replay_capacity,
                         sampler=reverb.selectors.Uniform(),
                         remover=reverb.selectors.Fifo(),
                         rate_limiter=reverb.rate_limiters.MinSize(1))

    reverb_server = reverb.Server([table], port=reverb_port)

    # From here on the server is live: everything below runs under
    # try/finally so the observer and the server are always released.
    rb_observer = None
    try:
        reverb_replay = reverb_replay_buffer.ReverbReplayBuffer(
            agent.collect_data_spec,
            sequence_length=2,
            table_name=table_name,
            local_server=reverb_server)
        rb_observer = reverb_utils.ReverbAddTrajectoryObserver(
            reverb_replay.py_client,
            table_name,
            sequence_length=2,
            stride_length=1)

        # The dataset is built once and the same object is handed to the
        # learner on every call of the dataset function.
        dataset = reverb_replay.as_dataset(sample_batch_size=batch_size,
                                           num_steps=2).prefetch(50)
        experience_dataset_fn = lambda: dataset

        saved_model_dir = os.path.join(root_dir,
                                       learner.POLICY_SAVED_MODEL_DIR)
        env_step_metric = py_metrics.EnvironmentSteps()
        learning_triggers = [
            triggers.PolicySavedModelTrigger(
                saved_model_dir,
                agent,
                train_step,
                interval=policy_save_interval,
                metadata_metrics={
                    triggers.ENV_STEP_METADATA_KEY: env_step_metric
                }),
            triggers.StepPerSecondLogTrigger(train_step, interval=1000),
        ]

        agent_learner = learner.Learner(root_dir,
                                        train_step,
                                        agent,
                                        experience_dataset_fn,
                                        triggers=learning_triggers)

        # Seed the replay buffer with experience from a random policy.
        random_policy = random_py_policy.RandomPyPolicy(
            collect_env.time_step_spec(), collect_env.action_spec())
        initial_collect_actor = actor.Actor(
            collect_env,
            random_policy,
            train_step,
            steps_per_run=initial_collect_steps,
            observers=[rb_observer])
        logging.info('Doing initial collect.')
        initial_collect_actor.run()

        tf_collect_policy = agent.collect_policy
        collect_policy = py_tf_eager_policy.PyTFEagerPolicy(
            tf_collect_policy, use_tf_function=True)

        collect_actor = actor.Actor(
            collect_env,
            collect_policy,
            train_step,
            steps_per_run=1,
            metrics=actor.collect_metrics(10),
            summary_dir=os.path.join(root_dir, learner.TRAIN_DIR),
            observers=[rb_observer, env_step_metric])

        tf_greedy_policy = greedy_policy.GreedyPolicy(agent.policy)
        eval_greedy_policy = py_tf_eager_policy.PyTFEagerPolicy(
            tf_greedy_policy, use_tf_function=True)

        eval_actor = actor.Actor(
            eval_env,
            eval_greedy_policy,
            train_step,
            episodes_per_run=eval_episodes,
            metrics=actor.eval_metrics(eval_episodes),
            summary_dir=os.path.join(root_dir, 'eval'),
        )

        if eval_interval:
            logging.info('Evaluating.')
            eval_actor.run_and_log()

        logging.info('Training.')
        for _ in range(num_iterations):
            collect_actor.run()
            agent_learner.run(iterations=1)

            if (eval_interval and
                    agent_learner.train_step_numpy % eval_interval == 0):
                logging.info('Evaluating.')
                eval_actor.run_and_log()
    finally:
        # Close the observer first, then stop the server; the nested
        # try/finally guarantees the server is stopped even if closing the
        # observer fails.
        try:
            if rb_observer is not None:
                rb_observer.close()
        finally:
            reverb_server.stop()
Esempio n. 10
0
def train_eval(
        root_dir,
        env_name,
        # Training params
        train_sequence_length,
        initial_collect_steps=1000,
        collect_steps_per_iteration=1,
        num_iterations=100000,
        # RNN params.
        q_network_fn=q_lstm_network,  # defaults to q_lstm_network.
        # Agent params
        epsilon_greedy=0.1,
        batch_size=64,
        learning_rate=1e-3,
        gamma=0.99,
        target_update_tau=0.05,
        target_update_period=5,
        reward_scale_factor=1.0,
        # Replay params
        reverb_port=None,
        replay_capacity=100000,
        # Others
        policy_save_interval=1000,
        eval_interval=1000,
        eval_episodes=10):
    """Trains and evaluates DQN.

    Builds a `DqnAgent` around the network returned by `q_network_fn`, stores
    experience in a local Reverb table (uniform sampling, FIFO removal), fills
    that table with a random policy, and then alternates one collect run with
    one learner iteration, evaluating periodically.

    The Reverb server and the replay observer are released in a `finally`
    clause, so they are shut down even when setup, collection, training or
    evaluation raises.

    Args:
        root_dir: Base directory handed to the learner; policy saved models,
            train summaries and eval summaries are written to sub-directories
            of it.
        env_name: Environment name passed to `suite_gym.load` for both the
            collect and the eval environment.
        train_sequence_length: Sequences of `train_sequence_length + 1` steps
            are written to and sampled from the replay buffer.
        initial_collect_steps: Number of steps collected with a random policy
            before training starts.
        collect_steps_per_iteration: Environment steps collected per training
            iteration.
        num_iterations: Number of (collect run, learner iteration) pairs.
        q_network_fn: Callable invoked as `q_network_fn(num_actions=...)` that
            returns the Q-network given to the agent.
        epsilon_greedy: Forwarded to `DqnAgent`.
        batch_size: Number of sequences per sampled batch.
        learning_rate: Learning rate of the Adam optimizer.
        gamma: Forwarded to `DqnAgent`.
        target_update_tau: Forwarded to `DqnAgent`.
        target_update_period: Forwarded to `DqnAgent`.
        reward_scale_factor: Forwarded to `DqnAgent`.
        reverb_port: Port passed to `reverb.Server`.
        replay_capacity: `max_size` of the Reverb table.
        policy_save_interval: Interval passed to `PolicySavedModelTrigger`.
        eval_interval: Evaluation runs whenever the learner's train step is a
            multiple of this value; a falsy value disables evaluation.
        eval_episodes: Number of episodes per evaluation run.
    """
    collect_env = suite_gym.load(env_name)
    eval_env = suite_gym.load(env_name)

    unused_observation_tensor_spec, action_tensor_spec, time_step_tensor_spec = (
        spec_utils.get_tensor_specs(collect_env))

    train_step = train_utils.create_train_step()

    # Number of discrete actions spanned by the inclusive [minimum, maximum]
    # bounds of the action spec.
    num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1
    q_net = q_network_fn(num_actions=num_actions)

    # NOTE(review): presumably the extra step is needed so that
    # `train_sequence_length` transitions can be formed - confirm.
    sequence_length = train_sequence_length + 1
    agent = dqn_agent.DqnAgent(
        time_step_tensor_spec,
        action_tensor_spec,
        q_network=q_net,
        epsilon_greedy=epsilon_greedy,
        # n-step updates aren't supported with RNNs yet.
        n_step_update=1,
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=learning_rate),
        td_errors_loss_fn=common.element_wise_squared_loss,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        train_step_counter=train_step)

    table_name = 'uniform_table'
    table = reverb.Table(table_name,
                         max_size=replay_capacity,
                         sampler=reverb.selectors.Uniform(),
                         remover=reverb.selectors.Fifo(),
                         rate_limiter=reverb.rate_limiters.MinSize(1))
    reverb_server = reverb.Server([table], port=reverb_port)

    # Everything below runs while the server is alive; the `finally` clause
    # guarantees the observer and the server are released on any exit path.
    rb_observer = None
    try:
        reverb_replay = reverb_replay_buffer.ReverbReplayBuffer(
            agent.collect_data_spec,
            sequence_length=sequence_length,
            table_name=table_name,
            local_server=reverb_server)
        rb_observer = reverb_utils.ReverbTrajectorySequenceObserver(
            reverb_replay.py_client,
            table_name,
            sequence_length=sequence_length,
            stride_length=1)

        dataset = reverb_replay.as_dataset(
            num_parallel_calls=3,
            sample_batch_size=batch_size,
            num_steps=sequence_length).prefetch(3)
        experience_dataset_fn = lambda: dataset

        saved_model_dir = os.path.join(root_dir,
                                       learner.POLICY_SAVED_MODEL_DIR)
        env_step_metric = py_metrics.EnvironmentSteps()

        learning_triggers = [
            triggers.PolicySavedModelTrigger(
                saved_model_dir,
                agent,
                train_step,
                interval=policy_save_interval,
                metadata_metrics={
                    triggers.ENV_STEP_METADATA_KEY: env_step_metric
                }),
            triggers.StepPerSecondLogTrigger(train_step, interval=100),
        ]

        dqn_learner = learner.Learner(root_dir,
                                      train_step,
                                      agent,
                                      experience_dataset_fn,
                                      triggers=learning_triggers)

        # If we haven't trained yet make sure we collect some random samples
        # first to fill up the Replay Buffer with some experience.
        random_policy = random_py_policy.RandomPyPolicy(
            collect_env.time_step_spec(), collect_env.action_spec())
        initial_collect_actor = actor.Actor(
            collect_env,
            random_policy,
            train_step,
            steps_per_run=initial_collect_steps,
            observers=[rb_observer])
        logging.info('Doing initial collect.')
        initial_collect_actor.run()

        tf_collect_policy = agent.collect_policy
        collect_policy = py_tf_eager_policy.PyTFEagerPolicy(
            tf_collect_policy, use_tf_function=True)

        collect_actor = actor.Actor(
            collect_env,
            collect_policy,
            train_step,
            steps_per_run=collect_steps_per_iteration,
            observers=[rb_observer, env_step_metric],
            metrics=actor.collect_metrics(10),
            summary_dir=os.path.join(root_dir, learner.TRAIN_DIR),
        )

        tf_greedy_policy = agent.policy
        greedy_policy = py_tf_eager_policy.PyTFEagerPolicy(
            tf_greedy_policy, use_tf_function=True)

        eval_actor = actor.Actor(
            eval_env,
            greedy_policy,
            train_step,
            episodes_per_run=eval_episodes,
            metrics=actor.eval_metrics(eval_episodes),
            summary_dir=os.path.join(root_dir, 'eval'),
        )

        # Baseline evaluation before any training step has been taken.
        if eval_interval:
            logging.info('Evaluating.')
            eval_actor.run_and_log()

        logging.info('Training.')
        for _ in range(num_iterations):
            collect_actor.run()
            dqn_learner.run(iterations=1)

            if (eval_interval and
                    dqn_learner.train_step_numpy % eval_interval == 0):
                logging.info('Evaluating.')
                eval_actor.run_and_log()
    finally:
        # Release in reverse order of acquisition. The nested `finally` makes
        # sure the server is stopped even if closing the observer fails.
        try:
            if rb_observer is not None:
                rb_observer.close()
        finally:
            reverb_server.stop()
Esempio n. 11
0
def train_eval(
        root_dir,
        env_name='Pong-v0',
        # Training params
        update_frequency=4,  # Number of collect steps per policy update
        initial_collect_steps=50000,  # 50k collect steps
        num_iterations=50000000,  # 50M collect steps
        # Taken from Rainbow as it's not specified in Mnih,15.
        max_episode_frames_collect=50000,  # env frames observed by the agent
        max_episode_frames_eval=108000,  # env frames observed by the agent
        # Agent params
        epsilon_greedy=0.1,
        epsilon_decay_period=250000,  # 1M collect steps / update_frequency
        batch_size=32,
        learning_rate=0.00025,
        n_step_update=1,
        gamma=0.99,
        target_update_tau=1.0,
        target_update_period=2500,  # 10k collect steps / update_frequency
        reward_scale_factor=1.0,
        # Replay params
        reverb_port=None,
        replay_capacity=1000000,
        # Others
        policy_save_interval=250000,
        eval_interval=1000,
        eval_episodes=30,
        debug_summaries=True):
    """Trains and evaluates DQN.

    Loads Atari collect/eval environments with frame stacking, builds a
    `DqnAgent` with a decaying exploration epsilon, stores two-step
    trajectories in a local Reverb table (uniform sampling, FIFO removal),
    fills that table with a random policy, and then alternates one collect run
    with one learner iteration, evaluating periodically.

    The Reverb server and the replay observer are released in a `finally`
    clause, so they are shut down even when setup, collection, training or
    evaluation raises.

    Args:
        root_dir: Base directory handed to the learner; policy saved models,
            train summaries and eval summaries are written to sub-directories
            of it.
        env_name: Environment name passed to `suite_atari.load`.
        update_frequency: Environment steps collected per training iteration.
        initial_collect_steps: Number of steps collected with a random policy
            before training starts.
        num_iterations: Number of (collect run, learner iteration) pairs.
        max_episode_frames_collect: `max_episode_steps` of the collect
            environment.
        max_episode_frames_eval: `max_episode_steps` of the eval environment.
        epsilon_greedy: Final value of the exploration epsilon, which is
            decayed from 1.0 with `polynomial_decay` driven by the train step.
        epsilon_decay_period: Number of train steps over which epsilon decays.
        batch_size: Number of trajectories per sampled batch.
        learning_rate: Learning rate of the RMSProp optimizer.
        n_step_update: Forwarded to `DqnAgent`.
        gamma: Forwarded to `DqnAgent`.
        target_update_tau: Forwarded to `DqnAgent`.
        target_update_period: Forwarded to `DqnAgent`.
        reward_scale_factor: Forwarded to `DqnAgent`.
        reverb_port: Port passed to `reverb.Server`.
        replay_capacity: `max_size` of the Reverb table.
        policy_save_interval: Interval passed to `PolicySavedModelTrigger`.
        eval_interval: Evaluation runs whenever the learner's train step is a
            multiple of this value; a falsy value disables evaluation.
        eval_episodes: Number of episodes per evaluation run.
        debug_summaries: Forwarded to `DqnAgent`.
    """
    collect_env = suite_atari.load(
        env_name,
        max_episode_steps=max_episode_frames_collect,
        gym_env_wrappers=suite_atari.DEFAULT_ATARI_GYM_WRAPPERS_WITH_STACKING)
    eval_env = suite_atari.load(
        env_name,
        max_episode_steps=max_episode_frames_eval,
        gym_env_wrappers=suite_atari.DEFAULT_ATARI_GYM_WRAPPERS_WITH_STACKING)

    unused_observation_tensor_spec, action_tensor_spec, time_step_tensor_spec = (
        spec_utils.get_tensor_specs(collect_env))

    train_step = train_utils.create_train_step()

    # Number of discrete actions spanned by the inclusive [minimum, maximum]
    # bounds of the action spec.
    num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1
    # Exploration schedule: starts at 1.0 and ends at `epsilon_greedy` after
    # `epsilon_decay_period` train steps.
    epsilon = tf.compat.v1.train.polynomial_decay(
        1.0,
        train_step,
        epsilon_decay_period,
        end_learning_rate=epsilon_greedy)
    agent = dqn_agent.DqnAgent(
        time_step_tensor_spec,
        action_tensor_spec,
        q_network=create_q_network(num_actions),
        epsilon_greedy=epsilon,
        n_step_update=n_step_update,
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        optimizer=tf.compat.v1.train.RMSPropOptimizer(
            learning_rate=learning_rate,
            decay=0.95,
            momentum=0.95,
            epsilon=0.01,
            centered=True),
        td_errors_loss_fn=common.element_wise_huber_loss,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        train_step_counter=train_step,
        debug_summaries=debug_summaries)

    table_name = 'uniform_table'
    table = reverb.Table(table_name,
                         max_size=replay_capacity,
                         sampler=reverb.selectors.Uniform(),
                         remover=reverb.selectors.Fifo(),
                         rate_limiter=reverb.rate_limiters.MinSize(1))
    reverb_server = reverb.Server([table], port=reverb_port)

    # Everything below runs while the server is alive; the `finally` clause
    # guarantees the observer and the server are released on any exit path.
    rb_observer = None
    try:
        reverb_replay = reverb_replay_buffer.ReverbReplayBuffer(
            agent.collect_data_spec,
            sequence_length=2,
            table_name=table_name,
            local_server=reverb_server)
        rb_observer = reverb_utils.ReverbAddTrajectoryObserver(
            reverb_replay.py_client,
            table_name,
            sequence_length=2,
            stride_length=1)

        dataset = reverb_replay.as_dataset(sample_batch_size=batch_size,
                                           num_steps=2).prefetch(3)
        experience_dataset_fn = lambda: dataset

        saved_model_dir = os.path.join(root_dir,
                                       learner.POLICY_SAVED_MODEL_DIR)
        env_step_metric = py_metrics.EnvironmentSteps()

        learning_triggers = [
            triggers.PolicySavedModelTrigger(
                saved_model_dir,
                agent,
                train_step,
                interval=policy_save_interval,
                metadata_metrics={
                    triggers.ENV_STEP_METADATA_KEY: env_step_metric
                }),
            triggers.StepPerSecondLogTrigger(train_step, interval=100),
        ]

        dqn_learner = learner.Learner(root_dir,
                                      train_step,
                                      agent,
                                      experience_dataset_fn,
                                      triggers=learning_triggers)

        # If we haven't trained yet make sure we collect some random samples
        # first to fill up the Replay Buffer with some experience.
        random_policy = random_py_policy.RandomPyPolicy(
            collect_env.time_step_spec(), collect_env.action_spec())
        initial_collect_actor = actor.Actor(
            collect_env,
            random_policy,
            train_step,
            steps_per_run=initial_collect_steps,
            observers=[rb_observer])
        logging.info('Doing initial collect.')
        initial_collect_actor.run()

        tf_collect_policy = agent.collect_policy
        collect_policy = py_tf_eager_policy.PyTFEagerPolicy(
            tf_collect_policy, use_tf_function=True)

        collect_actor = actor.Actor(
            collect_env,
            collect_policy,
            train_step,
            steps_per_run=update_frequency,
            observers=[rb_observer, env_step_metric],
            metrics=actor.collect_metrics(10),
            summary_dir=os.path.join(root_dir, learner.TRAIN_DIR),
        )

        tf_greedy_policy = agent.policy
        greedy_policy = py_tf_eager_policy.PyTFEagerPolicy(
            tf_greedy_policy, use_tf_function=True)

        eval_actor = actor.Actor(
            eval_env,
            greedy_policy,
            train_step,
            episodes_per_run=eval_episodes,
            metrics=actor.eval_metrics(eval_episodes),
            summary_dir=os.path.join(root_dir, 'eval'),
        )

        # Baseline evaluation before any training step has been taken.
        if eval_interval:
            logging.info('Evaluating.')
            eval_actor.run_and_log()

        logging.info('Training.')
        for _ in range(num_iterations):
            collect_actor.run()
            dqn_learner.run(iterations=1)

            if (eval_interval and
                    dqn_learner.train_step_numpy % eval_interval == 0):
                logging.info('Evaluating.')
                eval_actor.run_and_log()
    finally:
        # Release in reverse order of acquisition. The nested `finally` makes
        # sure the server is stopped even if closing the observer fails.
        try:
            if rb_observer is not None:
                rb_observer.close()
        finally:
            reverb_server.stop()
Esempio n. 12
0
def main(_):
  """Hosts the Reverb replay buffer and variable container tables.

  Waits for the collect policy saved model to appear under `FLAGS.root_dir`,
  derives the table signatures from the loaded policy, starts a Reverb server
  on `FLAGS.port` with one experience table and one variable-container table,
  and then blocks in `server.wait()`.

  Args:
    _: Unused positional argument (presumably argv from the app runner -
      confirm against the caller).

  Raises:
    TimeoutError: Re-raised after logging when it is raised while waiting for
      or loading the collect policy.
  """
  logging.set_verbosity(logging.INFO)

  # Location of the serialized collect policy and of its `saved_model.pb`.
  policy_dir = os.path.join(FLAGS.root_dir, learner.POLICY_SAVED_MODEL_DIR,
                            learner.COLLECT_POLICY_SAVED_MODEL_DIR)
  policy_pb_file = os.path.join(policy_dir, 'saved_model.pb')

  samples_per_insert = FLAGS.samples_per_insert
  min_table_size = FLAGS.min_table_size_before_sampling

  try:
    # Poll every 2 seconds, up to 86400 times (i.e. two days), for the
    # collect policy to be output by the learner, then load it.
    train_utils.wait_for_file(
        policy_pb_file, sleep_time_secs=2, num_retries=86400)
    collect_policy = py_tf_eager_policy.SavedModelPyTFEagerPolicy(
        policy_dir, load_specs_from_pbtxt=True)
  except TimeoutError as e:
    # The policy never showed up within the wait window, which most likely
    # means the learner is not running.
    logging.error('Could not get the file %s. Exiting.', policy_pb_file)
    raise e

  def _as_tensor_spec(variable):
    # Shape/dtype-only description of a single variable.
    return tf.TensorSpec(variable.shape, dtype=variable.dtype)

  # Signature of the variable container: policy weights plus the train step.
  train_step = train_utils.create_train_step()
  container_contents = {
      reverb_variable_container.POLICY_KEY: collect_policy.variables(),
      reverb_variable_container.TRAIN_STEP_KEY: train_step
  }
  container_signature = tf.nest.map_structure(_as_tensor_spec,
                                              container_contents)
  logging.info('Signature of variables: \n%s', container_signature)

  # Signature of the replay buffer: the experience the policy collects.
  experience_signature = tensor_spec.from_spec(
      collect_policy.collect_data_spec)
  logging.info('Signature of experience: \n%s', experience_signature)

  if samples_per_insert is None:
    # No ratio requested: only require a minimum table size before sampling.
    experience_limiter = reverb.rate_limiters.MinSize(min_table_size)
  else:
    # Ratio requested: use the SampleToInsertRatio limiter with an error
    # buffer scaled by the configured tolerance ratio.
    tolerance = _SAMPLES_PER_INSERT_TOLERANCE_RATIO * samples_per_insert
    slack = min_table_size * tolerance
    experience_limiter = reverb.rate_limiters.SampleToInsertRatio(
        min_size_to_sample=min_table_size,
        samples_per_insert=samples_per_insert,
        error_buffer=slack)

  # Replay buffer storing experience.
  experience_table = reverb.Table(
      name=reverb_replay_buffer.DEFAULT_TABLE,
      sampler=reverb.selectors.Uniform(),
      remover=reverb.selectors.Fifo(),
      rate_limiter=experience_limiter,
      max_size=FLAGS.replay_buffer_capacity,
      max_times_sampled=0,
      signature=experience_signature,
  )
  # Variable container storing policy parameters.
  variable_table = reverb.Table(
      name=reverb_variable_container.DEFAULT_TABLE,
      sampler=reverb.selectors.Uniform(),
      remover=reverb.selectors.Fifo(),
      rate_limiter=reverb.rate_limiters.MinSize(1),
      max_size=1,
      max_times_sampled=0,
      signature=container_signature,
  )

  # Create and start the replay buffer and variable container server.
  server = reverb.Server(
      tables=[experience_table, variable_table], port=FLAGS.port)
  server.wait()