Code example #1
File: rl_utils.py  Project: zxChouSean/tensor2tensor
def setup_env(hparams,
              batch_size,
              max_num_noops,
              rl_env_max_episode_steps=-1,
              env_name=None):
  """Setup."""
  if not env_name:
    env_name = full_game_name(hparams.game)

  maxskip_envs = should_apply_max_and_skip_env(hparams)

  env = T2TGymEnv(
      base_env_name=env_name,
      batch_size=batch_size,
      grayscale=hparams.grayscale,
      should_derive_observation_space=hparams
      .rl_should_derive_observation_space,
      resize_width_factor=hparams.resize_width_factor,
      resize_height_factor=hparams.resize_height_factor,
      rl_env_max_episode_steps=rl_env_max_episode_steps,
      max_num_noops=max_num_noops,
      maxskip_envs=maxskip_envs,
      sticky_actions=hparams.sticky_actions
  )
  return env
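A minimal usage sketch (not part of the original file): it assumes a registered hparams set such as "rlmb_base" that defines the fields read above (game, grayscale, the resize factors, sticky_actions, rl_should_derive_observation_space).

# Hypothetical usage sketch; the hparams set name and argument values are assumptions.
from tensor2tensor.utils import registry

hparams = registry.hparams("rlmb_base")  # assumed to define hparams.game and the other fields
env = setup_env(hparams, batch_size=2, max_num_noops=30)
env.start_new_epoch(0)  # start collecting rollouts, as in the examples below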
Code example #2
def setup_env(hparams):
    """Setup."""
    env = T2TGymEnv(
        [make_gym_env(hparams) for _ in range(hparams.real_ppo_num_agents)],
        grayscale=hparams.grayscale,
        resize_width_factor=hparams.resize_width_factor,
        resize_height_factor=hparams.resize_height_factor)
    return env
Code example #3
def initialize_env_specs(hparams):
    """Initializes env_specs using T2TGymEnvs."""
    env = T2TGymEnv.setup_env_from_hparams(hparams, hparams.batch_size,
                                           hparams.eval_max_num_noops)
    env.start_new_epoch(0)

    # TODO(afrozm): Decouple env_fn from hparams and return both, is there
    # even a need to return hparams? Just return the env_fn?
    hparams.add_hparam("env_fn", rl.make_real_env_fn(env))
    return hparams
Code example #4
def setup_env(hparams, batch_size, max_num_noops, rl_env_max_episode_steps=-1):
  """Setup."""
  env_name = full_game_name(hparams.game)

  env = T2TGymEnv(base_env_name=env_name,
                  batch_size=batch_size,
                  grayscale=hparams.grayscale,
                  resize_width_factor=hparams.resize_width_factor,
                  resize_height_factor=hparams.resize_height_factor,
                  rl_env_max_episode_steps=rl_env_max_episode_steps,
                  max_num_noops=max_num_noops, maxskip_envs=True)
  return env
Code example #5
File: player_utils.py  Project: zjms/tensor2tensor
def load_data_and_make_simulated_env(data_dir,
                                     wm_dir,
                                     hparams,
                                     which_epoch_data="last",
                                     random_starts=True):
    hparams = copy.deepcopy(hparams)
    t2t_env = T2TGymEnv.setup_and_load_epoch(hparams,
                                             data_dir=data_dir,
                                             which_epoch_data=which_epoch_data)
    return make_simulated_gym_env(t2t_env,
                                  world_model_dir=wm_dir,
                                  hparams=hparams,
                                  random_starts=random_starts)
Code example #6
def setup_env(hparams):
  """Setup."""
  # TODO(kc): set reward clipping, when this will be possible
  assert hparams.game == "pong", "Currently only games with [-1, 1] rewards."
  game_mode = "Deterministic-v4"
  camel_game_name = "".join(
      [w[0].upper() + w[1:] for w in hparams.game.split("_")])
  camel_game_name += game_mode
  env_name = camel_game_name
  env = T2TGymEnv([gym.make(env_name)],
                  grayscale=hparams.grayscale,
                  resize_width_factor=hparams.resize_width_factor,
                  resize_height_factor=hparams.resize_height_factor)
  return env
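For reference, the camel-casing above builds the Gym environment id by capitalizing each underscore-separated word of the snake_case game name and appending the mode suffix. An illustrative check (not from the original file):

# Illustration of the name construction used above.
game = "ms_pacman"
env_name = "".join(w[0].upper() + w[1:] for w in game.split("_")) + "Deterministic-v4"
assert env_name == "MsPacmanDeterministic-v4"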
Code example #7
def setup_env(hparams):
    """Setup."""
    game_mode = "Deterministic-v4"
    camel_game_name = "".join(
        [w[0].upper() + w[1:] for w in hparams.game.split("_")])
    camel_game_name += game_mode
    env_name = camel_game_name

    env = T2TGymEnv(base_env_name=env_name,
                    batch_size=hparams.real_ppo_num_agents,
                    grayscale=hparams.grayscale,
                    resize_width_factor=hparams.resize_width_factor,
                    resize_height_factor=hparams.resize_height_factor,
                    base_env_timesteps_limit=hparams.env_timesteps_limit)
    return env
Code example #8
def setup_env(hparams, batch_size, max_num_noops, rl_env_max_episode_steps=-1):
  """Setup."""
  game_mode = "NoFrameskip-v4"
  camel_game_name = misc_utils.snakecase_to_camelcase(hparams.game)
  camel_game_name += game_mode
  env_name = camel_game_name

  env = T2TGymEnv(base_env_name=env_name,
                  batch_size=batch_size,
                  grayscale=hparams.grayscale,
                  resize_width_factor=hparams.resize_width_factor,
                  resize_height_factor=hparams.resize_height_factor,
                  rl_env_max_episode_steps=rl_env_max_episode_steps,
                  max_num_noops=max_num_noops, maxskip_envs=True)
  return env
Code example #9
def setup_env(hparams, batch_size, max_num_noops):
    """Setup."""
    game_mode = "Deterministic-v4"
    camel_game_name = "".join(
        [w[0].upper() + w[1:] for w in hparams.game.split("_")])
    camel_game_name += game_mode
    env_name = camel_game_name

    env = T2TGymEnv(base_env_name=env_name,
                    batch_size=batch_size,
                    grayscale=hparams.grayscale,
                    resize_width_factor=hparams.resize_width_factor,
                    resize_height_factor=hparams.resize_height_factor,
                    rl_env_max_episode_steps=hparams.rl_env_max_episode_steps,
                    max_num_noops=max_num_noops)
    return env
Code example #10
File: rl_utils.py  Project: zjms/tensor2tensor
def evaluate_single_config(hparams, sampling_temp, max_num_noops,
                           agent_model_dir):
    """Evaluate the PPO agent in the real environment."""
    eval_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
    env = T2TGymEnv.setup_env_from_hparams(hparams,
                                           batch_size=hparams.eval_batch_size,
                                           max_num_noops=max_num_noops)
    env.start_new_epoch(0)
    env_fn = rl.make_real_env_fn(env)
    learner = LEARNERS[hparams.base_algo](hparams.frame_stack_size,
                                          base_event_dir=None,
                                          agent_model_dir=agent_model_dir,
                                          total_num_epochs=1)
    learner.evaluate(env_fn, eval_hparams, sampling_temp)
    rollouts = env.current_epoch_rollouts()
    env.close()

    return tuple(
        compute_mean_reward(rollouts, clipped) for clipped in (True, False))
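The returned tuple is (mean clipped reward, mean unclipped reward), in that order, since the generator iterates over clipped in (True, False). A hedged call sketch (the argument values and model directory are made up, not from the original):

# Hypothetical invocation of the function above.
mean_clipped, mean_unclipped = evaluate_single_config(
    hparams, sampling_temp=0.5, max_num_noops=8, agent_model_dir="/tmp/policy")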
Code example #11
def main(_):
    # gym.logger.set_level(gym.logger.DEBUG)
    hparams = registry.hparams(FLAGS.loop_hparams_set)
    hparams.parse(FLAGS.loop_hparams)
    # Not important for experiments past 2018
    if "wm_policy_param_sharing" not in hparams.values().keys():
        hparams.add_hparam("wm_policy_param_sharing", False)
    directories = player_utils.infer_paths(output_dir=FLAGS.output_dir,
                                           world_model=FLAGS.wm_dir,
                                           policy=FLAGS.policy_dir,
                                           data=FLAGS.episodes_data_dir)
    epoch = FLAGS.epoch if FLAGS.epoch == "last" else int(FLAGS.epoch)

    if FLAGS.simulated_env:
        env = player_utils.load_data_and_make_simulated_env(
            directories["data"],
            directories["world_model"],
            hparams,
            which_epoch_data=epoch)
    else:
        env = T2TGymEnv.setup_and_load_epoch(hparams,
                                             data_dir=directories["data"],
                                             which_epoch_data=epoch)
        env = FlatBatchEnv(env)

    env = PlayerEnvWrapper(env)  # pylint: disable=redefined-variable-type

    env = player_utils.wrap_with_monitor(env, FLAGS.video_dir)

    if FLAGS.dry_run:
        for _ in range(5):
            env.reset()
            for i in range(50):
                env.step(i % 3)
            env.step(PlayerEnvWrapper.RESET_ACTION)  # reset
        return

    play.play(env, zoom=FLAGS.zoom, fps=FLAGS.fps)
Code example #12
def training_loop(hparams, output_dir, report_fn=None, report_metric=None):
  """Run the main training loop."""
  if report_fn:
    assert report_metric is not None

  # Directories
  subdirectories = [
      "data", "tmp", "world_model", ("world_model", "debug_videos"),
      "policy", "eval_metrics"
  ]
  directories = setup_directories(output_dir, subdirectories)

  epoch = -1
  data_dir = directories["data"]
  env = T2TGymEnv.setup_env_from_hparams(
      hparams, batch_size=hparams.real_batch_size,
      max_num_noops=hparams.max_num_noops
  )
  env.start_new_epoch(epoch, data_dir)

  if hparams.wm_policy_param_sharing:
    policy_model_dir = directories["world_model"]
  else:
    policy_model_dir = directories["policy"]
  learner = rl_utils.LEARNERS[hparams.base_algo](
      hparams.frame_stack_size, policy_model_dir,
      policy_model_dir, hparams.epochs
  )

  # Timing log function
  log_relative_time = make_relative_timing_fn()

  # Per-epoch state
  epoch_metrics = []
  metrics = {}

  # Collect data from the real environment.
  tf.logging.info("Initial training of the policy in real environment.")
  train_agent_real_env(env, learner, hparams, epoch)
  metrics["mean_reward/train/clipped"] = rl_utils.compute_mean_reward(
      env.current_epoch_rollouts(), clipped=True
  )
  tf.logging.info("Mean training reward (initial): {}".format(
      metrics["mean_reward/train/clipped"]
  ))
  env.generate_data(data_dir)

  eval_metrics_writer = tf.summary.FileWriter(
      directories["eval_metrics"]
  )

  world_model_steps_num = 0

  for epoch in range(hparams.epochs):
    log = make_log_fn(epoch, log_relative_time)

    # Train world model
    log("Training world model")
    world_model_steps_num = train_world_model(
        env, data_dir, directories["world_model"], hparams,
        world_model_steps_num, epoch
    )

    # Train agent
    log("Training policy in simulated environment.")
    train_agent(env, learner, directories["world_model"], hparams, epoch)

    env.start_new_epoch(epoch, data_dir)

    # Train agent on real env (short)
    log("Training policy in real environment.")
    train_agent_real_env(env, learner, hparams, epoch)

    if hparams.stop_loop_early:
      return 0.0

    env.generate_data(data_dir)

    metrics = load_metrics(directories["eval_metrics"], epoch)
    if metrics:
      # Skip eval if metrics have already been written for this epoch. Otherwise
      # we'd overwrite them with wrong data.
      log("Metrics found for this epoch, skipping evaluation.")
    else:
      metrics["mean_reward/train/clipped"] = rl_utils.compute_mean_reward(
          env.current_epoch_rollouts(), clipped=True
      )
      log("Mean training reward: {}".format(
          metrics["mean_reward/train/clipped"]
      ))

      eval_metrics = rl_utils.evaluate_all_configs(hparams, policy_model_dir)
      log("Agent eval metrics:\n{}".format(pprint.pformat(eval_metrics)))
      metrics.update(eval_metrics)

      if hparams.eval_world_model:
        debug_video_path = os.path.join(
            directories["world_model", "debug_videos"],
            "{}.avi".format(env.current_epoch)
        )
        wm_metrics = evaluate_world_model(
            env, hparams, directories["world_model"], debug_video_path
        )
        log("World model eval metrics:\n{}".format(pprint.pformat(wm_metrics)))
        metrics.update(wm_metrics)

      summarize_metrics(eval_metrics_writer, metrics, epoch)

      # Report metrics
      if report_fn:
        if report_metric == "mean_reward":
          metric_name = rl_utils.get_metric_name(
              sampling_temp=hparams.eval_sampling_temps[0],
              max_num_noops=hparams.eval_max_num_noops,
              clipped=False
          )
          report_fn(eval_metrics[metric_name], epoch)
        else:
          report_fn(eval_metrics[report_metric], epoch)

    epoch_metrics.append(metrics)

  # Return the evaluation metrics from the final epoch
  return epoch_metrics[-1]
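A hedged invocation sketch for the loop above (the hparams set name and output directory are assumptions, not from the original):

# Hypothetical entry point; "rlmb_base" and the output path are illustrative only.
from tensor2tensor.utils import registry

hparams = registry.hparams("rlmb_base")
final_metrics = training_loop(hparams, output_dir="/tmp/rlmb")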