Example #1
def train_agent(real_env, learner, world_model_dir, hparams, epoch):
  """Train the PPO agent in the simulated environment."""
  initial_frame_chooser = rl_utils.make_initial_frame_chooser(
      real_env, hparams.frame_stack_size, hparams.simulation_random_starts,
      hparams.simulation_flip_first_random_for_beginning
  )
  env_fn = make_simulated_env_fn_from_hparams(
      real_env, hparams, batch_size=hparams.simulated_batch_size,
      initial_frame_chooser=initial_frame_chooser, model_dir=world_model_dir,
      sim_video_dir=os.path.join(
          learner.agent_model_dir, "sim_videos_{}".format(epoch)
      )
  )
  base_algo_str = hparams.base_algo
  train_hparams = trainer_lib.create_hparams(hparams.base_algo_params)
  if hparams.wm_policy_param_sharing:
    train_hparams.optimizer_zero_grads = True

  rl_utils.update_hparams_from_hparams(
      train_hparams, hparams, base_algo_str + "_"
  )

  final_epoch = hparams.epochs - 1
  is_special_epoch = (epoch + 3) == final_epoch or (epoch + 7) == final_epoch
  is_final_epoch = epoch == final_epoch
  env_step_multiplier = 3 if is_final_epoch else 2 if is_special_epoch else 1
  learner.train(
      env_fn, train_hparams, simulated=True, save_continuously=True,
      epoch=epoch, env_step_multiplier=env_step_multiplier
  )
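
For context, here is a hedged sketch of the kind of outer loop that would call `train_agent` once per epoch in a model-based RL pipeline. The loop structure and the `collect_real_env_data` / `train_world_model` helpers are assumptions for illustration only, not the pipeline's actual code.

# Illustration only: alternate data collection, world-model training and
# in-simulation policy training, once per epoch.
for epoch in range(hparams.epochs):
  collect_real_env_data(real_env, learner, hparams, epoch)      # assumed helper
  train_world_model(real_env, world_model_dir, hparams, epoch)  # assumed helper
  train_agent(real_env, learner, world_model_dir, hparams, epoch)
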
Example #2
    def __init__(self,
                 real_env,
                 world_model_dir,
                 hparams,
                 random_starts,
                 setable_initial_frames=False):
        """Init.

    Args:
       real_env: gym environment.
       world_model_dir: path to world model checkpoint directory.
       hparams: hparams for rlmb pipeline.
       random_starts: if restart world model from random frames, or only
         from initial ones (from beginning of episodes). Valid only when
         `setable_initial_fames` set to False.
       setable_initial_frames: if True, initial_frames for world model should be
         set by `add_to_initial_stack`.
    """

        self._setable_initial_frames = setable_initial_frames

        if self._setable_initial_frames:
            # Initial frames are provided externally (via `add_to_initial_stack`);
            # keep a zero-initialized stack of shape
            # (1, frame_stack_size) + observation_shape and always return it.
            real_obs_shape = real_env.observation_space.shape
            shape = (1, hparams.frame_stack_size) + real_obs_shape
            self._initial_frames = np.zeros(shape=shape, dtype=np.uint8)

            def initial_frame_chooser(batch_size):
                assert batch_size == 1
                return self._initial_frames

        else:
            # Otherwise, sample initial frames from the real environment's
            # recorded rollouts.
            initial_frame_chooser = rl_utils.make_initial_frame_chooser(
                real_env,
                hparams.frame_stack_size,
                simulation_random_starts=random_starts,
                simulation_flip_first_random_for_beginning=False)
        env_fn = make_simulated_env_fn_from_hparams(
            real_env,
            hparams,
            batch_size=1,
            initial_frame_chooser=initial_frame_chooser,
            model_dir=world_model_dir,
        )

        env = env_fn(in_graph=False)
        self.env = FlatBatchEnv(env)

        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space
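
A rough usage sketch of the settable-initial-frames path. The class name `SimulatedGymEnvWrapper`, the `warmup_frames` source, and the `add_to_initial_stack` method (referenced in the docstring but not shown in this excerpt) are placeholders assumed for illustration:

# Hypothetical usage; the names below are placeholders, not the excerpt's API.
sim = SimulatedGymEnvWrapper(real_env, world_model_dir, hparams,
                             random_starts=False,
                             setable_initial_frames=True)
for frame in warmup_frames:          # frames collected from the real env
    sim.add_to_initial_stack(frame)  # assumed to fill self._initial_frames
obs = sim.env.reset()                # the simulated env starts from that stack
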
Example #3
    def eval_fn(env, loop_hparams, policy_hparams, policy_dir, sampling_temp):
        """Eval function."""
        base_env = env
        env = rl_utils.BatchStackWrapper(env, loop_hparams.frame_stack_size)
        agent = make_agent_from_hparams(agent_type, base_env, env,
                                        loop_hparams, policy_hparams,
                                        planner_hparams, model_dir, policy_dir,
                                        sampling_temp, video_writers)

        if eval_mode == "agent_simulated":
            real_env = base_env.new_like(batch_size=1)
            stacked_env = rl_utils.BatchStackWrapper(
                real_env, loop_hparams.frame_stack_size)
            collect_frames_for_random_starts(real_env, stacked_env, agent,
                                             loop_hparams.frame_stack_size,
                                             random_starts_step_limit,
                                             log_every_steps)
            initial_frame_chooser = rl_utils.make_initial_frame_chooser(
                real_env,
                loop_hparams.frame_stack_size,
                simulation_random_starts=True,
                simulation_flip_first_random_for_beginning=False,
                split=None,
            )
            env_fn = rl.make_simulated_env_fn_from_hparams(
                real_env,
                loop_hparams,
                batch_size=loop_hparams.eval_batch_size,
                initial_frame_chooser=initial_frame_chooser,
                model_dir=model_dir)
            sim_env = env_fn(in_graph=False)
            env = rl_utils.BatchStackWrapper(sim_env,
                                             loop_hparams.frame_stack_size)

        kwargs = {}
        if not agent.records_own_videos:
            kwargs["video_writers"] = video_writers
        step_limit = base_env.rl_env_max_episode_steps
        if step_limit == -1:
            step_limit = None
        rl_utils.run_rollouts(env,
                              agent,
                              env.reset(),
                              log_every_steps=log_every_steps,
                              step_limit=step_limit,
                              **kwargs)
        if eval_mode == "agent_real":
            assert len(base_env.current_epoch_rollouts()) == env.batch_size
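
Note that `eval_fn` closes over names defined in its enclosing scope (`agent_type`, `eval_mode`, `planner_hparams`, `model_dir`, `video_writers`, `random_starts_step_limit`, `log_every_steps`), so it is not callable on its own. A hedged sketch of the call the surrounding driver would make, with a placeholder environment constructor:

# Illustration only: the enclosing function builds the real (batched) env
# and invokes eval_fn once per evaluation configuration.
base_env = make_real_batch_env(loop_hparams)   # hypothetical constructor
eval_fn(base_env, loop_hparams, policy_hparams,
        policy_dir="/path/to/policy",          # placeholder path
        sampling_temp=0.5)
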
Example #4
def make_simulated_gym_env(real_env, world_model_dir, hparams, random_starts):
    """Gym environment with world model."""
    initial_frame_chooser = rl_utils.make_initial_frame_chooser(
        real_env,
        hparams.frame_stack_size,
        simulation_random_starts=random_starts,
        simulation_flip_first_random_for_beginning=False)
    env_fn = make_simulated_env_fn_from_hparams(
        real_env,
        hparams,
        batch_size=1,
        initial_frame_chooser=initial_frame_chooser,
        model_dir=world_model_dir)
    env = env_fn(in_graph=False)
    flat_env = FlatBatchEnv(env)
    return flat_env
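
A minimal sketch of driving the returned environment. The (observation, reward, done) step convention for `FlatBatchEnv` is assumed here, mirroring the batched `(sim_obs, sim_rewards, _)` step in Example #5:

env = make_simulated_gym_env(real_env, world_model_dir, hparams,
                             random_starts=True)
obs = env.reset()
for _ in range(100):
    action = env.action_space.sample()
    # Step convention assumed: (observation, reward, done).
    obs, reward, done = env.step(action)
    if done:
        obs = env.reset()
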
Example #5
def evaluate_world_model(real_env, hparams, world_model_dir, debug_video_path):
  """Evaluate the world model (reward accuracy)."""
  frame_stack_size = hparams.frame_stack_size
  rollout_subsequences = []
  def initial_frame_chooser(batch_size):
    assert batch_size == len(rollout_subsequences)
    return np.stack([
        [frame.observation.decode() for frame in subsequence[:frame_stack_size]]
        for subsequence in rollout_subsequences
    ])

  env_fn = make_simulated_env_fn_from_hparams(
      real_env, hparams, batch_size=hparams.wm_eval_batch_size,
      initial_frame_chooser=initial_frame_chooser, model_dir=world_model_dir
  )
  sim_env = env_fn(in_graph=False)
  subsequence_length = int(
      max(hparams.wm_eval_rollout_ratios) * hparams.simulated_rollout_length
  )
  rollouts = real_env.current_epoch_rollouts(
      split=tf.estimator.ModeKeys.EVAL,
      minimal_rollout_frames=(subsequence_length + frame_stack_size)
  )

  video_writer = common_video.WholeVideoWriter(
      fps=10, output_path=debug_video_path, file_format="avi"
  )

  reward_accuracies_by_length = {
      int(ratio * hparams.simulated_rollout_length): []
      for ratio in hparams.wm_eval_rollout_ratios
  }
  for _ in range(hparams.wm_eval_num_batches):
    rollout_subsequences[:] = random_rollout_subsequences(
        rollouts, hparams.wm_eval_batch_size,
        subsequence_length + frame_stack_size
    )

    eval_subsequences = [
        subsequence[(frame_stack_size - 1):]
        for subsequence in rollout_subsequences
    ]

    # Check that the initial observation is the same in the real and simulated
    # rollout.
    sim_init_obs = sim_env.reset()
    def decode_real_obs(index):
      return np.stack([
          subsequence[index].observation.decode()
          for subsequence in eval_subsequences  # pylint: disable=cell-var-from-loop
      ])
    real_init_obs = decode_real_obs(0)
    assert np.all(sim_init_obs == real_init_obs)

    debug_frame_batches = []
    def append_debug_frame_batch(sim_obs, real_obs, sim_cum_rews,
                                 real_cum_rews, sim_rews, real_rews):
      """Add a debug frame."""
      rews = [[sim_cum_rews, sim_rews], [real_cum_rews, real_rews]]
      headers = []
      for j in range(len(sim_obs)):
        local_nps = []
        for i in range(2):
          img = PIL_Image().new("RGB", (sim_obs.shape[-2], 11))
          draw = PIL_ImageDraw().Draw(img)
          draw.text((0, 0), "c:{:3}, r:{:3}".format(int(rews[i][0][j]),
                                                    int(rews[i][1][j])),
                    fill=(255, 0, 0))
          local_nps.append(np.asarray(img))
        local_nps.append(np.zeros_like(local_nps[0]))
        headers.append(np.concatenate(local_nps, axis=1))
      errs = absolute_hinge_difference(sim_obs, real_obs)
      headers = np.stack(headers)
      debug_frame_batches.append(  # pylint: disable=cell-var-from-loop
          np.concatenate([headers,
                          np.concatenate([sim_obs, real_obs, errs], axis=2)],
                         axis=1)
      )
    append_debug_frame_batch(sim_init_obs, real_init_obs,
                             np.zeros(hparams.wm_eval_batch_size),
                             np.zeros(hparams.wm_eval_batch_size),
                             np.zeros(hparams.wm_eval_batch_size),
                             np.zeros(hparams.wm_eval_batch_size))

    (sim_cum_rewards, real_cum_rewards) = (
        np.zeros(hparams.wm_eval_batch_size) for _ in range(2)
    )
    # Step the simulated env in lockstep with the recorded real rollouts and
    # compare cumulative rewards at the configured evaluation lengths.
    for i in range(subsequence_length):
      actions = [subsequence[i].action for subsequence in eval_subsequences]
      (sim_obs, sim_rewards, _) = sim_env.step(actions)
      sim_cum_rewards += sim_rewards

      real_rewards = np.array([
          subsequence[i + 1].reward for subsequence in eval_subsequences
      ])
      real_cum_rewards += real_rewards
      for (length, reward_accuracies) in six.iteritems(
          reward_accuracies_by_length
      ):
        if i + 1 == length:
          reward_accuracies.append(
              np.sum(sim_cum_rewards == real_cum_rewards) /
              len(real_cum_rewards)
          )

      real_obs = decode_real_obs(i + 1)
      append_debug_frame_batch(sim_obs, real_obs, sim_cum_rewards,
                               real_cum_rewards, sim_rewards, real_rewards)

    for debug_frames in np.stack(debug_frame_batches, axis=1):
      for debug_frame in debug_frames:
        video_writer.write(debug_frame)

  video_writer.finish_to_disk()

  return {
      "reward_accuracy/at_{}".format(length): np.mean(reward_accuracies)
      for (length, reward_accuracies) in six.iteritems(
          reward_accuracies_by_length
      )
  }
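
The reward-accuracy metric returned above is the fraction of batch elements whose cumulative simulated reward exactly equals the real cumulative reward at a given rollout length. A tiny self-contained illustration of that expression:

import numpy as np

sim_cum_rewards = np.array([2.0, 0.0, 3.0, 1.0])
real_cum_rewards = np.array([2.0, 1.0, 3.0, 0.0])
# Two of the four rollouts match exactly, so the accuracy is 0.5.
accuracy = np.sum(sim_cum_rewards == real_cum_rewards) / len(real_cum_rewards)
print(accuracy)  # 0.5
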