def test_play_env_problem_with_policy(self):
    env = gym_env_problem.GymEnvProblem(
        base_env_name="CartPole-v0", batch_size=2, reward_range=(-1, 1))

    # Make sure that at most 4 observations reach the policy function.
    len_history_for_policy = 4

    def policy_fun(observations, lengths, state=None, rng=None):
      del lengths
      b = observations.shape[0]
      # Assert that observations from time-step len_history_for_policy onwards
      # are zeros.
      self.assertTrue(
          np.all(observations[:, len_history_for_policy:, ...] == 0))
      self.assertFalse(
          np.all(observations[:, :len_history_for_policy, ...] == 0))
      a = env.action_space.n
      p = np.random.uniform(size=(b, 1, a))
      p = np.exp(p)
      p = p / np.sum(p, axis=-1, keepdims=True)
      return np.log(p), np.mean(p, axis=-1), state, rng

    max_timestep = 15
    num_trajectories = 2
    trajectories, _, _, _ = env_problem_utils.play_env_problem_with_policy(
        env,
        policy_fun,
        num_trajectories=num_trajectories,
        max_timestep=max_timestep,
        len_history_for_policy=len_history_for_policy)

    self.assertEqual(num_trajectories, len(trajectories))

    # Check shapes within trajectories.
    traj = trajectories[0]
    T = traj[1].shape[0]  # pylint: disable=invalid-name
    self.assertEqual((T + 1, 4), traj[0].shape)  # (4,) is OBS
    self.assertEqual((T,), traj[2].shape)
    self.assertEqual(T, len(traj[4]["log_prob_actions"]))
    self.assertEqual(T, len(traj[4]["value_predictions"]))
    self.assertLessEqual(T, max_timestep)

    traj = trajectories[1]
    T = traj[1].shape[0]  # pylint: disable=invalid-name
    self.assertEqual((T + 1, 4), traj[0].shape)
    self.assertEqual((T,), traj[2].shape)
    self.assertEqual(T, len(traj[4]["log_prob_actions"]))
    self.assertEqual(T, len(traj[4]["value_predictions"]))
    self.assertLessEqual(T, max_timestep)
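
    The shape checks above pin down the trajectory layout: traj[0] holds T + 1 observations, traj[1] and traj[2] hold T per-step entries each, and traj[4] is a dict of per-step infos. A hedged sketch of walking the returned trajectories under that layout follows; whether traj[1] and traj[2] are actions and rewards is an assumption, not something the snippet confirms.

    for traj in trajectories:
      observations = traj[0]   # shape (T + 1, 4) for CartPole
      per_step = traj[1]       # length T (presumably the actions)
      infos = traj[4]          # per-step infos dict
      t = per_step.shape[0]
      assert observations.shape[0] == t + 1
      assert len(infos["log_prob_actions"]) == t
      assert len(infos["value_predictions"]) == t
      assert t <= max_timestep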
Example #2
    def play_env(self,
                 env=None,
                 nsteps=100,
                 base_env_name=None,
                 batch_size=5,
                 reward_range=None):
        """Creates `GymEnvProblem` with the given arguments and plays it randomly.

        Args:
          env: optional env.
          nsteps: plays the env randomly for nsteps.
          base_env_name: passed to GymEnvProblem's init.
          batch_size: passed to GymEnvProblem's init.
          reward_range: passed to GymEnvProblem's init.

        Returns:
          tuple of gym_env_problem, number of trajectories done,
          number of trajectories done in the last step.
        """

        if env is None:
            env = gym_env_problem.GymEnvProblem(base_env_name=base_env_name,
                                                batch_size=batch_size,
                                                reward_range=reward_range)
            # Usually done by a registered subclass; we do it manually in the test.
            env.name = base_env_name

        # Reset all environments.
        env.reset()

        # Play for some steps to generate data.
        num_dones = 0
        num_dones_in_last_step = 0
        for _ in range(nsteps):
            # Sample actions.
            actions = np.stack(
                [env.action_space.sample() for _ in range(batch_size)])
            # Step through it.
            _, _, dones, _ = env.step(actions)
            # Get the indices where we are done ...
            done_indices = env_problem_utils.done_indices(dones)
            # ... and reset those.
            env.reset(indices=done_indices)
            # Count the number of dones we got, in this step and overall.
            num_dones_in_last_step = sum(dones)
            num_dones += num_dones_in_last_step

        return env, num_dones, num_dones_in_last_step
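
A hedged usage sketch for the helper above, from inside the same test class; the call and assertions are illustrative, not taken from the surrounding tests.

env, num_dones, num_dones_in_last_step = self.play_env(
    base_env_name="CartPole-v0",
    nsteps=100,
    batch_size=5,
    reward_range=(-1, 1))
# Over 100 random CartPole steps, at least some episodes should have ended.
self.assertGreater(num_dones, 0)
self.assertLessEqual(num_dones_in_last_step, num_dones)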
  def get_wrapped_env(self, name="CartPole-v0", max_episode_steps=2):
    wrapper_fn = functools.partial(
        gym_utils.gym_env_wrapper,
        **{
            "rl_env_max_episode_steps": max_episode_steps,
            "maxskip_env": False,
            "rendered_env": False,
            "rendered_env_resize_to": None,  # Do not resize frames
            "sticky_actions": False,
            "output_dtype": None,
        })

    return gym_env_problem.GymEnvProblem(base_env_name=name,
                                         batch_size=1,
                                         env_wrapper_fn=wrapper_fn,
                                         discrete_rewards=False)
    def _make_wrapped_env(self, name, max_episode_steps=2):
        wrapper_fn = functools.partial(
            gym_utils.gym_env_wrapper,
            **{
                'rl_env_max_episode_steps': max_episode_steps,
                'maxskip_env': False,
                'rendered_env': False,
                'rendered_env_resize_to': None,  # Do not resize frames
                'sticky_actions': False,
                'output_dtype': None,
            })

        return gym_env_problem.GymEnvProblem(base_env_name=name,
                                             batch_size=2,
                                             env_wrapper_fn=wrapper_fn,
                                             discrete_rewards=False)
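
A sketch of how the episode-step cap could be exercised (hypothetical; with rl_env_max_episode_steps=2 the underlying gym TimeLimit wrapper should report done after two steps):

env = self._make_wrapped_env("CartPole-v0", max_episode_steps=2)
env.reset()
dones = None
for _ in range(2):
    actions = np.stack([env.action_space.sample() for _ in range(2)])
    _, _, dones, _ = env.step(actions)
# Both batched envs should hit the 2-step limit; CartPole is very unlikely
# to terminate naturally within two steps.
self.assertTrue(all(dones))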
Example #5
def make_env(batch_size=1,
             env_problem_name="",
             resize=True,
             resized_height=105,
             resized_width=80,
             max_timestep="None",
             clip_rewards=True,
             parallelism=1,
             use_tpu=False,
             **env_kwargs):
  """Creates the env."""

  if clip_rewards:
    env_kwargs.update({"reward_range": (-1, 1), "discrete_rewards": True})
  else:
    env_kwargs.update({"discrete_rewards": False})

  # No resizing needed, so use the plain GymEnvProblem.
  if not resize:  # None or False
    return gym_env_problem.GymEnvProblem(
        base_env_name=env_problem_name,
        batch_size=batch_size,
        parallelism=parallelism,
        **env_kwargs)

  try:
    max_timestep = int(max_timestep)
  except Exception:  # pylint: disable=broad-except
    max_timestep = None

  wrapper_fn = functools.partial(
      gym_utils.gym_env_wrapper, **{
          "rl_env_max_episode_steps": max_timestep,
          "maxskip_env": True,
          "rendered_env": True,
          "rendered_env_resize_to": (resized_height, resized_width),
          "sticky_actions": False,
          "output_dtype": np.int32 if use_tpu else None,
      })

  return rendered_env_problem.RenderedEnvProblem(
      base_env_name=env_problem_name,
      batch_size=batch_size,
      parallelism=parallelism,
      env_wrapper_fn=wrapper_fn,
      **env_kwargs)
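
A hedged usage sketch of the factory above; the env name is illustrative, and max_timestep is passed as a string because, as shown above, it is parsed with int() ("None" disables the cap):

env = make_env(
    batch_size=4,
    env_problem_name="PongNoFrameskip-v4",  # illustrative Atari-style gym name
    resize=True,
    resized_height=105,
    resized_width=80,
    max_timestep="2000",
    clip_rewards=True,
    parallelism=1)
env.reset()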
Example #6
def make_env(batch_size=8, **env_kwargs):
    """Creates the env."""

    if FLAGS.clip_rewards:
        env_kwargs.update({"reward_range": (-1, 1), "discrete_rewards": True})
    else:
        env_kwargs.update({"discrete_rewards": False})

    # TODO(afrozm): Should we leave out some cores?
    parallelism = multiprocessing.cpu_count() if FLAGS.parallelize_envs else 1

    # No resizing needed, so use the plain GymEnvProblem.
    if not FLAGS.resize:  # None or False
        return gym_env_problem.GymEnvProblem(
            base_env_name=FLAGS.env_problem_name,
            batch_size=batch_size,
            parallelism=parallelism,
            **env_kwargs)

    max_timestep = None
    try:
        max_timestep = int(FLAGS.max_timestep)
    except Exception:  # pylint: disable=broad-except
        pass

    wrapper_fn = functools.partial(
        gym_utils.gym_env_wrapper, **{
            "rl_env_max_episode_steps": max_timestep,
            "maxskip_env": True,
            "rendered_env": True,
            "rendered_env_resize_to":
            (FLAGS.resized_height, FLAGS.resized_width),
            "sticky_actions": False,
            "output_dtype": onp.int32 if FLAGS.use_tpu else None,
        })

    return rendered_env_problem.RenderedEnvProblem(
        base_env_name=FLAGS.env_problem_name,
        batch_size=batch_size,
        parallelism=parallelism,
        env_wrapper_fn=wrapper_fn,
        **env_kwargs)
Example #7
    def test_setup(self):
        ep = gym_env_problem.GymEnvProblem(base_env_name="CartPole-v0",
                                           batch_size=5)
        # Check that the environments were created and that there are `batch_size` of them.
        ep.assert_common_preconditions()

        # Expectations on the observation space.
        observation_space = ep.observation_space
        self.assertIsInstance(observation_space, Box)
        self.assertEqual(observation_space.shape, (4, ))
        self.assertEqual(observation_space.dtype, np.float32)

        # Expectations on the action space.
        action_space = ep.action_space
        self.assertIsInstance(action_space, Discrete)
        self.assertEqual(action_space.shape, ())
        self.assertEqual(action_space.dtype, np.int64)
        self.assertEqual(ep.num_actions, 2)

        # Reward range is infinite here.
        self.assertFalse(ep.is_reward_range_finite)
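
By contrast, constructing the problem with an explicit reward_range should make the range finite. A minimal hedged sketch; the property check is an assumption following from the constructor argument, not something asserted in the snippet above.

ep = gym_env_problem.GymEnvProblem(
    base_env_name="CartPole-v0", batch_size=5, reward_range=(-1, 1))
ep.assert_common_preconditions()
# With an explicit (-1, 1) range, the reward range is expected to be finite.
self.assertTrue(ep.is_reward_range_finite)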
Example #8
def make_env(name, batch_size, max_timestep, clip_rewards, rendered_env,
             resize_dims, **env_kwargs):
  """Creates the env."""

  if clip_rewards:
    env_kwargs.update({"reward_range": (-1, 1), "discrete_rewards": True})
  else:
    env_kwargs.update({"discrete_rewards": False})

  # TODO(afrozm): Should we leave out some cores?
  parallelism = multiprocessing.cpu_count() if FLAGS.parallelize_envs else 1

  # No rendered env requested, so use the plain GymEnvProblem.
  if not rendered_env:
    return gym_env_problem.GymEnvProblem(
        base_env_name=name,
        batch_size=batch_size,
        parallelism=parallelism,
        **env_kwargs)

  wrapper_fn = functools.partial(
      gym_utils.gym_env_wrapper, **{
          "rl_env_max_episode_steps": max_timestep,
          "maxskip_env": True,
          "rendered_env": True,
          "rendered_env_resize_to": resize_dims,
          "sticky_actions": False,
          "output_dtype": onp.int32 if FLAGS.use_tpu else None,
      })

  return rendered_env_problem.RenderedEnvProblem(
      base_env_name=name,
      batch_size=batch_size,
      parallelism=parallelism,
      env_wrapper_fn=wrapper_fn,
      **env_kwargs)
Example #9
    def test_interaction_with_env(self):
        batch_size = 5
        reward_range = (-1, 1)
        ep = gym_env_problem.GymEnvProblem(base_env_name="KellyCoinflip-v0",
                                           batch_size=batch_size,
                                           reward_range=reward_range)

        # Resets all environments.
        ep.reset()

        # Let's play a few steps.
        nsteps = 100
        num_trajectories_completed = 0
        num_timesteps_completed = 0
        # If batch_done_at_step[i] == j, then the i-th env was last done at step j.
        batch_done_at_step = np.full(batch_size, -1)
        for i in range(nsteps):
            # Sample batch_size actions from the action space and stack them (since
            # that is the expected type).
            actions = np.stack(
                [ep.action_space.sample() for _ in range(batch_size)])

            _, _, dones, _ = ep.step(actions)

            # Track the number of completed trajectories and check that it
            # matches ep's count of completed trajectories.

            num_done = sum(dones)
            num_trajectories_completed += num_done

            self.assertEqual(num_trajectories_completed,
                             len(ep.trajectories.completed_trajectories))

            # Get the indices where we are done ...
            done_indices = env_problem_utils.done_indices(dones)

            # ... and reset those.
            ep.reset(indices=done_indices)

            # If nothing got done, go on to the next step.
            if done_indices.size == 0:
                # i.e. this is an empty array.
                continue

            # See when these indices were last done and calculate how many time-steps
            # each one took to get done.
            num_timesteps_completed += sum(i + 1 -
                                           batch_done_at_step[done_indices])
            batch_done_at_step[done_indices] = i

            # This should also match the number of time-steps completed given by ep.
            num_timesteps_completed_ep = sum(
                ct.num_time_steps
                for ct in ep.trajectories.completed_trajectories)
            self.assertEqual(num_timesteps_completed,
                             num_timesteps_completed_ep)

        # Reset the trajectories.
        ep.trajectories.reset_batch_trajectories()
        self.assertEqual(0, len(ep.trajectories.completed_trajectories))
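
Finally, the done bookkeeping above leans on env_problem_utils.done_indices returning the batch positions whose done flag is set; a small hedged sketch of that assumed behaviour:

dones = np.array([True, False, True, False, False])
done_indices = env_problem_utils.done_indices(dones)
# Expected (assumption): indices of the True entries, usable for env.reset(indices=...).
np.testing.assert_array_equal([0, 2], done_indices)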