def make_env(batch_size=8):
  """Creates the env."""

  # No resizing needed, so let's be on the normal EnvProblem.
  if not FLAGS.resize:  # None or False
    return env_problem.EnvProblem(
        base_env_name=FLAGS.env_problem_name,
        batch_size=batch_size,
        reward_range=(-1, 1))

  max_timestep = None
  try:
    max_timestep = int(FLAGS.max_timestep)
  except Exception:  # pylint: disable=broad-except
    pass

  wrapper_fn = functools.partial(
      gym_utils.gym_env_wrapper,
      **{
          "rl_env_max_episode_steps": max_timestep,
          "maxskip_env": True,
          "rendered_env": True,
          "rendered_env_resize_to": (FLAGS.resized_height, FLAGS.resized_width),
          "sticky_actions": False,
          "output_dtype": onp.int32 if FLAGS.use_tpu else None,
      })

  return rendered_env_problem.RenderedEnvProblem(
      base_env_name=FLAGS.env_problem_name,
      batch_size=batch_size,
      env_wrapper_fn=wrapper_fn,
      reward_range=(-1, 1))

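# A minimal sketch of the absl flags that `make_env` above reads. The flag
# names come straight from the function body, but the defaults and help
# strings here are assumptions, not the original definitions. Note that
# max_timestep is a string flag, consistent with the int() parse above.
from absl import flags

flags.DEFINE_string("env_problem_name", None, "Name of the EnvProblem to use.")
flags.DEFINE_boolean("resize", False, "Whether to resize rendered frames.")
flags.DEFINE_integer("resized_height", 105, "Height to resize frames to.")
flags.DEFINE_integer("resized_width", 80, "Width to resize frames to.")
flags.DEFINE_string("max_timestep", None,
                    "Maximum number of timesteps per episode, parsed as int.")
flags.DEFINE_boolean("use_tpu", False, "If true, emit int32 frames for TPU.")

FLAGS = flags.FLAGS
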
def test_play_env_problem_with_policy(self):
  env = env_problem.EnvProblem(
      base_env_name="CartPole-v0", batch_size=2, reward_range=(-1, 1))

  def policy_fun(observations, rng=None):
    b, t = observations.shape[:2]
    a = env.action_space.n
    p = np.random.uniform(size=(b, t, a))
    p = np.exp(p)
    p = p / np.sum(p, axis=-1, keepdims=True)
    return np.log(p), (), rng

  max_timestep = 15
  num_trajectories = 2
  trajectories, _ = env_problem_utils.play_env_problem_with_policy(
      env,
      policy_fun,
      num_trajectories=num_trajectories,
      max_timestep=max_timestep,
      boundary=20)

  self.assertEqual(num_trajectories, len(trajectories))

  # Check shapes within trajectories.
  traj = trajectories[0]
  T = traj[1].shape[0]  # pylint: disable=invalid-name
  self.assertEqual((T + 1, 4), traj[0].shape)  # (4,) is OBS
  self.assertEqual((T,), traj[2].shape)
  self.assertLessEqual(T, max_timestep)

  traj = trajectories[1]
  T = traj[1].shape[0]  # pylint: disable=invalid-name
  self.assertEqual((T + 1, 4), traj[0].shape)
  self.assertEqual((T,), traj[2].shape)
  self.assertLessEqual(T, max_timestep)

def make_env():
  """Creates the env."""
  if FLAGS.env_name:
    return gym.make(FLAGS.env_name)

  assert FLAGS.env_problem_name

  # No resizing needed, so let's be on the normal EnvProblem.
  if not FLAGS.resize:  # None or False
    return env_problem.EnvProblem(
        base_env_name=FLAGS.env_problem_name,
        batch_size=FLAGS.batch_size,
        reward_range=(-1, 1))

  wrapper_fn = functools.partial(
      gym_utils.gym_env_wrapper,
      **{
          "rl_env_max_episode_steps": FLAGS.max_timestep,
          "maxskip_env": True,
          "rendered_env": True,
          "rendered_env_resize_to": (FLAGS.resized_height, FLAGS.resized_width),
          "sticky_actions": False,
      })

  return rendered_env_problem.RenderedEnvProblem(
      base_env_name=FLAGS.env_problem_name,
      batch_size=FLAGS.batch_size,
      env_wrapper_fn=wrapper_fn,
      reward_range=(-1, 1))

def test_default_processed_rewards_discrete(self):
  # This differs from the test above in that KellyCoinflip has a Tuple
  # observation space.
  ep = env_problem.EnvProblem(
      base_env_name="KellyCoinflip-v0", batch_size=5, reward_range=None)
  ep.assert_common_preconditions()

  # Assert reward range is finite here.
  self.assertTrue(ep.is_reward_range_finite)

  # Assert that it is as expected of the underlying environment.
  reward_range = ep.reward_range
  self.assertEqual(0, reward_range[0])
  # Google's version of Gym has maxWealth, vs max_wealth externally.
  max_wealth = getattr(
      ep._envs[0], "maxWealth",  # pylint: disable=protected-access
      getattr(ep._envs[0], "max_wealth", None))  # pylint: disable=protected-access
  self.assertIsNotNone(max_wealth)
  self.assertEqual(max_wealth, reward_range[1])

  # Check that the processed rewards are discrete.
  self.assertTrue(ep.is_processed_rewards_discrete)

  # Assert on the number of rewards.
  self.assertEqual(ep.num_rewards, reward_range[1] - reward_range[0] + 1)

def test_resets_properly(self):
  base_env_name = "CartPole-v0"
  batch_size = 5
  reward_range = (-1, 1)
  nsteps = 100

  env = env_problem.EnvProblem(
      base_env_name=base_env_name,
      batch_size=batch_size,
      reward_range=reward_range)
  env.name = base_env_name

  num_dones = 0
  while num_dones == 0:
    env, num_dones, _ = self.play_env(
        env=env,
        nsteps=nsteps,
        batch_size=batch_size,
        reward_range=reward_range)

  # Some completed trajectories have been generated.
  self.assertGreater(env.trajectories.num_completed_trajectories, 0)

  # This should clear the env completely of any state.
  env.reset()

  # Assert that there aren't any completed trajectories in the env now.
  self.assertEqual(env.trajectories.num_completed_trajectories, 0)

def test_interaction_with_env(self):
  batch_size = 5
  reward_range = (-1, 1)
  ep = env_problem.EnvProblem(
      base_env_name="KellyCoinflip-v0",
      batch_size=batch_size,
      reward_range=reward_range)

  # Resets all environments.
  ep.reset()

  # Let's play a few steps.
  nsteps = 100
  num_trajectories_completed = 0
  num_timesteps_completed = 0
  # If batch_done_at_step[i] = j, then it means that the i^th env last got
  # done at step j.
  batch_done_at_step = np.full(batch_size, -1)
  for i in range(nsteps):
    # Sample batch_size actions from the action space and stack them (since
    # that is the expected type).
    actions = np.stack([ep.action_space.sample() for _ in range(batch_size)])

    _, _, dones, _ = ep.step(actions)

    # Do the book-keeping on number of trajectories completed and expect that
    # it matches ep's completed number.
    num_done = sum(dones)
    num_trajectories_completed += num_done
    self.assertEqual(num_trajectories_completed,
                     len(ep.trajectories.completed_trajectories))

    # Get the indices where we are done ...
    done_indices = env_problem.EnvProblem.done_indices(dones)

    # ... and reset those.
    ep.reset(indices=done_indices)

    # If nothing got done, go on to the next step.
    if done_indices.size == 0:  # i.e. this is an empty array.
      continue

    # See when these indices were last done and calculate how many time-steps
    # each one took to get done.
    num_timesteps_completed += sum(i + 1 - batch_done_at_step[done_indices])
    batch_done_at_step[done_indices] = i

  # This should also match the number of time-steps completed given by ep.
  num_timesteps_completed_ep = sum(
      ct.num_time_steps() for ct in ep.trajectories.completed_trajectories)
  self.assertEqual(num_timesteps_completed, num_timesteps_completed_ep)

  # Reset the trajectories.
  ep.trajectories.reset_batch_trajectories()
  self.assertEqual(0, len(ep.trajectories.completed_trajectories))

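# The test above relies on `done_indices` returning a NumPy array (so that
# `.size == 0` works, and so that it can index batch_done_at_step) holding the
# positions where `dones` is True. A minimal sketch of that assumed behavior;
# the real helper lives in env_problem / env_problem_utils.
def done_indices_sketch(dones):
  """Returns the indices i where dones[i] is True, as a 1-D int array."""
  return np.argwhere(dones).squeeze(axis=-1)
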
def test_reward_range(self):
  # Passing reward_range=None means take the reward range of the underlying
  # environment as the reward range.
  ep = env_problem.EnvProblem(
      base_env_name="FrozenLake-v0", batch_size=5, reward_range=None)
  ep.assert_common_preconditions()

  # Assert reward range is finite here.
  self.assertTrue(ep.is_reward_range_finite)

  # Assert that it matches the underlying environment's reward range, since
  # reward_range=None was passed.
  self.assertEqual(0, ep.reward_range[0])
  self.assertEqual(1, ep.reward_range[1])

def play_env(self,
             env=None,
             nsteps=100,
             base_env_name=None,
             batch_size=5,
             reward_range=None):
  """Creates `EnvProblem` with the given arguments and plays it randomly.

  Args:
    env: optional env.
    nsteps: plays the env randomly for nsteps.
    base_env_name: passed to EnvProblem's init.
    batch_size: passed to EnvProblem's init.
    reward_range: passed to EnvProblem's init.

  Returns:
    tuple of env_problem, number of trajectories done, number of trajectories
    done in the last step.
  """
  if env is None:
    env = env_problem.EnvProblem(
        base_env_name=base_env_name,
        batch_size=batch_size,
        reward_range=reward_range)
    # Usually done by a registered subclass, we do this manually in the test.
    env.name = base_env_name

  # Reset all environments.
  env.reset()

  # Play for some steps to generate data.
  num_dones = 0
  num_dones_in_last_step = 0
  for _ in range(nsteps):
    # Sample actions.
    actions = np.stack([env.action_space.sample() for _ in range(batch_size)])
    # Step through it.
    _, _, dones, _ = env.step(actions)
    # Get the indices where we are done ...
    done_indices = env_problem_utils.done_indices(dones)
    # ... and reset those.
    env.reset(indices=done_indices)
    # Count the number of dones we got, in this step and overall.
    num_dones_in_last_step = sum(dones)
    num_dones += num_dones_in_last_step

  return env, num_dones, num_dones_in_last_step

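# Hypothetical usage of `play_env` above, mirroring the pattern in
# test_resets_properly: keep playing CartPole randomly until at least one
# trajectory completes, then check the trajectory book-keeping. A sketch,
# not a test from the original suite.
def test_play_env_usage_sketch(self):
  env, num_dones, _ = self.play_env(
      base_env_name="CartPole-v0", nsteps=100, batch_size=5,
      reward_range=(-1, 1))
  while num_dones == 0:  # 100 random steps may, rarely, finish no episode.
    env, num_dones, _ = self.play_env(
        env=env, nsteps=100, batch_size=5, reward_range=(-1, 1))
  self.assertGreater(env.trajectories.num_completed_trajectories, 0)
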
def get_wrapped_env(self, name="CartPole-v0", max_episode_steps=2):
  wrapper_fn = functools.partial(
      gym_utils.gym_env_wrapper,
      **{
          "rl_env_max_episode_steps": max_episode_steps,
          "maxskip_env": False,
          "rendered_env": False,
          "rendered_env_resize_to": None,  # Do not resize frames.
          "sticky_actions": False,
          "output_dtype": None,
      })

  return env_problem.EnvProblem(
      base_env_name=name,
      batch_size=1,
      env_wrapper_fn=wrapper_fn,
      reward_range=(-1, 1))

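# Sketch of how `get_wrapped_env` might be exercised, assuming
# rl_env_max_episode_steps enforces a TimeLimit-style cap: with
# max_episode_steps=2, the batch-of-1 env should report done within two
# steps regardless of CartPole's dynamics. Hypothetical test, not from the
# original suite.
def test_episode_step_limit_sketch(self):
  env = self.get_wrapped_env("CartPole-v0", max_episode_steps=2)
  env.reset()
  dones = [False]
  for _ in range(2):
    actions = np.stack([env.action_space.sample()])
    _, _, dones, _ = env.step(actions)
  self.assertTrue(dones[0])
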
def test_play_env_problem_with_policy(self):
  env = env_problem.EnvProblem(
      base_env_name="CartPole-v0", batch_size=2, reward_range=(-1, 1))

  # Let's make sure that at most 4 observations come to the policy function.
  len_history_for_policy = 4

  def policy_fun(observations, rng=None):
    b, t = observations.shape[:2]
    # Assert that observations from time-step len_history_for_policy onwards
    # are zeros.
    self.assertTrue(
        np.all(observations[:, len_history_for_policy:, ...] == 0))
    self.assertFalse(
        np.all(observations[:, :len_history_for_policy, ...] == 0))
    a = env.action_space.n
    p = np.random.uniform(size=(b, t, a))
    p = np.exp(p)
    p = p / np.sum(p, axis=-1, keepdims=True)
    return np.log(p), (), rng

  max_timestep = 15
  num_trajectories = 2
  trajectories, _, _ = env_problem_utils.play_env_problem_with_policy(
      env,
      policy_fun,
      num_trajectories=num_trajectories,
      max_timestep=max_timestep,
      len_history_for_policy=len_history_for_policy)

  self.assertEqual(num_trajectories, len(trajectories))

  # Check shapes within trajectories.
  traj = trajectories[0]
  T = traj[1].shape[0]  # pylint: disable=invalid-name
  self.assertEqual((T + 1, 4), traj[0].shape)  # (4,) is OBS
  self.assertEqual((T,), traj[2].shape)
  self.assertLessEqual(T, max_timestep)

  traj = trajectories[1]
  T = traj[1].shape[0]  # pylint: disable=invalid-name
  self.assertEqual((T + 1, 4), traj[0].shape)
  self.assertEqual((T,), traj[2].shape)
  self.assertLessEqual(T, max_timestep)

def test_setup(self):
  ep = env_problem.EnvProblem(base_env_name="CartPole-v0", batch_size=5)

  # Checks that environments were created and that they are `batch_size` in
  # number.
  ep.assert_common_preconditions()

  # Expectations on the observation space.
  observation_space = ep.observation_space
  self.assertIsInstance(observation_space, Box)
  self.assertEqual(observation_space.shape, (4,))
  self.assertEqual(observation_space.dtype, np.float32)

  # Expectations on the action space.
  action_space = ep.action_space
  self.assertIsInstance(action_space, Discrete)
  self.assertEqual(action_space.shape, ())
  self.assertEqual(action_space.dtype, np.int64)
  self.assertEqual(ep.num_actions, 2)

  # Reward range is infinite here.
  self.assertFalse(ep.is_reward_range_finite)

def make_env(batch_size=8, **env_kwargs):
  """Creates the env."""
  if FLAGS.clip_rewards:
    env_kwargs.update({"reward_range": (-1, 1), "discrete_rewards": True})
  else:
    env_kwargs.update({"discrete_rewards": False})

  # TODO(afrozm): Should we leave out some cores?
  parallelism = multiprocessing.cpu_count() if FLAGS.parallelize_envs else 1

  # No resizing needed, so let's be on the normal EnvProblem.
  if not FLAGS.resize:  # None or False
    return env_problem.EnvProblem(
        base_env_name=FLAGS.env_problem_name,
        batch_size=batch_size,
        parallelism=parallelism,
        **env_kwargs)

  max_timestep = None
  try:
    max_timestep = int(FLAGS.max_timestep)
  except Exception:  # pylint: disable=broad-except
    pass

  wrapper_fn = functools.partial(
      gym_utils.gym_env_wrapper,
      **{
          "rl_env_max_episode_steps": max_timestep,
          "maxskip_env": True,
          "rendered_env": True,
          "rendered_env_resize_to": (FLAGS.resized_height, FLAGS.resized_width),
          "sticky_actions": False,
          "output_dtype": onp.int32 if FLAGS.use_tpu else None,
      })

  return rendered_env_problem.RenderedEnvProblem(
      base_env_name=FLAGS.env_problem_name,
      batch_size=batch_size,
      parallelism=parallelism,
      env_wrapper_fn=wrapper_fn,
      **env_kwargs)