def test_pickleable(self):
    """Verify the environment survives a pickle round trip intact."""
    env = GridWorldEnv(desc="8x8")
    restored = pickle.loads(pickle.dumps(env))
    assert restored
    # The reconstructed env must resume from the same start state.
    assert restored.start_state == env.start_state
    step_env(restored)
    restored.close()
    env.close()
def test_does_not_modify_action(self):
    """Ensure ``env.step`` does not mutate the caller's action.

    The saved reference must be an independent copy: the original
    ``a_copy = a`` merely aliased the same object, so an in-place
    mutation of a mutable action by ``step`` could never have been
    detected by the final equality assertion.
    """
    import copy
    env = GridWorldEnv(desc='8x8')
    a = env.action_space.sample()
    a_copy = copy.copy(a)  # independent snapshot, not an alias
    env.reset()
    env.step(a)
    assert a == a_copy
    env.close()
def test_does_not_modify_action(self):
    """Ensure ``env.step`` does not mutate the caller's action.

    The saved reference must be an independent copy: the original
    ``a_copy = a`` merely aliased the same object, so an in-place
    mutation of a mutable action by ``step`` could never have been
    detected by the final equality assertion.
    """
    import copy
    env = GridWorldEnv(desc="8x8")
    a = env.action_space.sample()
    a_copy = copy.copy(a)  # independent snapshot, not an alias
    env.reset()
    env.step(a)
    self.assertEqual(a, a_copy)
    env.close()
def setup_method(self):
    """Build the shared 4x4 grid-world fixture for each test."""
    # Deterministic action sequence the scripted policy will replay.
    scripted = [2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1]
    self.env = TfEnv(GridWorldEnv(desc='4x4'))
    self.policy = ScriptedPolicy(scripted_actions=scripted)
    self.algo = Mock(env_spec=self.env.spec,
                     policy=self.policy,
                     max_path_length=16)
def setup_method(self):
    """Start a local Ray session and build the 4x4 grid-world fixture."""
    # local_mode keeps all Ray work in-process for deterministic tests;
    # ignore_reinit_error tolerates a session left over from another test.
    ray.init(local_mode=True, ignore_reinit_error=True)
    scripted = [2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1]
    self.env = TfEnv(GridWorldEnv(desc='4x4'))
    self.policy = ScriptedPolicy(scripted_actions=scripted)
    self.algo = Mock(env_spec=self.env.spec,
                     policy=self.policy,
                     max_path_length=16)
def other_envs():
    """Return wrapped grid worlds whose layouts differ from ``envs()``."""
    layouts = (
        ['FFFS', 'FHFH', 'FFFH', 'HFFG'],
        ['FFSF', 'FFFH', 'FHFH', 'HFFG'],
        ['FFFF', 'FFSH', 'FHFH', 'FFFG'],
        ['FFFF', 'FFFF', 'FSFF', 'FFFF'],
        ['HHFF', 'HHHF', 'HSHF', 'HHHF'],
    )
    return [TfEnv(GridWorldEnv(desc=layout)) for layout in layouts]
def envs():
    """Return a set of wrapped 4x4 grid worlds, all starting at 'S'."""
    layouts = (
        ['SFFF', 'FHFH', 'FFFH', 'HFFG'],
        ['SFFF', 'FFFH', 'FHFH', 'HFFG'],
        ['SFFF', 'FFFH', 'FHFH', 'FFFG'],
        ['SFFF', 'FFFF', 'FFFF', 'FFFF'],
        ['SHFF', 'HHFF', 'FFFF', 'FFFF'],
    )
    return [TfEnv(GridWorldEnv(desc=layout)) for layout in layouts]
def test_obtain_samples(ray_local_session_fixture):
    """Check RaySampler and LocalSampler agree on a scripted 4x4 grid."""
    del ray_local_session_fixture  # needed only for its session side effect
    env = GridWorldEnv(desc='4x4')
    policy = ScriptedPolicy(
        scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
    algo = Mock(env_spec=env.spec, policy=policy, max_episode_length=16)
    assert ray.is_initialized()
    workers = WorkerFactory(seed=100,
                            max_episode_length=algo.max_episode_length,
                            n_workers=8)
    ray_sampler = RaySampler.from_worker_factory(workers, policy, env)
    local_sampler = LocalSampler.from_worker_factory(workers, policy, env)
    params = tuple(algo.policy.get_param_values())
    eps_ray = ray_sampler.obtain_samples(0, 1000, params)
    eps_local = local_sampler.obtain_samples(0, 1000, params)
    assert eps_ray.observations.shape[0] >= 1000
    assert eps_ray.actions.shape[0] >= 1000
    # Both samplers must earn exactly one reward in their first episode.
    first_ray = sum(eps_ray.rewards[:eps_ray.lengths[0]])
    first_local = sum(eps_local.rewards[:eps_local.lengths[0]])
    assert first_ray == first_local == 1
    expected_obs = np.array([0, 1, 2, 6, 10, 14])
    expected_acts = np.array([2, 2, 1, 1, 1, 2])
    expected_rews = np.array([0, 0, 0, 0, 0, 1])
    offset = 0
    # Every episode collected by the Ray sampler replays the same script.
    for ep_len in eps_ray.lengths:
        stop = offset + ep_len
        assert np.array_equal(eps_ray.observations[offset:stop], expected_obs)
        assert np.array_equal(eps_ray.actions[offset:stop], expected_acts)
        assert np.array_equal(eps_ray.rewards[offset:stop], expected_rews)
        offset = stop
    ray_sampler.shutdown_worker()
    local_sampler.shutdown_worker()
    env.close()
def test_obtain_samples():
    """Check MultiprocessingSampler and LocalSampler agree on a 4x4 grid."""
    env = GarageEnv(GridWorldEnv(desc='4x4'))
    policy = ScriptedPolicy(
        scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
    algo = Mock(env_spec=env.spec, policy=policy, max_episode_length=16)
    workers = WorkerFactory(seed=100,
                            max_episode_length=algo.max_episode_length,
                            n_workers=8)
    mp_sampler = MultiprocessingSampler.from_worker_factory(
        workers, policy, env)
    local_sampler = LocalSampler.from_worker_factory(workers, policy, env)
    params = tuple(algo.policy.get_param_values())
    batch_mp = mp_sampler.obtain_samples(0, 1000, params)
    batch_local = local_sampler.obtain_samples(0, 1000, params)
    assert batch_mp.observations.shape[0] >= 1000
    assert batch_mp.actions.shape[0] >= 1000
    # Both samplers must earn exactly one reward in their first trajectory.
    first_mp = sum(batch_mp.rewards[:batch_mp.lengths[0]])
    first_local = sum(batch_local.rewards[:batch_local.lengths[0]])
    assert first_mp == first_local == 1
    expected_obs = np.array([0, 1, 2, 6, 10, 14])
    expected_acts = np.array([2, 2, 1, 1, 1, 2])
    expected_rews = np.array([0, 0, 0, 0, 0, 1])
    offset = 0
    # Every trajectory from the multiprocessing sampler replays the script.
    for traj_len in batch_mp.lengths:
        stop = offset + traj_len
        assert np.array_equal(batch_mp.observations[offset:stop],
                              expected_obs)
        assert np.array_equal(batch_mp.actions[offset:stop], expected_acts)
        assert np.array_equal(batch_mp.rewards[offset:stop], expected_rews)
        offset = stop
    mp_sampler.shutdown_worker()
    local_sampler.shutdown_worker()
    env.close()
def test_ray_batch_sampler(ray_local_session_fixture):
    """Compare RaySampler against OnPolicyVectorizedSampler on a 4x4 grid."""
    del ray_local_session_fixture  # needed only for its session side effect
    env = TfEnv(GridWorldEnv(desc='4x4'))
    policy = ScriptedPolicy(
        scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
    algo = Mock(env_spec=env.spec, policy=policy, max_path_length=16)
    assert ray.is_initialized()
    workers = WorkerFactory(seed=100, max_path_length=algo.max_path_length)
    ray_sampler = RaySampler(workers, policy, env)
    ray_sampler.start_worker()
    vec_sampler = OnPolicyVectorizedSampler(algo, env)
    vec_sampler.start_worker()
    params = tuple(algo.policy.get_param_values())
    batch_ray = ray_sampler.obtain_samples(0, 1000, params)
    batch_vec = vec_sampler.obtain_samples(0, 1000)
    assert batch_ray.observations.shape[0] >= 1000
    assert batch_ray.actions.shape[0] >= 1000
    # Both samplers must earn exactly one reward in their first rollout.
    first_ray = sum(batch_ray.rewards[:batch_ray.lengths[0]])
    first_vec = sum(batch_vec[0]['rewards'])
    assert first_ray == first_vec == 1
    expected_obs = np.array([0, 1, 2, 6, 10, 14])
    expected_acts = np.array([2, 2, 1, 1, 1, 2])
    expected_rews = np.array([0, 0, 0, 0, 0, 1])
    offset = 0
    # Every rollout collected via Ray replays the scripted trajectory.
    for rollout_len in batch_ray.lengths:
        stop = offset + rollout_len
        assert np.array_equal(batch_ray.observations[offset:stop],
                              expected_obs)
        assert np.array_equal(batch_ray.actions[offset:stop], expected_acts)
        assert np.array_equal(batch_ray.rewards[offset:stop], expected_rews)
        offset = stop
    ray_sampler.shutdown_worker()
    vec_sampler.shutdown_worker()
    env.close()
def env():
    """Fixture: a TF-wrapped 4x4 grid-world environment."""
    grid_world = GridWorldEnv(desc='4x4')
    return TfEnv(grid_world)