Esempio n. 1
0
    def test_local_batch_sampler(self):
        workers = WorkerFactory(seed=100,
                                max_path_length=self.algo.max_path_length)
        sampler1 = LocalSampler.from_worker_factory(workers, self.policy,
                                                    self.env)
        sampler2 = OnPolicyVectorizedSampler(self.algo, self.env)
        sampler2.start_worker()
        trajs1 = sampler1.obtain_samples(
            0, 1000, tuple(self.algo.policy.get_param_values()))
        trajs2 = sampler2.obtain_samples(0, 1000)
        # pylint: disable=superfluous-parens
        assert trajs1.observations.shape[0] >= 1000
        assert trajs1.actions.shape[0] >= 1000
        assert (sum(trajs1.rewards[:trajs1.lengths[0]]) == sum(
            trajs2[0]['rewards']) == 1)

        true_obs = np.array([0, 1, 2, 6, 10, 14])
        true_actions = np.array([2, 2, 1, 1, 1, 2])
        true_rewards = np.array([0, 0, 0, 0, 0, 1])
        start = 0
        for length in trajs1.lengths:
            observations = trajs1.observations[start:start + length]
            actions = trajs1.actions[start:start + length]
            rewards = trajs1.rewards[start:start + length]
            assert np.array_equal(observations, true_obs)
            assert np.array_equal(actions, true_actions)
            assert np.array_equal(rewards, true_rewards)
            start += length
        sampler1.shutdown_worker()
        sampler2.shutdown_worker()
Esempio n. 2
0
    def test_ray_batch_sampler(self):
        sampler1 = RaySampler(self.algo,
                              self.env,
                              seed=100,
                              num_processors=1,
                              sampler_worker_cls=SamplerWorker)
        sampler1.start_worker()
        sampler2 = OnPolicyVectorizedSampler(self.algo, self.env)
        sampler2.start_worker()
        trajs1 = sampler1.obtain_samples(0, 16)
        trajs2 = sampler2.obtain_samples(0, 1)
        assert (trajs1[0]['observations'].shape == np.array(
            trajs2[0]['observations']).shape == (6, 16))
        traj2_action_shape = np.array(trajs2[0]['actions']).shape
        assert (trajs1[0]['actions'].shape == traj2_action_shape == (6, 4))
        assert (sum(trajs1[0]['rewards']) == sum(trajs2[0]['rewards']) == 1)

        true_obs = np.array(
            [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
             [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
             [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
             [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
             [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
             [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]])
        true_actions = np.array([[0., 0., 1., 0.], [0., 0., 1., 0.],
                                 [0., 1., 0., 0.], [0., 1., 0., 0.],
                                 [0., 1., 0., 0.], [0., 0., 1., 0.]])
        true_rewards = np.array([0, 0, 0, 0, 0, 1])
        for trajectory in trajs1:
            assert (np.array_equal(trajectory['observations'], true_obs))
            assert (np.array_equal(trajectory['actions'], true_actions))
            assert (np.array_equal(trajectory['rewards'], true_rewards))
        sampler1.shutdown_worker()
        sampler2.shutdown_worker()
Esempio n. 3
0
    def test_ray_batch_sampler(self):
        workers = WorkerFactory(seed=100,
                                max_path_length=self.algo.max_path_length)
        sampler1 = RaySampler(workers, self.policy, self.env, num_processors=1)
        sampler1.start_worker()
        sampler2 = OnPolicyVectorizedSampler(self.algo, self.env)
        sampler2.start_worker()
        trajs1 = sampler1.obtain_samples(
            0, 1000, tuple(self.algo.policy.get_param_values()))
        trajs2 = sampler2.obtain_samples(0, 1000)
        # pylint: disable=superfluous-parens
        assert (trajs1[0]['observations'].shape == np.array(
            trajs2[0]['observations']).shape == (6, ))
        traj2_action_shape = np.array(trajs2[0]['actions']).shape
        assert trajs1[0]['actions'].shape == traj2_action_shape == (6, )
        assert sum(trajs1[0]['rewards']) == sum(trajs2[0]['rewards']) == 1

        true_obs = np.array([0, 1, 2, 6, 10, 14])
        true_actions = np.array([2, 2, 1, 1, 1, 2])
        true_rewards = np.array([0, 0, 0, 0, 0, 1])
        for trajectory in trajs1:
            assert np.array_equal(trajectory['observations'], true_obs)
            assert np.array_equal(trajectory['actions'], true_actions)
            assert np.array_equal(trajectory['rewards'], true_rewards)
        sampler1.shutdown_worker()
        sampler2.shutdown_worker()
Esempio n. 4
0
def test_ray_batch_sampler(ray_local_session_fixture):
    del ray_local_session_fixture
    env = TfEnv(GridWorldEnv(desc='4x4'))
    policy = ScriptedPolicy(
        scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
    algo = Mock(env_spec=env.spec, policy=policy, max_path_length=16)
    assert ray.is_initialized()
    workers = WorkerFactory(seed=100, max_path_length=algo.max_path_length)
    sampler1 = RaySampler(workers, policy, env)
    sampler1.start_worker()
    sampler2 = OnPolicyVectorizedSampler(algo, env)
    sampler2.start_worker()
    trajs1 = sampler1.obtain_samples(0, 1000,
                                     tuple(algo.policy.get_param_values()))
    trajs2 = sampler2.obtain_samples(0, 1000)
    # pylint: disable=superfluous-parens
    assert trajs1.observations.shape[0] >= 1000
    assert trajs1.actions.shape[0] >= 1000
    assert (sum(trajs1.rewards[:trajs1.lengths[0]]) == sum(
        trajs2[0]['rewards']) == 1)

    true_obs = np.array([0, 1, 2, 6, 10, 14])
    true_actions = np.array([2, 2, 1, 1, 1, 2])
    true_rewards = np.array([0, 0, 0, 0, 0, 1])
    start = 0
    for length in trajs1.lengths:
        observations = trajs1.observations[start:start + length]
        actions = trajs1.actions[start:start + length]
        rewards = trajs1.rewards[start:start + length]
        assert np.array_equal(observations, true_obs)
        assert np.array_equal(actions, true_actions)
        assert np.array_equal(rewards, true_rewards)
        start += length
    sampler1.shutdown_worker()
    sampler2.shutdown_worker()
    env.close()