Example #1
def test_update_envs_env_update():
    max_path_length = 16
    env = TfEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_path_length)
                         ])
    tasks = SetTaskSampler(PointEnv)
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = MultiprocessingSampler.from_worker_factory(workers, policy, env)
    rollouts = sampler.obtain_samples(0,
                                      161,
                                      np.asarray(policy.get_param_values()),
                                      env_update=tasks.sample(n_workers))
    mean_rewards = []
    goals = []
    for rollout in rollouts.split():
        mean_rewards.append(rollout.rewards.mean())
        goals.append(rollout.env_infos['task'][0]['goal'])
    assert np.var(mean_rewards) > 0
    assert np.var(goals) > 0
    with pytest.raises(ValueError):
        sampler.obtain_samples(0,
                               10,
                               np.asarray(policy.get_param_values()),
                               env_update=tasks.sample(n_workers + 1))
    sampler.shutdown_worker()
    env.close()
Example #2
    def test_cem_cartpole(self):
        """Test CEM with Cartpole-v1 environment."""
        with LocalTFRunner(snapshot_config) as runner:
            env = TfEnv(env_name='CartPole-v1')

            policy = CategoricalMLPPolicy(name='policy',
                                          env_spec=env.spec,
                                          hidden_sizes=(32, 32))

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            n_samples = 10

            algo = CEM(env_spec=env.spec,
                       policy=policy,
                       baseline=baseline,
                       best_frac=0.1,
                       max_path_length=100,
                       n_samples=n_samples)

            runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler)
            rtn = runner.train(n_epochs=10,
                               batch_size=2048,
                               n_epoch_cycles=n_samples)
            assert rtn > 40

            env.close()
Example #3
def test_obtain_exact_trajectories():
    max_path_length = 15
    n_workers = 8
    env = TfEnv(PointEnv())
    per_worker_actions = [env.action_space.sample() for _ in range(n_workers)]
    policies = [
        FixedPolicy(env.spec, [action] * max_path_length)
        for action in per_worker_actions
    ]
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = MultiprocessingSampler.from_worker_factory(workers,
                                                         policies,
                                                         envs=env)
    n_traj_per_worker = 3
    rollouts = sampler.obtain_exact_trajectories(n_traj_per_worker,
                                                 agent_update=policies)
    # At least one action per trajectory.
    assert sum(rollouts.lengths) >= n_workers * n_traj_per_worker
    # All of the trajectories.
    assert len(rollouts.lengths) == n_workers * n_traj_per_worker
    worker = -1
    for count, rollout in enumerate(rollouts.split()):
        if count % n_traj_per_worker == 0:
            worker += 1
        assert (rollout.actions == per_worker_actions[worker]).all()
    sampler.shutdown_worker()
    env.close()
Example #4
def test_init_with_crashed_worker():
    max_path_length = 16
    env = TfEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_path_length)
                         ])
    tasks = SetTaskSampler(lambda: TfEnv(PointEnv()))
    n_workers = 2
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)

    class CrashingPolicy:
        def reset(self, **kwargs):
            raise Exception('Intentional subprocess crash')

    bad_policy = CrashingPolicy()

    # This causes worker 2 to crash.
    sampler = MultiprocessingSampler.from_worker_factory(
        workers, [policy, bad_policy], envs=tasks.sample(n_workers))
    rollouts = sampler.obtain_samples(0, 160, None)
    assert sum(rollouts.lengths) >= 160
    sampler.shutdown_worker()
    env.close()
Example #5
    def test_ppo_pendulum_recurrent(self):
        """Test PPO with Pendulum environment and recurrent policy."""
        logger.reset()
        env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
        policy = GaussianLSTMPolicy(env_spec=env.spec, )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=2048,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            lr_clip_range=0.01,
            optimizer_args=dict(batch_size=32, max_epochs=10),
            plot=False,
        )
        last_avg_ret = algo.train(sess=self.sess)
        assert last_avg_ret > 40

        env.close()
Example #6
    def test_ddpg_pendulum(self):
        """Test DDPG with Pendulum environment."""
        with LocalRunner(self.sess) as runner:
            env = TfEnv(gym.make('InvertedDoublePendulum-v2'))
            action_noise = OUStrategy(env.spec, sigma=0.2)
            policy = ContinuousMLPPolicy(env_spec=env.spec,
                                         hidden_sizes=[64, 64],
                                         hidden_nonlinearity=tf.nn.relu,
                                         output_nonlinearity=tf.nn.tanh)
            qf = ContinuousMLPQFunction(env_spec=env.spec,
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=tf.nn.relu)
            replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                               size_in_transitions=int(1e6),
                                               time_horizon=100)
            algo = DDPG(
                env_spec=env.spec,
                policy=policy,
                policy_lr=1e-4,
                qf_lr=1e-3,
                qf=qf,
                replay_buffer=replay_buffer,
                target_update_tau=1e-2,
                n_train_steps=50,
                discount=0.9,
                min_buffer_size=int(1e4),
                exploration_strategy=action_noise,
            )
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10,
                                        n_epoch_cycles=20,
                                        batch_size=100)
            assert last_avg_ret > 60

            env.close()
Example #7
    def test_npo_pendulum(self):
        """Test NPO with Pendulum environment."""
        with LocalRunner(self.sess) as runner:
            logger.reset()
            env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
            policy = GaussianMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=(64, 64),
                hidden_nonlinearity=tf.nn.tanh,
                output_nonlinearity=None,
            )
            baseline = GaussianMLPBaseline(
                env_spec=env.spec,
                regressor_args=dict(hidden_sizes=(32, 32)),
            )
            algo = NPO(
                env=env,
                policy=policy,
                baseline=baseline,
                max_path_length=100,
                discount=0.99,
                gae_lambda=0.98,
                policy_ent_coeff=0.0)
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 20

            env.close()
Example #8
    def test_ppo_pendulum_with_model(self):
        """Test PPO with model, with Pendulum environment."""
        with LocalRunner(self.sess) as runner:
            env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
            policy = GaussianMLPPolicyWithModel(
                env_spec=env.spec,
                hidden_sizes=(64, 64),
                hidden_nonlinearity=tf.nn.tanh,
                output_nonlinearity=None,
            )
            baseline = GaussianMLPBaselineWithModel(
                env_spec=env.spec,
                regressor_args=dict(hidden_sizes=(32, 32)),
            )
            algo = PPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=100,
                discount=0.99,
                lr_clip_range=0.01,
                optimizer_args=dict(batch_size=32, max_epochs=10),
            )
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 30

            env.close()
Example #9
    def test_ppo_pendulum_recurrent_continuous_baseline(self):
        """Test PPO with Pendulum environment and recurrent policy."""
        with LocalTFRunner(snapshot_config) as runner:
            env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
            policy = GaussianLSTMPolicy(env_spec=env.spec, )
            baseline = ContinuousMLPBaseline(
                env_spec=env.spec,
                regressor_args=dict(hidden_sizes=(32, 32)),
            )
            algo = PPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=100,
                discount=0.99,
                gae_lambda=0.95,
                lr_clip_range=0.2,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                ),
                stop_entropy_gradient=True,
                entropy_method='max',
                policy_ent_coeff=0.02,
                center_adv=False,
            )
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 100

            env.close()
Example #10
    def test_trpo_cnn_cubecrash(self):
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            env = TfEnv(normalize(gym.make('CubeCrash-v0')))

            policy = CategoricalCNNPolicy(env_spec=env.spec,
                                          conv_filters=(32, 64),
                                          conv_filter_sizes=(8, 4),
                                          conv_strides=(4, 2),
                                          conv_pad='VALID',
                                          hidden_sizes=(32, 32))

            baseline = GaussianCNNBaseline(env_spec=env.spec,
                                           regressor_args=dict(
                                               num_filters=(32, 64),
                                               filter_dims=(8, 4),
                                               strides=(4, 2),
                                               padding='VALID',
                                               hidden_sizes=(32, 32),
                                               use_trust_region=True))

            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        max_path_length=100,
                        discount=0.99,
                        gae_lambda=0.98,
                        max_kl_step=0.01,
                        policy_ent_coeff=0.0,
                        flatten_input=False)

            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > -0.9

            env.close()
Example #11
    def test_cma_es_cartpole(self):
        """Test CMAES with Cartpole-v1 environment."""
        with LocalRunner() as runner:
            env = TfEnv(env_name="CartPole-v1")

            policy = CategoricalMLPPolicy(
                name="policy", env_spec=env.spec, hidden_sizes=(32, 32))

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            runner.initialize_tf_vars()

            n_samples = 20

            algo = CMAES(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=100,
                n_samples=n_samples)

            runner.setup(algo, env, sampler_cls=OnPolicyVectorizedSampler)
            runner.train(n_epochs=1, batch_size=1000, n_epoch_cycles=n_samples)
            # No assertion on return because CMAES is not stable.

            env.close()
Example #12
    def test_dm_control_tf_policy(self):
        task = ALL_TASKS[0]

        with LocalRunner(sess=self.sess) as runner:
            env = TfEnv(DmControlEnv.from_suite(*task))

            policy = GaussianMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=(32, 32),
            )

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = TRPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=5,
                discount=0.99,
                max_kl_step=0.01,
            )

            runner.setup(algo, env)
            runner.train(n_epochs=1, batch_size=10)

            env.close()
Example #13
    def test_baseline(self):
        """Test the baseline initialization."""
        box_env = TfEnv(DummyBoxEnv())
        deterministic_mlp_baseline = ContinuousMLPBaseline(env_spec=box_env)
        gaussian_mlp_baseline = GaussianMLPBaseline(env_spec=box_env)

        self.sess.run(tf.compat.v1.global_variables_initializer())
        deterministic_mlp_baseline.get_param_values()
        gaussian_mlp_baseline.get_param_values()

        box_env.close()
Example #14
    def test_dqn_cartpole_pickle(self):
        """Test DQN with CartPole environment."""
        with LocalRunner(self.sess) as runner:
            n_epochs = 10
            n_epoch_cycles = 10
            sampler_batch_size = 500
            num_timesteps = n_epochs * n_epoch_cycles * sampler_batch_size
            env = TfEnv(gym.make('CartPole-v0'))
            replay_buffer = SimpleReplayBuffer(
                env_spec=env.spec,
                size_in_transitions=int(1e4),
                time_horizon=1)
            qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
            policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf)
            epsilon_greedy_strategy = EpsilonGreedyStrategy(
                env_spec=env.spec,
                total_timesteps=num_timesteps,
                max_epsilon=1.0,
                min_epsilon=0.02,
                decay_ratio=0.1)
            algo = DQN(
                env_spec=env.spec,
                policy=policy,
                qf=qf,
                exploration_strategy=epsilon_greedy_strategy,
                replay_buffer=replay_buffer,
                qf_lr=1e-4,
                discount=1.0,
                min_buffer_size=int(1e3),
                double_q=False,
                n_train_steps=500,
                grad_norm_clipping=5.0,
                n_epoch_cycles=n_epoch_cycles,
                target_network_update_freq=1,
                buffer_batch_size=32)
            runner.setup(algo, env)
            with tf.variable_scope(
                    'DiscreteMLPQFunction/MLPModel/mlp/hidden_0', reuse=True):
                bias = tf.get_variable('bias')
                # assign it to all ones
                old_bias = tf.ones_like(bias).eval()
                bias.load(old_bias)
                h = pickle.dumps(algo)

            with tf.Session(graph=tf.Graph()):
                pickle.loads(h)
                with tf.variable_scope(
                        'DiscreteMLPQFunction/MLPModel/mlp/hidden_0',
                        reuse=True):
                    new_bias = tf.get_variable('bias')
                    new_bias = new_bias.eval()
                    assert np.array_equal(old_bias, new_bias)

            env.close()
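
The pickle test above follows a simple pattern: mutate a parameter, serialize the algorithm, deserialize it in a fresh graph, and check that the parameter value survived. A framework-free sketch of that pattern (the TinyModel class below is made up purely for illustration):

import pickle

import numpy as np

class TinyModel:
    """Stand-in for a parameterized object whose state should survive pickling."""

    def __init__(self):
        self.bias = np.zeros(4)

model = TinyModel()
model.bias = np.ones_like(model.bias)  # mutate state, like bias.load(old_bias) above
blob = pickle.dumps(model)
restored = pickle.loads(blob)
assert np.array_equal(model.bias, restored.bias)  # same check as the TF test above
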
Example #15
class TestContinuousPolicies(TfGraphTestCase):
    def setup_method(self):
        super().setup_method()
        self.env = TfEnv(DummyBoxEnv())
        self.obs_var = tf.compat.v1.placeholder(
            tf.float32,
            shape=[None, None, self.env.observation_space.flat_dim],
            name='obs')

    def teardown_method(self):
        self.env.close()
        super().teardown_method()

    def test_continuous_mlp_policy(self):
        continuous_mlp_policy = ContinuousMLPPolicy(env_spec=self.env,
                                                    hidden_sizes=(1, ))
        self.sess.run(tf.compat.v1.global_variables_initializer())

        obs = self.env.observation_space.high
        assert continuous_mlp_policy.get_action(obs)

    def test_gaussian_gru_policy(self):
        gaussian_gru_policy = GaussianGRUPolicy(env_spec=self.env,
                                                hidden_dim=1,
                                                state_include_action=False)
        self.sess.run(tf.compat.v1.global_variables_initializer())

        gaussian_gru_policy.build(self.obs_var)
        gaussian_gru_policy.reset()

        obs = self.env.observation_space.high
        assert gaussian_gru_policy.get_action(obs)

    def test_gaussian_lstm_policy(self):
        gaussian_lstm_policy = GaussianLSTMPolicy(env_spec=self.env,
                                                  hidden_dim=1,
                                                  state_include_action=False)
        self.sess.run(tf.compat.v1.global_variables_initializer())

        gaussian_lstm_policy.build(self.obs_var)
        gaussian_lstm_policy.reset()

        obs = self.env.observation_space.high
        assert gaussian_lstm_policy.get_action(obs)

    def test_gaussian_mlp_policy(self):
        gaussian_mlp_policy = GaussianMLPPolicy(env_spec=self.env,
                                                hidden_sizes=(1, ))
        self.sess.run(tf.compat.v1.global_variables_initializer())
        gaussian_mlp_policy.build(self.obs_var)

        obs = self.env.observation_space.high
        assert gaussian_mlp_policy.get_action(obs)
Example #16
class TestContinuousPolicies(TfGraphTestCase):
    def setUp(self):
        super().setUp()
        self.env = TfEnv(DummyBoxEnv())

    def tearDown(self):
        self.env.close()
        super().tearDown()

    def test_continuous_mlp_policy(self):
        continuous_mlp_policy = ContinuousMLPPolicy(env_spec=self.env,
                                                    hidden_sizes=(1, ))
        self.sess.run(tf.global_variables_initializer())

        obs = self.env.observation_space.high
        assert continuous_mlp_policy.get_action(obs)

    def test_deterministic_mlp_policy(self):
        deterministic_mlp_policy = DeterministicMLPPolicy(env_spec=self.env,
                                                          hidden_sizes=(1, ))
        self.sess.run(tf.global_variables_initializer())

        obs = self.env.observation_space.high
        assert deterministic_mlp_policy.get_action(obs)

    def test_gaussian_gru_policy(self):
        gaussian_gru_policy = GaussianGRUPolicy(env_spec=self.env,
                                                hidden_dim=1)
        self.sess.run(tf.global_variables_initializer())

        gaussian_gru_policy.reset()

        obs = self.env.observation_space.high
        assert gaussian_gru_policy.get_action(obs)

    def test_gaussian_lstm_policy(self):
        gaussian_lstm_policy = GaussianLSTMPolicy(env_spec=self.env,
                                                  hidden_dim=1)
        self.sess.run(tf.global_variables_initializer())

        gaussian_lstm_policy.reset()

        obs = self.env.observation_space.high
        assert gaussian_lstm_policy.get_action(obs)

    def test_gaussian_mlp_policy(self):
        gaussian_mlp_policy = GaussianMLPPolicy(env_spec=self.env,
                                                hidden_sizes=(1, ))
        self.sess.run(tf.global_variables_initializer())

        obs = self.env.observation_space.high
        assert gaussian_mlp_policy.get_action(obs)
Example #17
class TestGrayscale(unittest.TestCase):
    def setUp(self):
        self.env = TfEnv(DummyDiscretePixelEnv(random=False))
        self.env_g = TfEnv(Grayscale(DummyDiscretePixelEnv(random=False)))

    def tearDown(self):
        self.env.close()
        self.env_g.close()

    def test_gray_scale_invalid_environment_type(self):
        with self.assertRaises(ValueError):
            self.env.observation_space = Discrete(64)
            Grayscale(self.env)

    def test_gray_scale_invalid_environment_shape(self):
        with self.assertRaises(ValueError):
            self.env.observation_space = Box(
                low=0, high=255, shape=(4, ), dtype=np.uint8)
            Grayscale(self.env)

    def test_grayscale_observation_space(self):
        assert self.env_g.observation_space.shape == (
            self.env.observation_space.shape[:-1])

    def test_grayscale_reset(self):
        """
        RGB to grayscale conversion using scikit-image.

        Weights used for conversion:
        Y = 0.2125 R + 0.7154 G + 0.0721 B

        Reference:
        http://scikit-image.org/docs/dev/api/skimage.color.html#skimage.color.rgb2grey
        """
        gray_scale_output = np.round(
            np.dot(self.env.reset()[:, :, :3],
                   [0.2125, 0.7154, 0.0721])).astype(np.uint8)
        np.testing.assert_array_almost_equal(gray_scale_output,
                                             self.env_g.reset())

    def test_grayscale_step(self):
        self.env.reset()
        self.env_g.reset()
        obs, _, _, _ = self.env.step(1)
        obs_g, _, _, _ = self.env_g.step(1)

        gray_scale_output = np.round(
            np.dot(obs[:, :, :3], [0.2125, 0.7154, 0.0721])).astype(np.uint8)
        np.testing.assert_array_almost_equal(gray_scale_output, obs_g)
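
A minimal, self-contained sketch of the luminance conversion described in the docstring above, using only numpy; the toy image below is illustrative and not part of the original test:

import numpy as np

# Toy 2x2 RGB image; the pixel values are arbitrary and only for illustration.
rgb = np.array([[[255, 0, 0], [0, 255, 0]],
                [[0, 0, 255], [128, 128, 128]]], dtype=np.uint8)

# Y = 0.2125 R + 0.7154 G + 0.0721 B, rounded and cast back to uint8,
# mirroring the expected output computed in test_grayscale_reset above.
weights = np.array([0.2125, 0.7154, 0.0721])
gray = np.round(rgb[:, :, :3] @ weights).astype(np.uint8)
assert gray.shape == rgb.shape[:-1]  # the channel axis is dropped
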
Example #18
class TestStackFrames(unittest.TestCase):
    def setUp(self):
        self.n_frames = 4
        self.env = TfEnv(DummyDiscrete2DEnv(random=False))
        self.env_s = TfEnv(
            StackFrames(
                DummyDiscrete2DEnv(random=False), n_frames=self.n_frames))
        self.width, self.height = self.env.observation_space.shape

    def tearDown(self):
        self.env.close()
        self.env_s.close()

    def test_stack_frames_invalid_environment_type(self):
        with self.assertRaises(ValueError):
            self.env.observation_space = Discrete(64)
            StackFrames(self.env, n_frames=4)

    def test_stack_frames_invalid_environment_shape(self):
        with self.assertRaises(ValueError):
            self.env.observation_space = Box(
                low=0, high=255, shape=(4, ), dtype=np.uint8)
            StackFrames(self.env, n_frames=4)

    def test_stack_frames_output_observation_space(self):
        assert self.env_s.observation_space.shape == (self.width, self.height,
                                                      self.n_frames)

    def test_stack_frames_for_reset(self):
        frame_stack = self.env.reset()
        for i in range(self.n_frames - 1):
            frame_stack = np.dstack((frame_stack, self.env.reset()))

        np.testing.assert_array_equal(self.env_s.reset(), frame_stack)

    def test_stack_frames_for_step(self):
        self.env.reset()
        self.env_s.reset()

        frame_stack = np.empty((self.width, self.height, self.n_frames))
        for i in range(10):
            frame_stack = frame_stack[:, :, 1:]
            obs, _, _, _ = self.env.step(1)
            frame_stack = np.dstack((frame_stack, obs))
            obs_stack, _, _, _ = self.env_s.step(1)

        np.testing.assert_array_equal(obs_stack, frame_stack)
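
For reference, a standalone numpy sketch of the sliding-window stacking that the test above reproduces by hand; frame shape and values are made up for illustration:

import numpy as np

n_frames = 4
width, height = 3, 3

# On reset the stack holds the reset frame repeated n_frames times,
# which is what test_stack_frames_for_reset above reconstructs manually.
first = np.zeros((width, height))
stack = np.dstack([first] * n_frames)

for step in range(1, 6):
    new_frame = np.full((width, height), step)       # stand-in for env.step() output
    stack = np.dstack((stack[:, :, 1:], new_frame))  # drop the oldest frame, append the newest

assert stack.shape == (width, height, n_frames)
assert (stack[:, :, -1] == 5).all()                  # the newest frame sits in the last slot
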
Example #19
    def test_dqn_cartpole_grad_clip(self):
        """Test DQN with CartPole environment."""
        with LocalRunner(self.sess) as runner:
            n_epochs = 10
            n_epoch_cycles = 10
            sampler_batch_size = 500
            num_timesteps = n_epochs * n_epoch_cycles * sampler_batch_size
            env = TfEnv(gym.make('CartPole-v0'))
            replay_buffer = SimpleReplayBuffer(
                env_spec=env.spec,
                size_in_transitions=int(1e4),
                time_horizon=1)
            qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
            policy = DiscreteQfDerivedPolicy(env_spec=env.spec, qf=qf)
            epsilon_greedy_strategy = EpsilonGreedyStrategy(
                env_spec=env.spec,
                total_timesteps=num_timesteps,
                max_epsilon=1.0,
                min_epsilon=0.02,
                decay_ratio=0.1)
            algo = DQN(
                env_spec=env.spec,
                policy=policy,
                qf=qf,
                exploration_strategy=epsilon_greedy_strategy,
                replay_buffer=replay_buffer,
                qf_lr=1e-4,
                discount=1.0,
                min_buffer_size=int(1e3),
                double_q=False,
                n_train_steps=500,
                grad_norm_clipping=5.0,
                n_epoch_cycles=n_epoch_cycles,
                target_network_update_freq=1,
                buffer_batch_size=32)

            runner.setup(algo, env)
            last_avg_ret = runner.train(
                n_epochs=n_epochs,
                n_epoch_cycles=n_epoch_cycles,
                batch_size=sampler_batch_size)
            assert last_avg_ret > 20

            env.close()
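
The EpsilonGreedyStrategy arguments above (max_epsilon, min_epsilon, decay_ratio, total_timesteps) suggest a linear anneal over the first decay_ratio fraction of training. The sketch below only illustrates that schedule; it is an assumption about the strategy's behaviour, not garage's actual implementation:

def epsilon_at(t, total_timesteps, max_epsilon=1.0, min_epsilon=0.02, decay_ratio=0.1):
    """Linearly anneal epsilon over the first decay_ratio * total_timesteps steps."""
    decay_steps = decay_ratio * total_timesteps
    frac = min(t / decay_steps, 1.0)
    return max_epsilon + frac * (min_epsilon - max_epsilon)

total = 10 * 10 * 500  # n_epochs * n_epoch_cycles * sampler_batch_size, as above
assert abs(epsilon_at(0, total) - 1.0) < 1e-12     # fully exploratory at the start
assert abs(epsilon_at(5000, total) - 0.02) < 1e-12  # fully decayed after 10% of 50,000 steps
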
Example #20
class TestRaySamplerTF():
    """
    Uses a scripted policy on the 4x4 GridWorldEnv:
    '4x4': [
        'SFFF',
        'FHFH',
        'FFFH',
        'HFFG'
    ]
    0: left
    1: down
    2: right
    3: up
    -1: no move
    'S' : starting point
    'F' or '.': free space
    'W' or 'x': wall
    'H' or 'o': hole (terminates episode)
    'G' : goal
    [2,2,1,0,3,1,1,1,2,2,1,1,1,2,2,1]
    """
    def setup_method(self):
        ray.init(local_mode=True, ignore_reinit_error=True)

        self.env = TfEnv(GridWorldEnv(desc='4x4'))
        self.policy = ScriptedPolicy(
            scripted_actions=[2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
        self.algo = Mock(env_spec=self.env.spec,
                         policy=self.policy,
                         max_path_length=16)

    def teardown_method(self):
        self.env.close()

    def test_ray_batch_sampler(self):
        workers = WorkerFactory(seed=100,
                                max_path_length=self.algo.max_path_length)
        sampler1 = RaySamplerTF(workers,
                                self.policy,
                                self.env,
                                num_processors=1)
        sampler1.start_worker()
        sampler1.shutdown_worker()
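
ScriptedPolicy here simply replays the fixed action list from the docstring. A minimal stand-in illustrating that idea follows; this sketch is not garage's actual ScriptedPolicy, and the (action, agent_info) return convention is an assumption:

class MinimalScriptedPolicy:
    """Replay a fixed list of actions, one per step."""

    def __init__(self, scripted_actions):
        self._actions = scripted_actions
        self._t = 0

    def reset(self, **kwargs):
        self._t = 0

    def get_action(self, observation):
        action = self._actions[self._t % len(self._actions)]
        self._t += 1
        return action, {}

policy = MinimalScriptedPolicy([2, 2, 1, 0, 3, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1])
assert policy.get_action(observation=None)[0] == 2  # first scripted move: right
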
Example #21
def test_init_with_env_updates():
    max_path_length = 16
    env = TfEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_path_length)
                         ])
    tasks = SetTaskSampler(lambda: TfEnv(PointEnv()))
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = MultiprocessingSampler.from_worker_factory(
        workers, policy, envs=tasks.sample(n_workers))
    rollouts = sampler.obtain_samples(0, 160, policy)
    assert sum(rollouts.lengths) >= 160
    sampler.shutdown_worker()
    env.close()
Example #22
class TestNormalizedGym(unittest.TestCase):
    def setUp(self):
        self.env = TfEnv(
            normalize(
                gym.make('Pendulum-v0'),
                normalize_reward=True,
                normalize_obs=True,
                flatten_obs=True))

    def tearDown(self):
        self.env.close()

    def test_does_not_modify_action(self):
        a = self.env.action_space.sample()
        a_copy = a
        self.env.reset()
        self.env.step(a)
        self.assertEqual(a, a_copy)

    def test_flatten(self):
        for _ in range(10):
            self.env.reset()
            for _ in range(5):
                self.env.render()
                action = self.env.action_space.sample()
                next_obs, _, done, _ = self.env.step(action)
                self.assertEqual(next_obs.shape,
                                 self.env.observation_space.low.shape)
                if done:
                    break

    def test_unflatten(self):
        for _ in range(10):
            self.env.reset()
            for _ in range(5):
                action = self.env.action_space.sample()
                next_obs, _, done, _ = self.env.step(action)
                self.assertEqual(
                    self.env.observation_space.flatten(next_obs).shape,
                    self.env.observation_space.flat_dim)
                if done:
                    break
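
Conceptually, normalize(..., normalize_obs=True) keeps running estimates of the observation mean and variance and rescales each observation with them. The sketch below is only a rough illustration of that idea; the update rule and constants are assumptions, not the normalize wrapper's actual code:

import numpy as np

class RunningObsNormalizer:
    """Exponential running mean/std normalizer for observations."""

    def __init__(self, shape, alpha=0.001, eps=1e-8):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.alpha = alpha
        self.eps = eps

    def __call__(self, obs):
        # Update running statistics, then rescale the observation.
        self.mean = (1 - self.alpha) * self.mean + self.alpha * obs
        self.var = (1 - self.alpha) * self.var + self.alpha * np.square(obs - self.mean)
        return (obs - self.mean) / (np.sqrt(self.var) + self.eps)

normalizer = RunningObsNormalizer(shape=(3,))  # Pendulum-v0 observations are 3-dimensional
obs = np.array([0.5, -0.2, 1.0])
assert normalizer(obs).shape == obs.shape      # shape is preserved, as test_flatten checks
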
Example #23
class TestDiscretePolicies(TfGraphTestCase):
    def setup_method(self):
        super().setup_method()
        self.env = TfEnv(DummyDiscreteEnv())
        self.obs_var = tf.compat.v1.placeholder(
            tf.float32,
            shape=[None, None, self.env.observation_space.flat_dim],
            name='obs')

    def teardown_method(self):
        self.env.close()
        super().teardown_method()

    def test_categorical_gru_policy(self):
        categorical_gru_policy = CategoricalGRUPolicy(
            env_spec=self.env, hidden_dim=1, state_include_action=False)
        self.sess.run(tf.compat.v1.global_variables_initializer())
        categorical_gru_policy.build(self.obs_var)
        categorical_gru_policy.reset()

        obs = self.env.observation_space.high
        assert categorical_gru_policy.get_action(obs)

    def test_categorical_lstm_policy(self):
        categorical_lstm_policy = CategoricalLSTMPolicy(
            env_spec=self.env, hidden_dim=1, state_include_action=False)
        self.sess.run(tf.compat.v1.global_variables_initializer())
        categorical_lstm_policy.build(self.obs_var)
        categorical_lstm_policy.reset()

        obs = self.env.observation_space.high
        assert categorical_lstm_policy.get_action(obs)

    def test_categorical_mlp_policy(self):
        categorical_mlp_policy = CategoricalMLPPolicy(env_spec=self.env,
                                                      hidden_sizes=(1, ))
        self.sess.run(tf.compat.v1.global_variables_initializer())
        categorical_mlp_policy.build(self.obs_var)

        obs = self.env.observation_space.high
        assert categorical_mlp_policy.get_action(obs)
Example #24
    def test_ddpg_pendulum_with_decayed_weights(self):
        """Test DDPG with Pendulum environment and decayed weights.

        This environment has a [-3, 3] action_space bound.
        """
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            env = TfEnv(normalize(gym.make('InvertedPendulum-v2')))
            policy = ContinuousMLPPolicy(env_spec=env.spec,
                                         hidden_sizes=[64, 64],
                                         hidden_nonlinearity=tf.nn.relu,
                                         output_nonlinearity=tf.nn.tanh)
            exploration_policy = AddOrnsteinUhlenbeckNoise(env.spec,
                                                           policy,
                                                           sigma=0.2)
            qf = ContinuousMLPQFunction(env_spec=env.spec,
                                        hidden_sizes=[64, 64],
                                        hidden_nonlinearity=tf.nn.relu)
            replay_buffer = SimpleReplayBuffer(env_spec=env.spec,
                                               size_in_transitions=int(1e6),
                                               time_horizon=100)
            algo = DDPG(
                env_spec=env.spec,
                policy=policy,
                policy_lr=1e-4,
                qf_lr=1e-3,
                qf=qf,
                replay_buffer=replay_buffer,
                steps_per_epoch=20,
                target_update_tau=1e-2,
                n_train_steps=50,
                discount=0.9,
                policy_weight_decay=0.01,
                qf_weight_decay=0.01,
                min_buffer_size=int(5e3),
                exploration_policy=exploration_policy,
            )
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=100)
            assert last_avg_ret > 10

            env.close()
Example #25
class TestNormalizedGym:
    def setup_method(self):
        self.env = TfEnv(
            normalize(gym.make('Pendulum-v0'),
                      normalize_reward=True,
                      normalize_obs=True,
                      flatten_obs=True))

    def teardown_method(self):
        self.env.close()

    def test_does_not_modify_action(self):
        a = self.env.action_space.sample()
        a_copy = a
        self.env.reset()
        self.env.step(a)
        assert a == a_copy

    def test_flatten(self):
        for _ in range(10):
            self.env.reset()
            for _ in range(5):
                self.env.render()
                action = self.env.action_space.sample()
                next_obs, _, done, _ = self.env.step(action)
                assert next_obs.shape == self.env.observation_space.low.shape
                if done:
                    break

    def test_unflatten(self):
        for _ in range(10):
            self.env.reset()
            for _ in range(5):
                action = self.env.action_space.sample()
                next_obs, _, done, _ = self.env.step(action)
                # yapf: disable
                assert (self.env.observation_space.flatten(next_obs).shape
                        == self.env.observation_space.flat_dim)
                # yapf: enable
                if done:
                    break
Example #26
    def test_ppo_pendulum_recurrent_continuous_baseline(self):
        """Test PPO with Pendulum environment and recurrent policy."""
        with LocalRunner() as runner:
            env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
            policy = GaussianLSTMPolicy(env_spec=env.spec, )
            baseline = ContinuousMLPBaselineWithModel(
                env_spec=env.spec,
                regressor_args=dict(hidden_sizes=(32, 32)),
            )
            algo = PPO(env_spec=env.spec,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=100,
                       discount=0.99,
                       lr_clip_range=0.01,
                       optimizer_args=dict(batch_size=32, max_epochs=10))
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 30

            env.close()
Example #27
    def test_baseline(self):
        """Test the baseline initialization."""
        box_env = TfEnv(DummyBoxEnv())
        deterministic_mlp_baseline = DeterministicMLPBaseline(env_spec=box_env)
        gaussian_mlp_baseline = GaussianMLPBaseline(env_spec=box_env)

        discrete_env = TfEnv(Resize(DummyDiscrete2DEnv(), width=64, height=64))
        gaussian_conv_baseline = GaussianConvBaseline(
            env_spec=discrete_env,
            regressor_args=dict(conv_filters=[32, 32],
                                conv_filter_sizes=[1, 1],
                                conv_strides=[1, 1],
                                conv_pads=["VALID", "VALID"],
                                hidden_sizes=(32, 32)))

        self.sess.run(tf.global_variables_initializer())
        deterministic_mlp_baseline.get_param_values(trainable=True)
        gaussian_mlp_baseline.get_param_values(trainable=True)
        gaussian_conv_baseline.get_param_values(trainable=True)

        box_env.close()
Example #28
class TestRepeatAction(unittest.TestCase):
    def setUp(self):
        self.env = TfEnv(DummyDiscreteEnv(random=False))
        self.env_r = TfEnv(
            RepeatAction(DummyDiscreteEnv(random=False), n_frame_to_repeat=4))

    def tearDown(self):
        self.env.close()
        self.env_r.close()

    def test_repeat_action_reset(self):
        np.testing.assert_array_equal(self.env.reset(), self.env_r.reset())

    def test_repeat_action_step(self):
        self.env.reset()
        self.env_r.reset()
        obs_repeat, _, _, _ = self.env_r.step(1)
        for i in range(4):
            obs, _, _, _ = self.env.step(1)

        np.testing.assert_array_equal(obs, obs_repeat)
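
RepeatAction steps the wrapped environment n_frame_to_repeat times with the same action, which is why the test compares it against four manual steps of the plain environment. A simplified gym-style sketch of that behaviour (not the garage implementation; reward accumulation is omitted because the test only compares observations):

class MinimalRepeatAction:
    """Apply each chosen action n times on the wrapped env and return the last result."""

    def __init__(self, env, n_frame_to_repeat=4):
        self.env = env
        self.n_frame_to_repeat = n_frame_to_repeat

    def reset(self, **kwargs):
        return self.env.reset(**kwargs)

    def step(self, action):
        for _ in range(self.n_frame_to_repeat):
            obs, reward, done, info = self.env.step(action)
            if done:
                break
        return obs, reward, done, info
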
Example #29
    def test_erwr_cartpole(self):
        """Test ERWR with Cartpole-v1 environment."""
        env = TfEnv(env_name="CartPole-v1")

        policy = CategoricalMLPPolicy(name="policy",
                                      env_spec=env.spec,
                                      hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = ERWR(env=env,
                    policy=policy,
                    baseline=baseline,
                    batch_size=10000,
                    max_path_length=100,
                    n_itr=10,
                    discount=0.99)

        last_avg_ret = algo.train(sess=self.sess)
        assert last_avg_ret > 80

        env.close()
Example #30
    def test_reps_cartpole(self):
        """Test REPS with gym Cartpole environment."""
        with LocalRunner(sess=self.sess) as runner:
            env = TfEnv(gym.make('CartPole-v0'))

            policy = CategoricalMLPPolicy(env_spec=env.spec,
                                          hidden_sizes=[32, 32])

            baseline = LinearFeatureBaseline(env_spec=env.spec)

            algo = REPS(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        max_path_length=100,
                        discount=0.99)

            runner.setup(algo, env)

            last_avg_ret = runner.train(n_epochs=10, batch_size=4000)
            assert last_avg_ret > 5

            env.close()