Example #1
    def test_fit_without_trusted_region(self):
        box_env_spec = GymEnv(DummyBoxEnv(obs_dim=(2, ))).spec
        gmb = GaussianMLPBaseline(env_spec=box_env_spec,
                                  use_trust_region=False)

        train_paths, _, _, paths, expected = get_train_test_data()

        for _ in range(150):
            gmb.fit(train_paths)

        prediction = gmb.predict(paths)

        assert np.allclose(prediction, expected, rtol=0, atol=0.1)
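A note on the fixture: get_train_test_data() is defined elsewhere in the test suite and is not shown here. The sketch below is a hypothetical stand-in, assuming that GaussianMLPBaseline.fit() consumes a list of path dicts with 'observations' and 'returns' and that predict() takes a path dict with 'observations'; the shapes and target function are illustrative only.

import numpy as np


def get_train_test_data():
    """Hypothetical fixture: synthetic 2-D regression paths for the baseline."""
    obs = np.random.uniform(-1., 1., size=(100, 2))
    returns = obs.sum(axis=1)  # toy target the baseline should learn
    train_paths = [{'observations': obs, 'returns': returns}]
    test_obs = np.random.uniform(-1., 1., size=(20, 2))
    test_paths = {'observations': test_obs}
    expected = test_obs.sum(axis=1)  # values predict() should approximate
    # The second and third return values are unused by the test above.
    return train_paths, obs, returns, test_paths, expected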
Example #2
 def setup_method(self):
     super().setup_method()
     self.env = normalize(GymEnv('InvertedDoublePendulum-v2'))
     self.policy = GaussianMLPPolicy(
         env_spec=self.env.spec,
         hidden_sizes=(64, 64),
         hidden_nonlinearity=tf.nn.tanh,
         output_nonlinearity=None,
     )
     self.baseline = GaussianMLPBaseline(
         env_spec=self.env.spec,
         hidden_sizes=(32, 32),
     )
Example #3
 def setup_method(self):
     """Setup method which is called before every test."""
     self.env = normalize(GymEnv(HalfCheetahDirEnv(),
                                 max_episode_length=100),
                          expected_action_scale=10.)
     self.policy = GaussianMLPPolicy(
         env_spec=self.env.spec,
         hidden_sizes=(64, 64),
         hidden_nonlinearity=torch.tanh,
         output_nonlinearity=None,
     )
     self.value_function = GaussianMLPValueFunction(env_spec=self.env.spec,
                                                    hidden_sizes=(32, 32))
Example #4
def test_output_shape(batch_size, hidden_sizes):
    env_spec = GymEnv(DummyBoxEnv()).spec
    obs_dim = env_spec.observation_space.flat_dim
    obs = torch.ones(batch_size, obs_dim, dtype=torch.float32)

    qf = DiscreteMLPQFunction(env_spec=env_spec,
                              hidden_nonlinearity=None,
                              hidden_sizes=hidden_sizes,
                              hidden_w_init=nn.init.ones_,
                              output_w_init=nn.init.ones_)
    output = qf(obs)

    assert output.shape == (batch_size, env_spec.action_space.flat_dim)
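test_output_shape receives batch_size and hidden_sizes from a pytest parametrization that is not shown. A sketch of the kind of decorator that would drive it; the specific parameter values are assumptions, not the project's actual test matrix.

import pytest


@pytest.mark.parametrize('batch_size, hidden_sizes', [
    (1, (1, )),
    (5, (3, )),
    (9, (4, 4)),
    (15, (1, 2, 3)),
])
def test_output_shape(batch_size, hidden_sizes):
    ...  # body as in the example above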
Example #5
 def setup_method(self):
     """Setup method which is called before every test."""
     self.env = normalize(GymEnv(HalfCheetahDirEnv(),
                                 max_episode_length=100),
                          expected_action_scale=10.)
     self.task_sampler = SetTaskSampler(
         HalfCheetahDirEnv,
         wrapper=lambda env, _: normalize(GymEnv(env,
                                                 max_episode_length=100),
                                          expected_action_scale=10.))
     self.policy = GaussianMLPPolicy(
         env_spec=self.env.spec,
         hidden_sizes=(64, 64),
         hidden_nonlinearity=torch.tanh,
         output_nonlinearity=None,
     )
     self.value_function = GaussianMLPValueFunction(env_spec=self.env.spec,
                                                    hidden_sizes=(32, 32))
     self.sampler = LocalSampler(
         agents=self.policy,
         envs=self.env,
         max_episode_length=self.env.spec.max_episode_length)
Example #6
def continuous_mlp_baseline(ctxt, env_id, seed):
    """Create Continuous MLP Baseline on TF-PPO.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=hyper_params['policy_hidden_sizes'],
            hidden_nonlinearity=hyper_params['hidden_nonlinearity'],
        )

        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
        )

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler,
                   discount=hyper_params['discount'],
                   gae_lambda=hyper_params['gae_lambda'],
                   lr_clip_range=hyper_params['lr_clip_range'],
                   entropy_method=hyper_params['entropy_method'],
                   policy_ent_coeff=hyper_params['policy_ent_coeff'],
                   optimizer_args=dict(
                       batch_size=32,
                       max_optimization_epochs=10,
                       learning_rate=1e-3,
                   ),
                   center_adv=hyper_params['center_adv'],
                   stop_entropy_gradient=True)

        trainer.setup(algo, env)
        trainer.train(n_epochs=hyper_params['n_epochs'],
                      batch_size=hyper_params['n_exploration_steps'])
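This launcher reads its settings from a module-level hyper_params dict that is not part of the snippet. A sketch of the keys it expects, with illustrative values only (not the benchmark configuration):

import tensorflow as tf

hyper_params = {
    'policy_hidden_sizes': 32,         # hidden_dim for GaussianLSTMPolicy
    'hidden_nonlinearity': tf.nn.tanh,
    'discount': 0.99,
    'gae_lambda': 0.95,
    'lr_clip_range': 0.2,
    'entropy_method': 'max',
    'policy_ent_coeff': 0.02,
    'center_adv': False,               # entropy_method='max' requires center_adv=False
    'n_epochs': 5,
    'n_exploration_steps': 2048,
}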
Example #7
def test_sac_inverted_double_pendulum():
    """Test Sac performance on inverted pendulum."""
    # pylint: disable=unexpected-keyword-arg
    env = normalize(GymEnv('InvertedDoublePendulum-v2',
                           max_episode_length=100))
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[32, 32],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[32, 32],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )
    runner = LocalRunner(snapshot_config=snapshot_config)
    sac = SAC(env_spec=env.spec,
              policy=policy,
              qf1=qf1,
              qf2=qf2,
              gradient_steps_per_itr=100,
              replay_buffer=replay_buffer,
              min_buffer_size=1e3,
              target_update_tau=5e-3,
              discount=0.99,
              buffer_batch_size=64,
              reward_scale=1.,
              steps_per_epoch=2)
    runner.setup(sac, env, sampler_cls=LocalSampler)
    if torch.cuda.is_available():
        set_gpu_mode(True)
    else:
        set_gpu_mode(False)
    sac.to()
    ret = runner.train(n_epochs=12, batch_size=200, plot=False)
    # check that automatic entropy tuning is used
    assert sac._use_automatic_entropy_tuning
    # assert that there was a gradient properly connected to alpha
    # this doesn't verify that the path from the temperature objective is
    # correct.
    assert not torch.allclose(torch.Tensor([1.]), sac._log_alpha.to('cpu'))
    # check that the policy is learning beyond a predefined threshold
    assert ret > 80
Example #8
def gaussian_mlp_baseline(ctxt, env_id, seed):
    """Create Gaussian MLP Baseline on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            use_trust_region=False,
            optimizer=FirstOrderOptimizer,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )
        trainer.setup(algo, env, sampler_args=dict(n_envs=12))
        trainer.train(n_epochs=5, batch_size=2048)
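Launchers with this (ctxt, env_id, seed) signature are normally decorated with garage's wrap_experiment, which creates the ExperimentContext passed in as ctxt. A minimal sketch of how such a launcher could be invoked, assuming gaussian_mlp_baseline is defined as above:

from garage import wrap_experiment

# Wrapping fills in ctxt automatically; remaining arguments are passed through.
run_experiment = wrap_experiment(gaussian_mlp_baseline)
run_experiment(env_id='InvertedDoublePendulum-v2', seed=1)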
Example #9
def test_mtsac_get_log_alpha_incorrect_num_tasks(monkeypatch):
    """Check that if the num_tasks passed does not match the number of tasks

    in the environment, then the algorithm should raise an exception.

    MTSAC uses disentangled alphas, meaning that

    """
    env_names = ['CartPole-v0', 'CartPole-v1']
    task_envs = [GymEnv(name, max_episode_length=150) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[1, 1],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

    buffer_batch_size = 2
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  eval_env=env,
                  env_spec=env.spec,
                  num_tasks=4,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)
    monkeypatch.setattr(mtsac, '_log_alpha', torch.Tensor([1., 2.]))
    error_string = ('The number of tasks in the environment does '
                    'not match self._num_tasks. Are you sure that you passed '
                    'The correct number of tasks?')
    obs = torch.Tensor([env.reset()[0]] * buffer_batch_size)
    with pytest.raises(ValueError, match=error_string):
        mtsac._get_log_alpha(dict(observation=obs))
Example #10
    def test_td3_pendulum(self):
        """Test TD3 with Pendulum environment."""
        with LocalTFRunner(snapshot_config) as runner:
            env = GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)

            policy = ContinuousMLPPolicy(env_spec=env.spec,
                                         hidden_sizes=[400, 300],
                                         hidden_nonlinearity=tf.nn.relu,
                                         output_nonlinearity=tf.nn.tanh)

            exploration_policy = AddGaussianNoise(env.spec,
                                                  policy,
                                                  max_sigma=0.1,
                                                  min_sigma=0.1)

            qf = ContinuousMLPQFunction(name='ContinuousMLPQFunction',
                                        env_spec=env.spec,
                                        hidden_sizes=[400, 300],
                                        action_merge_layer=0,
                                        hidden_nonlinearity=tf.nn.relu)

            qf2 = ContinuousMLPQFunction(name='ContinuousMLPQFunction2',
                                         env_spec=env.spec,
                                         hidden_sizes=[400, 300],
                                         action_merge_layer=0,
                                         hidden_nonlinearity=tf.nn.relu)

            replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

            algo = TD3(env_spec=env.spec,
                       policy=policy,
                       policy_lr=1e-3,
                       qf_lr=1e-3,
                       qf=qf,
                       qf2=qf2,
                       replay_buffer=replay_buffer,
                       steps_per_epoch=20,
                       target_update_tau=0.005,
                       n_train_steps=50,
                       discount=0.99,
                       min_buffer_size=int(1e4),
                       buffer_batch_size=100,
                       policy_weight_decay=0.001,
                       qf_weight_decay=0.001,
                       exploration_policy=exploration_policy,
                       policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                       qf_optimizer=tf.compat.v1.train.AdamOptimizer)

            runner.setup(algo, env, sampler_cls=LocalSampler)
            last_avg_ret = runner.train(n_epochs=10, batch_size=250)
            assert last_avg_ret > 200
Example #11
    def test_dqn_cartpole_pickle(self):
        """Test DQN with CartPole environment."""
        deterministic.set_seed(100)
        with TFTrainer(snapshot_config, sess=self.sess) as trainer:
            n_epochs = 10
            steps_per_epoch = 10
            sampler_batch_size = 500
            num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
            env = GymEnv('CartPole-v0')
            replay_buffer = PathBuffer(capacity_in_transitions=int(1e4))
            qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
            policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
            epsilon_greedy_policy = EpsilonGreedyPolicy(
                env_spec=env.spec,
                policy=policy,
                total_timesteps=num_timesteps,
                max_epsilon=1.0,
                min_epsilon=0.02,
                decay_ratio=0.1)
            algo = DQN(env_spec=env.spec,
                       policy=policy,
                       qf=qf,
                       exploration_policy=epsilon_greedy_policy,
                       replay_buffer=replay_buffer,
                       qf_lr=1e-4,
                       discount=1.0,
                       min_buffer_size=int(1e3),
                       double_q=False,
                       n_train_steps=500,
                       grad_norm_clipping=5.0,
                       steps_per_epoch=steps_per_epoch,
                       target_network_update_freq=1,
                       buffer_batch_size=32)
            trainer.setup(algo, env)
            with tf.compat.v1.variable_scope(
                    'DiscreteMLPQFunction/mlp/hidden_0', reuse=True):
                bias = tf.compat.v1.get_variable('bias')
                # assign it to all one
                old_bias = tf.ones_like(bias).eval()
                bias.load(old_bias)
                h = pickle.dumps(algo)

            with tf.compat.v1.Session(graph=tf.Graph()):
                pickle.loads(h)
                with tf.compat.v1.variable_scope(
                        'DiscreteMLPQFunction/mlp/hidden_0', reuse=True):
                    new_bias = tf.compat.v1.get_variable('bias')
                    new_bias = new_bias.eval()
                    assert np.array_equal(old_bias, new_bias)

            env.close()
Example #12
def multi_env_ppo(ctxt=None, seed=1):
    """Train PPO on two Atari environments simultaneously.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env1 = normalize(GymEnv('Adventure-ram-v4'))
        env2 = normalize(GymEnv('Alien-ram-v4'))
        env = MultiEnvWrapper([env1, env2])

        policy = CategoricalMLPPolicy(
            env_spec=env.spec,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_optimization_epochs=10,
                       learning_rate=1e-3,
                   ))

        trainer.setup(algo, env)
        trainer.train(n_epochs=120, batch_size=2048, plot=False)
Example #13
def test_get_time_limit_finds_time_limit():
    env = gym.make('PongNoFrameskip-v4')
    time_limit = env._max_episode_steps

    env = Noop(env, noop_max=30)
    env = MaxAndSkip(env, skip=4)
    env = EpisodicLife(env)
    env = Grayscale(env)
    env = Resize(env, 84, 84)
    env = ClipReward(env)
    env = StackFrames(env, 4)

    env = GymEnv(env)
    assert env._max_episode_length == time_limit
Example #14
def test_get_action():
    env_spec = GymEnv(DummyBoxEnv()).spec
    obs_dim = env_spec.observation_space.flat_dim
    obs = torch.ones([
        obs_dim,
    ], dtype=torch.float32)
    qf = DiscreteMLPQFunction(env_spec=env_spec,
                              hidden_nonlinearity=None,
                              hidden_sizes=(2, 2))
    qvals = qf(obs.unsqueeze(0))
    policy = DiscreteQFArgmaxPolicy(qf, env_spec)
    action, _ = policy.get_action(obs.numpy())
    assert action == torch.argmax(qvals, dim=1).numpy()
    assert action.shape == ()
Example #15
    def test_build(self, obs_dim, action_dim):
        env = GymEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
        qf = DiscreteMLPDuelingQFunction(env_spec=env.spec)
        env.reset()
        obs = env.step(1).observation

        output1 = self.sess.run(qf.q_vals, feed_dict={qf.input: [obs]})

        input_var = tf.compat.v1.placeholder(tf.float32,
                                             shape=(None, ) + obs_dim)
        q_vals = qf.build(input_var, 'another')
        output2 = self.sess.run(q_vals, feed_dict={input_var: [obs]})

        assert np.array_equal(output1, output2)
Example #16
 def test_get_action(self, hidden_channels, kernel_sizes, strides,
                     hidden_sizes):
     """Test get_action function."""
     env = GymEnv(DummyDiscretePixelEnv(), is_image=True)
     policy = CategoricalCNNPolicy(env_spec=env.spec,
                                   image_format='NHWC',
                                   kernel_sizes=kernel_sizes,
                                   hidden_channels=hidden_channels,
                                   strides=strides,
                                   hidden_sizes=hidden_sizes)
     env.reset()
     obs = env.step(1).observation
     action, _ = policy.get_action(obs)
     assert env.action_space.contains(action)
Example #17
def ppo_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow PPO model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = TF_GMP(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = TF_GMB(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            use_trust_region=False,
            optimizer=FirstOrderOptimizer,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=3e-4,
            ),
        )

        algo = TF_PPO(env_spec=env.spec,
                      policy=policy,
                      baseline=baseline,
                      discount=0.99,
                      gae_lambda=0.95,
                      center_adv=True,
                      lr_clip_range=0.2,
                      optimizer_args=dict(batch_size=32,
                                          max_optimization_epochs=10,
                                          learning_rate=3e-4,
                                          verbose=True))

        trainer.setup(algo, env)
        trainer.train(n_epochs=hyper_parameters['n_epochs'],
                      batch_size=hyper_parameters['batch_size'])
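Here hyper_parameters is a module-level dict from the benchmark script and only two of its keys are used by this launcher. Illustrative values (assumptions, not the published benchmark settings):

hyper_parameters = {
    'n_epochs': 5,
    'batch_size': 2048,
}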
Example #18
def test_mtsac_get_log_alpha(monkeypatch):
    """Check that the private function _get_log_alpha functions correctly.

    MTSAC uses disentangled alphas, meaning that

    """
    env_names = ['CartPole-v0', 'CartPole-v1']
    task_envs = [GymEnv(name) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[1, 1],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

    num_tasks = 2
    buffer_batch_size = 2
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_episode_length=150,
                  eval_env=env,
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)
    monkeypatch.setattr(mtsac, '_log_alpha', torch.Tensor([1., 2.]))
    for i, _ in enumerate(env_names):
        obs = torch.Tensor([env.reset()[0]] * buffer_batch_size)
        log_alpha = mtsac._get_log_alpha(dict(observation=obs))
        assert (log_alpha == torch.Tensor([i + 1, i + 1])).all().item()
        assert log_alpha.size() == torch.Size([mtsac._buffer_batch_size])
Example #19
    def test_get_action(self, obs_dim, action_dim, obs_type):
        """Test get_action method"""
        assert obs_type in ['box', 'dict']
        if obs_type == 'box':
            env = GymEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        else:
            env = GymEnv(
                DummyDictEnv(obs_space_type='box', act_space_type='box'))

        policy = ContinuousMLPPolicy(env_spec=env.spec)

        env.reset()
        obs = env.step(1).observation
        if obs_type == 'box':
            obs = obs.flatten()

        action, _ = policy.get_action(obs)

        assert env.action_space.contains(action)

        actions, _ = policy.get_actions([obs, obs, obs])
        for action in actions:
            assert env.action_space.contains(action)
Example #20
    def test_get_action(self, obs_dim, action_dim, hidden_dim):
        env = GymEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        policy = GaussianGRUPolicy(env_spec=env.spec,
                                   hidden_dim=hidden_dim,
                                   state_include_action=False)
        policy.reset(do_resets=None)
        obs = env.reset()[0]

        action, _ = policy.get_action(obs.flatten())
        assert env.action_space.contains(action)

        actions, _ = policy.get_actions([obs.flatten()])
        for action in actions:
            assert env.action_space.contains(action)
Example #21
def ppo_pendulum(ctxt=None, seed=1):
    """Train PPO with InvertedDoublePendulum-v2 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = normalize(GymEnv('InvertedDoublePendulum-v2'))

        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            use_trust_region=True,
        )

        # NOTE: make sure when setting entropy_method to 'max', set
        # center_adv to False and turn off policy gradient. See
        # tf.algos.NPO for detailed documentation.
        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
            ),
            stop_entropy_gradient=True,
            entropy_method='max',
            policy_ent_coeff=0.02,
            center_adv=False,
        )

        trainer.setup(algo, env)

        trainer.train(n_epochs=120, batch_size=2048, plot=False)
Example #22
 def test_all_gym_envs_pickleable(self, spec):
     if spec._env_name.startswith('Defender'):
         pytest.skip(
             'Defender-* envs bundled in atari-py 0.2.x don\'t load')
     if 'Kuka' in spec.id:
         # Kuka environments calls py_bullet.resetSimulation() in reset()
         # unconditionally, which globally resets other simulations. So
         # only one Kuka environment can be tested.
         pytest.skip('Skip Kuka Bullet environments')
     elif 'Minitaur' in spec.id:
         pytest.skip('Bullet Minitaur envs don\'t load')
     env = GymEnv(spec.id)
     round_trip = pickle.loads(pickle.dumps(env))
     assert round_trip
Example #23
    def test_get_action(self, obs_dim, action_dim):
        env = GymEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        policy = GaussianMLPPolicy(env_spec=env.spec)

        env.reset()
        obs = env.step(1).observation

        action, _ = policy.get_action(obs.flatten())
        assert env.action_space.contains(action)
        actions, _ = policy.get_actions(
            [obs.flatten(), obs.flatten(),
             obs.flatten()])
        for action in actions:
            assert env.action_space.contains(action)
Example #24
def ddpg_garage_tf(ctxt, env_id, seed):
    """Create garage TensorFlow DDPG model and training.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = ContinuousMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['policy_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu,
            output_nonlinearity=tf.nn.tanh)

        exploration_policy = AddOrnsteinUhlenbeckNoise(
            env.spec, policy, sigma=hyper_parameters['sigma'])

        qf = ContinuousMLPQFunction(
            env_spec=env.spec,
            hidden_sizes=hyper_parameters['qf_hidden_sizes'],
            hidden_nonlinearity=tf.nn.relu)

        replay_buffer = PathBuffer(
            capacity_in_transitions=hyper_parameters['replay_buffer_size'])

        algo = DDPG(env_spec=env.spec,
                    policy=policy,
                    qf=qf,
                    replay_buffer=replay_buffer,
                    steps_per_epoch=hyper_parameters['steps_per_epoch'],
                    policy_lr=hyper_parameters['policy_lr'],
                    qf_lr=hyper_parameters['qf_lr'],
                    target_update_tau=hyper_parameters['tau'],
                    n_train_steps=hyper_parameters['n_train_steps'],
                    discount=hyper_parameters['discount'],
                    min_buffer_size=int(1e4),
                    exploration_policy=exploration_policy,
                    policy_optimizer=tf.compat.v1.train.AdamOptimizer,
                    qf_optimizer=tf.compat.v1.train.AdamOptimizer)

        trainer.setup(algo, env)
        trainer.train(n_epochs=hyper_parameters['n_epochs'],
                      batch_size=hyper_parameters['n_exploration_steps'])
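As in the other benchmark launchers, hyper_parameters is defined at module level and not shown. A sketch of the keys this DDPG launcher reads, with illustrative values only:

hyper_parameters = {
    'policy_hidden_sizes': (64, 64),
    'qf_hidden_sizes': (64, 64),
    'sigma': 0.2,                      # AddOrnsteinUhlenbeckNoise scale
    'replay_buffer_size': int(1e6),
    'steps_per_epoch': 20,
    'policy_lr': 1e-4,
    'qf_lr': 1e-3,
    'tau': 1e-2,
    'n_train_steps': 50,
    'discount': 0.99,
    'n_epochs': 300,
    'n_exploration_steps': 100,
}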
Example #25
    def _init_multi_env_wrapper(self,
                                env_names,
                                sample_strategy=uniform_random_strategy):
        """helper function to initialize multi_env_wrapper

        Args:
            env_names (list(str)): List of Environment names.
            sample_strategy (func): A sampling strategy.

        Returns:
            garage.envs.multi_env_wrapper: Multi env wrapper.
        """
        task_envs = [GymEnv(name) for name in env_names]
        return MultiEnvWrapper(task_envs, sample_strategy=sample_strategy)
Example #26
    def test_get_action_dict_space(self):
        env = GymEnv(DummyDictEnv(obs_space_type='box', act_space_type='box'))
        policy = GaussianGRUPolicy(env_spec=env.spec,
                                   hidden_dim=4,
                                   state_include_action=False)
        policy.reset(do_resets=None)
        obs = env.reset()[0]

        action, _ = policy.get_action(obs)
        assert env.action_space.contains(action)

        actions, _ = policy.get_actions([obs, obs])
        for action in actions:
            assert env.action_space.contains(action)
Example #27
 def test_all_gym_envs(self, spec):
     if spec._env_name.startswith('Defender'):
         pytest.skip(
             'Defender-* envs bundled in atari-py 0.2.x don\'t load')
     if spec._env_name.startswith('CarRacing'):
         pytest.skip(
             'CarRacing-* envs don\'t load')
     if 'Kuka' in spec.id:
         # Kuka environments calls py_bullet.resetSimulation() in reset()
         # unconditionally, which globally resets other simulations. So
         # only one Kuka environment can be tested.
         pytest.skip('Skip Kuka Bullet environments')
     env = GymEnv(spec.id)
     step_env_with_gym_quirks(env, spec, visualize=False)
Example #28
def ppo_garage_pytorch(ctxt, env_id, seed):
    """Create garage PyTorch PPO model and training.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    trainer = Trainer(ctxt)

    env = normalize(GymEnv(env_id))

    policy = PyTorch_GMP(env.spec,
                         hidden_sizes=(32, 32),
                         hidden_nonlinearity=torch.tanh,
                         output_nonlinearity=None)

    value_function = GaussianMLPValueFunction(env_spec=env.spec,
                                              hidden_sizes=(32, 32),
                                              hidden_nonlinearity=torch.tanh,
                                              output_nonlinearity=None)

    policy_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                        policy,
                                        max_optimization_epochs=10,
                                        minibatch_size=64)

    vf_optimizer = OptimizerWrapper((torch.optim.Adam, dict(lr=2.5e-4)),
                                    value_function,
                                    max_optimization_epochs=10,
                                    minibatch_size=64)

    algo = PyTorch_PPO(env_spec=env.spec,
                       policy=policy,
                       value_function=value_function,
                       policy_optimizer=policy_optimizer,
                       vf_optimizer=vf_optimizer,
                       discount=0.99,
                       gae_lambda=0.95,
                       center_adv=True,
                       lr_clip_range=0.2)

    trainer.setup(algo, env)
    trainer.train(n_epochs=hyper_parameters['n_epochs'],
                  batch_size=hyper_parameters['batch_size'])
Example #29
def dqn_cartpole(ctxt=None, seed=1):
    """Train TRPO with CubeCrash-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        n_epochs = 10
        steps_per_epoch = 10
        sampler_batch_size = 500
        num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size
        env = GymEnv('CartPole-v0')
        replay_buffer = PathBuffer(capacity_in_transitions=int(1e4))
        qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(64, 64))
        policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
        exploration_policy = EpsilonGreedyPolicy(env_spec=env.spec,
                                                 policy=policy,
                                                 total_timesteps=num_timesteps,
                                                 max_epsilon=1.0,
                                                 min_epsilon=0.02,
                                                 decay_ratio=0.1)

        sampler = LocalSampler(agents=exploration_policy,
                               envs=env,
                               max_episode_length=env.spec.max_episode_length,
                               is_tf_worker=True,
                               worker_class=FragmentWorker)

        algo = DQN(env_spec=env.spec,
                   policy=policy,
                   qf=qf,
                   exploration_policy=exploration_policy,
                   replay_buffer=replay_buffer,
                   sampler=sampler,
                   steps_per_epoch=steps_per_epoch,
                   qf_lr=1e-4,
                   discount=1.0,
                   min_buffer_size=int(1e3),
                   double_q=True,
                   n_train_steps=500,
                   target_network_update_freq=1,
                   buffer_batch_size=32)

        trainer.setup(algo, env)
        trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size)
Example #30
    def test_obs_not_image(self):
        env = GymEnv(DummyDiscretePixelEnv())

        with mock.patch(('tests.fixtures.models.SimpleCNNModel._build'),
                        autospec=True,
                        side_effect=SimpleCNNModel._build) as build:
            with mock.patch(('garage.tf.models.'
                             'cnn_mlp_merge_model.CNNModel'),
                            new=SimpleCNNModel):
                with mock.patch(('garage.tf.models.'
                                 'cnn_mlp_merge_model.MLPMergeModel'),
                                new=SimpleMLPMergeModel):

                    qf = ContinuousCNNQFunction(env_spec=env.spec,
                                                filters=((5, (3, 3)), ),
                                                strides=(1, ))

                    # ensure non-image obses are not normalized
                    # in _initialize() and get_qval()

                    normalized_obs = build.call_args_list[0][0][1]
                    assert normalized_obs == qf.inputs[0]

                    fake_obs = [
                        np.full(env.spec.observation_space.shape, 255.)
                    ]

                    assert (self.sess.run(normalized_obs,
                                          feed_dict={qf.inputs[0]:
                                                     fake_obs}) == 255.).all()

                    # ensure non-image obses are not normalized in build()

                    obs_dim = env.spec.observation_space.shape
                    state_input = tf.compat.v1.placeholder(tf.float32,
                                                           shape=(None, ) +
                                                           obs_dim)

                    act_dim = env.spec.observation_space.shape
                    action_input = tf.compat.v1.placeholder(tf.float32,
                                                            shape=(None, ) +
                                                            act_dim)

                    qf.build(state_input, action_input, name='another')
                    normalized_obs = build.call_args_list[1][0][1]

                    assert (self.sess.run(normalized_obs,
                                          feed_dict={state_input:
                                                     fake_obs}) == 255.).all()