def test_clone(self):
     box_env_spec = GarageEnv(DummyBoxEnv(obs_dim=(2, ))).spec
     gmb = GaussianMLPBaseline(env_spec=box_env_spec)
     cloned_gmb_model = gmb.clone_model(name='cloned_model')
     for cloned_param, param in zip(cloned_gmb_model.parameters.values(),
                                    gmb.parameters.values()):
         assert np.array_equal(cloned_param, param)
    def test_fit_unnormalized(self):
        box_env_spec = GarageEnv(DummyBoxEnv(obs_dim=(2, ))).spec
        gmb = GaussianMLPBaseline(env_spec=box_env_spec,
                                  subsample_factor=0.9,
                                  normalize_inputs=False,
                                  normalize_outputs=False)

        train_paths, _, _, paths, expected = get_train_test_data()

        for _ in range(150):
            gmb.fit(train_paths)

        prediction = gmb.predict(paths)

        assert np.allclose(prediction, expected, rtol=0, atol=0.1)

        x_mean = self.sess.run(gmb._networks['default'].x_mean)
        x_mean_expected = np.zeros_like(x_mean)
        x_std = self.sess.run(gmb._networks['default'].x_std)
        x_std_expected = np.ones_like(x_std)
        assert np.array_equal(x_mean, x_mean_expected)
        assert np.array_equal(x_std, x_std_expected)

        y_mean = self.sess.run(gmb._networks['default'].y_mean)
        y_mean_expected = np.zeros_like(y_mean)
        y_std = self.sess.run(gmb._networks['default'].y_std)
        y_std_expected = np.ones_like(y_std)

        assert np.allclose(y_mean, y_mean_expected)
        assert np.allclose(y_std, y_std_expected)
    def test_fit_normalized(self):
        box_env_spec = GarageEnv(DummyBoxEnv(obs_dim=(2, ))).spec
        gmb = GaussianMLPBaseline(env_spec=box_env_spec)

        (train_paths, observations, returns, paths,
         expected) = get_train_test_data()

        for _ in range(150):
            gmb.fit(train_paths)

        prediction = gmb.predict(paths)

        assert np.allclose(prediction, expected, rtol=0, atol=0.1)

        x_mean = self.sess.run(gmb._networks['default'].x_mean)
        x_mean_expected = np.mean(observations, axis=0, keepdims=True)
        x_std = self.sess.run(gmb._networks['default'].x_std)
        x_std_expected = np.std(observations, axis=0, keepdims=True)

        assert np.allclose(x_mean, x_mean_expected)
        assert np.allclose(x_std, x_std_expected)

        y_mean = self.sess.run(gmb._networks['default'].y_mean)
        y_mean_expected = np.mean(returns, axis=0, keepdims=True)
        y_std = self.sess.run(gmb._networks['default'].y_std)
        y_std_expected = np.std(returns, axis=0, keepdims=True)

        assert np.allclose(y_mean, y_mean_expected)
        assert np.allclose(y_std, y_std_expected)
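
The fit tests in this listing rely on a get_train_test_data() helper that is not shown here. Below is a minimal, hypothetical sketch of what such a helper could return, assuming the returns are a simple deterministic function of the observations so the baseline has something learnable to fit; the data used by garage's actual test suite may differ.

import numpy as np


def get_train_test_data():
    # Hypothetical training data: 2-D observations with returns that are a
    # plain sum of the observation components.
    obs = np.random.uniform(0, 1, size=(1000, 2)).astype(np.float32)
    returns = obs.sum(axis=1).astype(np.float32)
    train_paths = [{'observations': obs, 'returns': returns}]

    # Held-out observations and the targets the prediction is checked against.
    test_obs = np.random.uniform(0, 1, size=(10, 2)).astype(np.float32)
    expected = test_obs.sum(axis=1)
    paths = {'observations': test_obs}

    return train_paths, obs, returns, paths, expected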
 def test_unflattened_input(self):
     env = GymEnv(DummyBoxEnv(obs_dim=(2, 2)))
     gmb = GaussianMLPBaseline(env_spec=env.spec)
     env.reset()
     es = env.step(1)
     obs, rewards = es.observation, es.reward
     train_paths = [{'observations': [obs], 'returns': [rewards]}]
     gmb.fit(train_paths)
     paths = {'observations': [obs]}
     prediction = gmb.predict(paths)
     assert np.allclose(0., prediction)
Example #5
    def test_baseline(self):
        """Test the baseline initialization."""
        box_env = GarageEnv(DummyBoxEnv())
        deterministic_mlp_baseline = ContinuousMLPBaseline(env_spec=box_env)
        gaussian_mlp_baseline = GaussianMLPBaseline(env_spec=box_env)

        self.sess.run(tf.compat.v1.global_variables_initializer())
        deterministic_mlp_baseline.get_param_values()
        gaussian_mlp_baseline.get_param_values()

        box_env.close()
Example #6
 def test_get_params_internal(self, obs_dim):
     box_env = TfEnv(DummyBoxEnv(obs_dim=obs_dim))
     with mock.patch(('garage.tf.baselines.'
                      'gaussian_mlp_baseline.'
                      'GaussianMLPRegressor'),
                     new=SimpleGaussianMLPRegressor):
         gmb = GaussianMLPBaseline(env_spec=box_env.spec,
                                   regressor_args=dict())
     params_internal = gmb.get_params_internal()
     trainable_params = tf.compat.v1.trainable_variables(
         scope='GaussianMLPBaseline')
     assert np.array_equal(params_internal, trainable_params)
    def test_fit_smaller_subsample_factor(self):
        box_env_spec = GarageEnv(DummyBoxEnv(obs_dim=(2, ))).spec
        gmb = GaussianMLPBaseline(env_spec=box_env_spec, subsample_factor=0.9)

        train_paths, _, _, paths, expected = get_train_test_data()

        for _ in range(150):
            gmb.fit(train_paths)

        prediction = gmb.predict(paths)

        assert np.allclose(prediction, expected, rtol=0, atol=0.1)
    def test_fit_without_trusted_region(self):
        box_env_spec = GarageEnv(DummyBoxEnv(obs_dim=(2, ))).spec
        gmb = GaussianMLPBaseline(env_spec=box_env_spec,
                                  use_trust_region=False)

        train_paths, _, _, paths, expected = get_train_test_data()

        for _ in range(150):
            gmb.fit(train_paths)

        prediction = gmb.predict(paths)

        assert np.allclose(prediction, expected, rtol=0, atol=0.1)
Example #9
    def test_ppo_pendulum_with_model(self):
        """Test PPO with model, with Pendulum environment."""
        logger.reset()
        env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
        policy = GaussianMLPPolicyWithModel(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=2048,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            lr_clip_range=0.01,
            optimizer_args=dict(batch_size=32, max_epochs=10),
            plot=False,
        )
        last_avg_ret = algo.train(sess=self.sess)
        assert last_avg_ret > 40

        env.close()
def run_task(*_):
    env = TfEnv(env_name="CartPole-v1")
    policy = CategoricalMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
    )
    baseline = GaussianMLPBaseline(
        env_spec=env.spec,
        regressor_args=dict(hidden_sizes=(32, 32)),
    )
    algo = InstrumentedTRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,
        max_path_length=100,
        n_itr=4,
        discount=0.99,
        gae_lambda=0.98,
        policy_ent_coeff=0.0,
        plot=True,
    )
    algo.train()
    env.close()
Example #11
def run_task(v):
    v = SimpleNamespace(**v)

    with LocalRunner() as runner:
        # Environment
        env = SimpleReacherEnv(goal_position=GOALS[0],
                               control_method="position_control",
                               completion_bonus=5)

        env = TfEnv(env)

        # Policy
        policy = GaussianMLPPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_sizes=(64, 32),
            init_std=v.policy_init_std,
        )

        baseline = GaussianMLPBaseline(env_spec=env.spec)

        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            max_path_length=v.max_path_length,
            discount=0.99,
            lr_clip_range=0.2,
            optimizer_args=dict(batch_size=32, max_epochs=10),
            plot=True,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=1000, batch_size=v.batch_size, plot=False)
Example #12
def trpo_minigrid(ctxt=None, seed=1):
    """Train TRPO with MiniGrid-FourRooms-v0 environment.
    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:

        env = GarageEnv(env_name='DisabledAntPyBulletEnv-v0')

        policy = GaussianMLPPolicy(name='policy',
                                   env_spec=env.spec,
                                   hidden_sizes=(128, 64, 32))

        # baseline = LinearFeatureBaseline(env_spec=env.spec)
        baseline = GaussianMLPBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    discount=0.99,
                    max_kl_step=0.001)

        runner.setup(algo, env)
        runner.train(n_epochs=2000, batch_size=4000)
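
A function with this signature is normally launched through garage's wrap_experiment, which supplies the ctxt argument. A hedged launcher sketch follows; the wrapping and the chosen name are assumptions, not part of the snippet above.

# Hypothetical launcher: assumes garage's wrap_experiment is importable at
# this path (it may differ between garage versions).
from garage import wrap_experiment

trpo_disabled_ant = wrap_experiment(trpo_minigrid)
trpo_disabled_ant(seed=1)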
def run_task(*_):
    sess = tf.Session()
    sess.__enter__()
    snapshot = joblib.load(latent_policy_pkl)
    latent_policy = snapshot["policy"]
    inner_env = SimpleReacherEnv(goal_position=(0.65, 0.3, 0.3),
                                 control_method="position_control",
                                 completion_bonus=30)

    env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env,
        hidden_sizes=(64, 64),
        init_std=20,
        # std_share_network=False,
        # adaptive_std=True
    )
    baseline = GaussianMLPBaseline(env_spec=env, include_action_to_input=False)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=1024,  # 4096
        max_path_length=100,
        n_itr=1500,
        discount=0.99,
        step_size=0.2,
        policy_ent_coeff=1e-6,
        plot=True,
    )
    algo.train(sess=sess)
    def run_task(*_):
        sess = tf.Session()
        sess.__enter__()
        latent_policy = joblib.load(latent_policy_pkl)["policy"]
        with LocalRunner(sess=sess) as runner:
            inner_env = PointEnv(goal=(1.4, 1.4), completion_bonus=100)
            env = TfEnv(EmbeddedPolicyEnv(inner_env, latent_policy))

            policy = GaussianMLPPolicy(name="composer",
                                       env_spec=env.spec,
                                       hidden_sizes=(64, 64),
                                       init_std=20,
                                       std_share_network=False,
                                       adaptive_std=True)

            baseline = GaussianMLPBaseline(env_spec=env)

            algo = PPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=1024,  # 4096
                max_path_length=50,
                n_itr=1500,
                discount=0.99,
                step_size=0.2,
                policy_ent_coeff=1e-6,
                plot=True,
                use_mpc_es=True,
            )
            runner.setup(algo, env)
            runner.train(n_epochs=600, plot=False, batch_size=1024)
def run_task(*_):
    with LocalRunner() as runner:
        env = PointEnv(goal=(3, 3), random_start=True)
        env = TfEnv(env)

        policy = GaussianMLPPolicy(name="policy",
                                   env_spec=env.spec,
                                   hidden_sizes=(64, 64),
                                   init_std=20,
                                   std_share_network=False,
                                   adaptive_std=True)

        baseline = GaussianMLPBaseline(env_spec=env,
                                       include_action_to_input=False)

        algo = PPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=1024,  # 4096
            max_path_length=50,
            n_itr=1500,
            discount=0.99,
            step_size=0.2,
            policy_ent_coeff=1e-6,
            use_mpc_es=True,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=1500, batch_size=1024, plot=True)
Example #16
 def test_ppo_pendulum_lstm(self):
     """Test PPO with Pendulum environment and recurrent policy."""
     with TFTrainer(snapshot_config) as trainer:
         env = normalize(
             GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))
         lstm_policy = GaussianLSTMPolicy(env_spec=env.spec)
         baseline = GaussianMLPBaseline(
             env_spec=env.spec,
             hidden_sizes=(32, 32),
         )
         algo = PPO(
             env_spec=env.spec,
             policy=lstm_policy,
             baseline=baseline,
             discount=0.99,
             gae_lambda=0.95,
             lr_clip_range=0.2,
             optimizer_args=dict(
                 batch_size=32,
                 max_optimization_epochs=10,
             ),
             stop_entropy_gradient=True,
             entropy_method='max',
             policy_ent_coeff=0.02,
             center_adv=False,
         )
         trainer.setup(algo, env, sampler_cls=LocalSampler)
         last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
         assert last_avg_ret > 60
Example #17
    def test_ppo_pendulum_recurrent(self):
        """Test PPO with Pendulum environment and recurrent policy."""
        with LocalRunner() as runner:
            logger.reset()
            env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
            policy = GaussianLSTMPolicy(env_spec=env.spec, )
            baseline = GaussianMLPBaseline(
                env_spec=env.spec,
                regressor_args=dict(hidden_sizes=(32, 32)),
            )
            algo = PPO(
                env=env,
                policy=policy,
                baseline=baseline,
                max_path_length=100,
                discount=0.99,
                lr_clip_range=0.01,
                optimizer_args=dict(batch_size=32, max_epochs=10),
                plot=False,
            )
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 40

            env.close()
Example #18
    def test_trpo_pendulum(self):
        """Test TRPO with Pendulum environment."""
        logger.reset()
        env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=2048,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            gae_lambda=0.98,
            policy_ent_coeff=0.0,
            plot=False,
        )
        last_avg_ret = algo.train(sess=self.sess)
        assert last_avg_ret > 50

        env.close()
Example #19
def run_task(*_):

    env = TfEnv(
        normalize(
            GridworldGathererEnv(
                plot={
                    'visitation': {
                        # 'save': '~/garage/data/local/gridworld/instant-run',
                        'save': False,
                        'live': True
                    }
                })))

    policy = CategoricalMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))

    baseline = GaussianMLPBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=5000,
        max_path_length=100,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
    )

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as session:
        algo.train(sess=session)
Example #20
 def test_process_samples_continuous_recurrent(self):
     env = TfEnv(DummyBoxEnv())
     policy = GaussianLSTMPolicy(env_spec=env.spec)
     baseline = GaussianMLPBaseline(env_spec=env.spec)
     max_path_length = 100
     with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
         algo = BatchPolopt2(env_spec=env.spec,
                             policy=policy,
                             baseline=baseline,
                             max_path_length=max_path_length,
                             flatten_input=True)
         runner.setup(algo, env, sampler_args=dict(n_envs=1))
         runner.train(n_epochs=1, batch_size=max_path_length)
         paths = runner.obtain_samples(0)
         samples = algo.process_samples(0, paths)
         # Since there is only 1 vec_env in the sampler and DummyBoxEnv
         # never terminates before max_path_length, the batch size must
         # be max_path_length, i.e. 100
         assert samples['observations'].shape == (
             max_path_length, env.observation_space.flat_dim)
         assert samples['actions'].shape == (max_path_length,
                                             env.action_space.flat_dim)
         assert samples['rewards'].shape == (max_path_length, )
         assert samples['baselines'].shape == (max_path_length, )
         assert samples['returns'].shape == (max_path_length, )
         # there is only 1 path
         assert samples['lengths'].shape == (1, )
         for key, shape in policy.state_info_specs:
             assert samples['agent_infos'][key].shape == (max_path_length,
                                                          np.prod(shape))
         # DummyBoxEnv has env_info dummy
         assert samples['env_infos']['dummy'].shape == (max_path_length, )
         assert isinstance(samples['average_return'], float)
Example #21
    def test_ppo_pendulum_gru_with_model(self):
        """Test PPO with Pendulum environment and GRU policy."""
        with LocalRunner(sess=self.sess) as runner:
            env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
            policy = GaussianGRUPolicyWithModel(env_spec=env.spec, )
            baseline = GaussianMLPBaseline(
                env_spec=env.spec,
                regressor_args=dict(hidden_sizes=(32, 32)),
            )
            algo = PPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=100,
                discount=0.99,
                gae_lambda=0.95,
                lr_clip_range=0.2,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                ),
                stop_entropy_gradient=True,
                entropy_method='max',
                policy_ent_coeff=0.02,
                center_adv=False,
            )
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 80

            env.close()
Example #22
    def test_ppo_pendulum_with_model(self):
        """Test PPO with model, with Pendulum environment."""
        with LocalRunner(self.sess) as runner:
            env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
            policy = GaussianMLPPolicyWithModel(
                env_spec=env.spec,
                hidden_sizes=(64, 64),
                hidden_nonlinearity=tf.nn.tanh,
                output_nonlinearity=None,
            )
            baseline = GaussianMLPBaseline(
                env_spec=env.spec,
                regressor_args=dict(hidden_sizes=(32, 32)),
            )
            algo = PPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=100,
                discount=0.99,
                lr_clip_range=0.01,
                optimizer_args=dict(batch_size=32, max_epochs=10),
            )
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 30

            env.close()
Example #23
 def test_ppo_pendulum_gru(self):
     """Test PPO with Pendulum environment and recurrent policy."""
     with LocalTFRunner(snapshot_config) as runner:
         env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
         gru_policy = GaussianGRUPolicy(env_spec=env.spec)
         baseline = GaussianMLPBaseline(
             env_spec=env.spec,
             regressor_args=dict(hidden_sizes=(32, 32)),
         )
         algo = PPO(
             env_spec=env.spec,
             policy=gru_policy,
             baseline=baseline,
             max_path_length=100,
             discount=0.99,
             gae_lambda=0.95,
             lr_clip_range=0.2,
             optimizer_args=dict(
                 batch_size=32,
                 max_epochs=10,
             ),
             stop_entropy_gradient=True,
             entropy_method='max',
             policy_ent_coeff=0.02,
             center_adv=False,
         )
         runner.setup(algo, env, sampler_cls=LocalSampler)
         last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
         assert last_avg_ret > 80
Example #24
    def test_npo_pendulum(self):
        """Test NPO with Pendulum environment."""
        with LocalRunner(self.sess) as runner:
            env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
            policy = GaussianMLPPolicy(
                env_spec=env.spec,
                hidden_sizes=(64, 64),
                hidden_nonlinearity=tf.nn.tanh,
                output_nonlinearity=None,
            )
            baseline = GaussianMLPBaseline(
                env_spec=env.spec,
                regressor_args=dict(hidden_sizes=(32, 32)),
            )
            algo = NPO(env_spec=env.spec,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=100,
                       discount=0.99,
                       gae_lambda=0.98,
                       policy_ent_coeff=0.0)
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 20

            env.close()
Example #25
def run_task(v):
    v = SimpleNamespace(**v)

    # Environment
    env = SimpleReacherEnv(goal_position=GOALS[0],
                           control_method="position_control",
                           completion_bonus=5)

    env = TfEnv(env)

    # Policy
    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=(64, 32),
        init_std=v.policy_init_std,
    )

    baseline = GaussianMLPBaseline(env_spec=env.spec)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=v.batch_size,  # 4096
        max_path_length=v.max_path_length,
        n_itr=1000,
        discount=0.99,
        step_size=0.2,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=True,
    )
    algo.train()
Example #26
    def test_trpo_unknown_kl_constraint(self):
        """Test TRPO with unkown KL constraints."""
        logger.reset()
        env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
        policy = GaussianMLPPolicy(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )
        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(hidden_sizes=(32, 32)),
        )
        with self.assertRaises(NotImplementedError) as context:
            TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=2048,
                max_path_length=100,
                n_itr=10,
                discount=0.99,
                gae_lambda=0.98,
                policy_ent_coeff=0.0,
                plot=False,
                kl_constraint="random kl_constraint",
            )
        assert "Unknown KLConstraint" in str(context.exception)

        env.close()
Example #27
 def test_ppo_pendulum(self):
     """Test PPO with Pendulum environment."""
     logger._tensorboard = TensorBoardOutput()
     env = TfEnv(normalize(gym.make("Pendulum-v0")))
     policy = GaussianMLPPolicy(
         env_spec=env.spec,
         hidden_sizes=(32, 32),
         hidden_nonlinearity=tf.nn.tanh,
         output_nonlinearity=None,
     )
     baseline = GaussianMLPBaseline(
         env_spec=env.spec,
         regressor_args=dict(hidden_sizes=(32, 32)),
     )
     algo = PPO(
         env=env,
         policy=policy,
         baseline=baseline,
         batch_size=1024,
         max_path_length=100,
         n_itr=10,
         discount=0.99,
         gae_lambda=0.98,
         policy_ent_coeff=0.0,
         plot=False,
     )
     last_avg_ret = algo.train(sess=self.sess)
     assert last_avg_ret > -1000
Example #28
def run_task(*_):
    """
    Wrap PPO training task in the run_task function.

    :param _:
    :return:
    """
    env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))

    policy = GaussianMLPPolicy(
        name="policy", env_spec=env.spec, hidden_sizes=(64, 64))

    baseline = GaussianMLPBaseline(env_spec=env.spec)

    algo = PPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2048,
        max_path_length=100,
        n_itr=488,
        discount=0.99,
        step_size=0.01,
        optimizer_args=dict(batch_size=32, max_epochs=10),
        plot=False)
    algo.train()
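
In the garage release this example targets, a run_task like the one above is usually handed to run_experiment. Below is a hedged launcher sketch; the keyword values are illustrative and the import path may vary across garage versions.

# Hypothetical launcher for the run_task above; keyword values are
# illustrative, not taken from the original example.
from garage.experiment import run_experiment

run_experiment(
    run_task,
    snapshot_mode='last',
    seed=1,
    plot=False,
)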
def gaussian_lstm_policy(ctxt, env_id, seed):
    """Create Gaussian LSTM Policy on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            use_trust_region=False,
            optimizer=FirstOrderOptimizer,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )

        trainer.setup(algo, env)
        trainer.train(n_epochs=5, batch_size=2048)
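
Like the other experiment functions here, gaussian_lstm_policy expects ctxt to be injected by garage's experiment wrapper. The invocation sketch below is a hedged assumption; the env_id is an arbitrary illustrative choice.

# Hypothetical invocation: wraps the function so garage fills in `ctxt`.
from garage import wrap_experiment

ppo_lstm_benchmark = wrap_experiment(gaussian_lstm_policy)
ppo_lstm_benchmark(env_id='InvertedDoublePendulum-v2', seed=1)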
Example #30
def run_garage(env, seed, log_dir):
    """
    Create garage model and training.

    Replace the trpo with the algorithm you want to run.

    :param env: Environment of the task.
    :param seed: Random seed for the trial.
    :param log_dir: Log dir path.
    :return:
    """
    ext.set_seed(seed)

    with tf.Graph().as_default():
        env = TfEnv(normalize(env))

        policy = GaussianMLPPolicy(
            name="policy",
            env_spec=env.spec,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(32, 32),
                use_trust_region=True,
            ),
        )

        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=1024,
            max_path_length=100,
            n_itr=976,
            discount=0.99,
            gae_lambda=0.98,
            clip_range=0.1,
            policy_ent_coeff=0.0,
            plot=False,
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, "progress.csv")
        garage_logger.add_tabular_output(tabular_log_file)
        garage_logger.set_tensorboard_dir(log_dir)

        algo.train()

        garage_logger.remove_tabular_output(tabular_log_file)

        return tabular_log_file
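
A hedged driver sketch for run_garage follows; the environment name and log directory are placeholders rather than values from the original benchmark.

# Hypothetical driver: placeholder environment and a temporary log directory.
import tempfile

import gym

env = gym.make('InvertedDoublePendulum-v2')
progress_csv = run_garage(env, seed=1, log_dir=tempfile.mkdtemp())
print('garage progress written to', progress_csv)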