Example #1
    def test_dist_info_sym_include_action(self, obs_dim, action_dim,
                                          hidden_dim):
        env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))

        obs_ph = tf.compat.v1.placeholder(
            tf.float32, shape=(None, None, env.observation_space.flat_dim))

        with mock.patch(('garage.tf.policies.'
                         'gaussian_lstm_policy.GaussianLSTMModel'),
                        new=SimpleGaussianLSTMModel):
            policy = GaussianLSTMPolicy(env_spec=env.spec,
                                        state_include_action=True)

            policy.reset()
            obs = env.reset()
            dist_sym = policy.dist_info_sym(
                obs_var=obs_ph,
                state_info_vars={'prev_action': np.zeros((2, 1) + action_dim)},
                name='p2_sym')
        dist = self.sess.run(
            dist_sym, feed_dict={obs_ph: [[obs.flatten()], [obs.flatten()]]})

        assert np.array_equal(dist['mean'], np.full((2, 1) + action_dim, 0.5))
        assert np.array_equal(dist['log_std'], np.full((2, 1) + action_dim,
                                                       0.5))
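Note: SimpleGaussianLSTMModel here is a test double from garage's test fixtures that replaces the real LSTM model so the distribution parameters are deterministic; judging from the assertions, it yields a constant mean and log_std of 0.5. A minimal sketch of that idea (hypothetical, not the actual fixture):

# Hypothetical stand-in for the SimpleGaussianLSTMModel test double:
# constant distribution parameters instead of a learned LSTM, so tests
# can assert exact values.
import numpy as np

def simple_dist_params(obs_batch, action_dim):
    """Return a constant mean/log_std of 0.5 for every observation."""
    shape = obs_batch.shape[:2] + action_dim  # (batch, time) + action_dim tuple
    return dict(mean=np.full(shape, 0.5), log_std=np.full(shape, 0.5))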
Example #2
 def test_clone(self):
     env = GarageEnv(DummyBoxEnv(obs_dim=(4, ), action_dim=(4, )))
     policy = GaussianLSTMPolicy(env_spec=env.spec)
     policy_clone = policy.clone('GaussianLSTMPolicyClone')
     assert policy_clone.env_spec == policy.env_spec
     for cloned_param, param in zip(policy_clone.parameters.values(),
                                    policy.parameters.values()):
         assert np.array_equal(cloned_param, param)
Example #3
    def test_gaussian_lstm_policy(self):
        gaussian_lstm_policy = GaussianLSTMPolicy(env_spec=self.env,
                                                  hidden_dim=1,
                                                  state_include_action=False)
        gaussian_lstm_policy.reset()

        obs = self.env.observation_space.high
        assert gaussian_lstm_policy.get_action(obs)
Example #4
    def test_gaussian_lstm_policy(self):
        gaussian_lstm_policy = GaussianLSTMPolicy(env_spec=self.env,
                                                  hidden_dim=1)
        self.sess.run(tf.compat.v1.global_variables_initializer())

        gaussian_lstm_policy.reset()

        obs = self.env.observation_space.high
        assert gaussian_lstm_policy.get_action(obs)
Example #5
    def test_ppo_pendulum_recurrent(self):
        """Test PPO with Pendulum environment and recurrent policy."""
        with LocalRunner() as runner:
            logger.reset()
            env = TfEnv(normalize(gym.make("InvertedDoublePendulum-v2")))
            policy = GaussianLSTMPolicy(env_spec=env.spec)
            baseline = GaussianMLPBaseline(
                env_spec=env.spec,
                regressor_args=dict(hidden_sizes=(32, 32)),
            )
            algo = PPO(
                env=env,
                policy=policy,
                baseline=baseline,
                max_path_length=100,
                discount=0.99,
                lr_clip_range=0.01,
                optimizer_args=dict(batch_size=32, max_epochs=10),
                plot=False,
            )
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 40

            env.close()
Example #6
 def test_process_samples_continuous_recurrent(self):
     env = TfEnv(DummyBoxEnv())
     policy = GaussianLSTMPolicy(env_spec=env.spec)
     baseline = GaussianMLPBaseline(env_spec=env.spec)
     max_path_length = 100
     with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
         algo = BatchPolopt2(env_spec=env.spec,
                             policy=policy,
                             baseline=baseline,
                             max_path_length=max_path_length,
                             flatten_input=True)
         runner.setup(algo, env, sampler_args=dict(n_envs=1))
         runner.train(n_epochs=1, batch_size=max_path_length)
         paths = runner.obtain_samples(0)
         samples = algo.process_samples(0, paths)
         # Since there is only 1 vec_env in the sampler and DummyBoxEnv
         # never terminates before reaching max_path_length, the batch
         # size must equal max_path_length, i.e. 100
         assert samples['observations'].shape == (
             max_path_length, env.observation_space.flat_dim)
         assert samples['actions'].shape == (max_path_length,
                                             env.action_space.flat_dim)
         assert samples['rewards'].shape == (max_path_length, )
         assert samples['baselines'].shape == (max_path_length, )
         assert samples['returns'].shape == (max_path_length, )
         # there is only 1 path
         assert samples['lengths'].shape == (1, )
         for key, shape in policy.state_info_specs:
             assert samples['agent_infos'][key].shape == (max_path_length,
                                                          np.prod(shape))
         # DummyBoxEnv populates env_infos with a 'dummy' entry
         assert samples['env_infos']['dummy'].shape == (max_path_length, )
         assert isinstance(samples['average_return'], float)
Example #7
 def test_ppo_pendulum_lstm(self):
     """Test PPO with Pendulum environment and recurrent policy."""
     with LocalTFRunner(snapshot_config) as runner:
         env = GarageEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
         lstm_policy = GaussianLSTMPolicy(env_spec=env.spec)
         baseline = GaussianMLPBaseline(
             env_spec=env.spec,
             regressor_args=dict(hidden_sizes=(32, 32)),
         )
         algo = PPO(
             env_spec=env.spec,
             policy=lstm_policy,
             baseline=baseline,
             max_path_length=100,
             discount=0.99,
             gae_lambda=0.95,
             lr_clip_range=0.2,
             optimizer_args=dict(
                 batch_size=32,
                 max_epochs=10,
             ),
             stop_entropy_gradient=True,
             entropy_method='max',
             policy_ent_coeff=0.02,
             center_adv=False,
         )
         runner.setup(algo, env, sampler_cls=LocalSampler)
         last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
         assert last_avg_ret > 80
Example #8
    def test_is_pickleable(self):
        env = GarageEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, )))
        policy = GaussianLSTMPolicy(env_spec=env.spec,
                                    state_include_action=False)
        env.reset()
        obs = env.reset()
        with tf.compat.v1.variable_scope(
                'GaussianLSTMPolicy/GaussianLSTMModel', reuse=True):
            param = tf.compat.v1.get_variable(
                'dist_params/log_std_param/parameter')
        # Assign all ones to the log_std parameter.
        param.load(tf.ones_like(param).eval())

        output1 = self.sess.run(
            [policy.distribution.loc,
             policy.distribution.stddev()],
            feed_dict={policy.model.input: [[obs.flatten()], [obs.flatten()]]})

        p = pickle.dumps(policy)
        # yapf: disable
        with tf.compat.v1.Session(graph=tf.Graph()) as sess:
            policy_pickled = pickle.loads(p)
            output2 = sess.run(
                [
                    policy_pickled.distribution.loc,
                    policy_pickled.distribution.stddev()
                ],
                feed_dict={
                    policy_pickled.model.input: [[obs.flatten()],
                                                 [obs.flatten()]]
                })
            assert np.array_equal(output1, output2)
Example #9
def ppo_cmb(env, seed, log_dir):
    """Create a test of the continuous MLP baseline on PPO.

    Args:
        env (gym_env): Environment of the task.
        seed (int): Random seed for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to the CSV file containing the training results.

    """
    deterministic.set_seed(seed)
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True,
                                      intra_op_parallelism_threads=num_proc,
                                      inter_op_parallelism_threads=num_proc)
    sess = tf.compat.v1.Session(config=config)
    with LocalTFRunner(snapshot_config, sess=sess,
                       max_cpus=num_proc) as runner:
        env = TfEnv(normalize(env))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=policy_params['policy_hidden_sizes'],
            hidden_nonlinearity=policy_params['hidden_nonlinearity'],
        )

        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            regressor_args=baseline_params['regressor_args'],
        )

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=algo_params['max_path_length'],
                   discount=algo_params['discount'],
                   gae_lambda=algo_params['gae_lambda'],
                   lr_clip_range=algo_params['lr_clip_range'],
                   entropy_method=algo_params['entropy_method'],
                   policy_ent_coeff=algo_params['policy_ent_coeff'],
                   optimizer_args=algo_params['optimizer_args'],
                   center_adv=algo_params['center_adv'],
                   stop_entropy_gradient=True)

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo,
                     env,
                     sampler_args=dict(n_envs=algo_params['n_envs']))
        runner.train(n_epochs=algo_params['n_epochs'],
                     batch_size=algo_params['n_rollout_steps'])

        dowel_logger.remove_all()

        return tabular_log_file
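The snippet above reads the module-level names num_proc, policy_params, baseline_params, and algo_params, which are defined elsewhere in the benchmark script. A plausible sketch of their shape (the values here are illustrative assumptions, not from the source):

# Hypothetical module-level configuration assumed by ppo_cmb above.
import tensorflow as tf

num_proc = 4

policy_params = dict(
    policy_hidden_sizes=32,          # hidden_dim of the LSTM policy
    hidden_nonlinearity=tf.nn.tanh,
)

baseline_params = dict(regressor_args=dict(hidden_sizes=(64, 64)))

algo_params = dict(
    max_path_length=100,
    discount=0.99,
    gae_lambda=0.95,
    lr_clip_range=0.2,
    entropy_method='max',
    policy_ent_coeff=0.02,
    optimizer_args=dict(batch_size=32, max_epochs=10),
    center_adv=False,
    n_envs=8,
    n_epochs=5,
    n_rollout_steps=2048,
)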
Example #10
    def test_ppo_pendulum_recurrent_continuous_baseline(self):
        """Test PPO with Pendulum environment and recurrent policy."""
        with TFTrainer(snapshot_config) as trainer:
            env = normalize(
                GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))
            policy = GaussianLSTMPolicy(env_spec=env.spec)
            baseline = ContinuousMLPBaseline(
                env_spec=env.spec,
                hidden_sizes=(32, 32),
            )
            algo = PPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                discount=0.99,
                gae_lambda=0.95,
                lr_clip_range=0.2,
                optimizer_args=dict(
                    batch_size=32,
                    max_optimization_epochs=10,
                ),
                stop_entropy_gradient=True,
                entropy_method='max',
                policy_ent_coeff=0.02,
                center_adv=False,
            )
            trainer.setup(algo, env, sampler_cls=LocalSampler)
            last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 100

            env.close()
Example #11
    def test_ppo_pendulum_recurrent_continuous_baseline(self):
        """Test PPO with Pendulum environment and recurrent policy."""
        with LocalRunner() as runner:
            env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
            policy = GaussianLSTMPolicy(env_spec=env.spec)
            baseline = ContinuousMLPBaselineWithModel(
                env_spec=env.spec,
                regressor_args=dict(hidden_sizes=(32, 32)),
            )
            algo = PPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=100,
                discount=0.99,
                gae_lambda=0.95,
                lr_clip_range=0.2,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                ),
                stop_entropy_gradient=True,
                entropy_method='max',
                policy_ent_coeff=0.02,
                center_adv=False,
            )
            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > 100

            env.close()
Example #12
    def test_is_pickleable(self):
        env = TfEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, )))
        with mock.patch(('garage.tf.policies.'
                         'gaussian_lstm_policy.GaussianLSTMModel'),
                        new=SimpleGaussianLSTMModel):
            policy = GaussianLSTMPolicy(env_spec=env.spec,
                                        state_include_action=False)

        env.reset()
        obs = env.reset()

        with tf.compat.v1.variable_scope(
                'GaussianLSTMPolicy/GaussianLSTMModel', reuse=True):
            return_var = tf.compat.v1.get_variable('return_var')
        # Assign all ones to the return variable.
        return_var.load(tf.ones_like(return_var).eval())

        output1 = self.sess.run(
            policy.model.networks['default'].mean,
            feed_dict={policy.model.input: [[obs.flatten()], [obs.flatten()]]})

        p = pickle.dumps(policy)
        # yapf: disable
        with tf.compat.v1.Session(graph=tf.Graph()) as sess:
            policy_pickled = pickle.loads(p)
            output2 = sess.run(
                policy_pickled.model.networks['default'].mean,
                feed_dict={
                    policy_pickled.model.input: [[obs.flatten()],
                                                 [obs.flatten()]]
                })
            assert np.array_equal(output1, output2)
Example #13
def gaussian_lstm_policy(ctxt, env_id, seed):
    """Create Gaussian LSTM Policy on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
            use_trust_region=False,
            optimizer=FirstOrderOptimizer,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )

        trainer.setup(algo, env)
        trainer.train(n_epochs=5, batch_size=2048)
Example #14
def gaussian_lstm_policy(ctxt, env_id, seed):
    """Create Gaussian LSTM Policy on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = TfEnv(normalize(gym.make(env_id)))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
            output_nonlinearity=None,
        )

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                hidden_sizes=(64, 64),
                use_trust_region=False,
                optimizer=FirstOrderOptimizer,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                    tf_optimizer_args=dict(learning_rate=1e-3),
                ),
            ),
        )

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=5, batch_size=2048)
Example #15
    def test_build_state_include_action(self, obs_dim, action_dim, hidden_dim):
        env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        policy = GaussianLSTMPolicy(env_spec=env.spec,
                                    hidden_dim=hidden_dim,
                                    state_include_action=True)
        policy.reset(do_resets=None)
        obs = env.reset()

        state_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, None,
                                                      policy.input_dim))
        dist_sym = policy.build(state_input, name='dist_sym').dist

        concat_obs = np.concatenate([obs.flatten(), np.zeros(action_dim)])
        output1 = self.sess.run(
            [policy.distribution.loc],
            feed_dict={policy.model.input: [[concat_obs], [concat_obs]]})
        output2 = self.sess.run(
            [dist_sym.loc],
            feed_dict={state_input: [[concat_obs], [concat_obs]]})
        assert np.array_equal(output1, output2)
Example #16
    def test_get_action(self, mock_normal, obs_dim, action_dim, hidden_dim):
        mock_normal.return_value = 0.5
        env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        with mock.patch(('garage.tf.policies.'
                         'gaussian_lstm_policy.GaussianLSTMModel'),
                        new=SimpleGaussianLSTMModel):
            policy = GaussianLSTMPolicy(env_spec=env.spec,
                                        state_include_action=False)
        # With the normal sample mocked to 0.5 and constant mean/log_std
        # of 0.5: action = mean + exp(log_std) * sample.
        expected_action = np.full(action_dim, 0.5 * np.exp(0.5) + 0.5)

        policy.reset()
        obs = env.reset()

        action, agent_info = policy.get_action(obs)
        assert env.action_space.contains(action)
        assert np.allclose(action,
                           np.full(action_dim, expected_action),
                           atol=1e-6)

        expected_mean = np.full(action_dim, 0.5)
        assert np.array_equal(agent_info['mean'], expected_mean)
        expected_log_std = np.full(action_dim, 0.5)
        assert np.array_equal(agent_info['log_std'], expected_log_std)

        actions, agent_infos = policy.get_actions([obs])
        for action, mean, log_std in zip(actions, agent_infos['mean'],
                                         agent_infos['log_std']):
            assert env.action_space.contains(action)
            assert np.allclose(action,
                               np.full(action_dim, expected_action),
                               atol=1e-6)
            assert np.array_equal(mean, expected_mean)
            assert np.array_equal(log_std, expected_log_std)
Example #17
    def test_gaussian_lstm_policy(self):
        gaussian_lstm_policy = GaussianLSTMPolicy(env_spec=self.env,
                                                  hidden_dim=1,
                                                  state_include_action=False)
        self.sess.run(tf.compat.v1.global_variables_initializer())

        gaussian_lstm_policy.build(self.obs_var)
        gaussian_lstm_policy.reset()

        obs = self.env.observation_space.high
        assert gaussian_lstm_policy.get_action(obs)
Example #18
def continuous_mlp_baseline(ctxt, env_id, seed):
    """Create Continuous MLP Baseline on TF-PPO.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~Trainer` to create the :class:`~Snapshotter`.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=hyper_params['policy_hidden_sizes'],
            hidden_nonlinearity=hyper_params['hidden_nonlinearity'],
        )

        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
        )

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler,
                   discount=hyper_params['discount'],
                   gae_lambda=hyper_params['gae_lambda'],
                   lr_clip_range=hyper_params['lr_clip_range'],
                   entropy_method=hyper_params['entropy_method'],
                   policy_ent_coeff=hyper_params['policy_ent_coeff'],
                   optimizer_args=dict(
                       batch_size=32,
                       max_optimization_epochs=10,
                       learning_rate=1e-3,
                   ),
                   center_adv=hyper_params['center_adv'],
                   stop_entropy_gradient=True)

        trainer.setup(algo, env)
        trainer.train(n_epochs=hyper_params['n_epochs'],
                      batch_size=hyper_params['n_exploration_steps'])
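This benchmark function likewise reads a module-level hyper_params dict defined elsewhere in the script; a plausible sketch (illustrative assumptions, not values from the source):

# Hypothetical hyperparameter configuration assumed by
# continuous_mlp_baseline above.
import tensorflow as tf

hyper_params = dict(
    policy_hidden_sizes=32,          # hidden_dim of the LSTM policy
    hidden_nonlinearity=tf.nn.tanh,
    discount=0.99,
    gae_lambda=0.95,
    lr_clip_range=0.2,
    entropy_method='max',
    policy_ent_coeff=0.02,
    center_adv=False,
    n_epochs=5,
    n_exploration_steps=2048,
)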
Example #19
 def setup_method(self):
     super().setup_method()
     self.env = TfEnv(normalize(gym.make('InvertedDoublePendulum-v2')))
     self.policy = GaussianMLPPolicy(
         env_spec=self.env.spec,
         hidden_sizes=(64, 64),
         hidden_nonlinearity=tf.nn.tanh,
         output_nonlinearity=None,
     )
     self.recurrent_policy = GaussianLSTMPolicy(env_spec=self.env.spec)
     self.baseline = GaussianMLPBaseline(
         env_spec=self.env.spec,
         regressor_args=dict(hidden_sizes=(32, 32)),
     )
Example #20
    def test_get_action_state_include_action(self, obs_dim, action_dim,
                                             hidden_dim):
        env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        policy = GaussianLSTMPolicy(env_spec=env.spec,
                                    hidden_dim=hidden_dim,
                                    state_include_action=True)
        policy.reset()
        obs = env.reset()
        action, _ = policy.get_action(obs.flatten())
        assert env.action_space.contains(action)

        policy.reset()

        actions, _ = policy.get_actions([obs.flatten()])
        for action in actions:
            assert env.action_space.contains(action)
Example #21
    def test_is_pickleable(self):
        env = GarageEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, )))
        policy = GaussianLSTMPolicy(env_spec=env.spec,
                                    state_include_action=False)
        env.reset()
        obs = env.reset()
        with tf.compat.v1.variable_scope('GaussianLSTMPolicy', reuse=True):
            param = tf.compat.v1.get_variable(
                'dist_params/log_std_param/parameter')
        # Assign all ones to the log_std parameter.
        param.load(tf.ones_like(param).eval())

        state_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, None,
                                                      policy.input_dim))
        dist_sym = policy.build(state_input, name='dist_sym').dist
        output1 = self.sess.run(
            [dist_sym.loc, dist_sym.stddev()],
            feed_dict={state_input: [[obs.flatten()], [obs.flatten()]]})

        p = pickle.dumps(policy)
        # yapf: disable
        with tf.compat.v1.Session(graph=tf.Graph()) as sess:
            policy_pickled = pickle.loads(p)
            state_input = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, None,
                                                          policy.input_dim))
            dist_sym = policy_pickled.build(state_input, name='dist_sym').dist
            output2 = sess.run(
                [
                    dist_sym.loc,
                    dist_sym.stddev()
                ],
                feed_dict={
                    state_input: [[obs.flatten()], [obs.flatten()]]
                })
            assert np.array_equal(output1, output2)
Example #22
    def run_task(self, snapshot_config, *_):
        config = tf.compat.v1.ConfigProto(device_count={'GPU': 0},
                                          allow_soft_placement=True,
                                          intra_op_parallelism_threads=12,
                                          inter_op_parallelism_threads=12)
        sess = tf.compat.v1.Session(config=config)
        with LocalTFRunner(snapshot_config=snapshot_config,
                           sess=sess) as runner:
            env = gym.make(self._env)
            env = TfEnv(normalize(env))
            env.reset()
            policy = GaussianLSTMPolicy(
                env_spec=env.spec,
                hidden_dim=32,
                hidden_nonlinearity=tf.nn.tanh,
                output_nonlinearity=None,
            )

            baseline = GaussianMLPBaseline(
                env_spec=env.spec,
                regressor_args=dict(
                    hidden_sizes=(64, 64),
                    use_trust_region=False,
                    optimizer=FirstOrderOptimizer,
                    optimizer_args=dict(
                        batch_size=32,
                        max_epochs=10,
                        tf_optimizer_args=dict(learning_rate=1e-3),
                    ),
                ),
            )

            algo = PPO(
                env_spec=env.spec,
                policy=policy,
                baseline=baseline,
                max_path_length=100,
                discount=0.99,
                gae_lambda=0.95,
                lr_clip_range=0.2,
                policy_ent_coeff=0.0,
                optimizer_args=dict(
                    batch_size=32,
                    max_epochs=10,
                    tf_optimizer_args=dict(learning_rate=1e-3),
                ),
            )
            runner.setup(algo, env, sampler_args=dict(n_envs=12))
            runner.train(n_epochs=5, batch_size=2048)
Example #23
def continuous_mlp_baseline(ctxt, env_id, seed):
    """Create Continuous MLP Baseline on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(normalize(gym.make(env_id)))

        policy = GaussianLSTMPolicy(
            env_spec=env.spec,
            hidden_dim=hyper_params['policy_hidden_sizes'],
            hidden_nonlinearity=hyper_params['hidden_nonlinearity'],
        )

        baseline = ContinuousMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(64, 64),
        )

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=hyper_params['max_path_length'],
                   discount=hyper_params['discount'],
                   gae_lambda=hyper_params['gae_lambda'],
                   lr_clip_range=hyper_params['lr_clip_range'],
                   entropy_method=hyper_params['entropy_method'],
                   policy_ent_coeff=hyper_params['policy_ent_coeff'],
                   optimizer_args=dict(
                       batch_size=32,
                       max_epochs=10,
                       learning_rate=1e-3,
                   ),
                   center_adv=hyper_params['center_adv'],
                   stop_entropy_gradient=True)

        runner.setup(algo,
                     env,
                     sampler_args=dict(n_envs=hyper_params['n_envs']))
        runner.train(n_epochs=hyper_params['n_epochs'],
                     batch_size=hyper_params['n_rollout_steps'])
Example #24
 def setup_method(self):
     super().setup_method()
     self.env = normalize(GymEnv('InvertedDoublePendulum-v2'))
     self.policy = GaussianMLPPolicy(
         env_spec=self.env.spec,
         hidden_sizes=(64, 64),
         hidden_nonlinearity=tf.nn.tanh,
         output_nonlinearity=None,
     )
     self.lstm_policy = GaussianLSTMPolicy(env_spec=self.env.spec)
     self.gru_policy = GaussianGRUPolicy(env_spec=self.env.spec)
     self.baseline = GaussianMLPBaseline(
         env_spec=self.env.spec,
         hidden_sizes=(32, 32),
     )
Example #25
    def test_get_action_state_include_action(self, obs_dim, action_dim,
                                             hidden_dim):
        env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        obs_var = tf.compat.v1.placeholder(
            tf.float32,
            shape=[
                None, None,
                env.observation_space.flat_dim + np.prod(action_dim)
            ],
            name='obs')
        policy = GaussianLSTMPolicy(env_spec=env.spec,
                                    hidden_dim=hidden_dim,
                                    state_include_action=True)
        policy.build(obs_var)
        policy.reset()
        obs = env.reset()
        action, _ = policy.get_action(obs.flatten())
        assert env.action_space.contains(action)

        policy.reset()

        actions, _ = policy.get_actions([obs.flatten()])
        for action in actions:
            assert env.action_space.contains(action)
Example #26
 def test_policies(self):
     """Test the policies initialization."""
     box_env = TfEnv(DummyBoxEnv())
     discrete_env = TfEnv(DummyDiscreteEnv())
     categorical_gru_policy = CategoricalGRUPolicy(env_spec=discrete_env,
                                                   hidden_dim=1)
     categorical_lstm_policy = CategoricalLSTMPolicy(env_spec=discrete_env,
                                                     hidden_dim=1)
     categorical_mlp_policy = CategoricalMLPPolicy(env_spec=discrete_env,
                                                   hidden_sizes=(1, ))
     continuous_mlp_policy = ContinuousMLPPolicy(env_spec=box_env,
                                                 hidden_sizes=(1, ))
     deterministic_mlp_policy = DeterministicMLPPolicy(env_spec=box_env,
                                                       hidden_sizes=(1, ))
     gaussian_gru_policy = GaussianGRUPolicy(env_spec=box_env, hidden_dim=1)
     gaussian_lstm_policy = GaussianLSTMPolicy(env_spec=box_env,
                                               hidden_dim=1)
     gaussian_mlp_policy = GaussianMLPPolicy(env_spec=box_env,
                                             hidden_sizes=(1, ))
Example #27
    def test_get_action_dict_space(self):
        env = GymEnv(DummyDictEnv(obs_space_type='box', act_space_type='box'))
        policy = GaussianLSTMPolicy(env_spec=env.spec,
                                    hidden_dim=4,
                                    state_include_action=False)
        policy.reset(do_resets=None)
        obs = env.reset()[0]

        action, _ = policy.get_action(obs)
        assert env.action_space.contains(action)

        actions, _ = policy.get_actions([obs, obs])
        for action in actions:
            assert env.action_space.contains(action)
Example #28
def tf_gym_music(ctxt=None, seed=1):
    """Train a policy-gradient LSTM on the Music-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter,
            created by @wrap_experiment.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)

    with TFTrainer(snapshot_config=ctxt) as trainer:
        # MusicEnv and HeartMonitor are user-defined components; they are
        # not part of garage or gym.
        env = GymEnv(MusicEnv(monitor=HeartMonitor('DC:39:39:66:26:1F')),
                     max_episode_length=35)

        policy = GaussianLSTMPolicy(name='policy',
                                    env_spec=env.spec,
                                    hidden_dim=32)

        baseline = GaussianMLPBaseline(
            env_spec=env.spec,
            hidden_sizes=(32, 32),
        )

        sampler = LocalSampler(agents=policy,
                               envs=env,
                               max_episode_length=env.spec.max_episode_length,
                               is_tf_worker=False,
                               n_workers=1)

        algo = NPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   sampler=sampler)

        trainer.setup(algo, env)

        trainer.train(n_epochs=120, batch_size=1, store_episodes=True)
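MusicEnv and HeartMonitor above are user-defined components that are not shown in this listing. A purely hypothetical stand-in, only to illustrate the interfaces the snippet relies on:

# Hypothetical stubs: the real MusicEnv and HeartMonitor are user code.
import gym
import numpy as np


class HeartMonitor:
    """Placeholder for a Bluetooth heart-rate monitor client."""

    def __init__(self, mac_address):
        self.mac_address = mac_address

    def read(self):
        return 60.0  # constant dummy heart rate


class MusicEnv(gym.Env):
    """Placeholder environment driven by heart-rate readings."""

    observation_space = gym.spaces.Box(low=0.0, high=250.0, shape=(1, ),
                                       dtype=np.float32)
    action_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(1, ),
                                  dtype=np.float32)

    def __init__(self, monitor):
        self.monitor = monitor

    def reset(self):
        return np.array([self.monitor.read()], dtype=np.float32)

    def step(self, action):
        obs = np.array([self.monitor.read()], dtype=np.float32)
        return obs, 0.0, False, {}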
Example #29
 def setup_method(self):
     super().setup_method()
     self.env = normalize(
         GymEnv('InvertedDoublePendulum-v2', max_episode_length=100))
     self.policy = GaussianMLPPolicy(
         env_spec=self.env.spec,
         hidden_sizes=(64, 64),
         hidden_nonlinearity=tf.nn.tanh,
         output_nonlinearity=None,
     )
     self.lstm_policy = GaussianLSTMPolicy(env_spec=self.env.spec)
     self.gru_policy = GaussianGRUPolicy(env_spec=self.env.spec)
     self.baseline = GaussianMLPBaseline(
         env_spec=self.env.spec,
         hidden_sizes=(32, 32),
     )
     self.sampler = LocalSampler(
         agents=self.policy,
         envs=self.env,
         max_episode_length=self.env.spec.max_episode_length,
         is_tf_worker=True)
Example #30
    def test_build_state_not_include_action(self, obs_dim, action_dim,
                                            hidden_dim):
        env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        policy = GaussianLSTMPolicy(env_spec=env.spec,
                                    hidden_dim=hidden_dim,
                                    state_include_action=False)
        policy.reset(do_resets=None)
        obs = env.reset()

        state_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, None,
                                                      policy.input_dim))
        dist_sym = policy.build(state_input, name='dist_sym').dist
        dist_sym2 = policy.build(state_input, name='dist_sym2').dist

        output1 = self.sess.run(
            [dist_sym.loc],
            feed_dict={state_input: [[obs.flatten()], [obs.flatten()]]})
        output2 = self.sess.run(
            [dist_sym2.loc],
            feed_dict={state_input: [[obs.flatten()], [obs.flatten()]]})
        assert np.array_equal(output1, output2)