Example 1
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(normalize(gym.make('MemorizeDigits-v0')))
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      conv_filters=(32, 64, 64),
                                      conv_filter_sizes=(5, 3, 2),
                                      conv_strides=(4, 2, 1),
                                      conv_pad='VALID',
                                      hidden_sizes=(256, ))

        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       regressor_args=dict(
                                           num_filters=(32, 64, 64),
                                           filter_dims=(5, 3, 2),
                                           strides=(4, 2, 1),
                                           padding='VALID',
                                           hidden_sizes=(256, ),
                                           use_trust_region=True))

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   max_kl_step=0.01,
                   flatten_input=False)

        runner.setup(algo, env)
        runner.train(n_epochs=1000, batch_size=2048)
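
The run_task-style snippets on this page target older garage releases, where the convolutional stack is passed as separate conv_filters / conv_filter_sizes / conv_strides / conv_pad keywords; later examples pass a single filters tuple instead. A minimal sketch of the two spellings for the network above, with the mapping inferred from comparing Example 1 with Example 11 (treat it as an assumption, not the library's documented contract):

# Two kwarg spellings of the same three-layer CNN; `env` is the TfEnv built above.
from garage.tf.policies import CategoricalCNNPolicy

old_style = dict(conv_filters=(32, 64, 64),        # older garage keywords
                 conv_filter_sizes=(5, 3, 2),
                 conv_strides=(4, 2, 1),
                 conv_pad='VALID')

new_style = dict(filters=((32, (5, 5)),            # newer garage keywords:
                          (64, (3, 3)),            # one (num_filters, filter_shape)
                          (64, (2, 2))),           # pair per layer
                 strides=(4, 2, 1),
                 padding='VALID')

# policy = CategoricalCNNPolicy(env_spec=env.spec, hidden_sizes=(256,), **new_style)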
Example 2
    def test_dist_info_sym(self, obs_dim, action_dim, filter_dims,
                           filter_sizes, strides, padding, hidden_sizes):
        env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
        with mock.patch(('garage.tf.policies.'
                         'categorical_cnn_policy.MLPModel'),
                        new=SimpleMLPModel):
            with mock.patch(('garage.tf.policies.'
                             'categorical_cnn_policy.CNNModel'),
                            new=SimpleCNNModel):
                policy = CategoricalCNNPolicy(env_spec=env.spec,
                                              conv_filters=filter_dims,
                                              conv_filter_sizes=filter_sizes,
                                              conv_strides=strides,
                                              conv_pad=padding,
                                              hidden_sizes=hidden_sizes)

        env.reset()
        obs, _, _, _ = env.step(1)

        expected_prob = np.full(action_dim, 0.5)

        obs_dim = env.spec.observation_space.shape
        state_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, ) + obs_dim)
        dist1 = policy.dist_info_sym(state_input, name='policy2')

        prob = self.sess.run(dist1['prob'], feed_dict={state_input: [obs]})
        assert np.array_equal(prob[0], expected_prob)
Example 3
    def test_obs_is_image(self):
        env = TfEnv(DummyDiscretePixelEnv(), is_image=True)
        with mock.patch(('garage.tf.policies.'
                         'categorical_cnn_policy.CNNModel._build'),
                        autospec=True,
                        side_effect=CNNModel._build) as build:
            policy = CategoricalCNNPolicy(env_spec=env.spec,
                                          conv_filters=(32, ),
                                          conv_filter_sizes=(1, ),
                                          conv_strides=(1, ),
                                          conv_pad='VALID',
                                          hidden_sizes=(3, ))
            normalized_obs = build.call_args_list[0][0][1]

            input_ph = tf.compat.v1.get_default_graph().get_tensor_by_name(
                'Placeholder:0')

            fake_obs = [np.full(env.spec.observation_space.shape, 255)]
            assert (self.sess.run(normalized_obs,
                                  feed_dict={input_ph: fake_obs}) == 1.).all()

            obs_dim = env.spec.observation_space.shape
            state_input = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, ) + obs_dim)

            policy.dist_info_sym(state_input, name='another')
            normalized_obs = build.call_args_list[1][0][1]

            input_ph = tf.compat.v1.get_default_graph().get_tensor_by_name(
                'Placeholder_1:0')

            fake_obs = [np.full(env.spec.observation_space.shape, 255)]
            assert (self.sess.run(normalized_obs,
                                  feed_dict={state_input:
                                             fake_obs}) == 1.).all()
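
The assertions above imply that, with is_image=True, pixel observations are rescaled so that the maximum value 255 maps to 1.0. A minimal, self-contained sketch of that scaling (an inference from the test's 255 -> 1.0 check, not a copy of garage's implementation):

import numpy as np

frame = np.full((48, 48, 3), 255, dtype=np.float32)  # hypothetical full-brightness frame
normalized = frame / 255.0                            # the scaling the test asserts
assert (normalized == 1.0).all()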
Example 4
    def test_get_action(self, mock_rand, obs_dim, action_dim, filter_dims,
                        filter_sizes, strides, padding, hidden_sizes):
        mock_rand.return_value = 0
        env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
        with mock.patch(('garage.tf.policies.'
                         'categorical_cnn_policy.MLPModel'),
                        new=SimpleMLPModel):
            with mock.patch(('garage.tf.policies.'
                             'categorical_cnn_policy.CNNModel'),
                            new=SimpleCNNModel):
                policy = CategoricalCNNPolicy(env_spec=env.spec,
                                              conv_filters=filter_dims,
                                              conv_filter_sizes=filter_sizes,
                                              conv_strides=strides,
                                              conv_pad=padding,
                                              hidden_sizes=hidden_sizes)

        env.reset()
        obs, _, _, _ = env.step(1)

        action, prob = policy.get_action(obs)
        expected_prob = np.full(action_dim, 0.5)

        assert env.action_space.contains(action)
        assert action == 0
        assert np.array_equal(prob['prob'], expected_prob)

        actions, probs = policy.get_actions([obs, obs, obs])
        for action, prob in zip(actions, probs['prob']):
            assert env.action_space.contains(action)
            assert action == 0
            assert np.array_equal(prob, expected_prob)
Example 5
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(normalize(gym.make('CubeCrash-v0')))
        print('shape= ', env.spec.observation_space.shape)
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      conv_filters=(32, 64),
                                      conv_filter_sizes=(8, 4),
                                      conv_strides=(4, 2),
                                      conv_pad='VALID',
                                      hidden_sizes=(32, 32))

        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       regressor_args=dict(
                                           num_filters=(32, 64),
                                           filter_dims=(8, 4),
                                           strides=(4, 2),
                                           padding='VALID',
                                           hidden_sizes=(32, 32),
                                           use_trust_region=True))

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01,
                    flatten_input=False)

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=4000)
Example 6
    def test_is_pickleable(self):
        env = GarageEnv(DummyDiscretePixelEnv())
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=((3, (32, 32)), ),
                                      strides=(1, ),
                                      padding='SAME',
                                      hidden_sizes=(4, ))

        env.reset()
        obs, _, _, _ = env.step(1)

        with tf.compat.v1.variable_scope(
                'CategoricalCNNPolicy/CategoricalCNNModel', reuse=True):
            cnn_bias = tf.compat.v1.get_variable('CNNModel/cnn/h0/bias')
            bias = tf.compat.v1.get_variable('MLPModel/mlp/hidden_0/bias')

        cnn_bias.load(tf.ones_like(cnn_bias).eval())
        bias.load(tf.ones_like(bias).eval())

        output1 = self.sess.run(policy.distribution.probs,
                                feed_dict={policy.model.input: [[obs]]})
        p = pickle.dumps(policy)

        with tf.compat.v1.Session(graph=tf.Graph()) as sess:
            policy_pickled = pickle.loads(p)
            output2 = sess.run(policy_pickled.distribution.probs,
                               feed_dict={policy_pickled.model.input: [[obs]]})
            assert np.array_equal(output1, output2)
Example 7
    def test_is_pickleable(self):
        env = GymEnv(DummyDiscretePixelEnv())
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=((3, (32, 32)), ),
                                      strides=(1, ),
                                      padding='SAME',
                                      hidden_sizes=(4, ))

        env.reset()
        obs = env.step(1).observation

        with tf.compat.v1.variable_scope('CategoricalCNNPolicy', reuse=True):
            cnn_bias = tf.compat.v1.get_variable('CNNModel/cnn/h0/bias')
            bias = tf.compat.v1.get_variable('MLPModel/mlp/hidden_0/bias')

        cnn_bias.load(tf.ones_like(cnn_bias).eval())
        bias.load(tf.ones_like(bias).eval())

        state_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, None) +
                                               policy.input_dim)
        dist_sym = policy.build(state_input, name='dist_sym').dist
        output1 = self.sess.run(dist_sym.probs,
                                feed_dict={state_input: [[obs]]})
        p = pickle.dumps(policy)

        with tf.compat.v1.Session(graph=tf.Graph()) as sess:
            policy_pickled = pickle.loads(p)
            state_input = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, None) +
                                                   policy.input_dim)
            dist_sym = policy_pickled.build(state_input, name='dist_sym').dist
            output2 = sess.run(dist_sym.probs,
                               feed_dict={state_input: [[obs]]})
            assert np.array_equal(output1, output2)
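
Note the API difference between this snippet and Example 2: older garage builds an extra symbolic head with policy.dist_info_sym(...), while newer versions return a distribution object from policy.build(...). A sketch of the newer pattern, reusing the `policy` constructed in the snippet above; treating the leading (None, None) placeholder dimensions as batch and time is an assumption drawn from these tests:

import tensorflow as tf

# `policy` is the CategoricalCNNPolicy built in the test above.
state_input = tf.compat.v1.placeholder(tf.float32,
                                       shape=(None, None) + policy.input_dim)
dist = policy.build(state_input, name='dist_sym').dist  # distribution over actions
probs = dist.probs                                      # symbolic action probabilities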
Example 8
    def test_trpo_cnn_cubecrash(self):
        with TFTrainer(snapshot_config, sess=self.sess) as trainer:
            env = normalize(GymEnv('CubeCrash-v0', max_episode_length=100))

            policy = CategoricalCNNPolicy(env_spec=env.spec,
                                          filters=((32, (8, 8)), (64, (4, 4))),
                                          strides=(4, 2),
                                          padding='VALID',
                                          hidden_sizes=(32, 32))

            baseline = GaussianCNNBaseline(env_spec=env.spec,
                                           filters=((32, (8, 8)),
                                                    (64, (4, 4))),
                                           strides=(4, 2),
                                           padding='VALID',
                                           hidden_sizes=(32, 32),
                                           use_trust_region=True)

            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        discount=0.99,
                        gae_lambda=0.98,
                        max_kl_step=0.01,
                        policy_ent_coeff=0.0)

            trainer.setup(algo, env, sampler_cls=LocalSampler)
            last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > -1.5

            env.close()
Example 9
    def test_is_pickleable(self, mock_rand, obs_dim, action_dim):
        mock_rand.return_value = 0
        env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
        with mock.patch(('garage.tf.policies.'
                         'categorical_cnn_policy.MLPModel'),
                        new=SimpleMLPModel):
            with mock.patch(('garage.tf.policies.'
                             'categorical_cnn_policy.CNNModel'),
                            new=SimpleCNNModel):
                policy = CategoricalCNNPolicy(env_spec=env.spec,
                                              conv_filters=(32, ),
                                              conv_filter_sizes=(3, ),
                                              conv_strides=(1, ),
                                              conv_pad='SAME',
                                              hidden_sizes=(4, ))
        env.reset()
        obs, _, _, _ = env.step(1)

        with tf.compat.v1.variable_scope(
                'CategoricalCNNPolicy/Sequential/MLPModel', reuse=True):
            return_var = tf.compat.v1.get_variable('return_var')
        # assign all ones to the variable
        return_var.load(tf.ones_like(return_var).eval())
        output1 = self.sess.run(policy.model.outputs,
                                feed_dict={policy.model.input: [obs]})
        p = pickle.dumps(policy)

        with tf.compat.v1.Session(graph=tf.Graph()) as sess:
            policy_pickled = pickle.loads(p)
            output2 = sess.run(policy_pickled.model.outputs,
                               feed_dict={policy_pickled.model.input: [obs]})
            assert np.array_equal(output1, output2)
Example 10
    def test_trpo_cnn_cubecrash(self):
        with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
            env = TfEnv(normalize(gym.make('CubeCrash-v0')))

            policy = CategoricalCNNPolicy(env_spec=env.spec,
                                          conv_filters=(32, 64),
                                          conv_filter_sizes=(8, 4),
                                          conv_strides=(4, 2),
                                          conv_pad='VALID',
                                          hidden_sizes=(32, 32))

            baseline = GaussianCNNBaseline(env_spec=env.spec,
                                           regressor_args=dict(
                                               num_filters=(32, 64),
                                               filter_dims=(8, 4),
                                               strides=(4, 2),
                                               padding='VALID',
                                               hidden_sizes=(32, 32),
                                               use_trust_region=True))

            algo = TRPO(env_spec=env.spec,
                        policy=policy,
                        baseline=baseline,
                        max_path_length=100,
                        discount=0.99,
                        gae_lambda=0.98,
                        max_kl_step=0.01,
                        policy_ent_coeff=0.0,
                        flatten_input=False)

            runner.setup(algo, env)
            last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
            assert last_avg_ret > -0.9

            env.close()
Example 11
def ppo_memorize_digits(ctxt=None,
                        seed=1,
                        batch_size=4000,
                        max_episode_length=100):
    """Train PPO on MemorizeDigits-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        batch_size (int): Number of timesteps to use in each training step.
        max_episode_length (int): Max number of timesteps in an episode.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = normalize(
            GymEnv('MemorizeDigits-v0',
                   is_image=True,
                   max_episode_length=max_episode_length))
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=(
                                                  (32, (5, 5)),
                                                  (64, (3, 3)),
                                                  (64, (2, 2)),
                                              ),
                                      strides=(4, 2, 1),
                                      padding='VALID',
                                      hidden_sizes=(256, ))  # yapf: disable

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            filters=(
                (32, (5, 5)),
                (64, (3, 3)),
                (64, (2, 2)),
            ),
            strides=(4, 2, 1),
            padding='VALID',
            hidden_sizes=(256, ),
            use_trust_region=True)  # yapf: disable

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_optimization_epochs=10,
                       learning_rate=1e-3,
                   ))

        trainer.setup(algo, env)
        trainer.train(n_epochs=1000, batch_size=batch_size)
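
Functions like ppo_memorize_digits above are normally launched through garage's wrap_experiment decorator, which supplies the ctxt argument. A hedged sketch of such a launcher (the exact call pattern is an assumption; the real example scripts may add CLI handling):

from garage import wrap_experiment

# wrap_experiment injects the experiment context (ctxt) when the function is called.
ppo_memorize_digits = wrap_experiment(ppo_memorize_digits)
ppo_memorize_digits()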
Example 12
    def test_invalid_obs_shape(self, obs_dim, action_dim, filter_dims,
                               filter_sizes, strides, padding, hidden_sizes):
        env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
        with pytest.raises(ValueError):
            CategoricalCNNPolicy(env_spec=env.spec,
                                 conv_filters=filter_dims,
                                 conv_filter_sizes=filter_sizes,
                                 conv_strides=strides,
                                 conv_pad=padding,
                                 hidden_sizes=hidden_sizes)
Example 13
    def test_obs_unflattened(self, filters, strides, padding, hidden_sizes):
        self.policy = CategoricalCNNPolicy(env_spec=self.env.spec,
                                           filters=filters,
                                           strides=strides,
                                           padding=padding,
                                           hidden_sizes=hidden_sizes)
        obs = self.env.observation_space.sample()
        action, _ = self.policy.get_action(
            self.env.observation_space.flatten(obs))
        self.env.step(action)
Example 14
    def test_clone(self, filters, strides, padding, hidden_sizes):
        env = GarageEnv(DummyDiscretePixelEnv())
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=filters,
                                      strides=strides,
                                      padding=padding,
                                      hidden_sizes=hidden_sizes)

        policy_clone = policy.clone('CategoricalCNNPolicyClone')
        assert policy.env_spec == policy_clone.env_spec
Example 15
    def test_does_not_support_dict_obs_space(self, filters, strides, padding,
                                             hidden_sizes):
        """Test that policy raises error if passed a dict obs space."""
        env = GymEnv(DummyDictEnv(act_space_type='discrete'))
        with pytest.raises(ValueError):
            CategoricalCNNPolicy(env_spec=env.spec,
                                 filters=filters,
                                 strides=strides,
                                 padding=padding,
                                 hidden_sizes=hidden_sizes)
Example 16
def categorical_cnn_policy(ctxt, env_id, seed):
    """Create Categorical CNN Policy on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with LocalTFRunner(ctxt, max_cpus=12) as runner:
        env = TfEnv(normalize(gym.make(env_id)))

        policy = CategoricalCNNPolicy(
            env_spec=env.spec,
            conv_filters=hyper_params['conv_filters'],
            conv_filter_sizes=hyper_params['conv_filter_sizes'],
            conv_strides=hyper_params['conv_strides'],
            conv_pad=hyper_params['conv_pad'],
            hidden_sizes=hyper_params['hidden_sizes'])

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            regressor_args=dict(
                num_filters=hyper_params['conv_filters'],
                filter_dims=hyper_params['conv_filter_sizes'],
                strides=hyper_params['conv_strides'],
                padding=hyper_params['conv_pad'],
                hidden_sizes=hyper_params['hidden_sizes'],
                use_trust_region=hyper_params['use_trust_region']))

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
            flatten_input=False,
        )

        runner.setup(algo, env)
        runner.train(n_epochs=hyper_params['n_epochs'],
                     batch_size=hyper_params['batch_size'])
Example 17
def gaussian_cnn_baseline(ctxt, env_id, seed):
    """Create Gaussian CNN Baseline on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the
            snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=params['conv_filters'],
                                      strides=params['conv_strides'],
                                      padding=params['conv_pad'],
                                      hidden_sizes=params['hidden_sizes'])

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            filters=params['conv_filters'],
            strides=params['conv_strides'],
            padding=params['conv_pad'],
            hidden_sizes=params['hidden_sizes'],
            use_trust_region=params['use_trust_region'])

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )

        trainer.setup(algo, env)
        trainer.train(n_epochs=params['n_epochs'],
                      batch_size=params['batch_size'])
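
Newer examples construct a sampler explicitly and hand it to the algorithm (RaySampler here, LocalSampler in Example 8). If Ray is not wanted, a single-process variant can be sketched as follows, assuming garage.sampler.LocalSampler accepts the same constructor arguments:

from garage.sampler import LocalSampler

# Single-process sampling; mirrors the RaySampler construction above.
sampler = LocalSampler(agents=policy,
                       envs=env,
                       max_episode_length=env.spec.max_episode_length,
                       is_tf_worker=True)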
Example 18
    def test_clone(self, filters, strides, padding, hidden_sizes):
        env = GymEnv(DummyDiscretePixelEnv())
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=filters,
                                      strides=strides,
                                      padding=padding,
                                      hidden_sizes=hidden_sizes)

        policy_clone = policy.clone('CategoricalCNNPolicyClone')
        assert policy.env_spec == policy_clone.env_spec
        for cloned_param, param in zip(policy_clone.parameters.values(),
                                       policy.parameters.values()):
            assert np.array_equal(cloned_param, param)
Example 19
def ppo_memorize_digits(ctxt=None, seed=1, batch_size=4000):
    """Train PPO on MemorizeDigits-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        batch_size (int): Number of timesteps to use in each training step.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env = TfEnv(normalize(gym.make('MemorizeDigits-v0')), is_image=True)
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      num_filters=(32, 64, 64),
                                      filter_dims=(5, 3, 2),
                                      strides=(4, 2, 1),
                                      padding='VALID',
                                      hidden_sizes=(256, ))

        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       regressor_args=dict(
                                           num_filters=(32, 64, 64),
                                           filter_dims=(5, 3, 2),
                                           strides=(4, 2, 1),
                                           padding='VALID',
                                           hidden_sizes=(256, ),
                                           use_trust_region=True))

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   gae_lambda=0.95,
                   lr_clip_range=0.2,
                   policy_ent_coeff=0.0,
                   optimizer_args=dict(
                       batch_size=32,
                       max_epochs=10,
                       tf_optimizer_args=dict(learning_rate=1e-3),
                   ),
                   flatten_input=False)

        runner.setup(algo, env)
        runner.train(n_epochs=1000, batch_size=batch_size)
Example 20
    def test_get_action(self, filters, strides, padding, hidden_sizes):
        env = GymEnv(DummyDiscretePixelEnv())
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=filters,
                                      strides=strides,
                                      padding=padding,
                                      hidden_sizes=hidden_sizes)

        env.reset()
        obs = env.step(1).observation

        action, _ = policy.get_action(obs)
        assert env.action_space.contains(action)

        actions, _ = policy.get_actions([obs, obs, obs])
        for action in actions:
            assert env.action_space.contains(action)
Example 21
def trpo_cubecrash(ctxt=None, seed=1, max_episode_length=5, batch_size=4000):
    """Train TRPO with CubeCrash-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        max_episode_length (int): Maximum length of a single episode.
        batch_size (int): Number of timesteps to use in each training step.

    """
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = normalize(
            GymEnv('CubeCrash-v0', max_episode_length=max_episode_length))
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=((32, (8, 8)), (64, (4, 4))),
                                      strides=(4, 2),
                                      padding='VALID',
                                      hidden_sizes=(32, 32))

        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       filters=((32, (8, 8)), (64, (4, 4))),
                                       strides=(4, 2),
                                       padding='VALID',
                                       hidden_sizes=(32, 32),
                                       use_trust_region=True)

        sampler = RaySampler(agents=policy,
                             envs=env,
                             max_episode_length=env.spec.max_episode_length,
                             is_tf_worker=True)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        trainer.setup(algo, env)
        trainer.train(n_epochs=100, batch_size=batch_size)
Example 22
    def test_build(self, filters, strides, padding, hidden_sizes):
        env = GarageEnv(DummyDiscretePixelEnv())
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=filters,
                                      strides=strides,
                                      padding=padding,
                                      hidden_sizes=hidden_sizes)

        obs = env.reset()

        state_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, None) +
                                               policy.input_dim)
        dist_sym = policy.build(state_input, name='dist_sym').dist
        output1 = self.sess.run([policy.distribution.probs],
                                feed_dict={policy.model.input: [[obs]]})
        output2 = self.sess.run([dist_sym.probs],
                                feed_dict={state_input: [[obs]]})
        assert np.array_equal(output1, output2)
Example 23
def run_task(snapshot_config, variant_data, *_):
    """Run task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.

        variant_data (dict): Custom arguments for the task.

        *_ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = TfEnv(normalize(gym.make('MemorizeDigits-v0')), is_image=True)
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      conv_filters=(32, 64, 64),
                                      conv_filter_sizes=(5, 3, 2),
                                      conv_strides=(4, 2, 1),
                                      conv_pad='VALID',
                                      hidden_sizes=(256, ))

        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       regressor_args=dict(
                                           num_filters=(32, 64, 64),
                                           filter_dims=(5, 3, 2),
                                           strides=(4, 2, 1),
                                           padding='VALID',
                                           hidden_sizes=(256, ),
                                           use_trust_region=True))

        algo = PPO(env_spec=env.spec,
                   policy=policy,
                   baseline=baseline,
                   max_path_length=100,
                   discount=0.99,
                   max_kl_step=0.01,
                   flatten_input=False)

        runner.setup(algo, env)
        runner.train(n_epochs=1000, batch_size=variant_data['batch_size'])
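
In the older garage API, a run_task function with this (snapshot_config, variant_data, *_) signature is handed to an experiment launcher rather than called directly. A hypothetical launch, assuming the run_experiment helper that older garage releases provided (the exact kwargs are an assumption; variant_data is supplied by the launcher):

from garage.experiment import run_experiment

run_experiment(
    run_task,
    snapshot_mode='last',
    seed=1,
)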
Example 24
def trpo_cubecrash(ctxt=None, seed=1, batch_size=4000):
    """Train TRPO with CubeCrash-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        batch_size (int): Number of timesteps to use in each training step.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env = TfEnv(normalize(gym.make('CubeCrash-v0')))
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      conv_filters=(32, 64),
                                      conv_filter_sizes=(8, 4),
                                      conv_strides=(4, 2),
                                      conv_pad='VALID',
                                      hidden_sizes=(32, 32))

        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       regressor_args=dict(
                                           num_filters=(32, 64),
                                           filter_dims=(8, 4),
                                           strides=(4, 2),
                                           padding='VALID',
                                           hidden_sizes=(32, 32),
                                           use_trust_region=True))

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01,
                    flatten_input=False)

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=batch_size)
Example 25
    def test_get_action(self, filters, strides, padding, hidden_sizes):
        env = GarageEnv(DummyDiscretePixelEnv())
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=filters,
                                      strides=strides,
                                      padding=padding,
                                      hidden_sizes=hidden_sizes)
        obs_var = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, None) +
                                           env.observation_space.shape,
                                           name='obs')
        policy.build(obs_var)

        env.reset()
        obs, _, _, _ = env.step(1)

        action, _ = policy.get_action(obs)
        assert env.action_space.contains(action)

        actions, _ = policy.get_actions([obs, obs, obs])
        for action in actions:
            assert env.action_space.contains(action)
Example 26
    def test_is_pickleable(self):
        env = TfEnv(DummyDiscretePixelEnv())
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filter_dims=(32, ),
                                      num_filters=(3, ),
                                      strides=(1, ),
                                      padding='SAME',
                                      hidden_sizes=(4, ))
        obs_var = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, None) +
                                           env.observation_space.shape,
                                           name='obs')
        policy.build(obs_var)

        env.reset()
        obs, _, _, _ = env.step(1)

        with tf.compat.v1.variable_scope(
                'CategoricalCNNPolicy/CategoricalCNNModel', reuse=True):
            cnn_bias = tf.compat.v1.get_variable('CNNModel/cnn/h0/bias')
            bias = tf.compat.v1.get_variable('MLPModel/mlp/hidden_0/bias')

        cnn_bias.load(tf.ones_like(cnn_bias).eval())
        bias.load(tf.ones_like(bias).eval())

        output1 = self.sess.run(policy.distribution.probs,
                                feed_dict={policy.model.input: [[obs]]})
        p = pickle.dumps(policy)

        with tf.compat.v1.Session(graph=tf.Graph()) as sess:
            policy_pickled = pickle.loads(p)
            obs_var = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, None) +
                                               env.observation_space.shape,
                                               name='obs')
            policy_pickled.build(obs_var)
            output2 = sess.run(policy_pickled.distribution.probs,
                               feed_dict={policy_pickled.model.input: [[obs]]})
            assert np.array_equal(output1, output2)
Example 27
def trpo_cubecrash(ctxt=None, seed=1, batch_size=4000):
    """Train TRPO with CubeCrash-v0 environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.
        batch_size (int): Number of timesteps to use in each training step.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(normalize(gym.make('CubeCrash-v0')))
        policy = CategoricalCNNPolicy(env_spec=env.spec,
                                      filters=((32, (8, 8)), (64, (4, 4))),
                                      strides=(4, 2),
                                      padding='VALID',
                                      hidden_sizes=(32, 32))

        baseline = GaussianCNNBaseline(env_spec=env.spec,
                                       filters=((32, (8, 8)), (64, (4, 4))),
                                       strides=(4, 2),
                                       padding='VALID',
                                       hidden_sizes=(32, 32),
                                       use_trust_region=True)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=batch_size)
Example 28
    def test_dist_info(self, obs_dim, action_dim, filter_dims, filter_sizes,
                       strides, padding, hidden_sizes):
        env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
        with mock.patch(('garage.tf.policies.'
                         'categorical_cnn_policy.MLPModel'),
                        new=SimpleMLPModel):
            with mock.patch(('garage.tf.policies.'
                             'categorical_cnn_policy.CNNModel'),
                            new=SimpleCNNModel):
                policy = CategoricalCNNPolicy(env_spec=env.spec,
                                              conv_filters=filter_dims,
                                              conv_filter_sizes=filter_sizes,
                                              conv_strides=strides,
                                              conv_pad=padding,
                                              hidden_sizes=hidden_sizes)

        env.reset()
        obs, _, _, _ = env.step(1)

        expected_prob = np.full(action_dim, 0.5)

        policy_probs = policy.dist_info([obs])
        assert np.array_equal(policy_probs['prob'][0], expected_prob)
Example 29
def run_garage(env, seed, log_dir):
    """Create the garage model and run training.

    Replace PPO with the algorithm you want to benchmark.

    Args:
        env: Environment of the task.
        seed (int): Random seed for the trial.
        log_dir (str): Path of the log directory.

    Returns:
        str: Path to the tabular log file (progress.csv).

    """
    deterministic.set_seed(seed)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=12,
                            inter_op_parallelism_threads=12)
    sess = tf.Session(config=config)

    with LocalTFRunner(snapshot_config, sess=sess, max_cpus=12) as runner:
        env = TfEnv(normalize(env))

        policy = CategoricalCNNPolicy(
            env_spec=env.spec,
            conv_filters=params['conv_filters'],
            conv_filter_sizes=params['conv_filter_sizes'],
            conv_strides=params['conv_strides'],
            conv_pad=params['conv_pad'],
            hidden_sizes=params['hidden_sizes'])

        baseline = GaussianCNNBaseline(
            env_spec=env.spec,
            regressor_args=dict(num_filters=params['conv_filters'],
                                filter_dims=params['conv_filter_sizes'],
                                strides=params['conv_strides'],
                                padding=params['conv_pad'],
                                hidden_sizes=params['hidden_sizes'],
                                use_trust_region=params['use_trust_region']))

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            flatten_input=False,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env)
        runner.train(n_epochs=params['n_epochs'],
                     batch_size=params['batch_size'])

        dowel_logger.remove_all()

        return tabular_log_file
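
run_garage is a benchmark helper: it trains PPO on the given environment, writes dowel logs under log_dir, and returns the path of the progress CSV. An illustrative call (the environment id and output path are assumptions):

import gym

env = gym.make('CubeCrash-v0')
csv_path = run_garage(env, seed=1, log_dir='/tmp/categorical_cnn_benchmark')
print('Tabular results written to', csv_path)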