Example #1
    def test_is_pickleable(self):
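        """Test that baseline predictions are unchanged after pickling."""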
        box_env = TfEnv(DummyBoxEnv(obs_dim=(1, )))
        with mock.patch(('garage.tf.baselines.'
                         'continuous_mlp_baseline_with_model.'
                         'ContinuousMLPRegressorWithModel'),
                        new=SimpleMLPRegressor):
            cmb = ContinuousMLPBaselineWithModel(env_spec=box_env.spec)
        obs = {'observations': [np.full(1, 1), np.full(1, 1)]}

        with tf.compat.v1.variable_scope('ContinuousMLPBaselineWithModel',
                                         reuse=True):
            return_var = tf.compat.v1.get_variable('SimpleMLPModel/return_var')
        return_var.load(1.0)

        prediction = cmb.predict(obs)

        h = pickle.dumps(cmb)

        with tf.compat.v1.Session(graph=tf.Graph()):
            cmb_pickled = pickle.loads(h)
            prediction2 = cmb_pickled.predict(obs)

            assert np.array_equal(prediction, prediction2)
    def test_is_pickleable(self):
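        """Test that policy outputs are unchanged after pickling."""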
        env = GarageEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, )))
        policy = GaussianLSTMPolicy(env_spec=env.spec,
                                    state_include_action=False)
        env.reset()
        obs = env.reset()
        with tf.compat.v1.variable_scope('GaussianLSTMPolicy', reuse=True):
            param = tf.compat.v1.get_variable(
                'dist_params/log_std_param/parameter')
        # assign all parameter values to one
        param.load(tf.ones_like(param).eval())

        state_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, None,
                                                      policy.input_dim))
        dist_sym = policy.build(state_input, name='dist_sym').dist
        output1 = self.sess.run(
            [dist_sym.loc, dist_sym.stddev()],
            feed_dict={state_input: [[obs.flatten()], [obs.flatten()]]})

        p = pickle.dumps(policy)
        # yapf: disable
        with tf.compat.v1.Session(graph=tf.Graph()) as sess:
            policy_pickled = pickle.loads(p)
            state_input = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, None,
                                                          policy.input_dim))
            dist_sym = policy_pickled.build(state_input, name='dist_sym').dist
            output2 = sess.run(
                [
                    dist_sym.loc,
                    dist_sym.stddev()
                ],
                feed_dict={
                    state_input: [[obs.flatten()], [obs.flatten()]]
                })
            assert np.array_equal(output1, output2)
    def test_is_pickleable(self, obs_dim, action_dim):
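        """Test that the pickled policy produces identical outputs."""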
        env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        obs_var = tf.compat.v1.placeholder(
            tf.float32,
            shape=[None, None, env.observation_space.flat_dim],
            name='obs')
        policy = GaussianMLPPolicy(env_spec=env.spec)

        policy.build(obs_var)
        obs = env.reset()

        with tf.compat.v1.variable_scope('GaussianMLPPolicy/GaussianMLPModel',
                                         reuse=True):
            bias = tf.compat.v1.get_variable(
                'dist_params/mean_network/hidden_0/bias')
        # assign all bias values to one
        bias.load(tf.ones_like(bias).eval())
        output1 = self.sess.run(
            [policy.distribution.loc,
             policy.distribution.stddev()],
            feed_dict={policy.model.input: [[obs.flatten()]]})

        p = pickle.dumps(policy)
        with tf.compat.v1.Session(graph=tf.Graph()) as sess:
            obs_var = tf.compat.v1.placeholder(
                tf.float32,
                shape=[None, None, env.observation_space.flat_dim],
                name='obs')
            policy_pickled = pickle.loads(p)
            policy_pickled.build(obs_var)
            output2 = sess.run(
                [
                    policy_pickled.distribution.loc,
                    policy_pickled.distribution.stddev()
                ],
                feed_dict={policy_pickled.model.input: [[obs.flatten()]]})
            assert np.array_equal(output1, output2)
    def setUp(self, mock_rand):
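        """Set up paired policies with matching parameters for comparison."""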
        mock_rand.return_value = 0.5
        super().setUp()
        self.box_env = TfEnv(DummyBoxEnv())
        self.policy1 = GaussianMLPPolicy(env_spec=self.box_env,
                                         init_std=1.0,
                                         name='P1')
        self.policy2 = GaussianMLPPolicy(env_spec=self.box_env,
                                         init_std=1.2,
                                         name='P2')
        self.policy3 = GaussianMLPPolicyWithModel(env_spec=self.box_env,
                                                  init_std=1.0,
                                                  name='P3')
        self.policy4 = GaussianMLPPolicyWithModel(env_spec=self.box_env,
                                                  init_std=1.2,
                                                  name='P4')

        self.sess.run(tf.global_variables_initializer())

        for a, b in zip(self.policy3.get_params(), self.policy1.get_params()):
            self.sess.run(tf.assign(b, a))
        for a, b in zip(self.policy4.get_params(), self.policy2.get_params()):
            self.sess.run(tf.assign(b, a))

        self.obs = [self.box_env.reset()]
        self.obs_ph = tf.placeholder(
            tf.float32, shape=(None, self.box_env.observation_space.flat_dim))
        self.action_ph = tf.placeholder(
            tf.float32, shape=(None, self.box_env.action_space.flat_dim))

        self.dist1_sym = self.policy1.dist_info_sym(self.obs_ph, name='p1_sym')
        self.dist2_sym = self.policy2.dist_info_sym(self.obs_ph, name='p2_sym')
        self.dist3_sym = self.policy3.dist_info_sym(self.obs_ph, name='p3_sym')
        self.dist4_sym = self.policy4.dist_info_sym(self.obs_ph, name='p4_sym')

        assert self.policy1.vectorized == self.policy2.vectorized
        assert self.policy3.vectorized == self.policy4.vectorized
Example #5
    def test_get_action(self, hidden_sizes):
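        """Test get_action with flattened and unflattened observations."""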
        unflat_dim = (2, 2)
        env_spec = GymEnv(DummyBoxEnv(obs_dim=unflat_dim))
        obs_dim = env_spec.observation_space.flat_dim
        act_dim = env_spec.action_space.flat_dim
        obs = torch.ones(obs_dim, dtype=torch.float32)
        obs_unflat = torch.ones(unflat_dim, dtype=torch.float32)
        obs_np = np.ones(obs_dim, dtype=np.float32)
        obs_np_unflat = np.ones(unflat_dim, dtype=np.float32)
        policy = DeterministicMLPPolicy(env_spec=env_spec,
                                        hidden_nonlinearity=None,
                                        hidden_sizes=hidden_sizes,
                                        hidden_w_init=nn.init.ones_,
                                        output_w_init=nn.init.ones_)

        expected_output = np.full(act_dim,
                                  fill_value=obs_dim * np.prod(hidden_sizes),
                                  dtype=np.float32)
        assert np.array_equal(policy.get_action(obs)[0], expected_output)
        assert np.array_equal(
            policy.get_action(obs_unflat)[0], expected_output)
        assert np.array_equal(policy.get_action(obs_np)[0], expected_output)
        assert np.array_equal(
            policy.get_action(obs_np_unflat)[0], expected_output)
    def test_get_action_state_include_action(self, obs_dim, action_dim,
                                             hidden_dim):
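        """Test get_action and get_actions with state_include_action=True."""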
        env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        obs_var = tf.compat.v1.placeholder(
            tf.float32,
            shape=[
                None, None,
                env.observation_space.flat_dim + np.prod(action_dim)
            ],
            name='obs')
        policy = GaussianLSTMPolicy2(env_spec=env.spec,
                                     hidden_dim=hidden_dim,
                                     state_include_action=True)
        policy.build(obs_var)
        policy.reset()
        obs = env.reset()
        action, _ = policy.get_action(obs.flatten())
        assert env.action_space.contains(action)

        policy.reset()

        actions, _ = policy.get_actions([obs.flatten()])
        for action in actions:
            assert env.action_space.contains(action)
Example #7
    def test_is_pickleable(self, batch_size, hidden_sizes):
        """Test if policy is unchanged after pickling."""
        env_spec = GymEnv(DummyBoxEnv())
        obs_dim = env_spec.observation_space.flat_dim
        obs = torch.ones([batch_size, obs_dim], dtype=torch.float32)
        init_std = 2.

        policy = TanhGaussianMLPPolicy(env_spec=env_spec,
                                       hidden_sizes=hidden_sizes,
                                       init_std=init_std,
                                       hidden_nonlinearity=None,
                                       std_parameterization='exp',
                                       hidden_w_init=nn.init.ones_,
                                       output_w_init=nn.init.ones_)

        output1_action, output1_prob = policy.get_actions(obs)

        p = pickle.dumps(policy)
        policy_pickled = pickle.loads(p)
        output2_action, output2_prob = policy_pickled.get_actions(obs)
        assert np.allclose(output2_prob['mean'],
                           output1_prob['mean'],
                           rtol=1e-3)
        assert output1_action.shape == output2_action.shape
    def test_get_action_state_include_action(self, mock_normal, obs_dim,
                                             action_dim, hidden_dim):
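        """Test get_action outputs when state_include_action=True."""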
        mock_normal.return_value = 0.5
        env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        with mock.patch(('metarl.tf.policies.'
                         'gaussian_lstm_policy.GaussianLSTMModel'),
                        new=SimpleGaussianLSTMModel):
            policy = GaussianLSTMPolicy(env_spec=env.spec,
                                        state_include_action=True)

        policy.reset()
        obs = env.reset()
        expected_action = np.full(action_dim, 0.5 * np.exp(0.5) + 0.5)
        action, agent_info = policy.get_action(obs)
        assert env.action_space.contains(action)
        assert np.allclose(action, expected_action, atol=1e-6)
        expected_mean = np.full(action_dim, 0.5)
        assert np.array_equal(agent_info['mean'], expected_mean)
        expected_log_std = np.full(action_dim, 0.5)
        assert np.array_equal(agent_info['log_std'], expected_log_std)
        expected_prev_action = np.full(action_dim, 0)
        assert np.array_equal(agent_info['prev_action'], expected_prev_action)

        policy.reset()

        actions, agent_infos = policy.get_actions([obs])
        for action, mean, log_std, prev_action in zip(
                actions, agent_infos['mean'], agent_infos['log_std'],
                agent_infos['prev_action']):
            assert env.action_space.contains(action)
            assert np.allclose(action,
                               np.full(action_dim, expected_action),
                               atol=1e-6)
            assert np.array_equal(mean, expected_mean)
            assert np.array_equal(log_std, expected_log_std)
            assert np.array_equal(prev_action, expected_prev_action)
    def test_dist_info_sym_wrong_input(self):
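        """Test that mismatched observation and prev_action batch sizes fail."""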
        env = TfEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, )))

        obs_ph = tf.compat.v1.placeholder(
            tf.float32, shape=(None, None, env.observation_space.flat_dim))

        with mock.patch(('metarl.tf.policies.'
                         'gaussian_lstm_policy.GaussianLSTMModel'),
                        new=SimpleGaussianLSTMModel):
            policy = GaussianLSTMPolicy(env_spec=env.spec,
                                        state_include_action=True)

            policy.reset()
            obs = env.reset()

            policy.dist_info_sym(
                obs_var=obs_ph,
                state_info_vars={'prev_action': np.zeros((3, 1, 1))},
                name='p2_sym')
        # observation batch size = 2 but prev_action batch size = 3
        with pytest.raises(tf.errors.InvalidArgumentError):
            self.sess.run(
                policy.model.networks['p2_sym'].input,
                feed_dict={obs_ph: [[obs.flatten()], [obs.flatten()]]})
Example #10
    def test_get_action_sym(self, obs_dim, action_dim):
        """Test get_action_sym method"""
        env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        with mock.patch(('garage.tf.policies.'
                         'continuous_mlp_policy.MLPModel'),
                        new=SimpleMLPModel):
            policy = ContinuousMLPPolicy(env_spec=env.spec)

        env.reset()
        obs, _, _, _ = env.step(1)

        obs_dim = env.spec.observation_space.flat_dim
        state_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, obs_dim))
        action_sym = policy.get_action_sym(state_input, name='action_sym')

        expected_action = np.full(action_dim, 0.5)

        action = self.sess.run(action_sym,
                               feed_dict={state_input: [obs.flatten()]})
        action = policy.action_space.unflatten(action)

        assert np.array_equal(action, expected_action)
        assert env.action_space.contains(action)
Example #11
    def test_get_action(self, obs_dim, action_dim):
        """Test get_action method"""
        env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        with mock.patch(('garage.tf.policies.'
                         'continuous_mlp_policy.MLPModel'),
                        new=SimpleMLPModel):
            policy = ContinuousMLPPolicy(env_spec=env.spec)

        env.reset()
        obs, _, _, _ = env.step(1)

        action, _ = policy.get_action(obs.flatten())

        expected_action = np.full(action_dim, 0.5)

        assert env.action_space.contains(action)
        assert np.array_equal(action, expected_action)

        actions, _ = policy.get_actions(
            [obs.flatten(), obs.flatten(),
             obs.flatten()])
        for action in actions:
            assert env.action_space.contains(action)
            assert np.array_equal(action, expected_action)
    def test_dist_info_sym(self, obs_dim, action_dim, hidden_dim):
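        """Test the output of dist_info_sym."""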
        env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))

        obs_ph = tf.placeholder(
            tf.float32, shape=(None, None, env.observation_space.flat_dim))

        with mock.patch(('garage.tf.policies.'
                         'gaussian_gru_policy_with_model.GaussianGRUModel'),
                        new=SimpleGaussianGRUModel):
            policy = GaussianGRUPolicyWithModel(
                env_spec=env.spec, state_include_action=False)

            policy.reset()
            obs = env.reset()

            dist_sym = policy.dist_info_sym(
                obs_var=obs_ph, state_info_vars=None, name='p2_sym')

        dist = self.sess.run(
            dist_sym, feed_dict={obs_ph: [[obs.flatten()], [obs.flatten()]]})

        assert np.array_equal(dist['mean'], np.full((2, 1) + action_dim, 0.5))
        assert np.array_equal(dist['log_std'], np.full((2, 1) + action_dim,
                                                       0.5))
    def test_module(self, reward_dim, latent_dim, hidden_sizes, updates):
        """Test all methods."""
        env_spec = TfEnv(DummyBoxEnv())
        latent_space = akro.Box(low=-1,
                                high=1,
                                shape=(latent_dim, ),
                                dtype=np.float32)

        # add latent space to observation space to create a new space
        augmented_obs_space = akro.Tuple(
            (env_spec.observation_space, latent_space))
        augmented_env_spec = EnvSpec(augmented_obs_space,
                                     env_spec.action_space)

        obs_dim = int(np.prod(env_spec.observation_space.shape))
        action_dim = int(np.prod(env_spec.action_space.shape))
        encoder_input_dim = obs_dim + action_dim + reward_dim
        encoder_output_dim = latent_dim * 2
        encoder_hidden_sizes = (3, 2, encoder_output_dim)

        context_encoder = RecurrentEncoder(input_dim=encoder_input_dim,
                                           output_dim=encoder_output_dim,
                                           hidden_nonlinearity=None,
                                           hidden_sizes=encoder_hidden_sizes,
                                           hidden_w_init=nn.init.ones_,
                                           output_w_init=nn.init.ones_)

        # policy needs to be able to accept obs_dim + latent_dim as input dim
        policy = GaussianMLPPolicy(env_spec=augmented_env_spec,
                                   hidden_sizes=hidden_sizes,
                                   hidden_nonlinearity=F.relu,
                                   output_nonlinearity=None)

        module = ContextConditionedPolicy(latent_dim=latent_dim,
                                          context_encoder=context_encoder,
                                          policy=policy,
                                          use_ib=True,
                                          use_next_obs=False)

        expected_shape = [1, latent_dim]
        module.reset_belief()
        assert torch.all(torch.eq(module.z_means, torch.zeros(expected_shape)))
        assert torch.all(torch.eq(module.z_vars, torch.ones(expected_shape)))

        module.sample_from_belief()
        assert all([a == b for a, b in zip(module.z.shape, expected_shape)])

        module.detach_z()
        assert module.z.requires_grad is False

        context_dict = {}
        context_dict['observation'] = np.ones(obs_dim)
        context_dict['action'] = np.ones(action_dim)
        context_dict['reward'] = np.ones(reward_dim)
        context_dict['next_observation'] = np.ones(obs_dim)

        for _ in range(updates):
            module.update_context(context_dict)
        assert torch.all(
            torch.eq(module._context, torch.ones(updates, encoder_input_dim)))

        context = torch.randn(1, 1, encoder_input_dim)
        module.infer_posterior(context)
        assert all([a == b for a, b in zip(module.z.shape, expected_shape)])

        t, b = 1, 2
        obs = torch.randn((t, b, obs_dim), dtype=torch.float32)
        policy_output, task_z_out = module.forward(obs, context)
        assert policy_output is not None
        expected_shape = [b, latent_dim]
        assert all([a == b for a, b in zip(task_z_out.shape, expected_shape)])

        obs = torch.randn(obs_dim)
        action = module.get_action(obs)
        assert len(action) == action_dim

        kl_div = module.compute_kl_div()
        assert kl_div != 0
    def test_invalid_obs_shape(self, obs_dim):
        box_env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim))
        with pytest.raises(ValueError):
            GaussianCNNBaseline(env_spec=box_env.spec)
Example #15
    def test_invalid_env(self):
        env = TfEnv(DummyBoxEnv())
        with pytest.raises(ValueError):
            CategoricalMLPPolicy2(env_spec=env.spec)
Example #16
    def test_baseline(self):
        """Test the baseline initialization."""
        box_env = TfEnv(DummyBoxEnv())
        deterministic_mlp_baseline = DeterministicMLPBaseline(env_spec=box_env)
        gaussian_mlp_baseline = GaussianMLPBaseline(env_spec=box_env)
Example #17
    def test_state_info_specs_with_state_include_action(self):
        env = GarageEnv(DummyBoxEnv(obs_dim=(4, ), action_dim=(4, )))
        policy = GaussianLSTMPolicy(env_spec=env.spec,
                                    state_include_action=True)
        assert policy.state_info_specs == [('prev_action', (4, ))]
Example #18
    def setup_method(self):
        self.env = GarageEnv(DummyBoxEnv(obs_dim=(4, 4), action_dim=(2, 2)))
        self.policy = DummyPolicy(self.env.spec)
Example #19
    def test_clone(self):
        env = TfEnv(DummyBoxEnv(obs_dim=(10, ), action_dim=(4, )))
        policy = GaussianMLPPolicy(env_spec=env.spec)
        policy_clone = policy.clone('GaussianMLPPolicyClone')
        assert policy.env_spec == policy_clone.env_spec
    def test_clone(self, obs_dim, action_dim, hidden_sizes):
        env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        qf = ContinuousMLPQFunction(env_spec=env.spec,
                                    hidden_sizes=hidden_sizes)
        qf_clone = qf.clone('another_qf')
        assert qf_clone._hidden_sizes == qf._hidden_sizes
Example #21
    def setUp(self):
        super().setUp()
        self.env = TfEnv(DummyBoxEnv())
Example #22
    def setup_method(self):
        with mock.patch('tensorflow.random.normal') as mock_rand:
            mock_rand.return_value = 0.5
            super().setup_method()
            env = TfEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, )))
            self.default_initializer = tf.constant_initializer(1)
            self.default_hidden_nonlinearity = tf.nn.tanh
            self.default_recurrent_nonlinearity = tf.nn.sigmoid
            self.default_output_nonlinearity = None
            self.time_step = 1

            self.policy1 = GaussianGRUPolicy(
                env_spec=env.spec,
                hidden_dim=4,
                hidden_nonlinearity=self.default_hidden_nonlinearity,
                recurrent_nonlinearity=self.default_recurrent_nonlinearity,
                recurrent_w_x_init=self.default_initializer,
                recurrent_w_h_init=self.default_initializer,
                output_nonlinearity=self.default_output_nonlinearity,
                output_w_init=self.default_initializer,
                state_include_action=True,
                name='P1')
            self.policy2 = GaussianGRUPolicy(
                env_spec=env.spec,
                hidden_dim=4,
                hidden_nonlinearity=self.default_hidden_nonlinearity,
                recurrent_nonlinearity=self.default_recurrent_nonlinearity,
                recurrent_w_x_init=self.default_initializer,
                recurrent_w_h_init=self.default_initializer,
                output_nonlinearity=self.default_output_nonlinearity,
                output_w_init=tf.constant_initializer(2),
                state_include_action=True,
                name='P2')

            self.sess.run(tf.compat.v1.global_variables_initializer())

            self.policy3 = GaussianGRUPolicyWithModel(
                env_spec=env.spec,
                hidden_dim=4,
                hidden_nonlinearity=self.default_hidden_nonlinearity,
                hidden_w_init=self.default_initializer,
                recurrent_nonlinearity=self.default_recurrent_nonlinearity,
                recurrent_w_init=self.default_initializer,
                output_nonlinearity=self.default_output_nonlinearity,
                output_w_init=self.default_initializer,
                state_include_action=True,
                name='P3')
            self.policy4 = GaussianGRUPolicyWithModel(
                env_spec=env.spec,
                hidden_dim=4,
                hidden_nonlinearity=self.default_hidden_nonlinearity,
                hidden_w_init=self.default_initializer,
                recurrent_nonlinearity=self.default_recurrent_nonlinearity,
                recurrent_w_init=self.default_initializer,
                output_nonlinearity=self.default_output_nonlinearity,
                output_w_init=tf.constant_initializer(2),
                state_include_action=True,
                name='P4')

            self.policy1.reset()
            self.policy2.reset()
            self.policy3.reset()
            self.policy4.reset()
            self.obs = [env.reset()]
            self.obs = np.concatenate(
                [self.obs for _ in range(self.time_step)], axis=0)

            self.obs_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(None, None, env.observation_space.flat_dim))
            self.action_ph = tf.compat.v1.placeholder(
                tf.float32, shape=(None, None, env.action_space.flat_dim))

            self.dist1_sym = self.policy1.dist_info_sym(
                obs_var=self.obs_ph,
                state_info_vars={
                    'prev_action': np.zeros((2, self.time_step, 1))
                },
                name='p1_sym')
            self.dist2_sym = self.policy2.dist_info_sym(
                obs_var=self.obs_ph,
                state_info_vars={
                    'prev_action': np.zeros((2, self.time_step, 1))
                },
                name='p2_sym')
            self.dist3_sym = self.policy3.dist_info_sym(
                obs_var=self.obs_ph,
                state_info_vars={
                    'prev_action': np.zeros((2, self.time_step, 1))
                },
                name='p3_sym')
            self.dist4_sym = self.policy4.dist_info_sym(
                obs_var=self.obs_ph,
                state_info_vars={
                    'prev_action': np.zeros((2, self.time_step, 1))
                },
                name='p4_sym')
Example #23
def test_methods():
    """Test PEARLWorker methods."""
    env_spec = GarageEnv(DummyBoxEnv())
    latent_dim = 5
    latent_space = akro.Box(low=-1,
                            high=1,
                            shape=(latent_dim, ),
                            dtype=np.float32)

    # add latent space to observation space to create a new space
    augmented_obs_space = akro.Tuple(
        (env_spec.observation_space, latent_space))
    augmented_env_spec = EnvSpec(augmented_obs_space, env_spec.action_space)

    obs_dim = int(np.prod(env_spec.observation_space.shape))
    action_dim = int(np.prod(env_spec.action_space.shape))
    reward_dim = 1
    encoder_input_dim = obs_dim + action_dim + reward_dim
    encoder_output_dim = latent_dim * 2
    encoder_hidden_sizes = (3, 2, encoder_output_dim)

    context_encoder = MLPEncoder(input_dim=encoder_input_dim,
                                 output_dim=encoder_output_dim,
                                 hidden_nonlinearity=None,
                                 hidden_sizes=encoder_hidden_sizes,
                                 hidden_w_init=nn.init.ones_,
                                 output_w_init=nn.init.ones_)

    policy = TanhGaussianMLPPolicy(env_spec=augmented_env_spec,
                                   hidden_sizes=(3, 5, 7),
                                   hidden_nonlinearity=F.relu,
                                   output_nonlinearity=None)

    context_policy = ContextConditionedPolicy(latent_dim=latent_dim,
                                              context_encoder=context_encoder,
                                              policy=policy,
                                              use_information_bottleneck=True,
                                              use_next_obs=False)

    max_path_length = 20
    worker1 = PEARLWorker(seed=1,
                          max_path_length=max_path_length,
                          worker_number=1)
    worker1.update_agent(context_policy)
    worker1.update_env(env_spec)
    rollouts = worker1.rollout()

    assert rollouts.observations.shape == (max_path_length, obs_dim)
    assert rollouts.actions.shape == (max_path_length, action_dim)
    assert rollouts.rewards.shape == (max_path_length, )

    worker2 = PEARLWorker(seed=1,
                          max_path_length=max_path_length,
                          worker_number=1,
                          deterministic=True,
                          accum_context=True)
    worker2.update_agent(context_policy)
    worker2.update_env(env_spec)
    rollouts = worker2.rollout()

    assert context_policy.context.shape == (1, max_path_length,
                                            encoder_input_dim)
    assert rollouts.observations.shape == (max_path_length, obs_dim)
    assert rollouts.actions.shape == (max_path_length, action_dim)
    assert rollouts.rewards.shape == (max_path_length, )
Example #24
    def test_get_action(self, mock_normal, obs_dim, task_num, latent_dim,
                        action_dim):
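        """Test get_action and its latent- and task-conditioned variants."""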
        mock_normal.return_value = 0.5
        env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        with mock.patch(
                'garage.tf.policies.'
                'gaussian_mlp_task_embedding_policy.GaussianMLPModel',
                new=SimpleGaussianMLPModel):
            embedding_spec = InOutSpec(
                input_space=akro.Box(low=np.zeros(task_num),
                                     high=np.ones(task_num)),
                output_space=akro.Box(low=np.zeros(latent_dim),
                                      high=np.ones(latent_dim)))
            encoder = GaussianMLPEncoder(embedding_spec)
            policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                                    encoder=encoder)

        env.reset()
        obs, _, _, _ = env.step(1)
        latent = np.random.random((latent_dim, ))
        task = np.zeros(task_num)
        task[0] = 1

        action1, prob1 = policy.get_action_given_latent(obs, latent)
        action2, prob2 = policy.get_action_given_task(obs, task)
        action3, prob3 = policy.get_action(
            np.concatenate([obs.flatten(), task]))

        expected_action = np.full(action_dim, 0.75)
        expected_mean = np.full(action_dim, 0.5)
        expected_log_std = np.full(action_dim, np.log(0.5))

        assert env.action_space.contains(action1)
        assert np.array_equal(action1, expected_action)
        assert np.array_equal(prob1['mean'], expected_mean)
        assert np.array_equal(prob1['log_std'], expected_log_std)

        assert env.action_space.contains(action2)
        assert np.array_equal(action2, expected_action)
        assert np.array_equal(prob2['mean'], expected_mean)
        assert np.array_equal(prob2['log_std'], expected_log_std)

        assert env.action_space.contains(action3)
        assert np.array_equal(action3, expected_action)
        assert np.array_equal(prob3['mean'], expected_mean)
        assert np.array_equal(prob3['log_std'], expected_log_std)

        obses, latents, tasks = [obs] * 3, [latent] * 3, [task] * 3
        aug_obses = [np.concatenate([obs.flatten(), task])] * 3
        action1n, prob1n = policy.get_actions_given_latents(obses, latents)
        action2n, prob2n = policy.get_actions_given_tasks(obses, tasks)
        action3n, prob3n = policy.get_actions(aug_obses)

        for action, mean, log_std in chain(
                zip(action1n, prob1n['mean'], prob1n['log_std']),
                zip(action2n, prob2n['mean'], prob2n['log_std']),
                zip(action3n, prob3n['mean'], prob3n['log_std'])):
            assert env.action_space.contains(action)
            assert np.array_equal(action, expected_action)
            assert np.array_equal(mean, expected_mean)
            assert np.array_equal(log_std, expected_log_std)
Example #25
    def test_state_info_specs(self):
        env = GarageEnv(DummyBoxEnv(obs_dim=(4, ), action_dim=(4, )))
        policy = GaussianLSTMPolicy(env_spec=env.spec,
                                    state_include_action=False)
        assert policy.state_info_specs == []
Example #26
    def test_normalize_pixel_patch_not_trigger(self):
        env = TfEnv(DummyBoxEnv())
        obs = env.reset()
        obs_normalized = normalize_pixel_batch(env, obs)
        assert np.array_equal(obs, obs_normalized)
Example #27
    def test_clone(self):
        env = GarageEnv(DummyBoxEnv(obs_dim=(4, ), action_dim=(4, )))
        policy = GaussianLSTMPolicy(env_spec=env.spec)
        policy_clone = policy.clone('GaussianLSTMPolicyClone')
        assert policy_clone.env_spec == policy.env_spec
Example #28
    def setup_method(self):
        super().setup_method()
        self.env = GarageEnv(DummyBoxEnv())
Example #29
    def test_invalid_env(self):
        env = GarageEnv(DummyBoxEnv())
        with pytest.raises(ValueError):
            CategoricalLSTMPolicy(env_spec=env.spec)
Example #30
    def test_dist_info(self, obs_dim, task_num, latent_dim, action_dim):
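        """Test dist_info and dist_info_sym given augmented obs, task, or latent."""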
        env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        with mock.patch(
                'garage.tf.policies.'
                'gaussian_mlp_task_embedding_policy.GaussianMLPModel',
                new=SimpleGaussianMLPModel):
            embedding_spec = InOutSpec(
                input_space=akro.Box(low=np.zeros(task_num),
                                     high=np.ones(task_num)),
                output_space=akro.Box(low=np.zeros(latent_dim),
                                      high=np.ones(latent_dim)))
            encoder = GaussianMLPEncoder(embedding_spec)
            policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                                    encoder=encoder)

        env.reset()
        obs, _, _, _ = env.step(1)
        task = np.zeros(task_num)
        task[0] = 1
        aug_obs = np.concatenate([obs.flatten(), task])
        latent = np.random.random(latent_dim)

        obs_dim = env.spec.observation_space.flat_dim
        obs_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, obs_dim))
        task_ph = tf.compat.v1.placeholder(tf.float32, shape=(None, task_num))
        latent_ph = tf.compat.v1.placeholder(tf.float32,
                                             shape=(None, latent_dim))
        aug_obs_ph = tf.compat.v1.concat([obs_ph, task_ph], axis=1)

        dist0_sym = policy.dist_info_sym(aug_obs_ph, name='p0_sym')
        dist1_sym = policy.dist_info_sym_given_task(obs_ph,
                                                    task_ph,
                                                    name='p1_sym')
        dist2_sym = policy.dist_info_sym_given_latent(obs_ph,
                                                      latent_ph,
                                                      name='p2_sym')

        # flatten output
        expected_mean = [np.full(np.prod(action_dim), 0.5)]
        expected_log_std = [np.full(np.prod(action_dim), np.log(0.5))]

        prob0 = self.sess.run(dist0_sym,
                              feed_dict={aug_obs_ph: [aug_obs.flatten()]})
        prob1 = self.sess.run(dist1_sym,
                              feed_dict={
                                  obs_ph: [obs.flatten()],
                                  task_ph: [task]
                              })
        prob2 = self.sess.run(dist2_sym,
                              feed_dict={
                                  obs_ph: [obs.flatten()],
                                  latent_ph: [latent]
                              })
        prob3 = policy.dist_info(aug_obs)

        assert np.array_equal(prob0['mean'].flatten(), expected_mean[0])
        assert np.array_equal(prob0['log_std'].flatten(), expected_log_std[0])
        assert np.array_equal(prob1['mean'], expected_mean)
        assert np.array_equal(prob1['log_std'], expected_log_std)
        assert np.array_equal(prob2['mean'], expected_mean)
        assert np.array_equal(prob2['log_std'], expected_log_std)
        assert np.array_equal(prob3['mean'].flatten(), expected_mean[0])
        assert np.array_equal(prob3['log_std'].flatten(), expected_log_std[0])