    def test_output_shape(self, obs_dim, action_dim):
        env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
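        # The Q-function's MLPMergeModel is patched with SimpleMLPMergeModel,
        # presumably a lightweight test stand-in, so the network builds with a
        # small, predictable structure.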
        with mock.patch(('garage.tf.q_functions.'
                         'continuous_mlp_q_function.MLPMergeModel'),
                        new=SimpleMLPMergeModel):
            qf = ContinuousMLPQFunction(env_spec=env.spec)
        env.reset()
        obs, _, _, _ = env.step(1)
        obs = obs.flatten()
        act = np.full(action_dim, 0.5).flatten()
        obs_ph, act_ph = qf.inputs

        outputs = qf.get_qval([obs], [act])

        assert outputs.shape == (1, 1)
Example #2
    def test_output_shape(self, obs_dim, act_dim, output_dim, hidden_sizes):
        env_spec = TfEnv(DummyBoxEnv())
        obs = torch.ones(obs_dim, dtype=torch.float32).unsqueeze(0)
        act = torch.ones(act_dim, dtype=torch.float32).unsqueeze(0)
        nn_module = MLPModule(input_dim=obs_dim + act_dim,
                              output_dim=output_dim,
                              hidden_nonlinearity=None,
                              hidden_sizes=hidden_sizes,
                              hidden_w_init=nn.init.ones_,
                              output_w_init=nn.init.ones_)

        qf = ContinuousNNQFunction(env_spec, nn_module)
        output = qf.get_qval(obs, act)

        assert output.shape == (1, 1)
Example #3
    def test_param_values(self, obs_dim):
        box_env = TfEnv(DummyBoxEnv(obs_dim=obs_dim))
        with mock.patch(('garage.tf.baselines.'
                         'gaussian_mlp_baseline_with_model.'
                         'GaussianMLPRegressorWithModel'),
                        new=SimpleGaussianMLPRegressor):
            gmb = GaussianMLPBaselineWithModel(env_spec=box_env.spec)
            new_gmb = GaussianMLPBaselineWithModel(
                env_spec=box_env.spec, name='GaussianMLPBaselineWithModel2')
        old_param_values = gmb.get_param_values()
        new_param_values = new_gmb.get_param_values()
        assert not np.array_equal(old_param_values, new_param_values)
        new_gmb.set_param_values(old_param_values)
        new_param_values = new_gmb.get_param_values()
        assert np.array_equal(old_param_values, new_param_values)
    def test_get_embedding(self, obs_dim, embedding_dim):
        env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=embedding_dim))
        embedding_spec = InOutSpec(input_space=env.spec.observation_space,
                                   output_space=env.spec.action_space)
        embedding = GaussianMLPEncoder(embedding_spec)
        task_input = tf.compat.v1.placeholder(tf.float32,
                                              shape=(None, None,
                                                     embedding.input_dim))
        embedding.build(task_input)

        env.reset()
        obs, _, _, _ = env.step(1)

        latent, _ = embedding.forward(obs)
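        # The encoder's output space was declared to be the env's action
        # space, so a valid latent must lie inside env.action_space.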
        assert env.action_space.contains(latent)
    def test_get_action(self, obs_dim, action_dim, hidden_dim):
        env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        policy = GaussianLSTMPolicy(env_spec=env.spec,
                                    hidden_dim=hidden_dim,
                                    state_include_action=False)

        policy.reset()
        obs = env.reset()

        action, _ = policy.get_action(obs.flatten())
        assert env.action_space.contains(action)

        actions, _ = policy.get_actions([obs.flatten()])
        for action in actions:
            assert env.action_space.contains(action)
Example #6
    def test_output_shape(self, batch_size, hidden_sizes):
        env_spec = GymEnv(DummyBoxEnv()).spec
        obs_dim = env_spec.observation_space.flat_dim
        act_dim = env_spec.action_space.flat_dim
        obs = torch.ones(batch_size, obs_dim, dtype=torch.float32)
        act = torch.ones(batch_size, act_dim, dtype=torch.float32)

        qf = ContinuousMLPQFunction(env_spec=env_spec,
                                    hidden_nonlinearity=None,
                                    hidden_sizes=hidden_sizes,
                                    hidden_w_init=nn.init.ones_,
                                    output_w_init=nn.init.ones_)
        output = qf(obs, act)

        assert output.shape == (batch_size, 1)
    def test_build(self, obs_dim, action_dim):
        env = GymEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        policy = GaussianMLPPolicy(env_spec=env.spec)
        obs = env.reset()[0]

        state_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, None,
                                                      policy.input_dim))
        dist_sym = policy.build(state_input, name='dist_sym').dist
        dist_sym2 = policy.build(state_input, name='dist_sym2').dist
        output1 = self.sess.run([dist_sym.loc],
                                feed_dict={state_input: [[obs.flatten()]]})
        output2 = self.sess.run([dist_sym2.loc],
                                feed_dict={state_input: [[obs.flatten()]]})
        assert np.array_equal(output1, output2)
def test_is_pickleable(batch_size):
    env_spec = GymEnv(DummyBoxEnv())
    obs_dim = env_spec.observation_space.flat_dim
    obs = torch.ones([batch_size, obs_dim], dtype=torch.float32)
    qf = DiscreteMLPQFunction(env_spec=env_spec,
                              hidden_nonlinearity=None,
                              hidden_sizes=(2, 2))
    policy = DiscreteQFArgmaxPolicy(qf, env_spec)

    output1 = policy.get_actions(obs.numpy())[0]

    p = pickle.dumps(policy)
    policy_pickled = pickle.loads(p)
    output2 = policy_pickled.get_actions(obs.numpy())[0]
    assert np.array_equal(output1, output2)
Example #9
    def test_get_actions(self, obs_dim, act_dim, batch_size, hidden_sizes):
        env_spec = TfEnv(DummyBoxEnv())
        obs = torch.ones([batch_size, obs_dim], dtype=torch.float32)
        nn_module = MLPModule(
            input_dim=obs_dim,
            output_dim=act_dim,
            hidden_nonlinearity=None,
            hidden_sizes=hidden_sizes,
            hidden_w_init=nn.init.ones_,
            output_w_init=nn.init.ones_)

        policy = DeterministicPolicy(env_spec, nn_module)
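        # With hidden_nonlinearity=None, all-ones weights and (by the module's
        # presumed default) zero biases, every layer just sums its inputs, so
        # each action element should equal obs_dim * prod(hidden_sizes).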
        expected_output = np.full([batch_size, act_dim],
                                  fill_value=obs_dim * np.prod(hidden_sizes),
                                  dtype=np.float32)
        assert np.array_equal(policy.get_actions(obs), expected_output)
Example #10
    def test_is_pickleable(self):
        box_env_spec = GarageEnv(DummyBoxEnv(obs_dim=(2, ))).spec
        cmb = ContinuousMLPBaseline(env_spec=box_env_spec)

        with tf.compat.v1.variable_scope('ContinuousMLPBaseline', reuse=True):
            bias = tf.compat.v1.get_variable('mlp/hidden_0/bias')
        bias.load(tf.ones_like(bias).eval())

        _, _, paths, _ = get_train_test_data()
        result1 = cmb.predict(paths)
        h = pickle.dumps(cmb)

        with tf.compat.v1.Session(graph=tf.Graph()):
            cmb_pickled = pickle.loads(h)
            result2 = cmb_pickled.predict(paths)
            assert np.array_equal(result1, result2)
    def test_get_actions(self, batch_size, hidden_sizes):
        env_spec = TfEnv(DummyBoxEnv())
        obs_dim = env_spec.observation_space.flat_dim
        act_dim = env_spec.action_space.flat_dim
        obs = torch.ones([batch_size, obs_dim], dtype=torch.float32)

        policy = DeterministicMLPPolicy(env_spec=env_spec,
                                        hidden_nonlinearity=None,
                                        hidden_sizes=hidden_sizes,
                                        hidden_w_init=nn.init.ones_,
                                        output_w_init=nn.init.ones_)

        expected_output = np.full([batch_size, act_dim],
                                  fill_value=obs_dim * np.prod(hidden_sizes),
                                  dtype=np.float32)
        assert np.array_equal(policy.get_actions(obs)[0], expected_output)
Example #12
    def test_log_prob(self):
        """Test log_prob method of the policy."""
        env_spec = MetaRLEnv(DummyBoxEnv())
        init_std = 1.
        obs = torch.Tensor([0, 0, 0, 0]).float()
        action = torch.Tensor([0, 0]).float()
        policy = GaussianMLPPolicy(env_spec=env_spec,
                                   hidden_sizes=(1, ),
                                   init_std=init_std,
                                   hidden_nonlinearity=None,
                                   std_parameterization='exp',
                                   hidden_w_init=nn.init.ones_,
                                   output_w_init=nn.init.ones_)
        dist = policy(obs)
        assert torch.allclose(dist.log_prob(action),
                              policy.log_likelihood(obs, action))
    def test_is_pickleable(self, batch_size, hidden_sizes):
        env_spec = TfEnv(DummyBoxEnv())
        obs_dim = env_spec.observation_space.flat_dim
        obs = torch.ones([batch_size, obs_dim], dtype=torch.float32)

        policy = DeterministicMLPPolicy(env_spec=env_spec,
                                        hidden_nonlinearity=None,
                                        hidden_sizes=hidden_sizes,
                                        hidden_w_init=nn.init.ones_,
                                        output_w_init=nn.init.ones_)

        output1 = policy.get_actions(obs)[0]

        p = pickle.dumps(policy)
        policy_pickled = pickle.loads(p)
        output2 = policy_pickled.get_actions(obs)[0]
        assert np.array_equal(output1, output2)
    def test_get_action(self, obs_dim, action_dim):
        """Test get_action method"""
        env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        policy = ContinuousMLPPolicy(env_spec=env.spec)

        env.reset()
        obs, _, _, _ = env.step(1)

        action, _ = policy.get_action(obs.flatten())

        assert env.action_space.contains(action)

        actions, _ = policy.get_actions(
            [obs.flatten(), obs.flatten(),
             obs.flatten()])
        for action in actions:
            assert env.action_space.contains(action)
Example #15
    def test_param_values(self):
        box_env_spec = GarageEnv(DummyBoxEnv(obs_dim=(2, ))).spec
        cmb = ContinuousMLPBaseline(env_spec=box_env_spec)
        new_cmb = ContinuousMLPBaseline(env_spec=box_env_spec,
                                        name='ContinuousMLPBaseline2')

        # Manually change the parameters of ContinuousMLPBaseline
        with tf.compat.v1.variable_scope('ContinuousMLPBaseline', reuse=True):
            bias = tf.compat.v1.get_variable('mlp/hidden_0/bias')
        bias.load(tf.ones_like(bias).eval())

        old_param_values = cmb.get_param_values()
        new_param_values = new_cmb.get_param_values()
        assert not np.array_equal(old_param_values, new_param_values)
        new_cmb.set_param_values(old_param_values)
        new_param_values = new_cmb.get_param_values()
        assert np.array_equal(old_param_values, new_param_values)
Example #16
def test_forward(hidden_sizes):
    env_spec = GymEnv(DummyBoxEnv()).spec
    obs_dim = env_spec.observation_space.flat_dim
    obs = torch.ones(obs_dim, dtype=torch.float32).unsqueeze(0)

    qf = DiscreteMLPQFunction(env_spec=env_spec,
                              hidden_nonlinearity=None,
                              hidden_sizes=hidden_sizes,
                              hidden_w_init=nn.init.ones_,
                              output_w_init=nn.init.ones_)

    output = qf(obs)

    expected_output = torch.full([1, 1],
                                 fill_value=(obs_dim) * np.prod(hidden_sizes),
                                 dtype=torch.float32)
    assert torch.eq(output, expected_output).all()
Example #17
    def test_pickling(self):
        obs_dim, action_dim, task_num, latent_dim = (2, ), (2, ), 5, 2
        env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        embedding_spec = InOutSpec(
            input_space=akro.Box(low=np.zeros(task_num),
                                 high=np.ones(task_num)),
            output_space=akro.Box(low=np.zeros(latent_dim),
                                  high=np.ones(latent_dim)))
        encoder = GaussianMLPEncoder(embedding_spec)
        policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                                encoder=encoder)

        pickled = pickle.dumps(policy)
        with tf.compat.v1.variable_scope('resumed'):
            unpickled = pickle.loads(pickled)
            assert hasattr(unpickled, '_f_dist_obs_latent')
            assert hasattr(unpickled, '_f_dist_obs_task')
Example #18
    def setup_method(self):
        with mock.patch('tensorflow.random.normal') as mock_rand:
            mock_rand.return_value = 0.5
            super().setup_method()
            self.box_env = TfEnv(DummyBoxEnv())
            self.policy1 = GaussianMLPPolicy(env_spec=self.box_env,
                                             init_std=1.0,
                                             name='P1')
            self.policy2 = GaussianMLPPolicy(env_spec=self.box_env,
                                             init_std=1.2,
                                             name='P2')
            self.policy3 = GaussianMLPPolicyWithModel(env_spec=self.box_env,
                                                      init_std=1.0,
                                                      name='P3')
            self.policy4 = GaussianMLPPolicyWithModel(env_spec=self.box_env,
                                                      init_std=1.2,
                                                      name='P4')

            self.sess.run(tf.global_variables_initializer())
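
            # Copy parameters from policy1/policy2 into the corresponding
            # *WithModel policies so both pairs start from identical weights.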

            for a, b in zip(self.policy3.get_params(),
                            self.policy1.get_params()):
                self.sess.run(tf.assign(b, a))
            for a, b in zip(self.policy4.get_params(),
                            self.policy2.get_params()):
                self.sess.run(tf.assign(b, a))

            self.obs = [self.box_env.reset()]
            self.obs_ph = tf.placeholder(
                tf.float32,
                shape=(None, self.box_env.observation_space.flat_dim))
            self.action_ph = tf.placeholder(
                tf.float32, shape=(None, self.box_env.action_space.flat_dim))

            self.dist1_sym = self.policy1.dist_info_sym(self.obs_ph,
                                                        name='p1_sym')
            self.dist2_sym = self.policy2.dist_info_sym(self.obs_ph,
                                                        name='p2_sym')
            self.dist3_sym = self.policy3.dist_info_sym(self.obs_ph,
                                                        name='p3_sym')
            self.dist4_sym = self.policy4.dist_info_sym(self.obs_ph,
                                                        name='p4_sym')

            assert self.policy1.vectorized == self.policy2.vectorized
            assert self.policy3.vectorized == self.policy4.vectorized
    def test_param_values(self):
        box_env_spec = GarageEnv(DummyBoxEnv(obs_dim=(2, ))).spec
        gmb = GaussianMLPBaseline(env_spec=box_env_spec)
        new_gmb = GaussianMLPBaseline(env_spec=box_env_spec,
                                      name='GaussianMLPBaseline2')

        # Manually change the parameters of GaussianMLPBaseline
        with tf.compat.v1.variable_scope('GaussianMLPBaseline', reuse=True):
            bias = tf.compat.v1.get_variable(
                'dist_params/mean_network/hidden_0/bias')
        bias.load(tf.ones_like(bias).eval())

        old_param_values = gmb.get_param_values()
        new_param_values = new_gmb.get_param_values()
        assert not np.array_equal(old_param_values, new_param_values)
        new_gmb.set_param_values(old_param_values)
        new_param_values = new_gmb.get_param_values()
        assert np.array_equal(old_param_values, new_param_values)
Example #20
    def test_to(self):
        """Test Tanh Gaussian Policy can be moved to cpu."""
        env_spec = GymEnv(DummyBoxEnv())
        init_std = 2.

        policy = TanhGaussianMLPPolicy(env_spec=env_spec,
                                       hidden_sizes=(1, ),
                                       init_std=init_std,
                                       hidden_nonlinearity=None,
                                       std_parameterization='exp',
                                       hidden_w_init=nn.init.ones_,
                                       output_w_init=nn.init.ones_)
        if torch.cuda.is_available():
            policy.to(torch.device('cuda:0'))
            assert str(next(policy.parameters()).device) == 'cuda:0'
        else:
            policy.to(None)
            assert str(next(policy.parameters()).device) == 'cpu'
Example #21
def test_is_pickleable(hidden_sizes):
    env_spec = GymEnv(DummyBoxEnv()).spec
    obs_dim = env_spec.observation_space.flat_dim
    obs = torch.ones(obs_dim, dtype=torch.float32).unsqueeze(0)

    qf = DiscreteMLPQFunction(env_spec=env_spec,
                              hidden_nonlinearity=None,
                              hidden_sizes=hidden_sizes,
                              hidden_w_init=nn.init.ones_,
                              output_w_init=nn.init.ones_)

    output1 = qf(obs)

    p = pickle.dumps(qf)
    qf_pickled = pickle.loads(p)
    output2 = qf_pickled(obs)

    assert torch.eq(output1, output2).all()
Example #22
    def test_is_pickleable(self, obs_dim, act_dim, batch_size, hidden_sizes):
        env_spec = TfEnv(DummyBoxEnv())
        obs = torch.ones([batch_size, obs_dim], dtype=torch.float32)
        nn_module = MLPModule(
            input_dim=obs_dim,
            output_dim=act_dim,
            hidden_nonlinearity=None,
            hidden_sizes=hidden_sizes,
            hidden_w_init=nn.init.ones_,
            output_w_init=nn.init.ones_)

        policy = DeterministicPolicy(env_spec, nn_module)
        output1 = policy.get_actions(obs)

        p = pickle.dumps(policy)
        policy_pickled = pickle.loads(p)
        output2 = policy_pickled.get_actions(obs)
        assert np.array_equal(output1, output2)
Example #23
    def test_get_qval(self, obs_dim, act_dim, output_dim, hidden_sizes):
        env_spec = TfEnv(DummyBoxEnv())
        obs = torch.ones(obs_dim, dtype=torch.float32).unsqueeze(0)
        act = torch.ones(act_dim, dtype=torch.float32).unsqueeze(0)
        nn_module = MLPModule(input_dim=obs_dim + act_dim,
                              output_dim=output_dim,
                              hidden_nonlinearity=None,
                              hidden_sizes=hidden_sizes,
                              hidden_w_init=nn.init.ones_,
                              output_w_init=nn.init.ones_)

        qf = ContinuousNNQFunction(env_spec, nn_module)
        output = qf.get_qval(obs, act)
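        # With all-ones weights and no hidden nonlinearity, each output column
        # should equal (obs_dim + act_dim) * prod(hidden_sizes).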
        expected_output = torch.full([1, output_dim],
                                     fill_value=(obs_dim + act_dim) *
                                     np.prod(hidden_sizes),
                                     dtype=torch.float32)
        assert torch.eq(output, expected_output).all()
    def test_get_embedding(self, obs_dim, embedding_dim):
        env = GymEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=embedding_dim))
        embedding_spec = InOutSpec(input_space=env.spec.observation_space,
                                   output_space=env.spec.action_space)
        embedding = GaussianMLPEncoder(embedding_spec)
        task_input = tf.compat.v1.placeholder(tf.float32,
                                              shape=(None, None,
                                                     embedding.input_dim))
        embedding.build(task_input, name='task_input')

        env.reset()
        obs = env.step(env.action_space.sample()).observation

        latent, _ = embedding.get_latent(obs)
        latents, _ = embedding.get_latents([obs] * 5)
        assert env.action_space.contains(latent)
        for latent in latents:
            assert env.action_space.contains(latent)
    def test_is_pickleable(self, obs_dim, action_dim):
        env = TfEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        with mock.patch(('garage.tf.policies.'
                         'deterministic_mlp_policy_with_model.MLPModel'),
                        new=SimpleMLPModel):
            policy = DeterministicMLPPolicyWithModel(env_spec=env.spec)

        env.reset()
        obs, _, _, _ = env.step(1)

        action1, _ = policy.get_action(obs)

        p = pickle.dumps(policy)
        with tf.Session(graph=tf.Graph()):
            policy_pickled = pickle.loads(p)
            action2, _ = policy_pickled.get_action(obs)
            assert env.action_space.contains(action2)
            assert np.array_equal(action1, action2)
Example #26
    def test_build(self, obs_dim, action_dim):
        """Test build method"""
        env = GymEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        policy = ContinuousMLPPolicy(env_spec=env.spec)

        env.reset()
        obs = env.step(1).observation

        obs_dim = env.spec.observation_space.flat_dim
        state_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, obs_dim))
        action_sym = policy.build(state_input, name='action_sym')

        action = self.sess.run(action_sym,
                               feed_dict={state_input: [obs.flatten()]})
        action = policy.action_space.unflatten(action)

        assert env.action_space.contains(action)
Example #27
    def test_is_pickleable(self):
        env = TfEnv(DummyBoxEnv(obs_dim=(1, ), action_dim=(1, )))
        obs_var = tf.compat.v1.placeholder(
            tf.float32,
            shape=[None, None, env.observation_space.flat_dim],
            name='obs')
        policy = GaussianGRUPolicy(env_spec=env.spec,
                                   state_include_action=False)

        policy.build(obs_var)
        env.reset()
        obs = env.reset()
        with tf.compat.v1.variable_scope('GaussianGRUPolicy/GaussianGRUModel',
                                         reuse=True):
            param = tf.compat.v1.get_variable(
                'dist_params/log_std_param/parameter')
        # set the log-std parameter to all ones
        param.load(tf.ones_like(param).eval())

        output1 = self.sess.run(
            [policy.distribution.loc,
             policy.distribution.stddev()],
            feed_dict={policy.model.input: [[obs.flatten()], [obs.flatten()]]})

        p = pickle.dumps(policy)
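        # Unpickling will restore the parameter values but not the TF graph,
        # so the policy is rebuilt below with a fresh placeholder and session.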

        with tf.compat.v1.Session(graph=tf.Graph()) as sess:
            policy_pickled = pickle.loads(p)
            obs_var = tf.compat.v1.placeholder(
                tf.float32,
                shape=[None, None, env.observation_space.flat_dim],
                name='obs')
            policy_pickled.build(obs_var)
            # yapf: disable
            output2 = sess.run(
                [
                    policy_pickled.distribution.loc,
                    policy_pickled.distribution.stddev()
                ],
                feed_dict={
                    policy_pickled.model.input: [[obs.flatten()],
                                                 [obs.flatten()]]
                })
            assert np.array_equal(output1, output2)
    def test_forward(self, hidden_sizes):
        env_spec = TfEnv(DummyBoxEnv())
        obs_dim = env_spec.observation_space.flat_dim
        act_dim = env_spec.action_space.flat_dim
        obs = torch.ones(obs_dim, dtype=torch.float32).unsqueeze(0)
        act = torch.ones(act_dim, dtype=torch.float32).unsqueeze(0)

        qf = ContinuousMLPQFunction(env_spec=env_spec,
                                    hidden_nonlinearity=None,
                                    hidden_sizes=hidden_sizes,
                                    hidden_w_init=nn.init.ones_,
                                    output_w_init=nn.init.ones_)

        output = qf(obs, act)
        expected_output = torch.full([1, 1],
                                     fill_value=(obs_dim + act_dim) *
                                     np.prod(hidden_sizes),
                                     dtype=torch.float32)
        assert torch.eq(output, expected_output).all()
Example #29
    def test_get_latent(self):
        obs_dim, action_dim, task_num, latent_dim = (2, ), (2, ), 5, 2
        env = GarageEnv(DummyBoxEnv(obs_dim=obs_dim, action_dim=action_dim))
        embedding_spec = InOutSpec(
            input_space=akro.Box(low=np.zeros(task_num),
                                 high=np.ones(task_num)),
            output_space=akro.Box(low=np.zeros(latent_dim),
                                  high=np.ones(latent_dim)))
        encoder = GaussianMLPEncoder(embedding_spec)
        policy = GaussianMLPTaskEmbeddingPolicy(env_spec=env.spec,
                                                encoder=encoder)

        task_id = 3
        task_onehot = np.zeros(task_num)
        task_onehot[task_id] = 1
        latent, latent_info = policy.get_latent(task_onehot)
        assert latent.shape == (latent_dim, )
        assert latent_info['mean'].shape == (latent_dim, )
        assert latent_info['log_std'].shape == (latent_dim, )
Example #30
    def test_get_action_np(self, hidden_sizes):
        """Test Policy get action function with numpy inputs."""
        env_spec = GymEnv(DummyBoxEnv())
        obs_dim = env_spec.observation_space.flat_dim
        act_dim = env_spec.action_space.flat_dim
        obs = np.ones((obs_dim, ), dtype=np.float32)
        init_std = 2.

        policy = TanhGaussianMLPPolicy(env_spec=env_spec,
                                       hidden_sizes=hidden_sizes,
                                       init_std=init_std,
                                       hidden_nonlinearity=None,
                                       std_parameterization='exp',
                                       hidden_w_init=nn.init.ones_,
                                       output_w_init=nn.init.ones_)
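        # With all-ones weights and an all-ones observation, the pre-squash
        # mean is obs_dim * prod(hidden_sizes); assuming the reported 'mean'
        # is the tanh-squashed mean and that value is large, it should be
        # close to 1.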
        expected_mean = torch.full((act_dim, ), 1.0, dtype=torch.float)
        action, prob = policy.get_action(obs)
        assert np.allclose(prob['mean'], expected_mean.numpy(), rtol=1e-3)
        assert action.shape == (act_dim, )