Example #1
    def test_latent_smoothing(self):
        observation_np = self.env.reset()
        smoothed_policy = GaussianMLPPolicy(
            input_shapes=(self.env.observation_space.shape, ),
            output_shape=self.env.action_space.shape,
            hidden_layer_sizes=self.hidden_layer_sizes,
            smoothing_coefficient=0.5,
        )

        np.testing.assert_equal(smoothed_policy._smoothing_x, 0.0)
        self.assertEqual(smoothed_policy._smoothing_alpha, 0.5)
        self.assertEqual(
            smoothed_policy._smoothing_beta,
            np.sqrt((1.0 - np.power(smoothed_policy._smoothing_alpha, 2.0))) /
            (1.0 - smoothed_policy._smoothing_alpha),
        )

        smoothing_x_previous = smoothed_policy._smoothing_x
        for i in range(5):
            action_np = smoothed_policy.actions_np(
                [observation_np[None, :]])[0]
            observation_np = self.env.step(action_np)[0]

            self.assertFalse(
                np.all(
                    np.equal(smoothing_x_previous,
                             smoothed_policy._smoothing_x)))
            smoothing_x_previous = smoothed_policy._smoothing_x

        smoothed_policy.reset()

        np.testing.assert_equal(smoothed_policy._smoothing_x, 0.0)
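
Why Example #1 expects _smoothing_beta to equal sqrt(1 - alpha^2) / (1 - alpha): if the policy smooths its sampled latent noise as x <- alpha * x + (1 - alpha) * eps with eps drawn from a standard normal (an assumption about the internals of GaussianMLPPolicy, not something the test asserts directly), the stationary variance of x is (1 - alpha) / (1 + alpha), so rescaling by beta = sqrt(1 - alpha^2) / (1 - alpha) restores unit variance. The standalone sketch below checks this numerically; it is illustrative only and not part of the test suite.

import numpy as np

alpha = 0.5
beta = np.sqrt(1.0 - alpha ** 2) / (1.0 - alpha)

rng = np.random.default_rng(0)
x = 0.0
smoothed = []
for _ in range(200_000):
    # Assumed smoothing update: exponential moving average of unit-variance noise.
    x = alpha * x + (1.0 - alpha) * rng.standard_normal()
    smoothed.append(beta * x)

print(np.var(smoothed))  # close to 1.0: the rescaled latent keeps unit variance
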
Example #2
    def setUp(self):
        self.env = gym.envs.make("MountainCarContinuous-v0")
        self.k = 2
        self.hidden_layer_sizes = (128, 128)
        self.prior_policy = UniformPolicy(
            input_shapes=(self.env.observation_space.shape, ),
            output_shape=self.env.action_space.shape,
        )
        self.main_policy = GaussianMLPPolicy(
            input_shapes=(
                self.env.observation_space.shape,
                self.env.action_space.shape,
            ),
            output_shape=self.env.action_space.shape,
            hidden_layer_sizes=self.hidden_layer_sizes,
            name="Policy",
        )
        self.second_policy = GaussianMLPPolicy(
            input_shapes=(
                self.env.observation_space.shape,
                self.env.action_space.shape,
            ),
            output_shape=self.env.action_space.shape,
            hidden_layer_sizes=self.hidden_layer_sizes,
            name="SecPolicy",
        )
        self.level_k_policy = LevelKPolicy(
            main_policy=self.main_policy,
            secondary_policy=self.second_policy,
            prior_policy=self.prior_policy,
            secondary_prior_policy=self.prior_policy,
            k=self.k,
        )
Example #3
def get_pr2k_soft_agent(env,
                        agent_id,
                        hidden_layer_sizes,
                        max_replay_buffer_size,
                        k=2,
                        mu=0):
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    opponent_action_shape = (
        env.env_specs.action_space.opponent_flat_dim(agent_id), )
    print(opponent_action_shape, "opponent_action_shape")
    return PR2KSoftAgent(
        env_specs=env.env_specs,
        main_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, opponent_action_shape),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="policy_agent_{}".format(agent_id),
        ),
        opponent_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, action_space.shape),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_policy_agent_{}".format(agent_id),
        ),
        prior_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="prior_policy_agent_{}".format(agent_id),
        ),
        opponent_prior_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, ),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_prior_policy_agent_{}".format(agent_id),
        ),
        qf=MLPValueFunction(
            input_shapes=(
                observation_space.shape,
                action_space.shape,
                opponent_action_shape,
            ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="qf_agent_{}".format(agent_id),
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
            opponent_action_dim=opponent_action_shape[0],
        ),
        k=k,
        mu=mu,
        gradient_clipping=10.0,
        agent_id=agent_id,
    )
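
In get_pr2k_soft_agent above, the main policy conditions on (observation, opponent action), the opponent model on (observation, own action), both priors on the observation alone, and the joint Q-function on (observation, own action, opponent action). The snippet below only illustrates the assumed meaning of opponent_flat_dim, namely the concatenated size of every other agent's flattened action; the agent count and action dimensions are hypothetical.

# Hypothetical shape arithmetic: three agents with 2-, 3-, and 1-dimensional
# continuous actions.
action_dims = [2, 3, 1]
agent_id = 0

# Assumed semantics of opponent_flat_dim(agent_id): total flattened size of
# all the other agents' actions.
opponent_flat_dim = sum(dim for i, dim in enumerate(action_dims) if i != agent_id)
opponent_action_shape = (opponent_flat_dim, )

assert opponent_action_shape == (4, )
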
Example #4
    def setUp(self):
        self.env = gym.envs.make('MountainCarContinuous-v0')
        self.hidden_layer_sizes = (128, 128)
        self.policy = GaussianMLPPolicy(
            input_shapes=(self.env.observation_space.shape, ),
            output_shape=self.env.action_space.shape,
            hidden_layer_sizes=self.hidden_layer_sizes,
            name='Policy')
        self.cond_polcy = GaussianMLPPolicy(
            input_shapes=(self.env.observation_space.shape,
                          self.env.action_space.shape),
            output_shape=self.env.action_space.shape,
            hidden_layer_sizes=self.hidden_layer_sizes,
            name='CondPolicy')
Example #5
import pickle
from collections import OrderedDict

import gym
import numpy as np
import tensorflow as tf

# GaussianMLPPolicy and Serializable are project-specific imports whose
# paths depend on the package layout.


class GaussianPolicyTest(tf.test.TestCase):
    def setUp(self):
        self.env = gym.envs.make("MountainCarContinuous-v0")
        self.hidden_layer_sizes = (128, 128)
        self.policy = GaussianMLPPolicy(
            input_shapes=(self.env.observation_space.shape, ),
            output_shape=self.env.action_space.shape,
            hidden_layer_sizes=self.hidden_layer_sizes,
            name="Policy",
        )
        self.cond_polcy = GaussianMLPPolicy(
            input_shapes=(
                self.env.observation_space.shape,
                self.env.action_space.shape,
            ),
            output_shape=self.env.action_space.shape,
            hidden_layer_sizes=self.hidden_layer_sizes,
            name="CondPolicy",
        )

    def test_actions_and_log_pis_symbolic(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        observations_np = np.stack((observation1_np, observation2_np))

        observations_tf = tf.constant(observations_np, dtype=tf.float32)

        actions = self.policy.get_actions([observations_tf])
        log_pis = self.policy.log_pis([observations_tf], actions)

        self.assertEqual(actions.shape, (2, *self.env.action_space.shape))
        self.assertEqual(log_pis.shape, (2, 1))

        actions_np = self.evaluate(actions)
        log_pis_np = self.evaluate(log_pis)

        self.assertEqual(actions_np.shape, (2, *self.env.action_space.shape))
        self.assertEqual(log_pis_np.shape, (2, 1))

    def test_cond_policy(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        action1_np = self.env.action_space.sample()
        action2_np = self.env.action_space.sample()
        observations_np = np.stack(
            (observation1_np, observation2_np)).astype(np.float32)
        actions_np = np.stack((action1_np, action2_np))
        conditions = [observations_np, actions_np]

        actions_np = self.cond_polcy.get_actions_np(conditions)
        actions = self.cond_polcy.get_actions(conditions)

        self.assertEqual(actions_np.shape, (2, *self.env.action_space.shape))
        self.assertEqual(actions.shape, (2, *self.env.action_space.shape))

    def test_actions_and_log_pis_numeric(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        observations_np = np.stack((observation1_np, observation2_np))

        actions_np = self.policy.get_actions_np([observations_np])
        log_pis_np = self.policy.log_pis_np([observations_np], actions_np)

        self.assertEqual(actions_np.shape, (2, *self.env.action_space.shape))
        self.assertEqual(log_pis_np.shape, (2, 1))

    def test_env_step_with_actions(self):
        observation1_np = self.env.reset()
        action = self.policy.get_actions_np(observation1_np[None])[0, ...]
        self.env.step(action)

    def test_env_step_with_action(self):
        observation1_np = self.env.reset()
        action = self.policy.get_action_np(observation1_np)
        self.env.step(action)
        self.assertEqual(action.shape, self.env.action_space.shape)

    def test_trainable_variables(self):
        # Two variables (kernel and bias) per Dense layer, with one layer per
        # hidden size plus a single output layer.
        self.assertEqual(len(self.policy.trainable_variables),
                         2 * (len(self.hidden_layer_sizes) + 1))

    def test_get_diagnostics(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        observations_np = np.stack((observation1_np, observation2_np))

        diagnostics = self.policy.get_diagnostics([observations_np])

        self.assertTrue(isinstance(diagnostics, OrderedDict))
        # self.assertEqual(
        #     tuple(diagnostics.keys()),
        #     ('shifts-mean',
        #      'shifts-std',
        #      'log_scale_diags-mean',
        #      'log_scale_diags-std',
        #      '-log-pis-mean',
        #      '-log-pis-std',
        #      'raw-actions-mean',
        #      'raw-actions-std',
        #      'actions-mean',
        #      'actions-std'))

        for value in diagnostics.values():
            self.assertTrue(np.isscalar(value))

    def test_clone_target(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        observations_np = np.stack(
            (observation1_np, observation2_np)).astype(np.float32)

        weights = self.policy.get_weights()
        actions_np = self.policy.get_actions_np([observations_np])
        log_pis_np = self.policy.log_pis_np([observations_np], actions_np)

        target_name = "{}_{}".format("target", self.policy._name)
        target_policy = Serializable.clone(self.policy, name=target_name)

        weights_2 = target_policy.get_weights()
        log_pis_np_2 = target_policy.log_pis_np([observations_np], actions_np)

        self.assertEqual(target_policy._name, target_name)
        self.assertIsNot(weights, weights_2)
        for weight, weight_2 in zip(weights, weights_2):
            np.testing.assert_array_equal(weight.shape, weight_2.shape)
        np.testing.assert_array_equal(log_pis_np.shape, log_pis_np_2.shape)
        np.testing.assert_equal(
            actions_np.shape,
            self.policy.get_actions_np([observations_np]).shape)

    def test_serialize_deserialize(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        observations_np = np.stack(
            (observation1_np, observation2_np)).astype(np.float32)

        weights = self.policy.get_weights()
        actions_np = self.policy.get_actions_np([observations_np])
        log_pis_np = self.policy.log_pis_np([observations_np], actions_np)

        serialized = pickle.dumps(self.policy)
        deserialized = pickle.loads(serialized)

        weights_2 = deserialized.get_weights()
        log_pis_np_2 = deserialized.log_pis_np([observations_np], actions_np)

        for weight, weight_2 in zip(weights, weights_2):
            np.testing.assert_array_equal(weight, weight_2)
        np.testing.assert_array_equal(log_pis_np, log_pis_np_2)
        np.testing.assert_equal(
            actions_np.shape,
            deserialized.get_actions_np([observations_np]).shape)

    def test_latent_smoothing(self):
        observation_np = self.env.reset()
        smoothed_policy = GaussianMLPPolicy(
            input_shapes=(self.env.observation_space.shape, ),
            output_shape=self.env.action_space.shape,
            hidden_layer_sizes=self.hidden_layer_sizes,
            smoothing_coefficient=0.5,
        )

        np.testing.assert_equal(smoothed_policy._smoothing_x, 0.0)
        self.assertEqual(smoothed_policy._smoothing_alpha, 0.5)
        self.assertEqual(
            smoothed_policy._smoothing_beta,
            np.sqrt((1.0 - np.power(smoothed_policy._smoothing_alpha, 2.0))) /
            (1.0 - smoothed_policy._smoothing_alpha),
        )

        smoothing_x_previous = smoothed_policy._smoothing_x
        for i in range(5):
            action_np = smoothed_policy.actions_np(
                [observation_np[None, :]])[0]
            observation_np = self.env.step(action_np)[0]

            self.assertFalse(
                np.all(
                    np.equal(smoothing_x_previous,
                             smoothed_policy._smoothing_x)))
            smoothing_x_previous = smoothed_policy._smoothing_x

        smoothed_policy.reset()

        np.testing.assert_equal(smoothed_policy._smoothing_x, 0.0)