def setUp(self): self.env = gym.envs.make("MountainCarContinuous-v0") self.k = 2 self.hidden_layer_sizes = (128, 128) self.prior_policy = UniformPolicy( input_shapes=(self.env.observation_space.shape, ), output_shape=self.env.action_space.shape, ) self.main_policy = GaussianMLPPolicy( input_shapes=( self.env.observation_space.shape, self.env.action_space.shape, ), output_shape=self.env.action_space.shape, hidden_layer_sizes=self.hidden_layer_sizes, name="Policy", ) self.second_policy = GaussianMLPPolicy( input_shapes=( self.env.observation_space.shape, self.env.action_space.shape, ), output_shape=self.env.action_space.shape, hidden_layer_sizes=self.hidden_layer_sizes, name="SecPolicy", ) self.level_k_policy = LevelKPolicy( main_policy=self.main_policy, secondary_policy=self.second_policy, prior_policy=self.prior_policy, secondary_prior_policy=self.prior_policy, k=self.k, )
def get_pr2k_soft_agent(env,
                        agent_id,
                        hidden_layer_sizes,
                        max_replay_buffer_size,
                        k=2,
                        mu=0):
    """Build a PR2K soft agent for `agent_id` with level-k depth `k`."""
    observation_space = env.env_specs.observation_space[agent_id]
    action_space = env.env_specs.action_space[agent_id]
    # The opponents' actions are modeled jointly as one flat vector.
    opponent_action_shape = (
        env.env_specs.action_space.opponent_flat_dim(agent_id), )
    print(opponent_action_shape, "opponent_action_shape")
    return PR2KSoftAgent(
        env_specs=env.env_specs,
        main_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, opponent_action_shape),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="policy_agent_{}".format(agent_id),
        ),
        opponent_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, action_space.shape),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_policy_agent_{}".format(agent_id),
        ),
        prior_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, ),
            output_shape=action_space.shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="prior_policy_agent_{}".format(agent_id),
        ),
        opponent_prior_policy=GaussianMLPPolicy(
            input_shapes=(observation_space.shape, ),
            output_shape=opponent_action_shape,
            hidden_layer_sizes=hidden_layer_sizes,
            name="opponent_prior_policy_agent_{}".format(agent_id),
        ),
        qf=MLPValueFunction(
            input_shapes=(
                observation_space.shape,
                action_space.shape,
                opponent_action_shape,
            ),
            output_shape=(1, ),
            hidden_layer_sizes=hidden_layer_sizes,
            name="qf_agent_{}".format(agent_id),
        ),
        replay_buffer=IndexedReplayBuffer(
            observation_dim=observation_space.shape[0],
            action_dim=action_space.shape[0],
            max_replay_buffer_size=max_replay_buffer_size,
            opponent_action_dim=opponent_action_shape[0],
        ),
        k=k,
        mu=mu,
        gradient_clipping=10.0,
        agent_id=agent_id,
    )
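# Hedged usage sketch: only get_pr2k_soft_agent itself is defined above; the
# environment constructor and all hyperparameter values below are assumptions
# for illustration, so the sketch is left commented out.
# env = make_multi_agent_env(...)  # any env exposing `env.env_specs`
# agents = [
#     get_pr2k_soft_agent(
#         env,
#         agent_id=agent_id,
#         hidden_layer_sizes=(128, 128),
#         max_replay_buffer_size=int(1e5),
#         k=2,
#         mu=0,
#     )
#     for agent_id in range(2)
# ]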
class GaussianPolicyTest(tf.test.TestCase):
    def setUp(self):
        self.env = gym.envs.make("MountainCarContinuous-v0")
        self.hidden_layer_sizes = (128, 128)
        self.policy = GaussianMLPPolicy(
            input_shapes=(self.env.observation_space.shape, ),
            output_shape=self.env.action_space.shape,
            hidden_layer_sizes=self.hidden_layer_sizes,
            name="Policy",
        )
        # Conditional policy: actions conditioned on observations and on
        # another action vector.
        self.cond_policy = GaussianMLPPolicy(
            input_shapes=(
                self.env.observation_space.shape,
                self.env.action_space.shape,
            ),
            output_shape=self.env.action_space.shape,
            hidden_layer_sizes=self.hidden_layer_sizes,
            name="CondPolicy",
        )

    def test_actions_and_log_pis_symbolic(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        observations_np = np.stack((observation1_np, observation2_np))
        observations_tf = tf.constant(observations_np, dtype=tf.float32)

        actions = self.policy.get_actions([observations_tf])
        log_pis = self.policy.log_pis([observations_tf], actions)

        self.assertEqual(actions.shape, (2, *self.env.action_space.shape))
        self.assertEqual(log_pis.shape, (2, 1))

        actions_np = self.evaluate(actions)
        log_pis_np = self.evaluate(log_pis)

        self.assertEqual(actions_np.shape, (2, *self.env.action_space.shape))
        self.assertEqual(log_pis_np.shape, (2, 1))

    def test_cond_policy(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        action1_np = self.env.action_space.sample()
        action2_np = self.env.action_space.sample()
        observations_np = np.stack(
            (observation1_np, observation2_np)).astype(np.float32)
        actions_np = np.stack((action1_np, action2_np))
        conditions = [observations_np, actions_np]

        actions_np = self.cond_policy.get_actions_np(conditions)
        actions = self.cond_policy.get_actions(conditions)

        self.assertEqual(actions_np.shape, (2, *self.env.action_space.shape))
        self.assertEqual(actions.shape, (2, *self.env.action_space.shape))

    def test_actions_and_log_pis_numeric(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        observations_np = np.stack((observation1_np, observation2_np))

        actions_np = self.policy.get_actions_np([observations_np])
        log_pis_np = self.policy.log_pis_np([observations_np], actions_np)

        self.assertEqual(actions_np.shape, (2, *self.env.action_space.shape))
        self.assertEqual(log_pis_np.shape, (2, 1))

    def test_env_step_with_actions(self):
        observation1_np = self.env.reset()
        action = self.policy.get_actions_np(observation1_np[None])[0, ...]
        self.env.step(action)

    def test_env_step_with_action(self):
        observation1_np = self.env.reset()
        action = self.policy.get_action_np(observation1_np)
        self.env.step(action)
        self.assertEqual(action.shape, self.env.action_space.shape)

    def test_trainable_variables(self):
        # One kernel and one bias per hidden layer, plus the output layer.
        self.assertEqual(
            len(self.policy.trainable_variables),
            2 * (len(self.hidden_layer_sizes) + 1))

    def test_get_diagnostics(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        observations_np = np.stack((observation1_np, observation2_np))

        diagnostics = self.policy.get_diagnostics([observations_np])
        self.assertTrue(isinstance(diagnostics, OrderedDict))
        # self.assertEqual(
        #     tuple(diagnostics.keys()),
        #     ('shifts-mean',
        #      'shifts-std',
        #      'log_scale_diags-mean',
        #      'log_scale_diags-std',
        #      '-log-pis-mean',
        #      '-log-pis-std',
        #      'raw-actions-mean',
        #      'raw-actions-std',
        #      'actions-mean',
        #      'actions-std'))
        for value in diagnostics.values():
            self.assertTrue(np.isscalar(value))

    def test_clone_target(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        observations_np = np.stack(
            (observation1_np, observation2_np)).astype(np.float32)

        weights = self.policy.get_weights()
        actions_np = self.policy.get_actions_np([observations_np])
        log_pis_np = self.policy.log_pis_np([observations_np], actions_np)

        target_name = "{}_{}".format("target", self.policy._name)
        target_policy = Serializable.clone(self.policy, name=target_name)

        weights_2 = target_policy.get_weights()
        log_pis_np_2 = target_policy.log_pis_np([observations_np], actions_np)

        self.assertEqual(target_policy._name, target_name)
        self.assertIsNot(weights, weights_2)
        # A clone shares architecture (weight shapes) but not weight values.
        for weight, weight_2 in zip(weights, weights_2):
            np.testing.assert_array_equal(weight.shape, weight_2.shape)
        np.testing.assert_array_equal(log_pis_np.shape, log_pis_np_2.shape)
        np.testing.assert_equal(
            actions_np.shape,
            self.policy.get_actions_np([observations_np]).shape)

    def test_serialize_deserialize(self):
        observation1_np = self.env.reset()
        observation2_np = self.env.step(self.env.action_space.sample())[0]
        observations_np = np.stack(
            (observation1_np, observation2_np)).astype(np.float32)

        weights = self.policy.get_weights()
        actions_np = self.policy.get_actions_np([observations_np])
        log_pis_np = self.policy.log_pis_np([observations_np], actions_np)

        serialized = pickle.dumps(self.policy)
        deserialized = pickle.loads(serialized)

        weights_2 = deserialized.get_weights()
        log_pis_np_2 = deserialized.log_pis_np([observations_np], actions_np)

        # A pickle round-trip must reproduce weights and log-probs exactly.
        for weight, weight_2 in zip(weights, weights_2):
            np.testing.assert_array_equal(weight, weight_2)
        np.testing.assert_array_equal(log_pis_np, log_pis_np_2)
        np.testing.assert_equal(
            actions_np.shape,
            deserialized.get_actions_np([observations_np]).shape)

    def test_latent_smoothing(self):
        observation_np = self.env.reset()
        smoothed_policy = GaussianMLPPolicy(
            input_shapes=(self.env.observation_space.shape, ),
            output_shape=self.env.action_space.shape,
            hidden_layer_sizes=self.hidden_layer_sizes,
            smoothing_coefficient=0.5,
        )

        np.testing.assert_equal(smoothed_policy._smoothing_x, 0.0)
        self.assertEqual(smoothed_policy._smoothing_alpha, 0.5)
        # beta rescales the smoothed latent so it keeps unit variance.
        self.assertEqual(
            smoothed_policy._smoothing_beta,
            np.sqrt(1.0 - np.power(smoothed_policy._smoothing_alpha, 2.0))
            / (1.0 - smoothed_policy._smoothing_alpha),
        )

        smoothing_x_previous = smoothed_policy._smoothing_x
        for _ in range(5):
            action_np = smoothed_policy.actions_np(
                [observation_np[None, :]])[0]
            observation_np = self.env.step(action_np)[0]

            # The smoothing state must change on every sampled action.
            self.assertFalse(
                np.all(
                    np.equal(smoothing_x_previous,
                             smoothed_policy._smoothing_x)))
            smoothing_x_previous = smoothed_policy._smoothing_x

        smoothed_policy.reset()
        np.testing.assert_equal(smoothed_policy._smoothing_x, 0.0)
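# Standalone numeric illustration (not part of the test suite): with
# beta = sqrt(1 - alpha**2) / (1 - alpha), as asserted above, the smoothing
# recursion x <- alpha * x + (1 - alpha) * eps with i.i.d. standard-normal
# latents eps yields unit-variance outputs beta * x. The recursion form is
# inferred from the beta definition, and the function name is hypothetical.
def _check_smoothing_variance(alpha=0.5, num_steps=200000, seed=0):
    rng = np.random.RandomState(seed)
    beta = np.sqrt(1.0 - alpha ** 2) / (1.0 - alpha)
    x, samples = 0.0, []
    for _ in range(num_steps):
        # Exponential smoothing of a unit-Gaussian latent.
        x = alpha * x + (1.0 - alpha) * rng.randn()
        samples.append(beta * x)
    # Stationary variance: beta**2 * (1 - alpha)**2 / (1 - alpha**2) == 1.
    np.testing.assert_allclose(np.var(samples), 1.0, atol=0.05)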