# Imports assumed for these snippets; the paths follow garage's test suite,
# but the snippets span several garage releases, so adjust them to the
# version the tests were taken from.
from unittest import mock

import numpy as np
import pytest
import tensorflow as tf

from garage.envs import GarageEnv
from garage.tf.envs import TfEnv
from garage.tf.misc import tensor_utils
from garage.tf.policies import CategoricalGRUPolicy
from garage.tf.policies import CategoricalGRUPolicyWithModel
from tests.fixtures import TfGraphTestCase
from tests.fixtures.envs.dummy import DummyDiscreteEnv
from tests.fixtures.models import SimpleGRUModel


# SimpleGRUModel makes the policy emit a constant probability of 0.5 per
# action, and patching numpy.random.rand to return 0 makes sampling
# deterministic (always action 0). Parametrization of obs_dim/action_dim/
# hidden_dim is omitted in this excerpt; see the sketch at the end of the
# file.
@mock.patch('numpy.random.rand')
def test_get_action(self, mock_rand, obs_dim, action_dim, hidden_dim):
    mock_rand.return_value = 0
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('garage.tf.policies.'
                     'categorical_gru_policy.GRUModel'),
                    new=SimpleGRUModel):
        policy = CategoricalGRUPolicy(env_spec=env.spec,
                                      state_include_action=False)

    policy.reset()
    obs = env.reset()
    expected_prob = np.full(action_dim, 0.5)

    action, agent_info = policy.get_action(obs)
    assert env.action_space.contains(action)
    assert action == 0
    assert np.array_equal(agent_info['prob'], expected_prob)

    actions, agent_infos = policy.get_actions([obs])
    for action, prob in zip(actions, agent_infos['prob']):
        assert env.action_space.contains(action)
        assert action == 0
        assert np.array_equal(prob, expected_prob)

def test_categorical_gru_policy(self):
    categorical_gru_policy = CategoricalGRUPolicy(
        env_spec=self.env, hidden_dim=1, state_include_action=False)
    categorical_gru_policy.reset()

    obs = self.env.observation_space.high
    assert categorical_gru_policy.get_action(obs)

def test_categorical_gru_policy(self):
    categorical_gru_policy = CategoricalGRUPolicy(env_spec=self.env,
                                                  hidden_dim=1)
    self.sess.run(tf.global_variables_initializer())
    categorical_gru_policy.reset()

    obs = self.env.observation_space.high
    assert categorical_gru_policy.get_action(obs)

def test_categorical_gru_policy(self):
    categorical_gru_policy = CategoricalGRUPolicy(
        env_spec=self.env, hidden_dim=1, state_include_action=False)
    self.sess.run(tf.compat.v1.global_variables_initializer())

    categorical_gru_policy.build(self.obs_var)
    categorical_gru_policy.reset()

    obs = self.env.observation_space.high
    assert categorical_gru_policy.get_action(obs)

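# The three smoke-test variants above reference self.env, self.sess and
# self.obs_var from an enclosing test class that this excerpt omits. A
# minimal sketch of that scaffolding, assuming garage's TfGraphTestCase
# fixture (which provides self.sess); the class name and setup details here
# are illustrative, not from the source:
class TestCategoricalGRUPolicySmoke(TfGraphTestCase):

    def setUp(self):
        super().setUp()
        # The tests pass self.env directly as env_spec, so it must expose
        # observation_space/action_space; an EnvSpec satisfies that.
        self.env = GarageEnv(
            DummyDiscreteEnv(obs_dim=(1, ), action_dim=1)).spec
        self.obs_var = tf.compat.v1.placeholder(
            tf.float32,
            shape=[None, None, self.env.observation_space.flat_dim],
            name='obs')
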
def test_get_action(self, obs_dim, action_dim, hidden_dim):
    env = GarageEnv(
        DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = CategoricalGRUPolicy(env_spec=env.spec,
                                  hidden_dim=hidden_dim,
                                  state_include_action=False)

    policy.reset(do_resets=None)
    obs = env.reset()

    action, _ = policy.get_action(obs.flatten())
    assert env.action_space.contains(action)

    actions, _ = policy.get_actions([obs.flatten()])
    for action in actions:
        assert env.action_space.contains(action)

def test_get_action(self, obs_dim, action_dim, hidden_dim):
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))

    obs_var = tf.compat.v1.placeholder(
        tf.float32,
        shape=[None, None, env.observation_space.flat_dim],
        name='obs')
    policy = CategoricalGRUPolicy(env_spec=env.spec,
                                  hidden_dim=hidden_dim,
                                  state_include_action=False)
    policy.build(obs_var)

    policy.reset(do_resets=None)
    obs = env.reset()

    action, _ = policy.get_action(obs.flatten())
    assert env.action_space.contains(action)

    actions, _ = policy.get_actions([obs.flatten()])
    for action in actions:
        assert env.action_space.contains(action)

def test_build_state_not_include_action(self, obs_dim, action_dim,
                                        hidden_dim):
    env = GarageEnv(
        DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = CategoricalGRUPolicy(env_spec=env.spec,
                                  hidden_dim=hidden_dim,
                                  state_include_action=False)
    policy.reset(do_resets=None)
    obs = env.reset()

    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, None,
                                                  policy.input_dim))
    dist_sym = policy.build(state_input, name='dist_sym').dist

    # The default network and the freshly built one share parameters, so
    # feeding the same observations must yield identical probabilities.
    output1 = self.sess.run(
        [policy.distribution.probs],
        feed_dict={policy.model.input: [[obs.flatten()], [obs.flatten()]]})
    output2 = self.sess.run(
        [dist_sym.probs],
        feed_dict={state_input: [[obs.flatten()], [obs.flatten()]]})
    assert np.array_equal(output1, output2)

def test_dist_info_sym(self, obs_dim, action_dim, hidden_dim):
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))

    obs_ph = tf.compat.v1.placeholder(
        tf.float32, shape=(None, None, env.observation_space.flat_dim))

    with mock.patch(('garage.tf.policies.'
                     'categorical_gru_policy.GRUModel'),
                    new=SimpleGRUModel):
        policy = CategoricalGRUPolicy(env_spec=env.spec,
                                      state_include_action=False)

    policy.reset()
    obs = env.reset()

    dist_sym = policy.dist_info_sym(obs_var=obs_ph,
                                    state_info_vars=None,
                                    name='p2_sym')
    dist = self.sess.run(
        dist_sym, feed_dict={obs_ph: [[obs.flatten()], [obs.flatten()]]})
    assert np.array_equal(dist['prob'], np.full((2, 1, action_dim), 0.5))

def test_build_state_include_action(self, obs_dim, action_dim, hidden_dim):
    env = GarageEnv(
        DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = CategoricalGRUPolicy(env_spec=env.spec,
                                  hidden_dim=hidden_dim,
                                  state_include_action=True)
    policy.reset(do_resets=None)
    obs = env.reset()

    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, None,
                                                  policy.input_dim))
    dist_sym = policy.build(state_input, name='dist_sym').dist
    dist_sym2 = policy.build(state_input, name='dist_sym2').dist

    # With state_include_action=True, the policy input is the observation
    # concatenated with the previous action (all zeros right after reset).
    # The two builds share parameters, so their outputs must match.
    concat_obs = np.concatenate([obs.flatten(), np.zeros(action_dim)])
    output1 = self.sess.run(
        [dist_sym.probs],
        feed_dict={state_input: [[concat_obs], [concat_obs]]})
    output2 = self.sess.run(
        [dist_sym2.probs],
        feed_dict={state_input: [[concat_obs], [concat_obs]]})
    assert np.array_equal(output1, output2)

def test_dist_info_sym_wrong_input(self):
    env = TfEnv(DummyDiscreteEnv(obs_dim=(1, ), action_dim=1))

    obs_ph = tf.compat.v1.placeholder(
        tf.float32, shape=(None, None, env.observation_space.flat_dim))

    with mock.patch(('garage.tf.policies.'
                     'categorical_gru_policy.GRUModel'),
                    new=SimpleGRUModel):
        policy = CategoricalGRUPolicy(env_spec=env.spec,
                                      state_include_action=True)

    policy.reset()
    obs = env.reset()

    policy.dist_info_sym(
        obs_var=obs_ph,
        state_info_vars={'prev_action': np.zeros((3, 1, 1))},
        name='p2_sym')

    # observation batch size = 2 but prev_action batch size = 3
    with pytest.raises(tf.errors.InvalidArgumentError):
        self.sess.run(
            policy.model.networks['p2_sym'].input,
            feed_dict={obs_ph: [[obs.flatten()], [obs.flatten()]]})

class TestCategoricalGRUPolicyWithModelTransit(TfGraphTestCase):

    def setUp(self):
        super().setUp()
        env = TfEnv(DummyDiscreteEnv(obs_dim=(1, ), action_dim=1))

        self.default_initializer = tf.constant_initializer(1)
        self.default_hidden_nonlinearity = tf.nn.tanh
        self.default_recurrent_nonlinearity = tf.nn.sigmoid
        self.default_output_nonlinearity = None
        self.time_step = 1

        self.policy1 = CategoricalGRUPolicy(
            env_spec=env.spec,
            hidden_dim=4,
            hidden_nonlinearity=self.default_hidden_nonlinearity,
            recurrent_nonlinearity=self.default_recurrent_nonlinearity,
            recurrent_w_x_init=self.default_initializer,
            recurrent_w_h_init=self.default_initializer,
            output_nonlinearity=self.default_output_nonlinearity,
            output_w_init=self.default_initializer,
            state_include_action=True,
            name='P1')
        self.policy2 = CategoricalGRUPolicy(
            env_spec=env.spec,
            hidden_dim=4,
            hidden_nonlinearity=self.default_hidden_nonlinearity,
            recurrent_nonlinearity=self.default_recurrent_nonlinearity,
            recurrent_w_x_init=self.default_initializer,
            recurrent_w_h_init=self.default_initializer,
            output_nonlinearity=self.default_output_nonlinearity,
            output_w_init=tf.constant_initializer(2),
            state_include_action=True,
            name='P2')

        self.sess.run(tf.global_variables_initializer())

        self.policy3 = CategoricalGRUPolicyWithModel(
            env_spec=env.spec,
            hidden_dim=4,
            hidden_nonlinearity=self.default_hidden_nonlinearity,
            hidden_w_init=self.default_initializer,
            recurrent_nonlinearity=self.default_recurrent_nonlinearity,
            recurrent_w_init=self.default_initializer,
            output_nonlinearity=self.default_output_nonlinearity,
            output_w_init=self.default_initializer,
            state_include_action=True,
            name='P3')
        self.policy4 = CategoricalGRUPolicyWithModel(
            env_spec=env.spec,
            hidden_dim=4,
            hidden_nonlinearity=self.default_hidden_nonlinearity,
            hidden_w_init=self.default_initializer,
            recurrent_nonlinearity=self.default_recurrent_nonlinearity,
            recurrent_w_init=self.default_initializer,
            output_nonlinearity=self.default_output_nonlinearity,
            output_w_init=tf.constant_initializer(2),
            state_include_action=True,
            name='P4')

        self.policy1.reset()
        self.policy2.reset()
        self.policy3.reset()
        self.policy4.reset()
        self.obs = [env.reset()]
        self.obs = np.concatenate(
            [self.obs for _ in range(self.time_step)], axis=0)

        self.obs_ph = tf.placeholder(
            tf.float32, shape=(None, None, env.observation_space.flat_dim))
        self.action_ph = tf.placeholder(
            tf.float32, shape=(None, None, env.action_space.flat_dim))

        self.dist1_sym = self.policy1.dist_info_sym(
            obs_var=self.obs_ph,
            state_info_vars={
                'prev_action': np.zeros((2, self.time_step, 1))
            },
            name='p1_sym')
        self.dist2_sym = self.policy2.dist_info_sym(
            obs_var=self.obs_ph,
            state_info_vars={
                'prev_action': np.zeros((2, self.time_step, 1))
            },
            name='p2_sym')
        self.dist3_sym = self.policy3.dist_info_sym(
            obs_var=self.obs_ph,
            state_info_vars={
                'prev_action': np.zeros((2, self.time_step, 1))
            },
            name='p3_sym')
        self.dist4_sym = self.policy4.dist_info_sym(
            obs_var=self.obs_ph,
            state_info_vars={
                'prev_action': np.zeros((2, self.time_step, 1))
            },
            name='p4_sym')

    def test_dist_info_sym_output(self):
        # batch size = 2
        dist1 = self.sess.run(self.dist1_sym,
                              feed_dict={self.obs_ph: [self.obs, self.obs]})
        dist2 = self.sess.run(self.dist2_sym,
                              feed_dict={self.obs_ph: [self.obs, self.obs]})
        dist3 = self.sess.run(self.dist3_sym,
                              feed_dict={self.obs_ph: [self.obs, self.obs]})
        dist4 = self.sess.run(self.dist4_sym,
                              feed_dict={self.obs_ph: [self.obs, self.obs]})

        assert np.array_equal(dist1['prob'], dist3['prob'])
        assert np.array_equal(dist2['prob'], dist4['prob'])

    @mock.patch('numpy.random.rand')
    def test_get_action(self, mock_rand):
        mock_rand.return_value = 0
        action1, agent_info1 = self.policy1.get_action(self.obs)
        action2, agent_info2 = self.policy2.get_action(self.obs)
        action3, agent_info3 = self.policy3.get_action(self.obs)
        action4, agent_info4 = self.policy4.get_action(self.obs)

        assert action1 == action3
        assert action2 == action4
        assert np.array_equal(agent_info1['prob'], agent_info3['prob'])
        assert np.array_equal(agent_info2['prob'], agent_info4['prob'])

        actions1, agent_infos1 = self.policy1.get_actions([self.obs])
        actions2, agent_infos2 = self.policy2.get_actions([self.obs])
        actions3, agent_infos3 = self.policy3.get_actions([self.obs])
        actions4, agent_infos4 = self.policy4.get_actions([self.obs])

        assert np.array_equal(actions1, actions3)
        assert np.array_equal(actions2, actions4)
        assert np.array_equal(agent_infos1['prob'], agent_infos3['prob'])
        assert np.array_equal(agent_infos2['prob'], agent_infos4['prob'])

    def test_kl_sym(self):
        kl_diff_sym1 = self.policy1.distribution.kl_sym(
            self.dist1_sym, self.dist2_sym)
        objective1 = tf.reduce_mean(kl_diff_sym1)
        kl_func = tensor_utils.compile_function([self.obs_ph], objective1)
        kl1 = kl_func([self.obs, self.obs])

        kl_diff_sym2 = self.policy3.distribution.kl_sym(
            self.dist3_sym, self.dist4_sym)
        objective2 = tf.reduce_mean(kl_diff_sym2)
        kl_func = tensor_utils.compile_function([self.obs_ph], objective2)
        kl2 = kl_func([self.obs, self.obs])

        assert np.array_equal(kl1, kl2)

    def test_log_likelihood_sym(self):
        log_prob_sym1 = self.policy1.distribution.log_likelihood_sym(
            self.action_ph, self.dist1_sym)
        log_prob_func = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym1)
        log_prob1 = log_prob_func([self.obs, self.obs],
                                  np.ones((2, self.time_step, 1)))

        log_prob_sym2 = self.policy3.distribution.log_likelihood_sym(
            self.action_ph, self.dist3_sym)
        log_prob_func2 = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym2)
        log_prob2 = log_prob_func2([self.obs, self.obs],
                                   np.ones((2, self.time_step, 1)))
        assert np.array_equal(log_prob1, log_prob2)

        log_prob_sym1 = self.policy2.distribution.log_likelihood_sym(
            self.action_ph, self.dist2_sym)
        log_prob_func = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym1)
        log_prob1 = log_prob_func([self.obs, self.obs],
                                  np.ones((2, self.time_step, 1)))

        log_prob_sym2 = self.policy4.distribution.log_likelihood_sym(
            self.action_ph, self.dist4_sym)
        log_prob_func2 = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym2)
        log_prob2 = log_prob_func2([self.obs, self.obs],
                                   np.ones((2, self.time_step, 1)))
        assert np.array_equal(log_prob1, log_prob2)

    def test_policy_entropy_sym(self):
        entropy_sym1 = self.policy1.distribution.entropy_sym(
            self.dist1_sym, name='entropy_sym1')
        entropy_func = tensor_utils.compile_function([self.obs_ph],
                                                     entropy_sym1)
        entropy1 = entropy_func([self.obs, self.obs])

        entropy_sym2 = self.policy3.distribution.entropy_sym(
            self.dist3_sym, name='entropy_sym1')
        entropy_func = tensor_utils.compile_function([self.obs_ph],
                                                     entropy_sym2)
        entropy2 = entropy_func([self.obs, self.obs])
        assert np.array_equal(entropy1, entropy2)

    def test_likelihood_ratio_sym(self):
        likelihood_ratio_sym1 = self.policy1.distribution.likelihood_ratio_sym(
            self.action_ph,
            self.dist1_sym,
            self.dist2_sym,
            name='li_ratio_sym1')
        likelihood_ratio_func = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], likelihood_ratio_sym1)
        likelihood_ratio1 = likelihood_ratio_func(np.ones((2, 1, 1)),
                                                  [self.obs, self.obs])

        likelihood_ratio_sym2 = self.policy3.distribution.likelihood_ratio_sym(
            self.action_ph,
            self.dist3_sym,
            self.dist4_sym,
            name='li_ratio_sym2')
        likelihood_ratio_func = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], likelihood_ratio_sym2)
        likelihood_ratio2 = likelihood_ratio_func(np.ones((2, 1, 1)),
                                                  [self.obs, self.obs])

        assert np.array_equal(likelihood_ratio1, likelihood_ratio2)
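
# Several tests above take (obs_dim, action_dim, hidden_dim) arguments that
# pytest has to supply through parametrization; the decorators were dropped
# from this excerpt. A sketch of what they presumably look like (the value
# grid below is illustrative, not from the source):
@pytest.mark.parametrize('obs_dim, action_dim, hidden_dim', [
    ((1, ), 1, 4),
    ((2, ), 2, 4),
    ((1, 1), 1, 4),
    ((2, 2), 2, 4),
])
def test_get_action(self, obs_dim, action_dim, hidden_dim):
    ...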