def test_is_pickleable(self):
    env = GarageEnv(DummyDiscreteEnv(obs_dim=(1, ), action_dim=1))
    policy = CategoricalGRUPolicy(env_spec=env.spec,
                                  state_include_action=False)
    obs = env.reset()

    policy._gru_cell.weights[0].load(
        tf.ones_like(policy._gru_cell.weights[0]).eval())

    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, None,
                                                  policy.input_dim))
    dist_sym = policy.build(state_input, name='dist_sym').dist
    output1 = self.sess.run(
        [dist_sym.probs],
        feed_dict={state_input: [[obs.flatten()], [obs.flatten()]]})

    p = pickle.dumps(policy)
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        policy_pickled = pickle.loads(p)
        state_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, None,
                                                      policy.input_dim))
        dist_sym = policy_pickled.build(state_input, name='dist_sym').dist
        # yapf: disable
        output2 = sess.run(
            [dist_sym.probs],
            feed_dict={
                state_input: [[obs.flatten()], [obs.flatten()]]
            })
        # yapf: enable
        assert np.array_equal(output1, output2)
def test_categorical_gru_policy(self):
    categorical_gru_policy = CategoricalGRUPolicy(
        env_spec=self.env, hidden_dim=1, state_include_action=False)
    categorical_gru_policy.reset()

    obs = self.env.observation_space.high
    assert categorical_gru_policy.get_action(obs)
def test_clone(self):
    env = GarageEnv(DummyDiscreteEnv(obs_dim=(10, ), action_dim=4))
    policy = CategoricalGRUPolicy(env_spec=env.spec)
    policy_clone = policy.clone('CategoricalGRUPolicyClone')
    assert policy.env_spec == policy_clone.env_spec
    for cloned_param, param in zip(policy_clone.parameters.values(),
                                   policy.parameters.values()):
        assert np.array_equal(cloned_param, param)
def test_categorical_gru_policy(self):
    categorical_gru_policy = CategoricalGRUPolicy(env_spec=self.env,
                                                  hidden_dim=1)
    self.sess.run(tf.global_variables_initializer())
    categorical_gru_policy.reset()

    obs = self.env.observation_space.high
    assert categorical_gru_policy.get_action(obs)
def test_is_pickleable(self):
    env = TfEnv(DummyDiscreteEnv(obs_dim=(1, ), action_dim=1))
    with mock.patch(('garage.tf.policies.'
                     'categorical_gru_policy.GRUModel'),
                    new=SimpleGRUModel):
        policy = CategoricalGRUPolicy(env_spec=env.spec,
                                      state_include_action=False)

    env.reset()
    obs = env.reset()

    with tf.compat.v1.variable_scope('CategoricalGRUPolicy/prob_network',
                                     reuse=True):
        return_var = tf.compat.v1.get_variable('return_var')
    # assign the variable to all ones
    return_var.load(tf.ones_like(return_var).eval())

    output1 = self.sess.run(
        policy.model.outputs[0],
        feed_dict={policy.model.input: [[obs.flatten()], [obs.flatten()]]})

    p = pickle.dumps(policy)
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        policy_pickled = pickle.loads(p)
        # yapf: disable
        output2 = sess.run(
            policy_pickled.model.outputs[0],
            feed_dict={
                policy_pickled.model.input: [[obs.flatten()],
                                             [obs.flatten()]]
            })
        # yapf: enable
        assert np.array_equal(output1, output2)
def test_invalid_env(self):
    env = TfEnv(DummyBoxEnv())
    with mock.patch(('garage.tf.policies.'
                     'categorical_gru_policy.GRUModel'),
                    new=SimpleGRUModel):
        with pytest.raises(ValueError):
            CategoricalGRUPolicy(env_spec=env.spec)
def test_trpo_gru_cartpole(self):
    deterministic.set_seed(2)
    with TFTrainer(snapshot_config, sess=self.sess) as trainer:
        env = normalize(GymEnv('CartPole-v1', max_episode_length=100))

        policy = CategoricalGRUPolicy(name='policy', env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            is_tf_worker=True)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    sampler=sampler,
                    discount=0.99,
                    max_kl_step=0.01,
                    optimizer_args=dict(hvp_approach=FiniteDifferenceHVP(
                        base_eps=1e-5)))

        trainer.setup(algo, env)
        last_avg_ret = trainer.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 40

        env.close()
def train_gru_trpo(ctxt=None):
    set_seed(seed)
    with TFTrainer(snapshot_config=ctxt) as trainer:
        env = MyGymEnv(gym_env, max_episode_length=100)
        policy = CategoricalGRUPolicy(name='policy',
                                      env_spec=env.spec,
                                      state_include_action=False)
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            worker_class=FragmentWorker,
        )
        self.algo = LoggedTRPO(env=env,
                               env_spec=env.spec,
                               policy=policy,
                               baseline=baseline,
                               sampler=sampler,
                               discount=0.99,
                               max_kl_step=0.01,
                               optimizer_args=dict(
                                   hvp_approach=FiniteDifferenceHVP(
                                       base_eps=1e-5)))
        trainer.setup(self.algo, env)
        trainer.train(n_epochs=n_eps, batch_size=4000)
        return self.algo.rew_chkpts
def test_is_pickleable(self):
    env = GarageEnv(DummyDiscreteEnv(obs_dim=(1, ), action_dim=1))
    policy = CategoricalGRUPolicy(env_spec=env.spec,
                                  state_include_action=False)
    obs = env.reset()

    policy.model._gru_cell.weights[0].load(
        tf.ones_like(policy.model._gru_cell.weights[0]).eval())

    output1 = self.sess.run(
        [policy.distribution.probs],
        feed_dict={policy.model.input: [[obs.flatten()], [obs.flatten()]]})

    p = pickle.dumps(policy)
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        policy_pickled = pickle.loads(p)
        # yapf: disable
        output2 = sess.run(
            [policy_pickled.distribution.probs],
            feed_dict={
                policy_pickled.model.input: [[obs.flatten()],
                                             [obs.flatten()]]
            })
        # yapf: enable
        assert np.array_equal(output1, output2)
def train_ppo(ctxt=None):
    set_seed(seed)
    with TFTrainer(ctxt) as trainer:
        env = MyGymEnv(gym_env, max_episode_length=100)
        policy = CategoricalGRUPolicy(name='policy',
                                      env_spec=env.spec,
                                      state_include_action=False)
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length,
            worker_class=FragmentWorker,
            is_tf_worker=True,
        )
        self.algo = LoggedPPO(env=env,
                              env_spec=env.spec,
                              policy=policy,
                              baseline=baseline,
                              sampler=sampler,
                              discount=0.99,
                              center_adv=False,
                              optimizer_args=dict(max_optimization_epochs=8))
        trainer.setup(self.algo, env)
        trainer.train(n_epochs=n_eps, batch_size=4000)
        return self.algo.rew_chkpts
def run_garage(env, seed, log_dir):
    """Create garage model and training.

    Replace the ppo with the algorithm you want to run.

    Args:
        env (gym.Env): Environment of the task.
        seed (int): Random seed for the trial.
        log_dir (str): Log dir path.

    Returns:
        str: Path to output csv file

    """
    deterministic.set_seed(seed)
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True,
                                      intra_op_parallelism_threads=12,
                                      inter_op_parallelism_threads=12)
    sess = tf.compat.v1.Session(config=config)
    with LocalTFRunner(snapshot_config, sess=sess, max_cpus=12) as runner:
        env = TfEnv(normalize(env))

        policy = CategoricalGRUPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            max_path_length=100,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_epochs=10,
                tf_optimizer_args=dict(learning_rate=1e-3),
            ),
        )

        # Set up logger since we are not using run_experiment
        tabular_log_file = osp.join(log_dir, 'progress.csv')
        dowel_logger.add_output(dowel.StdOutput())
        dowel_logger.add_output(dowel.CsvOutput(tabular_log_file))
        dowel_logger.add_output(dowel.TensorBoardOutput(log_dir))

        runner.setup(algo, env, sampler_args=dict(n_envs=12))
        runner.train(n_epochs=488, batch_size=2048)

        dowel_logger.remove_all()

        return tabular_log_file
def test_categorical_gru_policy(self):
    categorical_gru_policy = CategoricalGRUPolicy(
        env_spec=self.env, hidden_dim=1, state_include_action=False)
    self.sess.run(tf.compat.v1.global_variables_initializer())
    categorical_gru_policy.build(self.obs_var)

    categorical_gru_policy.reset()

    obs = self.env.observation_space.high
    assert categorical_gru_policy.get_action(obs)
def test_build_state_not_include_action(self, obs_dim, action_dim,
                                        hidden_dim):
    env = GarageEnv(
        DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = CategoricalGRUPolicy(env_spec=env.spec,
                                  hidden_dim=hidden_dim,
                                  state_include_action=False)
    policy.reset(do_resets=None)
    obs = env.reset()

    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, None,
                                                  policy.input_dim))
    dist_sym = policy.build(state_input, name='dist_sym').dist

    output1 = self.sess.run(
        [policy.distribution.probs],
        feed_dict={policy.model.input: [[obs.flatten()], [obs.flatten()]]})
    output2 = self.sess.run(
        [dist_sym.probs],
        feed_dict={state_input: [[obs.flatten()], [obs.flatten()]]})
    assert np.array_equal(output1, output2)
def test_get_action(self, mock_rand, obs_dim, action_dim, hidden_dim):
    mock_rand.return_value = 0

    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    with mock.patch(('garage.tf.policies.'
                     'categorical_gru_policy.GRUModel'),
                    new=SimpleGRUModel):
        policy = CategoricalGRUPolicy(env_spec=env.spec,
                                      state_include_action=False)

    policy.reset()
    obs = env.reset()

    expected_prob = np.full(action_dim, 0.5)

    action, agent_info = policy.get_action(obs)
    assert env.action_space.contains(action)
    assert action == 0
    assert np.array_equal(agent_info['prob'], expected_prob)

    actions, agent_infos = policy.get_actions([obs])
    for action, prob in zip(actions, agent_infos['prob']):
        assert env.action_space.contains(action)
        assert action == 0
        assert np.array_equal(prob, expected_prob)
def test_dist_info_sym(self, obs_dim, action_dim, hidden_dim):
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))

    obs_ph = tf.compat.v1.placeholder(
        tf.float32, shape=(None, None, env.observation_space.flat_dim))

    with mock.patch(('garage.tf.policies.'
                     'categorical_gru_policy.GRUModel'),
                    new=SimpleGRUModel):
        policy = CategoricalGRUPolicy(env_spec=env.spec,
                                      state_include_action=False)

    policy.reset()
    obs = env.reset()

    dist_sym = policy.dist_info_sym(obs_var=obs_ph,
                                    state_info_vars=None,
                                    name='p2_sym')
    dist = self.sess.run(
        dist_sym, feed_dict={obs_ph: [[obs.flatten()], [obs.flatten()]]})
    assert np.array_equal(dist['prob'], np.full((2, 1, action_dim), 0.5))
def categorical_gru_policy(ctxt, env_id, seed):
    """Create Categorical GRU Policy on TF-PPO.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by Trainer to create the snapshotter.
        env_id (str): Environment id of the task.
        seed (int): Random positive integer for the trial.

    """
    deterministic.set_seed(seed)

    with TFTrainer(ctxt) as trainer:
        env = normalize(GymEnv(env_id))

        policy = CategoricalGRUPolicy(
            env_spec=env.spec,
            hidden_dim=32,
            hidden_nonlinearity=tf.nn.tanh,
        )

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        sampler = LocalSampler(
            agents=policy,
            envs=env,
            max_episode_length=env.spec.max_episode_length)

        algo = PPO(
            env_spec=env.spec,
            policy=policy,
            baseline=baseline,
            sampler=sampler,
            discount=0.99,
            gae_lambda=0.95,
            lr_clip_range=0.2,
            policy_ent_coeff=0.0,
            optimizer_args=dict(
                batch_size=32,
                max_optimization_epochs=10,
                learning_rate=1e-3,
            ),
        )

        trainer.setup(algo, env)
        trainer.train(n_epochs=488, batch_size=2048)
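# Hedged usage sketch (not from the original file): experiment functions
# like categorical_gru_policy above are typically wrapped with garage's
# wrap_experiment so the snapshot context (ctxt) is injected automatically.
# The env_id and seed values below are illustrative assumptions.
from garage import wrap_experiment

launcher = wrap_experiment(categorical_gru_policy)
# launcher(env_id='CartPole-v1', seed=1)  # would start training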
def test_get_action(self, obs_dim, action_dim, hidden_dim):
    env = TfEnv(DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    obs_var = tf.compat.v1.placeholder(
        tf.float32,
        shape=[None, None, env.observation_space.flat_dim],
        name='obs')
    policy = CategoricalGRUPolicy(env_spec=env.spec,
                                  hidden_dim=hidden_dim,
                                  state_include_action=False)

    policy.build(obs_var)
    policy.reset(do_resets=None)
    obs = env.reset()

    action, _ = policy.get_action(obs.flatten())
    assert env.action_space.contains(action)

    actions, _ = policy.get_actions([obs.flatten()])
    for action in actions:
        assert env.action_space.contains(action)
def test_policies(self):
    """Test the policies initialization."""
    box_env = TfEnv(DummyBoxEnv())
    discrete_env = TfEnv(DummyDiscreteEnv())
    categorical_gru_policy = CategoricalGRUPolicy(env_spec=discrete_env,
                                                  hidden_dim=1)
    categorical_lstm_policy = CategoricalLSTMPolicy(env_spec=discrete_env,
                                                    hidden_dim=1)
    categorical_mlp_policy = CategoricalMLPPolicy(env_spec=discrete_env,
                                                  hidden_sizes=(1, ))
    continuous_mlp_policy = ContinuousMLPPolicy(env_spec=box_env,
                                                hidden_sizes=(1, ))
    deterministic_mlp_policy = DeterministicMLPPolicy(env_spec=box_env,
                                                      hidden_sizes=(1, ))
    gaussian_gru_policy = GaussianGRUPolicy(env_spec=box_env, hidden_dim=1)
    gaussian_lstm_policy = GaussianLSTMPolicy(env_spec=box_env,
                                              hidden_dim=1)
    gaussian_mlp_policy = GaussianMLPPolicy(env_spec=box_env,
                                            hidden_sizes=(1, ))
def test_get_action(self, obs_dim, action_dim, hidden_dim):
    env = GarageEnv(
        DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = CategoricalGRUPolicy(env_spec=env.spec,
                                  hidden_dim=hidden_dim,
                                  state_include_action=False)

    policy.reset(do_resets=None)
    obs = env.reset()

    action, _ = policy.get_action(obs.flatten())
    assert env.action_space.contains(action)

    actions, _ = policy.get_actions([obs.flatten()])
    for action in actions:
        assert env.action_space.contains(action)
def test_trpo_gru_cartpole(self):
    with LocalTFRunner(snapshot_config, sess=self.sess) as runner:
        env = TfEnv(normalize(gym.make('CartPole-v1')))

        policy = CategoricalGRUPolicy(name='policy', env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    max_kl_step=0.01,
                    optimizer_args=dict(hvp_approach=FiniteDifferenceHvp(
                        base_eps=1e-5)))

        runner.setup(algo, env)
        last_avg_ret = runner.train(n_epochs=10, batch_size=2048)
        assert last_avg_ret > 80

        env.close()
def test_build_state_include_action(self, obs_dim, action_dim, hidden_dim):
    env = GarageEnv(
        DummyDiscreteEnv(obs_dim=obs_dim, action_dim=action_dim))
    policy = CategoricalGRUPolicy(env_spec=env.spec,
                                  hidden_dim=hidden_dim,
                                  state_include_action=True)
    policy.reset(do_resets=None)
    obs = env.reset()

    state_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, None,
                                                  policy.input_dim))
    dist_sym = policy.build(state_input, name='dist_sym').dist
    dist_sym2 = policy.build(state_input, name='dist_sym2').dist

    concat_obs = np.concatenate([obs.flatten(), np.zeros(action_dim)])
    output1 = self.sess.run(
        [dist_sym.probs],
        feed_dict={state_input: [[concat_obs], [concat_obs]]})
    output2 = self.sess.run(
        [dist_sym2.probs],
        feed_dict={state_input: [[concat_obs], [concat_obs]]})
    assert np.array_equal(output1, output2)
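# A hedged sketch (not from the original suite) of the pytest
# parametrization implied by the (obs_dim, action_dim, hidden_dim)
# arguments used by the get_action and build tests above; the test name
# and value tuples here are illustrative assumptions, not the original
# fixtures.
import pytest


@pytest.mark.parametrize('obs_dim, action_dim, hidden_dim', [
    ((1, ), 1, 4),
    ((2, ), 2, 4),
    ((1, 1), 1, 4),
    ((2, 2), 2, 4),
])
def test_dims_are_plumbed_through(obs_dim, action_dim, hidden_dim):
    # Placeholder body: the real tests build a CategoricalGRUPolicy with
    # these dimensions and check its actions and symbolic outputs.
    assert len(obs_dim) >= 1 and action_dim >= 1 and hidden_dim >= 1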
def test_dist_info_sym_wrong_input(self):
    env = TfEnv(DummyDiscreteEnv(obs_dim=(1, ), action_dim=1))

    obs_ph = tf.compat.v1.placeholder(
        tf.float32, shape=(None, None, env.observation_space.flat_dim))

    with mock.patch(('garage.tf.policies.'
                     'categorical_gru_policy.GRUModel'),
                    new=SimpleGRUModel):
        policy = CategoricalGRUPolicy(env_spec=env.spec,
                                      state_include_action=True)

    policy.reset()
    obs = env.reset()

    policy.dist_info_sym(
        obs_var=obs_ph,
        state_info_vars={'prev_action': np.zeros((3, 1, 1))},
        name='p2_sym')

    # observation batch size = 2 but prev_action batch size = 3
    with pytest.raises(tf.errors.InvalidArgumentError):
        self.sess.run(
            policy.model.networks['p2_sym'].input,
            feed_dict={obs_ph: [[obs.flatten()], [obs.flatten()]]})
def test_invalid_env(self):
    env = GarageEnv(DummyBoxEnv())
    with pytest.raises(ValueError):
        CategoricalGRUPolicy(env_spec=env.spec)
def test_clone(self):
    env = GarageEnv(DummyDiscreteEnv(obs_dim=(10, ), action_dim=4))
    policy = CategoricalGRUPolicy(env_spec=env.spec)
    policy_clone = policy.clone('CategoricalGRUPolicyClone')
    assert policy.env_spec == policy_clone.env_spec
def test_state_info_specs_with_state_include_action(self):
    env = GarageEnv(DummyDiscreteEnv(obs_dim=(10, ), action_dim=4))
    policy = CategoricalGRUPolicy(env_spec=env.spec,
                                  state_include_action=True)
    assert policy.state_info_specs == [('prev_action', (4, ))]
def test_state_info_specs(self):
    env = GarageEnv(DummyDiscreteEnv(obs_dim=(10, ), action_dim=4))
    policy = CategoricalGRUPolicy(env_spec=env.spec,
                                  state_include_action=False)
    assert policy.state_info_specs == []
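# A minimal numpy sketch of what state_include_action means for the policy
# input, as exercised by test_build_state_include_action and the
# state_info_specs tests above: the flattened observation is concatenated
# with the previous (one-hot) action, so the input dimension grows by
# action_dim. The shapes below are illustrative only.
import numpy as np

obs_flat = np.zeros(10)      # flattened observation, obs_dim=(10, )
prev_action = np.zeros(4)    # one-hot previous action, action_dim=4
policy_input = np.concatenate([obs_flat, prev_action])
assert policy_input.shape == (14, )  # input_dim = obs flat_dim + action_dim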
class TestCategoricalGRUPolicyWithModelTransit(TfGraphTestCase):

    def setUp(self):
        super().setUp()
        env = TfEnv(DummyDiscreteEnv(obs_dim=(1, ), action_dim=1))
        self.default_initializer = tf.constant_initializer(1)
        self.default_hidden_nonlinearity = tf.nn.tanh
        self.default_recurrent_nonlinearity = tf.nn.sigmoid
        self.default_output_nonlinearity = None
        self.time_step = 1

        self.policy1 = CategoricalGRUPolicy(
            env_spec=env.spec,
            hidden_dim=4,
            hidden_nonlinearity=self.default_hidden_nonlinearity,
            recurrent_nonlinearity=self.default_recurrent_nonlinearity,
            recurrent_w_x_init=self.default_initializer,
            recurrent_w_h_init=self.default_initializer,
            output_nonlinearity=self.default_output_nonlinearity,
            output_w_init=self.default_initializer,
            state_include_action=True,
            name='P1')
        self.policy2 = CategoricalGRUPolicy(
            env_spec=env.spec,
            hidden_dim=4,
            hidden_nonlinearity=self.default_hidden_nonlinearity,
            recurrent_nonlinearity=self.default_recurrent_nonlinearity,
            recurrent_w_x_init=self.default_initializer,
            recurrent_w_h_init=self.default_initializer,
            output_nonlinearity=self.default_output_nonlinearity,
            output_w_init=tf.constant_initializer(2),
            state_include_action=True,
            name='P2')

        self.sess.run(tf.global_variables_initializer())

        self.policy3 = CategoricalGRUPolicyWithModel(
            env_spec=env.spec,
            hidden_dim=4,
            hidden_nonlinearity=self.default_hidden_nonlinearity,
            hidden_w_init=self.default_initializer,
            recurrent_nonlinearity=self.default_recurrent_nonlinearity,
            recurrent_w_init=self.default_initializer,
            output_nonlinearity=self.default_output_nonlinearity,
            output_w_init=self.default_initializer,
            state_include_action=True,
            name='P3')
        self.policy4 = CategoricalGRUPolicyWithModel(
            env_spec=env.spec,
            hidden_dim=4,
            hidden_nonlinearity=self.default_hidden_nonlinearity,
            hidden_w_init=self.default_initializer,
            recurrent_nonlinearity=self.default_recurrent_nonlinearity,
            recurrent_w_init=self.default_initializer,
            output_nonlinearity=self.default_output_nonlinearity,
            output_w_init=tf.constant_initializer(2),
            state_include_action=True,
            name='P4')

        self.policy1.reset()
        self.policy2.reset()
        self.policy3.reset()
        self.policy4.reset()

        self.obs = [env.reset()]
        self.obs = np.concatenate([self.obs for _ in range(self.time_step)],
                                  axis=0)

        self.obs_ph = tf.placeholder(
            tf.float32, shape=(None, None, env.observation_space.flat_dim))
        self.action_ph = tf.placeholder(
            tf.float32, shape=(None, None, env.action_space.flat_dim))

        self.dist1_sym = self.policy1.dist_info_sym(
            obs_var=self.obs_ph,
            state_info_vars={'prev_action': np.zeros((2, self.time_step, 1))},
            name='p1_sym')
        self.dist2_sym = self.policy2.dist_info_sym(
            obs_var=self.obs_ph,
            state_info_vars={'prev_action': np.zeros((2, self.time_step, 1))},
            name='p2_sym')
        self.dist3_sym = self.policy3.dist_info_sym(
            obs_var=self.obs_ph,
            state_info_vars={'prev_action': np.zeros((2, self.time_step, 1))},
            name='p3_sym')
        self.dist4_sym = self.policy4.dist_info_sym(
            obs_var=self.obs_ph,
            state_info_vars={'prev_action': np.zeros((2, self.time_step, 1))},
            name='p4_sym')

    def test_dist_info_sym_output(self):
        # batch size = 2
        dist1 = self.sess.run(self.dist1_sym,
                              feed_dict={self.obs_ph: [self.obs, self.obs]})
        dist2 = self.sess.run(self.dist2_sym,
                              feed_dict={self.obs_ph: [self.obs, self.obs]})
        dist3 = self.sess.run(self.dist3_sym,
                              feed_dict={self.obs_ph: [self.obs, self.obs]})
        dist4 = self.sess.run(self.dist4_sym,
                              feed_dict={self.obs_ph: [self.obs, self.obs]})

        assert np.array_equal(dist1['prob'], dist3['prob'])
        assert np.array_equal(dist2['prob'], dist4['prob'])

    @mock.patch('numpy.random.rand')
    def test_get_action(self, mock_rand):
        mock_rand.return_value = 0

        action1, agent_info1 = self.policy1.get_action(self.obs)
        action2, agent_info2 = self.policy2.get_action(self.obs)
        action3, agent_info3 = self.policy3.get_action(self.obs)
        action4, agent_info4 = self.policy4.get_action(self.obs)

        assert action1 == action3
        assert action2 == action4
        assert np.array_equal(agent_info1['prob'], agent_info3['prob'])
        assert np.array_equal(agent_info2['prob'], agent_info4['prob'])

        actions1, agent_infos1 = self.policy1.get_actions([self.obs])
        actions2, agent_infos2 = self.policy2.get_actions([self.obs])
        actions3, agent_infos3 = self.policy3.get_actions([self.obs])
        actions4, agent_infos4 = self.policy4.get_actions([self.obs])

        assert np.array_equal(actions1, actions3)
        assert np.array_equal(actions2, actions4)
        assert np.array_equal(agent_infos1['prob'], agent_infos3['prob'])
        assert np.array_equal(agent_infos2['prob'], agent_infos4['prob'])

    def test_kl_sym(self):
        kl_diff_sym1 = self.policy1.distribution.kl_sym(
            self.dist1_sym, self.dist2_sym)
        objective1 = tf.reduce_mean(kl_diff_sym1)

        kl_func = tensor_utils.compile_function([self.obs_ph], objective1)
        kl1 = kl_func([self.obs, self.obs])

        kl_diff_sym2 = self.policy3.distribution.kl_sym(
            self.dist3_sym, self.dist4_sym)
        objective2 = tf.reduce_mean(kl_diff_sym2)

        kl_func = tensor_utils.compile_function([self.obs_ph], objective2)
        kl2 = kl_func([self.obs, self.obs])

        assert np.array_equal(kl1, kl2)

    def test_log_likelihood_sym(self):
        log_prob_sym1 = self.policy1.distribution.log_likelihood_sym(
            self.action_ph, self.dist1_sym)
        log_prob_func = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym1)
        log_prob1 = log_prob_func([self.obs, self.obs],
                                  np.ones((2, self.time_step, 1)))

        log_prob_sym2 = self.policy3.distribution.log_likelihood_sym(
            self.action_ph, self.dist3_sym)
        log_prob_func2 = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym2)
        log_prob2 = log_prob_func2([self.obs, self.obs],
                                   np.ones((2, self.time_step, 1)))
        assert np.array_equal(log_prob1, log_prob2)

        log_prob_sym1 = self.policy2.distribution.log_likelihood_sym(
            self.action_ph, self.dist2_sym)
        log_prob_func = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym1)
        log_prob1 = log_prob_func([self.obs, self.obs],
                                  np.ones((2, self.time_step, 1)))

        log_prob_sym2 = self.policy4.distribution.log_likelihood_sym(
            self.action_ph, self.dist4_sym)
        log_prob_func2 = tensor_utils.compile_function(
            [self.obs_ph, self.action_ph], log_prob_sym2)
        log_prob2 = log_prob_func2([self.obs, self.obs],
                                   np.ones((2, self.time_step, 1)))
        assert np.array_equal(log_prob1, log_prob2)

    def test_policy_entropy_sym(self):
        entropy_sym1 = self.policy1.distribution.entropy_sym(
            self.dist1_sym, name='entropy_sym1')
        entropy_func = tensor_utils.compile_function([self.obs_ph],
                                                     entropy_sym1)
        entropy1 = entropy_func([self.obs, self.obs])

        entropy_sym2 = self.policy3.distribution.entropy_sym(
            self.dist3_sym, name='entropy_sym1')
        entropy_func = tensor_utils.compile_function([self.obs_ph],
                                                     entropy_sym2)
        entropy2 = entropy_func([self.obs, self.obs])
        assert np.array_equal(entropy1, entropy2)

    def test_likelihood_ratio_sym(self):
        likelihood_ratio_sym1 = self.policy1.distribution.likelihood_ratio_sym(
            self.action_ph,
            self.dist1_sym,
            self.dist2_sym,
            name='li_ratio_sym1')
        likelihood_ratio_func = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], likelihood_ratio_sym1)
        likelihood_ratio1 = likelihood_ratio_func(np.ones((2, 1, 1)),
                                                  [self.obs, self.obs])

        likelihood_ratio_sym2 = self.policy3.distribution.likelihood_ratio_sym(
            self.action_ph,
            self.dist3_sym,
            self.dist4_sym,
            name='li_ratio_sym2')
        likelihood_ratio_func = tensor_utils.compile_function(
            [self.action_ph, self.obs_ph], likelihood_ratio_sym2)
        likelihood_ratio2 = likelihood_ratio_func(np.ones((2, 1, 1)),
                                                  [self.obs, self.obs])

        assert np.array_equal(likelihood_ratio1, likelihood_ratio2)