def test_critique(self):
    network = CriticNetwork(hidden_layer_dims=[2])
    rx = np.zeros((10, 2))
    ra = np.zeros((10, 1))
    dataset = CriticDataset(dict(observations=rx, actions=ra), batch_size=10)
    with tf.Session() as session:
        critic = WassersteinCritic(
            obs_dim=1,
            act_dim=1,
            dataset=dataset,
            network=network
        )
        session.run(tf.global_variables_initializer())

        paths = [
            dict(observations=[[1], [2]], actions=[[1], [2]], rewards=[[1], [2]]),
            dict(observations=[[1], [2], [3]], actions=[[1], [2], [3]], rewards=[[1], [2], [3]]),
            dict(observations=[[1]], actions=[[1]], rewards=[[1]]),
        ]
        rewards = critic.critique(1, paths)

        self.assertTrue(len(rewards[0]) == 2)
        self.assertTrue(len(rewards[1]) == 3)
        self.assertTrue(len(rewards[2]) == 1)
def build_critic(args, data, env, writer=None):
    if args.use_critic_replay_memory:
        critic_replay_memory = hgail.misc.utils.KeyValueReplayMemory(maxsize=3 * args.batch_size)
    else:
        critic_replay_memory = None

    critic_dataset = CriticDataset(
        data,
        replay_memory=critic_replay_memory,
        batch_size=args.critic_batch_size,
        flat_recurrent=args.policy_recurrent
    )

    critic_network = ObservationActionMLP(
        name='critic',
        hidden_layer_dims=args.critic_hidden_layer_dims,
        dropout_keep_prob=args.critic_dropout_keep_prob
    )
    critic = WassersteinCritic(
        obs_dim=env.observation_space.flat_dim,
        act_dim=env.action_space.flat_dim,
        dataset=critic_dataset,
        network=critic_network,
        gradient_penalty=args.gradient_penalty,
        optimizer=tf.train.RMSPropOptimizer(args.critic_learning_rate),
        n_train_epochs=args.n_critic_train_epochs,
        summary_writer=writer,
        grad_norm_rescale=args.critic_grad_rescale,
        verbose=2,
        debug_nan=True
    )
    return critic
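# --- usage sketch (illustrative, not part of the original script) ---
# A minimal, hedged example of how build_critic might be called. The field
# names on `args` are exactly those read inside the function above; the
# specific values, and the `data` / `env` objects (an expert-trajectory dict
# and an rllab-style TfEnv created elsewhere), are assumptions for
# illustration only.
#
#   from argparse import Namespace
#   args = Namespace(
#       use_critic_replay_memory=True,
#       batch_size=4000,
#       critic_batch_size=1000,
#       policy_recurrent=False,
#       critic_hidden_layer_dims=[64, 64],
#       critic_dropout_keep_prob=.8,
#       gradient_penalty=1.,
#       critic_learning_rate=.0004,
#       n_critic_train_epochs=40,
#       critic_grad_rescale=50.,
#   )
#   writer = tf.summary.FileWriter('summaries')
#   critic = build_critic(args, data, env, writer=writer)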
def test_hgail_two_round_stochastic_env(self):
    env = TfEnv(TwoRoundNondeterministicRewardEnv())

    # dataset of one-hot obs and acts
    # optimal actions: 0 in the first state (obs index 2), 1 in state 0
    n_expert_samples = 1000
    batch_size = 1000
    half = int(n_expert_samples / 2)
    rx = np.zeros((n_expert_samples, 3))
    rx[:half, 2] = 1
    rx[half:, 0] = 1
    ra = np.zeros((n_expert_samples, 2))
    ra[:half, 0] = 1
    ra[half:, 1] = 1
    critic_dataset = CriticDataset(dict(observations=rx, actions=ra), batch_size=batch_size)

    with tf.Session() as session:
        # build it
        algo = build_hgail(env, critic_dataset, batch_size)
        session.run(tf.global_variables_initializer())

        # run it!
        algo.train(sess=session)

        policy = algo.hierarchy[0].algo.policy

        # evaluate
        l0_state_infos = dict(latent=[[1, 0]])
        l0_dist_2 = policy.dist_info([[0., 0., 1.]], l0_state_infos)['prob']
        l0_dist_0 = policy.dist_info([[1., 0., 0.]], l0_state_infos)['prob']

        l1_state_infos = dict(latent=[[0, 1]])
        l1_dist_2 = policy.dist_info([[0., 0., 1.]], l1_state_infos)['prob']
        l1_dist_0 = policy.dist_info([[1., 0., 0.]], l1_state_infos)['prob']

        np.testing.assert_array_almost_equal(l0_dist_2, [[1, 0]], 1)
        np.testing.assert_array_almost_equal(l0_dist_0, [[0, 1]], 1)
        np.testing.assert_array_almost_equal(l1_dist_2, [[1, 0]], 1)
        np.testing.assert_array_almost_equal(l1_dist_0, [[0, 1]], 1)
def test_gail_one_round_deterministic_env(self):
    with tf.Session() as session:
        n_expert_samples = 1000
        rx = np.ones((n_expert_samples, 1))
        ra = np.zeros((n_expert_samples, 2))
        ra[:, 1] = 1  # one hot actions
        dataset = CriticDataset(dict(observations=rx, actions=ra), batch_size=1000)

        env = TfEnv(GymEnv("OneRoundDeterministicReward-v0", force_reset=True))

        policy, critic = train_gail(
            session, env, dataset, use_env_rewards=False, n_itr=20)

        dist = policy.dist_info([[1.]])['prob']
        np.testing.assert_array_almost_equal(dist, [[0, 1]], 2)
def test_gail_two_round_stochastic_env(self):
    with tf.Session() as session:
        # dataset of one-hot obs and acts
        # optimal actions: 0 in the first state (obs index 2), 1 in state 0
        n_expert_samples = 1000
        half = int(n_expert_samples / 2)
        rx = np.zeros((n_expert_samples, 3))
        rx[:half, 2] = 1
        rx[half:, 0] = 1
        ra = np.zeros((n_expert_samples, 2))
        ra[:half, 0] = 1
        ra[half:, 1] = 1
        dataset = CriticDataset(dict(observations=rx, actions=ra), batch_size=1000)

        env = TfEnv(TwoRoundNondeterministicRewardEnv())

        policy, critic = train_gail(
            session,
            env,
            dataset,
            obs_dim=3,
            act_dim=2,
            use_env_rewards=False,
            critic_scale=1.,
            n_itr=15,
            policy_hid_layer_dims=[32, 32],
            batch_size=4000,
            critic_learning_rate=.001,
            gradient_penalty=1.,
            critic_n_train_epochs=10,
            sampler_args=dict(n_envs=10)
        )

        dist_2 = policy.dist_info([[0., 0., 1.]])['prob']
        dist_0 = policy.dist_info([[1., 0., 0.]])['prob']

        np.testing.assert_array_almost_equal(dist_2, [[1, 0]], 1)
        np.testing.assert_array_almost_equal(dist_0, [[0, 1]], 1)
def test_train(self):
    with tf.Session() as session:
        network = CriticNetwork(hidden_layer_dims=[24])
        batch_size = 10
        obs_dim = 2
        act_dim = 1
        # "real" expert data and "fake" policy data, separable by sign
        real_data = dict(
            observations=np.ones((batch_size, obs_dim)) * .5,
            actions=np.ones((batch_size, act_dim)) * .5)
        fake_data = dict(
            observations=np.ones((batch_size, obs_dim)) * -.5,
            actions=np.ones((batch_size, act_dim)) * -.5)
        dataset = CriticDataset(real_data, batch_size=batch_size)
        critic = WassersteinCritic(
            obs_dim=obs_dim,
            act_dim=act_dim,
            dataset=dataset,
            network=network,
            gradient_penalty=.01
        )
        session.run(tf.global_variables_initializer())

        # train the critic to separate the real data from the fake data
        n_epochs = 500
        for epoch in range(n_epochs):
            critic.train(epoch, fake_data)

        real_rewards = critic.network.forward(
            real_data['observations'], real_data['actions'])
        fake_rewards = critic.network.forward(
            fake_data['observations'], fake_data['actions'])

        # real samples should receive high rewards, fake samples low rewards
        self.assertTrue(real_rewards[0] > 1)
        self.assertTrue(fake_rewards[0] < -1)
# load env from the training process
snapshot_filepath = utils.latest_snapshot(exp_dir, phase='train')
env = utils.load_env(snapshot_filepath)

# load critic dataset
expert_data_filepath = os.path.join(exp_dir, 'collection', 'expert_traj.h5')
data = hgail.misc.utils.load_dataset(expert_data_filepath, maxsize=real_data_maxsize)
data['actions'] = hgail.misc.utils.to_onehot(data['actions'])
if use_critic_replay_memory:
    critic_replay_memory = hgail.misc.utils.KeyValueReplayMemory(maxsize=3 * batch_size)
else:
    critic_replay_memory = None
critic_dataset = CriticDataset(
    data,
    replay_memory=critic_replay_memory,
    batch_size=1000
)

# session for actual training
with tf.Session() as session:

    # summary writer
    summary_writer = tf.summary.FileWriter(
        os.path.join(exp_dir, 'imitate', 'summaries'))

    # build the critic
    critic_network = ObservationActionMLP(
        name='critic',
        hidden_layer_dims=[64, 64],
        dropout_keep_prob=critic_dropout_keep_prob)
    critic = WassersteinCritic(
env = utils.load_env(snapshot_filepath)

policy_filepath = '../data/experiments/{}/imitate/log/itr_500.pkl'.format(exp_name)
policy_param_values = utils.load_policy_param_values(policy_filepath)
# policy_param_values = None

# load critic dataset
expert_data_filepath = os.path.join(exp_dir, 'collection', 'expert_traj.h5')
data = hgail.misc.utils.load_dataset(expert_data_filepath, maxsize=real_data_maxsize)
data['actions'] = hgail.misc.utils.to_onehot(data['actions'])
if use_critic_replay_memory:
    critic_replay_memory = hgail.misc.utils.KeyValueReplayMemory(maxsize=4 * batch_size)
else:
    critic_replay_memory = None
critic_dataset = CriticDataset(
    data,
    batch_size=4000,
    replay_memory=critic_replay_memory
)

# session for actual training
with tf.Session() as session:

    # summary writer
    summary_writer = tf.summary.FileWriter(
        os.path.join(exp_dir, phase, 'summaries'))

    # build the critic
    with tf.variable_scope('critic'):
        critic_network = ObservationActionMLP(
            name='critic',
            hidden_layer_dims=[64, 64]
        )
        critic = WassersteinCritic(
def test_infogail_two_round_stochastic_env(self):
    env = TfEnv(TwoRoundNondeterministicRewardEnv())

    # dataset of one-hot obs and acts
    # optimal actions: 0 in the first state (obs index 2), 1 in state 0
    n_expert_samples = 1000
    batch_size = 1000
    half = int(n_expert_samples / 2)
    rx = np.zeros((n_expert_samples, 3))
    rx[:half, 2] = 1
    rx[half:, 0] = 1
    ra = np.zeros((n_expert_samples, 2))
    ra[:half, 0] = 1
    ra[half:, 1] = 1

    with tf.Session() as session:

        # critic
        critic_dataset = CriticDataset(
            dict(observations=rx, actions=ra),
            batch_size=batch_size)
        critic_network = ObservationActionMLP(
            name='critic',
            hidden_layer_dims=[32, 32])
        critic = WassersteinCritic(
            obs_dim=3,
            act_dim=2,
            dataset=critic_dataset,
            network=critic_network,
            gradient_penalty=.01,
            optimizer=tf.train.AdamOptimizer(.001, beta1=.5, beta2=.9),
            n_train_epochs=50)

        # recognition model
        recog_dataset = RecognitionDataset(batch_size=batch_size)
        recog_network = ObservationActionMLP(
            name='recog',
            hidden_layer_dims=[32, 32],
            output_dim=2)
        recog = RecognitionModel(
            obs_dim=3,
            act_dim=2,
            dataset=recog_dataset,
            network=recog_network,
            variable_type='categorical',
            latent_dim=2)

        # policy
        env.spec.num_envs = 10
        latent_sampler = UniformlyRandomLatentSampler(
            scheduler=ConstantIntervalScheduler(),
            name='latent_sampler',
            dim=2)
        policy = CategoricalLatentVarMLPPolicy(
            policy_name="policy",
            latent_sampler=latent_sampler,
            env_spec=env.spec)

        # gail
        reward_handler = RewardHandler(use_env_rewards=False, critic_final_scale=1.)
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = GAIL(
            critic=critic,
            recognition=recog,
            reward_handler=reward_handler,
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=200,
            n_itr=15,
            discount=.99,
            step_size=.01,
            sampler_args=dict(n_envs=env.spec.num_envs))

        session.run(tf.global_variables_initializer())

        # run it!
        algo.train(sess=session)

        # evaluate
        l0_state_infos = dict(latent=[[1, 0]])
        l0_dist_2 = policy.dist_info([[0., 0., 1.]], l0_state_infos)['prob']
        l0_dist_0 = policy.dist_info([[1., 0., 0.]], l0_state_infos)['prob']

        l1_state_infos = dict(latent=[[0, 1]])
        l1_dist_2 = policy.dist_info([[0., 0., 1.]], l1_state_infos)['prob']
        l1_dist_0 = policy.dist_info([[1., 0., 0.]], l1_state_infos)['prob']

        np.testing.assert_array_almost_equal(l0_dist_2, [[1, 0]], 1)
        np.testing.assert_array_almost_equal(l0_dist_0, [[0, 1]], 1)
        np.testing.assert_array_almost_equal(l1_dist_2, [[1, 0]], 1)
        np.testing.assert_array_almost_equal(l1_dist_0, [[0, 1]], 1)