Example #1
    def test_critique(self):

        network = CriticNetwork(hidden_layer_dims=[2])
        rx = np.zeros((10,2))
        ra = np.zeros((10,1))
        dataset = CriticDataset(dict(observations=rx, actions=ra), batch_size=10)

        with tf.Session() as session:

            critic = WassersteinCritic(
                        obs_dim=1,
                        act_dim=1,
                        dataset=dataset, 
                        network=network
                    )
            session.run(tf.global_variables_initializer())
            paths = [
                dict(observations=[[1],[2]], actions=[[1],[2]], rewards=[[1],[2]]),
                dict(observations=[[1],[2],[3]], actions=[[1],[2],[3]], rewards=[[1],[2],[3]]),
                dict(observations=[[1]], actions=[[1]], rewards=[[1]]),
            ]
            rewards = critic.critique(1, paths)
            self.assertTrue(len(rewards[0]) == 2)
            self.assertTrue(len(rewards[1]) == 3)
            self.assertTrue(len(rewards[2]) == 1)
Example #2
def build_critic(args, data, env, writer=None):
    if args.use_critic_replay_memory:
        critic_replay_memory = hgail.misc.utils.KeyValueReplayMemory(maxsize=3 * args.batch_size)
    else:
        critic_replay_memory = None

    critic_dataset = CriticDataset(
        data, 
        replay_memory=critic_replay_memory,
        batch_size=args.critic_batch_size,
        flat_recurrent=args.policy_recurrent
    )

    critic_network = ObservationActionMLP(
        name='critic', 
        hidden_layer_dims=args.critic_hidden_layer_dims,
        dropout_keep_prob=args.critic_dropout_keep_prob
    )
    critic = WassersteinCritic(
        obs_dim=env.observation_space.flat_dim,
        act_dim=env.action_space.flat_dim,
        dataset=critic_dataset, 
        network=critic_network,
        gradient_penalty=args.gradient_penalty,
        optimizer=tf.train.RMSPropOptimizer(args.critic_learning_rate),
        n_train_epochs=args.n_critic_train_epochs,
        summary_writer=writer,
        grad_norm_rescale=args.critic_grad_rescale,
        verbose=2,
        debug_nan=True
    )
    return critic
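build_critic reads every hyperparameter off the args object, so a caller only needs an object exposing those attributes. A minimal, hypothetical invocation sketch follows; the Namespace values are placeholders, and data, env, and writer are assumed to come from the surrounding training script (dataset loading and env construction as in the later examples).

from argparse import Namespace

# Placeholder hyperparameters covering exactly the attributes build_critic reads.
args = Namespace(
    use_critic_replay_memory=True,
    batch_size=1000,                  # sizes the replay memory (3 * batch_size)
    critic_batch_size=1000,
    policy_recurrent=False,
    critic_hidden_layer_dims=[64, 64],
    critic_dropout_keep_prob=0.8,
    gradient_penalty=2.0,
    critic_learning_rate=0.0004,
    n_critic_train_epochs=40,
    critic_grad_rescale=40.0,
)

# data, env, and writer are assumed to exist already, e.g.:
#   data = hgail.misc.utils.load_dataset(expert_data_filepath)
#   env = TfEnv(GymEnv(...))
#   writer = tf.summary.FileWriter(summary_dir)
critic = build_critic(args, data, env, writer=writer)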
Example #3
    def test_hgail_two_round_stochastic_env(self):

        env = TfEnv(TwoRoundNondeterministicRewardEnv())

        # dataset of one-hot obs and acts
        # optimal actions: 0, 1
        # first state
        n_expert_samples = 1000
        batch_size = 1000
        half = int(n_expert_samples / 2)
        rx = np.zeros((n_expert_samples, 3))
        rx[:half, 2] = 1
        rx[half:, 0] = 1
        ra = np.zeros((n_expert_samples, 2))
        ra[:half, 0] = 1
        ra[half:, 1] = 1

        critic_dataset = CriticDataset(dict(observations=rx, actions=ra),
                                       batch_size=batch_size)

        with tf.Session() as session:
            # build it
            algo = build_hgail(env, critic_dataset, batch_size)
            session.run(tf.global_variables_initializer())

            # run it!
            algo.train(sess=session)
            policy = algo.hierarchy[0].algo.policy

            # evaluate
            l0_state_infos = dict(latent=[[1, 0]])
            l0_dist_2 = policy.dist_info([[0., 0., 1.]],
                                         l0_state_infos)['prob']
            l0_dist_0 = policy.dist_info([[1., 0., 0.]],
                                         l0_state_infos)['prob']

            l1_state_infos = dict(latent=[[0, 1]])
            l1_dist_2 = policy.dist_info([[0., 0., 1.]],
                                         l1_state_infos)['prob']
            l1_dist_0 = policy.dist_info([[1., 0., 0.]],
                                         l1_state_infos)['prob']

            np.testing.assert_array_almost_equal(l0_dist_2, [[1, 0]], 1)
            np.testing.assert_array_almost_equal(l0_dist_0, [[0, 1]], 1)
            np.testing.assert_array_almost_equal(l1_dist_2, [[1, 0]], 1)
            np.testing.assert_array_almost_equal(l1_dist_0, [[0, 1]], 1)
Example #4
    def test_gail_one_round_deterministic_env(self):

        with tf.Session() as session:

            n_expert_samples = 1000
            rx = np.ones((n_expert_samples, 1))
            ra = np.zeros((n_expert_samples, 2))
            ra[:, 1] = 1  # one hot actions
            dataset = CriticDataset(dict(observations=rx, actions=ra),
                                    batch_size=1000)

            env = TfEnv(
                GymEnv("OneRoundDeterministicReward-v0", force_reset=True))

            policy, critic = train_gail(session,
                                        env,
                                        dataset,
                                        use_env_rewards=False,
                                        n_itr=20)
            dist = policy.dist_info([[1.]])['prob']
            np.testing.assert_array_almost_equal(dist, [[0, 1]], 2)
Example #5
    def test_gail_two_round_stochastic_env(self):

        with tf.Session() as session:

            # dataset of one-hot obs and acts
            # optimal actions: 0, 1
            # first state
            n_expert_samples = 1000
            half = int(n_expert_samples / 2)
            rx = np.zeros((n_expert_samples, 3))
            rx[:half, 2] = 1
            rx[half:, 0] = 1
            ra = np.zeros((n_expert_samples, 2))
            ra[:half, 0] = 1
            ra[half:, 1] = 1
            dataset = CriticDataset(dict(observations=rx, actions=ra),
                                    batch_size=1000)

            env = TfEnv(TwoRoundNondeterministicRewardEnv())

            policy, critic = train_gail(session,
                                        env,
                                        dataset,
                                        obs_dim=3,
                                        act_dim=2,
                                        use_env_rewards=False,
                                        critic_scale=1.,
                                        n_itr=15,
                                        policy_hid_layer_dims=[32, 32],
                                        batch_size=4000,
                                        critic_learning_rate=.001,
                                        gradient_penalty=1.,
                                        critic_n_train_epochs=10,
                                        sampler_args=dict(n_envs=10))
            dist_2 = policy.dist_info([[0., 0., 1.]])['prob']
            dist_0 = policy.dist_info([[1., 0., 0.]])['prob']
            np.testing.assert_array_almost_equal(dist_2, [[1, 0]], 1)
            np.testing.assert_array_almost_equal(dist_0, [[0, 1]], 1)
Example #6
    def test_train(self):

        with tf.Session() as session:

            network = CriticNetwork(hidden_layer_dims=[24])
            batch_size = 10
            obs_dim = 2
            act_dim = 1
            real_data = dict(
                observations=np.ones((batch_size, obs_dim)) * .5, 
                actions=np.ones((batch_size, act_dim)) * .5)
            fake_data = dict(
                observations=np.ones((batch_size, obs_dim)) * -.5, 
                actions=np.ones((batch_size, act_dim)) * -.5)
            dataset = CriticDataset(real_data, batch_size=batch_size)

            critic = WassersteinCritic(
                        obs_dim=obs_dim,
                        act_dim=act_dim,
                        dataset=dataset, 
                        network=network,
                        gradient_penalty=.01
                    )

            session.run(tf.global_variables_initializer())

            n_epochs = 500
            for epoch in range(n_epochs):
                critic.train(epoch, fake_data)

            real_rewards = critic.network.forward(
                real_data['observations'], real_data['actions'])
            fake_rewards = critic.network.forward(
                fake_data['observations'], fake_data['actions'])

            self.assertTrue(real_rewards[0] > 1)
            self.assertTrue(fake_rewards[0] < -1)
Example #7
# load env from the training process
snapshot_filepath = utils.latest_snapshot(exp_dir, phase='train')
env = utils.load_env(snapshot_filepath)

# load critic dataset
expert_data_filepath = os.path.join(exp_dir, 'collection', 'expert_traj.h5')
data = hgail.misc.utils.load_dataset(expert_data_filepath,
                                     maxsize=real_data_maxsize)
data['actions'] = hgail.misc.utils.to_onehot(data['actions'])
if use_critic_replay_memory:
    critic_replay_memory = hgail.misc.utils.KeyValueReplayMemory(maxsize=3 *
                                                                 batch_size)
else:
    critic_replay_memory = None
critic_dataset = CriticDataset(data,
                               replay_memory=critic_replay_memory,
                               batch_size=1000)

# session for actual training
with tf.Session() as session:

    # summary writer
    summary_writer = tf.summary.FileWriter(
        os.path.join(exp_dir, 'imitate', 'summaries'))

    # build the critic
    critic_network = ObservationActionMLP(
        name='critic',
        hidden_layer_dims=[64, 64],
        dropout_keep_prob=critic_dropout_keep_prob)
    critic = WassersteinCritic(
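        # NOTE: the snippet above is truncated mid-constructor. The arguments
        # below are a hypothetical completion patterned on the build_critic
        # example (Example #2); the values are placeholders, not the original
        # script's settings.
        obs_dim=env.observation_space.flat_dim,
        act_dim=env.action_space.flat_dim,
        dataset=critic_dataset,
        network=critic_network,
        gradient_penalty=2.,
        optimizer=tf.train.RMSPropOptimizer(.0004),
        n_train_epochs=40,
        summary_writer=summary_writer,
        verbose=2)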
Example #8
env = utils.load_env(snapshot_filepath)
policy_filepath = '../data/experiments/{}/imitate/log/itr_500.pkl'.format(exp_name)
policy_param_values = utils.load_policy_param_values(policy_filepath)
# policy_param_values = None

# load critic dataset
expert_data_filepath = os.path.join(exp_dir, 'collection', 'expert_traj.h5')
data = hgail.misc.utils.load_dataset(expert_data_filepath, maxsize=real_data_maxsize)
data['actions'] = hgail.misc.utils.to_onehot(data['actions'])

if use_critic_replay_memory:
    critic_replay_memory = hgail.misc.utils.KeyValueReplayMemory(maxsize=4 * batch_size)
else:
    critic_replay_memory = None

critic_dataset = CriticDataset(data, batch_size=4000, replay_memory=critic_replay_memory)

# session for actual training
with tf.Session() as session:
 
    # summary writer 
    summary_writer = tf.summary.FileWriter(
        os.path.join(exp_dir, phase, 'summaries'))

    # build the critic
    with tf.variable_scope('critic'):
        critic_network = ObservationActionMLP(
            name='critic', 
            hidden_layer_dims=[64,64]
        )
        critic = WassersteinCritic(
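            # NOTE: truncated in the original; hypothetical completion with
            # placeholder values, as in the completion shown for Example #7.
            obs_dim=env.observation_space.flat_dim,
            act_dim=env.action_space.flat_dim,
            dataset=critic_dataset,
            network=critic_network,
            gradient_penalty=2.,
            optimizer=tf.train.RMSPropOptimizer(.0004),
            n_train_epochs=40,
            summary_writer=summary_writer,
            verbose=2)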
Example #9
    def test_infogail_two_round_stochastic_env(self):

        env = TfEnv(TwoRoundNondeterministicRewardEnv())

        # dataset of one-hot obs and acts
        # optimal actions: 0, 1
        # first state
        n_expert_samples = 1000
        batch_size = 1000
        half = int(n_expert_samples / 2)
        rx = np.zeros((n_expert_samples, 3))
        rx[:half, 2] = 1
        rx[half:, 0] = 1
        ra = np.zeros((n_expert_samples, 2))
        ra[:half, 0] = 1
        ra[half:, 1] = 1

        with tf.Session() as session:
            # critic
            critic_dataset = CriticDataset(dict(observations=rx, actions=ra),
                                           batch_size=batch_size)
            critic_network = ObservationActionMLP(name='critic',
                                                  hidden_layer_dims=[32, 32])
            critic = WassersteinCritic(obs_dim=3,
                                       act_dim=2,
                                       dataset=critic_dataset,
                                       network=critic_network,
                                       gradient_penalty=.01,
                                       optimizer=tf.train.AdamOptimizer(
                                           .001, beta1=.5, beta2=.9),
                                       n_train_epochs=50)

            # recognition model
            recog_dataset = RecognitionDataset(batch_size=batch_size)
            recog_network = ObservationActionMLP(name='recog',
                                                 hidden_layer_dims=[32, 32],
                                                 output_dim=2)
            recog = RecognitionModel(obs_dim=3,
                                     act_dim=2,
                                     dataset=recog_dataset,
                                     network=recog_network,
                                     variable_type='categorical',
                                     latent_dim=2)

            # policy
            env.spec.num_envs = 10
            latent_sampler = UniformlyRandomLatentSampler(
                scheduler=ConstantIntervalScheduler(),
                name='latent_sampler',
                dim=2)
            policy = CategoricalLatentVarMLPPolicy(
                policy_name="policy",
                latent_sampler=latent_sampler,
                env_spec=env.spec)

            # gail
            reward_handler = RewardHandler(use_env_rewards=False,
                                           critic_final_scale=1.)
            baseline = LinearFeatureBaseline(env_spec=env.spec)
            algo = GAIL(critic=critic,
                        recognition=recog,
                        reward_handler=reward_handler,
                        env=env,
                        policy=policy,
                        baseline=baseline,
                        batch_size=4000,
                        max_path_length=200,
                        n_itr=15,
                        discount=.99,
                        step_size=.01,
                        sampler_args=dict(n_envs=env.spec.num_envs))

            session.run(tf.global_variables_initializer())

            # run it!
            algo.train(sess=session)

            # evaluate
            l0_state_infos = dict(latent=[[1, 0]])
            l0_dist_2 = policy.dist_info([[0., 0., 1.]],
                                         l0_state_infos)['prob']
            l0_dist_0 = policy.dist_info([[1., 0., 0.]],
                                         l0_state_infos)['prob']

            l1_state_infos = dict(latent=[[0, 1]])
            l1_dist_2 = policy.dist_info([[0., 0., 1.]],
                                         l1_state_infos)['prob']
            l1_dist_0 = policy.dist_info([[1., 0., 0.]],
                                         l1_state_infos)['prob']

            np.testing.assert_array_almost_equal(l0_dist_2, [[1, 0]], 1)
            np.testing.assert_array_almost_equal(l0_dist_0, [[0, 1]], 1)
            np.testing.assert_array_almost_equal(l1_dist_2, [[1, 0]], 1)
            np.testing.assert_array_almost_equal(l1_dist_0, [[0, 1]], 1)