Example 1
def make_PPO2(env_name, num_vec):
    """Builds a blank policy on a vectorized copy of the `env_name` environment."""
    env = util.make_vec_env(env_name, num_vec)
    # TODO(adam): add support for wrapping env with VecNormalize
    # (This is non-trivial since we'd need to make sure it's also applied
    # when the policy is re-loaded to generate rollouts.)
    policy = util.make_blank_policy(env, verbose=1, init_tensorboard=True)
    return policy
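A hedged usage sketch: the environment name and training length below are illustrative, and the `.learn()` call assumes the same stable-baselines-style policy interface that Example 2 uses.

# Illustrative usage only: build a blank policy on a vectorized CartPole
# and train it for a small (assumed) number of timesteps.
policy = make_PPO2("CartPole-v1", num_vec=8)
policy.learn(1000)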
Example 2
def test_wrap_learned_reward_no_crash(use_gail, env="CartPole-v1"):
    """
  Briefly train with AIRL, and then used the learned reward to wrap
  a duplicate environment. Finally, use that learned reward to train
  a policy.
  """
    trainer = init_trainer(env, use_gail=use_gail)
    trainer.train(n_epochs=1)

    learned_reward_env = trainer.wrap_env_test_reward(env)
    policy = util.make_blank_policy(env, init_tensorboard=False)
    policy.set_env(learned_reward_env)
    policy.learn(10)
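For intuition only: `trainer.wrap_env_test_reward` is treated as a black box above. Below is a minimal, hypothetical sketch of the underlying idea (replacing the environment's reward with a learned one) using a plain Gym wrapper. This is not the library's implementation, and `learned_reward_fn` is an assumed callable.

import gym

class LearnedRewardWrapper(gym.Wrapper):
    """Hypothetical wrapper that swaps the true reward for a learned one."""

    def __init__(self, env, learned_reward_fn):
        super().__init__(env)
        self.learned_reward_fn = learned_reward_fn
        self._last_obs = None

    def reset(self, **kwargs):
        self._last_obs = self.env.reset(**kwargs)
        return self._last_obs

    def step(self, action):
        obs, _, done, info = self.env.step(action)
        # Replace the environment reward with the learned reward model's output.
        reward = self.learned_reward_fn(self._last_obs, action, obs)
        self._last_obs = obs
        return obs, reward, done, info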
Example 3
def init_trainer(env_id, policy_dir, use_gail, use_random_expert=True,
                 num_vec=8, discrim_scale=False,
                 discrim_kwargs={}, reward_kwargs={}, trainer_kwargs={}):
  """Builds a Trainer, ready to be trained on a vectorized environment
  and either expert rollout data or random rollout data.

  Args:
    env_id (str): The string id of a gym environment.
    use_gail (bool): If True, then train using GAIL. If False, then train
        using AIRL.
    policy_dir (str): The directory containing the pickled experts used to
        generate rollouts. Only applicable if `use_random_expert` is False.
    use_random_expert (bool):
        If True, then use a blank (random) policy to generate rollouts.
        If False, then load an expert policy. Will crash if there is no expert
        policy in `policy_dir`.
    num_vec (int): The number of environments in the vectorized environment.
    discrim_scale (bool): Passed as `scale` to the discriminator and reward
        network constructors.
    trainer_kwargs (dict): Arguments for the Trainer constructor.
    reward_kwargs (dict): Arguments for the `*RewardNet` constructor.
    discrim_kwargs (dict): Arguments for the `DiscrimNet*` constructor.
  """
  env = util.make_vec_env(env_id, num_vec)
  gen_policy = util.make_blank_policy(env, verbose=1)

  if use_random_expert:
    expert_policies = [gen_policy]
  else:
    expert_policies = util.load_policy(env, basedir=policy_dir)
    if expert_policies is None:
      raise ValueError("Couldn't load expert policies from '{}'".format(
          policy_dir))

  if use_gail:
    discrim = discrim_net.DiscrimNetGAIL(env.observation_space,
                                         env.action_space,
                                         scale=discrim_scale,
                                         **discrim_kwargs)
  else:
    rn = BasicShapedRewardNet(env.observation_space, env.action_space,
                              scale=discrim_scale, **reward_kwargs)
    discrim = discrim_net.DiscrimNetAIRL(rn, **discrim_kwargs)

  trainer = Trainer(env, gen_policy, discrim,
                    expert_policies=expert_policies, **trainer_kwargs)
  return trainer
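A hedged usage sketch of `init_trainer`: the directory name is borrowed from Example 5, and the hyperparameters are illustrative rather than the library's defaults.

# Illustrative usage only: build an AIRL trainer from saved expert policies
# and run a short training loop.
trainer = init_trainer("CartPole-v1",
                       policy_dir="expert_models",  # assumed location of expert pickles
                       use_gail=False,              # False => train with AIRL
                       use_random_expert=False,
                       num_vec=8)
trainer.train(n_epochs=10)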
Example 4
    def add_plot_gen(env, name):
        """Appends the mean episode returns of the generator, random, and
        expert policies to the per-environment plotting buffers."""
        if n_gen_plot_episodes <= 0:
            return

        gen_policy = trainer.gen_policy
        rand_policy = util.make_blank_policy(env)
        exp_policy = trainer.expert_policies[-1]

        gen_ret = util.rollout.mean_return(gen_policy,
                                           env,
                                           n_episodes=n_gen_plot_episodes)
        rand_ret = util.rollout.mean_return(rand_policy,
                                            env,
                                            n_episodes=n_gen_plot_episodes)
        exp_ret = util.rollout.mean_return(exp_policy,
                                           env,
                                           n_episodes=n_gen_plot_episodes)
        gen_ep_reward[name].append(gen_ret)
        rand_ep_reward[name].append(rand_ret)
        exp_ep_reward[name].append(exp_ret)
        tf.logging.info("generator return: {}".format(gen_ret))
        tf.logging.info("random return: {}".format(rand_ret))
        tf.logging.info("exp return: {}".format(exp_ret))
Example 5
def test_trained_policy_better_than_random(use_gail,
                                           env='CartPole-v1',
                                           n_episodes=50):
    """
  Make sure that generator policy trained to mimick expert policy
  demonstrations) achieves higher reward than a random policy.

  In other words, perform a basic check on the imitation learning
  capabilities of AIRL and GAIL.
  """
    env = util.make_vec_env(env, 32)
    trainer = init_trainer(env, use_expert_rollouts=True, use_gail=use_gail)
    expert_policy = util.load_policy(env, basedir="expert_models")
    random_policy = util.make_blank_policy(env)
    if expert_policy is None:
        pytest.fail("Couldn't load expert_policy!")

    trainer.train(n_epochs=200)

    # Idea: Plot n_epochs vs generator reward.
    for _ in range(4):
        expert_ret = rollout.mean_return(expert_policy,
                                         env,
                                         n_episodes=n_episodes)
        gen_ret = rollout.mean_return(trainer.gen_policy,
                                      env,
                                      n_episodes=n_episodes)
        random_ret = rollout.mean_return(random_policy,
                                         env,
                                         n_episodes=n_episodes)

        print("expert return:", expert_ret)
        print("generator return:", gen_ret)
        print("random return:", random_ret)
        assert expert_ret > random_ret
        assert gen_ret > random_ret
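Both test functions above take `use_gail` as an argument. A hedged sketch of how such tests are typically parametrized with pytest follows; the test name and trivial body are assumptions, only the parametrization pattern is the point.

import pytest

# Hypothetical parametrization: run each imitation test once with GAIL
# (use_gail=True) and once with AIRL (use_gail=False).
@pytest.mark.parametrize("use_gail", [True, False])
def test_runs_for_both_algorithms(use_gail):
    assert isinstance(use_gail, bool)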