Example #1
def test_serialize_identity(session, env_id, discrim_net_cls):
  """Does output of deserialized discriminator match that of original?"""
  env = gym.make(env_id)
  original = DISCRIM_NET_SETUPS[discrim_net_cls](env)
  random = base.RandomPolicy(env.observation_space, env.action_space)
  session.run(tf.global_variables_initializer())

  with tempfile.TemporaryDirectory(prefix='imitation-serialize') as tmpdir:
    original.save(tmpdir)
    with tf.variable_scope("loaded"):
      loaded = discrim_net_cls.load(tmpdir)

  old_obs, act, new_obs, _rew = rollout.generate_transitions(random, env,
                                                             n_timesteps=100)
  labels = np.random.randint(2, size=len(old_obs)).astype(np.float32)
  log_prob = np.random.randn(len(old_obs))

  feed_dict = {}
  outputs = {'train': [], 'test': []}
  for net in [original, loaded]:
    feed_dict.update({
        net.old_obs_ph: old_obs,
        net.act_ph: act,
        net.new_obs_ph: new_obs,
        net.labels_ph: labels,
        net.log_policy_act_prob_ph: log_prob,
    })
    outputs['train'].append(net.policy_train_reward)
    outputs['test'].append(net.policy_test_reward)

  rewards = session.run(outputs, feed_dict=feed_dict)

  for key, predictions in rewards.items():
    assert len(predictions) == 2
    assert np.allclose(predictions[0], predictions[1])
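Example #1 uses an older imitation API in which rollout.generate_transitions returns a tuple of (old_obs, act, new_obs, rew) arrays. A minimal standalone sketch of that call pattern, assuming the module paths imitation.util.rollout and imitation.policies.base (the excerpt's own imports are not shown):

# Sketch of the tuple-returning API; the module paths below are assumptions
# based on how the tests above reference `rollout` and `base`.
import gym

from imitation.policies import base
from imitation.util import rollout

env = gym.make("CartPole-v1")
policy = base.RandomPolicy(env.observation_space, env.action_space)
old_obs, act, new_obs, rew = rollout.generate_transitions(
    policy, env, n_timesteps=100)
assert len(old_obs) == len(act) == len(new_obs) == len(rew)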
Example #2
def test_serialize_identity(session, env_id, reward_net_cls):
    """Does output of deserialized reward network match that of original?"""
    env = gym.make(env_id)
    with tf.variable_scope("original"):
        original = reward_net_cls(env.observation_space, env.action_space)
    random = base.RandomPolicy(env.observation_space, env.action_space)
    session.run(tf.global_variables_initializer())

    with tempfile.TemporaryDirectory(
            prefix='imitation-serialize-rew') as tmpdir:
        original.save(tmpdir)
        with tf.variable_scope("loaded"):
            loaded = reward_net_cls.load(tmpdir)

    assert original.observation_space == loaded.observation_space
    assert original.action_space == loaded.action_space

    rollouts = rollout.generate_transitions(random, env, n_timesteps=100)
    feed_dict = {}
    outputs = {'train': [], 'test': []}
    for net in [original, loaded]:
        feed_dict.update(_make_feed_dict(net, rollouts))
        outputs['train'].append(net.reward_output_train)
        outputs['test'].append(net.reward_output_test)

    rewards = session.run(outputs, feed_dict=feed_dict)

    for key, predictions in rewards.items():
        assert len(predictions) == 2
        assert np.allclose(predictions[0], predictions[1])
Example #3
def test_train_disc_no_crash(use_gail, parallel,
                             env='CartPole-v1', n_timesteps=200):
  trainer = init_test_trainer(env, use_gail=use_gail, parallel=parallel)
  trainer.train_disc()
  obs_old, act, obs_new, _ = rollout.generate_transitions(
      trainer.gen_policy, env, n_timesteps=n_timesteps)
  trainer.train_disc(gen_old_obs=obs_old, gen_act=act,
                     gen_new_obs=obs_new)
Example #4
def test_train_disc_no_crash(use_gail, parallel, n_timesteps=200):
    trainer = init_test_trainer(use_gail=use_gail, parallel=parallel)
    trainer.train_disc()
    transitions = rollout.generate_transitions(trainer.gen_policy,
                                               trainer.venv,
                                               n_timesteps=n_timesteps)
    trainer.train_disc(gen_obs=transitions.obs,
                       gen_acts=transitions.acts,
                       gen_next_obs=transitions.next_obs)
Example #5
def test_train_disc_improve_D(use_gail, env='CartPole-v1', n_timesteps=200,
                              n_steps=1000):
  trainer = init_test_trainer(env, use_gail)
  obs_old, act, obs_new, _ = rollout.generate_transitions(
      trainer.gen_policy, env, n_timesteps=n_timesteps)
  kwargs = dict(gen_old_obs=obs_old, gen_act=act, gen_new_obs=obs_new)
  loss1 = trainer.eval_disc_loss(**kwargs)
  trainer.train_disc(n_steps=n_steps, **kwargs)
  loss2 = trainer.eval_disc_loss(**kwargs)
  assert loss2 < loss1
Example #6
def test_train_disc_improve_D(tmpdir, use_gail, n_timesteps=200, n_steps=1000):
    trainer = init_test_trainer(tmpdir, use_gail)
    gen_samples = rollout.generate_transitions(trainer.gen_policy,
                                               trainer.venv_train_norm,
                                               n_timesteps=n_timesteps)
    loss1 = trainer.eval_disc_loss(gen_samples=gen_samples)
    for _ in range(n_steps):
        trainer.train_disc_step(gen_samples=gen_samples)
    loss2 = trainer.eval_disc_loss(gen_samples=gen_samples)
    assert loss2 < loss1
Example #7
def test_actions_valid(env_name, policy_type):
    """Test output actions of our custom policies always lie in action space."""
    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    with serialize.load_policy(policy_type, "foobar", venv) as policy:
        transitions = rollout.generate_transitions(policy,
                                                   venv,
                                                   n_timesteps=100)

    for a in transitions.acts:
        assert venv.action_space.contains(a)
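Examples #4, #6, and #7 use a newer API in which generate_transitions returns a Transitions container exposing obs, acts, and next_obs fields rather than a raw tuple. A minimal sketch of the field access, with `policy` and `venv` assumed to be constructed as in the surrounding examples:

# Container-based API: named fields instead of tuple unpacking.
transitions = rollout.generate_transitions(policy, venv, n_timesteps=100)
obs, acts, next_obs = transitions.obs, transitions.acts, transitions.next_obs
assert len(obs) == len(acts) == len(next_obs)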
Example #8
def test_serialize_identity(env_name, model_cfg, normalize):
    """Test output actions of deserialized policy are same as original."""
    orig_venv = venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    vec_normalize = None
    if normalize:
        venv = vec_normalize = VecNormalize(venv)

    model_name, model_cls_name = model_cfg
    try:
        model_cls = registry.load_attr(model_cls_name)
    except (AttributeError, ImportError):  # pragma: no cover
        pytest.skip("Couldn't load stable baselines class. "
                    "(Probably because mpi4py not installed.)")

    model = model_cls('MlpPolicy', venv)
    model.learn(1000)

    venv.env_method('seed', 0)
    venv.reset()
    if normalize:
        # don't want statistics to change as we collect rollouts
        vec_normalize.training = False
    orig_rollout = rollout.generate_transitions(model,
                                                venv,
                                                n_timesteps=1000,
                                                deterministic_policy=True)

    with tempfile.TemporaryDirectory(
            prefix='imitation-serialize-pol') as tmpdir:
        serialize.save_stable_model(tmpdir, model, vec_normalize)
        # We use `orig_venv` since `load_policy` automatically wraps `loaded`
        # with a VecNormalize, when appropriate.
        with serialize.load_policy(model_name, tmpdir, orig_venv) as loaded:
            orig_venv.env_method('seed', 0)
            orig_venv.reset()
            new_rollout = rollout.generate_transitions(
                loaded, orig_venv, n_timesteps=1000, deterministic_policy=True)

    orig_acts = orig_rollout[1]
    new_acts = new_rollout[1]
    assert np.allclose(orig_acts, new_acts)
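Example #8 depends on seeding the environment and passing deterministic_policy=True so that two rollouts of the same (or an identically deserialized) policy yield identical actions. A hedged sketch of that reproducibility check, reusing `model` and `venv` from the example above:

# Two seeded, deterministic rollouts of the same policy should match.
venv.env_method('seed', 0)
venv.reset()
first = rollout.generate_transitions(model, venv, n_timesteps=100,
                                     deterministic_policy=True)
venv.env_method('seed', 0)
venv.reset()
second = rollout.generate_transitions(model, venv, n_timesteps=100,
                                      deterministic_policy=True)
assert np.allclose(first[1], second[1])  # index 1 is actions in the tuple API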
Example #9
def test_train_disc_improve_D(use_gail, n_timesteps=200, n_steps=1000):
    trainer = init_test_trainer(use_gail)
    transitions = rollout.generate_transitions(trainer.gen_policy,
                                               trainer.venv,
                                               n_timesteps=n_timesteps)
    kwargs = dict(gen_obs=transitions.obs,
                  gen_acts=transitions.acts,
                  gen_next_obs=transitions.next_obs)
    loss1 = trainer.eval_disc_loss(**kwargs)
    trainer.train_disc(n_steps=n_steps, **kwargs)
    loss2 = trainer.eval_disc_loss(**kwargs)
    assert loss2 < loss1
Example #10
    def _populate_gen_replay_buffer(self) -> None:
        """Generate and store generator samples in the buffer.

        More specifically, rolls out generator-policy trajectories in the
        environment until `self._n_disc_samples_per_buffer` obs-act-obs
        samples are produced, and then stores these samples.
        """
        gen_rollouts = rollout.generate_transitions(
            self._gen_policy,
            self.env_train,
            n_timesteps=self._n_disc_samples_per_buffer)[:3]
        self._gen_replay_buffer.store(*gen_rollouts)
Example #11
def test_serialize_identity(session, env_name, reward_net):
    """Does output of deserialized reward network match that of original?"""
    net_name, net_cls = reward_net
    print(f"Testing {net_name}")

    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    with tf.variable_scope("original"):
        original = net_cls(venv.observation_space, venv.action_space)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    with tempfile.TemporaryDirectory(
            prefix='imitation-serialize-rew') as tmpdir:
        original.save(tmpdir)
        with tf.variable_scope("loaded"):
            loaded = net_cls.load(tmpdir)

        assert original.observation_space == loaded.observation_space
        assert original.action_space == loaded.action_space

        rollouts = rollout.generate_transitions(random, venv, n_timesteps=100)
        feed_dict = {}
        outputs = {'train': [], 'test': []}
        for net in [original, loaded]:
            feed_dict.update(_make_feed_dict(net, rollouts))
            outputs['train'].append(net.reward_output_train)
            outputs['test'].append(net.reward_output_test)

        unshaped_name = f"{net_name}_unshaped"
        shaped_name = f"{net_name}_shaped"
        with serialize.load_reward(unshaped_name, tmpdir, venv) as unshaped_fn:
            with serialize.load_reward(shaped_name, tmpdir, venv) as shaped_fn:
                rewards = session.run(outputs, feed_dict=feed_dict)

                old_obs, actions, new_obs, _ = rollouts
                steps = np.zeros((old_obs.shape[0], ))
                rewards['train'].append(
                    shaped_fn(old_obs, actions, new_obs, steps))
                rewards['test'].append(
                    unshaped_fn(old_obs, actions, new_obs, steps))

    for key, predictions in rewards.items():
        assert len(predictions) == 3
        assert np.allclose(predictions[0], predictions[1])
        assert np.allclose(predictions[0], predictions[2])
Example #12
def test_serialize_identity(session, env_name, net_cls, tmpdir):
  """Does output of deserialized reward network match that of original?"""
  logging.info(f"Testing {net_cls}")

  venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
  with tf.variable_scope("original"):
    original = net_cls(venv.observation_space, venv.action_space)
  random = base.RandomPolicy(venv.observation_space, venv.action_space)
  session.run(tf.global_variables_initializer())

  original.save(tmpdir)
  with tf.variable_scope("loaded"):
    loaded = reward_net.RewardNet.load(tmpdir)

  assert original.observation_space == loaded.observation_space
  assert original.action_space == loaded.action_space

  transitions = rollout.generate_transitions(random, venv, n_timesteps=100)
  feed_dict = {}
  outputs = {'train': [], 'test': []}
  for net in [original, loaded]:
    feed_dict.update(_make_feed_dict(net, transitions))
    outputs['train'].append(net.reward_output_train)
    outputs['test'].append(net.reward_output_test)

  with serialize.load_reward("RewardNet_unshaped",
                             tmpdir, venv) as unshaped_fn:
    with serialize.load_reward("RewardNet_shaped",
                               tmpdir, venv) as shaped_fn:
      rewards = session.run(outputs, feed_dict=feed_dict)

      steps = np.zeros((transitions.obs.shape[0],))
      args = (transitions.obs, transitions.acts,
              transitions.next_obs, steps)
      rewards['train'].append(shaped_fn(*args))
      rewards['test'].append(unshaped_fn(*args))

  for key, predictions in rewards.items():
    assert len(predictions) == 3
    assert np.allclose(predictions[0], predictions[1])
    assert np.allclose(predictions[0], predictions[2])
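Examples #11 and #12 also show that serialize.load_reward yields a plain callable mapping batches of (obs, acts, next_obs, steps) to rewards, so transitions sampled with generate_transitions can be re-scored directly. A hedged sketch of that pattern, with `venv`, `random`, and `tmpdir` assumed to be set up as in Example #12:

# Re-score sampled transitions with a deserialized reward function.
transitions = rollout.generate_transitions(random, venv, n_timesteps=100)
steps = np.zeros((transitions.obs.shape[0],))
with serialize.load_reward("RewardNet_unshaped", tmpdir, venv) as reward_fn:
    rewards = reward_fn(transitions.obs, transitions.acts,
                        transitions.next_obs, steps)
assert len(rewards) == len(transitions.obs)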
Example #13
def test_train_disc_step_no_crash(tmpdir, use_gail, parallel, n_timesteps=200):
    trainer = init_test_trainer(tmpdir, use_gail=use_gail, parallel=parallel)
    transitions = rollout.generate_transitions(trainer.gen_policy,
                                               trainer.venv,
                                               n_timesteps=n_timesteps)
    trainer.train_disc_step(gen_samples=transitions)