def test_serialize_identity(session, env_id, discrim_net_cls):
    """Does output of deserialized discriminator match that of original?"""
    env = gym.make(env_id)
    original = DISCRIM_NET_SETUPS[discrim_net_cls](env)
    random = base.RandomPolicy(env.observation_space, env.action_space)
    session.run(tf.global_variables_initializer())

    with tempfile.TemporaryDirectory(prefix='imitation-serialize') as tmpdir:
        original.save(tmpdir)
        with tf.variable_scope("loaded"):
            loaded = discrim_net_cls.load(tmpdir)

    old_obs, act, new_obs, _rew = rollout.generate_transitions(
        random, env, n_timesteps=100)
    labels = np.random.randint(2, size=len(old_obs)).astype(np.float32)
    log_prob = np.random.randn(len(old_obs))

    feed_dict = {}
    outputs = {'train': [], 'test': []}
    for net in [original, loaded]:
        feed_dict.update({
            net.old_obs_ph: old_obs,
            net.act_ph: act,
            net.new_obs_ph: new_obs,
            net.labels_ph: labels,
            net.log_policy_act_prob_ph: log_prob,
        })
        outputs['train'].append(net.policy_train_reward)
        outputs['test'].append(net.policy_test_reward)

    rewards = session.run(outputs, feed_dict=feed_dict)

    for key, predictions in rewards.items():
        assert len(predictions) == 2
        assert np.allclose(predictions[0], predictions[1])
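# NOTE: `session` and `env_id` above are pytest fixtures that are not part of
# this excerpt, and `DISCRIM_NET_SETUPS` / `discrim_net_cls` come from test
# parametrization defined elsewhere. A minimal sketch of the fixtures these
# tests assume (the environment list is a placeholder, not taken from the
# original module):
import pytest
import tensorflow as tf

ENV_IDS = ["CartPole-v1"]  # hypothetical parametrization


@pytest.fixture
def session():
    """Provide a fresh TF1 graph and default session for each test."""
    with tf.Graph().as_default(), tf.Session() as sess:
        with sess.as_default():
            yield sess


@pytest.fixture(params=ENV_IDS)
def env_id(request):
    return request.param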
def test_serialize_identity(session, env_id, reward_net_cls):
    """Does output of deserialized reward network match that of original?"""
    env = gym.make(env_id)
    with tf.variable_scope("original"):
        original = reward_net_cls(env.observation_space, env.action_space)
    random = base.RandomPolicy(env.observation_space, env.action_space)
    session.run(tf.global_variables_initializer())

    with tempfile.TemporaryDirectory(prefix='imitation-serialize-rew') as tmpdir:
        original.save(tmpdir)
        with tf.variable_scope("loaded"):
            loaded = reward_net_cls.load(tmpdir)

    assert original.observation_space == loaded.observation_space
    assert original.action_space == loaded.action_space

    rollouts = rollout.generate_transitions(random, env, n_timesteps=100)
    feed_dict = {}
    outputs = {'train': [], 'test': []}
    for net in [original, loaded]:
        feed_dict.update(_make_feed_dict(net, rollouts))
        outputs['train'].append(net.reward_output_train)
        outputs['test'].append(net.reward_output_test)

    rewards = session.run(outputs, feed_dict=feed_dict)

    for key, predictions in rewards.items():
        assert len(predictions) == 2
        assert np.allclose(predictions[0], predictions[1])
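# NOTE: `_make_feed_dict` is not shown in this excerpt. Mirroring the explicit
# feed dict built in the discriminator test above, a plausible sketch follows.
# The placeholder attribute names (old_obs_ph / act_ph / new_obs_ph) are an
# assumption about this revision of RewardNet; later revisions below pass a
# Transitions object with obs / acts / next_obs fields instead of a tuple.
def _make_feed_dict(net, rollouts):
    """Map an (obs, act, next_obs, rew) rollout tuple onto `net`'s placeholders."""
    old_obs, act, new_obs, _rew = rollouts
    return {
        net.old_obs_ph: old_obs,
        net.act_ph: act,
        net.new_obs_ph: new_obs,
    }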
def test_train_disc_no_crash(use_gail, parallel,
                             env='CartPole-v1', n_timesteps=200):
    """Smoke test: `train_disc` runs with and without explicit generator samples."""
    trainer = init_test_trainer(env, use_gail=use_gail, parallel=parallel)
    trainer.train_disc()
    obs_old, act, obs_new, _ = rollout.generate_transitions(
        trainer.gen_policy, env, n_timesteps=n_timesteps)
    trainer.train_disc(gen_old_obs=obs_old, gen_act=act, gen_new_obs=obs_new)
def test_train_disc_no_crash(use_gail, parallel, n_timesteps=200):
    """Smoke test: `train_disc` runs with and without explicit generator samples."""
    trainer = init_test_trainer(use_gail=use_gail, parallel=parallel)
    trainer.train_disc()
    transitions = rollout.generate_transitions(
        trainer.gen_policy, trainer.venv, n_timesteps=n_timesteps)
    trainer.train_disc(gen_obs=transitions.obs,
                       gen_acts=transitions.acts,
                       gen_next_obs=transitions.next_obs)
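# NOTE: `use_gail` and `parallel` above are supplied by pytest parametrization
# in the real test module, and `init_test_trainer` is a module-level helper
# that builds a small AIRL (use_gail=False) or GAIL (use_gail=True) trainer.
# Neither is reproduced here (the helper's signature differs across the
# revisions in this file), so the sketch below only illustrates the assumed
# parametrization pattern with a placeholder body.
import pytest


@pytest.mark.parametrize("use_gail", [False, True])
@pytest.mark.parametrize("parallel", [False, True])
def test_parametrization_sketch(use_gail, parallel):
    # Placeholder: the real tests build a trainer via init_test_trainer and
    # exercise trainer.train_disc() as shown above.
    assert isinstance(use_gail, bool) and isinstance(parallel, bool)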
def test_train_disc_improve_D(use_gail, env='CartPole-v1',
                              n_timesteps=200, n_steps=1000):
    """Discriminator loss should decrease after training on fixed samples."""
    trainer = init_test_trainer(env, use_gail)
    obs_old, act, obs_new, _ = rollout.generate_transitions(
        trainer.gen_policy, env, n_timesteps=n_timesteps)
    kwargs = dict(gen_old_obs=obs_old, gen_act=act, gen_new_obs=obs_new)
    loss1 = trainer.eval_disc_loss(**kwargs)
    trainer.train_disc(n_steps=n_steps, **kwargs)
    loss2 = trainer.eval_disc_loss(**kwargs)
    assert loss2 < loss1
def test_train_disc_improve_D(tmpdir, use_gail, n_timesteps=200, n_steps=1000):
    """Discriminator loss should decrease after training on fixed samples."""
    trainer = init_test_trainer(tmpdir, use_gail)
    gen_samples = rollout.generate_transitions(
        trainer.gen_policy, trainer.venv_train_norm, n_timesteps=n_timesteps)
    loss1 = trainer.eval_disc_loss(gen_samples=gen_samples)
    for _ in range(n_steps):
        trainer.train_disc_step(gen_samples=gen_samples)
    loss2 = trainer.eval_disc_loss(gen_samples=gen_samples)
    assert loss2 < loss1
def test_actions_valid(env_name, policy_type):
    """Test output actions of our custom policies always lie in action space."""
    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    with serialize.load_policy(policy_type, "foobar", venv) as policy:
        transitions = rollout.generate_transitions(policy, venv, n_timesteps=100)

    for a in transitions.acts:
        assert venv.action_space.contains(a)
def test_serialize_identity(env_name, model_cfg, normalize):
    """Test output actions of deserialized policy are same as original."""
    orig_venv = venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    vec_normalize = None
    if normalize:
        venv = vec_normalize = VecNormalize(venv)

    model_name, model_cls_name = model_cfg
    try:
        model_cls = registry.load_attr(model_cls_name)
    except (AttributeError, ImportError):  # pragma: no cover
        pytest.skip("Couldn't load stable baselines class. "
                    "(Probably because mpi4py not installed.)")

    model = model_cls('MlpPolicy', venv)
    model.learn(1000)

    venv.env_method('seed', 0)
    venv.reset()
    if normalize:
        # don't want statistics to change as we collect rollouts
        vec_normalize.training = False
    orig_rollout = rollout.generate_transitions(model, venv, n_timesteps=1000,
                                                deterministic_policy=True)

    with tempfile.TemporaryDirectory(prefix='imitation-serialize-pol') as tmpdir:
        serialize.save_stable_model(tmpdir, model, vec_normalize)
        # We use `orig_venv` since `load_policy` automatically wraps `loaded`
        # with a VecNormalize, when appropriate.
        with serialize.load_policy(model_name, tmpdir, orig_venv) as loaded:
            orig_venv.env_method('seed', 0)
            orig_venv.reset()
            new_rollout = rollout.generate_transitions(
                loaded, orig_venv, n_timesteps=1000, deterministic_policy=True)

    orig_acts = orig_rollout[1]
    new_acts = new_rollout[1]
    assert np.allclose(orig_acts, new_acts)
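# NOTE: `model_cfg` pairs a policy-registry name with an import string that
# `registry.load_attr` resolves to a stable-baselines class, and `normalize`
# toggles the VecNormalize wrapper. The values below are an illustrative guess
# at the parametrization (including the "module:attr" string format), not
# taken from this excerpt.
MODEL_CFGS = [
    ("ppo2", "stable_baselines:PPO2"),
    ("sac", "stable_baselines:SAC"),
]
NORMALIZE = [False, True]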
def test_train_disc_improve_D(use_gail, n_timesteps=200, n_steps=1000):
    """Discriminator loss should decrease after training on fixed samples."""
    trainer = init_test_trainer(use_gail)
    transitions = rollout.generate_transitions(
        trainer.gen_policy, trainer.venv, n_timesteps=n_timesteps)
    kwargs = dict(gen_obs=transitions.obs,
                  gen_acts=transitions.acts,
                  gen_next_obs=transitions.next_obs)
    loss1 = trainer.eval_disc_loss(**kwargs)
    trainer.train_disc(n_steps=n_steps, **kwargs)
    loss2 = trainer.eval_disc_loss(**kwargs)
    assert loss2 < loss1
def _populate_gen_replay_buffer(self) -> None:
    """Generate and store generator samples in the buffer.

    More specifically, rolls out generator-policy trajectories in the
    environment until `self._n_disc_samples_per_buffer` obs-act-obs samples
    are produced, and then stores these samples.
    """
    gen_rollouts = rollout.generate_transitions(
        self._gen_policy, self.env_train,
        n_timesteps=self._n_disc_samples_per_buffer)[:3]
    self._gen_replay_buffer.store(*gen_rollouts)
def test_serialize_identity(session, env_name, reward_net):
    """Does output of deserialized reward network match that of original?"""
    net_name, net_cls = reward_net
    print(f"Testing {net_name}")
    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    with tf.variable_scope("original"):
        original = net_cls(venv.observation_space, venv.action_space)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    with tempfile.TemporaryDirectory(prefix='imitation-serialize-rew') as tmpdir:
        original.save(tmpdir)
        with tf.variable_scope("loaded"):
            loaded = net_cls.load(tmpdir)

        assert original.observation_space == loaded.observation_space
        assert original.action_space == loaded.action_space

        rollouts = rollout.generate_transitions(random, venv, n_timesteps=100)
        feed_dict = {}
        outputs = {'train': [], 'test': []}
        for net in [original, loaded]:
            feed_dict.update(_make_feed_dict(net, rollouts))
            outputs['train'].append(net.reward_output_train)
            outputs['test'].append(net.reward_output_test)

        unshaped_name = f"{net_name}_unshaped"
        shaped_name = f"{net_name}_shaped"
        with serialize.load_reward(unshaped_name, tmpdir, venv) as unshaped_fn:
            with serialize.load_reward(shaped_name, tmpdir, venv) as shaped_fn:
                rewards = session.run(outputs, feed_dict=feed_dict)

                old_obs, actions, new_obs, _ = rollouts
                steps = np.zeros((old_obs.shape[0], ))
                rewards['train'].append(
                    shaped_fn(old_obs, actions, new_obs, steps))
                rewards['test'].append(
                    unshaped_fn(old_obs, actions, new_obs, steps))

    for key, predictions in rewards.items():
        assert len(predictions) == 3
        assert np.allclose(predictions[0], predictions[1])
        assert np.allclose(predictions[0], predictions[2])
def test_serialize_identity(session, env_name, net_cls, tmpdir):
    """Does output of deserialized reward network match that of original?"""
    logging.info(f"Testing {net_cls}")
    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    with tf.variable_scope("original"):
        original = net_cls(venv.observation_space, venv.action_space)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    original.save(tmpdir)
    with tf.variable_scope("loaded"):
        loaded = reward_net.RewardNet.load(tmpdir)

    assert original.observation_space == loaded.observation_space
    assert original.action_space == loaded.action_space

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)
    feed_dict = {}
    outputs = {'train': [], 'test': []}
    for net in [original, loaded]:
        feed_dict.update(_make_feed_dict(net, transitions))
        outputs['train'].append(net.reward_output_train)
        outputs['test'].append(net.reward_output_test)

    with serialize.load_reward("RewardNet_unshaped", tmpdir, venv) as unshaped_fn:
        with serialize.load_reward("RewardNet_shaped", tmpdir, venv) as shaped_fn:
            rewards = session.run(outputs, feed_dict=feed_dict)

            steps = np.zeros((transitions.obs.shape[0],))
            args = (transitions.obs, transitions.acts,
                    transitions.next_obs, steps)
            rewards['train'].append(shaped_fn(*args))
            rewards['test'].append(unshaped_fn(*args))

    for key, predictions in rewards.items():
        assert len(predictions) == 3
        assert np.allclose(predictions[0], predictions[1])
        assert np.allclose(predictions[0], predictions[2])
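# NOTE: outside these identity tests, a serialized reward is typically loaded
# as a reward function for re-training a policy. A minimal sketch, assuming
# the reward wrapper lives at imitation.util.reward_wrapper.RewardVecEnvWrapper
# and accepts a callable with the (obs, acts, next_obs, steps) signature
# exercised above; adjust names and import paths to the actual library version.
from stable_baselines import PPO2

from imitation.rewards import serialize as rewards_serialize
from imitation.util import reward_wrapper, util


def retrain_with_learned_reward(env_name, reward_path, total_timesteps=2048):
    """Sketch: train a PPO2 policy against a previously saved shaped reward."""
    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    with rewards_serialize.load_reward("RewardNet_shaped", reward_path,
                                       venv) as reward_fn:
        # Replace the environment's native reward with the learned one.
        wrapped = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
        model = PPO2('MlpPolicy', wrapped)
        model.learn(total_timesteps)
    return model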
def test_train_disc_step_no_crash(tmpdir, use_gail, parallel, n_timesteps=200):
    """Smoke test: a single discriminator training step runs without crashing."""
    trainer = init_test_trainer(tmpdir, use_gail=use_gail, parallel=parallel)
    transitions = rollout.generate_transitions(
        trainer.gen_policy, trainer.venv, n_timesteps=n_timesteps)
    trainer.train_disc_step(gen_samples=transitions)