def test_serialize_identity(discrim_net, venv, tmpdir):
    """Does output of deserialized discriminator match that of original?"""
    original = discrim_net
    random = base.RandomPolicy(venv.observation_space, venv.action_space)

    tmppath = os.path.join(tmpdir, "discrim_net.pt")
    th.save(original, tmppath)
    loaded = th.load(tmppath)

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)

    rewards = {"train": [], "test": []}
    for net in [original, loaded]:
        rewards["train"].append(
            net.predict_reward_train(
                transitions.obs,
                transitions.acts,
                transitions.next_obs,
                transitions.dones,
            )
        )
        rewards["test"].append(
            net.predict_reward_test(
                transitions.obs,
                transitions.acts,
                transitions.next_obs,
                transitions.dones,
            )
        )

    for key, predictions in rewards.items():
        assert len(predictions) == 2
        assert np.allclose(predictions[0], predictions[1])
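# For reference, a minimal standalone sketch of the pickle round-trip pattern the
# serialization tests in this file exercise. The toy `nn.Linear` module is
# hypothetical, and it assumes `th.load` unpickles the full module (the classic
# PyTorch default behavior).
def _roundtrip_sketch(tmpdir):
    net = th.nn.Linear(4, 1)
    path = os.path.join(tmpdir, "net.pt")
    th.save(net, path)
    loaded = th.load(path)
    x = th.randn(8, 4)
    # Identical weights should produce identical outputs after the round trip.
    assert th.allclose(net(x), loaded(x))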
def test_train_disc_small_expert_data_warning(tmpdir, _algorithm_cls):
    logger.configure(tmpdir, ["tensorboard", "stdout"])
    venv = util.make_vec_env(
        "CartPole-v1",
        n_envs=2,
        parallel=_parallel,
    )
    gen_algo = util.init_rl(venv, verbose=1)
    small_data = rollout.generate_transitions(gen_algo, venv, n_timesteps=20)

    with pytest.raises(ValueError, match="Transitions.*expert_batch_size"):
        _algorithm_cls(
            venv=venv,
            expert_data=small_data,
            expert_batch_size=21,
            gen_algo=gen_algo,
            log_dir=tmpdir,
        )

    with pytest.raises(ValueError, match="expert_batch_size.*positive"):
        _algorithm_cls(
            venv=venv,
            expert_data=small_data,
            expert_batch_size=-1,
            gen_algo=gen_algo,
            log_dir=tmpdir,
        )
def test_serialize_identity(env_name, model_cfg, normalize, tmpdir):
    """Test that output actions of a deserialized policy match the original's."""
    orig_venv = venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    vec_normalize = None
    if normalize:
        venv = vec_normalize = VecNormalize(venv)

    model_name, model_cls_name = model_cfg
    try:
        model_cls = registry.load_attr(model_cls_name)
    except (AttributeError, ImportError):  # pragma: no cover
        pytest.skip(
            "Couldn't load Stable Baselines class. "
            "(Probably because mpi4py is not installed.)"
        )

    model = model_cls("MlpPolicy", venv)
    model.learn(1000)

    venv.env_method("seed", 0)
    venv.reset()
    if normalize:
        # Don't want statistics to change as we collect rollouts.
        vec_normalize.training = False
    orig_rollout = rollout.generate_transitions(
        model,
        venv,
        n_timesteps=1000,
        deterministic_policy=True,
        rng=np.random.RandomState(0),
    )

    serialize.save_stable_model(tmpdir, model, vec_normalize)
    # We use `orig_venv` since `load_policy` automatically wraps `loaded`
    # with a VecNormalize, when appropriate.
    with serialize.load_policy(model_name, tmpdir, orig_venv) as loaded:
        orig_venv.env_method("seed", 0)
        orig_venv.reset()
        new_rollout = rollout.generate_transitions(
            loaded,
            orig_venv,
            n_timesteps=1000,
            deterministic_policy=True,
            rng=np.random.RandomState(0),
        )

    assert np.allclose(orig_rollout.acts, new_rollout.acts)
def test_potential_shaping_invariants(
    graph, session, venv, potential_cls, discount: float, num_timesteps: int = 100
):
    """Test that potential shaping obeys several invariants.

    Specifically:
        1. `new_potential` must be constant when `dones` is true, and zero
           when `discount == 1.0`.
        2. `new_potential` depends only on the next observation.
        3. `old_potential` depends only on the current observation.
        4. Shaping equals `discount * new_potential - old_potential`.
    """
    policy = base.RandomPolicy(venv.observation_space, venv.action_space)
    transitions = rollout.generate_transitions(policy, venv, n_timesteps=num_timesteps)

    with graph.as_default(), session.as_default():
        potential = potential_cls(
            venv.observation_space, venv.action_space, discount=discount
        )
        session.run(tf.global_variables_initializer())
        (old_pot,), (new_pot,) = rewards.evaluate_potentials([potential], transitions)

    # Invariant 1: new_potential must be constant when dones is true,
    # and zero when discount == 1.0.
    transitions_all_done = dataclasses.replace(
        transitions, dones=np.ones_like(transitions.dones, dtype=bool)
    )
    with session.as_default():
        _, new_pot_done = rewards.evaluate_potentials([potential], transitions_all_done)
    expected_new_pot_done = 0.0 if discount == 1.0 else np.mean(new_pot_done)
    assert np.allclose(new_pot_done, expected_new_pot_done)

    # Invariants 2 and 3: {new,old}_potential depend only on the
    # {next,current} observation, so shuffling the other field should
    # leave them unchanged.
    def _shuffle(fld: str):
        arr = np.array(getattr(transitions, fld))
        np.random.shuffle(arr)
        trans = dataclasses.replace(transitions, **{fld: arr})
        with session.as_default():
            return rewards.evaluate_potentials([potential], trans)

    (old_pot_shuffled,), _ = _shuffle("next_obs")
    _, (new_pot_shuffled,) = _shuffle("obs")
    assert np.all(old_pot == old_pot_shuffled)
    assert np.all(new_pot == new_pot_shuffled)

    # Invariant 4: the reward output is as expected given the potentials.
    with session.as_default():
        rew = rewards.evaluate_models({"m": potential}, transitions)["m"]
    assert np.allclose(rew, discount * new_pot - old_pot)
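# For intuition, invariant 4 above is the standard potential-based shaping
# identity F(s, s') = discount * Phi(s') - Phi(s). A minimal numpy-only sketch
# with a hypothetical `potential_fn`; it pins the terminal potential to zero,
# one common convention (the test above only requires it to be constant, and
# zero when discount == 1.0):
def _shaping_sketch(potential_fn, obs, next_obs, dones, discount):
    old_pot = potential_fn(obs)
    # Terminal transitions contribute no next-state potential.
    new_pot = np.where(dones, 0.0, potential_fn(next_obs))
    return discount * new_pot - old_pot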
def test_train_disc_step_no_crash(trainer, expert_batch_size):
    transitions = rollout.generate_transitions(
        trainer.gen_algo,
        trainer.venv,
        n_timesteps=expert_batch_size,
        truncate=True,
    )
    trainer.train_disc(gen_samples=types.dataclass_quick_asdict(transitions))
def test_actions_valid(env_name, policy_type):
    """Test output actions of our custom policies always lie in action space."""
    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    policy = serialize.load_policy(policy_type, "foobar", venv)
    transitions = rollout.generate_transitions(policy, venv, n_timesteps=100)

    for a in transitions.acts:
        assert venv.action_space.contains(a)
def test_train_disc_improve_D(tmpdir, trainer, n_timesteps=200, n_steps=1000):
    gen_samples = rollout.generate_transitions(
        trainer.gen_policy, trainer.venv_train_norm, n_timesteps=n_timesteps
    )
    loss1 = trainer.eval_disc_loss(gen_samples=gen_samples)
    for _ in range(n_steps):
        trainer.train_disc_step(gen_samples=gen_samples)
    loss2 = trainer.eval_disc_loss(gen_samples=gen_samples)
    assert loss2 < loss1
def test_serialize_identity(env_name, model_cfg, normalize, tmpdir):
    """Test that output actions of a deserialized policy match the original's."""
    orig_venv = venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    vec_normalize = None
    if normalize:
        venv = vec_normalize = VecNormalize(venv)

    model_name, model_cls_name = model_cfg
    model_cls = registry.load_attr(model_cls_name)

    # FIXME(sam): verbose=1 is a hack to stop it from setting up the SB logger
    model = model_cls("MlpPolicy", venv, verbose=1)
    model.learn(1000)

    venv.env_method("seed", 0)
    venv.reset()
    if normalize:
        # Don't want statistics to change as we collect rollouts.
        vec_normalize.training = False
    orig_rollout = rollout.generate_transitions(
        model,
        venv,
        n_timesteps=1000,
        deterministic_policy=True,
        rng=np.random.RandomState(0),
    )

    serialize.save_stable_model(tmpdir, model, vec_normalize)
    # We use `orig_venv` since `load_policy` automatically wraps `loaded`
    # with a VecNormalize, when appropriate.
    loaded = serialize.load_policy(model_name, tmpdir, orig_venv)
    orig_venv.env_method("seed", 0)
    orig_venv.reset()
    new_rollout = rollout.generate_transitions(
        loaded,
        orig_venv,
        n_timesteps=1000,
        deterministic_policy=True,
        rng=np.random.RandomState(0),
    )

    assert np.allclose(orig_rollout.acts, new_rollout.acts)
def test_train_disc_improve_D(tmpdir, trainer, n_timesteps=200, n_steps=100):
    gen_samples = rollout.generate_transitions(
        trainer.gen_algo, trainer.venv_train_norm, n_timesteps=n_timesteps
    )
    init_stats = None
    final_stats = None
    for _ in range(n_steps):
        final_stats = trainer.train_disc_step(gen_samples=gen_samples)
        if init_stats is None:
            init_stats = final_stats
    assert final_stats["disc_loss"] < init_stats["disc_loss"]
def test_ground_truth_similar_to_gym(graph, session, venv, reward_id):
    """Checks that the reward model's predictions match the Gym reward."""
    # Generate rollouts, recording the Gym reward.
    policy = base_policies.RandomPolicy(venv.observation_space, venv.action_space)
    transitions = rollout.generate_transitions(policy, venv, n_timesteps=1024)
    gym_reward = transitions.rews

    # Make predictions using the reward model.
    with graph.as_default(), session.as_default():
        reward_model = serialize.load_reward(reward_id, "dummy", venv, 1.0)
        pred_reward = base.evaluate_models({"m": reward_model}, transitions)["m"]

    # Are the predictions close to the true Gym reward?
    np.testing.assert_allclose(gym_reward, pred_reward, rtol=0, atol=5e-5)
def test_serialize_identity(env_name, net_cls, tmpdir):
    """Does output of deserialized reward network match that of original?"""
    logging.info(f"Testing {net_cls}")

    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    original = net_cls(venv.observation_space, venv.action_space)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)

    tmppath = os.path.join(tmpdir, "reward.pt")
    th.save(original, tmppath)
    loaded = th.load(tmppath)

    assert original.observation_space == loaded.observation_space
    assert original.action_space == loaded.action_space

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)
    unshaped_fn = serialize.load_reward("RewardNet_unshaped", tmppath, venv)
    shaped_fn = serialize.load_reward("RewardNet_shaped", tmppath, venv)

    args = (
        transitions.obs,
        transitions.acts,
        transitions.next_obs,
        transitions.dones,
    )
    rewards = {"train": [], "test": []}
    for net in [original, loaded]:
        rewards["train"].append(net.predict_reward_train(*args))
        rewards["test"].append(net.predict_reward_test(*args))
    rewards["train"].append(shaped_fn(*args))
    rewards["test"].append(unshaped_fn(*args))

    for key, predictions in rewards.items():
        assert len(predictions) == 3
        assert np.allclose(predictions[0], predictions[1])
        assert np.allclose(predictions[0], predictions[2])
def test_train_disc_small_expert_data_warning(tmpdir, _algorithm_cls):
    logger.configure(tmpdir, ["tensorboard", "stdout"])
    venv = util.make_vec_env(
        "CartPole-v1",
        n_envs=2,
        parallel=_parallel,
        log_dir=tmpdir,
    )
    gen_algo = util.init_rl(venv, verbose=1)
    small_data = rollout.generate_transitions(gen_algo, venv, n_timesteps=20)

    with pytest.warns(RuntimeWarning, match="discriminator batch size"):
        _algorithm_cls(
            venv=venv,
            expert_data=small_data,
            gen_algo=gen_algo,
            log_dir=tmpdir,
        )
def test_serialize_identity(session, env_name, net_cls, tmpdir):
    """Does output of deserialized reward network match that of original?"""
    logging.info(f"Testing {net_cls}")

    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    with tf.variable_scope("original"):
        original = net_cls(venv.observation_space, venv.action_space)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    original.save(tmpdir)
    with tf.variable_scope("loaded"):
        loaded = reward_net.RewardNet.load(tmpdir)

    assert original.observation_space == loaded.observation_space
    assert original.action_space == loaded.action_space

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)

    feed_dict = {}
    outputs = {"train": [], "test": []}
    for net in [original, loaded]:
        feed_dict.update(_make_feed_dict(net, transitions))
        outputs["train"].append(net.reward_output_train)
        outputs["test"].append(net.reward_output_test)

    with serialize.load_reward("RewardNet_unshaped", tmpdir, venv) as unshaped_fn:
        with serialize.load_reward("RewardNet_shaped", tmpdir, venv) as shaped_fn:
            rewards = session.run(outputs, feed_dict=feed_dict)
            args = (
                transitions.obs,
                transitions.acts,
                transitions.next_obs,
                transitions.dones,
            )
            rewards["train"].append(shaped_fn(*args))
            rewards["test"].append(unshaped_fn(*args))

    for key, predictions in rewards.items():
        assert len(predictions) == 3
        assert np.allclose(predictions[0], predictions[1])
        assert np.allclose(predictions[0], predictions[2])
def test_train_disc_improve_D(
    tmpdir, trainer, expert_transitions, expert_batch_size, n_steps=3
):
    expert_samples = expert_transitions[:expert_batch_size]
    expert_samples = types.dataclass_quick_asdict(expert_samples)
    gen_samples = rollout.generate_transitions(
        trainer.gen_algo,
        trainer.venv_train,
        n_timesteps=expert_batch_size,
        truncate=True,
    )
    gen_samples = types.dataclass_quick_asdict(gen_samples)
    init_stats = final_stats = None
    for _ in range(n_steps):
        final_stats = trainer.train_disc(
            gen_samples=gen_samples, expert_samples=expert_samples
        )
        if init_stats is None:
            init_stats = final_stats
    assert final_stats["disc_loss"] < init_stats["disc_loss"]
def test_serialize_identity(session, env_name, discrim_net_cls, tmpdir):
    """Does output of deserialized discriminator match that of original?"""
    venv = util.make_vec_env(env_name, parallel=False)
    original = DISCRIM_NET_SETUPS[discrim_net_cls](venv)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    original.save(tmpdir)
    with tf.variable_scope("loaded"):
        loaded = discrim_net.DiscrimNet.load(tmpdir)

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)
    length = len(transitions.obs)  # n_timesteps is only a lower bound
    labels = np.random.randint(2, size=length).astype(np.float32)
    log_prob = np.random.randn(length)

    feed_dict = {}
    outputs = {"train": [], "test": []}
    for net in [original, loaded]:
        feed_dict.update(
            {
                net.obs_ph: transitions.obs,
                net.act_ph: transitions.acts,
                net.next_obs_ph: transitions.next_obs,
                net.labels_gen_is_one_ph: labels,
                net.log_policy_act_prob_ph: log_prob,
            }
        )
        outputs["train"].append(net.policy_train_reward)
        outputs["test"].append(net.policy_test_reward)

    rewards = session.run(outputs, feed_dict=feed_dict)

    for key, predictions in rewards.items():
        assert len(predictions) == 2
        assert np.allclose(predictions[0], predictions[1])
def f(total_timesteps: int) -> types.Transitions:
    # TODO(adam): inefficient -- discards partial trajectories and resets environment
    return rollout.generate_transitions(policy, venv, n_timesteps=total_timesteps)
def test_train_disc_step_no_crash(trainer, n_timesteps=200):
    transitions = rollout.generate_transitions(
        trainer.gen_algo, trainer.venv, n_timesteps=n_timesteps
    )
    trainer.train_disc_step(gen_samples=transitions)
def test_train_disc_step_no_crash(tmpdir, use_gail, parallel, n_timesteps=200):
    trainer = init_test_trainer(tmpdir, use_gail=use_gail, parallel=parallel)
    transitions = rollout.generate_transitions(
        trainer.gen_policy, trainer.venv, n_timesteps=n_timesteps
    )
    trainer.train_disc_step(gen_samples=transitions)