def test_demo_mismatch():
    path_prefix = os.path.dirname(os.path.abspath(__file__))
    # observation size mismatch
    with pytest.raises(RuntimeError):
        mismatch_obs = setup_test_behavior_specs(
            False, False, vector_action_space=2, vector_obs_space=9
        )
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, mismatch_obs)
    # action mismatch
    with pytest.raises(RuntimeError):
        mismatch_act = setup_test_behavior_specs(
            False, False, vector_action_space=3, vector_obs_space=9
        )
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1, mismatch_act)
    # action type mismatch
    with pytest.raises(RuntimeError):
        mismatch_act_type = setup_test_behavior_specs(
            True, False, vector_action_space=[2], vector_obs_space=9
        )
        _, demo_buffer = demo_to_buffer(
            path_prefix + "/test.demo", 1, mismatch_act_type
        )
    # number of observations mismatch
    with pytest.raises(RuntimeError):
        mismatch_obs_number = setup_test_behavior_specs(
            False, True, vector_action_space=2, vector_obs_space=9
        )
        _, demo_buffer = demo_to_buffer(
            path_prefix + "/test.demo", 1, mismatch_obs_number
        )
def test_normalization():
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    time_horizon = 6
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    # Change half of the obs to 0
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
    policy = NNPolicy(
        0,
        behavior_spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        False,
        "testdir",
        False,
    )
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance are correct
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )

    assert steps == 6
    assert mean[0] == 0.5
    # Note: the running variance is divided by the number of steps, and is
    # initialized to 1 to avoid division by 0. The correct answer is 0.25.
    assert (variance[0] - 1) / steps == 0.25

    # Make another update, this time with all 1's
    time_horizon = 10
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance are correct
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )

    assert steps == 16
    assert mean[0] == 0.8125
    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
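# A standalone arithmetic check of the statistics asserted above (a minimal
# numpy sketch, independent of the policy's running-update op): the first
# trajectory contributes [0, 0, 0, 1, 1, 1], and the second adds ten 1's, so
# the combined stream has mean 13/16 = 0.8125 and population variance
# 0.8125 * (1 - 0.8125) ~= 0.152.
import numpy as np

obs_stream = np.array([0.0] * 3 + [1.0] * 3, dtype=np.float32)
assert obs_stream.mean() == 0.5 and obs_stream.var() == 0.25
obs_stream = np.concatenate([obs_stream, np.ones(10, dtype=np.float32)])
assert obs_stream.mean() == 0.8125
assert abs(obs_stream.var() - 0.152) < 0.01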
def create_optimizer_mock(
    trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual
):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
    )
    trainer_settings = trainer_config
    trainer_settings.reward_signals = reward_signal_config
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = NNPolicy(
        0, mock_specs, trainer_settings, False, "test", False, create_tf_graph=False
    )
    if trainer_settings.trainer_type == TrainerType.SAC:
        optimizer = SACOptimizer(policy, trainer_settings)
    else:
        optimizer = PPOOptimizer(policy, trainer_settings)
    return optimizer
def test_trainer_update_policy(
    dummy_config, curiosity_dummy_config, use_discrete  # noqa: F811
):
    mock_brain = mb.setup_test_behavior_specs(
        use_discrete,
        False,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_params = dummy_config
    trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
        memory_size=10, sequence_length=16
    )

    # Test curiosity reward signal
    trainer_params.reward_signals = curiosity_dummy_config
    trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy("test", mock_brain)
    trainer.add_policy("test", policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]

    trainer.update_buffer = buffer
    trainer._update_policy()
def test_normalizer_after_load(tmp_path):
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    time_horizon = 6
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_spec=behavior_spec.action_spec,
    )
    # Change half of the obs to 0
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
    trainer_params = TrainerSettings(network_settings=NetworkSettings(normalize=True))
    policy = TFPolicy(0, behavior_spec, trainer_params)
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance are correct
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )

    assert steps == 6
    assert mean[0] == 0.5
    assert variance[0] / steps == pytest.approx(0.25, abs=0.01)

    # Save ckpt and load into another policy
    path1 = os.path.join(tmp_path, "runid1")
    model_saver = TFModelSaver(trainer_params, path1)
    model_saver.register(policy)
    mock_brain_name = "MockBrain"
    model_saver.save_checkpoint(mock_brain_name, 6)
    assert len(os.listdir(tmp_path)) > 0
    policy1 = TFPolicy(0, behavior_spec, trainer_params)
    model_saver = TFModelSaver(trainer_params, path1, load=True)
    model_saver.register(policy1)
    model_saver.initialize_or_load(policy1)

    # Make another update to the new policy, this time with all 1's
    time_horizon = 10
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_spec=behavior_spec.action_spec,
    )
    trajectory_buffer = trajectory.to_agentbuffer()
    policy1.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance are correct
    steps, mean, variance = policy1.sess.run(
        [policy1.normalization_steps, policy1.running_mean, policy1.running_variance]
    )

    assert steps == 16
    assert mean[0] == 0.8125
    assert variance[0] / steps == pytest.approx(0.152, abs=0.01)
def test_load_and_set(dummy_config, use_discrete):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        False,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    trainer_params = dummy_config
    trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
    trainer.seed = 1
    policy = trainer.create_policy("test", mock_specs)
    trainer.seed = 20  # otherwise the graphs are the same
    to_load_policy = trainer.create_policy("test", mock_specs)

    weights = policy.get_weights()
    load_weights = to_load_policy.get_weights()
    # The two policies are seeded differently, so their weights should differ here
    try:
        for w, lw in zip(weights, load_weights):
            np.testing.assert_array_equal(w, lw)
    except AssertionError:
        pass

    to_load_policy.load_weights(weights)
    load_weights = to_load_policy.get_weights()

    for w, lw in zip(weights, load_weights):
        np.testing.assert_array_equal(w, lw)
def test_sac_save_load_buffer(tmpdir, dummy_config):
    mock_specs = mb.setup_test_behavior_specs(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    trainer_params = dummy_config
    trainer_params.hyperparameters.save_replay_buffer = True
    trainer = SACTrainer("test", 1, trainer_params, True, False, 0, "testdir")
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
    policy = trainer.create_policy(behavior_id, mock_specs)
    trainer.add_policy(behavior_id, policy)

    trainer.update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, policy.behavior_spec
    )
    buffer_len = trainer.update_buffer.num_experiences
    trainer.save_model()

    # Wipe Trainer and try to load
    trainer2 = SACTrainer("test", 1, trainer_params, True, True, 0, "testdir")
    policy = trainer2.create_policy(behavior_id, mock_specs)
    trainer2.add_policy(behavior_id, policy)

    assert trainer2.update_buffer.num_experiences == buffer_len
def create_policy_mock(
    dummy_config: TrainerSettings,
    use_rnn: bool = False,
    use_discrete: bool = True,
    use_visual: bool = False,
    model_path: str = "",
    load: bool = False,
    seed: int = 0,
) -> TFPolicy:
    mock_spec = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = dummy_config
    trainer_settings.keep_checkpoints = 3
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None
    )
    policy = TFPolicy(
        seed, mock_spec, trainer_settings, model_path=model_path, load=load
    )
    return policy
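# Example usage of the helper above (a sketch: test_policy_mock_smoke is a
# hypothetical test, and dummy_config is assumed to be the TrainerSettings
# fixture used throughout these tests):
def test_policy_mock_smoke(dummy_config):
    # Build a recurrent, discrete-action policy with default memory settings.
    policy = create_policy_mock(dummy_config, use_rnn=True, use_discrete=True)
    assert policy.use_recurrent  # memory settings enable the recurrent path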
def test_trainer_update_policy(
    dummy_config, curiosity_dummy_config, use_discrete  # noqa: F811
):
    mock_behavior_spec = mb.setup_test_behavior_specs(
        use_discrete,
        False,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_params = dummy_config
    trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
        memory_size=10, sequence_length=16
    )

    # Test curiosity reward signal
    trainer_params.reward_signals = curiosity_dummy_config
    mock_brain_name = "MockBrain"
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(mock_brain_name)
    trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(behavior_id, mock_behavior_spec)
    trainer.add_policy(behavior_id, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_behavior_spec)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]
    # NOTE: TF outputs the log probs of all actions, whereas PyTorch outputs
    # only those of the taken actions, so the fields are refilled here with
    # placeholders of the shape the TF trainer expects.
    if use_discrete:
        n_agents = len(buffer["discrete_log_probs"])
        buffer["discrete_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["discrete_log_probs"].append(
                np.ones(
                    int(sum(mock_behavior_spec.action_spec.discrete_branches)),
                    dtype=np.float32,
                )
            )
    else:
        n_agents = len(buffer["continuous_log_probs"])
        buffer["continuous_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["continuous_log_probs"].append(
                np.ones(
                    mock_behavior_spec.action_spec.continuous_size, dtype=np.float32
                )
            )
    trainer.update_buffer = buffer
    trainer._update_policy()
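# Shape illustration for the refill above (hypothetical branch sizes, for
# illustration only): with discrete branches (3, 2), the TF trainer stores a
# log prob for every one of the 5 possible actions at each step, not just the
# one action that was taken.
_branches = (3, 2)
_all_action_log_probs = np.ones(int(sum(_branches)), dtype=np.float32)
assert _all_action_log_probs.shape == (5,)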
def test_process_trajectory(dummy_config):
    behavior_spec = mb.setup_test_behavior_specs(
        True,
        False,
        vector_action_space=DISCRETE_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    mock_brain_name = "MockBrain"
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(mock_brain_name)
    trainer = PPOTrainer("test_brain", 0, dummy_config, True, False, 0, "0")
    policy = trainer.create_policy(behavior_id, behavior_spec)
    trainer.add_policy(behavior_id, policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        observation_shapes=behavior_spec.observation_shapes,
        max_step_complete=True,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that the trainer put the trajectory in the update buffer
    assert trainer.update_buffer.num_experiences == 15

    # Check that GAE worked
    assert (
        "advantages" in trainer.update_buffer
        and "discounted_returns" in trainer.update_buffer
    )

    # Check that the stats are being collected as the episode isn't complete
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent > 0

    # Add a terminal trajectory
    trajectory = make_fake_trajectory(
        length=time_horizon + 1,
        max_step_complete=False,
        observation_shapes=behavior_spec.observation_shapes,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that the stats are reset as the episode is finished
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent == 0
    assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0
def test_process_trajectory(dummy_config):
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[2], vector_obs_space=1
    )
    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # The first policy encountered becomes the policy trained by the wrapped PPO
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)
    trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
    trainer.subscribe_trajectory_queue(trajectory_queue0)

    # The ghost trainer should ignore this queue because it is off-policy
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)
    trajectory_queue1 = AgentManagerQueue(behavior_id_team1)
    trainer.subscribe_trajectory_queue(trajectory_queue1)

    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    trajectory_queue0.put(trajectory)
    trainer.advance()

    # Check that the trainer put the trajectory in the update buffer
    assert trainer.trainer.update_buffer.num_experiences == 15

    trajectory_queue1.put(trajectory)
    trainer.advance()

    # Check that the ghost trainer ignored the off-policy queue
    assert trainer.trainer.update_buffer.num_experiences == 15
    # Check that it emptied the queue
    assert trajectory_queue1.empty()
def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
    mock_brain = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
    )
    trainer_settings = dummy_config
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=12)
        if use_rnn
        else None
    )
    policy = TorchPolicy(0, mock_brain, trainer_settings)
    optimizer = TorchSACOptimizer(policy, trainer_settings)
    return optimizer
def create_test_ppo_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    trainer_settings = attr.evolve(dummy_config)
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = TorchPolicy(0, mock_specs, trainer_settings, "test", False)
    optimizer = TorchPPOOptimizer(policy, trainer_settings)
    return optimizer
def test_resume(dummy_config, tmp_path):
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[2], vector_obs_space=1
    )
    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
    tmp_path = tmp_path.as_posix()

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, tmp_path)
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, tmp_path
    )

    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)

    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)

    trainer.save_model()

    # Make a new trainer, check that the policies are the same
    ppo_trainer2 = PPOTrainer(brain_name, 0, dummy_config, True, True, 0, tmp_path)
    trainer2 = GhostTrainer(
        ppo_trainer2, brain_name, controller, 0, dummy_config, True, tmp_path
    )
    policy = trainer2.create_policy(parsed_behavior_id0, mock_specs)
    trainer2.add_policy(parsed_behavior_id0, policy)

    policy = trainer2.create_policy(parsed_behavior_id1, mock_specs)
    trainer2.add_policy(parsed_behavior_id1, policy)

    trainer1_policy = trainer.get_policy(parsed_behavior_id1.behavior_id)
    trainer2_policy = trainer2.get_policy(parsed_behavior_id1.behavior_id)
    weights = trainer1_policy.get_weights()
    weights2 = trainer2_policy.get_weights()

    for w, lw in zip(weights, weights2):
        np.testing.assert_array_equal(w, lw)
def test_sac_trainer_update_normalization(sac_config):
    behavior_id_team0 = "test_brain?team=0"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[2], vector_obs_space=1
    )
    base_config = sac_config.behaviors
    output_path = "results_dir"
    train_model = True
    load_model = False
    seed = 42
    trainer_factory = TrainerFactory(
        trainer_config=base_config,
        output_path=output_path,
        train_model=train_model,
        load_model=load_model,
        seed=seed,
        param_manager=EnvironmentParameterManager(),
    )
    sac_trainer = trainer_factory.generate(brain_name)
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    policy = sac_trainer.create_policy(parsed_behavior_id0, mock_specs)
    sac_trainer.add_policy(parsed_behavior_id0, policy)
    trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
    sac_trainer.subscribe_trajectory_queue(trajectory_queue0)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        action_spec=mock_specs.action_spec,
    )
    trajectory_queue0.put(trajectory)
    # mocking out update_normalization in both the policy and critic
    with patch(
        "mlagents.trainers.torch.networks.ValueNetwork.update_normalization"
    ) as optimizer_update_normalization_mock, patch(
        "mlagents.trainers.policy.torch_policy.TorchPolicy.update_normalization"
    ) as policy_update_normalization_mock:
        sac_trainer.advance()
        optimizer_update_normalization_mock.assert_called_once()
        policy_update_normalization_mock.assert_called_once()
def test_step_overflow():
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    policy = TFPolicy(
        0,
        behavior_spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        create_tf_graph=False,
    )
    policy.create_input_placeholders()
    policy.initialize()

    policy.set_step(2**31 - 1)
    assert policy.get_current_step() == 2**31 - 1
    policy.increment_step(3)
    assert policy.get_current_step() == 2**31 + 2
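# Why this matters: 2**31 - 1 is the largest signed 32-bit integer, so a step
# counter stored as int32 would wrap to a negative value on the next increment.
# A minimal numpy sketch of the failure mode the 64-bit counter avoids:
import numpy as np

counter = np.array([2**31 - 1], dtype=np.int32)
wrapped = counter + 3  # int32 arithmetic wraps around instead of reaching 2**31 + 2
assert wrapped[0] < 0

counter64 = counter.astype(np.int64)
assert (counter64 + 3)[0] == 2**31 + 2  # a 64-bit counter increments correctly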
def create_test_poca_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    trainer_settings = attr.evolve(dummy_config)
    trainer_settings.reward_signals = {
        RewardSignalType.EXTRINSIC: RewardSignalSettings(strength=1.0, gamma=0.99)
    }
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=8, memory_size=10)
        if use_rnn
        else None
    )
    policy = TorchPolicy(0, mock_specs, trainer_settings, "test", False)
    optimizer = TorchPOCAOptimizer(policy, trainer_settings)
    return optimizer
def _create_ppo_optimizer_ops_mock(dummy_config, use_rnn, use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    trainer_settings = attr.evolve(dummy_config, framework=FrameworkType.TENSORFLOW)
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = TFPolicy(
        0, mock_specs, trainer_settings, "test", False, create_tf_graph=False
    )
    optimizer = PPOOptimizer(policy, trainer_settings)
    policy.initialize()
    return optimizer
def test_large_normalization():
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    # Taken from Walker seed 3713 which causes NaN without proper initialization
    large_obs1 = [
        1800.00036621, 1799.96972656, 1800.01245117, 1800.07214355, 1800.02758789,
        1799.98303223, 1799.88647461, 1799.89575195, 1800.03479004, 1800.14025879,
        1800.17675781, 1800.20581055, 1800.33740234, 1800.36450195, 1800.43457031,
        1800.45544434, 1800.44604492, 1800.56713867, 1800.73901367,
    ]
    large_obs2 = [
        1799.99975586, 1799.96679688, 1799.92980957, 1799.89550781, 1799.93774414,
        1799.95300293, 1799.94067383, 1799.92993164, 1799.84057617, 1799.69873047,
        1799.70605469, 1799.82849121, 1799.85095215, 1799.76977539, 1799.78283691,
        1799.76708984, 1799.67163086, 1799.59191895, 1799.5135498, 1799.45556641,
        1799.3717041,
    ]
    policy = TFPolicy(
        0,
        behavior_spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        "testdir",
        False,
    )
    time_horizon = len(large_obs1)
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    for i in range(time_horizon):
        trajectory.steps[i].obs[0] = np.array([large_obs1[i]], dtype=np.float32)
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance are correct
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )

    assert mean[0] == pytest.approx(np.mean(large_obs1, dtype=np.float32), abs=0.01)
    assert variance[0] / steps == pytest.approx(
        np.var(large_obs1, dtype=np.float32), abs=0.01
    )

    time_horizon = len(large_obs2)
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    for i in range(time_horizon):
        trajectory.steps[i].obs[0] = np.array([large_obs2[i]], dtype=np.float32)
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )

    assert mean[0] == pytest.approx(
        np.mean(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
    )
    assert variance[0] / steps == pytest.approx(
        np.var(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
    )
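# Why large observations need careful initialization: with values near 1800,
# float32 cannot represent the tiny spread once the values are squared, so the
# naive variance formula E[x^2] - E[x]^2 cancels catastrophically. A minimal
# sketch of the failure mode (illustrative values; this is not the trainer's
# actual update rule):
import numpy as np

obs = np.array([1800.0, 1799.9, 1800.1, 1800.05], dtype=np.float32)
naive = np.mean(obs * obs) - np.mean(obs) ** 2  # cancels in float32
stable = np.mean((obs - np.mean(obs)) ** 2)  # two-pass, numerically stable
# naive collapses to 0.0 (or one float32 ulp, possibly negative, at this
# magnitude), nowhere near the true variance of ~0.0055; if it goes negative,
# np.sqrt(naive) is NaN. stable stays close to the true value.
print(naive, stable)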
def test_publish_queue(dummy_config):
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[1], vector_obs_space=8
    )
    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    brain_name = parsed_behavior_id0.brain_name

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # The first policy encountered becomes the policy trained by the wrapped PPO.
    # This queue should remain empty after the snapshot swap.
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)
    policy_queue0 = AgentManagerQueue(behavior_id_team0)
    trainer.publish_policy_queue(policy_queue0)

    # The ghost trainer should use this queue for the ghost policy swap
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)
    policy_queue1 = AgentManagerQueue(behavior_id_team1)
    trainer.publish_policy_queue(policy_queue1)

    # Check that a ghost trainer swap pushes to the ghost queue, not the trainer's
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer._swap_snapshots()
    assert policy_queue0.empty() and not policy_queue1.empty()
    # clear
    policy_queue1.get_nowait()

    mock_specs = mb.setup_test_behavior_specs(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_specs)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]
    trainer.trainer.update_buffer = buffer

    # When the ghost trainer advances and the wrapped trainer's buffer is full,
    # the wrapped trainer pushes the updated policy to the correct queue
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer.advance()
    assert not policy_queue0.empty() and policy_queue1.empty()
def test_publish_queue(dummy_config):
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[1], vector_obs_space=8
    )
    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    brain_name = parsed_behavior_id0.brain_name

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # The first policy encountered becomes the policy trained by the wrapped PPO.
    # This queue should remain empty after the snapshot swap.
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)
    policy_queue0 = AgentManagerQueue(behavior_id_team0)
    trainer.publish_policy_queue(policy_queue0)

    # The ghost trainer should use this queue for the ghost policy swap
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)
    policy_queue1 = AgentManagerQueue(behavior_id_team1)
    trainer.publish_policy_queue(policy_queue1)

    # Check that a ghost trainer swap pushes to the ghost queue, not the trainer's
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer._swap_snapshots()
    assert policy_queue0.empty() and not policy_queue1.empty()
    # clear
    policy_queue1.get_nowait()

    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_specs)
    # Mock out reward signal eval
    copy_buffer_fields(
        buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.rewards_key("extrinsic"),
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.rewards_key("curiosity"),
            RewardSignalUtil.returns_key("curiosity"),
            RewardSignalUtil.value_estimates_key("curiosity"),
        ],
    )
    trainer.trainer.update_buffer = buffer

    # When the ghost trainer advances and the wrapped trainer's buffer is full,
    # the wrapped trainer pushes the updated policy to the correct queue
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer.advance()
    assert not policy_queue0.empty() and policy_queue1.empty()
def test_advance(dummy_config):
    specs = setup_test_behavior_specs(
        use_discrete=False, use_visual=False, vector_action_space=2
    )
    dummy_config.hyperparameters.steps_per_update = 20
    dummy_config.hyperparameters.reward_signal_steps_per_update = 20
    dummy_config.hyperparameters.buffer_init_steps = 0
    trainer = SACTrainer("test", 0, dummy_config, True, False, 0, "0")
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
    policy = trainer.create_policy(behavior_id, specs)
    trainer.add_policy(behavior_id, policy)

    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)

    trajectory = make_fake_trajectory(
        length=15,
        observation_shapes=specs.observation_shapes,
        max_step_complete=True,
        action_space=2,
        is_discrete=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that the trainer put the trajectory in the update buffer
    assert trainer.update_buffer.num_experiences == 15

    # Check that the stats are being collected as the episode isn't complete
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent > 0

    # Add a terminal trajectory
    trajectory = make_fake_trajectory(
        length=6,
        observation_shapes=specs.observation_shapes,
        max_step_complete=False,
        action_space=2,
        is_discrete=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that the stats are reset as the episode is finished
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent == 0
    assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0
    # Assert we're not just using the default values
    assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").mean > 0

    # Make sure there is a policy on the queue
    policy_queue.get_nowait()

    # Add another trajectory. The 26 steps so far are fewer than the 40 needed
    # for two updates, so there should NOT be a new policy on the queue.
    trajectory = make_fake_trajectory(
        length=5,
        observation_shapes=specs.observation_shapes,
        max_step_complete=False,
        action_space=2,
        is_discrete=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()
    with pytest.raises(AgentManagerQueue.Empty):
        policy_queue.get_nowait()

    # Call add_policy and check that we update the correct number of times.
    # This is to emulate a load from checkpoint.
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
    policy = trainer.create_policy(behavior_id, specs)
    policy.get_current_step = lambda: 200
    trainer.add_policy(behavior_id, policy)
    trainer.saver.initialize_or_load(policy)
    trainer.optimizer.update = mock.Mock()
    trainer.optimizer.update_reward_signals = mock.Mock()
    trainer.optimizer.update_reward_signals.return_value = {}
    trainer.optimizer.update.return_value = {}
    trajectory_queue.put(trajectory)
    trainer.advance()
    # Make sure we did exactly 1 update
    assert trainer.optimizer.update.call_count == 1
    assert trainer.optimizer.update_reward_signals.call_count == 1
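# A back-of-the-envelope check of the update cadence assumed above (a sketch;
# the real gating lives inside SACTrainer). With steps_per_update = 20, the
# number of completed updates after N steps is roughly floor(N / 20):
def expected_updates(total_steps: int, steps_per_update: int = 20) -> int:
    # hypothetical helper, for illustration only; approximates the trainer's gating
    return total_steps // steps_per_update

assert expected_updates(15) == 0  # first trajectory: no update, no policy push
assert expected_updates(21) == 1  # after the terminal trajectory: one update
assert expected_updates(26) == 1  # 5 more steps: still only one update
assert expected_updates(41) == 2  # a second policy push is due past 40 steps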