def test_warning_group_reward(self):
    with self.assertLogs("mlagents.trainers", level="WARN") as cm:
        rl_trainer = create_rl_trainer()
        # This one should warn
        trajectory = mb.make_fake_trajectory(
            length=10,
            observation_specs=create_observation_specs_with_shapes([(1,)]),
            max_step_complete=True,
            action_spec=ActionSpec.create_discrete((2,)),
            group_reward=1.0,
        )
        buff = trajectory.to_agentbuffer()
        rl_trainer._warn_if_group_reward(buff)
        assert len(cm.output) > 0
        len_of_first_warning = len(cm.output)

        rl_trainer = create_rl_trainer()
        # This one shouldn't
        trajectory = mb.make_fake_trajectory(
            length=10,
            observation_specs=create_observation_specs_with_shapes([(1,)]),
            max_step_complete=True,
            action_spec=ActionSpec.create_discrete((2,)),
        )
        buff = trajectory.to_agentbuffer()
        rl_trainer._warn_if_group_reward(buff)
        # Make sure warnings don't get bigger
        assert len(cm.output) == len_of_first_warning

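# Several tests in this section rely on a create_rl_trainer() helper that is
# not shown here. Below is a minimal sketch of what such a helper could look
# like, assuming a FakeTrainer subclass of RLTrainer that stubs out the
# abstract methods. The constructor argument order and the TrainerSettings
# values are assumptions chosen to match the step counts these tests assert,
# not the confirmed API.
from unittest import mock

from mlagents.trainers.settings import TrainerSettings
from mlagents.trainers.trainer.rl_trainer import RLTrainer


class FakeTrainer(RLTrainer):
    def set_is_policy_updating(self, is_updating):
        self.update_policy = is_updating

    def get_policy(self, name_behavior_id):
        return mock.Mock()

    def _is_ready_update(self):
        return True

    def _update_policy(self):
        return self.update_policy

    def add_policy(self, mock_behavior_id, mock_policy):
        self.policies[mock_behavior_id] = mock_policy
        # The checkpoint tests below interrogate trainer.model_saver, so a
        # stand-in mock is attached here (an assumption of this sketch).
        self.model_saver = mock.Mock()
        self.model_saver.model_path = self.artifact_path

    def _process_trajectory(self, trajectory):
        super()._process_trajectory(trajectory)


def create_rl_trainer():
    # max_steps=100, checkpoint_interval=10, summary_freq=20 are hypothetical
    # values consistent with the assertions in the tests in this section.
    trainer = FakeTrainer(
        "test_trainer",
        TrainerSettings(max_steps=100, checkpoint_interval=10, summary_freq=20),
        True,
        False,
        "mock_model_path",
        0,
    )
    trainer.set_is_policy_updating(True)
    return trainer
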
def test_trajectory_to_agentbuffer():
    length = 15
    wanted_keys = [
        "next_visual_obs0",
        "visual_obs0",
        "vector_obs",
        "next_vector_in",
        "memory",
        "masks",
        "done",
        "actions_pre",
        "actions",
        "action_probs",
        "action_mask",
        "prev_action",
        "environment_rewards",
    ]
    wanted_keys = set(wanted_keys)
    trajectory = make_fake_trajectory(
        length=length,
        observation_shapes=[(VEC_OBS_SIZE,), (84, 84, 3)],
        action_space=[ACTION_SIZE],
    )
    agentbuffer = trajectory.to_agentbuffer()
    seen_keys = set()
    for key, field in agentbuffer.items():
        assert len(field) == length
        seen_keys.add(key)
    assert seen_keys == wanted_keys

def test_advance(mocked_clear_update_buffer):
    trainer = create_rl_trainer()
    trajectory_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    time_horizon = 15
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that get_step is correct
    assert trainer.get_step == time_horizon
    # Check that we can turn off the trainer and that the buffer is cleared
    for _ in range(0, 10):
        trajectory_queue.put(trajectory)
        trainer.advance()

    # Check that the buffer has been cleared
    assert not trainer.should_still_train
    assert mocked_clear_update_buffer.call_count > 0

def test_update_buffer_append():
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        max_step_complete=True,
        action_spec=ActionSpec.create_discrete((2,)),
    )
    agentbuffer_trajectory = trajectory.to_agentbuffer()
    assert trainer.update_buffer.num_experiences == 0

    # Check that if we append, our update buffer gets longer.
    # max_steps = 100
    for i in range(10):
        trainer._process_trajectory(trajectory)
        trainer._append_to_update_buffer(agentbuffer_trajectory)
        assert trainer.update_buffer.num_experiences == (i + 1) * time_horizon

    # Check that if we append after stopping training, nothing happens.
    # We process enough trajectories to hit max steps
    trainer.set_is_policy_updating(False)
    trainer._process_trajectory(trajectory)
    trainer._append_to_update_buffer(agentbuffer_trajectory)
    assert trainer.update_buffer.num_experiences == (i + 1) * time_horizon

def test_trajectory_to_agentbuffer():
    length = 15
    wanted_keys = [
        "next_obs_0",
        "next_obs_1",
        "obs_0",
        "obs_1",
        "memory",
        "masks",
        "done",
        "continuous_action",
        "discrete_action",
        "continuous_log_probs",
        "discrete_log_probs",
        "action_mask",
        "prev_action",
        "environment_rewards",
    ]
    wanted_keys = set(wanted_keys)
    trajectory = make_fake_trajectory(
        length=length,
        sensor_specs=create_sensor_specs_with_shapes([(VEC_OBS_SIZE,), (84, 84, 3)]),
        action_spec=ActionSpec.create_continuous(ACTION_SIZE),
    )
    agentbuffer = trajectory.to_agentbuffer()
    seen_keys = set()
    for key, field in agentbuffer.items():
        assert len(field) == length
        seen_keys.add(key)
    assert seen_keys == wanted_keys

def test_trajectory_to_agentbuffer():
    length = 15
    wanted_keys = [
        (ObservationKeyPrefix.OBSERVATION, 0),
        (ObservationKeyPrefix.OBSERVATION, 1),
        (ObservationKeyPrefix.NEXT_OBSERVATION, 0),
        (ObservationKeyPrefix.NEXT_OBSERVATION, 1),
        BufferKey.MEMORY,
        BufferKey.MASKS,
        BufferKey.DONE,
        BufferKey.CONTINUOUS_ACTION,
        BufferKey.DISCRETE_ACTION,
        BufferKey.CONTINUOUS_LOG_PROBS,
        BufferKey.DISCRETE_LOG_PROBS,
        BufferKey.ACTION_MASK,
        BufferKey.PREV_ACTION,
        BufferKey.ENVIRONMENT_REWARDS,
    ]
    wanted_keys = set(wanted_keys)
    trajectory = make_fake_trajectory(
        length=length,
        observation_specs=create_observation_specs_with_shapes(
            [(VEC_OBS_SIZE,), (84, 84, 3)]
        ),
        action_spec=ActionSpec.create_continuous(ACTION_SIZE),
    )
    agentbuffer = trajectory.to_agentbuffer()
    seen_keys = set()
    for key, field in agentbuffer.items():
        assert len(field) == length
        seen_keys.add(key)
    assert seen_keys == wanted_keys

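# The observation_specs-era tests above call create_observation_specs_with_shapes,
# which is not defined in this section. A minimal sketch follows, assuming the
# ObservationSpec, DimensionProperty, and ObservationType types from
# mlagents_envs.base_env; the exact name strings and defaults used by the real
# helper may differ.
from typing import List, Tuple

from mlagents_envs.base_env import (
    DimensionProperty,
    ObservationSpec,
    ObservationType,
)


def create_observation_specs_with_shapes(
    shapes: List[Tuple[int, ...]]
) -> List[ObservationSpec]:
    # Build one spec per shape, with unspecified dimension properties and the
    # default observation type.
    return [
        ObservationSpec(
            name=f"observation {i}",
            shape=shape,
            dimension_property=tuple(
                DimensionProperty.UNSPECIFIED for _ in range(len(shape))
            ),
            observation_type=ObservationType.DEFAULT,
        )
        for i, shape in enumerate(shapes)
    ]
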
def test_summary_checkpoint(mock_add_checkpoint, mock_write_summary):
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    summary_freq = trainer.trainer_settings.summary_freq
    checkpoint_interval = trainer.trainer_settings.checkpoint_interval
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        max_step_complete=True,
        action_spec=ActionSpec.create_discrete((2,)),
    )
    # Check that we can turn off the trainer and that the buffer is cleared
    num_trajectories = 5
    for _ in range(0, num_trajectories):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that we have called write_summary the appropriate number of times
    calls = [
        mock.call(step)
        for step in range(summary_freq, num_trajectories * time_horizon, summary_freq)
    ]
    mock_write_summary.assert_has_calls(calls, any_order=True)

    checkpoint_range = range(
        checkpoint_interval, num_trajectories * time_horizon, checkpoint_interval
    )
    calls = [mock.call(trainer.brain_name, step) for step in checkpoint_range]
    trainer.model_saver.save_checkpoint.assert_has_calls(calls, any_order=True)

    export_ext = "onnx"
    add_checkpoint_calls = [
        mock.call(
            trainer.brain_name,
            ModelCheckpoint(
                step,
                f"{trainer.model_saver.model_path}{os.path.sep}{trainer.brain_name}-{step}.{export_ext}",
                None,
                mock.ANY,
                [
                    f"{trainer.model_saver.model_path}{os.path.sep}{trainer.brain_name}-{step}.pt"
                ],
            ),
            trainer.trainer_settings.keep_checkpoints,
        )
        for step in checkpoint_range
    ]
    mock_add_checkpoint.assert_has_calls(add_checkpoint_calls)

def test_summary_checkpoint(mock_add_checkpoint, mock_write_summary):
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    mock_policy.model_path = "mock_model_path"
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    summary_freq = trainer.trainer_settings.summary_freq
    checkpoint_interval = trainer.trainer_settings.checkpoint_interval
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_shapes=[(1,)],
        max_step_complete=True,
        action_space=[2],
    )
    # Check that we can turn off the trainer and that the buffer is cleared
    num_trajectories = 5
    for _ in range(0, num_trajectories):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that we have called write_summary the appropriate number of times
    calls = [
        mock.call(step)
        for step in range(summary_freq, num_trajectories * time_horizon, summary_freq)
    ]
    mock_write_summary.assert_has_calls(calls, any_order=True)

    checkpoint_range = range(
        checkpoint_interval, num_trajectories * time_horizon, checkpoint_interval
    )
    calls = [
        mock.call(f"{mock_policy.model_path}/{trainer.brain_name}-{step}", mock.ANY)
        for step in checkpoint_range
    ]
    mock_policy.checkpoint.assert_has_calls(calls, any_order=True)

    add_checkpoint_calls = [
        mock.call(
            trainer.brain_name,
            NNCheckpoint(
                step,
                f"{mock_policy.model_path}/{trainer.brain_name}-{step}.nn",
                None,
                mock.ANY,
            ),
            trainer.trainer_settings.keep_checkpoints,
        )
        for step in checkpoint_range
    ]
    mock_add_checkpoint.assert_has_calls(add_checkpoint_calls)

def test_trajectory_to_agentbuffer():
    length = 15
    # These keys should be of type np.ndarray
    wanted_keys = [
        (ObservationKeyPrefix.OBSERVATION, 0),
        (ObservationKeyPrefix.OBSERVATION, 1),
        (ObservationKeyPrefix.NEXT_OBSERVATION, 0),
        (ObservationKeyPrefix.NEXT_OBSERVATION, 1),
        BufferKey.MEMORY,
        BufferKey.MASKS,
        BufferKey.DONE,
        BufferKey.CONTINUOUS_ACTION,
        BufferKey.DISCRETE_ACTION,
        BufferKey.CONTINUOUS_LOG_PROBS,
        BufferKey.DISCRETE_LOG_PROBS,
        BufferKey.ACTION_MASK,
        BufferKey.PREV_ACTION,
        BufferKey.ENVIRONMENT_REWARDS,
        BufferKey.GROUP_REWARD,
    ]
    # These keys should be of type List
    wanted_group_keys = [
        BufferKey.GROUPMATE_REWARDS,
        BufferKey.GROUP_CONTINUOUS_ACTION,
        BufferKey.GROUP_DISCRETE_ACTION,
        BufferKey.GROUP_DONES,
        BufferKey.GROUP_NEXT_CONT_ACTION,
        BufferKey.GROUP_NEXT_DISC_ACTION,
    ]
    wanted_keys = set(wanted_keys + wanted_group_keys)
    trajectory = make_fake_trajectory(
        length=length,
        observation_specs=create_observation_specs_with_shapes(
            [(VEC_OBS_SIZE,), (84, 84, 3)]
        ),
        action_spec=ActionSpec.create_continuous(ACTION_SIZE),
        num_other_agents_in_group=4,
    )
    agentbuffer = trajectory.to_agentbuffer()
    seen_keys = set()
    for key, field in agentbuffer.items():
        assert len(field) == length
        seen_keys.add(key)
    assert seen_keys.issuperset(wanted_keys)

    for _key in wanted_group_keys:
        for step in agentbuffer[_key]:
            assert len(step) == 4

def _compare_two_optimizers(opt1: TorchOptimizer, opt2: TorchOptimizer) -> None:
    trajectory = mb.make_fake_trajectory(
        length=10,
        observation_specs=opt1.policy.behavior_spec.observation_specs,
        action_spec=opt1.policy.behavior_spec.action_spec,
        max_step_complete=True,
    )
    with torch.no_grad():
        _, opt1_val_out, _ = opt1.get_trajectory_value_estimates(
            trajectory.to_agentbuffer(), trajectory.next_obs, done=False
        )
        _, opt2_val_out, _ = opt2.get_trajectory_value_estimates(
            trajectory.to_agentbuffer(), trajectory.next_obs, done=False
        )

    for opt1_val, opt2_val in zip(opt1_val_out.values(), opt2_val_out.values()):
        np.testing.assert_array_equal(opt1_val, opt2_val)

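# Hypothetical usage of _compare_two_optimizers: build two optimizers that are
# expected to produce identical value estimates (e.g. one initialized from the
# other's weights) and compare them. create_test_optimizer is an illustrative
# helper name, not part of the API shown in this section.
#
#     opt1 = create_test_optimizer(dummy_config)  # hypothetical helper
#     opt2 = create_test_optimizer(dummy_config)  # hypothetical helper
#     opt2.policy.load_weights(opt1.policy.get_weights())
#     _compare_two_optimizers(opt1, opt2)
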
def test_sac_trainer_update_normalization(sac_config):
    behavior_id_team0 = "test_brain?team=0"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name

    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[2], vector_obs_space=1
    )
    base_config = sac_config.behaviors
    output_path = "results_dir"
    train_model = True
    load_model = False
    seed = 42
    trainer_factory = TrainerFactory(
        trainer_config=base_config,
        output_path=output_path,
        train_model=train_model,
        load_model=load_model,
        seed=seed,
        param_manager=EnvironmentParameterManager(),
    )
    sac_trainer = trainer_factory.generate(brain_name)
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    policy = sac_trainer.create_policy(parsed_behavior_id0, mock_specs)
    sac_trainer.add_policy(parsed_behavior_id0, policy)
    trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
    sac_trainer.subscribe_trajectory_queue(trajectory_queue0)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_specs=create_observation_specs_with_shapes([(1,)]),
        action_spec=mock_specs.action_spec,
    )
    trajectory_queue0.put(trajectory)

    # mocking out update_normalization in both the policy and critic
    with patch(
        "mlagents.trainers.torch.networks.ValueNetwork.update_normalization"
    ) as optimizer_update_normalization_mock, patch(
        "mlagents.trainers.policy.torch_policy.TorchPolicy.update_normalization"
    ) as policy_update_normalization_mock:
        sac_trainer.advance()
        optimizer_update_normalization_mock.assert_called_once()
        policy_update_normalization_mock.assert_called_once()

def test_advance(mocked_clear_update_buffer, mocked_save_model):
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    mock_policy.model_path = "mock_model_path"
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_shapes=[(1,)],
        max_step_complete=True,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)
    trainer.advance()
    policy_queue.get_nowait()

    # Check that get_step is correct
    assert trainer.get_step == time_horizon
    # Check that we can turn off the trainer and that the buffer is cleared
    for _ in range(0, 5):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that if the policy doesn't update, we don't push it to the queue
    trainer.set_is_policy_updating(False)
    for _ in range(0, 10):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is nothing in the policy queue
        with pytest.raises(AgentManagerQueue.Empty):
            policy_queue.get_nowait()

    # Check that the buffer has been cleared
    assert not trainer.should_still_train
    assert mocked_clear_update_buffer.call_count > 0
    assert mocked_save_model.call_count == 0

def test_poca_end_episode():
    name_behavior_id = "test_trainer"
    trainer = POCATrainer(
        name_behavior_id,
        10,
        TrainerSettings(max_steps=100, checkpoint_interval=10, summary_freq=20),
        True,
        False,
        0,
        "mock_model_path",
    )
    behavior_spec = BehaviorSpec(
        create_observation_specs_with_shapes([(1,)]), ActionSpec.create_discrete((2,))
    )
    parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(name_behavior_id)
    mock_policy = trainer.create_policy(parsed_behavior_id, behavior_spec)
    trainer.add_policy(parsed_behavior_id, mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=behavior_spec.observation_specs,
        max_step_complete=False,
        action_spec=behavior_spec.action_spec,
        num_other_agents_in_group=2,
        group_reward=1.0,
        is_terminal=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()
    # Test that some trajectories have been ingested
    for reward in trainer.collected_group_rewards.values():
        assert reward == 10
    # Test end episode
    trainer.end_episode()
    assert len(trainer.collected_group_rewards.keys()) == 0

def test_summary_checkpoint(mock_write_summary, mock_save_model):
    trainer = create_rl_trainer()
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    summary_freq = trainer.trainer_settings.summary_freq
    checkpoint_interval = trainer.trainer_settings.checkpoint_interval
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    # Check that we can turn off the trainer and that the buffer is cleared
    num_trajectories = 5
    for _ in range(0, num_trajectories):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that we have called write_summary the appropriate number of times
    calls = [
        mock.call(step)
        for step in range(summary_freq, num_trajectories * time_horizon, summary_freq)
    ]
    mock_write_summary.assert_has_calls(calls, any_order=True)

    calls = [
        mock.call(trainer.brain_name)
        for step in range(
            checkpoint_interval, num_trajectories * time_horizon, checkpoint_interval
        )
    ]
    mock_save_model.assert_has_calls(calls, any_order=True)