def test_agentprocessor(num_vis_obs):
    """AgentProcessor splits incoming experiences into trajectories of
    ``max_trajectory_length`` and pushes them to its published queue.

    Parametrized by ``num_vis_obs``: the number of (84, 84, 3) visual
    observations appended after the (8,) vector observation.
    """
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    # Two agents, each with one continuous action and matching log-probs.
    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1], [0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1], [0.1]])),
    }
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=2,
        sensor_specs=create_sensor_specs_with_shapes(
            [(8, )] + num_vis_obs * [(84, 84, 3)]),
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1], [0.1]])),
        value=[0.1, 0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_steps.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty())
    # Five more steps fills one max-length trajectory per agent.
    for _ in range(5):
        processor.add_experiences(
            mock_decision_steps, mock_terminal_steps, 0, fake_action_info)

    # Assert that two trajectories have been added to the Trainer
    # (one per agent — TODO confirm queue.put is once per agent).
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5

    # Assert that the AgentProcessor is empty
    assert len(processor.experience_buffers[0]) == 0

    # Test empty steps
    mock_decision_steps, mock_terminal_steps = mb.create_mock_steps(
        num_agents=0,
        sensor_specs=create_sensor_specs_with_shapes(
            [(8, )] + num_vis_obs * [(84, 84, 3)]),
        action_spec=ActionSpec.create_continuous(2),
    )
    processor.add_experiences(
        mock_decision_steps, mock_terminal_steps, 0, ActionInfo.empty())
    # Assert that the AgentProcessor is still empty
    assert len(processor.experience_buffers[0]) == 0
def test_create_inputs(encoder_type, normalize, num_vector, num_visual):
    """create_input_processors returns one encoder per observation with the
    expected classes and a total embedding size of
    num_visual * h_size + vector_size * num_vector."""
    vector_shape = (5, )
    visual_shape = (84, 84, 3)
    hidden = 128
    shape_list = [vector_shape] * num_vector + [visual_shape] * num_visual
    sensor_specs = create_sensor_specs_with_shapes(shape_list)
    encoders, embedding_sizes = ModelUtils.create_input_processors(
        sensor_specs, hidden, encoder_type, normalize)
    # Partition the encoders by the rank of their observation shape.
    vector_encoders = [
        enc for shape, enc in zip(shape_list, encoders) if len(shape) == 1
    ]
    visual_encoders = [
        enc for shape, enc in zip(shape_list, encoders) if len(shape) != 1
    ]
    assert len(vector_encoders) == num_vector
    assert len(visual_encoders) == num_visual
    assert sum(embedding_sizes) == int(
        num_visual * hidden + vector_shape[0] * num_vector)
    if num_vector > 0:
        assert isinstance(vector_encoders[0], VectorInput)
    for enc in visual_encoders:
        assert isinstance(enc, ModelUtils.get_encoder_for_type(encoder_type))
def test_valuenetwork():
    """Fit a ValueNetwork toward constant all-ones targets and check shapes
    plus convergence on every value stream."""
    torch.manual_seed(0)
    obs_dim = 4
    outputs_per_stream = 2
    settings = NetworkSettings()
    specs = create_sensor_specs_with_shapes([(obs_dim, )])
    streams = [f"stream_name{n}" for n in range(4)]
    net = ValueNetwork(
        streams, specs, settings, outputs_per_stream=outputs_per_stream)
    opt = torch.optim.Adam(net.parameters(), lr=3e-3)
    for _ in range(50):
        obs = torch.ones((1, obs_dim))
        values, _ = net([obs])
        loss = 0
        for name in streams:
            assert values[name].shape == (1, outputs_per_stream)
            # Drive every stream's output toward the all-ones target.
            loss += torch.nn.functional.mse_loss(
                values[name], torch.ones((1, outputs_per_stream)))
        opt.zero_grad()
        loss.backward()
        opt.step()
    # After training, each stream's first output should be close to 1.
    for value in values.values():
        for row in value:
            assert row[0] == pytest.approx(1.0, abs=0.1)
def test_trajectory_to_agentbuffer():
    """A trajectory converted to an AgentBuffer exposes exactly the expected
    keys, and every field has one entry per step."""
    length = 15
    expected_keys = {
        "next_obs_0",
        "next_obs_1",
        "obs_0",
        "obs_1",
        "memory",
        "masks",
        "done",
        "continuous_action",
        "discrete_action",
        "continuous_log_probs",
        "discrete_log_probs",
        "action_mask",
        "prev_action",
        "environment_rewards",
    }
    trajectory = make_fake_trajectory(
        length=length,
        sensor_specs=create_sensor_specs_with_shapes([(VEC_OBS_SIZE,), (84, 84, 3)]),
        action_spec=ActionSpec.create_continuous(ACTION_SIZE),
    )
    buffer = trajectory.to_agentbuffer()
    observed_keys = set()
    for key, field in buffer.items():
        assert len(field) == length
        observed_keys.add(key)
    assert observed_keys == expected_keys
def test_networkbody_visual():
    """Train a NetworkBody with one vector and one visual observation until
    its encoding approaches all ones."""
    torch.manual_seed(0)
    vec_dim = 4
    vis_shape = (84, 84, 3)
    settings = NetworkSettings()
    body = NetworkBody(
        create_sensor_specs_with_shapes([(vec_dim, ), vis_shape]), settings)
    opt = torch.optim.Adam(body.parameters(), lr=3e-3)
    vis_obs = 0.1 * torch.ones((1, 84, 84, 3))
    vec_obs = torch.ones((1, vec_dim))
    obs = [vec_obs] + [vis_obs]
    for _ in range(150):
        encoded, _ = body(obs)
        assert encoded.shape == (1, settings.hidden_units)
        # Regress the encoding toward the all-ones target.
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        opt.zero_grad()
        loss.backward()
        opt.step()
    # After the final step every element should be close to 1.
    for element in encoded.flatten():
        assert element == pytest.approx(1.0, abs=0.1)
def create_mock_group_spec(
    number_visual_observations=0,
    vector_action_space_type="continuous",
    vector_observation_space_size=3,
    vector_action_space_size=None,
):
    """
    Create a mock BehaviorSpec with the given observation and action layout.

    ``vector_action_space_size`` defaults to None instead of a mutable
    sequence; it is resolved to a sensible default here.
    """
    if vector_action_space_type == "continuous":
        # Continuous: a single action size (first element if a sequence given).
        size = 2 if vector_action_space_size is None else vector_action_space_size[0]
        action_spec = ActionSpec.create_continuous(size)
    else:
        # Discrete: a tuple of branch sizes.
        branches = ((2, ) if vector_action_space_size is None
                    else tuple(vector_action_space_size))
        action_spec = ActionSpec.create_discrete(branches)
    obs_shapes = [(vector_observation_space_size, )]
    obs_shapes.extend([(8, 8, 3)] * number_visual_observations)
    sen_spec = create_sensor_specs_with_shapes(obs_shapes)
    return BehaviorSpec(sen_spec, action_spec)
def test_batched_step_result_from_proto():
    """Agent protos convert into DecisionSteps/TerminalSteps with per-agent
    rewards, a complete id partition, interrupted flags, and obs shapes."""
    agent_count = 10
    shapes = [(3, ), (4, )]
    spec = BehaviorSpec(
        create_sensor_specs_with_shapes(shapes), ActionSpec.create_continuous(3)
    )
    proto_list = generate_list_agent_proto(agent_count, shapes)
    decision_steps, terminal_steps = steps_from_proto(proto_list, spec)
    for agent_id in range(agent_count):
        # generate_list_agent_proto sets each reward equal to the agent id.
        if agent_id in decision_steps:
            assert decision_steps[agent_id].reward == agent_id
        elif agent_id in terminal_steps:
            assert terminal_steps[agent_id].reward == agent_id
        else:
            raise Exception("Missing agent from the steps")
    # Each agent id appears exactly once across the two step collections.
    combined = sorted(
        list(decision_steps.agent_id) + list(terminal_steps.agent_id))
    assert combined == list(range(agent_count))
    for agent_id in range(agent_count):
        # Even agents terminate; every fourth agent is interrupted.
        assert (agent_id in terminal_steps) == (agent_id % 2 == 0)
        if agent_id in terminal_steps:
            assert terminal_steps[agent_id].interrupted == (agent_id % 4 == 0)
    for steps in (decision_steps, terminal_steps):
        assert steps.obs[0].shape[1] == shapes[0][0]
        assert steps.obs[1].shape[1] == shapes[1][0]
def _make_sensor_specs(self) -> SensorSpec:
    """Build sensor specs: ``num_vector`` 1-D vector observation shapes
    followed by ``num_visual`` copies of the visual observation shape."""
    shapes: List[Any] = [(self.vec_obs_size, )] * self.num_vector
    shapes += [self.vis_obs_size] * self.num_visual
    return create_sensor_specs_with_shapes(shapes)
def test_empty_terminal_steps():
    """TerminalSteps.empty yields zero agents while keeping each
    observation's trailing shape."""
    behavior_spec = BehaviorSpec(
        sensor_specs=create_sensor_specs_with_shapes([(3, 2), (5, )]),
        action_spec=ActionSpec.create_continuous(3),
    )
    empty_steps = TerminalSteps.empty(behavior_spec)
    assert len(empty_steps.obs) == 2
    assert empty_steps.obs[0].shape == (0, 3, 2)
    assert empty_steps.obs[1].shape == (0, 5)
def test_action_masking_continuous():
    """Continuous action specs never produce an action mask."""
    agent_count = 10
    shapes = [(3, ), (4, )]
    behavior_spec = BehaviorSpec(
        create_sensor_specs_with_shapes(shapes),
        ActionSpec.create_continuous(10),
    )
    proto_list = generate_list_agent_proto(agent_count, shapes)
    decision_steps, terminal_steps = steps_from_proto(proto_list, behavior_spec)
    assert decision_steps.action_mask is None
def test_empty_decision_steps():
    """DecisionSteps.empty yields zero agents while keeping each
    observation's trailing shape."""
    behavior_spec = BehaviorSpec(
        sensor_specs=create_sensor_specs_with_shapes([(3, 2), (5, )]),
        action_spec=ActionSpec.create_continuous(3),
    )
    empty_steps = DecisionSteps.empty(behavior_spec)
    assert len(empty_steps.obs) == 2
    assert empty_steps.obs[0].shape == (0, 3, 2)
    assert empty_steps.obs[1].shape == (0, 5)
def test_actor_critic(ac_type, lstm):
    """Exercise an actor-critic (``ac_type``) with and without LSTM memory:
    critic_pass and get_action_stats_and_value must return tensors of the
    expected shapes for a hybrid (continuous + discrete) action spec.
    """
    obs_size = 4
    network_settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings() if lstm else None,
        normalize=True)
    sen_spec = create_sensor_specs_with_shapes([(obs_size, )])
    act_size = 2
    # Mask covers both discrete branches (act_size branches of size act_size).
    mask = torch.ones([1, act_size * 2])
    stream_names = [f"stream_name{n}" for n in range(4)]
    # action_spec = ActionSpec.create_continuous(act_size[0])
    # Hybrid spec: act_size continuous dims + act_size discrete branches.
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    actor = ac_type(sen_spec, network_settings, action_spec, stream_names)
    if lstm:
        sample_obs = torch.ones(
            (1, network_settings.memory.sequence_length, obs_size))
        memories = torch.ones(
            (1, network_settings.memory.sequence_length, actor.memory_size))
    else:
        sample_obs = torch.ones((1, obs_size))
        memories = torch.tensor([])
        # memories isn't always set to None, the network should be able to
        # deal with that.
    # Test critic pass
    value_out, memories_out = actor.critic_pass([sample_obs], memories=memories)
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (
                network_settings.memory.sequence_length, )
            assert memories_out.shape == memories.shape
        else:
            assert value_out[stream].shape == (1, )
    # Test get action stats and_value
    action, log_probs, entropies, value_out, mem_out = actor.get_action_stats_and_value(
        [sample_obs], memories=memories, masks=mask)
    if lstm:
        # NOTE(review): 64 appears to be the default sequence_length of
        # MemorySettings — confirm against the settings class.
        assert action.continuous_tensor.shape == (64, 2)
    else:
        assert action.continuous_tensor.shape == (1, 2)
    assert len(action.discrete_list) == 2
    for _disc in action.discrete_list:
        if lstm:
            assert _disc.shape == (64, 1)
        else:
            assert _disc.shape == (1, 1)
    if mem_out is not None:
        assert mem_out.shape == memories.shape
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (
                network_settings.memory.sequence_length, )
        else:
            assert value_out[stream].shape == (1, )
def test_batched_step_result_from_proto_raises_on_nan():
    """NaN observations in agent protos must raise a RuntimeError during
    conversion to steps."""
    agent_count = 10
    shapes = [(3, ), (4, )]
    behavior_spec = BehaviorSpec(
        create_sensor_specs_with_shapes(shapes), ActionSpec.create_continuous(3)
    )
    proto_list = generate_list_agent_proto(
        agent_count, shapes, nan_observations=True)
    with pytest.raises(RuntimeError):
        steps_from_proto(proto_list, behavior_spec)
def test_end_episode():
    """AgentProcessor.end_episode clears all per-agent bookkeeping and
    removes the previous actions of every agent seen, across workers."""
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    # Single agent with one continuous action.
    fake_action_outputs = {
        "action": ActionTuple(continuous=np.array([[0.1]])),
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "log_probs": LogProbsTuple(continuous=np.array([[0.1]])),
    }
    mock_decision_step, mock_terminal_step = mb.create_mock_steps(
        num_agents=1,
        sensor_specs=create_sensor_specs_with_shapes([(8,)]),
        action_spec=ActionSpec.create_continuous(2),
    )
    fake_action_info = ActionInfo(
        action=ActionTuple(continuous=np.array([[0.1]])),
        env_action=ActionTuple(continuous=np.array([[0.1]])),
        value=[0.1],
        outputs=fake_action_outputs,
        agent_ids=mock_decision_step.agent_id,
    )
    processor.publish_trajectory_queue(tqueue)
    # This is like the initial state after the env reset
    processor.add_experiences(
        mock_decision_step, mock_terminal_step, 0, ActionInfo.empty()
    )
    # Run 3 trajectories, with different workers (to simulate different agents)
    remove_calls = []
    for _ep in range(3):
        remove_calls.append(mock.call([get_global_agent_id(_ep, 0)]))
        for _ in range(5):
            processor.add_experiences(
                mock_decision_step, mock_terminal_step, _ep, fake_action_info
            )
            # Make sure we don't add experiences from the prior agents after the done

    # Call end episode
    processor.end_episode()
    # Check that we removed every agent
    policy.remove_previous_action.assert_has_calls(remove_calls)
    # Check that there are no experiences left
    assert len(processor.experience_buffers.keys()) == 0
    assert len(processor.last_take_action_outputs.keys()) == 0
    assert len(processor.episode_steps.keys()) == 0
    assert len(processor.episode_rewards.keys()) == 0
def test_summary_checkpoint(mock_add_checkpoint, mock_write_summary):
    """While trajectories are consumed, write_summary and checkpoint saving
    must fire at every summary_freq / checkpoint_interval step boundary."""
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    summary_freq = trainer.trainer_settings.summary_freq
    checkpoint_interval = trainer.trainer_settings.checkpoint_interval
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        sensor_specs=create_sensor_specs_with_shapes([(1, )]),
        max_step_complete=True,
        action_spec=ActionSpec.create_discrete((2, )),
    )
    # Check that we can turn off the trainer and that the buffer is cleared
    num_trajectories = 5
    for _ in range(0, num_trajectories):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that we have called write_summary the appropriate number of times
    calls = [
        mock.call(step)
        for step in range(
            summary_freq, num_trajectories * time_horizon, summary_freq)
    ]
    mock_write_summary.assert_has_calls(calls, any_order=True)

    # Checkpoints should have been saved at each checkpoint_interval boundary.
    checkpoint_range = range(
        checkpoint_interval, num_trajectories * time_horizon, checkpoint_interval)
    calls = [mock.call(trainer.brain_name, step) for step in checkpoint_range]

    trainer.model_saver.save_checkpoint.assert_has_calls(calls, any_order=True)
    export_ext = "onnx"

    # Each checkpoint registration carries the expected model path and the
    # configured number of checkpoints to keep.
    add_checkpoint_calls = [
        mock.call(
            trainer.brain_name,
            ModelCheckpoint(
                step,
                f"{trainer.model_saver.model_path}{os.path.sep}{trainer.brain_name}-{step}.{export_ext}",
                None,
                mock.ANY,
            ),
            trainer.trainer_settings.keep_checkpoints,
        )
        for step in checkpoint_range
    ]
    mock_add_checkpoint.assert_has_calls(add_checkpoint_calls)
def test_action_masking_discrete_1():
    """A single discrete branch of size 10 yields one mask array with a row
    per decision-step agent (half of the agents — the other half appear in
    terminal_steps) and 10 columns.
    """
    n_agents = 10
    shapes = [(3, ), (4, )]
    behavior_spec = BehaviorSpec(create_sensor_specs_with_shapes(shapes),
                                 ActionSpec.create_discrete((10, )))
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
    masks = decision_steps.action_mask
    assert isinstance(masks, list)
    assert len(masks) == 1
    # FIX: use integer division — the row count is an int; `n_agents / 2`
    # only passed because tuple equality treats 5.0 == 5.
    assert masks[0].shape == (n_agents // 2, 10)
    assert masks[0][0, 0]
def setup_test_behavior_specs(use_discrete=True,
                              use_visual=False,
                              vector_action_space=2,
                              vector_obs_space=8):
    """Build a BehaviorSpec for tests.

    :param use_discrete: build a discrete action spec instead of continuous.
    :param use_visual: prepend an (84, 84, 3) visual observation.
    :param vector_action_space: for discrete, an iterable of branch sizes
        (a bare int is treated as one branch of that size); for continuous,
        the number of continuous actions.
    :param vector_obs_space: size of the 1-D vector observation.
    :return: the assembled BehaviorSpec.
    """
    if use_discrete:
        # FIX: the default value 2 is an int, and tuple(2) raises TypeError.
        # Wrap scalars into a single-branch tuple; sequences pass through
        # unchanged, so existing list/tuple callers are unaffected.
        if isinstance(vector_action_space, int):
            branches = (vector_action_space, )
        else:
            branches = tuple(vector_action_space)
        action_spec = ActionSpec.create_discrete(branches)
    else:
        action_spec = ActionSpec.create_continuous(vector_action_space)
    observation_shapes = [(84, 84, 3)] * int(use_visual) + [
        (vector_obs_space, )
    ]
    sen_spec = create_sensor_specs_with_shapes(observation_shapes)
    behavior_spec = BehaviorSpec(sen_spec, action_spec)
    return behavior_spec
def test_process_trajectory(dummy_config):
    """GhostTrainer only consumes trajectories from the queue of the team
    whose policy is wrapped for training; the off-policy team's queue is
    drained but its trajectories are ignored."""
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[2], vector_obs_space=1
    )
    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0).brain_name

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # first policy encountered becomes policy trained by wrapped PPO
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)
    trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
    trainer.subscribe_trajectory_queue(trajectory_queue0)

    # Ghost trainer should ignore this queue because off policy
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)
    trajectory_queue1 = AgentManagerQueue(behavior_id_team1)
    trainer.subscribe_trajectory_queue(trajectory_queue1)

    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        sensor_specs=create_sensor_specs_with_shapes([(1,)]),
        action_spec=mock_specs.action_spec,
    )
    trajectory_queue0.put(trajectory)
    trainer.advance()

    # Check that trainer put trajectory in update buffer
    assert trainer.trainer.update_buffer.num_experiences == 15

    trajectory_queue1.put(trajectory)
    trainer.advance()

    # Check that ghost trainer ignored off policy queue
    assert trainer.trainer.update_buffer.num_experiences == 15
    # Check that it emptied the queue
    assert trajectory_queue1.empty()
def test_advance(mocked_clear_update_buffer, mocked_save_model):
    """RLTrainer.advance consumes trajectories, advances get_step, pushes
    policies only while the policy is updating, and clears the buffer once
    training should stop."""
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        sensor_specs=create_sensor_specs_with_shapes([(1, )]),
        max_step_complete=True,
        action_spec=ActionSpec.create_discrete((2, )),
    )
    trajectory_queue.put(trajectory)
    trainer.advance()
    policy_queue.get_nowait()
    # Check that get_step is correct
    assert trainer.get_step == time_horizon
    # Check that we can turn off the trainer and that the buffer is cleared
    for _ in range(0, 5):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that if the policy doesn't update, we don't push it to the queue
    trainer.set_is_policy_updating(False)
    for _ in range(0, 10):
        trajectory_queue.put(trajectory)
        trainer.advance()
    # Check that there nothing in the policy queue
    with pytest.raises(AgentManagerQueue.Empty):
        policy_queue.get_nowait()

    # Check that the buffer has been cleared
    assert not trainer.should_still_train
    assert mocked_clear_update_buffer.call_count > 0
    assert mocked_save_model.call_count == 0
def test_networkbody_lstm():
    """Train an LSTM-backed NetworkBody on a constant sequence until its
    encoding approaches all ones."""
    torch.manual_seed(0)
    obs_dim = 4
    seq_len = 16
    settings = NetworkSettings(memory=NetworkSettings.MemorySettings(
        sequence_length=seq_len, memory_size=12))
    body = NetworkBody(
        create_sensor_specs_with_shapes([(obs_dim, )]), settings)
    opt = torch.optim.Adam(body.parameters(), lr=3e-4)
    obs = torch.ones((1, seq_len, obs_dim))
    for _ in range(200):
        encoded, _ = body([obs], memories=torch.ones(1, seq_len, 12))
        # Regress the encoding toward the all-ones target.
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        opt.zero_grad()
        loss.backward()
        opt.step()
    # After the final step every element should be close to 1.
    for element in encoded.flatten():
        assert element == pytest.approx(1.0, abs=0.1)
CONTINUOUS_PATH = (os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir) + "/test.demo") DISCRETE_PATH = (os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir) + "/testdcvis.demo") SEED = [42] ACTIONSPEC_CONTINUOUS = ActionSpec.create_continuous(2) ACTIONSPEC_FOURDISCRETE = ActionSpec.create_discrete((2, 3, 3, 3)) ACTIONSPEC_DISCRETE = ActionSpec.create_discrete((20, )) @pytest.mark.parametrize( "behavior_spec", [ BehaviorSpec(create_sensor_specs_with_shapes([(8, )]), ACTIONSPEC_CONTINUOUS) ], ) def test_construction(behavior_spec: BehaviorSpec) -> None: gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH) gail_rp = GAILRewardProvider(behavior_spec, gail_settings) assert gail_rp.name == "GAIL" @pytest.mark.parametrize( "behavior_spec", [ BehaviorSpec(create_sensor_specs_with_shapes([(8, )]), ACTIONSPEC_CONTINUOUS) ],
create_reward_provider, ) from mlagents_envs.base_env import BehaviorSpec, ActionSpec from mlagents.trainers.settings import RewardSignalSettings, RewardSignalType from mlagents.trainers.tests.torch.test_reward_providers.utils import ( create_agent_buffer, ) from mlagents.trainers.tests.dummy_config import create_sensor_specs_with_shapes ACTIONSPEC_CONTINUOUS = ActionSpec.create_continuous(5) ACTIONSPEC_TWODISCRETE = ActionSpec.create_discrete((2, 3)) @pytest.mark.parametrize( "behavior_spec", [ BehaviorSpec(create_sensor_specs_with_shapes([(10, )]), ACTIONSPEC_CONTINUOUS), BehaviorSpec(create_sensor_specs_with_shapes([(10, )]), ACTIONSPEC_TWODISCRETE), ], ) def test_construction(behavior_spec: BehaviorSpec) -> None: settings = RewardSignalSettings() settings.gamma = 0.2 extrinsic_rp = ExtrinsicRewardProvider(behavior_spec, settings) assert extrinsic_rp.gamma == 0.2 assert extrinsic_rp.name == "Extrinsic" @pytest.mark.parametrize( "behavior_spec",
from mlagents.trainers.settings import RNDSettings, RewardSignalType from mlagents.trainers.tests.torch.test_reward_providers.utils import ( create_agent_buffer, ) from mlagents.trainers.tests.dummy_config import create_sensor_specs_with_shapes SEED = [42] ACTIONSPEC_CONTINUOUS = ActionSpec.create_continuous(5) ACTIONSPEC_TWODISCRETE = ActionSpec.create_discrete((2, 3)) ACTIONSPEC_DISCRETE = ActionSpec.create_discrete((2, )) @pytest.mark.parametrize( "behavior_spec", [ BehaviorSpec(create_sensor_specs_with_shapes([(10, )]), ACTIONSPEC_CONTINUOUS), BehaviorSpec(create_sensor_specs_with_shapes([(10, )]), ACTIONSPEC_TWODISCRETE), ], ) def test_construction(behavior_spec: BehaviorSpec) -> None: curiosity_settings = RNDSettings(32, 0.01) curiosity_settings.strength = 0.1 curiosity_rp = RNDRewardProvider(behavior_spec, curiosity_settings) assert curiosity_rp.strength == 0.1 assert curiosity_rp.name == "RND" @pytest.mark.parametrize( "behavior_spec",