def test_process_trajectory(dummy_config):
    """Ghost trainer buffers the first team's trajectory and drops the other's.

    Two behaviors ("team=0" and "team=1") are registered with a GhostTrainer
    wrapping a PPOTrainer. A trajectory pushed on the first team's queue must
    land in the wrapped trainer's update buffer; the same trajectory pushed on
    the second team's queue must be consumed without growing the buffer.
    """

    def _team_params(behavior_id):
        # Both teams share every setting except the behavior id.
        return BrainParameters(
            brain_name=behavior_id,
            vector_observation_space_size=1,
            camera_resolutions=[],
            vector_action_space_size=[2],
            vector_action_descriptions=[],
            vector_action_space_type=0,
        )

    brain_params_team0 = _team_params("test_brain?team=0")
    brain_params_team1 = _team_params("test_brain?team=1")
    team0_id = brain_params_team0.brain_name
    team1_id = brain_params_team1.brain_name

    brain_name = BehaviorIdentifiers.from_name_behavior_id(team0_id).brain_name

    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0", False)
    trainer = GhostTrainer(ppo_trainer, brain_name, 0, dummy_config, True, "0")

    # first policy encountered becomes policy trained by wrapped PPO
    trainer.add_policy(team0_id, trainer.create_policy(brain_params_team0))
    trajectory_queue0 = AgentManagerQueue(team0_id)
    trainer.subscribe_trajectory_queue(trajectory_queue0)

    # Ghost trainer should ignore this queue because off policy
    trainer.add_policy(team1_id, trainer.create_policy(brain_params_team1))
    trajectory_queue1 = AgentManagerQueue(team1_id)
    trainer.subscribe_trajectory_queue(trajectory_queue1)

    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    wrapped_trainer = trainer.trainer

    trajectory_queue0.put(trajectory)
    trainer.advance()
    # Check that trainer put trajectory in update buffer
    assert wrapped_trainer.update_buffer.num_experiences == time_horizon

    trajectory_queue1.put(trajectory)
    trainer.advance()
    # Check that ghost trainer ignored off policy queue
    assert wrapped_trainer.update_buffer.num_experiences == time_horizon
    # Check that it emptied the queue
    assert trajectory_queue1.empty()
def test_demo_mismatch():
    """demo_to_buffer must raise RuntimeError for brain parameters that differ.

    Each case changes exactly one field of an otherwise fixed set of brain
    parameters (8 vector observations, no cameras, action size [2], action
    space type 1) and expects ``demo_to_buffer`` to reject ``test.demo``.

    The BrainParameters object is built *outside* the ``pytest.raises`` block
    so that only an error raised by ``demo_to_buffer`` itself can satisfy the
    expectation.
    """
    demo_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test.demo")
    base_kwargs = {
        "brain_name": "test_brain",
        "vector_observation_space_size": 8,
        "camera_resolutions": [],
        "vector_action_space_size": [2],
        "vector_action_descriptions": [],
        "vector_action_space_type": 1,
    }

    def _assert_rejected(**overrides):
        # Construction happens before entering pytest.raises on purpose.
        brain_params = BrainParameters(**{**base_kwargs, **overrides})
        with pytest.raises(RuntimeError):
            demo_to_buffer(demo_path, 1, brain_params)

    # observation mismatch
    _assert_rejected(vector_observation_space_size=9)
    # action mismatch
    _assert_rejected(vector_action_space_size=[3])
    # action type mismatch
    _assert_rejected(vector_action_space_type=0)
    # vis obs mismatch
    _assert_rejected(camera_resolutions=[[30, 40]])
def test_trainer_increment_step(ppo_optimizer, dummy_config):
    """_increment_step forwards the step count to the policy and updates trainer.step."""
    # Replace the optimizer produced by the patched factory with a bare mock
    # that exposes an empty reward_signals mapping.
    optimizer_stub = mock.Mock()
    optimizer_stub.reward_signals = {}
    ppo_optimizer.return_value = optimizer_stub

    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    trainer = PPOTrainer(brain_params.brain_name, 0, dummy_config, True, False, 0, "0")

    # 10 hacked because this function is no longer called through trainer
    step_count = 5
    behavior_id = "testbehavior"

    policy_mock = mock.Mock(spec=NNPolicy)
    policy_mock.get_current_step.return_value = 0
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    trainer.add_policy(behavior_id, policy_mock)

    trainer._increment_step(5, behavior_id)

    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == step_count
def test_normalization(dummy_config):
    """Running normalization statistics follow the trajectories a trainer processes.

    A 6-step trajectory whose first three observations are zeroed is processed
    first, then a 10-step unmodified one; after each, the step counter, running
    mean and running variance read from the policy model are checked.
    """
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)

    def _fake_trajectory(length):
        return make_fake_trajectory(
            length=length,
            max_step_complete=True,
            vec_obs_size=1,
            num_vis_obs=0,
            action_space=2,
        )

    def _read_normalization():
        # Returns (steps, running mean, running variance) from the session.
        model = trainer.policy.model
        return trainer.ppo_policy.sess.run(
            [
                model.normalization_steps,
                model.running_mean,
                model.running_variance,
            ]
        )

    first = _fake_trajectory(6)
    # Change half of the obs to 0
    for experience in first.steps[:3]:
        experience.obs[0] = np.zeros(1, dtype=np.float32)
    trainer.process_trajectory(first)

    # Check that the running mean and variance is correct
    steps, mean, variance = _read_normalization()
    assert steps == 6
    assert mean[0] == 0.5
    # Note: variance is divided by number of steps, and initialized to 1 to avoid
    # divide by 0. The right answer is 0.25
    assert (variance[0] - 1) / steps == 0.25

    # Make another update, this time with all 1's
    trainer.process_trajectory(_fake_trajectory(10))

    # Check that the running mean and variance is correct
    steps, mean, variance = _read_normalization()
    assert steps == 16
    assert mean[0] == 0.8125
    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
def test_normalization(dummy_config):
    """NNPolicy.update_normalization keeps correct running statistics.

    The policy is fed the vector observations of a 6-step trajectory whose
    first three observations are zeroed, then those of a 10-step unmodified
    trajectory. After each update the step counter, running mean and running
    variance are read back from the policy's session and checked.
    """
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["output_path"] = "./results/test_trainer_models/TestModel"

    def _fake_trajectory(length):
        return make_fake_trajectory(
            length=length,
            max_step_complete=True,
            vec_obs_size=1,
            num_vis_obs=0,
            action_space=[2],
        )

    def _update_and_read(policy, trajectory):
        # Push the trajectory's vector observations through the normalizer and
        # return (steps, running mean, running variance).
        trajectory_buffer = trajectory.to_agentbuffer()
        policy.update_normalization(trajectory_buffer["vector_obs"])
        return policy.sess.run(
            [policy.normalization_steps, policy.running_mean, policy.running_variance]
        )

    trajectory = _fake_trajectory(6)
    # Change half of the obs to 0
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)

    # Single assignment: the original bound the name twice (`policy = policy = ...`).
    policy = NNPolicy(0, brain_params, dummy_config, False, False)

    # Check that the running mean and variance is correct
    steps, mean, variance = _update_and_read(policy, trajectory)
    assert steps == 6
    assert mean[0] == 0.5
    # Note: variance is divided by number of steps, and initialized to 1 to avoid
    # divide by 0. The right answer is 0.25
    assert (variance[0] - 1) / steps == 0.25

    # Make another update, this time with all 1's
    steps, mean, variance = _update_and_read(policy, _fake_trajectory(10))
    assert steps == 16
    assert mean[0] == 0.8125
    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
def test_handles_no_default_section(dummy_config):
    """
    Make sure the trainer setup handles a missing "default" in the config.
    """
    brain_name = "testbrain"
    brain_parameters = BrainParameters(
        brain_name=brain_name,
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )

    # The only section present is keyed by the brain name; there is no "default".
    config_without_default = {brain_name: dummy_config["default"]}
    factory = trainer_util.TrainerFactory(
        trainer_config=config_without_default,
        summaries_dir="test_dir",
        run_id="testrun",
        model_path="model_dir",
        keep_checkpoints=1,
        train_model=True,
        load_model=False,
        seed=42,
    )

    # Passing means generate() completes without raising.
    factory.generate(brain_parameters.brain_name)
def test_raise_if_no_config_for_brain(dummy_config):
    """
    Make sure the trainer setup raises a friendlier exception if both "default" and the brain name
    are missing from the config.
    """
    brain_name = "testbrain"
    # Neither "default" nor `brain_name` is a key of this config.
    bad_config = {"some_other_brain": dummy_config["default"]}
    brain_parameters = BrainParameters(
        brain_name=brain_name,
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )

    trainer_factory = trainer_util.TrainerFactory(
        trainer_config=bad_config,
        summaries_dir="test_dir",
        run_id="testrun",
        model_path="model_dir",
        keep_checkpoints=1,
        train_model=True,
        load_model=False,
        seed=42,
    )

    with pytest.raises(TrainerConfigError):
        # Pass the brain *name*, as test_handles_no_default_section does with the
        # same factory; passing the BrainParameters object would make the lookup
        # miss regardless of what the config contains.
        trainer_factory.generate(brain_parameters.brain_name)
def test_process_trajectory(dummy_config):
    """PPO trainer consumes queued trajectories and tracks per-agent rewards.

    A non-terminal trajectory must fill the update buffer (including the
    "advantages" and "discounted_returns" entries) and leave positive
    collected rewards; a following terminal trajectory must reset the
    collected rewards and produce an extrinsic-reward stat.
    """
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
    trainer.add_policy(brain_params.brain_name, trainer.create_policy(brain_params))
    trajectory_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)

    def _feed(length, max_step_complete):
        # Queue one fake trajectory and let the trainer process it.
        trajectory_queue.put(
            make_fake_trajectory(
                length=length,
                max_step_complete=max_step_complete,
                vec_obs_size=1,
                num_vis_obs=0,
                action_space=[2],
            )
        )
        trainer.advance()

    def _per_agent_rewards():
        for per_signal in trainer.collected_rewards.values():
            for agent_reward in per_signal.values():
                yield agent_reward

    time_horizon = 15
    _feed(time_horizon, True)

    # Check that trainer put trajectory in update buffer
    assert trainer.update_buffer.num_experiences == time_horizon

    # Check that GAE worked
    update_buffer = trainer.update_buffer
    assert "advantages" in update_buffer and "discounted_returns" in update_buffer

    # Check that the stats are being collected as episode isn't complete
    for agent_reward in _per_agent_rewards():
        assert agent_reward > 0

    # Add a terminal trajectory
    _feed(time_horizon + 1, False)

    # Check that the stats are reset as episode is finished
    for agent_reward in _per_agent_rewards():
        assert agent_reward == 0
    reward_summary = trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward")
    assert reward_summary.num > 0
def test_ppo_get_value_estimates(mock_communicator, mock_launcher, dummy_config):
    """PPOPolicy value estimates: types, terminal handling and batched length."""
    tf.reset_default_graph()

    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    policy = PPOPolicy(0, brain_params, dummy_config, False, False)

    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )

    def _estimates(done):
        return policy.get_value_estimates(trajectory.next_obs, "test_agent", done=done)

    # Non-terminal: every estimate is a plain float keyed by a str.
    for name, estimate in _estimates(False).items():
        assert type(name) is str
        assert type(estimate) is float

    # Terminal: every estimate is zero.
    for name, estimate in _estimates(True).items():
        assert type(name) is str
        assert estimate == 0.0

    # Check if we ignore terminal states properly
    policy.reward_signals["extrinsic"].use_terminal_states = False
    for name, estimate in _estimates(True).items():
        assert type(name) is str
        assert estimate != 0.0

    batched_values = policy.get_batched_value_estimates(trajectory.to_agentbuffer())
    for per_signal_values in batched_values.values():
        assert len(per_signal_values) == time_horizon
def group_spec_to_brain_parameters(
    name: str, group_spec: AgentGroupSpec
) -> BrainParameters:
    """Convert an AgentGroupSpec into BrainParameters.

    :param name: Value passed as the first BrainParameters argument.
    :param group_spec: Spec whose ``observation_shapes`` and action
        accessors are read.
    :return: BrainParameters where rank-1 observation shapes are summed into
        one vector size, rank-3 shapes become CameraResolution entries, and
        the action space type is 0 for discrete actions and 1 otherwise.
    """
    shapes = group_spec.observation_shapes

    # Rank-1 shapes contribute their single dimension to the vector size.
    vector_obs_total = np.sum([dims[0] for dims in shapes if len(dims) == 1])
    # Rank-3 shapes are forwarded positionally to CameraResolution.
    camera_resolutions = [CameraResolution(*dims) for dims in shapes if len(dims) == 3]

    action_sizes: List[int]
    if group_spec.is_action_discrete():
        action_space_type = 0
        action_sizes = list(group_spec.discrete_action_branches)
    else:
        action_space_type = 1
        action_sizes = [group_spec.action_size]

    return BrainParameters(
        name,
        int(vector_obs_total),
        camera_resolutions,
        action_sizes,
        [],
        action_space_type,
    )
def make_brain_parameters(
    discrete_action: bool = False,
    visual_inputs: int = 0,
    brain_name: str = "RealFakeBrain",
    vec_obs_size: int = 6,
) -> BrainParameters:
    """Build BrainParameters for tests.

    :param discrete_action: When truthy the action space type is 0, else 1.
    :param visual_inputs: Number of 30x40x3 CameraResolution entries to create.
    :param brain_name: Name given to the resulting BrainParameters.
    :param vec_obs_size: Vector observation space size.
    :return: BrainParameters with action size ``[2]`` and two empty action
        descriptions.
    """
    camera_resolutions = []
    for _ in range(visual_inputs):
        camera_resolutions.append(
            CameraResolution(width=30, height=40, num_channels=3)
        )

    action_space_type = 0 if discrete_action else 1

    return BrainParameters(
        vector_observation_space_size=vec_obs_size,
        camera_resolutions=camera_resolutions,
        vector_action_space_size=[2],
        vector_action_descriptions=["", ""],
        vector_action_space_type=action_space_type,
        brain_name=brain_name,
    )
def group_spec_to_brain_parameters(
    name: str, group_spec
) -> BrainParameters:
    """Convert a mapping-style group spec into BrainParameters.

    :param name: Brain name for the resulting BrainParameters.
    :param group_spec: Mapping read through the keys ``'observation_shapes'``,
        ``'action_type'`` and ``'action_shape'``.
    :return: BrainParameters where rank-1 observation shapes are summed into
        one vector size, rank-3 shapes become CameraResolution entries, and
        the action space type is 0 when ``action_type`` is ``'DISCRETE'`` and
        1 otherwise.
    """
    observation_shapes = group_spec['observation_shapes']
    vec_size = np.sum(
        [shape[0] for shape in observation_shapes if len(shape) == 1]
    )
    cam_res = [
        CameraResolution(s[0], s[1], s[2])
        for s in observation_shapes
        if len(s) == 3
    ]

    a_size: List[int] = []
    if group_spec['action_type'] == 'DISCRETE':
        a_size += list(group_spec['action_shape'])
        vector_action_space_type = 0
    else:
        # The spec is accessed as a mapping everywhere else, so the previous
        # attribute access (`group_spec.action_size`) raised AttributeError on
        # a dict. Read the size through the same key the discrete branch uses.
        # NOTE(review): assumes 'action_shape' holds a single size for
        # non-discrete specs - confirm against the producer of this mapping.
        a_size += [group_spec['action_shape']]
        vector_action_space_type = 1

    return BrainParameters(
        brain_name=name,
        vector_observation_space_size=int(vec_size),
        camera_resolutions=cam_res,
        vector_action_space_size=a_size,
        vector_action_descriptions=[],
        vector_action_space_type=vector_action_space_type,
    )
def test_trainer_increment_step(dummy_config):
    """increment_step forwards to the policy and stores the returned step."""
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)

    # The mocked policy reports this value from increment_step.
    expected_step = 10
    policy_mock = mock.Mock()
    policy_mock.increment_step = mock.Mock(return_value=expected_step)
    trainer.policy = policy_mock

    trainer.increment_step(5)

    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == 10
def test_add_rewards_output(dummy_config):
    """add_rewards_outputs stores the value estimate and reward for an agent."""
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0", False)

    def _ones():
        return np.array([1.0, 1.0], dtype=np.float32)

    extrinsic_result = RewardSignalResult(
        scaled_reward=_ones(), unscaled_reward=_ones()
    )
    rewardsout = AllRewardsOutput(
        reward_signals={"extrinsic": extrinsic_result}, environment=_ones()
    )
    values = {"extrinsic": np.array([[2.0]], dtype=np.float32)}

    agent_id = "123"
    idx = 0
    # make sure that we're grabbing from the next_idx for rewards. If we're not, the test will fail.
    next_idx = 1
    trainer.add_rewards_outputs(
        rewardsout,
        values=values,
        agent_id=agent_id,
        agent_idx=idx,
        agent_next_idx=next_idx,
    )

    agent_buffer = trainer.processing_buffer[agent_id]
    assert agent_buffer["extrinsic_value_estimates"][0] == 2.0
    assert agent_buffer["extrinsic_rewards"][0] == 1.0
def test_handles_no_config_provided(BrainParametersMock):
    """
    Make sure the trainer setup handles no configs provided at all.
    """
    brain_name = "testbrain"
    brain_parameters = BrainParameters(
        brain_name=brain_name,
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )

    # Behaviors of a default-constructed RunOptions serve as the trainer config.
    default_behaviors = RunOptions().behaviors
    factory = trainer_util.TrainerFactory(
        trainer_config=default_behaviors,
        output_path="output_path",
        train_model=True,
        load_model=False,
        seed=42,
    )

    # Passing means generate() completes without raising.
    factory.generate(brain_parameters.brain_name)
def test_trainer_increment_step(dummy_config):
    """increment_step must call the policy's increment_step and update trainer.step.

    The only call to ``policy_mock.increment_step`` in this test is the one
    made by ``trainer.increment_step(5)``, so ``assert_called_with(5)``
    verifies the trainer's behavior rather than a call made by the test.
    """
    trainer_params = dummy_config
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )

    trainer = PPOTrainer(
        brain_params.brain_name, 0, trainer_params, True, False, 0, "0", False
    )
    policy_mock = mock.Mock()
    # 10 hacked because this function is no longer called through trainer
    step_count = 5
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    trainer.policy = policy_mock

    trainer.increment_step(5)

    # No direct call to the mock here: a leftover debug print used to invoke
    # policy.increment_step(5) itself, which satisfied the assertion below
    # even if the trainer never called the policy.
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == step_count
def test_publish_queue(dummy_config):
    """Ghost trainer publishes policies to the correct per-team queue.

    A snapshot swap must push only to the second team's policy queue; an
    advance with a pre-filled update buffer on the wrapped trainer must push
    only to the first team's policy queue.
    """

    def _team_params(behavior_id):
        # Both teams share every setting except the behavior id.
        return BrainParameters(
            brain_name=behavior_id,
            vector_observation_space_size=8,
            camera_resolutions=[],
            vector_action_space_size=[1],
            vector_action_descriptions=[],
            vector_action_space_type=0,
        )

    brain_params_team0 = _team_params("test_brain?team=0")
    brain_params_team1 = _team_params("test_brain?team=1")

    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
        brain_params_team0.brain_name
    )
    brain_name = parsed_behavior_id0.brain_name

    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # First policy encountered becomes policy trained by wrapped PPO
    # This queue should remain empty after swap snapshot
    trainer.add_policy(
        parsed_behavior_id0,
        trainer.create_policy(parsed_behavior_id0, brain_params_team0),
    )
    policy_queue0 = AgentManagerQueue(brain_params_team0.brain_name)
    trainer.publish_policy_queue(policy_queue0)

    # Ghost trainer should use this queue for ghost policy swap
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
        brain_params_team1.brain_name
    )
    trainer.add_policy(
        parsed_behavior_id1,
        trainer.create_policy(parsed_behavior_id1, brain_params_team1),
    )
    policy_queue1 = AgentManagerQueue(brain_params_team1.brain_name)
    trainer.publish_policy_queue(policy_queue1)

    # check ghost trainer swap pushes to ghost queue and not trainer
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer._swap_snapshots()
    assert policy_queue0.empty() and not policy_queue1.empty()
    # clear
    policy_queue1.get_nowait()

    mock_brain = mb.setup_mock_brain(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)

    # Mock out reward signal eval
    mocked_keys = (
        "extrinsic_rewards",
        "extrinsic_returns",
        "extrinsic_value_estimates",
        "curiosity_rewards",
        "curiosity_returns",
        "curiosity_value_estimates",
        "advantages",
    )
    for key in mocked_keys:
        buffer[key] = buffer["environment_rewards"]
    trainer.trainer.update_buffer = buffer

    # when ghost trainer advance and wrapped trainer buffers full
    # the wrapped trainer pushes updated policy to correct queue
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer.advance()
    assert not policy_queue0.empty() and policy_queue1.empty()
from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
    DemonstrationMetaProto,
)
from mlagents.trainers.brain import BrainParameters
from mlagents.trainers.demo_loader import (
    load_demonstration,
    demo_to_buffer,
    get_demo_files,
    write_delimited,
)

# Brain parameters shared by the demo-loading tests in this module.
BRAIN_PARAMS = BrainParameters(
    brain_name="test_brain",
    vector_observation_space_size=8,
    camera_resolutions=[],
    vector_action_space_size=[2],
    vector_action_descriptions=[],
    vector_action_space_type=1,
)


def test_load_demo():
    """Load test.demo and check the spec, the pair count and the buffer length."""
    demo_path = os.path.dirname(os.path.abspath(__file__)) + "/test.demo"

    behavior_spec, pair_infos, total_expected = load_demonstration(demo_path)
    first_shape = behavior_spec.observation_shapes[0]
    assert np.sum(first_shape) == 8
    assert len(pair_infos) == total_expected

    _, demo_buffer = demo_to_buffer(demo_path, 1, BRAIN_PARAMS)
    # The buffer holds one action fewer than the number of expected steps.
    assert len(demo_buffer["actions"]) == total_expected - 1
def load_demonstration(
    file_path: str
) -> Tuple[BrainParameters, List[AgentInfoActionPairProto], int]:
    """
    Loads and parses a demonstration file, or every '.demo' file in a directory.

    :param file_path: Location of a demonstration file (.demo), or of a
        directory whose '.demo' files are all read.
    :return: Tuple of (BrainParameters built from the brain-parameter record
        and the first decoded agent info, list of AgentInfoActionPairProto
        containing demonstration data, total number of steps announced by the
        meta-data of the files read).
    :raises ValueError: if the directory contains no '.demo' file, or the file
        does not have the '.demo' extension.
    :raises FileNotFoundError: if file_path is neither a file nor a directory.
    :raises RuntimeError: if no BrainParameters could be built from the data.
    """

    # First 32 bytes of file dedicated to meta-data.
    INITIAL_POS = 33

    file_paths = []
    if os.path.isdir(file_path):
        file_paths = [
            os.path.join(file_path, _file)
            for _file in os.listdir(file_path)
            if _file.endswith(".demo")
        ]
        # Check the filtered list, not the raw listing: a directory that only
        # holds other files has no demonstrations either.
        if not file_paths:
            raise ValueError("There are no '.demo' files in the provided directory.")
    elif os.path.isfile(file_path):
        if pathlib.Path(file_path).suffix != ".demo":
            raise ValueError(
                "The file is not a '.demo' file. Please provide a file with the "
                "correct extension."
            )
        file_paths.append(file_path)
    else:
        raise FileNotFoundError(
            "The demonstration file or directory {} does not exist.".format(file_path)
        )

    brain_params = None
    brain_param_proto = None
    info_action_pairs = []
    total_expected = 0
    for _file_path in file_paths:
        with open(_file_path, "rb") as fp:
            with hierarchical_timer("read_file"):
                data = fp.read()
        # The whole file is in memory now, so it is parsed after being closed.
        next_pos, pos, obs_decoded = 0, 0, 0
        while pos < len(data):
            # Each record is prefixed by a varint holding its byte length.
            next_pos, pos = _DecodeVarint32(data, pos)
            if obs_decoded == 0:
                # Record 0: meta-data announcing the number of steps.
                meta_data_proto = DemonstrationMetaProto()
                meta_data_proto.ParseFromString(data[pos : pos + next_pos])
                total_expected += meta_data_proto.number_steps
                # Jump to the fixed offset instead of advancing by record length.
                pos = INITIAL_POS
            elif obs_decoded == 1:
                # Record 1: the brain parameters.
                brain_param_proto = BrainParametersProto()
                brain_param_proto.ParseFromString(data[pos : pos + next_pos])
                pos += next_pos
            else:
                # Remaining records: agent info / action pairs.
                agent_info_action = AgentInfoActionPairProto()
                agent_info_action.ParseFromString(data[pos : pos + next_pos])
                if brain_params is None:
                    brain_params = BrainParameters.from_proto(
                        brain_param_proto, agent_info_action.agent_info
                    )
                info_action_pairs.append(agent_info_action)
                # Stop once every announced step has been collected.
                if len(info_action_pairs) == total_expected:
                    break
                pos += next_pos
            obs_decoded += 1

    if not brain_params:
        raise RuntimeError(
            f"No BrainParameters found in demonstration file at {file_path}."
        )
    return brain_params, info_action_pairs, total_expected