def test_trainer_update_policy(dummy_config, use_discrete):
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_params = dummy_config
    trainer_params["use_recurrent"] = True

    # Test curiosity reward signal
    trainer_params["reward_signals"]["curiosity"] = {}
    trainer_params["reward_signals"]["curiosity"]["strength"] = 1.0
    trainer_params["reward_signals"]["curiosity"]["gamma"] = 0.99
    trainer_params["reward_signals"]["curiosity"]["encoding_size"] = 128

    trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
    trainer.add_policy(mock_brain.brain_name, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]

    trainer.update_buffer = buffer
    trainer._update_policy()

def test_add_get_policy(ppo_optimizer, dummy_config):
    brain_params = make_brain_parameters(
        discrete_action=False, visual_inputs=0, vec_obs_size=6
    )
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
    policy = mock.Mock(spec=NNPolicy)
    policy.get_current_step.return_value = 2000

    trainer.add_policy(brain_params.brain_name, policy)
    assert trainer.get_policy(brain_params.brain_name) == policy

    # Make sure the summary steps were loaded properly
    assert trainer.get_step == 2000
    assert trainer.next_summary_step > 2000

    # Test incorrect class of policy
    policy = mock.Mock()
    with pytest.raises(RuntimeError):
        trainer.add_policy(brain_params, policy)

def test_trainer_increment_step(ppo_optimizer, dummy_config):
    trainer_params = dummy_config
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )

    trainer = PPOTrainer(
        brain_params.brain_name, 0, trainer_params, True, False, 0, "0"
    )
    policy_mock = mock.Mock(spec=NNPolicy)
    policy_mock.get_current_step.return_value = 0
    step_count = 5  # 10 hacked because this function is no longer called through trainer
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    trainer.add_policy("testbehavior", policy_mock)

    trainer._increment_step(5, "testbehavior")
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == step_count

def test_trainer_update_policy(
    dummy_config, curiosity_dummy_config, use_discrete  # noqa: F811
):
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_params = dummy_config
    trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
        memory_size=10, sequence_length=16
    )

    # Test curiosity reward signal
    trainer_params.reward_signals = curiosity_dummy_config
    trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
    trainer.add_policy(mock_brain.brain_name, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]

    trainer.update_buffer = buffer
    trainer._update_policy()

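# Hedged sketch (not part of the test file above): the `curiosity_dummy_config` fixture
# is assumed to map the curiosity reward-signal type to its default settings object from
# mlagents.trainers.settings; exact names and defaults may vary between ml-agents versions.
import pytest

from mlagents.trainers.settings import CuriositySettings, RewardSignalType


@pytest.fixture
def curiosity_dummy_config():
    # Enables the curiosity reward signal with library defaults (assumption).
    return {RewardSignalType.CURIOSITY: CuriositySettings()}
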
def test_process_trajectory(dummy_config):
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
    policy = trainer.create_policy(brain_params)
    trainer.add_policy(brain_params.brain_name, policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)

    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that trainer put trajectory in update buffer
    assert trainer.update_buffer.num_experiences == 15

    # Check that GAE worked
    assert (
        "advantages" in trainer.update_buffer
        and "discounted_returns" in trainer.update_buffer
    )

    # Check that the stats are being collected as episode isn't complete
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent > 0

    # Add a terminal trajectory
    trajectory = make_fake_trajectory(
        length=time_horizon + 1,
        max_step_complete=False,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that the stats are reset as episode is finished
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent == 0
    assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0

def test_process_trajectory(dummy_config):
    behavior_spec = mb.setup_test_behavior_specs(
        True,
        False,
        vector_action_space=DISCRETE_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    mock_brain_name = "MockBrain"
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(mock_brain_name)
    trainer = PPOTrainer("test_brain", 0, dummy_config, True, False, 0, "0")
    policy = trainer.create_policy(behavior_id, behavior_spec)
    trainer.add_policy(behavior_id, policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)

    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        observation_shapes=behavior_spec.observation_shapes,
        max_step_complete=True,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that trainer put trajectory in update buffer
    assert trainer.update_buffer.num_experiences == 15

    # Check that GAE worked
    assert (
        "advantages" in trainer.update_buffer
        and "discounted_returns" in trainer.update_buffer
    )

    # Check that the stats are being collected as episode isn't complete
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent > 0

    # Add a terminal trajectory
    trajectory = make_fake_trajectory(
        length=time_horizon + 1,
        max_step_complete=False,
        observation_shapes=behavior_spec.observation_shapes,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that the stats are reset as episode is finished
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent == 0
    assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0

def test_trainer_update_policy(
    dummy_config, curiosity_dummy_config, use_discrete  # noqa: F811
):
    mock_behavior_spec = mb.setup_test_behavior_specs(
        use_discrete,
        False,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_params = dummy_config
    trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
        memory_size=10, sequence_length=16
    )

    # Test curiosity reward signal
    trainer_params.reward_signals = curiosity_dummy_config
    mock_brain_name = "MockBrain"
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(mock_brain_name)
    trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(behavior_id, mock_behavior_spec)
    trainer.add_policy(behavior_id, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_behavior_spec)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]
    # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
    if use_discrete:
        n_agents = len(buffer["discrete_log_probs"])
        buffer["discrete_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["discrete_log_probs"].append(
                np.ones(
                    int(sum(mock_behavior_spec.action_spec.discrete_branches)),
                    dtype=np.float32,
                )
            )
    else:
        n_agents = len(buffer["continuous_log_probs"])
        buffer["continuous_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["continuous_log_probs"].append(
                np.ones(
                    mock_behavior_spec.action_spec.continuous_size, dtype=np.float32
                )
            )

    trainer.update_buffer = buffer
    trainer._update_policy()

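# Illustration (an assumption, not taken from the test file) of the shape difference the
# NOTE above refers to: the TF trainer buffers one log-prob per possible discrete action
# across all branches, while a per-branch (PyTorch-style) buffer holds only the log-prob
# of the selected action in each branch.
import numpy as np

discrete_branches = (3, 2)  # hypothetical action spec
tf_style_entry = np.ones(sum(discrete_branches), dtype=np.float32)  # one per action -> (5,)
per_branch_entry = np.ones(len(discrete_branches), dtype=np.float32)  # one per branch -> (2,)
assert tf_style_entry.shape == (5,) and per_branch_entry.shape == (2,)
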
def test_add_get_policy(ppo_optimizer, mock_create_model_saver, dummy_config):
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    trainer = PPOTrainer("test_policy", 0, dummy_config, True, False, 0, "0")
    policy = mock.Mock(spec=TFPolicy)
    policy.get_current_step.return_value = 2000
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)

    trainer.add_policy(behavior_id, policy)
    assert trainer.get_policy("test_policy") == policy

    # Make sure the summary steps were loaded properly
    assert trainer.get_step == 2000

def test_trainer_increment_step(ppo_optimizer):
    trainer_params = PPO_CONFIG
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    trainer = PPOTrainer("test_brain", 0, trainer_params, True, False, 0, "0")
    policy_mock = mock.Mock(spec=NNPolicy)
    policy_mock.get_current_step.return_value = 0
    step_count = 5  # 10 hacked because this function is no longer called through trainer
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    trainer.add_policy("testbehavior", policy_mock)

    trainer._increment_step(5, "testbehavior")
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == step_count

def test_trainer_increment_step(ppo_optimizer, mock_create_model_saver):
    trainer_params = PPO_CONFIG
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    trainer = PPOTrainer("test_brain", 0, trainer_params, True, False, 0, "0")
    policy_mock = mock.Mock(spec=TFPolicy)
    policy_mock.get_current_step.return_value = 0
    step_count = 5  # 10 hacked because this function is no longer called through trainer
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
    trainer.add_policy(behavior_id, policy_mock)

    trainer._increment_step(5, trainer.brain_name)
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == step_count

def test_add_get_policy(ppo_optimizer, dummy_config):
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    trainer = PPOTrainer("test_policy", 0, dummy_config, True, False, 0, "0")
    policy = mock.Mock(spec=NNPolicy)
    policy.get_current_step.return_value = 2000

    trainer.add_policy("test_policy", policy)
    assert trainer.get_policy("test_policy") == policy

    # Make sure the summary steps were loaded properly
    assert trainer.get_step == 2000

    # Test incorrect class of policy
    policy = mock.Mock()
    with pytest.raises(RuntimeError):
        trainer.add_policy("test_policy", policy)

def test_trainer_update_policy(mock_env, dummy_config, use_discrete):
    env, mock_brain, _ = mb.setup_mock_env_and_brains(
        mock_env,
        use_discrete,
        False,
        num_agents=NUM_AGENTS,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_params = dummy_config
    trainer_params["use_recurrent"] = True

    # Test curiosity reward signal
    trainer_params["reward_signals"]["curiosity"] = {}
    trainer_params["reward_signals"]["curiosity"]["strength"] = 1.0
    trainer_params["reward_signals"]["curiosity"]["gamma"] = 0.99
    trainer_params["reward_signals"]["curiosity"]["encoding_size"] = 128

    trainer = PPOTrainer(
        mock_brain.brain_name, 0, trainer_params, True, False, 0, "0", False
    )
    policy = trainer.create_policy(mock_brain)
    trainer.add_policy(mock_brain.brain_name, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(env, trainer.policy, BUFFER_INIT_SAMPLES)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["rewards"]
    buffer["extrinsic_returns"] = buffer["rewards"]
    buffer["extrinsic_value_estimates"] = buffer["rewards"]
    buffer["curiosity_rewards"] = buffer["rewards"]
    buffer["curiosity_returns"] = buffer["rewards"]
    buffer["curiosity_value_estimates"] = buffer["rewards"]

    trainer.update_buffer = buffer
    trainer._update_policy()
    # Make batch length a larger multiple of sequence length
    trainer.trainer_parameters["batch_size"] = 128
    trainer._update_policy()
    # Make batch length a larger non-multiple of sequence length
    trainer.trainer_parameters["batch_size"] = 100
    trainer._update_policy()

def test_normalization(dummy_config):
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(
        brain_params.brain_name, 0, dummy_config, True, False, 0, "0", False
    )
    time_horizon = 6
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    # Change half of the obs to 0
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
    policy = trainer.create_policy(brain_params)
    trainer.add_policy(brain_params.brain_name, policy)

    trainer._process_trajectory(trajectory)

    # Check that the running mean and variance is correct
    steps, mean, variance = trainer.policy.sess.run(
        [
            trainer.policy.model.normalization_steps,
            trainer.policy.model.running_mean,
            trainer.policy.model.running_variance,
        ]
    )

    assert steps == 6
    assert mean[0] == 0.5
    # Note: variance is divided by number of steps, and initialized to 1 to avoid
    # divide by 0. The right answer is 0.25
    assert (variance[0] - 1) / steps == 0.25

    # Make another update, this time with all 1's
    time_horizon = 10
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    trainer._process_trajectory(trajectory)

    # Check that the running mean and variance is correct
    steps, mean, variance = trainer.policy.sess.run(
        [
            trainer.policy.model.normalization_steps,
            trainer.policy.model.running_mean,
            trainer.policy.model.running_variance,
        ]
    )

    assert steps == 16
    assert mean[0] == 0.8125
    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)

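# Quick arithmetic check of the values asserted above (a sketch using plain numpy rather
# than the trainer's TF graph): after 6 observations (three 0s, three 1s) the mean is 0.5
# and the population variance is 0.25; after appending ten more 1s (13 ones out of 16)
# the mean is 13/16 = 0.8125 and the variance is roughly 0.152.
import numpy as np

first_batch = np.array([0.0, 0.0, 0.0, 1.0, 1.0, 1.0])
assert first_batch.mean() == 0.5
assert first_batch.var() == 0.25  # matches (variance[0] - 1) / steps in the test

all_obs = np.concatenate([first_batch, np.ones(10)])
assert all_obs.mean() == 0.8125
assert abs(all_obs.var() - 0.152) < 0.01  # 0.8125 * 0.1875 = 0.15234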