def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
    # Test evaluate
    dummy_config.reward_signals = gail_dummy_config
    config = ppo_dummy_config()
    optimizer = create_test_poca_optimizer(
        config, use_rnn=False, use_discrete=False, use_visual=False
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
    )
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.baseline_estimates_key("extrinsic"),
            RewardSignalUtil.returns_key("gail"),
            RewardSignalUtil.value_estimates_key("gail"),
            RewardSignalUtil.baseline_estimates_key("gail"),
        ],
    )
    update_buffer[BufferKey.CONTINUOUS_LOG_PROBS] = np.ones_like(
        update_buffer[BufferKey.CONTINUOUS_ACTION]
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.baseline_estimates_key("extrinsic"),
            RewardSignalUtil.returns_key("gail"),
            RewardSignalUtil.value_estimates_key("gail"),
            RewardSignalUtil.baseline_estimates_key("gail"),
        ],
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
    # Test evaluate
    tf.reset_default_graph()
    dummy_config.reward_signals = gail_dummy_config
    optimizer = _create_ppo_optimizer_ops_mock(
        attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW),
        use_rnn=False,
        use_discrete=False,
        use_visual=False,
    )
    # Test update
    behavior_spec = optimizer.policy.behavior_spec
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
    # NOTE: This is because TF outputs the log probs of all actions,
    # whereas PyTorch does not.
    n_agents = len(update_buffer["continuous_log_probs"])
    update_buffer["continuous_log_probs"] = np.ones(
        (n_agents, behavior_spec.action_spec.continuous_size), dtype=np.float32
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
def test_subprocess_failing_step(num_envs):
    def failing_step_env_factory(_worker_id, _config):
        env = UnexpectedExceptionEnvironment(
            ["1D"], use_discrete=True, to_raise=CustomTestOnlyException
        )
        return env

    env_manager = SubprocessEnvManager(failing_step_env_factory, RunOptions())
    # Expect the exception raised to be routed back up to the top level.
    with pytest.raises(CustomTestOnlyException):
        check_environment_trains(
            failing_step_env_factory(0, []),
            {"1D": ppo_dummy_config()},
            env_manager=env_manager,
            success_threshold=None,
        )
    env_manager.close()
def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
    # Test evaluate
    dummy_config.reward_signals = gail_dummy_config
    config = attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
    optimizer = create_test_ppo_optimizer(
        config, use_rnn=False, use_discrete=False, use_visual=False
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
    )
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
    # NOTE: In TensorFlow, the log_probs are saved as one for every discrete action,
    # whereas in PyTorch it is saved as the total probability per branch. So we need
    # to modify the log prob in the fake buffer here.
    update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
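# The NOTE above contrasts two log-prob storage conventions. A minimal numpy
# sketch of the difference (the branch size and probabilities below are made up
# for illustration, not taken from the test suite): TF-style buffers hold one
# log prob per action in a discrete branch, while PyTorch-style buffers hold a
# single log prob for the chosen action of each branch.
def _log_prob_shape_sketch():
    import numpy as np

    per_action_log_probs = np.log(np.array([0.2, 0.5, 0.3]))  # TF-style: shape (3,)
    chosen_index = 1
    branch_log_prob = per_action_log_probs[chosen_index]  # PyTorch-style: scalar
    assert per_action_log_probs.shape == (3,)
    assert np.ndim(branch_log_prob) == 0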
def test_initialize_ppo_trainer(BehaviorSpecMock, dummy_config):
    brain_name = "testbrain"
    training_behaviors = {"testbrain": BehaviorSpecMock()}
    output_path = "results_dir"
    train_model = True
    load_model = False
    seed = 11
    expected_reward_buff_cap = 1
    base_config = dummy_config.behaviors
    expected_config = ppo_dummy_config()

    def mock_constructor(
        self,
        brain,
        reward_buff_cap,
        trainer_settings,
        training,
        load,
        seed,
        artifact_path,
    ):
        assert brain == brain_name
        assert trainer_settings == expected_config
        assert reward_buff_cap == expected_reward_buff_cap
        assert training == train_model
        assert load == load_model
        # The parameter shadows the outer `seed`, so `assert seed == seed` would
        # always pass; compare against the expected value instead.
        assert seed == 11
        assert artifact_path == os.path.join(output_path, brain_name)

    with patch.object(PPOTrainer, "__init__", mock_constructor):
        trainer_factory = trainer_util.TrainerFactory(
            trainer_config=base_config,
            output_path=output_path,
            train_model=train_model,
            load_model=load_model,
            seed=seed,
            param_manager=EnvironmentParameterManager(),
        )
        trainers = {}
        for brain_name in training_behaviors.keys():
            trainers[brain_name] = trainer_factory.generate(brain_name)
        assert "testbrain" in trainers
        assert isinstance(trainers["testbrain"], PPOTrainer)
def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
    # Test evaluate
    dummy_config.reward_signals = gail_dummy_config
    config = ppo_dummy_config()
    optimizer = create_test_ppo_optimizer(
        config, use_rnn=False, use_discrete=False, use_visual=False
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
    )
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["continuous_log_probs"] = np.ones_like(
        update_buffer["continuous_action"]
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
    # Test evaluate
    tf.reset_default_graph()
    dummy_config.reward_signals = gail_dummy_config
    optimizer = _create_ppo_optimizer_ops_mock(
        attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW),
        use_rnn=False,
        use_discrete=False,
        use_visual=False,
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
    )
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
def test_subprocess_env_endtoend(num_envs):
    def simple_env_factory(worker_id, config):
        env = SimpleEnvironment(["1D"], use_discrete=True)
        return env

    env_manager = SubprocessEnvManager(
        simple_env_factory, EngineConfig.default_config(), num_envs
    )
    # Run PPO using env_manager
    check_environment_trains(
        simple_env_factory(0, []),
        {"1D": ppo_dummy_config()},
        env_manager=env_manager,
        success_threshold=None,
    )
    # Note we can't check the env's rewards directly (since they're in separate
    # processes) so we check the StatsReporter's debug stat writer's last reward.
    assert isinstance(StatsReporter.writers[0], DebugWriter)
    assert all(
        val > 0.7 for val in StatsReporter.writers[0].get_last_rewards().values()
    )
    env_manager.close()
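# The assertions above depend on the test suite's DebugWriter caching the most
# recent reward stat per behavior. A simplified stand-in showing the idea
# (LastRewardRecorder is hypothetical, and the real writer receives the
# trainer's stats summaries rather than plain floats):
from typing import Dict


class LastRewardRecorder:
    """Caches the latest reported mean cumulative reward for each behavior."""

    def __init__(self) -> None:
        self._last_rewards: Dict[str, float] = {}

    def write_stats(self, category: str, values: Dict[str, float], step: int) -> None:
        # Only track the cumulative-reward stat; ignore losses, entropy, etc.
        reward = values.get("Environment/Cumulative Reward")
        if reward is not None:
            self._last_rewards[category] = reward

    def get_last_rewards(self) -> Dict[str, float]:
        return self._last_rewards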
def test_trainer_increment_step(ppo_optimizer, mock_create_model_saver):
    trainer_params = attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW)
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    trainer = PPOTrainer("test_brain", 0, trainer_params, True, False, 0, "0")
    policy_mock = mock.Mock(spec=TFPolicy)
    policy_mock.get_current_step.return_value = 0
    step_count = 5  # hacked: this function is no longer called through trainer
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
    trainer.add_policy(behavior_id, policy_mock)

    trainer._increment_step(5, trainer.brain_name)
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == step_count
import attr
import numpy as np

from mlagents.trainers.settings import (
    GAILSettings,
    RewardSignalType,
    EncoderType,
    FrameworkType,
)
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
from mlagents_envs.side_channel.environment_parameters_channel import (
    EnvironmentParametersChannel,
)
from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
    DemonstrationMetaProto,
)
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
from mlagents_envs.communicator_objects.space_type_pb2 import discrete, continuous
from mlagents.trainers.tests.dummy_config import ppo_dummy_config, sac_dummy_config

PPO_TF_CONFIG = attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW)
SAC_TF_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.TENSORFLOW)

BRAIN_NAME = "1D"


# The reward processor is passed as an argument to _check_environment_trains.
# It is applied to the list of all final rewards for each brain individually.
# This is so that we can process all final rewards in different ways for
# different algorithms. Custom reward processors should be built within the
# test function and passed to _check_environment_trains.
# Default is average over the last 5 final rewards.
def default_reward_processor(rewards, last_n_rewards=5):
    rewards_to_use = rewards[-last_n_rewards:]
    # For debugging tests
    print(f"Last {last_n_rewards} rewards:", rewards_to_use)
    # Average over the last N final rewards, per the doc comment above.
    return np.array(rewards_to_use, dtype=np.float32).mean()
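# As the comment block above says, tests can build their own processors and
# pass them to _check_environment_trains. A sketch of a custom processor with
# the same signature (the median reduction here is illustrative, not something
# the test suite ships):
def median_reward_processor(rewards, last_n_rewards=5):
    # Reduce the per-brain list of final rewards to a single score: the median
    # of the last N episodes instead of the default mean, which is less
    # sensitive to a single outlier episode.
    return float(np.median(np.asarray(rewards[-last_n_rewards:], dtype=np.float32)))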
import attr
import pytest

from mlagents.trainers.tests.simple_test_envs import (
    SimpleEnvironment,
    MemoryEnvironment,
)
from mlagents.trainers.settings import NetworkSettings, FrameworkType
from mlagents.trainers.tests.dummy_config import ppo_dummy_config, sac_dummy_config
from mlagents.trainers.tests.check_env_trains import check_environment_trains

BRAIN_NAME = "1D"

PPO_TORCH_CONFIG = attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
SAC_TORCH_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.PYTORCH)


@pytest.mark.parametrize("action_size", [(1, 1), (2, 2), (1, 2), (2, 1)])
def test_hybrid_ppo(action_size):
    env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_size, step_size=0.8)
    new_network_settings = attr.evolve(PPO_TORCH_CONFIG.network_settings)
    new_hyperparams = attr.evolve(
        PPO_TORCH_CONFIG.hyperparameters, batch_size=64, buffer_size=1024
    )
    config = attr.evolve(
        PPO_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_network_settings,
        max_steps=10000,
def dummy_config():
    return attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
def dummy_config():
    return attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW)
def dummy_config():
    return RunOptions(behaviors={"testbrain": ppo_dummy_config()})
def dummy_config():
    # POCA has the same hyperparameters as PPO for now
    return ppo_dummy_config()
def dummy_config():
    return ppo_dummy_config()
import attr
import pytest

from mlagents.trainers.tests.simple_test_envs import (
    SimpleEnvironment,
    MemoryEnvironment,
)
from mlagents.trainers.settings import NetworkSettings
from mlagents.trainers.tests.dummy_config import ppo_dummy_config, sac_dummy_config
from mlagents.trainers.tests.check_env_trains import check_environment_trains

BRAIN_NAME = "1D"

PPO_TORCH_CONFIG = ppo_dummy_config()
SAC_TORCH_CONFIG = sac_dummy_config()


@pytest.mark.parametrize("action_size", [(1, 1), (2, 2), (1, 2), (2, 1)])
def test_hybrid_ppo(action_size):
    env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_size, step_size=0.8)
    new_network_settings = attr.evolve(PPO_TORCH_CONFIG.network_settings)
    new_hyperparams = attr.evolve(
        PPO_TORCH_CONFIG.hyperparameters, batch_size=64, buffer_size=1024
    )
    config = attr.evolve(
        PPO_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
    assert rsig_result.unscaled_reward.shape == (BATCH_SIZE,)


def reward_signal_update(optimizer, reward_signal_name):
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec)
    feed_dict = optimizer.reward_signals[reward_signal_name].prepare_update(
        optimizer.policy, buffer.make_mini_batch(0, 10), 2
    )
    out = optimizer.policy._execute_model(
        feed_dict, optimizer.reward_signals[reward_signal_name].update_dict
    )
    assert type(out) is dict


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_cc(trainer_config, gail_dummy_config):  # noqa: F811
    trainer_config.behavioral_cloning = BehavioralCloningSettings(
        demo_path=CONTINUOUS_DEMO_PATH
    )
    optimizer = create_optimizer_mock(
        trainer_config, gail_dummy_config, False, False, False
    )
    reward_signal_eval(optimizer, "gail")
    reward_signal_update(optimizer, "gail")


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)