Example #1
def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
    # Test evaluate
    dummy_config.reward_signals = gail_dummy_config
    config = ppo_dummy_config()
    optimizer = create_test_poca_optimizer(
        config, use_rnn=False, use_discrete=False, use_visual=False
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
    )
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.baseline_estimates_key("extrinsic"),
            RewardSignalUtil.returns_key("gail"),
            RewardSignalUtil.value_estimates_key("gail"),
            RewardSignalUtil.baseline_estimates_key("gail"),
        ],
    )

    update_buffer[BufferKey.CONTINUOUS_LOG_PROBS] = np.ones_like(
        update_buffer[BufferKey.CONTINUOUS_ACTION]
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )

    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    copy_buffer_fields(
        update_buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.baseline_estimates_key("extrinsic"),
            RewardSignalUtil.returns_key("gail"),
            RewardSignalUtil.value_estimates_key("gail"),
            RewardSignalUtil.baseline_estimates_key("gail"),
        ],
    )
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
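Examples #1, #4 and #6 all mock out the reward-signal outputs the same way: the raw environment rewards are copied into the advantage, return and value-estimate fields so that optimizer.update finds every key it expects. Example #1 does this through the typed BufferKey/RewardSignalUtil keys and a copy_buffer_fields helper, while the older snippets assign string keys directly. A minimal sketch of what such a helper amounts to, using a plain dict as a hypothetical stand-in for the real rollout buffer:

from typing import Iterable, MutableMapping


def copy_buffer_fields(buffer: MutableMapping, src_key, dst_keys: Iterable) -> None:
    # Copy the source field into every destination field, so the mocked
    # returns, value estimates and advantages all mirror the raw rewards.
    for dst_key in dst_keys:
        buffer[dst_key] = list(buffer[src_key])


# Usage with a plain dict standing in for the buffer:
fake_buffer = {"environment_rewards": [0.1, 0.0, 1.0]}
copy_buffer_fields(
    fake_buffer,
    src_key="environment_rewards",
    dst_keys=["advantages", "extrinsic_returns", "gail_returns"],
)
assert fake_buffer["gail_returns"] == [0.1, 0.0, 1.0]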
Example #2
def test_ppo_optimizer_update_gail(gail_dummy_config,
                                   dummy_config):  # noqa: F811
    # Test evaluate
    tf.reset_default_graph()
    dummy_config.reward_signals = gail_dummy_config
    optimizer = _create_ppo_optimizer_ops_mock(
        attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW),
        use_rnn=False,
        use_discrete=False,
        use_visual=False,
    )
    # Test update
    behavior_spec = optimizer.policy.behavior_spec
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer[
        "environment_rewards"]
    # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
    n_agents = len(update_buffer["continuous_log_probs"])
    update_buffer["continuous_log_probs"] = np.ones(
        (n_agents, behavior_spec.action_spec.continuous_size),
        dtype=np.float32)
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )

    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer[
        "environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )
Example #3
def test_subprocess_failing_step(num_envs):
    def failing_step_env_factory(_worker_id, _config):
        env = UnexpectedExceptionEnvironment(
            ["1D"], use_discrete=True, to_raise=CustomTestOnlyException
        )
        return env

    env_manager = SubprocessEnvManager(failing_step_env_factory, RunOptions())
    # Expect the exception raised to be routed back up to the top level.
    with pytest.raises(CustomTestOnlyException):
        check_environment_trains(
            failing_step_env_factory(0, []),
            {"1D": ppo_dummy_config()},
            env_manager=env_manager,
            success_threshold=None,
        )
    env_manager.close()
Example #4
def test_ppo_optimizer_update_gail(gail_dummy_config,
                                   dummy_config):  # noqa: F811
    # Test evaluate
    dummy_config.reward_signals = gail_dummy_config
    config = attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
    optimizer = create_test_ppo_optimizer(config,
                                          use_rnn=False,
                                          use_discrete=False,
                                          use_visual=False)
    # Test update
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES,
                                        optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer[
        "environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )

    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer[
        "environment_rewards"]
    # NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
    # in PyTorch it is saved as the total probability per branch. So we need to modify the
    # log prob in the fake buffer here.
    update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )
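The NOTE above (and the matching one in Example #2) is the reason the two code paths build differently shaped mock log-prob arrays: TensorFlow-era buffers store a log prob for every option of every discrete branch, while the PyTorch buffers store one total log prob per branch. A rough NumPy illustration with made-up branch sizes (the numbers are for this sketch only, not taken from the tests):

import numpy as np

n_agents = 4
branch_sizes = [3, 2]  # two discrete branches with 3 and 2 options each

# TensorFlow-style mock: one log prob per option of every branch.
tf_log_probs = np.ones((n_agents, sum(branch_sizes)), dtype=np.float32)

# PyTorch-style mock: one total log prob per branch, matching the shape
# of the stored actions (what np.ones_like(update_buffer["actions"]) yields).
torch_log_probs = np.ones((n_agents, len(branch_sizes)), dtype=np.float32)

print(tf_log_probs.shape)     # (4, 5)
print(torch_log_probs.shape)  # (4, 2)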
Example #5
def test_initialize_ppo_trainer(BehaviorSpecMock, dummy_config):
    brain_name = "testbrain"
    training_behaviors = {"testbrain": BehaviorSpecMock()}
    output_path = "results_dir"
    train_model = True
    load_model = False
    seed = 11
    expected_reward_buff_cap = 1

    base_config = dummy_config.behaviors
    expected_config = ppo_dummy_config()

    def mock_constructor(
        self,
        brain,
        reward_buff_cap,
        trainer_settings,
        training,
        load,
        seed,
        artifact_path,
    ):
        assert brain == brain_name
        assert trainer_settings == expected_config
        assert reward_buff_cap == expected_reward_buff_cap
        assert training == train_model
        assert load == load_model
        assert seed == seed
        assert artifact_path == os.path.join(output_path, brain_name)

    with patch.object(PPOTrainer, "__init__", mock_constructor):
        trainer_factory = trainer_util.TrainerFactory(
            trainer_config=base_config,
            output_path=output_path,
            train_model=train_model,
            load_model=load_model,
            seed=seed,
            param_manager=EnvironmentParameterManager(),
        )
        trainers = {}
        for brain_name in training_behaviors.keys():
            trainers[brain_name] = trainer_factory.generate(brain_name)
        assert "testbrain" in trainers
        assert isinstance(trainers["testbrain"], PPOTrainer)
Example #6
def test_ppo_optimizer_update_gail(gail_dummy_config,
                                   dummy_config):  # noqa: F811
    # Test evaluate
    dummy_config.reward_signals = gail_dummy_config
    config = ppo_dummy_config()
    optimizer = create_test_ppo_optimizer(config,
                                          use_rnn=False,
                                          use_discrete=False,
                                          use_visual=False)
    # Test update
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES,
                                        optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["continuous_log_probs"] = np.ones_like(
        update_buffer["continuous_action"])
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )

    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer[
        "environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )
Example #7
def test_ppo_optimizer_update_gail(gail_dummy_config,
                                   dummy_config):  # noqa: F811
    # Test evaluate
    tf.reset_default_graph()
    dummy_config.reward_signals = gail_dummy_config
    optimizer = _create_ppo_optimizer_ops_mock(
        attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW),
        use_rnn=False,
        use_discrete=False,
        use_visual=False,
    )
    # Test update
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES,
                                        optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer[
        "environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )

    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer[
        "environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )
Example #8
def test_subprocess_env_endtoend(num_envs):
    def simple_env_factory(worker_id, config):
        env = SimpleEnvironment(["1D"], use_discrete=True)
        return env

    env_manager = SubprocessEnvManager(simple_env_factory,
                                       EngineConfig.default_config(), num_envs)
    # Run PPO using env_manager
    check_environment_trains(
        simple_env_factory(0, []),
        {"1D": ppo_dummy_config()},
        env_manager=env_manager,
        success_threshold=None,
    )
    # Note we can't check the env's rewards directly (since they're in separate processes) so we
    # check the StatsReporter's debug stat writer's last reward.
    assert isinstance(StatsReporter.writers[0], DebugWriter)
    assert all(val > 0.7
               for val in StatsReporter.writers[0].get_last_rewards().values())
    env_manager.close()
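The comment in the example above explains why the assertion goes through StatsReporter.writers[0]: the environments run in separate worker processes, so the test reads the last cumulative reward that was reported through the stats pipeline instead of inspecting the environments directly. A hypothetical stand-in for such a debug writer; only get_last_rewards() is taken from the assertion above, the stat key and registration with StatsReporter are assumptions:

from typing import Dict


class LastRewardWriter:
    """Remembers the most recent cumulative reward reported per behavior."""

    def __init__(self) -> None:
        self._last_rewards: Dict[str, float] = {}

    def write_stats(self, category: str, key: str, value: float) -> None:
        # Only track the cumulative-reward stat, keyed by behavior name.
        if key == "Environment/Cumulative Reward":
            self._last_rewards[category] = value

    def get_last_rewards(self) -> Dict[str, float]:
        return dict(self._last_rewards)


writer = LastRewardWriter()
writer.write_stats("1D", "Environment/Cumulative Reward", 0.95)
assert all(val > 0.7 for val in writer.get_last_rewards().values())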
Example #9
def test_trainer_increment_step(ppo_optimizer, mock_create_model_saver):
    trainer_params = attr.evolve(
        ppo_dummy_config(), framework=FrameworkType.TENSORFLOW
    )
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    trainer = PPOTrainer("test_brain", 0, trainer_params, True, False, 0, "0")
    policy_mock = mock.Mock(spec=TFPolicy)
    policy_mock.get_current_step.return_value = 0
    step_count = 5  # 10 hacked because this function is no longer called through trainer
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
    trainer.add_policy(behavior_id, policy_mock)

    trainer._increment_step(5, trainer.brain_name)
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == step_count
Example #10
from mlagents.trainers.settings import (
    GAILSettings,
    RewardSignalType,
    EncoderType,
    FrameworkType,
)
from mlagents.trainers.environment_parameter_manager import EnvironmentParameterManager
from mlagents_envs.side_channel.environment_parameters_channel import (
    EnvironmentParametersChannel, )
from mlagents_envs.communicator_objects.demonstration_meta_pb2 import (
    DemonstrationMetaProto, )
from mlagents_envs.communicator_objects.brain_parameters_pb2 import BrainParametersProto
from mlagents_envs.communicator_objects.space_type_pb2 import discrete, continuous

from mlagents.trainers.tests.dummy_config import ppo_dummy_config, sac_dummy_config

PPO_TF_CONFIG = attr.evolve(ppo_dummy_config(),
                            framework=FrameworkType.TENSORFLOW)
SAC_TF_CONFIG = attr.evolve(sac_dummy_config(),
                            framework=FrameworkType.TENSORFLOW)

BRAIN_NAME = "1D"


# The reward processor is passed as an argument to _check_environment_trains.
# It is applied to the list of all final rewards for each brain individually.
# This is so that we can process all final rewards in different ways for different algorithms.
# Custom reward processors should be built within the test function and passed to _check_environment_trains
# Default is average over the last 5 final rewards
def default_reward_processor(rewards, last_n_rewards=5):
    rewards_to_use = rewards[-last_n_rewards:]
    # For debugging tests
    print(f"Last {last_n_rewards} rewards:", rewards_to_use)
    return np.array(rewards_to_use, dtype=np.float32).mean()
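The comment block above describes the contract: a reward processor receives the list of final rewards for one behavior and reduces it to a single score, with the default averaging the last 5. A custom processor built inside a test would follow the same shape; a hedged sketch that takes the median instead of the mean, assuming the signature matches default_reward_processor:

import numpy as np


def median_reward_processor(rewards, last_n_rewards=5):
    # Reduce the last few final rewards to a single robust score.
    rewards_to_use = rewards[-last_n_rewards:]
    return float(np.median(np.array(rewards_to_use, dtype=np.float32)))


# Example reduction over a fake reward history:
assert median_reward_processor([0.1, 0.2, 0.9, 1.0, 1.0, 1.0, 0.8]) == 1.0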
Example #11
import attr
import pytest


from mlagents.trainers.tests.simple_test_envs import (
    SimpleEnvironment,
    MemoryEnvironment,
)

from mlagents.trainers.settings import NetworkSettings, FrameworkType

from mlagents.trainers.tests.dummy_config import ppo_dummy_config, sac_dummy_config
from mlagents.trainers.tests.check_env_trains import check_environment_trains

BRAIN_NAME = "1D"

PPO_TORCH_CONFIG = attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
SAC_TORCH_CONFIG = attr.evolve(sac_dummy_config(), framework=FrameworkType.PYTORCH)


@pytest.mark.parametrize("action_size", [(1, 1), (2, 2), (1, 2), (2, 1)])
def test_hybrid_ppo(action_size):
    env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_size, step_size=0.8)
    new_network_settings = attr.evolve(PPO_TORCH_CONFIG.network_settings)
    new_hyperparams = attr.evolve(
        PPO_TORCH_CONFIG.hyperparameters, batch_size=64, buffer_size=1024
    )
    config = attr.evolve(
        PPO_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_network_settings,
        max_steps=10000,
Example #12
def dummy_config():
    return attr.evolve(ppo_dummy_config(), framework=FrameworkType.PYTORCH)
Example #13
def dummy_config():
    return attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW)
Example #14
def dummy_config():
    return RunOptions(behaviors={"testbrain": ppo_dummy_config()})
Example #15
def dummy_config():
    # poca has the same hyperparameters as ppo for now
    return ppo_dummy_config()
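Because the POCA fixture simply reuses ppo_dummy_config(), any POCA-specific tweak can be layered on with attr.evolve, the same pattern Examples #12 and #13 use for the framework field. A sketch under the assumption that the dummy config exposes the same hyperparameters.buffer_size attribute tweaked in the hybrid-PPO examples above; the helper name is made up:

import attr

from mlagents.trainers.tests.dummy_config import ppo_dummy_config


def poca_dummy_config_small_buffer():
    # Reuse the shared PPO defaults and only shrink the buffer for a faster test.
    base = ppo_dummy_config()
    small_hyperparams = attr.evolve(base.hyperparameters, buffer_size=512)
    return attr.evolve(base, hyperparameters=small_hyperparams)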
Example #16
def dummy_config():
    return ppo_dummy_config()
Example #17
import attr
import pytest

from mlagents.trainers.tests.simple_test_envs import (
    SimpleEnvironment,
    MemoryEnvironment,
)

from mlagents.trainers.settings import NetworkSettings

from mlagents.trainers.tests.dummy_config import ppo_dummy_config, sac_dummy_config
from mlagents.trainers.tests.check_env_trains import check_environment_trains

BRAIN_NAME = "1D"

PPO_TORCH_CONFIG = ppo_dummy_config()
SAC_TORCH_CONFIG = sac_dummy_config()


@pytest.mark.parametrize("action_size", [(1, 1), (2, 2), (1, 2), (2, 1)])
def test_hybrid_ppo(action_size):
    env = SimpleEnvironment([BRAIN_NAME],
                            action_sizes=action_size,
                            step_size=0.8)
    new_network_settings = attr.evolve(PPO_TORCH_CONFIG.network_settings)
    new_hyperparams = attr.evolve(PPO_TORCH_CONFIG.hyperparameters,
                                  batch_size=64,
                                  buffer_size=1024)
    config = attr.evolve(
        PPO_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
Example #18
    assert rsig_result.unscaled_reward.shape == (BATCH_SIZE,)


def reward_signal_update(optimizer, reward_signal_name):
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec)
    feed_dict = optimizer.reward_signals[reward_signal_name].prepare_update(
        optimizer.policy, buffer.make_mini_batch(0, 10), 2
    )
    out = optimizer.policy._execute_model(
        feed_dict, optimizer.reward_signals[reward_signal_name].update_dict
    )
    assert type(out) is dict


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)
def test_gail_cc(trainer_config, gail_dummy_config):  # noqa: F811
    trainer_config.behavioral_cloning = BehavioralCloningSettings(
        demo_path=CONTINUOUS_DEMO_PATH
    )
    optimizer = create_optimizer_mock(
        trainer_config, gail_dummy_config, False, False, False
    )
    reward_signal_eval(optimizer, "gail")
    reward_signal_update(optimizer, "gail")


@pytest.mark.parametrize(
    "trainer_config", [ppo_dummy_config(), sac_dummy_config()], ids=["ppo", "sac"]
)