def test_recurrent_sac(use_discrete):
    step_size = 0.5 if use_discrete else 0.2
    env = MemoryEnvironment([BRAIN_NAME],
                            use_discrete=use_discrete,
                            step_size=step_size)
    new_networksettings = attr.evolve(
        SAC_TF_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16,
                                              sequence_length=16),
    )
    new_hyperparams = attr.evolve(
        SAC_TF_CONFIG.hyperparameters,
        batch_size=128,
        learning_rate=1e-3,
        buffer_init_steps=1000,
        steps_per_update=2,
    )
    config = attr.evolve(
        SAC_TF_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_networksettings,
        max_steps=5000,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config})
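Every recurrent test in this collection enables the LSTM the same way: attr.evolve copies a base trainer config and swaps in a NetworkSettings.MemorySettings. Below is a minimal sketch of that pattern in isolation; the helper name and the mlagents.trainers.settings import path are assumptions, the rest mirrors the calls above.

import attr
from mlagents.trainers.settings import NetworkSettings  # assumed import path


def with_recurrence(base_config, memory_size=16, sequence_length=16):
    # Hypothetical helper: returns a copy of base_config (e.g. SAC_TF_CONFIG)
    # with an LSTM memory of the given size enabled.
    new_network_settings = attr.evolve(
        base_config.network_settings,
        memory=NetworkSettings.MemorySettings(
            memory_size=memory_size, sequence_length=sequence_length
        ),
    )
    return attr.evolve(base_config, network_settings=new_network_settings)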
Example #2
def test_trainer_update_policy(
    dummy_config, curiosity_dummy_config, use_discrete  # noqa: F811
):
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_params = dummy_config
    trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
        memory_size=10, sequence_length=16
    )

    # Test curiosity reward signal
    trainer_params.reward_signals = curiosity_dummy_config
    trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
    trainer.add_policy(mock_brain.brain_name, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]

    trainer.update_buffer = buffer
    trainer._update_policy()
Example #3
def test_networkbody_lstm():
    torch.manual_seed(0)
    obs_size = 4
    seq_len = 6
    network_settings = NetworkSettings(memory=NetworkSettings.MemorySettings(
        sequence_length=seq_len, memory_size=12))
    obs_shapes = [(obs_size, )]

    networkbody = NetworkBody(create_observation_specs_with_shapes(obs_shapes),
                              network_settings)
    optimizer = torch.optim.Adam(networkbody.parameters(), lr=3e-4)
    sample_obs = torch.ones((seq_len, obs_size))

    for _ in range(300):
        encoded, _ = networkbody([sample_obs],
                                 memories=torch.ones(1, 1, 12),
                                 sequence_length=seq_len)
        # Try to force output to 1
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # In the last step, values should be close to 1
    for _enc in encoded.flatten().tolist():
        assert _enc == pytest.approx(1.0, abs=0.1)
Example #4
def test_recurrent_poca(action_sizes, is_multiagent):
    if is_multiagent:
        # This is not a recurrent environment; just check that the LSTM doesn't crash
        env = MultiAgentEnvironment([BRAIN_NAME],
                                    action_sizes=action_sizes,
                                    num_agents=2)
    else:
        # Actually test LSTM here
        env = MemoryEnvironment([BRAIN_NAME], action_sizes=action_sizes)
    new_network_settings = attr.evolve(
        POCA_TORCH_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16),
    )
    new_hyperparams = attr.evolve(
        POCA_TORCH_CONFIG.hyperparameters,
        learning_rate=1.0e-3,
        batch_size=64,
        buffer_size=128,
    )
    config = attr.evolve(
        POCA_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_network_settings,
        max_steps=500 if is_multiagent else 6000,
    )
    check_environment_trains(env, {BRAIN_NAME: config},
                             success_threshold=None if is_multiagent else 0.9)
Example #5
def create_optimizer_mock(trainer_config, reward_signal_config, use_rnn,
                          use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
    )
    trainer_settings = trainer_config
    trainer_settings.reward_signals = reward_signal_config
    trainer_settings.network_settings.memory = (NetworkSettings.MemorySettings(
        sequence_length=16, memory_size=10) if use_rnn else None)
    policy = NNPolicy(0,
                      mock_specs,
                      trainer_settings,
                      False,
                      "test",
                      False,
                      create_tf_graph=False)
    if trainer_settings.trainer_type == TrainerType.SAC:
        optimizer = SACOptimizer(policy, trainer_settings)
    else:
        optimizer = PPOOptimizer(policy, trainer_settings)
    return optimizer
Example #6
def create_policy_mock(
    dummy_config: TrainerSettings,
    use_rnn: bool = False,
    use_discrete: bool = True,
    use_visual: bool = False,
    model_path: str = "",
    load: bool = False,
    seed: int = 0,
) -> TFPolicy:
    mock_spec = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = dummy_config
    trainer_settings.keep_checkpoints = 3
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None
    )
    policy = TFPolicy(
        seed, mock_spec, trainer_settings, model_path=model_path, load=load
    )
    return policy
Example #7
def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample):
    # model_path = env.external_brain_names[0]
    trainer_config = TrainerSettings()
    trainer_config.network_settings.memory = (NetworkSettings.MemorySettings()
                                              if use_rnn else None)
    policy = NNPolicy(
        0,
        mock_behavior_specs,
        trainer_config,
        False,
        "test",
        False,
        tanhresample,
        tanhresample,
    )
    with policy.graph.as_default():
        bc_module = BCModule(
            policy,
            policy_learning_rate=trainer_config.hyperparameters.learning_rate,
            default_batch_size=trainer_config.hyperparameters.batch_size,
            default_num_epoch=3,
            settings=bc_settings,
        )
    policy.initialize_or_load()  # Normally the optimizer calls this after the BCModule is created
    return bc_module
Example #8
def test_actor_critic(ac_type, lstm):
    obs_size = 4
    network_settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings() if lstm else None,
        normalize=True)
    obs_spec = create_observation_specs_with_shapes([(obs_size, )])
    act_size = 2
    mask = torch.ones([1, act_size * 2])
    stream_names = [f"stream_name{n}" for n in range(4)]
    # action_spec = ActionSpec.create_continuous(act_size[0])
    action_spec = ActionSpec(act_size,
                             tuple(act_size for _ in range(act_size)))
    actor = ac_type(obs_spec, network_settings, action_spec, stream_names)
    if lstm:
        sample_obs = torch.ones(
            (1, network_settings.memory.sequence_length, obs_size))
        memories = torch.ones(
            (1, network_settings.memory.sequence_length, actor.memory_size))
    else:
        sample_obs = torch.ones((1, obs_size))
        memories = torch.tensor([])
        # memories isn't always set to None; the network should be able to
        # deal with that.
    # Test critic pass
    value_out, memories_out = actor.critic_pass([sample_obs],
                                                memories=memories)
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (
                network_settings.memory.sequence_length, )
            assert memories_out.shape == memories.shape
        else:
            assert value_out[stream].shape == (1, )

    # Test get_action_stats_and_value
    action, log_probs, entropies, value_out, mem_out = actor.get_action_stats_and_value(
        [sample_obs], memories=memories, masks=mask)
    if lstm:
        assert action.continuous_tensor.shape == (64, 2)
    else:
        assert action.continuous_tensor.shape == (1, 2)

    assert len(action.discrete_list) == 2
    for _disc in action.discrete_list:
        if lstm:
            assert _disc.shape == (64, 1)
        else:
            assert _disc.shape == (1, 1)

    if mem_out is not None:
        assert mem_out.shape == memories.shape
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (
                network_settings.memory.sequence_length, )
        else:
            assert value_out[stream].shape == (1, )
Example #9
def test_trainer_update_policy(
        dummy_config,
        curiosity_dummy_config,
        use_discrete  # noqa: F811
):
    mock_behavior_spec = mb.setup_test_behavior_specs(
        use_discrete,
        False,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_params = dummy_config
    trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
        memory_size=10, sequence_length=16)

    # Test curiosity reward signal
    trainer_params.reward_signals = curiosity_dummy_config
    mock_brain_name = "MockBrain"
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(mock_brain_name)
    trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(behavior_id, mock_behavior_spec)
    trainer.add_policy(behavior_id, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_behavior_spec)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]
    # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
    if use_discrete:
        n_agents = len(buffer["discrete_log_probs"])
        buffer["discrete_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["discrete_log_probs"].append(
                np.ones(
                    int(sum(mock_behavior_spec.action_spec.discrete_branches)),
                    dtype=np.float32,
                ))
    else:
        n_agents = len(buffer["continuous_log_probs"])
        buffer["continuous_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["continuous_log_probs"].append(
                np.ones(mock_behavior_spec.action_spec.continuous_size,
                        dtype=np.float32))
    trainer.update_buffer = buffer
    trainer._update_policy()
Example #10
def test_bad_config():
    brain_params = make_brain_parameters(discrete_action=False,
                                         visual_inputs=0,
                                         vec_obs_size=6)
    # Test that we throw an error if we have sequence length greater than batch size
    with pytest.raises(TrainerConfigError):
        TrainerSettings(
            network_settings=NetworkSettings(
                memory=NetworkSettings.MemorySettings(sequence_length=64)),
            hyperparameters=PPOSettings(batch_size=32),
        )
        _ = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
Example #11
def test_multinetworkbody_lstm(with_actions):
    torch.manual_seed(0)
    obs_size = 4
    act_size = 2
    seq_len = 16
    n_agents = 3
    network_settings = NetworkSettings(memory=NetworkSettings.MemorySettings(
        sequence_length=seq_len, memory_size=12))

    obs_shapes = [(obs_size, )]
    action_spec = ActionSpec(act_size,
                             tuple(act_size for _ in range(act_size)))
    networkbody = MultiAgentNetworkBody(
        create_observation_specs_with_shapes(obs_shapes), network_settings,
        action_spec)
    optimizer = torch.optim.Adam(networkbody.parameters(), lr=3e-4)
    sample_obs = [[0.1 * torch.ones((seq_len, obs_size))]
                  for _ in range(n_agents)]
    # simulate baseline in POCA
    sample_act = [
        AgentAction(
            0.1 * torch.ones((seq_len, 2)),
            [0.1 * torch.ones(seq_len) for _ in range(act_size)],
        ) for _ in range(n_agents - 1)
    ]

    for _ in range(300):
        if with_actions:
            encoded, _ = networkbody(
                obs_only=sample_obs[:1],
                obs=sample_obs[1:],
                actions=sample_act,
                memories=torch.ones(1, 1, 12),
                sequence_length=seq_len,
            )
        else:
            encoded, _ = networkbody(
                obs_only=sample_obs,
                obs=[],
                actions=[],
                memories=torch.ones(1, 1, 12),
                sequence_length=seq_len,
            )
        # Try to force output to 1
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # In the last step, values should be close to 1
    for _enc in encoded.flatten().tolist():
        assert _enc == pytest.approx(1.0, abs=0.1)
Example #12
def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
    mock_brain = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
    )
    trainer_settings = dummy_config
    trainer_settings.network_settings.memory = (NetworkSettings.MemorySettings(
        sequence_length=16, memory_size=12) if use_rnn else None)
    policy = TorchPolicy(0, mock_brain, trainer_settings)
    optimizer = TorchSACOptimizer(policy, trainer_settings)
    return optimizer
Example #13
def convert_behaviors(old_trainer_config: Dict[str, Any]) -> Dict[str, Any]:
    all_behavior_config_dict = {}
    default_config = old_trainer_config.get("default", {})
    for behavior_name, config in old_trainer_config.items():
        if behavior_name != "default":
            config = default_config.copy()
            config.update(old_trainer_config[behavior_name])

            # Convert to split TrainerSettings, Hyperparameters, NetworkSettings
            # Set trainer_type and get appropriate hyperparameter settings
            try:
                trainer_type = config["trainer"]
            except KeyError:
                raise TrainerConfigError(
                    "Config doesn't specify a trainer type. "
                    "Please specify trainer: in your config."
                )
            new_config = {}
            new_config["trainer_type"] = trainer_type
            hyperparam_cls = TrainerType(trainer_type).to_settings()
            # Try to absorb as much as possible into the hyperparam_cls
            new_config["hyperparameters"] = cattr.structure(config, hyperparam_cls)

            # Try to absorb as much as possible into the network settings
            new_config["network_settings"] = cattr.structure(config, NetworkSettings)
            # Deal with recurrent
            try:
                if config["use_recurrent"]:
                    new_config[
                        "network_settings"
                    ].memory = NetworkSettings.MemorySettings(
                        sequence_length=config["sequence_length"],
                        memory_size=config["memory_size"],
                    )
            except KeyError:
                raise TrainerConfigError(
                    "Config doesn't specify use_recurrent. "
                    "Please specify true or false for use_recurrent in your config."
                )
            # Absorb the rest into the base TrainerSettings
            for key, val in config.items():
                if key in attr.fields_dict(TrainerSettings):
                    new_config[key] = val

            # Structure the whole thing
            all_behavior_config_dict[behavior_name] = cattr.structure(
                new_config, TrainerSettings
            )
    return all_behavior_config_dict
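A hypothetical old-style config for convert_behaviors, useful for seeing which keys the function reads. The key names ("default", "trainer", "use_recurrent", "sequence_length", "memory_size") come from the function above; the behavior name and the values are made up.

old_trainer_config = {
    "default": {"trainer": "ppo", "use_recurrent": False},
    "MyBehavior": {
        "trainer": "ppo",
        "batch_size": 1024,   # absorbed into the PPO hyperparameters
        "hidden_units": 128,  # absorbed into NetworkSettings
        "use_recurrent": True,
        "sequence_length": 64,
        "memory_size": 128,
    },
}

new_settings = convert_behaviors(old_trainer_config)
# new_settings["MyBehavior"] should be a TrainerSettings whose
# network_settings.memory carries sequence_length=64 and memory_size=128.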
Example #14
def create_test_ppo_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = attr.evolve(dummy_config)
    trainer_settings.network_settings.memory = (NetworkSettings.MemorySettings(
        sequence_length=16, memory_size=10) if use_rnn else None)
    policy = TorchPolicy(0, mock_specs, trainer_settings, "test", False)
    optimizer = TorchPPOOptimizer(policy, trainer_settings)
    return optimizer
Example #15
def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample):
    # model_path = env.external_brain_names[0]
    trainer_config = TrainerSettings()
    trainer_config.network_settings.memory = (NetworkSettings.MemorySettings()
                                              if use_rnn else None)
    policy = TorchPolicy(0, mock_behavior_specs, trainer_config, tanhresample,
                         tanhresample)
    bc_module = BCModule(
        policy,
        settings=bc_settings,
        policy_learning_rate=trainer_config.hyperparameters.learning_rate,
        default_batch_size=trainer_config.hyperparameters.batch_size,
        default_num_epoch=3,
    )
    return bc_module
Example #16
def test_recurrent_ppo(use_discrete):
    env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
    new_network_settings = attr.evolve(
        PPO_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16),
    )
    new_hyperparams = attr.evolve(
        PPO_CONFIG.hyperparameters, learning_rate=1.0e-3, batch_size=64, buffer_size=128
    )
    config = attr.evolve(
        PPO_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_network_settings,
        max_steps=5000,
    )
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
Example #17
def test_actor_critic(ac_type, lstm):
    obs_size = 4
    network_settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings() if lstm else None)
    obs_shapes = [(obs_size, )]
    act_size = [2]
    stream_names = [f"stream_name{n}" for n in range(4)]
    actor = ac_type(obs_shapes, network_settings, ActionType.CONTINUOUS,
                    act_size, stream_names)
    if lstm:
        sample_obs = torch.ones(
            (1, network_settings.memory.sequence_length, obs_size))
        memories = torch.ones((
            1,
            network_settings.memory.sequence_length,
            network_settings.memory.memory_size,
        ))
    else:
        sample_obs = torch.ones((1, obs_size))
        memories = torch.tensor([])
        # memories isn't always set to None; the network should be able to
        # deal with that.
    # Test critic pass
    value_out, memories_out = actor.critic_pass([sample_obs], [],
                                                memories=memories)
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (
                network_settings.memory.sequence_length, )
            assert memories_out.shape == memories.shape
        else:
            assert value_out[stream].shape == (1, )

    # Test get_dist_and_value
    dists, value_out, mem_out = actor.get_dist_and_value([sample_obs], [],
                                                         memories=memories)
    if mem_out is not None:
        assert mem_out.shape == memories.shape
    for dist in dists:
        assert isinstance(dist, GaussianDistInstance)
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (
                network_settings.memory.sequence_length, )
        else:
            assert value_out[stream].shape == (1, )
Example #18
def test_hybrid_recurrent_ppo():
    env = MemoryEnvironment([BRAIN_NAME], action_sizes=(1, 1), step_size=0.5)
    new_network_settings = attr.evolve(
        PPO_TORCH_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16),
    )
    new_hyperparams = attr.evolve(
        PPO_TORCH_CONFIG.hyperparameters,
        learning_rate=1.0e-3,
        batch_size=64,
        buffer_size=512,
    )
    config = attr.evolve(
        PPO_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_network_settings,
        max_steps=3000,
    )
    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
Example #19
def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        use_visual,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )
    trainer_settings = dummy_config
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = NNPolicy(
        0, mock_brain, trainer_settings, False, "test", False, create_tf_graph=False
    )
    optimizer = SACOptimizer(policy, trainer_settings)
    return optimizer
Example #20
def test_recurrent_ppo(action_sizes):
    env = MemoryEnvironment([BRAIN_NAME], action_sizes=action_sizes)
    new_network_settings = attr.evolve(
        PPO_TF_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16),
    )
    new_hyperparams = attr.evolve(
        PPO_TF_CONFIG.hyperparameters,
        learning_rate=1.0e-3,
        batch_size=64,
        buffer_size=128,
    )
    config = attr.evolve(
        PPO_TF_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_network_settings,
        max_steps=5000,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
Example #21
def test_hybrid_recurrent_sac():
    env = MemoryEnvironment([BRAIN_NAME], action_sizes=(1, 1), step_size=0.5)
    new_networksettings = attr.evolve(
        SAC_TORCH_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
    )
    new_hyperparams = attr.evolve(
        SAC_TORCH_CONFIG.hyperparameters,
        batch_size=256,
        learning_rate=1e-3,
        buffer_init_steps=1000,
        steps_per_update=2,
    )
    config = attr.evolve(
        SAC_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_networksettings,
        max_steps=4000,
    )
    check_environment_trains(env, {BRAIN_NAME: config})
Example #22
def create_test_poca_optimizer(dummy_config, use_rnn, use_discrete,
                               use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = attr.evolve(dummy_config)
    trainer_settings.reward_signals = {
        RewardSignalType.EXTRINSIC: RewardSignalSettings(strength=1.0,
                                                         gamma=0.99)
    }

    trainer_settings.network_settings.memory = (NetworkSettings.MemorySettings(
        sequence_length=8, memory_size=10) if use_rnn else None)
    policy = TorchPolicy(0, mock_specs, trainer_settings, "test", False)
    optimizer = TorchPOCAOptimizer(policy, trainer_settings)
    return optimizer
Example #23
def test_recurrent_sac(use_discrete):
    env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
    new_networksettings = attr.evolve(
        SAC_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16,
                                              sequence_length=32),
    )
    new_hyperparams = attr.evolve(
        SAC_CONFIG.hyperparameters,
        batch_size=64,
        learning_rate=1e-3,
        buffer_init_steps=500,
        steps_per_update=2,
    )
    config = attr.evolve(
        SAC_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_networksettings,
        max_steps=5000,
    )
    _check_environment_trains(env, {BRAIN_NAME: config})
Example #24
def _create_ppo_optimizer_ops_mock(dummy_config, use_rnn, use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = attr.evolve(dummy_config, framework=FrameworkType.TENSORFLOW)
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = TFPolicy(
        0, mock_specs, trainer_settings, "test", False, create_tf_graph=False
    )
    optimizer = PPOOptimizer(policy, trainer_settings)
    policy.initialize()
    return optimizer
Example #25
def create_policy_mock(
    dummy_config: TrainerSettings,
    use_rnn: bool = False,
    use_discrete: bool = True,
    use_visual: bool = False,
    load: bool = False,
    seed: int = 0,
) -> NNPolicy:
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        use_visual,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_settings = dummy_config
    trainer_settings.keep_checkpoints = 3
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None)
    policy = NNPolicy(seed, mock_brain, trainer_settings, False, load)
    return policy
Example #26
def test_recurrent_sac(action_sizes):
    step_size = 0.2 if action_sizes == (0, 1) else 0.5
    env = MemoryEnvironment(
        [BRAIN_NAME], action_sizes=action_sizes, step_size=step_size
    )
    new_networksettings = attr.evolve(
        SAC_TORCH_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
    )
    new_hyperparams = attr.evolve(
        SAC_TORCH_CONFIG.hyperparameters,
        batch_size=256,
        learning_rate=3e-4,
        buffer_init_steps=1000,
        steps_per_update=2,
    )
    config = attr.evolve(
        SAC_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_networksettings,
        max_steps=4000,
    )
    check_environment_trains(env, {BRAIN_NAME: config}, training_seed=1337)
Example #27
def test_recurrent_sac(action_sizes):
    step_size = 0.2 if action_sizes == (0, 1) else 0.5
    env = MemoryEnvironment([BRAIN_NAME],
                            action_sizes=action_sizes,
                            step_size=step_size)
    new_networksettings = attr.evolve(
        SAC_TF_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16),
    )
    new_hyperparams = attr.evolve(
        SAC_TF_CONFIG.hyperparameters,
        batch_size=128,
        learning_rate=1e-3,
        buffer_init_steps=1000,
        steps_per_update=2,
    )
    config = attr.evolve(
        SAC_TF_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_networksettings,
        max_steps=4000,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config})
Example #28
                      EXPORT_FILE,
                      opset_version=9,
                      input_names=input_names,
                      output_names=output_names,
                      dynamic_axes=dynamic_axes)


if __name__ == '__main__':
    obs_spec = [
        ObservationSpec(shape=(16, ),
                        dimension_property=(DimensionProperty.UNSPECIFIED, ),
                        observation_type=ObservationType.DEFAULT)
    ]
    act_spec = ActionSpec(continuous_size=4, discrete_branches=())
    net_settings = NetworkSettings(normalize=False,
                                   hidden_units=256,
                                   num_layers=2,
                                   vis_encode_type=EncoderType.SIMPLE,
                                   memory=NetworkSettings.MemorySettings(
                                       sequence_length=64, memory_size=256))

    network = SerializableSimpleActor(obs_spec, net_settings, act_spec)
    state_dict = torch.load(MODEL_FILE, map_location=torch.device('cpu'))
    filtered_sd = {
        i: j
        for i, j in state_dict['Policy'].items() if 'critic' not in i
    }
    network.load_state_dict(filtered_sd)

    export_model(network)
Example #29
def test_memory_settings_validation():
    with pytest.raises(TrainerConfigError):
        NetworkSettings.MemorySettings(sequence_length=128, memory_size=63)

    with pytest.raises(TrainerConfigError):
        NetworkSettings.MemorySettings(sequence_length=128, memory_size=0)
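
For contrast, a sketch of the valid case, using the same values the recurrent tests above construct without error (the test name here is an assumption, not part of the original suite).

def test_memory_settings_valid():
    # Mirrors the MemorySettings used throughout the examples above;
    # constructing them should not raise.
    settings = NetworkSettings.MemorySettings(sequence_length=16, memory_size=16)
    assert settings.sequence_length == 16
    assert settings.memory_size == 16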