Code example #1
def test_demo_mismatch():
    path_prefix = os.path.dirname(os.path.abspath(__file__))
    # observation size mismatch
    with pytest.raises(RuntimeError):
        mismatch_obs = setup_test_behavior_specs(False,
                                                 False,
                                                 vector_action_space=2,
                                                 vector_obs_space=9)
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1,
                                        mismatch_obs)
    # action mismatch
    with pytest.raises(RuntimeError):
        mismatch_act = setup_test_behavior_specs(False,
                                                 False,
                                                 vector_action_space=3,
                                                 vector_obs_space=9)
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1,
                                        mismatch_act)
    # action type mismatch
    with pytest.raises(RuntimeError):
        mismatch_act_type = setup_test_behavior_specs(True,
                                                      False,
                                                      vector_action_space=[2],
                                                      vector_obs_space=9)
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1,
                                        mismatch_act_type)
    # number obs mismatch
    with pytest.raises(RuntimeError):
        mismatch_obs_number = setup_test_behavior_specs(False,
                                                        True,
                                                        vector_action_space=2,
                                                        vector_obs_space=9)
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1,
                                        mismatch_obs_number)
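
Note: every snippet on this page exercises the setup_test_behavior_specs helper from the ml-agents test utilities (mlagents.trainers.tests.mock_brain, imported as mb in most examples). For orientation only, here is a minimal sketch of what such a helper plausibly constructs, assuming the older BehaviorSpec(observation_shapes, action_type, action_shape) API from mlagents_envs.base_env; the function name and the 84x84x3 visual shape are illustrative assumptions, not the repository's actual implementation.

from mlagents_envs.base_env import ActionType, BehaviorSpec

def setup_test_behavior_specs_sketch(
    use_discrete=True, use_visual=False, vector_action_space=2, vector_obs_space=8
):
    # Optional camera-style observation plus a single vector observation.
    obs_shapes = ([(84, 84, 3)] if use_visual else []) + [(vector_obs_space,)]
    action_type = ActionType.DISCRETE if use_discrete else ActionType.CONTINUOUS
    # Discrete specs take a tuple of branch sizes; continuous specs take an int.
    action_shape = tuple(vector_action_space) if use_discrete else vector_action_space
    return BehaviorSpec(obs_shapes, action_type, action_shape)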
Code example #2
def test_normalization():
    behavior_spec = mb.setup_test_behavior_specs(use_discrete=True,
                                                 use_visual=False,
                                                 vector_action_space=[2],
                                                 vector_obs_space=1)

    time_horizon = 6
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1, )],
        action_space=[2],
    )
    # Change half of the obs to 0
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
    policy = NNPolicy(
        0,
        behavior_spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        False,
        "testdir",
        False,
    )

    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run([
        policy.normalization_steps, policy.running_mean,
        policy.running_variance
    ])

    assert steps == 6
    assert mean[0] == 0.5
    # Note: variance is divided by number of steps, and initialized to 1 to avoid
    # divide by 0. The right answer is 0.25
    assert (variance[0] - 1) / steps == 0.25

    # Make another update, this time with all 1's
    time_horizon = 10
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1, )],
        action_space=[2],
    )
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run([
        policy.normalization_steps, policy.running_mean,
        policy.running_variance
    ])

    assert steps == 16
    assert mean[0] == 0.8125
    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
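
The expected values asserted above follow from the data the test feeds in: make_fake_trajectory fills observations with 1s (per the comments in the test), three of the first six are overwritten with 0, and the second trajectory adds ten more 1s. A small standalone check of that arithmetic, independent of the policy's normalizer:

import numpy as np

# First trajectory: three 0s and three 1s -> mean 0.5, variance 0.25.
first = np.array([0, 0, 0, 1, 1, 1], dtype=np.float32)
assert first.mean() == 0.5 and first.var() == 0.25

# After ten more 1s, 13 of the 16 observations are 1.
combined = np.concatenate([first, np.ones(10, dtype=np.float32)])
assert combined.mean() == 0.8125  # 13 / 16
# For a 0/1 sequence the variance is p * (1 - p) = 0.8125 * 0.1875 ~= 0.152,
# matching the pytest.approx tolerance used in the test.
assert abs(combined.var() - 0.152) < 0.01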
Code example #3
def create_optimizer_mock(trainer_config, reward_signal_config, use_rnn,
                          use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
    )
    trainer_settings = trainer_config
    trainer_settings.reward_signals = reward_signal_config
    trainer_settings.network_settings.memory = (NetworkSettings.MemorySettings(
        sequence_length=16, memory_size=10) if use_rnn else None)
    policy = NNPolicy(0,
                      mock_specs,
                      trainer_settings,
                      False,
                      "test",
                      False,
                      create_tf_graph=False)
    if trainer_settings.trainer_type == TrainerType.SAC:
        optimizer = SACOptimizer(policy, trainer_settings)
    else:
        optimizer = PPOOptimizer(policy, trainer_settings)
    return optimizer
Code example #4
File: test_ppo.py Project: yirui-wang-0212/ml-agents
def test_trainer_update_policy(
        dummy_config,
        curiosity_dummy_config,
        use_discrete  # noqa: F811
):
    mock_brain = mb.setup_test_behavior_specs(
        use_discrete,
        False,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_params = dummy_config
    trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
        memory_size=10, sequence_length=16)

    # Test curiosity reward signal
    trainer_params.reward_signals = curiosity_dummy_config
    trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy("test", mock_brain)
    trainer.add_policy("test", policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]

    trainer.update_buffer = buffer
    trainer._update_policy()
Code example #5
def test_normalizer_after_load(tmp_path):
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    time_horizon = 6
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_spec=behavior_spec.action_spec,
    )
    # Change half of the obs to 0
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)

    trainer_params = TrainerSettings(network_settings=NetworkSettings(normalize=True))
    policy = TFPolicy(0, behavior_spec, trainer_params)

    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )

    assert steps == 6
    assert mean[0] == 0.5
    assert variance[0] / steps == pytest.approx(0.25, abs=0.01)
    # Save ckpt and load into another policy
    path1 = os.path.join(tmp_path, "runid1")
    model_saver = TFModelSaver(trainer_params, path1)
    model_saver.register(policy)
    mock_brain_name = "MockBrain"
    model_saver.save_checkpoint(mock_brain_name, 6)
    assert len(os.listdir(tmp_path)) > 0
    policy1 = TFPolicy(0, behavior_spec, trainer_params)
    model_saver = TFModelSaver(trainer_params, path1, load=True)
    model_saver.register(policy1)
    model_saver.initialize_or_load(policy1)

    # Make another update to new policy, this time with all 1's
    time_horizon = 10
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_spec=behavior_spec.action_spec,
    )
    trajectory_buffer = trajectory.to_agentbuffer()
    policy1.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy1.sess.run(
        [policy1.normalization_steps, policy1.running_mean, policy1.running_variance]
    )

    assert steps == 16
    assert mean[0] == 0.8125
    assert variance[0] / steps == pytest.approx(0.152, abs=0.01)
Code example #6
File: test_ghost.py Project: zereyak13/ml-agents
def test_load_and_set(dummy_config, use_discrete):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        False,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_params = dummy_config
    trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
    trainer.seed = 1
    policy = trainer.create_policy("test", mock_specs)
    trainer.seed = 20  # otherwise graphs are the same
    to_load_policy = trainer.create_policy("test", mock_specs)

    weights = policy.get_weights()
    load_weights = to_load_policy.get_weights()
    try:
        for w, lw in zip(weights, load_weights):
            np.testing.assert_array_equal(w, lw)
    except AssertionError:
        pass

    to_load_policy.load_weights(weights)
    load_weights = to_load_policy.get_weights()

    for w, lw in zip(weights, load_weights):
        np.testing.assert_array_equal(w, lw)
Code example #7
File: test_sac.py Project: vitoJackLove/ml-agents
def test_sac_save_load_buffer(tmpdir, dummy_config):
    mock_specs = mb.setup_test_behavior_specs(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    trainer_params = dummy_config
    trainer_params.hyperparameters.save_replay_buffer = True
    trainer = SACTrainer("test", 1, trainer_params, True, False, 0, "testdir")
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
    policy = trainer.create_policy(behavior_id, mock_specs)
    trainer.add_policy(behavior_id, policy)

    trainer.update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES,
                                                policy.behavior_spec)
    buffer_len = trainer.update_buffer.num_experiences
    trainer.save_model()

    # Wipe Trainer and try to load
    trainer2 = SACTrainer("test", 1, trainer_params, True, True, 0, "testdir")

    policy = trainer2.create_policy(behavior_id, mock_specs)
    trainer2.add_policy(behavior_id, policy)
    assert trainer2.update_buffer.num_experiences == buffer_len
Code example #8
File: test_nn_policy.py Project: junpilan/ml-agents
def create_policy_mock(
    dummy_config: TrainerSettings,
    use_rnn: bool = False,
    use_discrete: bool = True,
    use_visual: bool = False,
    model_path: str = "",
    load: bool = False,
    seed: int = 0,
) -> TFPolicy:
    mock_spec = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = dummy_config
    trainer_settings.keep_checkpoints = 3
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None
    )
    policy = TFPolicy(
        seed, mock_spec, trainer_settings, model_path=model_path, load=load
    )
    return policy
Code example #9
File: test_ppo.py Project: saleh9292/ml-agents
def test_trainer_update_policy(
        dummy_config,
        curiosity_dummy_config,
        use_discrete  # noqa: F811
):
    mock_behavior_spec = mb.setup_test_behavior_specs(
        use_discrete,
        False,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_params = dummy_config
    trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
        memory_size=10, sequence_length=16)

    # Test curiosity reward signal
    trainer_params.reward_signals = curiosity_dummy_config
    mock_brain_name = "MockBrain"
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(mock_brain_name)
    trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(behavior_id, mock_behavior_spec)
    trainer.add_policy(behavior_id, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_behavior_spec)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]
    # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
    if use_discrete:
        n_agents = len(buffer["discrete_log_probs"])
        buffer["discrete_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["discrete_log_probs"].append(
                np.ones(
                    int(sum(mock_behavior_spec.action_spec.discrete_branches)),
                    dtype=np.float32,
                ))
    else:
        n_agents = len(buffer["continuous_log_probs"])
        buffer["continuous_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["continuous_log_probs"].append(
                np.ones(mock_behavior_spec.action_spec.continuous_size,
                        dtype=np.float32))
    trainer.update_buffer = buffer
    trainer._update_policy()
Code example #10
File: test_ppo.py Project: rahzaazhar/ml-agents
def test_process_trajectory(dummy_config):
    behavior_spec = mb.setup_test_behavior_specs(
        True,
        False,
        vector_action_space=DISCRETE_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    mock_brain_name = "MockBrain"
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(mock_brain_name)
    trainer = PPOTrainer("test_brain", 0, dummy_config, True, False, 0, "0")
    policy = trainer.create_policy(behavior_id, behavior_spec)
    trainer.add_policy(behavior_id, policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        observation_shapes=behavior_spec.observation_shapes,
        max_step_complete=True,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that trainer put trajectory in update buffer
    assert trainer.update_buffer.num_experiences == 15

    # Check that GAE worked
    assert (
        "advantages" in trainer.update_buffer
        and "discounted_returns" in trainer.update_buffer
    )

    # Check that the stats are being collected as episode isn't complete
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent > 0

    # Add a terminal trajectory
    trajectory = make_fake_trajectory(
        length=time_horizon + 1,
        max_step_complete=False,
        observation_shapes=behavior_spec.observation_shapes,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that the stats are reset as episode is finished
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent == 0
    assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0
Code example #11
def test_process_trajectory(dummy_config):
    mock_specs = mb.setup_test_behavior_specs(True,
                                              False,
                                              vector_action_space=[2],
                                              vector_obs_space=1)
    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team0).brain_name

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(ppo_trainer, brain_name, controller, 0,
                           dummy_config, True, "0")

    # first policy encountered becomes policy trained by wrapped PPO
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team0)
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)
    trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
    trainer.subscribe_trajectory_queue(trajectory_queue0)

    # Ghost trainer should ignore this queue because it is off-policy
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)
    trajectory_queue1 = AgentManagerQueue(behavior_id_team1)
    trainer.subscribe_trajectory_queue(trajectory_queue1)

    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1, )],
        action_space=[2],
    )
    trajectory_queue0.put(trajectory)
    trainer.advance()

    # Check that trainer put trajectory in update buffer
    assert trainer.trainer.update_buffer.num_experiences == 15

    trajectory_queue1.put(trajectory)
    trainer.advance()

    # Check that ghost trainer ignored off policy queue
    assert trainer.trainer.update_buffer.num_experiences == 15
    # Check that it emptied the queue
    assert trajectory_queue1.empty()
Code example #12
File: test_sac.py Project: donlee90/ml-agents
def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
    mock_brain = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
    )
    trainer_settings = dummy_config
    trainer_settings.network_settings.memory = (NetworkSettings.MemorySettings(
        sequence_length=16, memory_size=12) if use_rnn else None)
    policy = TorchPolicy(0, mock_brain, trainer_settings)
    optimizer = TorchSACOptimizer(policy, trainer_settings)
    return optimizer
Code example #13
def create_test_ppo_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = attr.evolve(dummy_config)
    trainer_settings.network_settings.memory = (NetworkSettings.MemorySettings(
        sequence_length=16, memory_size=10) if use_rnn else None)
    policy = TorchPolicy(0, mock_specs, trainer_settings, "test", False)
    optimizer = TorchPPOOptimizer(policy, trainer_settings)
    return optimizer
Code example #14
def test_resume(dummy_config, tmp_path):
    mock_specs = mb.setup_test_behavior_specs(True,
                                              False,
                                              vector_action_space=[2],
                                              vector_obs_space=1)
    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team0).brain_name
    tmp_path = tmp_path.as_posix()
    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0,
                             tmp_path)
    controller = GhostController(100)
    trainer = GhostTrainer(ppo_trainer, brain_name, controller, 0,
                           dummy_config, True, tmp_path)

    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team0)
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)

    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)

    trainer.save_model()

    # Make a new trainer, check that the policies are the same
    ppo_trainer2 = PPOTrainer(brain_name, 0, dummy_config, True, True, 0,
                              tmp_path)
    trainer2 = GhostTrainer(ppo_trainer2, brain_name, controller, 0,
                            dummy_config, True, tmp_path)
    policy = trainer2.create_policy(parsed_behavior_id0, mock_specs)
    trainer2.add_policy(parsed_behavior_id0, policy)

    policy = trainer2.create_policy(parsed_behavior_id1, mock_specs)
    trainer2.add_policy(parsed_behavior_id1, policy)

    trainer1_policy = trainer.get_policy(parsed_behavior_id1.behavior_id)
    trainer2_policy = trainer2.get_policy(parsed_behavior_id1.behavior_id)
    weights = trainer1_policy.get_weights()
    weights2 = trainer2_policy.get_weights()

    for w, lw in zip(weights, weights2):
        np.testing.assert_array_equal(w, lw)
Code example #15
File: test_trainers.py Project: donlee90/ml-agents
def test_sac_trainer_update_normalization(sac_config):
    behavior_id_team0 = "test_brain?team=0"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team0).brain_name
    mock_specs = mb.setup_test_behavior_specs(True,
                                              False,
                                              vector_action_space=[2],
                                              vector_obs_space=1)
    base_config = sac_config.behaviors
    output_path = "results_dir"
    train_model = True
    load_model = False
    seed = 42
    trainer_factory = TrainerFactory(
        trainer_config=base_config,
        output_path=output_path,
        train_model=train_model,
        load_model=load_model,
        seed=seed,
        param_manager=EnvironmentParameterManager(),
    )
    sac_trainer = trainer_factory.generate(brain_name)
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team0)
    policy = sac_trainer.create_policy(parsed_behavior_id0, mock_specs)
    sac_trainer.add_policy(parsed_behavior_id0, policy)
    trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
    sac_trainer.subscribe_trajectory_queue(trajectory_queue0)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_specs=create_observation_specs_with_shapes([(1, )]),
        action_spec=mock_specs.action_spec,
    )
    trajectory_queue0.put(trajectory)
    # mocking out update_normalization in both the policy and critic
    with patch(
            "mlagents.trainers.torch.networks.ValueNetwork.update_normalization"
    ) as optimizer_update_normalization_mock, patch(
            "mlagents.trainers.policy.torch_policy.TorchPolicy.update_normalization"
    ) as policy_update_normalization_mock:
        sac_trainer.advance()
        optimizer_update_normalization_mock.assert_called_once()
        policy_update_normalization_mock.assert_called_once()
Code example #16
def test_step_overflow():
    behavior_spec = mb.setup_test_behavior_specs(use_discrete=True,
                                                 use_visual=False,
                                                 vector_action_space=[2],
                                                 vector_obs_space=1)

    policy = TFPolicy(
        0,
        behavior_spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        create_tf_graph=False,
    )
    policy.create_input_placeholders()
    policy.initialize()

    policy.set_step(2**31 - 1)
    assert policy.get_current_step() == 2**31 - 1
    policy.increment_step(3)
    assert policy.get_current_step() == 2**31 + 2
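
The boundary exercised here is the signed 32-bit maximum: a step counter kept in an int32 would wrap once incremented past 2**31 - 1. A minimal numpy sketch of that hazard (the test itself only asserts that the policy keeps counting past the boundary):

import numpy as np

INT32_MAX = 2**31 - 1  # 2147483647

# A 32-bit counter wraps around (modulo 2**32) when pushed past the maximum...
wrapped = np.array([INT32_MAX], dtype=np.int32) + np.int32(3)
assert wrapped[0] < 0

# ...while a 64-bit counter keeps going, matching the value 2**31 + 2
# asserted in test_step_overflow above.
ok = np.array([INT32_MAX], dtype=np.int64) + 3
assert ok[0] == 2**31 + 2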
Code example #17
def create_test_poca_optimizer(dummy_config, use_rnn, use_discrete,
                               use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = attr.evolve(dummy_config)
    trainer_settings.reward_signals = {
        RewardSignalType.EXTRINSIC: RewardSignalSettings(strength=1.0,
                                                         gamma=0.99)
    }

    trainer_settings.network_settings.memory = (NetworkSettings.MemorySettings(
        sequence_length=8, memory_size=10) if use_rnn else None)
    policy = TorchPolicy(0, mock_specs, trainer_settings, "test", False)
    optimizer = TorchPOCAOptimizer(policy, trainer_settings)
    return optimizer
Code example #18
def _create_ppo_optimizer_ops_mock(dummy_config, use_rnn, use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = attr.evolve(dummy_config, framework=FrameworkType.TENSORFLOW)
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = TFPolicy(
        0, mock_specs, trainer_settings, "test", False, create_tf_graph=False
    )
    optimizer = PPOOptimizer(policy, trainer_settings)
    policy.initialize()
    return optimizer
Code example #19
File: test_nn_policy.py Project: junpilan/ml-agents
def test_large_normalization():
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    # Taken from Walker seed 3713 which causes NaN without proper initialization
    large_obs1 = [
        1800.00036621,
        1799.96972656,
        1800.01245117,
        1800.07214355,
        1800.02758789,
        1799.98303223,
        1799.88647461,
        1799.89575195,
        1800.03479004,
        1800.14025879,
        1800.17675781,
        1800.20581055,
        1800.33740234,
        1800.36450195,
        1800.43457031,
        1800.45544434,
        1800.44604492,
        1800.56713867,
        1800.73901367,
    ]
    large_obs2 = [
        1799.99975586,
        1799.96679688,
        1799.92980957,
        1799.89550781,
        1799.93774414,
        1799.95300293,
        1799.94067383,
        1799.92993164,
        1799.84057617,
        1799.69873047,
        1799.70605469,
        1799.82849121,
        1799.85095215,
        1799.76977539,
        1799.78283691,
        1799.76708984,
        1799.67163086,
        1799.59191895,
        1799.5135498,
        1799.45556641,
        1799.3717041,
    ]
    policy = TFPolicy(
        0,
        behavior_spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        "testdir",
        False,
    )
    time_horizon = len(large_obs1)
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    for i in range(time_horizon):
        trajectory.steps[i].obs[0] = np.array([large_obs1[i]], dtype=np.float32)
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )
    assert mean[0] == pytest.approx(np.mean(large_obs1, dtype=np.float32), abs=0.01)
    assert variance[0] / steps == pytest.approx(
        np.var(large_obs1, dtype=np.float32), abs=0.01
    )

    time_horizon = len(large_obs2)
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    for i in range(time_horizon):
        trajectory.steps[i].obs[0] = np.array([large_obs2[i]], dtype=np.float32)

    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )

    assert mean[0] == pytest.approx(
        np.mean(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
    )
    assert variance[0] / steps == pytest.approx(
        np.var(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
    )
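
A note on why observations around 1800 get their own test: the comment above points at a Walker run that produced NaNs without proper initialization. The exact failure mode is not reproduced below, but the sketch illustrates how fragile float32 variance arithmetic is at this scale, where the naive one-pass formula E[x^2] - E[x]^2 cancels away nearly all significant digits and can even go negative (making a later sqrt produce NaN). The sample values are synthetic, not the actual Walker observations:

import numpy as np

# Values near 1800 with a small spread, similar in scale to large_obs1/large_obs2.
obs = (1800.0 + np.linspace(-0.3, 0.3, 19)).astype(np.float32)

# Naive one-pass variance in float32: both terms are ~3.24e6, so the subtraction
# loses almost all precision and the result is unreliable.
naive_var = np.mean(obs * obs) - np.mean(obs) ** 2
print("naive float32 variance:", naive_var)

# A two-pass computation in higher precision stays well behaved (~0.03).
print("np.var in float64:", np.var(obs, dtype=np.float64))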
Code example #20
File: test_ghost.py Project: zereyak13/ml-agents
def test_publish_queue(dummy_config):
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[1], vector_obs_space=8
    )

    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"

    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)

    brain_name = parsed_behavior_id0.brain_name

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # First policy encountered becomes policy trained by wrapped PPO
    # This queue should remain empty after swap snapshot
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)
    policy_queue0 = AgentManagerQueue(behavior_id_team0)
    trainer.publish_policy_queue(policy_queue0)

    # Ghost trainer should use this queue for ghost policy swap
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)
    policy_queue1 = AgentManagerQueue(behavior_id_team1)
    trainer.publish_policy_queue(policy_queue1)

    # check ghost trainer swap pushes to ghost queue and not trainer
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer._swap_snapshots()
    assert policy_queue0.empty() and not policy_queue1.empty()
    # clear
    policy_queue1.get_nowait()

    mock_specs = mb.setup_test_behavior_specs(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_specs)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]
    trainer.trainer.update_buffer = buffer

    # When the ghost trainer advances and the wrapped trainer's buffer is full,
    # the wrapped trainer pushes the updated policy to the correct queue
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer.advance()
    assert not policy_queue0.empty() and policy_queue1.empty()
Code example #21
File: test_ghost.py Project: SancySwachitha/Drone
def test_publish_queue(dummy_config):
    mock_specs = mb.setup_test_behavior_specs(True,
                                              False,
                                              vector_action_space=[1],
                                              vector_obs_space=8)

    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"

    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team0)

    brain_name = parsed_behavior_id0.brain_name

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(ppo_trainer, brain_name, controller, 0,
                           dummy_config, True, "0")

    # First policy encountered becomes policy trained by wrapped PPO
    # This queue should remain empty after swap snapshot
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)
    policy_queue0 = AgentManagerQueue(behavior_id_team0)
    trainer.publish_policy_queue(policy_queue0)

    # Ghost trainer should use this queue for ghost policy swap
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)
    policy_queue1 = AgentManagerQueue(behavior_id_team1)
    trainer.publish_policy_queue(policy_queue1)

    # check ghost trainer swap pushes to ghost queue and not trainer
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer._swap_snapshots()
    assert policy_queue0.empty() and not policy_queue1.empty()
    # clear
    policy_queue1.get_nowait()

    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_specs)
    # Mock out reward signal eval
    copy_buffer_fields(
        buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.rewards_key("extrinsic"),
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.rewards_key("curiosity"),
            RewardSignalUtil.returns_key("curiosity"),
            RewardSignalUtil.value_estimates_key("curiosity"),
        ],
    )

    trainer.trainer.update_buffer = buffer

    # When the ghost trainer advances and the wrapped trainer's buffer is full,
    # the wrapped trainer pushes the updated policy to the correct queue
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer.advance()
    assert not policy_queue0.empty() and policy_queue1.empty()
Code example #22
File: test_sac.py Project: vitoJackLove/ml-agents
def test_advance(dummy_config):
    specs = setup_test_behavior_specs(use_discrete=False,
                                      use_visual=False,
                                      vector_action_space=2)
    dummy_config.hyperparameters.steps_per_update = 20
    dummy_config.hyperparameters.reward_signal_steps_per_update = 20
    dummy_config.hyperparameters.buffer_init_steps = 0
    trainer = SACTrainer("test", 0, dummy_config, True, False, 0, "0")
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
    policy = trainer.create_policy(behavior_id, specs)
    trainer.add_policy(behavior_id, policy)

    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)

    trajectory = make_fake_trajectory(
        length=15,
        observation_shapes=specs.observation_shapes,
        max_step_complete=True,
        action_space=2,
        is_discrete=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that trainer put trajectory in update buffer
    assert trainer.update_buffer.num_experiences == 15

    # Check that the stats are being collected as episode isn't complete
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent > 0

    # Add a terminal trajectory
    trajectory = make_fake_trajectory(
        length=6,
        observation_shapes=specs.observation_shapes,
        max_step_complete=False,
        action_space=2,
        is_discrete=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that the stats are reset as episode is finished
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent == 0
    assert trainer.stats_reporter.get_stats_summaries(
        "Policy/Extrinsic Reward").num > 0
    # Assert we're not just using the default values
    assert (trainer.stats_reporter.get_stats_summaries(
        "Policy/Extrinsic Reward").mean > 0)

    # Make sure there is a policy on the queue
    policy_queue.get_nowait()

    # Add another trajectory. Since this is less than 20 steps total (enough for
    # two updates), there should NOT be a policy on the queue.
    trajectory = make_fake_trajectory(
        length=5,
        observation_shapes=specs.observation_shapes,
        max_step_complete=False,
        action_space=2,
        is_discrete=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()
    with pytest.raises(AgentManagerQueue.Empty):
        policy_queue.get_nowait()

    # Call add_policy and check that we update the correct number of times.
    # This is to emulate a load from checkpoint.
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
    policy = trainer.create_policy(behavior_id, specs)
    policy.get_current_step = lambda: 200
    trainer.add_policy(behavior_id, policy)
    trainer.saver.initialize_or_load(policy)
    trainer.optimizer.update = mock.Mock()
    trainer.optimizer.update_reward_signals = mock.Mock()
    trainer.optimizer.update_reward_signals.return_value = {}
    trainer.optimizer.update.return_value = {}
    trajectory_queue.put(trajectory)
    trainer.advance()
    # Make sure we did exactly 1 update
    assert trainer.optimizer.update.call_count == 1
    assert trainer.optimizer.update_reward_signals.call_count == 1