Example #1
def test_update_buffer_append():
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=create_observation_specs_with_shapes([(1, )]),
        max_step_complete=True,
        action_spec=ActionSpec.create_discrete((2, )),
    )
    agentbuffer_trajectory = trajectory.to_agentbuffer()
    assert trainer.update_buffer.num_experiences == 0

    # Check that if we append, our update buffer gets longer.
    # The fake trainer's max_steps is 100, so 10 trajectories of length 10 reach it.
    for i in range(10):
        trainer._process_trajectory(trajectory)
        trainer._append_to_update_buffer(agentbuffer_trajectory)
        assert trainer.update_buffer.num_experiences == (i + 1) * time_horizon

    # Check that if we append after stopping training, nothing happens.
    # We process enough trajectories to hit max steps
    trainer.set_is_policy_updating(False)
    trainer._process_trajectory(trajectory)
    trainer._append_to_update_buffer(agentbuffer_trajectory)
    assert trainer.update_buffer.num_experiences == (i + 1) * time_horizon
Example #2
def test_process_trajectory(dummy_config):
    brain_params_team0 = BrainParameters(
        brain_name="test_brain?team=0",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )

    brain_name = BehaviorIdentifiers.from_name_behavior_id(
        brain_params_team0.brain_name).brain_name

    brain_params_team1 = BrainParameters(
        brain_name="test_brain?team=1",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0",
                             False)
    trainer = GhostTrainer(ppo_trainer, brain_name, 0, dummy_config, True, "0")

    # first policy encountered becomes policy trained by wrapped PPO
    policy = trainer.create_policy(brain_params_team0)
    trainer.add_policy(brain_params_team0.brain_name, policy)
    trajectory_queue0 = AgentManagerQueue(brain_params_team0.brain_name)
    trainer.subscribe_trajectory_queue(trajectory_queue0)

    # Ghost trainer should ignore this queue because it is off-policy
    policy = trainer.create_policy(brain_params_team1)
    trainer.add_policy(brain_params_team1.brain_name, policy)
    trajectory_queue1 = AgentManagerQueue(brain_params_team1.brain_name)
    trainer.subscribe_trajectory_queue(trajectory_queue1)

    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    trajectory_queue0.put(trajectory)
    trainer.advance()

    # Check that trainer put trajectory in update buffer
    assert trainer.trainer.update_buffer.num_experiences == 15

    trajectory_queue1.put(trajectory)
    trainer.advance()

    # Check that ghost trainer ignored off policy queue
    assert trainer.trainer.update_buffer.num_experiences == 15
    # Check that it emptied the queue
    assert trajectory_queue1.empty()
Example #3
def test_summary_checkpoint(mock_add_checkpoint, mock_write_summary):
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    summary_freq = trainer.trainer_settings.summary_freq
    checkpoint_interval = trainer.trainer_settings.checkpoint_interval
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=create_observation_specs_with_shapes([(1, )]),
        max_step_complete=True,
        action_spec=ActionSpec.create_discrete((2, )),
    )
    # Feed in enough trajectories to cross the summary and checkpoint intervals
    num_trajectories = 5
    for _ in range(0, num_trajectories):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that we have called write_summary the appropriate number of times
    calls = [
        mock.call(step) for step in range(summary_freq, num_trajectories *
                                          time_horizon, summary_freq)
    ]
    mock_write_summary.assert_has_calls(calls, any_order=True)

    checkpoint_range = range(checkpoint_interval,
                             num_trajectories * time_horizon,
                             checkpoint_interval)
    calls = [mock.call(trainer.brain_name, step) for step in checkpoint_range]

    trainer.model_saver.save_checkpoint.assert_has_calls(calls, any_order=True)
    export_ext = "onnx"

    add_checkpoint_calls = [
        mock.call(
            trainer.brain_name,
            ModelCheckpoint(
                step,
                f"{trainer.model_saver.model_path}{os.path.sep}{trainer.brain_name}-{step}.{export_ext}",
                None,
                mock.ANY,
                [
                    f"{trainer.model_saver.model_path}{os.path.sep}{trainer.brain_name}-{step}.pt"
                ],
            ),
            trainer.trainer_settings.keep_checkpoints,
        ) for step in checkpoint_range
    ]
    mock_add_checkpoint.assert_has_calls(add_checkpoint_calls)
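The expected checkpoint paths in Example #3 are assembled by hand with os.path.sep. A minimal sanity check, using hypothetical values rather than the test fixtures, that os.path.join builds the same string:

import os

# Hypothetical values standing in for the trainer's model_path and brain name.
model_path = "results/run"
brain_name, step, export_ext = "TestBrain", 10, "onnx"

manual = f"{model_path}{os.path.sep}{brain_name}-{step}.{export_ext}"
joined = os.path.join(model_path, f"{brain_name}-{step}.{export_ext}")
assert manual == joined  # holds as long as model_path has no trailing separator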
Example #4
def test_summary_checkpoint(mock_add_checkpoint, mock_write_summary):
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    mock_policy.model_path = "mock_model_path"
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    summary_freq = trainer.trainer_settings.summary_freq
    checkpoint_interval = trainer.trainer_settings.checkpoint_interval
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_shapes=[(1, )],
        max_step_complete=True,
        action_space=[2],
    )
    # Feed in enough trajectories to cross the summary and checkpoint intervals
    num_trajectories = 5
    for _ in range(0, num_trajectories):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that we have called write_summary the appropriate number of times
    calls = [
        mock.call(step) for step in range(summary_freq, num_trajectories *
                                          time_horizon, summary_freq)
    ]
    mock_write_summary.assert_has_calls(calls, any_order=True)

    checkpoint_range = range(checkpoint_interval,
                             num_trajectories * time_horizon,
                             checkpoint_interval)
    calls = [
        mock.call(f"{mock_policy.model_path}/{trainer.brain_name}-{step}",
                  mock.ANY) for step in checkpoint_range
    ]
    mock_policy.checkpoint.assert_has_calls(calls, any_order=True)

    add_checkpoint_calls = [
        mock.call(
            trainer.brain_name,
            NNCheckpoint(
                step,
                f"{mock_policy.model_path}/{trainer.brain_name}-{step}.nn",
                None,
                mock.ANY,
            ),
            trainer.trainer_settings.keep_checkpoints,
        ) for step in checkpoint_range
    ]
    mock_add_checkpoint.assert_has_calls(add_checkpoint_calls)
Example #5
def test_process_trajectory(dummy_config):
    mock_specs = mb.setup_test_behavior_specs(True,
                                              False,
                                              vector_action_space=[2],
                                              vector_obs_space=1)
    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team0).brain_name

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(ppo_trainer, brain_name, controller, 0,
                           dummy_config, True, "0")

    # first policy encountered becomes policy trained by wrapped PPO
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team0)
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)
    trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
    trainer.subscribe_trajectory_queue(trajectory_queue0)

    # Ghost trainer should ignore this queue because it is off-policy
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)
    trajectory_queue1 = AgentManagerQueue(behavior_id_team1)
    trainer.subscribe_trajectory_queue(trajectory_queue1)

    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1, )],
        action_space=[2],
    )
    trajectory_queue0.put(trajectory)
    trainer.advance()

    # Check that trainer put trajectory in update buffer
    assert trainer.trainer.update_buffer.num_experiences == 15

    trajectory_queue1.put(trajectory)
    trainer.advance()

    # Check that ghost trainer ignored off policy queue
    assert trainer.trainer.update_buffer.num_experiences == 15
    # Check that it emptied the queue
    assert trajectory_queue1.empty()
Example #6
    def publish_policy_queue(self,
                             policy_queue: AgentManagerQueue[Policy]) -> None:
        """
        Adds a policy queue for every member of the team to the list of queues to publish to when this Trainer
        makes a policy update.  Creates an internal policy queue for the wrapped
        trainer to push to.  The GhostTrainer pushes all policies to the env.
        :param policy_queue: Policy queue to publish to.
        """
        super().publish_policy_queue(policy_queue)
        parsed_behavior_id = self._name_to_parsed_behavior_id[
            policy_queue.behavior_id]
        try:
            self._team_to_name_to_policy_queue[parsed_behavior_id.team_id][
                parsed_behavior_id.brain_name] = policy_queue
        except KeyError:
            self._team_to_name_to_policy_queue[parsed_behavior_id.team_id] = {
                parsed_behavior_id.brain_name: policy_queue
            }
        if parsed_behavior_id.team_id == self.wrapped_trainer_team:
            # With a future multiagent trainer, this will be indexed by 'role'
            internal_policy_queue: AgentManagerQueue[
                Policy] = AgentManagerQueue(parsed_behavior_id.brain_name)

            self._internal_policy_queues[
                parsed_behavior_id.brain_name] = internal_policy_queue
            self.trainer.publish_policy_queue(internal_policy_queue)
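The try/except KeyError above builds a nested team_id -> brain_name -> queue mapping on demand. A minimal sketch of the same routing pattern using collections.defaultdict; plain dict and queue.Queue stand in for the ML-Agents types, and the names are illustrative, not part of the ML-Agents API:

from collections import defaultdict
from queue import Queue
from typing import Dict

# team_id -> brain_name -> policy queue; defaultdict creates the inner dict
# on first access, which is what the try/except KeyError achieves above.
team_to_name_to_queue: Dict[int, Dict[str, Queue]] = defaultdict(dict)

def register_policy_queue(team_id: int, brain_name: str, policy_queue: Queue) -> None:
    team_to_name_to_queue[team_id][brain_name] = policy_queue

register_policy_queue(0, "test_brain", Queue())
register_policy_queue(1, "test_brain", Queue())
assert set(team_to_name_to_queue) == {0, 1}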
Example #7
def test_advance(mocked_clear_update_buffer):
    trainer = create_rl_trainer()
    trajectory_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    time_horizon = 15
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)

    trainer.advance()
    # Check that get_step is correct
    assert trainer.get_step == time_horizon
    # Check that we can turn off the trainer and that the buffer is cleared
    for _ in range(0, 10):
        trajectory_queue.put(trajectory)
        trainer.advance()

    # Check that the buffer has been cleared
    assert not trainer.should_still_train
    assert mocked_clear_update_buffer.call_count > 0
Example #8
def test_agent_manager_queue():
    queue = AgentManagerQueue(behavior_id="testbehavior")
    trajectory = mock.Mock(spec=Trajectory)
    assert queue.empty()
    queue.put(trajectory)
    assert not queue.empty()
    queue_traj = queue.get_nowait()
    assert isinstance(queue_traj, Trajectory)
    assert queue.empty()
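For orientation, the put/get_nowait/empty contract that the test above (and the pytest.raises(AgentManagerQueue.Empty) checks in later examples) relies on can be modeled with a toy wrapper around queue.Queue. This is an illustrative stand-in, not the ML-Agents implementation:

import queue

class ToyAgentQueue:
    """Toy model of the queue behavior exercised by the tests."""

    Empty = queue.Empty  # raised by get_nowait() when nothing is queued

    def __init__(self, behavior_id: str):
        self.behavior_id = behavior_id
        self._queue: queue.Queue = queue.Queue()

    def put(self, item) -> None:
        self._queue.put(item)

    def get_nowait(self):
        return self._queue.get_nowait()

    def empty(self) -> bool:
        return self._queue.empty()

q = ToyAgentQueue("testbehavior")
assert q.empty()
q.put("trajectory")
assert q.get_nowait() == "trajectory"
try:
    q.get_nowait()
except ToyAgentQueue.Empty:
    pass  # an empty queue raises, matching the pytest.raises checks elsewhere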
Example #9
def test_process_trajectory(dummy_config):
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
    policy = trainer.create_policy(brain_params)
    trainer.add_policy(brain_params.brain_name, policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that trainer put trajectory in update buffer
    assert trainer.update_buffer.num_experiences == 15

    # Check that GAE worked
    assert (
        "advantages" in trainer.update_buffer
        and "discounted_returns" in trainer.update_buffer
    )

    # Check that the stats are being collected as episode isn't complete
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent > 0

    # Add a terminal trajectory
    trajectory = make_fake_trajectory(
        length=time_horizon + 1,
        max_step_complete=False,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that the stats are reset as episode is finished
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent == 0
    assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0
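The "Check that GAE worked" assertion above only verifies that the advantages and discounted_returns fields were written. For reference, a minimal sketch of generalized advantage estimation as commonly defined (a lambda-weighted sum of TD errors); this is an orientation aid, not the PPO trainer's actual code, and the hyperparameter values are placeholders:

from typing import List

def gae_advantages(rewards: List[float], values: List[float],
                   bootstrap_value: float, gamma: float = 0.99,
                   lam: float = 0.95) -> List[float]:
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    # A_t = delta_t + (gamma * lam) * A_{t+1}, computed backwards in time
    values = values + [bootstrap_value]
    advantages = [0.0] * len(rewards)
    running = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        running = delta + gamma * lam * running
        advantages[t] = running
    return advantages

adv = gae_advantages([1.0, 1.0, 1.0], [0.5, 0.5, 0.5], bootstrap_value=0.5)
# Returns used as regression targets are advantages plus value estimates.
returns = [a + v for a, v in zip(adv, [0.5, 0.5, 0.5])]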
Example #10
def test_advance(mocked_clear_update_buffer, mocked_save_model):
    trainer = create_rl_trainer()
    mock_policy = mock.Mock()
    mock_policy.model_path = "mock_model_path"
    trainer.add_policy("TestBrain", mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_shapes=[(1, )],
        max_step_complete=True,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)

    trainer.advance()
    policy_queue.get_nowait()
    # Check that get_step is correct
    assert trainer.get_step == time_horizon
    # Check that we can turn off the trainer and that the buffer is cleared
    for _ in range(0, 5):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that if the policy doesn't update, we don't push it to the queue
    trainer.set_is_policy_updating(False)
    for _ in range(0, 10):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is nothing in the policy queue
        with pytest.raises(AgentManagerQueue.Empty):
            policy_queue.get_nowait()

    # Check that the buffer has been cleared
    assert not trainer.should_still_train
    assert mocked_clear_update_buffer.call_count > 0
    assert mocked_save_model.call_count == 0
Example #11
def test_process_trajectory(dummy_config):
    behavior_spec = mb.setup_test_behavior_specs(
        True,
        False,
        vector_action_space=DISCRETE_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    mock_brain_name = "MockBrain"
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(mock_brain_name)
    trainer = PPOTrainer("test_brain", 0, dummy_config, True, False, 0, "0")
    policy = trainer.create_policy(behavior_id, behavior_spec)
    trainer.add_policy(behavior_id, policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        observation_shapes=behavior_spec.observation_shapes,
        max_step_complete=True,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that trainer put trajectory in update buffer
    assert trainer.update_buffer.num_experiences == 15

    # Check that GAE worked
    assert (
        "advantages" in trainer.update_buffer
        and "discounted_returns" in trainer.update_buffer
    )

    # Check that the stats are being collected as episode isn't complete
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent > 0

    # Add a terminal trajectory
    trajectory = make_fake_trajectory(
        length=time_horizon + 1,
        max_step_complete=False,
        observation_shapes=behavior_spec.observation_shapes,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that the stats are reset as episode is finished
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent == 0
    assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0
Example #12
def test_poca_end_episode():
    name_behavior_id = "test_trainer"
    trainer = POCATrainer(
        name_behavior_id,
        10,
        TrainerSettings(max_steps=100, checkpoint_interval=10,
                        summary_freq=20),
        True,
        False,
        0,
        "mock_model_path",
    )
    behavior_spec = BehaviorSpec(create_observation_specs_with_shapes([(1, )]),
                                 ActionSpec.create_discrete((2, )))
    parsed_behavior_id = BehaviorIdentifiers.from_name_behavior_id(
        name_behavior_id)
    mock_policy = trainer.create_policy(parsed_behavior_id, behavior_spec)
    trainer.add_policy(parsed_behavior_id, mock_policy)
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        observation_specs=behavior_spec.observation_specs,
        max_step_complete=False,
        action_spec=behavior_spec.action_spec,
        num_other_agents_in_group=2,
        group_reward=1.0,
        is_terminal=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()
    # Test that some trajectories have been ingested
    for reward in trainer.collected_group_rewards.values():
        assert reward == 10
    # Test end episode
    trainer.end_episode()
    assert len(trainer.collected_group_rewards.keys()) == 0
Example #13
def test_process_trajectory(dummy_config):
    brain_params = make_brain_parameters(discrete_action=False,
                                         visual_inputs=0,
                                         vec_obs_size=6)
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
    policy = trainer.create_policy(brain_params.brain_name, brain_params)
    trainer.add_policy(brain_params.brain_name, policy)

    trajectory_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)

    trajectory = make_fake_trajectory(
        length=15,
        max_step_complete=True,
        vec_obs_size=6,
        num_vis_obs=0,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that trainer put trajectory in update buffer
    assert trainer.update_buffer.num_experiences == 15

    # Check that the stats are being collected as episode isn't complete
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent > 0

    # Add a terminal trajectory
    trajectory = make_fake_trajectory(
        length=15,
        max_step_complete=False,
        vec_obs_size=6,
        num_vis_obs=0,
        action_space=[2],
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that the stats are reset as episode is finished
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent == 0
    assert trainer.stats_reporter.get_stats_summaries(
        "Policy/Extrinsic Reward").num > 0
    # Assert we're not just using the default values
    assert (trainer.stats_reporter.get_stats_summaries(
        "Policy/Extrinsic Reward").mean > 0)
Example #14
def test_summary_checkpoint(mock_write_summary, mock_save_model):
    trainer = create_rl_trainer()
    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)
    time_horizon = 10
    summary_freq = trainer.trainer_settings.summary_freq
    checkpoint_interval = trainer.trainer_settings.checkpoint_interval
    trajectory = mb.make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    # Feed in enough trajectories to cross the summary and checkpoint intervals
    num_trajectories = 5
    for _ in range(0, num_trajectories):
        trajectory_queue.put(trajectory)
        trainer.advance()
        # Check that there is stuff in the policy queue
        policy_queue.get_nowait()

    # Check that we have called write_summary the appropriate number of times
    calls = [
        mock.call(step) for step in range(summary_freq, num_trajectories *
                                          time_horizon, summary_freq)
    ]
    mock_write_summary.assert_has_calls(calls, any_order=True)

    calls = [
        mock.call(trainer.brain_name)
        for step in range(checkpoint_interval, num_trajectories *
                          time_horizon, checkpoint_interval)
    ]
    mock_save_model.assert_has_calls(calls, any_order=True)
Example #15
    def subscribe_trajectory_queue(
            self, trajectory_queue: AgentManagerQueue[Trajectory]) -> None:
        """
        Adds a trajectory queue to the list of queues for the trainer to ingest Trajectories from.
        :param trajectory_queue: Trajectory queue to read Trajectories from.
        """

        if trajectory_queue.behavior_id == self.learning_behavior_name:
            super().subscribe_trajectory_queue(trajectory_queue)

            internal_trajectory_queue: AgentManagerQueue[
                Trajectory] = AgentManagerQueue(trajectory_queue.behavior_id)

            self.internal_trajectory_queues.append(internal_trajectory_queue)
            self.trainer.subscribe_trajectory_queue(internal_trajectory_queue)
Example #16
    def publish_policy_queue(self, policy_queue: AgentManagerQueue[Policy]) -> None:
        """
        Adds a policy queue to the list of queues to publish to when this Trainer
        makes a policy update
        :param policy_queue: Policy queue to publish to.
        """
        super().publish_policy_queue(policy_queue)
        if policy_queue.behavior_id == self.learning_behavior_name:

            internal_policy_queue: AgentManagerQueue[Policy] = AgentManagerQueue(
                policy_queue.behavior_id
            )

            self.internal_policy_queues.append(internal_policy_queue)
            self.learning_policy_queues[policy_queue.behavior_id] = policy_queue
            self.trainer.publish_policy_queue(internal_policy_queue)
Example #17
def test_sac_trainer_update_normalization(sac_config):
    behavior_id_team0 = "test_brain?team=0"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team0).brain_name
    mock_specs = mb.setup_test_behavior_specs(True,
                                              False,
                                              vector_action_space=[2],
                                              vector_obs_space=1)
    base_config = sac_config.behaviors
    output_path = "results_dir"
    train_model = True
    load_model = False
    seed = 42
    trainer_factory = TrainerFactory(
        trainer_config=base_config,
        output_path=output_path,
        train_model=train_model,
        load_model=load_model,
        seed=seed,
        param_manager=EnvironmentParameterManager(),
    )
    sac_trainer = trainer_factory.generate(brain_name)
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team0)
    policy = sac_trainer.create_policy(parsed_behavior_id0, mock_specs)
    sac_trainer.add_policy(parsed_behavior_id0, policy)
    trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
    sac_trainer.subscribe_trajectory_queue(trajectory_queue0)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_specs=create_observation_specs_with_shapes([(1, )]),
        action_spec=mock_specs.action_spec,
    )
    trajectory_queue0.put(trajectory)
    # mocking out update_normalization in both the policy and critic
    with patch(
            "mlagents.trainers.torch.networks.ValueNetwork.update_normalization"
    ) as optimizer_update_normalization_mock, patch(
            "mlagents.trainers.policy.torch_policy.TorchPolicy.update_normalization"
    ) as policy_update_normalization_mock:
        sac_trainer.advance()
        optimizer_update_normalization_mock.assert_called_once()
        policy_update_normalization_mock.assert_called_once()
Example #18
    def subscribe_trajectory_queue(
            self, trajectory_queue: AgentManagerQueue[Trajectory]) -> None:
        """
        Adds a trajectory queue for every member of the team to the list of queues for the trainer
        to ingest Trajectories from. Creates an internal trajectory queue to push trajectories from
        the learning team.  The wrapped trainer subscribes to this queue.
        :param trajectory_queue: Trajectory queue to read Trajectories from.
        """
        super().subscribe_trajectory_queue(trajectory_queue)
        parsed_behavior_id = self._name_to_parsed_behavior_id[
            trajectory_queue.behavior_id]
        if parsed_behavior_id.team_id == self.wrapped_trainer_team:
            # With a future multiagent trainer, this will be indexed by 'role'
            internal_trajectory_queue: AgentManagerQueue[
                Trajectory] = AgentManagerQueue(parsed_behavior_id.brain_name)

            self._internal_trajectory_queues[
                parsed_behavior_id.brain_name] = internal_trajectory_queue
            self.trainer.subscribe_trajectory_queue(internal_trajectory_queue)
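The method above only wires up an internal queue for the wrapped trainer's team; the actual forwarding of trajectories happens elsewhere (for example in the trainer's advance step, which is not shown here). A hedged sketch of the general filter-and-relay pattern, using standard queue.Queue and invented names, not the real GhostTrainer.advance:

import queue
from typing import Dict, Set

def relay_learning_team(external_queues: Dict[str, queue.Queue],
                        internal_queues: Dict[str, queue.Queue],
                        learning_team_ids: Set[int]) -> None:
    # Drain every external queue, but forward only trajectories whose
    # behavior id belongs to the learning team.
    for behavior_id, ext_q in external_queues.items():
        team_id = int(behavior_id.split("?team=")[-1])
        while True:
            try:
                traj = ext_q.get_nowait()
            except queue.Empty:
                break
            if team_id in learning_team_ids:
                internal_queues[behavior_id].put(traj)

ext = {"test_brain?team=0": queue.Queue(), "test_brain?team=1": queue.Queue()}
internal = {"test_brain?team=0": queue.Queue(), "test_brain?team=1": queue.Queue()}
ext["test_brain?team=0"].put("traj0")
ext["test_brain?team=1"].put("traj1")
relay_learning_team(ext, internal, learning_team_ids={0})
assert not internal["test_brain?team=0"].empty()
assert internal["test_brain?team=1"].empty()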
Example #19
def test_publish_queue(dummy_config):
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[1], vector_obs_space=8
    )

    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"

    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)

    brain_name = parsed_behavior_id0.brain_name

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # First policy encountered becomes policy trained by wrapped PPO
    # This queue should remain empty after swap snapshot
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)
    policy_queue0 = AgentManagerQueue(behavior_id_team0)
    trainer.publish_policy_queue(policy_queue0)

    # Ghost trainer should use this queue for ghost policy swap
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)
    policy_queue1 = AgentManagerQueue(behavior_id_team1)
    trainer.publish_policy_queue(policy_queue1)

    # check ghost trainer swap pushes to ghost queue and not trainer
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer._swap_snapshots()
    assert policy_queue0.empty() and not policy_queue1.empty()
    # clear
    policy_queue1.get_nowait()

    mock_specs = mb.setup_test_behavior_specs(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_specs)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]
    trainer.trainer.update_buffer = buffer

    # When the ghost trainer advances and the wrapped trainer's buffer is full,
    # the wrapped trainer pushes the updated policy to the correct queue.
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer.advance()
    assert not policy_queue0.empty() and policy_queue1.empty()
Example #20
def test_publish_queue(dummy_config):
    mock_specs = mb.setup_test_behavior_specs(True,
                                              False,
                                              vector_action_space=[1],
                                              vector_obs_space=8)

    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"

    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team0)

    brain_name = parsed_behavior_id0.brain_name

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(ppo_trainer, brain_name, controller, 0,
                           dummy_config, True, "0")

    # First policy encountered becomes policy trained by wrapped PPO
    # This queue should remain empty after swap snapshot
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)
    policy_queue0 = AgentManagerQueue(behavior_id_team0)
    trainer.publish_policy_queue(policy_queue0)

    # Ghost trainer should use this queue for ghost policy swap
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)
    policy_queue1 = AgentManagerQueue(behavior_id_team1)
    trainer.publish_policy_queue(policy_queue1)

    # check ghost trainer swap pushes to ghost queue and not trainer
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer._swap_snapshots()
    assert policy_queue0.empty() and not policy_queue1.empty()
    # clear
    policy_queue1.get_nowait()

    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_specs)
    # Mock out reward signal eval
    copy_buffer_fields(
        buffer,
        src_key=BufferKey.ENVIRONMENT_REWARDS,
        dst_keys=[
            BufferKey.ADVANTAGES,
            RewardSignalUtil.rewards_key("extrinsic"),
            RewardSignalUtil.returns_key("extrinsic"),
            RewardSignalUtil.value_estimates_key("extrinsic"),
            RewardSignalUtil.rewards_key("curiosity"),
            RewardSignalUtil.returns_key("curiosity"),
            RewardSignalUtil.value_estimates_key("curiosity"),
        ],
    )

    trainer.trainer.update_buffer = buffer

    # When the ghost trainer advances and the wrapped trainer's buffer is full,
    # the wrapped trainer pushes the updated policy to the correct queue.
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer.advance()
    assert not policy_queue0.empty() and policy_queue1.empty()
Example #21
def test_advance(dummy_config):
    brain_params = make_brain_parameters(discrete_action=False,
                                         visual_inputs=0,
                                         vec_obs_size=6)
    dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
    dummy_config["steps_per_update"] = 20
    trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
    policy = trainer.create_policy(brain_params.brain_name, brain_params)
    trainer.add_policy(brain_params.brain_name, policy)

    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)

    trajectory = make_fake_trajectory(
        length=15,
        max_step_complete=True,
        vec_obs_size=6,
        num_vis_obs=0,
        action_space=[2],
        is_discrete=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that trainer put trajectory in update buffer
    assert trainer.update_buffer.num_experiences == 15

    # Check that the stats are being collected as episode isn't complete
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent > 0

    # Add a terminal trajectory
    trajectory = make_fake_trajectory(
        length=6,
        max_step_complete=False,
        vec_obs_size=6,
        num_vis_obs=0,
        action_space=[2],
        is_discrete=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that the stats are reset as episode is finished
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent == 0
    assert trainer.stats_reporter.get_stats_summaries(
        "Policy/Extrinsic Reward").num > 0
    # Assert we're not just using the default values
    assert (trainer.stats_reporter.get_stats_summaries(
        "Policy/Extrinsic Reward").mean > 0)

    # Make sure there is a policy on the queue
    policy_queue.get_nowait()

    # Add another trajectory. The total is still short of two full updates'
    # worth of steps (20 each), so there should NOT be another policy on the queue.
    trajectory = make_fake_trajectory(
        length=5,
        max_step_complete=False,
        vec_obs_size=6,
        num_vis_obs=0,
        action_space=[2],
        is_discrete=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()
    with pytest.raises(AgentManagerQueue.Empty):
        policy_queue.get_nowait()
Example #22
def test_publish_queue(dummy_config):
    brain_params_team0 = BrainParameters(
        brain_name="test_brain?team=0",
        vector_observation_space_size=8,
        camera_resolutions=[],
        vector_action_space_size=[1],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )

    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
        brain_params_team0.brain_name
    )

    brain_name = parsed_behavior_id0.brain_name

    brain_params_team1 = BrainParameters(
        brain_name="test_brain?team=1",
        vector_observation_space_size=8,
        camera_resolutions=[],
        vector_action_space_size=[1],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # First policy encountered becomes policy trained by wrapped PPO
    # This queue should remain empty after swap snapshot
    policy = trainer.create_policy(parsed_behavior_id0, brain_params_team0)
    trainer.add_policy(parsed_behavior_id0, policy)
    policy_queue0 = AgentManagerQueue(brain_params_team0.brain_name)
    trainer.publish_policy_queue(policy_queue0)

    # Ghost trainer should use this queue for ghost policy swap
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
        brain_params_team1.brain_name
    )
    policy = trainer.create_policy(parsed_behavior_id1, brain_params_team1)
    trainer.add_policy(parsed_behavior_id1, policy)
    policy_queue1 = AgentManagerQueue(brain_params_team1.brain_name)
    trainer.publish_policy_queue(policy_queue1)

    # check ghost trainer swap pushes to ghost queue and not trainer
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer._swap_snapshots()
    assert policy_queue0.empty() and not policy_queue1.empty()
    # clear
    policy_queue1.get_nowait()

    mock_brain = mb.setup_mock_brain(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]
    trainer.trainer.update_buffer = buffer

    # When the ghost trainer advances and the wrapped trainer's buffer is full,
    # the wrapped trainer pushes the updated policy to the correct queue.
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer.advance()
    assert not policy_queue0.empty() and policy_queue1.empty()
Example #23
def test_advance(dummy_config):
    brain_params = make_brain_parameters(
        discrete_action=False, visual_inputs=0, vec_obs_size=6
    )
    dummy_config.hyperparameters.steps_per_update = 20
    dummy_config.hyperparameters.reward_signal_steps_per_update = 20
    dummy_config.hyperparameters.buffer_init_steps = 0
    trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
    policy = trainer.create_policy(brain_params.brain_name, brain_params)
    trainer.add_policy(brain_params.brain_name, policy)

    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)

    trajectory = make_fake_trajectory(
        length=15,
        max_step_complete=True,
        vec_obs_size=6,
        num_vis_obs=0,
        action_space=[2],
        is_discrete=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that trainer put trajectory in update buffer
    assert trainer.update_buffer.num_experiences == 15

    # Check that the stats are being collected as episode isn't complete
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent > 0

    # Add a terminal trajectory
    trajectory = make_fake_trajectory(
        length=6,
        max_step_complete=False,
        vec_obs_size=6,
        num_vis_obs=0,
        action_space=[2],
        is_discrete=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()

    # Check that the stats are reset as episode is finished
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent == 0
    assert trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").num > 0
    # Assert we're not just using the default values
    assert (
        trainer.stats_reporter.get_stats_summaries("Policy/Extrinsic Reward").mean > 0
    )

    # Make sure there is a policy on the queue
    policy_queue.get_nowait()

    # Add another trajectory. The total is still short of two full updates'
    # worth of steps (20 each), so there should NOT be another policy on the queue.
    trajectory = make_fake_trajectory(
        length=5,
        max_step_complete=False,
        vec_obs_size=6,
        num_vis_obs=0,
        action_space=[2],
        is_discrete=False,
    )
    trajectory_queue.put(trajectory)
    trainer.advance()
    with pytest.raises(AgentManagerQueue.Empty):
        policy_queue.get_nowait()

    # Call add_policy and check that we update the correct number of times.
    # This is to emulate a load from checkpoint.
    policy = trainer.create_policy(brain_params.brain_name, brain_params)
    policy.get_current_step = lambda: 200
    trainer.add_policy(brain_params.brain_name, policy)
    trainer.optimizer.update = mock.Mock()
    trainer.optimizer.update_reward_signals = mock.Mock()
    trainer.optimizer.update_reward_signals.return_value = {}
    trainer.optimizer.update.return_value = {}
    trajectory_queue.put(trajectory)
    trainer.advance()
    # Make sure we did exactly 1 update
    assert trainer.optimizer.update.call_count == 1
    assert trainer.optimizer.update_reward_signals.call_count == 1