def test_add_rewards_output(dummy_config):
    brain_params = BrainParameters("test_brain", 1, 1, [], [2], [], 0)
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0",
                         False)
    rewardsout = AllRewardsOutput(
        reward_signals={
            "extrinsic":
            RewardSignalResult(scaled_reward=np.array([1.0, 1.0]),
                               unscaled_reward=np.array([1.0, 1.0]))
        },
        environment=np.array([1.0, 1.0]),
    )
    values = {"extrinsic": np.array([[2.0]])}
    agent_id = "123"
    idx = 0
    # make sure that we're grabbing from the next_idx for rewards. If we're not, the test will fail.
    next_idx = 1
    trainer.add_rewards_outputs(
        rewardsout,
        values=values,
        agent_id=agent_id,
        agent_idx=idx,
        agent_next_idx=next_idx,
    )
    assert trainer.training_buffer[agent_id]["extrinsic_value_estimates"][
        0] == 2.0
    assert trainer.training_buffer[agent_id]["extrinsic_rewards"][0] == 1.0
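
Most of these examples take a dummy_config pytest fixture. For the dict-based examples it is essentially the same hyperparameter dictionary written out inline in the later test_trainer_increment_step example; the newer examples that set trainer_params.network_settings instead receive a TrainerSettings object. A minimal sketch of a dict-style fixture, with illustrative values only (an assumption, not the library's canonical conftest fixture):

import pytest


@pytest.fixture
def dummy_config():
    # Illustrative values only, mirroring the inline trainer_params dict shown further below.
    return {
        "trainer": "ppo",
        "batch_size": 32,
        "beta": 0.005,
        "buffer_size": 512,
        "epsilon": 0.2,
        "hidden_units": 128,
        "lambd": 0.95,
        "learning_rate": 3.0e-4,
        "max_steps": "5.0e4",
        "memory_size": 8,
        "normalize": True,
        "num_epoch": 5,
        "num_layers": 2,
        "time_horizon": 64,
        "sequence_length": 64,
        "summary_freq": 1000,
        "use_recurrent": False,
        "reward_signals": {"extrinsic": {"strength": 1.0, "gamma": 0.99}},
    }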
Example #2
def test_trainer_increment_step(dummy_config):
    trainer_params = dummy_config
    brain_params = BrainParameters("test_brain", 1, 1, [], [2], [], 0)

    trainer = PPOTrainer(brain_params, 0, trainer_params, True, False, 0, "0", False)
    policy_mock = mock.Mock()
    step_count = 10
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    trainer.policy = policy_mock

    trainer.increment_step(5)
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == 10
Example #3
def test_trainer_update_policy(
    dummy_config, curiosity_dummy_config, use_discrete  # noqa: F811
):
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_params = dummy_config
    trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
        memory_size=10, sequence_length=16
    )

    # Test curiosity reward signal
    trainer_params.reward_signals = curiosity_dummy_config
    trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
    trainer.add_policy(mock_brain.brain_name, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]

    trainer.update_buffer = buffer
    trainer._update_policy()
Example #4
def test_trainer_update_policy(dummy_config, use_discrete):
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_params = dummy_config
    trainer_params["use_recurrent"] = True

    # Test curiosity reward signal
    trainer_params["reward_signals"]["curiosity"] = {}
    trainer_params["reward_signals"]["curiosity"]["strength"] = 1.0
    trainer_params["reward_signals"]["curiosity"]["gamma"] = 0.99
    trainer_params["reward_signals"]["curiosity"]["encoding_size"] = 128

    trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False,
                         0, "0")
    policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
    trainer.add_policy(mock_brain.brain_name, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]

    trainer.update_buffer = buffer
    trainer._update_policy()
Example #5
def test_add_get_policy(ppo_optimizer, mock_create_model_saver, dummy_config):
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    trainer = PPOTrainer("test_policy", 0, dummy_config, True, False, 0, "0")
    policy = mock.Mock(spec=TFPolicy)
    policy.get_current_step.return_value = 2000

    behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
    trainer.add_policy(behavior_id, policy)
    assert trainer.get_policy("test_policy") == policy

    # Make sure the summary steps were loaded properly
    assert trainer.get_step == 2000
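
The ppo_optimizer and mock_create_model_saver arguments above are injected by mock.patch decorators that this listing omits. A hedged sketch of the typical decoration (the dotted patch targets are assumptions and differ between ML-Agents releases):

from unittest import mock


# Hypothetical patch targets; the decorator closest to the function supplies the
# first mock argument (ppo_optimizer), the outer one the second (mock_create_model_saver).
@mock.patch("mlagents.trainers.trainer.rl_trainer.RLTrainer.create_model_saver")
@mock.patch("mlagents.trainers.ppo.trainer.PPOOptimizer")
def test_add_get_policy(ppo_optimizer, mock_create_model_saver, dummy_config):
    ...  # body as shown above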
Example #6
def test_process_trajectory(dummy_config):
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(brain_params.brain_name, 0, dummy_config, True, False,
                         0, "0", False)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=2,
    )
    policy = trainer.create_policy(brain_params)
    trainer.add_policy(brain_params.brain_name, policy)
    trainer.process_trajectory(trajectory)

    # Check that trainer put trajectory in update buffer
    assert trainer.update_buffer.num_experiences == 15

    # Check that GAE worked
    assert ("advantages" in trainer.update_buffer
            and "discounted_returns" in trainer.update_buffer)

    # Check that the stats are being collected as episode isn't complete
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent > 0

    # Add a terminal trajectory
    trajectory = make_fake_trajectory(
        length=time_horizon + 1,
        max_step_complete=False,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=2,
    )
    trainer.process_trajectory(trajectory)

    # Check that the stats are reset as episode is finished
    for reward in trainer.collected_rewards.values():
        for agent in reward.values():
            assert agent == 0
    assert trainer.stats_reporter.get_stats_summaries(
        "Policy/Extrinsic Reward").num > 0
Example #7
def test_process_trajectory(dummy_config):
    brain_params_team0 = BrainParameters(
        brain_name="test_brain?team=0",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )

    brain_name = BehaviorIdentifiers.from_name_behavior_id(
        brain_params_team0.brain_name).brain_name

    brain_params_team1 = BrainParameters(
        brain_name="test_brain?team=1",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0",
                             False)
    trainer = GhostTrainer(ppo_trainer, brain_name, 0, dummy_config, True, "0")

    # first policy encountered becomes policy trained by wrapped PPO
    policy = trainer.create_policy(brain_params_team0)
    trainer.add_policy(brain_params_team0.brain_name, policy)
    trajectory_queue0 = AgentManagerQueue(brain_params_team0.brain_name)
    trainer.subscribe_trajectory_queue(trajectory_queue0)

    # Ghost trainer should ignore this queue because it is off-policy
    policy = trainer.create_policy(brain_params_team1)
    trainer.add_policy(brain_params_team1.brain_name, policy)
    trajectory_queue1 = AgentManagerQueue(brain_params_team1.brain_name)
    trainer.subscribe_trajectory_queue(trajectory_queue1)

    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    trajectory_queue0.put(trajectory)
    trainer.advance()

    # Check that trainer put trajectory in update buffer
    assert trainer.trainer.update_buffer.num_experiences == 15

    trajectory_queue1.put(trajectory)
    trainer.advance()

    # Check that ghost trainer ignored off policy queue
    assert trainer.trainer.update_buffer.num_experiences == 15
    # Check that it emptied the queue
    assert trajectory_queue1.empty()
Example #8
def test_resume(dummy_config, tmp_path):
    mock_specs = mb.setup_test_behavior_specs(True,
                                              False,
                                              vector_action_space=[2],
                                              vector_obs_space=1)
    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team0).brain_name
    tmp_path = tmp_path.as_posix()
    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0,
                             tmp_path)
    controller = GhostController(100)
    trainer = GhostTrainer(ppo_trainer, brain_name, controller, 0,
                           dummy_config, True, tmp_path)

    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team0)
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)

    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)

    trainer.save_model()

    # Make a new trainer, check that the policies are the same
    ppo_trainer2 = PPOTrainer(brain_name, 0, dummy_config, True, True, 0,
                              tmp_path)
    trainer2 = GhostTrainer(ppo_trainer2, brain_name, controller, 0,
                            dummy_config, True, tmp_path)
    policy = trainer2.create_policy(parsed_behavior_id0, mock_specs)
    trainer2.add_policy(parsed_behavior_id0, policy)

    policy = trainer2.create_policy(parsed_behavior_id1, mock_specs)
    trainer2.add_policy(parsed_behavior_id1, policy)

    trainer1_policy = trainer.get_policy(parsed_behavior_id1.behavior_id)
    trainer2_policy = trainer2.get_policy(parsed_behavior_id1.behavior_id)
    weights = trainer1_policy.get_weights()
    weights2 = trainer2_policy.get_weights()

    for w, lw in zip(weights, weights2):
        np.testing.assert_array_equal(w, lw)
Example #9
def test_trainer_increment_step():
    trainer_params = {
        "trainer": "ppo",
        "batch_size": 2048,
        "beta": 0.005,
        "buffer_size": 20480,
        "epsilon": 0.2,
        "gamma": 0.995,
        "hidden_units": 512,
        "lambd": 0.95,
        "learning_rate": 0.0003,
        "max_steps": "2e6",
        "memory_size": 256,
        "normalize": True,
        "num_epoch": 3,
        "num_layers": 3,
        "time_horizon": 1000,
        "sequence_length": 64,
        "summary_freq": 3000,
        "use_recurrent": False,
        "use_curiosity": False,
        "curiosity_strength": 0.01,
        "curiosity_enc_size": 128,
        "summary_path": "./summaries/test_trainer_summary",
        "model_path": "./models/test_trainer_models/TestModel",
        "keep_checkpoints": 5,
        "reward_signals": {
            "extrinsic": {
                "strength": 1.0,
                "gamma": 0.99
            }
        },
    }
    brain_params = BrainParameters("test_brain", 1, 1, [], [2], [], 0)

    trainer = PPOTrainer(brain_params, 0, trainer_params, True, False, 0, "0",
                         False)
    policy_mock = mock.Mock()
    step_count = 10
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    trainer.policy = policy_mock

    trainer.increment_step(5)
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == 10
Example #10
def test_add_get_policy(ppo_optimizer, dummy_config):
    brain_params = make_brain_parameters(
        discrete_action=False, visual_inputs=0, vec_obs_size=6
    )
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
    policy = mock.Mock(spec=NNPolicy)
    policy.get_current_step.return_value = 2000

    trainer.add_policy(brain_params.brain_name, policy)
    assert trainer.get_policy(brain_params.brain_name) == policy

    # Make sure the summary steps were loaded properly
    assert trainer.get_step == 2000
    assert trainer.next_summary_step > 2000

    # Test incorrect class of policy
    policy = mock.Mock()
    with pytest.raises(RuntimeError):
        trainer.add_policy(brain_params, policy)
Example #11
def test_bad_config(dummy_config):
    brain_params = make_brain_parameters(discrete_action=False,
                                         visual_inputs=0,
                                         vec_obs_size=6)
    # Test that we throw an error if we have sequence length greater than batch size
    dummy_config["sequence_length"] = 64
    dummy_config["batch_size"] = 32
    dummy_config["use_recurrent"] = True
    with pytest.raises(UnityTrainerException):
        _ = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
Example #12
def test_trainer_update_policy(
        dummy_config,
        curiosity_dummy_config,
        use_discrete  # noqa: F811
):
    mock_behavior_spec = mb.setup_test_behavior_specs(
        use_discrete,
        False,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_params = dummy_config
    trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
        memory_size=10, sequence_length=16)

    # Test curiosity reward signal
    trainer_params.reward_signals = curiosity_dummy_config
    mock_brain_name = "MockBrain"
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(mock_brain_name)
    trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(behavior_id, mock_behavior_spec)
    trainer.add_policy(behavior_id, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_behavior_spec)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]
    # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
    if use_discrete:
        n_agents = len(buffer["discrete_log_probs"])
        buffer["discrete_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["discrete_log_probs"].append(
                np.ones(
                    int(sum(mock_behavior_spec.action_spec.discrete_branches)),
                    dtype=np.float32,
                ))
    else:
        n_agents = len(buffer["continuous_log_probs"])
        buffer["continuous_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["continuous_log_probs"].append(
                np.ones(mock_behavior_spec.action_spec.continuous_size,
                        dtype=np.float32))
    trainer.update_buffer = buffer
    trainer._update_policy()
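
The use_discrete argument in these update-policy tests is normally supplied by a pytest parametrize decorator rather than a fixture (the # noqa: F811 suggests the parameter names shadow fixtures imported into the test module). A minimal sketch, assuming the usual ML-Agents test pattern:

import pytest


@pytest.mark.parametrize("use_discrete", [True, False])
def test_trainer_update_policy(dummy_config, curiosity_dummy_config, use_discrete):
    ...  # runs the body above once for a discrete and once for a continuous action space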
Example #13
 def initialize_trainers(self, trainer_config: Dict[str, Dict[str, str]]):
     """
     Initialization of the trainers
     :param trainer_config: The configurations of the trainers
     """
     trainer_parameters_dict = {}
     print("External Brains")
     print(self.external_brains)
     for brain_name in self.external_brains:
         print(brain_name)
         trainer_parameters = trainer_config['default'].copy()
         trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
             basedir=self.summaries_dir,
             name=str(self.run_id) + '_' + brain_name)
         trainer_parameters['model_path'] = '{basedir}/{name}'.format(
             basedir=self.model_path, name=brain_name)
         trainer_parameters['keep_checkpoints'] = self.keep_checkpoints
         if brain_name in trainer_config:
             _brain_key = brain_name
             while not isinstance(trainer_config[_brain_key], dict):
                 _brain_key = trainer_config[_brain_key]
             for k in trainer_config[_brain_key]:
                 trainer_parameters[k] = trainer_config[_brain_key][k]
         trainer_parameters_dict[brain_name] = trainer_parameters.copy()
     for brain_name in self.external_brains:
         if trainer_parameters_dict[brain_name]['trainer'] == 'offline_bc':
             self.trainers[brain_name] = OfflineBCTrainer(
                 self.external_brains[brain_name],
                 trainer_parameters_dict[brain_name], self.train_model,
                 self.load_model, self.seed, self.run_id)
         elif trainer_parameters_dict[brain_name]['trainer'] == 'online_bc':
             self.trainers[brain_name] = OnlineBCTrainer(
                 self.external_brains[brain_name],
                 trainer_parameters_dict[brain_name], self.train_model,
                 self.load_model, self.seed, self.run_id)
         elif trainer_parameters_dict[brain_name]['trainer'] == 'ppo':
             print("Now implement ppo method in trainer_controller")
             print(
                 "Parameters brain name for PP////////O trainer in trainer_controller"
             )
             print(self.external_brains[brain_name])
             print("Now end of parameters")
             self.trainers[brain_name] = PPOTrainer(
                 self.external_brains[brain_name],
                 self.meta_curriculum.brains_to_curriculums[brain_name].
                 min_lesson_length if self.meta_curriculum else 0,
                 trainer_parameters_dict[brain_name], self.train_model,
                 self.load_model, self.seed, self.run_id)
             self.trainer_metrics[brain_name] = self.trainers[
                 brain_name].trainer_metrics
         else:
             raise UnityEnvironmentException('The trainer config contains '
                                             'an unknown trainer type for '
                                             'brain {}'.format(brain_name))
Example #14
def test_trainer_increment_step(dummy_config):
    trainer_params = dummy_config
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )

    trainer = PPOTrainer(brain_params, 0, trainer_params, True, False, 0, "0", False)
    policy_mock = mock.Mock()
    step_count = 10
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    trainer.policy = policy_mock

    trainer.increment_step(5)
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == 10
Example #15
def test_bad_config(dummy_config):
    brain_params = make_brain_parameters(discrete_action=False,
                                         visual_inputs=0,
                                         vec_obs_size=6)
    # Test that we throw an error if we have sequence length greater than batch size
    with pytest.raises(TrainerConfigError):
        TrainerSettings(
            network_settings=NetworkSettings(
                memory=NetworkSettings.MemorySettings(sequence_length=64)),
            hyperparameters=PPOSettings(batch_size=32),
        )
        _ = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
Example #16
def test_normalization(dummy_config):
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0",
                         False)
    time_horizon = 6
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=2,
    )
    # Change half of the obs to 0
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
    trainer.process_trajectory(trajectory)

    # Check that the running mean and variance is correct
    steps, mean, variance = trainer.policy.sess.run([
        trainer.policy.model.normalization_steps,
        trainer.policy.model.running_mean,
        trainer.policy.model.running_variance,
    ])

    assert steps == 6
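    # Three observations of 0.0 and three of 1.0 have been seen, so the running mean is 0.5.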
    assert mean[0] == 0.5
    # Note: variance is divided by number of steps, and initialized to 1 to avoid
    # divide by 0. The right answer is 0.25
    assert (variance[0] - 1) / steps == 0.25

    # Make another update, this time with all 1's
    time_horizon = 10
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=2,
    )
    trainer.process_trajectory(trajectory)

    # Check that the running mean and variance is correct
    steps, mean, variance = trainer.policy.sess.run([
        trainer.policy.model.normalization_steps,
        trainer.policy.model.running_mean,
        trainer.policy.model.running_variance,
    ])

    assert steps == 16
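    # Running mean over all 16 observations: (3 * 0.0 + 3 * 1.0 + 10 * 1.0) / 16 = 0.8125.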
    assert mean[0] == 0.8125
    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
Example #17
def test_load_and_set(dummy_config, use_discrete):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        False,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_params = dummy_config
    trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
    trainer.seed = 1
    policy = trainer.create_policy("test", mock_specs)
    trainer.seed = 20  # otherwise graphs are the same
    to_load_policy = trainer.create_policy("test", mock_specs)

    weights = policy.get_weights()
    load_weights = to_load_policy.get_weights()
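    # With different seeds the two policies should start from different weights, so the
    # equality check below is expected to fail; the AssertionError is deliberately swallowed.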
    try:
        for w, lw in zip(weights, load_weights):
            np.testing.assert_array_equal(w, lw)
    except AssertionError:
        pass

    to_load_policy.load_weights(weights)
    load_weights = to_load_policy.get_weights()

    for w, lw in zip(weights, load_weights):
        np.testing.assert_array_equal(w, lw)
Example #18
def test_trainer_increment_step(ppo_optimizer, dummy_config):
    trainer_params = dummy_config
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )

    trainer = PPOTrainer(
        brain_params.brain_name, 0, trainer_params, True, False, 0, "0"
    )
    policy_mock = mock.Mock(spec=NNPolicy)
    policy_mock.get_current_step.return_value = 0
    step_count = (
        5
    )  # 10 hacked because this function is no longer called through trainer
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    trainer.add_policy("testbehavior", policy_mock)

    trainer._increment_step(5, "testbehavior")
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == step_count
Example #19
def test_process_trajectory(dummy_config):
    mock_specs = mb.setup_test_behavior_specs(True,
                                              False,
                                              vector_action_space=[2],
                                              vector_obs_space=1)
    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"
    brain_name = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team0).brain_name

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(ppo_trainer, brain_name, controller, 0,
                           dummy_config, True, "0")

    # first policy encountered becomes policy trained by wrapped PPO
    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team0)
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)
    trajectory_queue0 = AgentManagerQueue(behavior_id_team0)
    trainer.subscribe_trajectory_queue(trajectory_queue0)

    # Ghost trainer should ignore this queue because it is off-policy
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
        behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)
    trajectory_queue1 = AgentManagerQueue(behavior_id_team1)
    trainer.subscribe_trajectory_queue(trajectory_queue1)

    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1, )],
        action_space=[2],
    )
    trajectory_queue0.put(trajectory)
    trainer.advance()

    # Check that trainer put trajectory in update buffer
    assert trainer.trainer.update_buffer.num_experiences == 15

    trajectory_queue1.put(trajectory)
    trainer.advance()

    # Check that ghost trainer ignored off policy queue
    assert trainer.trainer.update_buffer.num_experiences == 15
    # Check that it emptied the queue
    assert trajectory_queue1.empty()
Example #20
def test_add_rewards_output(dummy_config):
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0",
                         False)
    rewardsout = AllRewardsOutput(
        reward_signals={
            "extrinsic":
            RewardSignalResult(
                scaled_reward=np.array([1.0, 1.0], dtype=np.float32),
                unscaled_reward=np.array([1.0, 1.0], dtype=np.float32),
            )
        },
        environment=np.array([1.0, 1.0], dtype=np.float32),
    )
    values = {"extrinsic": np.array([[2.0]], dtype=np.float32)}
    agent_id = "123"
    idx = 0
    # make sure that we're grabbing from the next_idx for rewards. If we're not, the test will fail.
    next_idx = 1
    trainer.add_rewards_outputs(
        rewardsout,
        values=values,
        agent_id=agent_id,
        agent_idx=idx,
        agent_next_idx=next_idx,
    )
    assert trainer.processing_buffer[agent_id]["extrinsic_value_estimates"][
        0] == 2.0
    assert trainer.processing_buffer[agent_id]["extrinsic_rewards"][0] == 1.0
Example #21
def test_trainer_increment_step(dummy_config):
    trainer_params = dummy_config
    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )

    trainer = PPOTrainer(brain_params.brain_name, 0, trainer_params, True,
                         False, 0, "0", False)
    policy_mock = mock.Mock()
    step_count = 5  # 10 hacked because this function is no longer called through trainer
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    trainer.policy = policy_mock

    trainer.increment_step(5)
    print(trainer.policy.increment_step(5))
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == step_count
Example #22
    def _initialize_trainers(self, trainer_config, sess):
        trainer_parameters_dict = {}
        # TODO: This probably doesn't need to be reinitialized.
        self.trainers = {}
        for brain_name in self.env.external_brain_names:
            trainer_parameters = trainer_config['default'].copy()
            if len(self.env.external_brain_names) > 1:
                graph_scope = re.sub('[^0-9a-zA-Z]+', '-', brain_name)
                trainer_parameters['graph_scope'] = graph_scope
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir,
                    name=str(self.run_id) + '_' + graph_scope)
            else:
                trainer_parameters['graph_scope'] = ''
                trainer_parameters['summary_path'] = '{basedir}/{name}'.format(
                    basedir=self.summaries_dir, name=str(self.run_id))
            if brain_name in trainer_config:
                _brain_key = brain_name
                while not isinstance(trainer_config[_brain_key], dict):
                    _brain_key = trainer_config[_brain_key]
                for k in trainer_config[_brain_key]:
                    trainer_parameters[k] = trainer_config[_brain_key][k]
            trainer_parameters_dict[brain_name] = trainer_parameters.copy()
        for brain_name in self.env.external_brain_names:
            if trainer_parameters_dict[brain_name]['trainer'] == 'imitation':
                self.trainers[brain_name] = BehavioralCloningTrainer(
                    sess, self.env.brains[brain_name],
                    trainer_parameters_dict[brain_name], self.train_model,
                    self.seed, self.run_id)
            elif trainer_parameters_dict[brain_name]['trainer'] == 'ppo':

                ###############################################################################
                #######         External brain becomes internal brain in here        ##########
                ###############################################################################

                self.trainers[brain_name] = PPOTrainer(
                    sess, self.env.brains[brain_name],
                    self.meta_curriculum.brains_to_curriculums[brain_name].
                    min_lesson_length if self.meta_curriculum else 0,
                    trainer_parameters_dict[brain_name], self.train_model,
                    self.seed, self.run_id)
            else:
                raise UnityEnvironmentException('The trainer config contains '
                                                'an unknown trainer type for '
                                                'brain {}'.format(brain_name))
Example #23
def test_add_get_policy(ppo_optimizer, dummy_config):
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    trainer = PPOTrainer("test_policy", 0, dummy_config, True, False, 0, "0")
    policy = mock.Mock(spec=NNPolicy)
    policy.get_current_step.return_value = 2000

    trainer.add_policy("test_policy", policy)
    assert trainer.get_policy("test_policy") == policy

    # Make sure the summary steps were loaded properly
    assert trainer.get_step == 2000

    # Test incorrect class of policy
    policy = mock.Mock()
    with pytest.raises(RuntimeError):
        trainer.add_policy("test_policy", policy)
Example #24
def test_trainer_increment_step(ppo_optimizer):
    trainer_params = PPO_CONFIG
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    trainer = PPOTrainer("test_brain", 0, trainer_params, True, False, 0, "0")
    policy_mock = mock.Mock(spec=NNPolicy)
    policy_mock.get_current_step.return_value = 0
    step_count = (
        5  # 10 hacked because this function is no longer called through trainer
    )
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    trainer.add_policy("testbehavior", policy_mock)

    trainer._increment_step(5, "testbehavior")
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == step_count
Example #25
def test_trainer_increment_step(ppo_optimizer, mock_create_model_saver):
    trainer_params = PPO_CONFIG
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    ppo_optimizer.return_value = mock_optimizer

    trainer = PPOTrainer("test_brain", 0, trainer_params, True, False, 0, "0")
    policy_mock = mock.Mock(spec=TFPolicy)
    policy_mock.get_current_step.return_value = 0
    step_count = (
        5  # 10 hacked because this function is no longer called through trainer
    )
    policy_mock.increment_step = mock.Mock(return_value=step_count)
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
    trainer.add_policy(behavior_id, policy_mock)

    trainer._increment_step(5, trainer.brain_name)
    policy_mock.increment_step.assert_called_with(5)
    assert trainer.step == step_count
Example #26
def test_trainer_update_policy(mock_env, dummy_config, use_discrete):
    env, mock_brain, _ = mb.setup_mock_env_and_brains(
        mock_env,
        use_discrete,
        False,
        num_agents=NUM_AGENTS,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_params = dummy_config
    trainer_params["use_recurrent"] = True

    # Test curiosity reward signal
    trainer_params["reward_signals"]["curiosity"] = {}
    trainer_params["reward_signals"]["curiosity"]["strength"] = 1.0
    trainer_params["reward_signals"]["curiosity"]["gamma"] = 0.99
    trainer_params["reward_signals"]["curiosity"]["encoding_size"] = 128

    trainer = PPOTrainer(mock_brain, 0, trainer_params, True, False, 0, "0",
                         False)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(env, trainer.policy, BUFFER_INIT_SAMPLES)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["rewards"]
    buffer["extrinsic_returns"] = buffer["rewards"]
    buffer["extrinsic_value_estimates"] = buffer["rewards"]
    buffer["curiosity_rewards"] = buffer["rewards"]
    buffer["curiosity_returns"] = buffer["rewards"]
    buffer["curiosity_value_estimates"] = buffer["rewards"]

    trainer.update_buffer = buffer
    trainer.update_policy()
    # Make batch length a larger multiple of sequence length
    trainer.trainer_parameters["batch_size"] = 128
    trainer.update_policy()
    # Make batch length a larger non-multiple of sequence length
    trainer.trainer_parameters["batch_size"] = 100
    trainer.update_policy()
Example #27
def test_publish_queue(dummy_config):
    mock_specs = mb.setup_test_behavior_specs(
        True, False, vector_action_space=[1], vector_obs_space=8
    )

    behavior_id_team0 = "test_brain?team=0"
    behavior_id_team1 = "test_brain?team=1"

    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team0)

    brain_name = parsed_behavior_id0.brain_name

    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # First policy encountered becomes policy trained by wrapped PPO
    # This queue should remain empty after swap snapshot
    policy = trainer.create_policy(parsed_behavior_id0, mock_specs)
    trainer.add_policy(parsed_behavior_id0, policy)
    policy_queue0 = AgentManagerQueue(behavior_id_team0)
    trainer.publish_policy_queue(policy_queue0)

    # Ghost trainer should use this queue for ghost policy swap
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(behavior_id_team1)
    policy = trainer.create_policy(parsed_behavior_id1, mock_specs)
    trainer.add_policy(parsed_behavior_id1, policy)
    policy_queue1 = AgentManagerQueue(behavior_id_team1)
    trainer.publish_policy_queue(policy_queue1)

    # check ghost trainer swap pushes to ghost queue and not trainer
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer._swap_snapshots()
    assert policy_queue0.empty() and not policy_queue1.empty()
    # clear
    policy_queue1.get_nowait()

    mock_specs = mb.setup_test_behavior_specs(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_specs)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]
    trainer.trainer.update_buffer = buffer

    # When the ghost trainer advances and the wrapped trainer's buffer is full,
    # the wrapped trainer pushes the updated policy to the correct queue
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer.advance()
    assert not policy_queue0.empty() and policy_queue1.empty()
Example #28
def initialize_trainer(
    trainer_config: Any,
    brain_parameters: BrainParameters,
    summaries_dir: str,
    run_id: str,
    model_path: str,
    keep_checkpoints: int,
    train_model: bool,
    load_model: bool,
    seed: int,
    meta_curriculum: MetaCurriculum = None,
    multi_gpu: bool = False,
) -> Trainer:
    """
    Initializes a trainer given a provided trainer configuration and brain parameters, as well as
    some general training session options.

    :param trainer_config: Original trainer configuration loaded from YAML
    :param brain_parameters: BrainParameters provided by the Unity environment
    :param summaries_dir: Directory to store trainer summary statistics
    :param run_id: Run ID to associate with this training run
    :param model_path: Path to save the model
    :param keep_checkpoints: How many model checkpoints to keep
    :param train_model: Whether to train the model (vs. run inference)
    :param load_model: Whether to load the model or randomly initialize
    :param seed: The random seed to use
    :param meta_curriculum: Optional meta_curriculum, used to determine a reward buffer length for PPOTrainer
    :param multi_gpu: Whether to use multi-GPU training
    :return:
    """
    brain_name = brain_parameters.brain_name
    if "default" not in trainer_config and brain_name not in trainer_config:
        raise TrainerConfigError(
            f'Trainer config must have either a "default" section, or a section for the brain name ({brain_name}). '
            "See config/trainer_config.yaml for an example.")

    trainer_parameters = trainer_config.get("default", {}).copy()
    trainer_parameters["summary_path"] = str(run_id) + "_" + brain_name
    trainer_parameters["model_path"] = "{basedir}/{name}".format(
        basedir=model_path, name=brain_name)
    trainer_parameters["keep_checkpoints"] = keep_checkpoints
    if brain_name in trainer_config:
        _brain_key: Any = brain_name
        while not isinstance(trainer_config[_brain_key], dict):
            _brain_key = trainer_config[_brain_key]
        trainer_parameters.update(trainer_config[_brain_key])

    min_lesson_length = 1
    if meta_curriculum:
        if brain_name in meta_curriculum.brains_to_curriculums:
            min_lesson_length = meta_curriculum.brains_to_curriculums[
                brain_name].min_lesson_length
        else:
            logger.warning(
                f"Metacurriculum enabled, but no curriculum for brain {brain_name}. "
                f"Brains with curricula: {meta_curriculum.brains_to_curriculums.keys()}. "
            )

    trainer: Trainer = None  # type: ignore  # will be set to one of these, or raise
    if "trainer" not in trainer_parameters:
        raise TrainerConfigError(
            f'The "trainer" key must be set in your trainer config for brain {brain_name} (or the default brain).'
        )
    trainer_type = trainer_parameters["trainer"]

    if trainer_type == "offline_bc":
        raise UnityTrainerException(
            "The offline_bc trainer has been removed. To train with demonstrations, "
            "please use a PPO or SAC trainer with the GAIL Reward Signal and/or the "
            "Behavioral Cloning feature enabled.")
    elif trainer_type == "ppo":
        trainer = PPOTrainer(
            brain_parameters,
            min_lesson_length,
            trainer_parameters,
            train_model,
            load_model,
            seed,
            run_id,
            multi_gpu,
        )
    elif trainer_type == "sac":
        trainer = SACTrainer(
            brain_parameters,
            min_lesson_length,
            trainer_parameters,
            train_model,
            load_model,
            seed,
            run_id,
        )
    else:
        raise TrainerConfigError(
            f'The trainer config contains an unknown trainer type "{trainer_type}" for brain {brain_name}'
        )
    return trainer
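
A hedged usage sketch for initialize_trainer, grounded in the signature above; the config values, paths, and the brain_params object are placeholders:

# Placeholder config and arguments for illustration only.
trainer_config = {
    "default": {
        "trainer": "ppo",
        "batch_size": 64,
        "buffer_size": 1024,
        "max_steps": "5.0e4",
        "reward_signals": {"extrinsic": {"strength": 1.0, "gamma": 0.99}},
    }
}
trainer = initialize_trainer(
    trainer_config=trainer_config,
    brain_parameters=brain_params,  # a BrainParameters object from the environment
    summaries_dir="./summaries",
    run_id="run-0",
    model_path="./models/run-0",
    keep_checkpoints=5,
    train_model=True,
    load_model=False,
    seed=0,
)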
Example #29
def test_publish_queue(dummy_config):
    brain_params_team0 = BrainParameters(
        brain_name="test_brain?team=0",
        vector_observation_space_size=8,
        camera_resolutions=[],
        vector_action_space_size=[1],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )

    parsed_behavior_id0 = BehaviorIdentifiers.from_name_behavior_id(
        brain_params_team0.brain_name
    )

    brain_name = parsed_behavior_id0.brain_name

    brain_params_team1 = BrainParameters(
        brain_name="test_brain?team=1",
        vector_observation_space_size=8,
        camera_resolutions=[],
        vector_action_space_size=[1],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    ppo_trainer = PPOTrainer(brain_name, 0, dummy_config, True, False, 0, "0")
    controller = GhostController(100)
    trainer = GhostTrainer(
        ppo_trainer, brain_name, controller, 0, dummy_config, True, "0"
    )

    # First policy encountered becomes policy trained by wrapped PPO
    # This queue should remain empty after swap snapshot
    policy = trainer.create_policy(parsed_behavior_id0, brain_params_team0)
    trainer.add_policy(parsed_behavior_id0, policy)
    policy_queue0 = AgentManagerQueue(brain_params_team0.brain_name)
    trainer.publish_policy_queue(policy_queue0)

    # Ghost trainer should use this queue for ghost policy swap
    parsed_behavior_id1 = BehaviorIdentifiers.from_name_behavior_id(
        brain_params_team1.brain_name
    )
    policy = trainer.create_policy(parsed_behavior_id1, brain_params_team1)
    trainer.add_policy(parsed_behavior_id1, policy)
    policy_queue1 = AgentManagerQueue(brain_params_team1.brain_name)
    trainer.publish_policy_queue(policy_queue1)

    # check ghost trainer swap pushes to ghost queue and not trainer
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer._swap_snapshots()
    assert policy_queue0.empty() and not policy_queue1.empty()
    # clear
    policy_queue1.get_nowait()

    mock_brain = mb.setup_mock_brain(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]
    trainer.trainer.update_buffer = buffer

    # When the ghost trainer advances and the wrapped trainer's buffer is full,
    # the wrapped trainer pushes the updated policy to the correct queue
    assert policy_queue0.empty() and policy_queue1.empty()
    trainer.advance()
    assert not policy_queue0.empty() and policy_queue1.empty()
Example #30
def initialize_trainers(
    trainer_config: Dict[str, Any],
    external_brains: Dict[str, BrainParameters],
    summaries_dir: str,
    run_id: str,
    model_path: str,
    keep_checkpoints: int,
    train_model: bool,
    load_model: bool,
    seed: int,
    meta_curriculum: MetaCurriculum = None,
    multi_gpu: bool = False,
) -> Dict[str, Trainer]:
    """
    Initializes trainers given a provided trainer configuration and set of brains from the environment, as well as
    some general training session options.

    :param trainer_config: Original trainer configuration loaded from YAML
    :param external_brains: BrainParameters provided by the Unity environment
    :param summaries_dir: Directory to store trainer summary statistics
    :param run_id: Run ID to associate with this training run
    :param model_path: Path to save the model
    :param keep_checkpoints: How many model checkpoints to keep
    :param train_model: Whether to train the model (vs. run inference)
    :param load_model: Whether to load the model or randomly initialize
    :param seed: The random seed to use
    :param meta_curriculum: Optional meta_curriculum, used to determine a reward buffer length for PPOTrainer
    :param multi_gpu: Whether to use multi-GPU training
    :return:
    """
    trainers = {}
    trainer_parameters_dict = {}
    for brain_name in external_brains:
        trainer_parameters = trainer_config["default"].copy()
        trainer_parameters["summary_path"] = "{basedir}/{name}".format(
            basedir=summaries_dir, name=str(run_id) + "_" + brain_name)
        trainer_parameters["model_path"] = "{basedir}/{name}".format(
            basedir=model_path, name=brain_name)
        trainer_parameters["keep_checkpoints"] = keep_checkpoints
        if brain_name in trainer_config:
            _brain_key: Any = brain_name
            while not isinstance(trainer_config[_brain_key], dict):
                _brain_key = trainer_config[_brain_key]
            trainer_parameters.update(trainer_config[_brain_key])
        trainer_parameters_dict[brain_name] = trainer_parameters.copy()
    for brain_name in external_brains:
        if trainer_parameters_dict[brain_name]["trainer"] == "offline_bc":
            trainers[brain_name] = OfflineBCTrainer(
                external_brains[brain_name],
                trainer_parameters_dict[brain_name],
                train_model,
                load_model,
                seed,
                run_id,
            )
        elif trainer_parameters_dict[brain_name]["trainer"] == "online_bc":
            trainers[brain_name] = OnlineBCTrainer(
                external_brains[brain_name],
                trainer_parameters_dict[brain_name],
                train_model,
                load_model,
                seed,
                run_id,
            )
        elif trainer_parameters_dict[brain_name]["trainer"] == "ppo":
            trainers[brain_name] = PPOTrainer(
                external_brains[brain_name],
                meta_curriculum.brains_to_curriculums[brain_name].
                min_lesson_length if meta_curriculum else 1,
                trainer_parameters_dict[brain_name],
                train_model,
                load_model,
                seed,
                run_id,
                multi_gpu,
            )
        elif trainer_parameters_dict[brain_name]["trainer"] == "sac":
            trainers[brain_name] = SACTrainer(
                external_brains[brain_name],
                meta_curriculum.brains_to_curriculums[brain_name].
                min_lesson_length if meta_curriculum else 1,
                trainer_parameters_dict[brain_name],
                train_model,
                load_model,
                seed,
                run_id,
            )
        else:
            raise UnityEnvironmentException("The trainer config contains "
                                            "an unknown trainer type for "
                                            "brain {}".format(brain_name))
    return trainers