Example #1
 def start_learning(self, env_manager: EnvManager) -> None:
     self._create_output_path(self.output_path)
     tf.reset_default_graph()
     try:
         # Initial reset
         self._reset_env(env_manager)
         while self._not_done_training():
             n_steps = self.advance(env_manager)
             for _ in range(n_steps):
                 self.reset_env_if_ready(env_manager)
         # Stop advancing trainers
         self.join_threads()
     except (
         KeyboardInterrupt,
         UnityCommunicationException,
         UnityEnvironmentException,
         UnityCommunicatorStoppedException,
     ) as ex:
         self.join_threads()
         self.logger.info(
             "Learning was interrupted. Please wait while the graph is generated."
         )
         if isinstance(ex, KeyboardInterrupt) or isinstance(
             ex, UnityCommunicatorStoppedException
         ):
             pass
         else:
             # If the environment failed, we want to make sure to raise
             # the exception so we exit the process with a return code of 1.
             raise ex
     finally:
         if self.train_model:
             self._save_models()
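Note: every example in this listing calls tf.reset_default_graph() before building any ops, so that variables and op names left over from a previous run or test do not collide in TensorFlow's process-wide default graph. A minimal sketch of that pattern, assuming TF 1.x graph mode (under TF 2.x the same call is tf.compat.v1.reset_default_graph()); build_and_run_once is a hypothetical helper, not part of ML-Agents:

import tensorflow as tf

def build_and_run_once():
    # Clear the default graph so the name "step" is available again.
    tf.reset_default_graph()
    step = tf.Variable(0, name="step")
    increment = tf.assign_add(step, 1)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        return sess.run(increment)

# Safe to call repeatedly; without the reset, every call would keep adding
# nodes ("step", "step_1", ...) to the same ever-growing default graph.
print(build_and_run_once())  # 1
print(build_and_run_once())  # 1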
Example #2
def test_ppo_optimizer_update_curiosity(
    dummy_config, curiosity_dummy_config, rnn, visual, discrete  # noqa: F811
):
    # Test evaluate
    tf.reset_default_graph()
    dummy_config.reward_signals = curiosity_dummy_config
    optimizer = create_test_ppo_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
    )
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["curiosity_returns"] = update_buffer["environment_rewards"]
    update_buffer["curiosity_value_estimates"] = update_buffer["environment_rewards"]
    # NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
    # in PyTorch they are saved as the total probability per branch. So we need to modify the
    # log prob in the fake buffer here.
    update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
Example #3
def test_sac_rnn_policy(dummy_config):
    # Test evaluate
    tf.reset_default_graph()
    policy = create_sac_policy_mock(dummy_config,
                                    use_rnn=True,
                                    use_discrete=True,
                                    use_visual=False)
    step = mb.create_batchedstep_from_brainparams(policy.brain,
                                                  num_agents=NUM_AGENTS)
    run_out = policy.evaluate(step, list(step.agent_id))
    assert run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))

    # Test update
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES,
                                 policy.brain,
                                 memory_size=8)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    update_buffer = AgentBuffer()
    buffer.resequence_and_append(update_buffer,
                                 training_length=policy.sequence_length)
    run_out = policy.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // policy.sequence_length,
    )
Example #4
def test_ppo_optimizer_update(dummy_config, rnn, visual, discrete):
    # Test evaluate
    tf.reset_default_graph()
    optimizer = create_test_ppo_optimizer(dummy_config,
                                          use_rnn=rnn,
                                          use_discrete=discrete,
                                          use_visual=visual)
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
    )
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]

    return_stats = optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )
    # Make sure we have the right stats
    required_stats = [
        "Losses/Policy Loss",
        "Losses/Value Loss",
        "Policy/Learning Rate",
        "Policy/Epsilon",
        "Policy/Beta",
    ]
    for stat in required_stats:
        assert stat in return_stats.keys()
Example #5
def test_update(mock_get_devices, mock_construct_feed_dict, mock_execute_model,
                dummy_config):
    tf.reset_default_graph()
    mock_get_devices.return_value = ["/device:GPU:0", "/device:GPU:1"]
    mock_construct_feed_dict.return_value = {}
    mock_execute_model.return_value = {
        "value_loss": 0.1,
        "policy_loss": 0.3,
        "update_batch": None,
    }

    trainer_parameters = dummy_config
    trainer_parameters["model_path"] = ""
    trainer_parameters["keep_checkpoints"] = 3
    brain = create_mock_brainparams()
    policy = MultiGpuPPOPolicy(0, brain, trainer_parameters, False, False)
    mock_mini_batch = mock.Mock()
    mock_mini_batch.items.return_value = [
        ("action", [1, 2]),
        ("value", [3, 4]),
    ]
    run_out = policy.update(mock_mini_batch, 1)

    assert mock_mini_batch.items.call_count == len(
        mock_get_devices.return_value)
    assert mock_construct_feed_dict.call_count == len(
        mock_get_devices.return_value)
    assert run_out["Losses/Value Loss"] == 0.1
    assert run_out["Losses/Policy Loss"] == 0.3
Example #6
def test_ppo_optimizer_update_curiosity(
        curiosity_dummy_config,
        dummy_config,
        rnn,
        visual,
        discrete  # noqa: F811
):
    # Test evaluate
    tf.reset_default_graph()
    dummy_config["reward_signals"].update(curiosity_dummy_config)
    optimizer = _create_ppo_optimizer_ops_mock(dummy_config,
                                               use_rnn=rnn,
                                               use_discrete=discrete,
                                               use_visual=visual)
    # Test update
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES,
                                        optimizer.policy.brain)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["curiosity_returns"] = update_buffer["environment_rewards"]
    update_buffer["curiosity_value_estimates"] = update_buffer[
        "environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )
Example #7
def test_ppo_optimizer_update(dummy_config, rnn, visual, discrete):
    # Test evaluate
    tf.reset_default_graph()
    optimizer = _create_ppo_optimizer_ops_mock(dummy_config,
                                               use_rnn=rnn,
                                               use_discrete=discrete,
                                               use_visual=visual)
    # Test update
    behavior_spec = optimizer.policy.behavior_spec
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
    if discrete:
        n_agents = len(update_buffer["discrete_log_probs"])
        update_buffer["discrete_log_probs"] = np.ones(
            (n_agents, int(sum(behavior_spec.action_spec.discrete_branches))),
            dtype=np.float32,
        )
    else:
        n_agents = len(update_buffer["continuous_log_probs"])
        update_buffer["continuous_log_probs"] = np.ones(
            (n_agents, behavior_spec.action_spec.continuous_size),
            dtype=np.float32)

    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )
Example #8
def test_ppo_optimizer_update_gail(gail_dummy_config, dummy_config):  # noqa: F811
    # Test evaluate
    tf.reset_default_graph()
    dummy_config.reward_signals = gail_dummy_config
    optimizer = _create_ppo_optimizer_ops_mock(
        PPO_CONFIG, use_rnn=False, use_discrete=False, use_visual=False
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, optimizer.policy.behavior_spec
    )
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )

    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
Example #9
def test_sac_update_reward_signals(mock_env, dummy_config, discrete):
    # Test evaluate
    tf.reset_default_graph()
    # Add a Curiosity module
    dummy_config["reward_signals"]["curiosity"] = {}
    dummy_config["reward_signals"]["curiosity"]["strength"] = 1.0
    dummy_config["reward_signals"]["curiosity"]["gamma"] = 0.99
    dummy_config["reward_signals"]["curiosity"]["encoding_size"] = 128
    env, policy = create_sac_policy_mock(mock_env,
                                         dummy_config,
                                         use_rnn=False,
                                         use_discrete=discrete,
                                         use_visual=False)

    # Test update, while removing PPO-specific buffer elements.
    update_buffer = mb.simulate_rollout(
        env,
        policy,
        BUFFER_INIT_SAMPLES,
        exclude_key_list=["advantages", "actions_pre"])

    # Mock out reward signal eval
    update_buffer["extrinsic_rewards"] = update_buffer["rewards"]
    update_buffer["curiosity_rewards"] = update_buffer["rewards"]
    policy.update_reward_signals({"curiosity": update_buffer},
                                 num_sequences=update_buffer.num_experiences)
    env.close()
Example #10
 def start_learning(self, env_manager: EnvManager) -> None:
     self._create_model_path(self.model_path)
     tf.reset_default_graph()
     global_step = 0
     last_brain_names: Set[str] = set()
     try:
         self._reset_env(env_manager)
         while self._not_done_training():
             external_brains = set(env_manager.external_brains.keys())
             new_brains = external_brains - last_brain_names
             if last_brain_names != env_manager.external_brains.keys():
                 for name in new_brains:
                     trainer = self.trainer_factory.generate(
                         env_manager.external_brains[name])
                     self.start_trainer(trainer, env_manager)
                 last_brain_names = external_brains
             n_steps = self.advance(env_manager)
             for i in range(n_steps):
                 global_step += 1
                 self.reset_env_if_ready(env_manager, global_step)
                 if self._should_save_model(global_step):
                     # Save Tensorflow model
                     self._save_model()
                 self.write_to_tensorboard(global_step)
         # Final save Tensorflow model
         if global_step != 0 and self.train_model:
             self._save_model()
     except (KeyboardInterrupt, UnityCommunicationException):
         if self.train_model:
             self._save_model_when_interrupted()
         pass
     if self.train_model:
         self._write_training_metrics()
         self._export_graph()
     self._write_timing_tree()
Example #11
def test_ppo_model_dc_visual():
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            model = PPOModel(
                make_brain_parameters(discrete_action=True, visual_inputs=2))
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [
                model.output,
                model.all_log_probs,
                model.value,
                model.entropy,
                model.learning_rate,
            ]
            feed_dict = {
                model.batch_size: 2,
                model.sequence_length: 1,
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                           [3, 4, 5, 3, 4, 5]]),
                model.visual_in[0]: np.ones([2, 40, 30, 3], dtype=np.float32),
                model.visual_in[1]: np.ones([2, 40, 30, 3], dtype=np.float32),
                model.action_masks: np.ones([2, 2], dtype=np.float32),
            }
            sess.run(run_list, feed_dict=feed_dict)
Example #12
def test_ppo_get_value_estimates(mock_communicator, mock_launcher,
                                 dummy_config):
    tf.reset_default_graph()
    mock_communicator.return_value = MockCommunicator(discrete_action=False,
                                                      visual_inputs=0)
    env = UnityEnvironment(" ")
    brain_infos = env.reset()
    brain_info = brain_infos[env.external_brain_names[0]]

    trainer_parameters = dummy_config
    model_path = env.external_brain_names[0]
    trainer_parameters["model_path"] = model_path
    trainer_parameters["keep_checkpoints"] = 3
    policy = PPOPolicy(0, env.brains[env.external_brain_names[0]],
                       trainer_parameters, False, False)
    run_out = policy.get_value_estimates(brain_info, 0, done=False)
    for key, val in run_out.items():
        assert type(key) is str
        assert type(val) is float

    run_out = policy.get_value_estimates(brain_info, 0, done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val == 0.0

    # Check if we ignore terminal states properly
    policy.reward_signals["extrinsic"].use_terminal_states = False
    run_out = policy.get_value_estimates(brain_info, 0, done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val != 0.0

    env.close()
Example #13
def test_ppo_get_value_estimates(dummy_config, rnn, visual, discrete):
    tf.reset_default_graph()

    optimizer = _create_ppo_optimizer_ops_mock(dummy_config,
                                               use_rnn=rnn,
                                               use_discrete=discrete,
                                               use_visual=visual)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        observation_shapes=optimizer.policy.behavior_spec.observation_shapes,
        max_step_complete=True,
        action_spec=DISCRETE_ACTION_SPEC
        if discrete else CONTINUOUS_ACTION_SPEC,
    )
    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=False)
    for key, val in run_out.items():
        assert type(key) is str
        assert len(val) == 15

    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=True)
    for key, val in final_value_out.items():
        assert type(key) is str
        assert val == 0.0

    # Check if we ignore terminal states properly
    optimizer.reward_signals["extrinsic"].use_terminal_states = False
    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=False)
    for key, val in final_value_out.items():
        assert type(key) is str
        assert val != 0.0
Example #14
 def start_learning(self, env_manager: EnvManager) -> None:
     self._create_model_path(self.model_path)
     tf.reset_default_graph()
     global_step = 0
     last_brain_behavior_ids: Set[str] = set()
     try:
         self._reset_env(env_manager)
         while self._not_done_training():
             external_brain_behavior_ids = set(
                 env_manager.external_brains.keys())
             new_behavior_ids = external_brain_behavior_ids - last_brain_behavior_ids
             for name_behavior_id in new_behavior_ids:
                 self._create_trainer_and_manager(env_manager,
                                                  name_behavior_id)
             last_brain_behavior_ids = external_brain_behavior_ids
             n_steps = self.advance(env_manager)
             for _ in range(n_steps):
                 global_step += 1
                 self.reset_env_if_ready(env_manager, global_step)
                 if self._should_save_model(global_step):
                     # Save Tensorflow model
                     self._save_model()
         # Final save Tensorflow model
         if global_step != 0 and self.train_model:
             self._save_model()
     except (KeyboardInterrupt, UnityCommunicationException):
         if self.train_model:
             self._save_model_when_interrupted()
         pass
     if self.train_model:
         self._export_graph()
     self._write_timing_tree()
Example #15
def test_ppo_model_cc_vector_rnn():
    tf.reset_default_graph()
    with tf.Session() as sess:
        with tf.variable_scope("FakeGraphScope"):
            memory_size = 128
            model = PPOModel(
                make_brain_parameters(discrete_action=False, visual_inputs=0),
                use_recurrent=True,
                m_size=memory_size,
            )
            init = tf.global_variables_initializer()
            sess.run(init)

            run_list = [
                model.output,
                model.all_log_probs,
                model.value,
                model.entropy,
                model.learning_rate,
                model.memory_out,
            ]
            feed_dict = {
                model.batch_size: 1,
                model.sequence_length: 2,
                model.memory_in: np.zeros((1, memory_size), dtype=np.float32),
                model.vector_in: np.array([[1, 2, 3, 1, 2, 3],
                                           [3, 4, 5, 3, 4, 5]]),
                model.epsilon: np.array([[0, 1]]),
            }
            sess.run(run_list, feed_dict=feed_dict)
Example #16
def test_average_gradients(mock_get_devices, dummy_config):
    tf.reset_default_graph()
    mock_get_devices.return_value = [
        "/device:GPU:0",
        "/device:GPU:1",
        "/device:GPU:2",
        "/device:GPU:3",
    ]

    trainer_parameters = dummy_config
    trainer_parameters["model_path"] = ""
    trainer_parameters["keep_checkpoints"] = 3
    brain = create_mock_brainparams()
    with tf.Session() as sess:
        policy = MultiGpuPPOPolicy(0, brain, trainer_parameters, False, False)
        var = tf.Variable(0)
        tower_grads = [
            [(tf.constant(0.1), var)],
            [(tf.constant(0.2), var)],
            [(tf.constant(0.3), var)],
            [(tf.constant(0.4), var)],
        ]
        avg_grads = policy.average_gradients(tower_grads)

        init = tf.global_variables_initializer()
        sess.run(init)
        run_out = sess.run(avg_grads)
    assert run_out == [(0.25, 0)]
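For context, the tower-gradient averaging this test exercises (one list of (gradient, variable) pairs per GPU device, averaged element-wise) can be sketched as follows. This is a hypothetical standalone helper written against TF 1.x, not MultiGpuPPOPolicy's actual implementation:

import tensorflow as tf

def average_tower_gradients(tower_grads):
    # tower_grads: one list of (gradient, variable) pairs per device.
    averaged = []
    for grads_and_vars in zip(*tower_grads):
        grads = tf.stack([g for g, _ in grads_and_vars], axis=0)
        mean_grad = tf.reduce_mean(grads, axis=0)
        # Every tower refers to the same shared variable, so take it from the first pair.
        averaged.append((mean_grad, grads_and_vars[0][1]))
    return averaged

Fed the four constant gradients from the test above, this would return a single (0.25, var) pair, which is what the final assertion checks.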
Example #17
def test_ppo_get_value_estimates(dummy_config, rnn, visual, discrete):
    tf.reset_default_graph()

    optimizer = _create_ppo_optimizer_ops_mock(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    time_horizon = 15
    trajectory = _create_fake_trajectory(discrete, visual, time_horizon)
    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=False
    )
    for key, val in run_out.items():
        assert type(key) is str
        assert len(val) == 15

    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=True
    )
    for key, val in final_value_out.items():
        assert type(key) is str
        assert val == 0.0

    # Check if we ignore terminal states properly
    optimizer.reward_signals["extrinsic"].use_terminal_states = False
    run_out, final_value_out = optimizer.get_trajectory_value_estimates(
        trajectory.to_agentbuffer(), trajectory.next_obs, done=False
    )
    for key, val in final_value_out.items():
        assert type(key) is str
        assert val != 0.0
Example #18
def test_ppo_optimizer_update(dummy_config, rnn, visual, discrete):
    # Test evaluate
    tf.reset_default_graph()
    optimizer = create_test_ppo_optimizer(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES,
        optimizer.policy.behavior_spec,
        memory_size=optimizer.policy.m_size,
    )
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer["environment_rewards"]

    # NOTE: In TensorFlow, the log_probs are saved as one for every discrete action, whereas
    # in PyTorch they are saved as the total probability per branch. So we need to modify the
    # log prob in the fake buffer here.
    update_buffer["action_probs"] = np.ones_like(update_buffer["actions"])
    return_stats = optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
    # Make sure we have the right stats
    required_stats = [
        "Losses/Policy Loss",
        "Losses/Value Loss",
        "Policy/Learning Rate",
        "Policy/Epsilon",
        "Policy/Beta",
    ]
    for stat in required_stats:
        assert stat in return_stats.keys()
Example #19
 def init_weights(self, env_manager):
     self._reset_env(env_manager)
     tf.reset_default_graph()
     last_brain_behavior_ids: Set[str] = set()
     external_brain_behavior_ids = set(env_manager.external_brains.keys())
     new_behavior_ids = external_brain_behavior_ids - last_brain_behavior_ids
     self._create_trainers_and_managers(env_manager, new_behavior_ids)
     self.weights = deepcopy(
         self.trainers['Brain'].get_policy(0).get_weights())
Example #20
def test_checkpoint_conversion(tmpdir, rnn, visual, discrete):
    tf.reset_default_graph()
    dummy_config = TrainerSettings()
    model_path = os.path.join(tmpdir, "Mock_Brain")
    policy = create_policy_mock(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    trainer_params = TrainerSettings()
    model_saver = TFModelSaver(trainer_params, model_path)
    model_saver.register(policy)
    model_saver.save_checkpoint("Mock_Brain", 100)
    assert os.path.isfile(model_path + "/Mock_Brain-100.nn")
Example #21
def test_policy_evaluate(dummy_config, rnn, visual, discrete):
    # Test evaluate
    tf.reset_default_graph()
    policy = create_policy_mock(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    step = mb.create_batchedstep_from_brainparams(policy.brain, num_agents=NUM_AGENTS)

    run_out = policy.evaluate(step, list(step.agent_id))
    if discrete:
        run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
    else:
        assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE[0])
Example #22
def test_ppo_get_value_estimates(mock_communicator, mock_launcher,
                                 dummy_config):
    tf.reset_default_graph()

    brain_params = BrainParameters(
        brain_name="test_brain",
        vector_observation_space_size=1,
        camera_resolutions=[],
        vector_action_space_size=[2],
        vector_action_descriptions=[],
        vector_action_space_type=0,
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    policy = PPOPolicy(0, brain_params, dummy_config, False, False)
    time_horizon = 15
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        vec_obs_size=1,
        num_vis_obs=0,
        action_space=[2],
    )
    run_out = policy.get_value_estimates(trajectory.next_obs,
                                         "test_agent",
                                         done=False)
    for key, val in run_out.items():
        assert type(key) is str
        assert type(val) is float

    run_out = policy.get_value_estimates(trajectory.next_obs,
                                         "test_agent",
                                         done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val == 0.0

    # Check if we ignore terminal states properly
    policy.reward_signals["extrinsic"].use_terminal_states = False
    run_out = policy.get_value_estimates(trajectory.next_obs,
                                         "test_agent",
                                         done=True)
    for key, val in run_out.items():
        assert type(key) is str
        assert val != 0.0

    agentbuffer = trajectory.to_agentbuffer()
    batched_values = policy.get_batched_value_estimates(agentbuffer)
    for values in batched_values.values():
        assert len(values) == 15
Example #23
def test_sac_optimizer_update(dummy_config, rnn, visual, discrete):
    # Test evaluate
    tf.reset_default_graph()
    optimizer = create_sac_optimizer_mock(
        dummy_config, use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    # Test update
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, optimizer.policy.brain)
    # Mock out reward signal eval
    update_buffer["extrinsic_rewards"] = update_buffer["environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences // optimizer.policy.sequence_length,
    )
Example #24
def test_policy_evaluate(rnn, visual, discrete):
    # Test evaluate
    tf.reset_default_graph()
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    decision_step, terminal_step = mb.create_steps_from_behavior_spec(
        policy.behavior_spec, num_agents=NUM_AGENTS
    )

    run_out = policy.evaluate(decision_step, list(decision_step.agent_id))
    if discrete:
        run_out["action"].shape == (NUM_AGENTS, len(DISCRETE_ACTION_SPACE))
    else:
        assert run_out["action"].shape == (NUM_AGENTS, VECTOR_ACTION_SPACE)
def test_policy_conversion(dummy_config, tmpdir, rnn, visual, discrete):
    tf.reset_default_graph()
    dummy_config["output_path"] = os.path.join(tmpdir, "test")
    policy = create_policy_mock(dummy_config,
                                use_rnn=rnn,
                                use_discrete=discrete,
                                use_visual=visual)
    policy.save_model(1000)
    settings = SerializationSettings(
        policy.model_path, os.path.join(tmpdir, policy.brain.brain_name))
    export_policy_model(settings, policy.graph, policy.sess)

    # These checks taken from test_barracuda_converter
    assert os.path.isfile(os.path.join(tmpdir, "test.nn"))
    assert os.path.getsize(os.path.join(tmpdir, "test.nn")) > 100
Example #26
def test_ppo_optimizer_update_gail(gail_dummy_config,
                                   dummy_config):  # noqa: F811
    # Test evaluate
    tf.reset_default_graph()
    dummy_config.reward_signals = gail_dummy_config
    optimizer = _create_ppo_optimizer_ops_mock(
        attr.evolve(ppo_dummy_config(), framework=FrameworkType.TENSORFLOW),
        use_rnn=False,
        use_discrete=False,
        use_visual=False,
    )
    # Test update
    behavior_spec = optimizer.policy.behavior_spec
    update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer[
        "environment_rewards"]
    # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
    n_agents = len(update_buffer["continuous_log_probs"])
    update_buffer["continuous_log_probs"] = np.ones(
        (n_agents, behavior_spec.action_spec.continuous_size),
        dtype=np.float32)
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )

    # Check if buffer size is too big
    update_buffer = mb.simulate_rollout(3000, optimizer.policy.behavior_spec)
    # Mock out reward signal eval
    update_buffer["advantages"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_returns"] = update_buffer["environment_rewards"]
    update_buffer["extrinsic_value_estimates"] = update_buffer[
        "environment_rewards"]
    update_buffer["gail_returns"] = update_buffer["environment_rewards"]
    update_buffer["gail_value_estimates"] = update_buffer[
        "environment_rewards"]
    optimizer.update(
        update_buffer,
        num_sequences=update_buffer.num_experiences //
        optimizer.policy.sequence_length,
    )
Example #27
def test_create_model(mock_get_devices, dummy_config):
    tf.reset_default_graph()
    mock_get_devices.return_value = [
        "/device:GPU:0",
        "/device:GPU:1",
        "/device:GPU:2",
        "/device:GPU:3",
    ]

    trainer_parameters = dummy_config
    trainer_parameters["model_path"] = ""
    trainer_parameters["keep_checkpoints"] = 3
    brain = create_mock_brainparams()

    policy = MultiGpuPPOPolicy(0, brain, trainer_parameters, False, False)
    assert len(policy.towers) == len(mock_get_devices.return_value)
Example #28
    def start_learning(self, env_manager: EnvManager) -> None:
        self._create_output_path(self.output_path)
        tf.reset_default_graph()
        global_step = 0
        last_brain_behavior_ids: Set[str] = set()
        try:
            # Initial reset
            self._reset_env(env_manager)
            while self._not_done_training():
                external_brain_behavior_ids = set(
                    env_manager.external_brains.keys())
                new_behavior_ids = external_brain_behavior_ids - last_brain_behavior_ids
                self._create_trainers_and_managers(env_manager,
                                                   new_behavior_ids)
                last_brain_behavior_ids = external_brain_behavior_ids
                n_steps = self.advance(env_manager)
                for _ in range(n_steps):
                    global_step += 1
                    self.reset_env_if_ready(env_manager, global_step)
                    if self._should_save_model(global_step):
                        self._save_model()
            # Stop advancing trainers
            self.join_threads()
            # Final save Tensorflow model
            if global_step != 0 and self.train_model:
                self._save_model()
        except (
                KeyboardInterrupt,
                UnityCommunicationException,
                UnityEnvironmentException,
                UnityCommunicatorStoppedException,
        ) as ex:
            self.join_threads()
            if self.train_model:
                self._save_model_when_interrupted()

            if isinstance(ex, KeyboardInterrupt) or isinstance(
                    ex, UnityCommunicatorStoppedException):
                pass
            else:
                # If the environment failed, we want to make sure to raise
                # the exception so we exit the process with a return code of 1.
                raise ex
        finally:
            if self.train_model:
                self._export_graph()
Example #29
def test_policy_conversion(tmpdir, rnn, visual, discrete):
    tf.reset_default_graph()
    dummy_config = TrainerSettings()
    policy = create_policy_mock(
        dummy_config,
        use_rnn=rnn,
        model_path=os.path.join(tmpdir, "test"),
        use_discrete=discrete,
        use_visual=visual,
    )
    settings = SerializationSettings(policy.model_path, "MockBrain")
    checkpoint_path = f"{tmpdir}/MockBrain-1"
    policy.checkpoint(checkpoint_path, settings)

    # These checks taken from test_barracuda_converter
    assert os.path.isfile(checkpoint_path + ".nn")
    assert os.path.getsize(checkpoint_path + ".nn") > 100
Example #30
def test_ppo_policy_evaluate(mock_communicator, mock_launcher, dummy_config):
    tf.reset_default_graph()
    mock_communicator.return_value = MockCommunicator(discrete_action=False,
                                                      visual_inputs=0)
    env = UnityEnvironment(" ")
    brain_infos = env.reset()
    brain_info = brain_infos[env.external_brain_names[0]]

    trainer_parameters = dummy_config
    model_path = env.external_brain_names[0]
    trainer_parameters["model_path"] = model_path
    trainer_parameters["keep_checkpoints"] = 3
    policy = PPOPolicy(0, env.brains[env.external_brain_names[0]],
                       trainer_parameters, False, False)
    run_out = policy.evaluate(brain_info)
    assert run_out["action"].shape == (3, 2)
    env.close()