def testSingleStepArrays(self):
    observation = ()
    action = ()
    policy_info = ()
    reward = np.array([1.0, 1.0, 2.0])
    discount = np.array([1.0, 1.0, 1.0])
    traj = trajectory.single_step(observation, action, policy_info, reward,
                                  discount)
    self.assertFalse(tf.is_tensor(traj.step_type))
    self.assertAllEqual(traj.step_type, [ts.StepType.FIRST] * 3)
    self.assertAllEqual(traj.next_step_type, [ts.StepType.LAST] * 3)

def testSingleStepTensors(self):
    observation = ()
    action = ()
    policy_info = ()
    reward = tf.constant([1.0, 1.0, 2.0])
    discount = tf.constant([1.0, 1.0, 1.0])
    traj = trajectory.single_step(observation, action, policy_info, reward,
                                  discount)
    self.assertTrue(tf.is_tensor(traj.step_type))
    traj_val = self.evaluate(traj)
    self.assertAllEqual(traj_val.step_type, [ts.StepType.FIRST] * 3)
    self.assertAllEqual(traj_val.next_step_type, [ts.StepType.LAST] * 3)
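For reference, here is the same call as a standalone sketch outside the test class, with the imports the snippet assumes (numpy, TensorFlow, and the tf_agents trajectory/time_step modules):

import numpy as np
import tensorflow as tf
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory

# Build a batch of three single-step (FIRST -> LAST) trajectories from numpy inputs.
traj = trajectory.single_step(
    observation=(),
    action=(),
    policy_info=(),
    reward=np.array([1.0, 1.0, 2.0]),
    discount=np.array([1.0, 1.0, 1.0]))

print(tf.is_tensor(traj.step_type))             # False: numpy in, numpy out
print(traj.step_type == ts.StepType.FIRST)      # [ True  True  True]
print(traj.next_step_type == ts.StepType.LAST)  # [ True  True  True]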
Example #3
def test_agent_trainer_with_environment_steps_metric(mocker):
    """
    Use a mock agent and the environment steps metric to trigger the training, as in the
    experiment harness.
    """
    multi_component_agent_trainer = MultiComponentAgentTrainer()
    mock_agent = mocker.MagicMock(spec=TFAgent)
    mock_train_argspec = mocker.PropertyMock(
        return_value={
            TRAIN_ARGSPEC_COMPONENT_ID: BoundedTensorSpec((), tf.int64, 0, 2)
        })
    type(mock_agent).train_argspec = mock_train_argspec
    mock_replay_buffer = mocker.MagicMock(spec=ReplayBuffer)

    training_scheduler = multi_component_agent_trainer.create_training_scheduler(
        mock_agent, mock_replay_buffer)

    # After zero environment steps, do not train any models
    environment_steps_metric = EnvironmentSteps()
    loss_dictionary = training_scheduler.maybe_train(
        environment_steps_metric.result())
    assert not loss_dictionary

    # After one environment step, train the first model
    single_step_trajectory = single_step(tf.zeros(()), tf.zeros(()), (),
                                         tf.zeros(()), tf.zeros(()))
    environment_steps_metric.call(single_step_trajectory)
    loss_dictionary_1 = training_scheduler.maybe_train(
        environment_steps_metric.result())
    assert len(loss_dictionary_1) == 1
    assert (loss_dictionary_1[MultiComponentAgent.COMPONENT_1].extra ==
            MultiComponentAgent.COMPONENT_1.name)

    # After two environment steps, train the first and second models
    environment_steps_metric.call(single_step_trajectory)
    loss_dictionary_2 = training_scheduler.maybe_train(
        environment_steps_metric.result())
    assert len(loss_dictionary_2) == 2
    assert (loss_dictionary_2[MultiComponentAgent.COMPONENT_1].extra ==
            MultiComponentAgent.COMPONENT_1.name)
    assert (loss_dictionary_2[MultiComponentAgent.COMPONENT_2].extra ==
            MultiComponentAgent.COMPONENT_2.name)

    # After three environment steps, train the first and third models
    environment_steps_metric.call(single_step_trajectory)
    loss_dictionary_3 = training_scheduler.maybe_train(
        environment_steps_metric.result())
    assert len(loss_dictionary_3) == 2
    assert (loss_dictionary_3[MultiComponentAgent.COMPONENT_1].extra ==
            MultiComponentAgent.COMPONENT_1.name)
    assert (loss_dictionary_3[MultiComponentAgent.COMPONENT_3].extra ==
            MultiComponentAgent.COMPONENT_3.name)
Example #4
    def _train(self, experience, weights=None):
        del weights  # unused
        experience = self._as_trajectory(experience)

        # Flatten the outer (batch, time) dimensions so that every transition
        # becomes a single row in one flat batch.
        reward, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.reward, self._time_step_spec.reward)
        action, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.action, self._action_spec)
        observation, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.observation, self._time_step_spec.observation)
        policy_choice, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.policy_info[mixture_policy.MIXTURE_AGENT_ID],
            self._time_step_spec.reward)
        original_infos, _ = nest_utils.flatten_multi_batched_nested_tensors(
            experience.policy_info[mixture_policy.SUBPOLICY_INFO],
            self._original_info_spec)

        # Partition every flattened tensor by the sub-agent (policy_choice)
        # that generated the corresponding step.
        partitioned_nested_infos = nest_utils.batch_nested_tensors(
            _dynamic_partition_of_nested_tensors(original_infos, policy_choice,
                                                 self._num_agents))

        partitioned_nested_rewards = [
            nest_utils.batch_nested_tensors(t)
            for t in _dynamic_partition_of_nested_tensors(
                reward, policy_choice, self._num_agents)
        ]
        partitioned_nested_actions = [
            nest_utils.batch_nested_tensors(t)
            for t in _dynamic_partition_of_nested_tensors(
                action, policy_choice, self._num_agents)
        ]
        partitioned_nested_observations = [
            nest_utils.batch_nested_tensors(t)
            for t in _dynamic_partition_of_nested_tensors(
                observation, policy_choice, self._num_agents)
        ]
        # Train each sub-agent on its own partition and accumulate the losses.
        loss = 0
        for k in range(self._num_agents):
            per_policy_experience = trajectory.single_step(
                observation=partitioned_nested_observations[k],
                action=partitioned_nested_actions[k],
                policy_info=partitioned_nested_infos[k],
                reward=partitioned_nested_rewards[k],
                discount=tf.zeros_like(partitioned_nested_rewards[k]))
            loss_info = self._agents[k].train(per_policy_experience)
            loss += loss_info.loss
        # Update the mixture policy's distribution over sub-agents from the
        # experience (wrapped as a tf.function when running under TF1).
        common.function_in_tf1()(self._update_mixture_distribution)(experience)
        return tf_agent.LossInfo(loss=loss, extra=())
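Most of the work in _train is routing each flattened step to the sub-agent that produced it. The helper _dynamic_partition_of_nested_tensors is not shown above; the sketch below illustrates the underlying idea with plain tf.dynamic_partition on a single, non-nested tensor (the helper is assumed to apply the same routing to every leaf of a nest):

import tensorflow as tf

reward = tf.constant([0.1, 0.2, 0.3, 0.4])
policy_choice = tf.constant([0, 1, 0, 2], dtype=tf.int32)
num_agents = 3

# Each row of `reward` is routed to the partition named by `policy_choice`.
per_agent_rewards = tf.dynamic_partition(reward, policy_choice, num_agents)
for k, rewards_k in enumerate(per_agent_rewards):
    print(k, rewards_k.numpy())
# 0 [0.1 0.3]
# 1 [0.2]
# 2 [0.4]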