def testSingleStepArrays(self):
  observation = ()
  action = ()
  policy_info = ()
  reward = np.array([1.0, 1.0, 2.0])
  discount = np.array([1.0, 1.0, 1.0])
  traj = trajectory.single_step(observation, action, policy_info, reward,
                                discount)
  self.assertFalse(tf.is_tensor(traj.step_type))
  self.assertAllEqual(traj.step_type, [ts.StepType.FIRST] * 3)
  self.assertAllEqual(traj.next_step_type, [ts.StepType.LAST] * 3)
def testSingleStepTensors(self):
  observation = ()
  action = ()
  policy_info = ()
  reward = tf.constant([1.0, 1.0, 2.0])
  discount = tf.constant([1.0, 1.0, 1.0])
  traj = trajectory.single_step(observation, action, policy_info, reward,
                                discount)
  self.assertTrue(tf.is_tensor(traj.step_type))
  traj_val = self.evaluate(traj)
  self.assertAllEqual(traj_val.step_type, [ts.StepType.FIRST] * 3)
  self.assertAllEqual(traj_val.next_step_type, [ts.StepType.LAST] * 3)
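# A minimal standalone sketch (assumed usage, not part of the original test
# suite) of what the two tests above verify: `trajectory.single_step` marks
# every element of the batch as a complete one-frame episode, so step_type is
# FIRST and next_step_type is LAST for all three samples.
import numpy as np
from tf_agents.trajectories import trajectory
from tf_agents.trajectories import time_step as ts

demo_traj = trajectory.single_step(
    observation=(), action=(), policy_info=(),
    reward=np.array([1.0, 1.0, 2.0]),
    discount=np.array([1.0, 1.0, 1.0]))
assert all(demo_traj.step_type == ts.StepType.FIRST)
assert all(demo_traj.next_step_type == ts.StepType.LAST)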
def test_agent_trainer_with_environment_steps_metric(mocker):
    """
    Use a mock agent and the environment steps metric to trigger the training,
    as in the experiment harness.
    """
    multi_component_agent_trainer = MultiComponentAgentTrainer()
    mock_agent = mocker.MagicMock(spec=TFAgent)
    mock_train_argspec = mocker.PropertyMock(
        return_value={
            TRAIN_ARGSPEC_COMPONENT_ID: BoundedTensorSpec((), tf.int64, 0, 2)
        }
    )
    type(mock_agent).train_argspec = mock_train_argspec
    mock_replay_buffer = mocker.MagicMock(spec=ReplayBuffer)

    training_scheduler = multi_component_agent_trainer.create_training_scheduler(
        mock_agent, mock_replay_buffer
    )

    # After zero environment steps, do not train any models.
    environment_steps_metric = EnvironmentSteps()
    loss_dictionary = training_scheduler.maybe_train(
        environment_steps_metric.result()
    )
    assert not loss_dictionary

    # After one environment step, train the first model.
    single_step_trajectory = single_step(
        tf.zeros(()), tf.zeros(()), (), tf.zeros(()), tf.zeros(())
    )
    environment_steps_metric.call(single_step_trajectory)
    loss_dictionary_1 = training_scheduler.maybe_train(
        environment_steps_metric.result()
    )
    assert len(loss_dictionary_1) == 1
    assert (
        loss_dictionary_1[MultiComponentAgent.COMPONENT_1].extra
        == MultiComponentAgent.COMPONENT_1.name
    )

    # After two environment steps, train the first and second models.
    environment_steps_metric.call(single_step_trajectory)
    loss_dictionary_2 = training_scheduler.maybe_train(
        environment_steps_metric.result()
    )
    assert len(loss_dictionary_2) == 2
    assert (
        loss_dictionary_2[MultiComponentAgent.COMPONENT_1].extra
        == MultiComponentAgent.COMPONENT_1.name
    )
    assert (
        loss_dictionary_2[MultiComponentAgent.COMPONENT_2].extra
        == MultiComponentAgent.COMPONENT_2.name
    )

    # After three environment steps, train the first and third models.
    environment_steps_metric.call(single_step_trajectory)
    loss_dictionary_3 = training_scheduler.maybe_train(
        environment_steps_metric.result()
    )
    assert len(loss_dictionary_3) == 2
    assert (
        loss_dictionary_3[MultiComponentAgent.COMPONENT_1].extra
        == MultiComponentAgent.COMPONENT_1.name
    )
    assert (
        loss_dictionary_3[MultiComponentAgent.COMPONENT_3].extra
        == MultiComponentAgent.COMPONENT_3.name
    )
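# A hedged sketch of the scheduling behaviour the test above exercises. The
# names here (`StepCountScheduler`, `train_fns`, `intervals`) are hypothetical
# illustrations, not the real MultiComponentAgentTrainer API: each component
# trains whenever the environment-step count is a positive multiple of its
# interval, e.g. component 1 every step, component 2 every 2 steps, and
# component 3 every 3 steps, which reproduces the loss dictionaries asserted
# above.
class StepCountScheduler:
    def __init__(self, train_fns, intervals):
        self._train_fns = train_fns  # component id -> callable returning a LossInfo
        self._intervals = intervals  # component id -> training period in env steps

    def maybe_train(self, environment_steps):
        steps = int(environment_steps)
        losses = {}
        if steps == 0:
            # No experience collected yet, so nothing to train on.
            return losses
        for component, interval in self._intervals.items():
            if steps % interval == 0:
                losses[component] = self._train_fns[component]()
        return losses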
def _train(self, experience, weights=None):
  del weights  # unused
  experience = self._as_trajectory(experience)
  # Flatten the batch and time dimensions of every field so the samples can be
  # partitioned per sub-agent.
  reward, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.reward, self._time_step_spec.reward)
  action, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.action, self._action_spec)
  observation, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.observation, self._time_step_spec.observation)
  # `policy_choice` records which sub-agent generated each sample; it is a
  # per-sample scalar, so the (scalar) reward spec describes its shape.
  policy_choice, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.policy_info[mixture_policy.MIXTURE_AGENT_ID],
      self._time_step_spec.reward)
  original_infos, _ = nest_utils.flatten_multi_batched_nested_tensors(
      experience.policy_info[mixture_policy.SUBPOLICY_INFO],
      self._original_info_spec)

  # Split every field into one sub-batch per sub-agent, keyed by the sub-agent
  # that produced each sample.
  partitioned_nested_infos = nest_utils.batch_nested_tensors(
      _dynamic_partition_of_nested_tensors(original_infos, policy_choice,
                                           self._num_agents))
  partitioned_nested_rewards = [
      nest_utils.batch_nested_tensors(t)
      for t in _dynamic_partition_of_nested_tensors(reward, policy_choice,
                                                    self._num_agents)
  ]
  partitioned_nested_actions = [
      nest_utils.batch_nested_tensors(t)
      for t in _dynamic_partition_of_nested_tensors(action, policy_choice,
                                                    self._num_agents)
  ]
  partitioned_nested_observations = [
      nest_utils.batch_nested_tensors(t)
      for t in _dynamic_partition_of_nested_tensors(observation, policy_choice,
                                                    self._num_agents)
  ]

  # Train each sub-agent on the single-step transitions it generated, summing
  # the per-agent losses.
  loss = 0
  for k in range(self._num_agents):
    per_policy_experience = trajectory.single_step(
        observation=partitioned_nested_observations[k],
        action=partitioned_nested_actions[k],
        policy_info=partitioned_nested_infos[k],
        reward=partitioned_nested_rewards[k],
        discount=tf.zeros_like(partitioned_nested_rewards[k]))
    loss_info = self._agents[k].train(per_policy_experience)
    loss += loss_info.loss
  # Update the mixture weights from the full batch of experience.
  common.function_in_tf1()(self._update_mixture_distribution)(experience)
  return tf_agent.LossInfo(loss=loss, extra=())
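# A minimal sketch of what a helper like `_dynamic_partition_of_nested_tensors`
# could look like (an assumption for illustration, not necessarily the actual
# implementation): it maps tf.dynamic_partition over every leaf of a nest,
# splitting a flattened batch into one sub-batch per sub-agent according to the
# per-sample `partitions` ids.
import tensorflow as tf


def _dynamic_partition_of_nested_tensors_sketch(nested_tensor, partitions,
                                                num_partitions):
  """Returns a list of `num_partitions` nests, one per partition id."""
  flat = tf.nest.flatten(nested_tensor)
  # Each leaf tensor becomes a list of `num_partitions` pieces.
  partitioned_flat = [
      tf.dynamic_partition(
          data=t, partitions=partitions, num_partitions=num_partitions)
      for t in flat
  ]
  # Regroup: for partition k, rebuild the original nest structure from the
  # k-th piece of every leaf.
  return [
      tf.nest.pack_sequence_as(nested_tensor, [p[k] for p in partitioned_flat])
      for k in range(num_partitions)
  ]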