  def testScalarizeObjectivesWrongInputRankRaisesError(self):
    objectives_tensor = tf.constant([1], dtype=tf.float32)
    with self.assertRaisesRegexp(
        ValueError, 'The objectives_tensor should be rank-3, but is rank-1'):
      greedy_multi_objective_policy.scalarize_objectives(
          objectives_tensor, self._scalarizer)

  def testPredictedRewards(self):
    policy = greedy_multi_objective_policy.GreedyMultiObjectiveNeuralPolicy(
        self._time_step_spec,
        self._action_spec,
        self._scalarizer,
        self._create_objective_networks(),
        emit_policy_info=(
            policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN,
            policy_utilities.InfoFields
            .MULTIOBJECTIVE_SCALARIZED_PREDICTED_REWARDS_MEAN))
    observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
    time_step = ts.restart(observations, batch_size=2)
    action_step = policy.action(time_step)
    self.assertEqual(action_step.action.shape.as_list(), [2])
    self.assertEqual(action_step.action.dtype, tf.int32)
    # Initialize all variables.
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAllEqual(self.evaluate(action_step.action), [2, 0])
    # The expected values are obtained by passing the observation through the
    # Keras dense layer of the DummyNet (defined above).
    predicted_rewards_expected_array = np.array([[[8, 11, 14], [12, 8, 13],
                                                  [11, 14, 8]],
                                                 [[5, 8, 11], [10, 5, 9],
                                                  [8, 11, 5]]])
    p_info = self.evaluate(action_step.info)
    predicted_rewards = getattr(
        p_info, policy_utilities.InfoFields.PREDICTED_REWARDS_MEAN)
    self.assertAllClose(predicted_rewards, predicted_rewards_expected_array)
    self.assertAllClose(
        getattr(
            p_info, policy_utilities.InfoFields
            .MULTIOBJECTIVE_SCALARIZED_PREDICTED_REWARDS_MEAN),
        greedy_multi_objective_policy.scalarize_objectives(
            predicted_rewards, policy.scalarizer))

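  # A minimal reference sketch of the per-action scalarization contract
  # exercised by the tests in this file. This is NOT the library
  # implementation; it only assumes the shape contract implied by the tests:
  # the input is a rank-3 [batch_size, num_of_objectives, num_actions]
  # tensor, and the scalarizer maps a [batch_size, num_of_objectives] slice
  # to a [batch_size] vector of scalarized rewards.
  def _scalarize_per_action_sketch(self, objectives_tensor):
    num_actions = objectives_tensor.shape[-1]
    # Scalarize the objectives of each action independently, then stack the
    # per-action results back into a [batch_size, num_actions] tensor.
    return tf.stack(
        [
            self._scalarizer(objectives_tensor[:, :, k])
            for k in range(num_actions)
        ],
        axis=-1)
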
  def testScalarizeObjectivesWrongNumberOfObjectivesRaisesError(self):
    objectives_tensor = tf.constant([[[1, 2, 3]], [[4, 5, 6]]],
                                    dtype=tf.float32)
    with self.assertRaisesRegexp(
        ValueError, 'The number of input objectives should be 3, but is 1'):
      self.evaluate(
          greedy_multi_objective_policy.scalarize_objectives(
              objectives_tensor, self._scalarizer))

  def testScalarizeObjectives(self):
    objectives_tensor = tf.constant(
        [[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]],
        dtype=tf.float32)
    scalarized_reward = greedy_multi_objective_policy.scalarize_objectives(
        objectives_tensor, self._scalarizer)
    self.assertAllClose(
        self.evaluate(scalarized_reward), [[3, 4], [9, 10]],
        rtol=1e-4,
        atol=1e-3)

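  # Cross-check (illustrative): the hypothetical _scalarize_per_action_sketch
  # defined above should agree with the library's scalarize_objectives on the
  # same input, under the shape assumptions stated there. Per action, the
  # objective triples are (1, 3, 5) and (2, 4, 6) for the first batch member
  # and (7, 9, 11) and (8, 10, 12) for the second.
  def testScalarizeObjectivesMatchesPerActionSketch(self):
    objectives_tensor = tf.constant(
        [[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]],
        dtype=tf.float32)
    self.assertAllClose(
        self.evaluate(self._scalarize_per_action_sketch(objectives_tensor)),
        self.evaluate(
            greedy_multi_objective_policy.scalarize_objectives(
                objectives_tensor, self._scalarizer)),
        rtol=1e-4,
        atol=1e-3)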