def testTrainAgentWithHeteroscedasticNetworks(self):
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.01)
  networks_and_loss_fns = (
      self._create_objective_network_and_loss_fn_sequence())
  networks_and_loss_fns[-1] = (
      HeteroscedasticDummyNet(self._kernel_weights[-1], self._bias),
      tf.compat.v1.losses.mean_squared_error)
  agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
      self._time_step_spec,
      self._action_spec,
      self._scalarizer,
      objective_network_and_loss_fn_sequence=networks_and_loss_fns,
      optimizer=optimizer)
  observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
  actions = np.array([0, 1], dtype=np.int32)
  objectives = np.array([[1, 2, 3], [6, 5, 4]], dtype=np.float32)
  initial_step, final_step = _get_initial_and_final_steps(
      observations, objectives)
  action_step = _get_action_step(actions)
  experience = _get_experience(initial_step, action_step, final_step)
  loss_before, _ = agent.train(experience, None)
  loss_after, _ = agent.train(experience, None)
  self.evaluate(tf.compat.v1.initialize_all_variables())
  self.assertAllClose(self.evaluate(loss_before), 350.2502672)
  self.assertLess(self.evaluate(loss_after), 350.2502672)
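# With an observation-and-action-constraint splitter, each observation is a
# (features, mask) tuple; the lambda below tells the agent which element holds
# the features and which holds the per-action mask.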
def testTrainAgentWithMask(self):
  time_step_spec = ts.time_step_spec(
      observation_spec=(tensor_spec.TensorSpec([2], tf.float32),
                        tensor_spec.TensorSpec([3], tf.int32)),
      reward_spec=tensor_spec.TensorSpec([3], tf.float32))
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.01)
  agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
      time_step_spec,
      self._action_spec,
      self._scalarizer,
      objective_networks=self._create_objective_networks(),
      optimizer=optimizer,
      observation_and_action_constraint_splitter=lambda x: (x[0], x[1]))
  observations = (np.array([[1, 2], [3, 4]], dtype=np.float32),
                  np.array([[1, 0, 0], [1, 1, 0]], dtype=np.int32))
  actions = np.array([0, 1], dtype=np.int32)
  objectives = np.array([[1, 2, 3], [6, 5, 4]], dtype=np.float32)
  initial_step, final_step = _get_initial_and_final_steps_with_action_mask(
      observations, objectives)
  action_step = _get_action_step(actions)
  experience = _get_experience(initial_step, action_step, final_step)
  loss_before, _ = agent.train(experience, None)
  loss_after, _ = agent.train(experience, None)
  self.evaluate(tf.compat.v1.initialize_all_variables())
  self.assertAllClose(self.evaluate(loss_before), 763.5)
  self.assertLess(self.evaluate(loss_after), 763.5)
def testInitializeRestoreAgent(self):
  agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
      self._time_step_spec,
      self._action_spec,
      self._scalarizer,
      objective_networks=self._create_objective_networks(),
      optimizer=None)
  observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  policy = agent.policy
  action_step = policy.action(time_steps)
  self.evaluate(tf.compat.v1.initialize_all_variables())
  checkpoint = tf.train.Checkpoint(agent=agent)
  latest_checkpoint = tf.train.latest_checkpoint(self.get_temp_dir())
  checkpoint_load_status = checkpoint.restore(latest_checkpoint)
  if tf.executing_eagerly():
    self.evaluate(checkpoint_load_status.initialize_or_restore())
    self.assertAllEqual(self.evaluate(action_step.action), [2, 0])
  else:
    with self.cached_session() as sess:
      checkpoint_load_status.initialize_or_restore(sess)
      self.assertAllEqual(sess.run(action_step.action), [2, 0])
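# The HyperVolumeScalarizer's direction, slope, and offset parameters can be
# overridden per batch member via policy.scalarizer.set_parameters, steering
# the greedy action choice without rebuilding the policy.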
def testPolicySetScalarizationParameters(self):
  agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
      self._time_step_spec,
      self._action_spec,
      self._scalarizer,
      objective_network_and_loss_fn_sequence=(
          self._create_objective_network_and_loss_fn_sequence()),
      optimizer=None)
  observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  policy = agent.policy
  policy.scalarizer.set_parameters(
      direction=tf.constant([[0, 1, 0], [0, 0, 1]], dtype=tf.float32),
      transform_params={
          multi_objective_scalarizer.HyperVolumeScalarizer.SLOPE_KEY:
              tf.constant([[0.2, 0.2, 0.2], [0.1, 0.1, 0.1]],
                          dtype=tf.float32),
          multi_objective_scalarizer.HyperVolumeScalarizer.OFFSET_KEY:
              tf.constant([[1, 1, 1], [2, 2, 2]], dtype=tf.float32)
      })
  action_step = policy.action(time_steps)
  # Batch size 2.
  self.assertAllEqual([2], action_step.action.shape)
  self.evaluate(tf.compat.v1.initialize_all_variables())
  actions = self.evaluate(action_step.action)
  self.assertAllEqual(actions, [2, 1])
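# Each (network, loss_fn) pair may use a different loss: objectives 1 and 2
# are switched to sigmoid cross-entropy and absolute difference below, while
# objective 0 keeps whatever loss the helper sequence supplies (presumably
# mean squared error, matching the other tests).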
def testObjectiveDependentLosses(self):
  networks_and_loss_fns = (
      self._create_objective_network_and_loss_fn_sequence())
  networks_and_loss_fns[1] = (networks_and_loss_fns[1][0],
                              tf.compat.v1.losses.sigmoid_cross_entropy)
  networks_and_loss_fns[2] = (networks_and_loss_fns[2][0],
                              tf.compat.v1.losses.absolute_difference)
  agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
      self._time_step_spec,
      self._action_spec,
      self._scalarizer,
      objective_network_and_loss_fn_sequence=networks_and_loss_fns,
      optimizer=None)
  observations = np.array([[0.1, 0.2], [1, 0.5]], dtype=np.float32)
  actions = np.array([0, 1], dtype=np.int32)
  objectives = np.array([[0.2, 1, 1.5], [4, 0, 5.5]], dtype=np.float32)
  initial_step, final_step = _get_initial_and_final_steps(
      observations, objectives)
  action_step = _get_action_step(actions)
  experience = _get_experience(initial_step, action_step, final_step)
  init_op = agent.initialize()
  if not tf.executing_eagerly():
    with self.cached_session() as sess:
      common.initialize_uninitialized_variables(sess)
      self.assertIsNone(sess.run(init_op))
  loss, _ = agent._loss(experience)
  self.evaluate(tf.compat.v1.initialize_all_variables())
  self.assertAllClose(self.evaluate(loss), 2.410641)
def testCreateAgent(self):
  agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
      self._time_step_spec,
      self._action_spec,
      self._scalarizer,
      objective_networks=self._create_objective_networks(),
      optimizer=None)
  self.assertIsNotNone(agent.policy)
  self.assertEqual(len(agent._variables_to_train()), 6)
def testCreateAgentWithTooFewObjectiveNetworksRaisesError(self):
  with self.assertRaisesRegexp(
      ValueError,
      'Number of objectives should be at least two, but found to be 1'):
    greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
        self._time_step_spec,
        self._action_spec,
        self._scalarizer,
        objective_networks=[self._create_objective_networks()[0]],
        optimizer=None)
def testCreateAgentWithWrongActionsRaisesError(self):
  action_spec = tensor_spec.BoundedTensorSpec((5, 6, 7), tf.float32, 0, 2)
  with self.assertRaisesRegexp(ValueError,
                               'Action spec must be a scalar; got shape'):
    greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
        self._time_step_spec,
        action_spec,
        self._scalarizer,
        objective_networks=self._create_objective_networks(),
        optimizer=None)
def testInitializeAgent(self):
  agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
      self._time_step_spec,
      self._action_spec,
      self._scalarizer,
      objective_networks=self._create_objective_networks(),
      optimizer=None)
  init_op = agent.initialize()
  if not tf.executing_eagerly():
    with self.cached_session() as sess:
      common.initialize_uninitialized_variables(sess)
      self.assertIsNone(sess.run(init_op))
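# A heteroscedastic network also predicts the observation noise, which adds
# two trainable variables relative to testCreateAgent (8 vs. 6); the agent is
# expected to record per-objective heteroscedasticity in _heteroscedastic.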
def testCreateAgentWithHeteroscedasticNetworks(self):
  objective_networks = self._create_objective_networks()
  objective_networks[-1] = HeteroscedasticDummyNet(self._kernel_weights[-1],
                                                   self._bias)
  agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
      self._time_step_spec,
      self._action_spec,
      self._scalarizer,
      objective_networks=objective_networks,
      optimizer=None)
  self.assertIsNotNone(agent.policy)
  self.assertEqual(len(agent._variables_to_train()), 8)
  self.assertAllEqual(agent._heteroscedastic, [False, False, True])
def testCreateAgentWithHeteroscedasticNetworksAndLossFns(self):
  # Renamed from testCreateAgentWithHeteroscedasticNetworks: a duplicate
  # method name would silently shadow the test above.
  networks_and_loss_fns = (
      self._create_objective_network_and_loss_fn_sequence())
  networks_and_loss_fns[-1] = (
      HeteroscedasticDummyNet(self._kernel_weights[-1], self._bias),
      tf.compat.v1.losses.mean_squared_error)
  agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
      self._time_step_spec,
      self._action_spec,
      self._scalarizer,
      objective_network_and_loss_fn_sequence=networks_and_loss_fns,
      optimizer=None)
  self.assertIsNotNone(agent.policy)
  self.assertEqual(len(agent._variables_to_train()), 8)
  self.assertAllEqual(agent._heteroscedastic, [False, False, True])
def testPolicy(self):
  agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
      self._time_step_spec,
      self._action_spec,
      self._scalarizer,
      objective_networks=self._create_objective_networks(),
      optimizer=None)
  observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  policy = agent.policy
  action_step = policy.action(time_steps)
  # Batch size 2.
  self.assertAllEqual([2], action_step.action.shape)
  self.evaluate(tf.compat.v1.initialize_all_variables())
  actions = self.evaluate(action_step.action)
  self.assertAllEqual(actions, [2, 0])
def testLoss(self):
  agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
      self._time_step_spec,
      self._action_spec,
      self._scalarizer,
      objective_networks=self._create_objective_networks(),
      optimizer=None)
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  actions = tf.constant([0, 1], dtype=tf.int32)
  objectives = tf.constant([[8, 12, 11], [25, 18, 32]], dtype=tf.float32)
  init_op = agent.initialize()
  if not tf.executing_eagerly():
    with self.cached_session() as sess:
      common.initialize_uninitialized_variables(sess)
      self.assertIsNone(sess.run(init_op))
  loss, _ = agent.loss(observations, actions, objectives)
  self.evaluate(tf.compat.v1.initialize_all_variables())
  self.assertAllClose(self.evaluate(loss), 0.0)
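# Per-arm agents take a dict observation with global, per-arm, and
# num-actions features. The chosen arm's features travel in the policy info
# (PerArmPolicyInfo) so that training can pair each reward with the features
# of the arm that was actually selected.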
def testTrainPerArmAgent(self):
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      2, 3, 4, add_num_actions_feature=True)
  time_step_spec = ts.time_step_spec(
      observation_spec=obs_spec,
      reward_spec=tensor_spec.TensorSpec([3], tf.float32))
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
  networks_and_loss_fns = [
      (global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)),
       tf.compat.v1.losses.mean_squared_error) for _ in range(3)
  ]
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.01)
  agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
      time_step_spec,
      action_spec,
      self._scalarizer,
      objective_network_and_loss_fn_sequence=networks_and_loss_fns,
      accepts_per_arm_features=True,
      optimizer=optimizer)
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tf.cast(tf.reshape(tf.range(24), shape=[2, 4, 3]), dtype=tf.float32),
      bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
          tf.ones([2], dtype=tf.int32)
  }
  actions = np.array([0, 3], dtype=np.int32)
  objectives = np.array([[1, 2, 3], [6, 5, 4]], dtype=np.float32)
  initial_step, final_step = _get_initial_and_final_steps(
      observations, objectives)
  action_step = policy_step.PolicyStep(
      action=tf.convert_to_tensor(actions),
      info=policy_utilities.PerArmPolicyInfo(
          chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                       dtype=np.float32)))
  experience = _get_experience(initial_step, action_step, final_step)
  agent.train(experience, None)
  self.evaluate(tf.compat.v1.initialize_all_variables())
def testTrainAgent(self):
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.01)
  agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
      self._time_step_spec,
      self._action_spec,
      self._scalarizer,
      objective_networks=self._create_objective_networks(),
      optimizer=optimizer)
  observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
  actions = np.array([0, 1], dtype=np.int32)
  objectives = np.array([[1, 2, 3], [6, 5, 4]], dtype=np.float32)
  initial_step, final_step = _get_initial_and_final_steps(
      observations, objectives)
  action_step = _get_action_step(actions)
  experience = _get_experience(initial_step, action_step, final_step)
  loss_before, _ = agent.train(experience, None)
  loss_after, _ = agent.train(experience, None)
  self.evaluate(tf.compat.v1.initialize_all_variables())
  self.assertAllClose(self.evaluate(loss_before), 763.5)
  self.assertLess(self.evaluate(loss_after), 763.5)
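# Training should fail fast when the objective-values tensor disagrees with
# the number of objective networks.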
def testTrainAgentWithWrongNumberOfObjectivesRaisesError(self):
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.01)
  objective_networks = self._create_objective_networks()
  objective_networks.pop(0)
  agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
      self._time_step_spec,
      self._action_spec,
      self._scalarizer,
      objective_networks=objective_networks,
      optimizer=optimizer)
  observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
  actions = np.array([0, 1], dtype=np.int32)
  objectives = np.array([[1, 2, 3], [6, 5, 4]], dtype=np.float32)
  initial_step, final_step = _get_initial_and_final_steps(
      observations, objectives)
  action_step = _get_action_step(actions)
  experience = _get_experience(initial_step, action_step, final_step)
  with self.assertRaisesRegexp(
      ValueError,
      'The number of objectives in the objective_values tensor: 3 is '
      'different from the number of objective networks: 2'):
    agent.train(experience, None)
def testLossWithLossFnSequence(self):
  # Renamed from testLoss: a duplicate method name would silently shadow the
  # objective_networks-based testLoss above.
  agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
      self._time_step_spec,
      self._action_spec,
      self._scalarizer,
      objective_network_and_loss_fn_sequence=(
          self._create_objective_network_and_loss_fn_sequence()),
      optimizer=None)
  observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
  actions = np.array([0, 1], dtype=np.int32)
  objectives = np.array([[8, 12, 11], [25, 18, 32]], dtype=np.float32)
  initial_step, final_step = _get_initial_and_final_steps(
      observations, objectives)
  action_step = _get_action_step(actions)
  experience = _get_experience(initial_step, action_step, final_step)
  init_op = agent.initialize()
  if not tf.executing_eagerly():
    with self.cached_session() as sess:
      common.initialize_uninitialized_variables(sess)
      self.assertIsNone(sess.run(init_op))
  loss, _ = agent._loss(experience)
  self.evaluate(tf.compat.v1.initialize_all_variables())
  self.assertAllClose(self.evaluate(loss), 0.0)