Example #1
 def testTrainAgentWithHeteroscedasticNetworks(self):
     optimizer = tf.compat.v1.train.GradientDescentOptimizer(
         learning_rate=0.01)
     networks_and_loss_fns = self._create_objective_network_and_loss_fn_sequence(
     )
     networks_and_loss_fns[-1] = (HeteroscedasticDummyNet(
         self._kernel_weights[-1],
         self._bias), tf.compat.v1.losses.mean_squared_error)
     agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
         self._time_step_spec,
         self._action_spec,
         self._scalarizer,
         objective_network_and_loss_fn_sequence=networks_and_loss_fns,
         optimizer=optimizer)
     observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
     actions = np.array([0, 1], dtype=np.int32)
     objectives = np.array([[1, 2, 3], [6, 5, 4]], dtype=np.float32)
     initial_step, final_step = _get_initial_and_final_steps(
         observations, objectives)
     action_step = _get_action_step(actions)
     experience = _get_experience(initial_step, action_step, final_step)
     loss_before, _ = agent.train(experience, None)
     loss_after, _ = agent.train(experience, None)
     self.evaluate(tf.compat.v1.initialize_all_variables())
     self.assertAllClose(self.evaluate(loss_before), 350.2502672)
     self.assertLess(self.evaluate(loss_after), 350.2502672)
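The helper functions _get_initial_and_final_steps, _get_action_step, and _get_experience are defined in the test module's preamble and are not shown in these snippets. Below is a minimal sketch of what they might look like, assuming single-step bandit trajectories built with the standard tf_agents.trajectories utilities and the usual tensorflow/numpy imports from the examples above; the real test file defines its own versions, so details may differ.

    from tf_agents.trajectories import policy_step, trajectory, time_step as ts

    def _get_initial_and_final_steps(observations, objectives):
        # Build FIRST and LAST time steps for a batch of single-step episodes,
        # attaching the per-objective rewards to the final step.
        batch_size = objectives.shape[0]
        initial_step = ts.TimeStep(
            tf.constant(ts.StepType.FIRST, dtype=tf.int32, shape=[batch_size]),
            tf.zeros_like(objectives),
            tf.zeros([batch_size], dtype=tf.float32),
            tf.constant(observations))
        final_step = ts.TimeStep(
            tf.constant(ts.StepType.LAST, dtype=tf.int32, shape=[batch_size]),
            tf.constant(objectives),
            tf.zeros([batch_size], dtype=tf.float32),
            tf.constant(observations))
        return initial_step, final_step

    def _get_action_step(action):
        # Wrap the chosen actions in a PolicyStep (state and info left empty).
        return policy_step.PolicyStep(action=tf.convert_to_tensor(action))

    def _get_experience(initial_step, action_step, final_step):
        # Convert the transition into a Trajectory and add a time dimension of 1.
        single_experience = trajectory.from_transition(
            initial_step, action_step, final_step)
        return tf.nest.map_structure(
            lambda x: tf.expand_dims(tf.convert_to_tensor(x), 1),
            single_experience)

With helpers of this shape, the experience passed to agent.train is a batch of two single-step trajectories whose reward field holds the three objective values.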
Example #2
 def testTrainAgentWithMask(self):
     time_step_spec = ts.time_step_spec(
         observation_spec=(tensor_spec.TensorSpec([2], tf.float32),
                           tensor_spec.TensorSpec([3], tf.int32)),
         reward_spec=tensor_spec.TensorSpec([3], tf.float32))
     optimizer = tf.compat.v1.train.GradientDescentOptimizer(
         learning_rate=0.01)
     agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
         time_step_spec,
         self._action_spec,
         self._scalarizer,
         objective_networks=self._create_objective_networks(),
         optimizer=optimizer,
         observation_and_action_constraint_splitter=lambda x: (x[0], x[1]))
     observations = (np.array([[1, 2], [3, 4]], dtype=np.float32),
                     np.array([[1, 0, 0], [1, 1, 0]], dtype=np.int32))
     actions = np.array([0, 1], dtype=np.int32)
     objectives = np.array([[1, 2, 3], [6, 5, 4]], dtype=np.float32)
     initial_step, final_step = _get_initial_and_final_steps_with_action_mask(
         observations, objectives)
     action_step = _get_action_step(actions)
     experience = _get_experience(initial_step, action_step, final_step)
     loss_before, _ = agent.train(experience, None)
     loss_after, _ = agent.train(experience, None)
     self.evaluate(tf.compat.v1.initialize_all_variables())
     self.assertAllClose(self.evaluate(loss_before), 763.5)
     self.assertLess(self.evaluate(loss_after), 763.5)
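The _create_objective_networks helper (and the _create_objective_network_and_loss_fn_sequence variant used in other examples) returns one small network per objective; in the test these are hand-built dummy networks with fixed kernel weights, which is why exact loss values such as 763.5 are reproducible. As a rough stand-in, an equivalent list could be built from one tf_agents QNetwork per objective, using the observation spec from the test's setUp. This is an illustrative assumption, not the test's actual DummyNet:

    from tf_agents.networks import q_network

    def _create_objective_networks(self):
        # One value network per objective; each maps an observation to one
        # predicted value per action.
        return [
            q_network.QNetwork(
                input_tensor_spec=self._time_step_spec.observation,
                action_spec=self._action_spec,
                fc_layer_params=(4,))
            for _ in range(3)
        ]

With randomly initialized networks like these, the numeric loss assertions in the examples would of course not hold.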
Example #3
    def testInitializeRestoreAgent(self):
        agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
            self._time_step_spec,
            self._action_spec,
            self._scalarizer,
            objective_networks=self._create_objective_networks(),
            optimizer=None)
        observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
        time_steps = ts.restart(observations, batch_size=2)
        policy = agent.policy
        action_step = policy.action(time_steps)
        self.evaluate(tf.compat.v1.initialize_all_variables())

        checkpoint = tf.train.Checkpoint(agent=agent)

        latest_checkpoint = tf.train.latest_checkpoint(self.get_temp_dir())
        checkpoint_load_status = checkpoint.restore(latest_checkpoint)

        if tf.executing_eagerly():
            self.evaluate(checkpoint_load_status.initialize_or_restore())
            self.assertAllEqual(self.evaluate(action_step.action), [2, 0])
        else:
            with self.cached_session() as sess:
                checkpoint_load_status.initialize_or_restore(sess)
                self.assertAllEqual(sess.run(action_step.action), [2, 0])
Example #4
 def testPolicySetScalarizationParameters(self):
     agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
         self._time_step_spec,
         self._action_spec,
         self._scalarizer,
         objective_network_and_loss_fn_sequence=self.
         _create_objective_network_and_loss_fn_sequence(),
         optimizer=None)
     observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
     time_steps = ts.restart(observations, batch_size=2)
     policy = agent.policy
     policy.scalarizer.set_parameters(
         direction=tf.constant([[0, 1, 0], [0, 0, 1]], dtype=tf.float32),
         transform_params={
             multi_objective_scalarizer.HyperVolumeScalarizer.SLOPE_KEY:
             tf.constant([[0.2, 0.2, 0.2], [0.1, 0.1, 0.1]],
                         dtype=tf.float32),
             multi_objective_scalarizer.HyperVolumeScalarizer.OFFSET_KEY:
             tf.constant([[1, 1, 1], [2, 2, 2]], dtype=tf.float32)
         })
     action_step = policy.action(time_steps)
     # Batch size 2.
     self.assertAllEqual([2], action_step.action.shape)
     self.evaluate(tf.compat.v1.initialize_all_variables())
     actions = self.evaluate(action_step.action)
     self.assertAllEqual(actions, [2, 1])
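The self._scalarizer passed to the agent is a HyperVolumeScalarizer created in the test's setUp and not shown here. The sketch below shows how a three-objective instance might be constructed; the direction and transform values are illustrative, not the ones used in setUp.

    from tf_agents.bandits.multi_objective import multi_objective_scalarizer

    # Direction of the reference ray in objective space, plus one slope/offset
    # transform per objective.
    scalarizer = multi_objective_scalarizer.HyperVolumeScalarizer(
        direction=[0.0, 0.0, 1.0],
        transform_params=[{
            multi_objective_scalarizer.HyperVolumeScalarizer.SLOPE_KEY: 1.0,
            multi_objective_scalarizer.HyperVolumeScalarizer.OFFSET_KEY: 0.0,
        }] * 3)

The policy.scalarizer.set_parameters call in this example then overrides these values with per-batch-member tensors before actions are selected.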
Example #5
    def testObjectiveDependentLosses(self):
        networks_and_loss_fns = self._create_objective_network_and_loss_fn_sequence(
        )
        networks_and_loss_fns[1] = (networks_and_loss_fns[1][0],
                                    tf.compat.v1.losses.sigmoid_cross_entropy)
        networks_and_loss_fns[2] = (networks_and_loss_fns[2][0],
                                    tf.compat.v1.losses.absolute_difference)
        agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
            self._time_step_spec,
            self._action_spec,
            self._scalarizer,
            objective_network_and_loss_fn_sequence=networks_and_loss_fns,
            optimizer=None)
        observations = np.array([[0.1, 0.2], [1, 0.5]], dtype=np.float32)
        actions = np.array([0, 1], dtype=np.int32)
        objectives = np.array([[0.2, 1, 1.5], [4, 0, 5.5]], dtype=np.float32)
        initial_step, final_step = _get_initial_and_final_steps(
            observations, objectives)
        action_step = _get_action_step(actions)
        experience = _get_experience(initial_step, action_step, final_step)

        init_op = agent.initialize()
        if not tf.executing_eagerly():
            with self.cached_session() as sess:
                common.initialize_uninitialized_variables(sess)
                self.assertIsNone(sess.run(init_op))
        loss, _ = agent._loss(experience)
        self.evaluate(tf.compat.v1.initialize_all_variables())
        self.assertAllClose(self.evaluate(loss), 2.410641)
Example #6
 def testCreateAgent(self):
     agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
         self._time_step_spec,
         self._action_spec,
         self._scalarizer,
         objective_networks=self._create_objective_networks(),
         optimizer=None)
     self.assertIsNotNone(agent.policy)
     self.assertEqual(len(agent._variables_to_train()), 6)
 def testCreateAgentWithTooFewObjectiveNetworksRaisesError(self):
   with self.assertRaisesRegexp(
       ValueError,
       'Number of objectives should be at least two, but found to be 1'):
     greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
         self._time_step_spec,
         self._action_spec,
         self._scalarizer,
         objective_networks=[self._create_objective_networks()[0]],
         optimizer=None)
 def testCreateAgentWithWrongActionsRaisesError(self):
   action_spec = tensor_spec.BoundedTensorSpec((5, 6, 7), tf.float32, 0, 2)
   with self.assertRaisesRegexp(ValueError,
                                'Action spec must be a scalar; got shape'):
     greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
         self._time_step_spec,
         action_spec,
         self._scalarizer,
         objective_networks=self._create_objective_networks(),
         optimizer=None)
Example #9
 def testInitializeAgent(self):
     agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
         self._time_step_spec,
         self._action_spec,
         self._scalarizer,
         objective_networks=self._create_objective_networks(),
         optimizer=None)
     init_op = agent.initialize()
     if not tf.executing_eagerly():
         with self.cached_session() as sess:
             common.initialize_uninitialized_variables(sess)
             self.assertIsNone(sess.run(init_op))
Example #10
 def testCreateAgentWithHeteroscedasticNetworks(self):
     objective_networks = self._create_objective_networks()
     objective_networks[-1] = HeteroscedasticDummyNet(
         self._kernel_weights[-1], self._bias)
     agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
         self._time_step_spec,
         self._action_spec,
         self._scalarizer,
         objective_networks=objective_networks,
         optimizer=None)
     self.assertIsNotNone(agent.policy)
     self.assertEqual(len(agent._variables_to_train()), 8)
     self.assertAllEqual(agent._heteroscedastic, [False, False, True])
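HeteroscedasticDummyNet is a test double that, in addition to a value per action, predicts a log-variance, which is why this agent has 8 trainable variables instead of the 6 seen in testCreateAgent, and why _heteroscedastic is True only for the last objective. Outside of a test, a comparable network could presumably be built with tf_agents' HeteroscedasticQNetwork; the parameters below are illustrative:

    from tf_agents.networks import heteroscedastic_q_network

    # A network that predicts both a value and a log-variance for every action.
    heteroscedastic_net = heteroscedastic_q_network.HeteroscedasticQNetwork(
        input_tensor_spec=self._time_step_spec.observation,
        action_spec=self._action_spec,
        fc_layer_params=(4,))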
Example #11
 def testCreateAgentWithHeteroscedasticNetworks(self):
     networks_and_loss_fns = self._create_objective_network_and_loss_fn_sequence(
     )
     networks_and_loss_fns[-1] = (HeteroscedasticDummyNet(
         self._kernel_weights[-1],
         self._bias), tf.compat.v1.losses.mean_squared_error)
     agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
         self._time_step_spec,
         self._action_spec,
         self._scalarizer,
         objective_network_and_loss_fn_sequence=networks_and_loss_fns,
         optimizer=None)
     self.assertIsNotNone(agent.policy)
     self.assertEqual(len(agent._variables_to_train()), 8)
     self.assertAllEqual(agent._heteroscedastic, [False, False, True])
Example #12
 def testPolicy(self):
     agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
         self._time_step_spec,
         self._action_spec,
         self._scalarizer,
         objective_networks=self._create_objective_networks(),
         optimizer=None)
     observations = tf.constant([[1, 2], [2, 1]], dtype=tf.float32)
     time_steps = ts.restart(observations, batch_size=2)
     policy = agent.policy
     action_step = policy.action(time_steps)
     # Batch size 2.
     self.assertAllEqual([2], action_step.action.shape)
     self.evaluate(tf.compat.v1.initialize_all_variables())
     actions = self.evaluate(action_step.action)
     self.assertAllEqual(actions, [2, 0])
Example #13
    def testLoss(self):
        agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
            self._time_step_spec,
            self._action_spec,
            self._scalarizer,
            objective_networks=self._create_objective_networks(),
            optimizer=None)
        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        actions = tf.constant([0, 1], dtype=tf.int32)
        objectives = tf.constant([[8, 12, 11], [25, 18, 32]], dtype=tf.float32)

        init_op = agent.initialize()
        if not tf.executing_eagerly():
            with self.cached_session() as sess:
                common.initialize_uninitialized_variables(sess)
                self.assertIsNone(sess.run(init_op))
        loss, _ = agent.loss(observations, actions, objectives)
        self.evaluate(tf.compat.v1.initialize_all_variables())
        self.assertAllClose(self.evaluate(loss), 0.0)
Example #14
 def testTrainPerArmAgent(self):
     obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
         2, 3, 4, add_num_actions_feature=True)
     time_step_spec = ts.time_step_spec(observation_spec=obs_spec,
                                        reward_spec=tensor_spec.TensorSpec(
                                            [3], tf.float32))
     action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 3)
     networks_and_loss_fns = [
         (global_and_arm_feature_network.
          create_feed_forward_common_tower_network(obs_spec, (4, 3), (3, 4),
                                                   (4, 2)),
          tf.compat.v1.losses.mean_squared_error) for _ in range(3)
     ]
     optimizer = tf.compat.v1.train.GradientDescentOptimizer(
         learning_rate=0.01)
     agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
         time_step_spec,
         action_spec,
         self._scalarizer,
         objective_network_and_loss_fn_sequence=networks_and_loss_fns,
         accepts_per_arm_features=True,
         optimizer=optimizer)
     observations = {
         bandit_spec_utils.GLOBAL_FEATURE_KEY:
         tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
         bandit_spec_utils.PER_ARM_FEATURE_KEY:
         tf.cast(tf.reshape(tf.range(24), shape=[2, 4, 3]),
                 dtype=tf.float32),
         bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
         tf.ones([2], dtype=tf.int32)
     }
     actions = np.array([0, 3], dtype=np.int32)
     objectives = np.array([[1, 2, 3], [6, 5, 4]], dtype=np.float32)
     initial_step, final_step = _get_initial_and_final_steps(
         observations, objectives)
     action_step = policy_step.PolicyStep(
         action=tf.convert_to_tensor(actions),
         info=policy_utilities.PerArmPolicyInfo(
             chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                          dtype=np.float32)))
     experience = _get_experience(initial_step, action_step, final_step)
     agent.train(experience, None)
     self.evaluate(tf.compat.v1.initialize_all_variables())
 def testTrainAgent(self):
   optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.01)
   agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
       self._time_step_spec,
       self._action_spec,
       self._scalarizer,
       objective_networks=self._create_objective_networks(),
       optimizer=optimizer)
   observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
   actions = np.array([0, 1], dtype=np.int32)
   objectives = np.array([[1, 2, 3], [6, 5, 4]], dtype=np.float32)
   initial_step, final_step = _get_initial_and_final_steps(
       observations, objectives)
   action_step = _get_action_step(actions)
   experience = _get_experience(initial_step, action_step, final_step)
   loss_before, _ = agent.train(experience, None)
   loss_after, _ = agent.train(experience, None)
   self.evaluate(tf.compat.v1.initialize_all_variables())
   self.assertAllClose(self.evaluate(loss_before), 763.5)
   self.assertLess(self.evaluate(loss_after), 763.5)
 def testTrainAgentWithWrongNumberOfObjectivesRaisesError(self):
   optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.01)
   objective_networks = self._create_objective_networks()
   objective_networks.pop(0)
   agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
       self._time_step_spec,
       self._action_spec,
       self._scalarizer,
       objective_networks=objective_networks,
       optimizer=optimizer)
   observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
   actions = np.array([0, 1], dtype=np.int32)
   objectives = np.array([[1, 2, 3], [6, 5, 4]], dtype=np.float32)
   initial_step, final_step = _get_initial_and_final_steps(
       observations, objectives)
   action_step = _get_action_step(actions)
   experience = _get_experience(initial_step, action_step, final_step)
   with self.assertRaisesRegexp(
       ValueError,
       'The number of objectives in the objective_values tensor: 3 is '
       'different from the number of objective networks: 2'):
     agent.train(experience, None)
Example #17
    def testLoss(self):
        agent = greedy_multi_objective_agent.GreedyMultiObjectiveNeuralAgent(
            self._time_step_spec,
            self._action_spec,
            self._scalarizer,
            objective_network_and_loss_fn_sequence=self.
            _create_objective_network_and_loss_fn_sequence(),
            optimizer=None)
        observations = np.array([[1, 2], [3, 4]], dtype=np.float32)
        actions = np.array([0, 1], dtype=np.int32)
        objectives = np.array([[8, 12, 11], [25, 18, 32]], dtype=np.float32)
        initial_step, final_step = _get_initial_and_final_steps(
            observations, objectives)
        action_step = _get_action_step(actions)
        experience = _get_experience(initial_step, action_step, final_step)

        init_op = agent.initialize()
        if not tf.executing_eagerly():
            with self.cached_session() as sess:
                common.initialize_uninitialized_variables(sess)
                self.assertIsNone(sess.run(init_op))
        loss, _ = agent._loss(experience)
        self.evaluate(tf.compat.v1.initialize_all_variables())
        self.assertAllClose(self.evaluate(loss), 0.0)