def _create_trajectories(self):
    # Order of args for trajectory methods:
    # observation, action, policy_info, reward, discount
    ts0 = nest_utils.stack_nested_tensors([
        trajectory.boundary((), (), (), 0., 1.),
        trajectory.boundary((), (), (), 0., 1.)
    ])
    ts1 = nest_utils.stack_nested_tensors([
        trajectory.first((), (), (), 1., 1.),
        trajectory.first((), (), (), 2., 1.)
    ])
    ts2 = nest_utils.stack_nested_tensors([
        trajectory.last((), (), (), 3., 1.),
        trajectory.last((), (), (), 4., 1.)
    ])
    ts3 = nest_utils.stack_nested_tensors([
        trajectory.boundary((), (), (), 0., 1.),
        trajectory.boundary((), (), (), 0., 1.)
    ])
    ts4 = nest_utils.stack_nested_tensors([
        trajectory.first((), (), (), 5., 1.),
        trajectory.first((), (), (), 6., 1.)
    ])
    ts5 = nest_utils.stack_nested_tensors([
        trajectory.last((), (), (), 7., 1.),
        trajectory.last((), (), (), 8., 1.)
    ])

    return [ts0, ts1, ts2, ts3, ts4, ts5]
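
The helper above builds its fixtures from the single-step constructors in `tf_agents.trajectories.trajectory`. A minimal standalone sketch of the same pattern (assuming a standard TF-Agents installation):

from tf_agents.trajectories import trajectory
from tf_agents.utils import nest_utils

# Each constructor takes (observation, action, policy_info, reward, discount)
# and fills in the step_type/next_step_type pair for that point in an episode.
batched = nest_utils.stack_nested_tensors([
    trajectory.first((), (), (), 1., 1.),
    trajectory.first((), (), (), 2., 1.),
])
# stack_nested_tensors adds a leading batch dimension of 2, so, for example,
# batched.reward evaluates to [1., 2.].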
Example #2
  def _create_misaligned_trajectories(self):

    def _concat_nested_tensors(nest1, nest2):
      return tf.nest.map_structure(lambda t1, t2: tf.concat([t1, t2], axis=0),
                                   nest1, nest2)

    # Order of args for trajectory methods:
    # observation, action, policy_info, reward, discount
    ts1 = _concat_nested_tensors(
        trajectory.first((), tf.constant([2]), (),
                         tf.constant([1.], dtype=tf.float32), [1.]),
        trajectory.boundary((), tf.constant([1]), (),
                            tf.constant([0.], dtype=tf.float32), [1.]))
    ts2 = _concat_nested_tensors(
        trajectory.last((), tf.constant([1]), (),
                        tf.constant([3.], dtype=tf.float32), [1.]),
        trajectory.first((), tf.constant([1]), (),
                         tf.constant([2.], dtype=tf.float32), [1.]))
    ts3 = _concat_nested_tensors(
        trajectory.boundary((), tf.constant([2]), (),
                            tf.constant([0.], dtype=tf.float32), [1.]),
        trajectory.last((), tf.constant([1]), (),
                        tf.constant([4.], dtype=tf.float32), [1.]))

    return [ts1, ts2, ts3]
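
For contrast with the stacking used in the previous example, here is a standalone sketch (assuming the same TF-Agents imports) of what `_concat_nested_tensors` does: stacking adds a new leading batch axis, while concatenating along axis 0 grows the batch axis the singleton trajectories already have, which is what leaves the two rows at different points of their episodes.

import tensorflow as tf
from tf_agents.trajectories import trajectory
from tf_agents.utils import nest_utils

a = trajectory.first((), tf.constant([2]), (), tf.constant([1.]), tf.constant([1.]))
b = trajectory.boundary((), tf.constant([1]), (), tf.constant([0.]), tf.constant([1.]))

stacked = nest_utils.stack_nested_tensors([a, b])      # reward shape: [2, 1]
concatenated = tf.nest.map_structure(
    lambda t1, t2: tf.concat([t1, t2], axis=0), a, b)  # reward shape: [2]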
Example #3
    def setUp(self):
        super(BatchedPyMetricTest, self).setUp()
        # Order of args for trajectory methods:
        # observation, action, policy_info, reward, discount
        ts0 = nest_utils.stack_nested_tensors([
            trajectory.boundary((), (), (), 0., 1.),
            trajectory.boundary((), (), (), 0., 1.)
        ])
        ts1 = nest_utils.stack_nested_tensors([
            trajectory.first((), (), (), 1., 1.),
            trajectory.first((), (), (), 2., 1.)
        ])
        ts2 = nest_utils.stack_nested_tensors([
            trajectory.last((), (), (), 3., 1.),
            trajectory.last((), (), (), 4., 1.)
        ])
        ts3 = nest_utils.stack_nested_tensors([
            trajectory.boundary((), (), (), 0., 1.),
            trajectory.boundary((), (), (), 0., 1.)
        ])
        ts4 = nest_utils.stack_nested_tensors([
            trajectory.first((), (), (), 5., 1.),
            trajectory.first((), (), (), 6., 1.)
        ])
        ts5 = nest_utils.stack_nested_tensors([
            trajectory.last((), (), (), 7., 1.),
            trajectory.last((), (), (), 8., 1.)
        ])

        self._ts = [ts0, ts1, ts2, ts3, ts4, ts5]
Example #4
    def testBatchSizeProvided(self, metric_class, expected_result):
        metric = metric_class(batch_size=2)

        metric(
            nest_utils.stack_nested_arrays([
                trajectory.boundary((), (), (), 0., 1.),
                trajectory.boundary((), (), (), 0., 1.)
            ]))
        metric(
            nest_utils.stack_nested_arrays([
                trajectory.first((), (), (), 1., 1.),
                trajectory.first((), (), (), 1., 1.)
            ]))
        metric(
            nest_utils.stack_nested_arrays([
                trajectory.mid((), (), (), 2., 1.),
                trajectory.last((), (), (), 3., 0.)
            ]))
        metric(
            nest_utils.stack_nested_arrays([
                trajectory.last((), (), (), 3., 0.),
                trajectory.boundary((), (), (), 0., 1.)
            ]))
        metric(
            nest_utils.stack_nested_arrays([
                trajectory.boundary((), (), (), 0., 1.),
                trajectory.first((), (), (), 1., 1.)
            ]))
        self.assertEqual(metric.result(), expected_result)
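
`metric_class` and `expected_result` come from the test's parameterization and are not shown in this listing. A hedged sketch of the same call pattern, assuming `py_metrics.AverageReturnMetric` as a representative metric that accepts `batch_size`:

from tf_agents.metrics import py_metrics
from tf_agents.trajectories import trajectory
from tf_agents.utils import nest_utils

# batch_size tells the Python metric how many parallel environments the
# stacked trajectories cover.
metric = py_metrics.AverageReturnMetric(batch_size=2)
metric(nest_utils.stack_nested_arrays([
    trajectory.first((), (), (), 1., 1.),
    trajectory.first((), (), (), 1., 1.),
]))
metric(nest_utils.stack_nested_arrays([
    trajectory.last((), (), (), 3., 0.),
    trajectory.last((), (), (), 3., 0.),
]))
value = metric.result()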
Example #5
 def setUp(self):
     super(PyDriverTest, self).setUp()
     f0 = np.array(0., dtype=np.float32)
     f1 = np.array(1., dtype=np.float32)
     # Order of args for trajectory methods:
     # (observation, action, policy_info, reward, discount)
     self._trajectories = [
         trajectory.first(0, 1, 2, f1, f1),
         trajectory.last(1, 2, 4, f1, f0),
         trajectory.boundary(3, 1, 2, f0, f1),
         trajectory.first(0, 1, 2, f1, f1),
         trajectory.last(1, 2, 4, f1, f0),
         trajectory.boundary(3, 1, 2, f0, f1),
         trajectory.first(0, 1, 2, f1, f1),
     ]
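
The trajectories above are the sequence a PyDriver is expected to forward to its observers. A rough sketch of such a driver run follows; the environment name, policy, and step budget are illustrative assumptions, not values taken from the test:

from tf_agents.drivers import py_driver
from tf_agents.environments import suite_gym
from tf_agents.policies import random_py_policy

env = suite_gym.load('CartPole-v0')  # assumed environment
policy = random_py_policy.RandomPyPolicy(
    time_step_spec=env.time_step_spec(), action_spec=env.action_spec())

collected = []
driver = py_driver.PyDriver(env, policy, observers=[collected.append],
                            max_steps=7)
driver.run(env.reset())
# `collected` now holds Trajectory namedtuples analogous to self._trajectories.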
Example #6
  def testLoss(self):
    cloning_net = DummyNet(self._observation_spec, self._action_spec)
    agent = behavioral_cloning_agent.BehavioralCloningAgent(
        self._time_step_spec,
        self._action_spec,
        cloning_network=cloning_net,
        optimizer=None)

    observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
    actions = tf.constant([0, 1], dtype=tf.int32)
    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)

    experience = trajectory.first(
        observation=observations,
        action=actions,
        policy_info=(),
        reward=rewards,
        discount=discounts)
    loss_info = agent._loss(experience)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    total_loss, _ = self.evaluate(loss_info)

    expected_loss = tf.reduce_mean(
        input_tensor=tf.compat.v1.nn.sparse_softmax_cross_entropy_with_logits(
            logits=cloning_net(observations)[0], labels=actions))

    self.assertAllClose(total_loss, expected_loss)
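
The expected loss computed above is the standard behavioral-cloning objective: the batch mean of the sparse softmax cross-entropy between the cloning network's logits and the demonstrated actions. A tiny standalone sketch of that computation with made-up numbers:

import tensorflow as tf

logits = tf.constant([[2.0, 1.0], [0.5, 3.0]])  # stand-in network outputs
actions = tf.constant([0, 1])                   # demonstrated action indices
# Per-example loss is -log(softmax(logits)[action]); the test averages it.
per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=actions, logits=logits)
expected_loss = tf.reduce_mean(per_example_loss)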
Example #7
 def testZeroEpisodes(self, metric_class, expected_result):
     metric = metric_class()
     # Order of args for trajectory methods:
     # observation, action, policy_info, reward, discount
     metric(trajectory.boundary((), (), (), 0., 1.))
     metric(trajectory.first((), (), (), 1., 1.))
     self.assertEqual(expected_result, metric.result())
Example #8
    def testLoss(self):
        cloning_net = get_dummy_net(self._action_spec)
        agent = behavioral_cloning_agent.BehavioralCloningAgent(
            self._time_step_spec,
            self._action_spec,
            cloning_network=cloning_net,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001))

        observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
        actions = tf.constant([0, 1], dtype=tf.int32)
        rewards = tf.constant([10, 20], dtype=tf.float32)
        discounts = tf.constant([0.9, 0.9], dtype=tf.float32)

        experience = trajectory.first(observation=observations,
                                      action=actions,
                                      policy_info=(),
                                      reward=rewards,
                                      discount=discounts)

        self.evaluate(tf.compat.v1.global_variables_initializer())

        expected_loss = tf.reduce_mean(
            input_tensor=tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=actions, logits=cloning_net(observations)[0]))

        loss_info = agent.train(experience)
        total_loss = self.evaluate(loss_info.loss)

        self.assertAllClose(total_loss, expected_loss)

        test_util.test_loss_and_train_output(test=self,
                                             expect_equal_loss_values=True,
                                             agent=agent,
                                             experience=experience)
Example #9
    def testAverageOneEpisodeWithReset(self, metric_class, expected_result):
        metric = metric_class()

        metric(trajectory.first((), (), (), 0., 1.))
        metric(trajectory.mid((), (), (), 1., 1.))
        metric(trajectory.mid((), (), (), 2., 1.))
        # The episode is reset.
        #
        # This could happen when using the dynamic_episode_driver with
        # parallel_py_environment. When the parallel episodes are of different
        # lengths and num_episodes is reached, some episodes would be left in "MID".
        # When the driver runs again, all environments are reset at the beginning
        # of the tf.while_loop and the unfinished episodes would get "FIRST" without
        # seeing "LAST".
        metric(trajectory.first((), (), (), 3., 1.))
        metric(trajectory.last((), (), (), 4., 1.))
        self.assertEqual(expected_result, metric.result())
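
The reset scenario the comments describe can be reproduced with a setup roughly like the following sketch; the environment, policy, and episode count are illustrative assumptions, not part of the test:

from tf_agents.drivers import dynamic_episode_driver
from tf_agents.environments import parallel_py_environment
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.metrics import tf_metrics
from tf_agents.policies import random_tf_policy

# Two parallel copies of an (assumed) environment; episode lengths will differ.
env = tf_py_environment.TFPyEnvironment(
    parallel_py_environment.ParallelPyEnvironment(
        [lambda: suite_gym.load('CartPole-v0')] * 2))
policy = random_tf_policy.RandomTFPolicy(env.time_step_spec(), env.action_spec())
metric = tf_metrics.AverageReturnMetric(batch_size=2)

driver = dynamic_episode_driver.DynamicEpisodeDriver(
    env, policy, observers=[metric], num_episodes=2)
# After the first run one environment may still be mid-episode; the second run
# resets it, so the metric sees FIRST again without ever seeing LAST.
driver.run()
driver.run()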
Example #10
 def verifyTrainAndRestore(self, loss_fn=None):
     """Helper function for testing correct variable updating and restoring."""
     batch_size = 2
     seq_len = 2
     observations = tensor_spec.sample_spec_nest(self._observation_spec,
                                                 outer_dims=(batch_size,
                                                             seq_len))
     actions = tensor_spec.sample_spec_nest(self._action_spec,
                                            outer_dims=(batch_size,
                                                        seq_len))
     rewards = tf.constant([[10, 10], [20, 20]], dtype=tf.float32)
     discounts = tf.constant([[0.9, 0.9], [0.9, 0.9]], dtype=tf.float32)
     experience = trajectory.first(observation=observations,
                                   action=actions,
                                   policy_info=(),
                                   reward=rewards,
                                   discount=discounts)
     strategy = tf.distribute.get_strategy()
     with strategy.scope():
         q_net = critic_network.CriticNetwork(
             (self._observation_spec, self._action_spec))
         agent = qtopt_agent.QtOptAgent(
             self._time_step_spec,
             self._action_spec,
             q_network=q_net,
             optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
             init_mean_cem=self._mean,
             init_var_cem=self._var,
             num_samples_cem=self._num_samples,
             actions_sampler=self._sampler,
             in_graph_bellman_update=True)
     loss_before_train = agent.loss(experience).loss
     # Check loss is stable.
     self.assertEqual(loss_before_train, agent.loss(experience).loss)
     # Train 1 step, verify that loss is decreased for the same input.
     agent.train(experience)
     loss_after_train = agent.loss(experience).loss
     self.assertLessEqual(loss_after_train, loss_before_train)
     # Assert loss evaluation is still stable, i.e. deterministic.
     self.assertLessEqual(loss_after_train, agent.loss(experience).loss)
     # Save checkpoint
     ckpt_dir = self.create_tempdir()
     checkpointer = common.Checkpointer(ckpt_dir=ckpt_dir, agent=agent)
     global_step = tf.constant(1)
     checkpointer.save(global_step)
     # Assign all vars to 0.
     for var in tf.nest.flatten(agent.variables):
         var.assign(tf.zeros_like(var))
     loss_after_zero = agent.loss(experience).loss
     self.assertEqual(loss_after_zero, agent.loss(experience).loss)
     self.assertNotEqual(loss_after_zero, loss_after_train)
     # Restore
     checkpointer._checkpoint.restore(
         checkpointer._manager.latest_checkpoint)
     loss_after_restore = agent.loss(experience).loss
     self.assertNotEqual(loss_after_restore, loss_after_zero)
     self.assertEqual(loss_after_restore, loss_after_train)
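
The test restores through the Checkpointer's private `_checkpoint`/`_manager` attributes. For reference, a minimal sketch of a save/restore round trip through the public API, with a plain tf.Variable standing in for the agent (names and directory are illustrative):

import tempfile

import tensorflow as tf
from tf_agents.utils import common

ckpt_dir = tempfile.mkdtemp()
var = tf.Variable(1.0)
common.Checkpointer(ckpt_dir=ckpt_dir, var=var).save(tf.constant(1))

var.assign(0.0)
# A freshly constructed Checkpointer restores the latest checkpoint it finds;
# initialize_or_restore() finalizes that restore for the tracked objects.
restorer = common.Checkpointer(ckpt_dir=ckpt_dir, var=var)
restorer.initialize_or_restore()  # var is back to 1.0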
Example #11
 def testFirstArrays(self):
   observation = ()
   action = ()
   policy_info = ()
   reward = np.array([1.0, 1.0, 2.0])
   discount = np.array([1.0, 1.0, 1.0])
   traj = trajectory.first(observation, action, policy_info, reward, discount)
   self.assertFalse(tf.is_tensor(traj.step_type))
   self.assertAllEqual(traj.step_type, [ts.StepType.FIRST] * 3)
   self.assertAllEqual(traj.next_step_type, [ts.StepType.MID] * 3)
Example #12
 def testFirstTensors(self):
   observation = ()
   action = ()
   policy_info = ()
   reward = tf.constant([1.0, 1.0, 2.0])
   discount = tf.constant([1.0, 1.0, 1.0])
   traj = trajectory.first(observation, action, policy_info, reward, discount)
   self.assertTrue(tf.is_tensor(traj.step_type))
   traj_val = self.evaluate(traj)
   self.assertAllEqual(traj_val.step_type, [ts.StepType.FIRST] * 3)
   self.assertAllEqual(traj_val.next_step_type, [ts.StepType.MID] * 3)
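
For reference, these two tests pin down the FIRST -> MID step-type pairing that `trajectory.first` produces. The underlying Trajectory tuple has the following positional field order, the same order used by the keyword-free `trajectory.Trajectory(...)` call in the last example of this listing:

from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory

# Trajectory fields, in positional order:
#   step_type, observation, action, policy_info, next_step_type, reward, discount
traj = trajectory.Trajectory(
    step_type=ts.StepType.FIRST,
    observation=(),
    action=(),
    policy_info=(),
    next_step_type=ts.StepType.MID,
    reward=1.0,
    discount=1.0)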
Example #13
 def verifyTrainAndRestore(self,
                           observation_spec,
                           action_spec,
                           actor_net,
                           loss_fn=None):
     """Helper function for testing correct variable updating and restoring."""
     batch_size = 2
     observations = tensor_spec.sample_spec_nest(observation_spec,
                                                 outer_dims=(batch_size, ))
     actions = tensor_spec.sample_spec_nest(action_spec,
                                            outer_dims=(batch_size, ))
     rewards = tf.constant([10, 20], dtype=tf.float32)
     discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
     experience = trajectory.first(observation=observations,
                                   action=actions,
                                   policy_info=(),
                                   reward=rewards,
                                   discount=discounts)
     time_step_spec = ts.time_step_spec(observation_spec)
     strategy = tf.distribute.get_strategy()
     with strategy.scope():
         # Use BehavioralCloningAgent instead of AWRAgent to test the network.
         agent = behavioral_cloning_agent.BehavioralCloningAgent(
             time_step_spec,
             action_spec,
             cloning_network=actor_net,
             optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
             loss_fn=loss_fn)
     loss_before_train = agent.loss(experience).loss
     # Check loss is stable.
     self.assertEqual(loss_before_train, agent.loss(experience).loss)
     # Train 1 step, verify that loss is decreased for the same input.
     agent.train(experience)
     loss_after_train = agent.loss(experience).loss
     self.assertLessEqual(loss_after_train, loss_before_train)
     # Assert loss evaluation is still stable, i.e. deterministic.
     self.assertLessEqual(loss_after_train, agent.loss(experience).loss)
     # Save checkpoint
     ckpt_dir = self.create_tempdir()
     checkpointer = common.Checkpointer(ckpt_dir=ckpt_dir, agent=agent)
     global_step = tf.constant(1)
     checkpointer.save(global_step)
     # Assign all vars to 0.
     for var in tf.nest.flatten(agent.variables):
         var.assign(tf.zeros_like(var))
     loss_after_zero = agent.loss(experience).loss
     self.assertEqual(loss_after_zero, agent.loss(experience).loss)
     self.assertNotEqual(loss_after_zero, loss_after_train)
     # Restore
     checkpointer._checkpoint.restore(
         checkpointer._manager.latest_checkpoint)
     loss_after_restore = agent.loss(experience).loss
     self.assertNotEqual(loss_after_restore, loss_after_zero)
     self.assertEqual(loss_after_restore, loss_after_train)
Example #14
    def testAverageTwoEpisode(self, metric_class, expected_result):
        metric = metric_class()

        metric(trajectory.boundary((), (), (), 0., 1.))
        metric(trajectory.first((), (), (), 1., 1.))
        metric(trajectory.mid((), (), (), 2., 1.))
        metric(trajectory.last((), (), (), 3., 0.))
        metric(trajectory.boundary((), (), (), 0., 1.))

        # TODO(kbanoop): Add optional next_step_type arg to trajectory.first. Or
        # implement trajectory.first_last().
        metric(
            trajectory.Trajectory(ts.StepType.FIRST, (), (), (),
                                  ts.StepType.LAST, -6., 1.))

        self.assertEqual(expected_result, metric.result())