def _create_trajectories(self):
  # Order of args for trajectory methods:
  # observation, action, policy_info, reward, discount
  ts0 = nest_utils.stack_nested_tensors([
      trajectory.boundary((), (), (), 0., 1.),
      trajectory.boundary((), (), (), 0., 1.)
  ])
  ts1 = nest_utils.stack_nested_tensors([
      trajectory.first((), (), (), 1., 1.),
      trajectory.first((), (), (), 2., 1.)
  ])
  ts2 = nest_utils.stack_nested_tensors([
      trajectory.last((), (), (), 3., 1.),
      trajectory.last((), (), (), 4., 1.)
  ])
  ts3 = nest_utils.stack_nested_tensors([
      trajectory.boundary((), (), (), 0., 1.),
      trajectory.boundary((), (), (), 0., 1.)
  ])
  ts4 = nest_utils.stack_nested_tensors([
      trajectory.first((), (), (), 5., 1.),
      trajectory.first((), (), (), 6., 1.)
  ])
  ts5 = nest_utils.stack_nested_tensors([
      trajectory.last((), (), (), 7., 1.),
      trajectory.last((), (), (), 8., 1.)
  ])
  return [ts0, ts1, ts2, ts3, ts4, ts5]

def _create_misaligned_trajectories(self):

  def _concat_nested_tensors(nest1, nest2):
    return tf.nest.map_structure(
        lambda t1, t2: tf.concat([t1, t2], axis=0), nest1, nest2)

  # Order of args for trajectory methods:
  # observation, action, policy_info, reward, discount
  # Batch entry 0 steps through first -> last -> boundary while batch entry 1
  # steps through boundary -> first -> last, so episode boundaries are
  # misaligned across the batch.
  ts1 = _concat_nested_tensors(
      trajectory.first((), tf.constant([2]), (),
                       tf.constant([1.], dtype=tf.float32), [1.]),
      trajectory.boundary((), tf.constant([1]), (),
                          tf.constant([0.], dtype=tf.float32), [1.]))
  ts2 = _concat_nested_tensors(
      trajectory.last((), tf.constant([1]), (),
                      tf.constant([3.], dtype=tf.float32), [1.]),
      trajectory.first((), tf.constant([1]), (),
                       tf.constant([2.], dtype=tf.float32), [1.]))
  ts3 = _concat_nested_tensors(
      trajectory.boundary((), tf.constant([2]), (),
                          tf.constant([0.], dtype=tf.float32), [1.]),
      trajectory.last((), tf.constant([1]), (),
                      tf.constant([4.], dtype=tf.float32), [1.]))
  return [ts1, ts2, ts3]

def setUp(self):
  super(BatchedPyMetricTest, self).setUp()
  # Order of args for trajectory methods:
  # observation, action, policy_info, reward, discount
  ts0 = nest_utils.stack_nested_tensors([
      trajectory.boundary((), (), (), 0., 1.),
      trajectory.boundary((), (), (), 0., 1.)
  ])
  ts1 = nest_utils.stack_nested_tensors([
      trajectory.first((), (), (), 1., 1.),
      trajectory.first((), (), (), 2., 1.)
  ])
  ts2 = nest_utils.stack_nested_tensors([
      trajectory.last((), (), (), 3., 1.),
      trajectory.last((), (), (), 4., 1.)
  ])
  ts3 = nest_utils.stack_nested_tensors([
      trajectory.boundary((), (), (), 0., 1.),
      trajectory.boundary((), (), (), 0., 1.)
  ])
  ts4 = nest_utils.stack_nested_tensors([
      trajectory.first((), (), (), 5., 1.),
      trajectory.first((), (), (), 6., 1.)
  ])
  ts5 = nest_utils.stack_nested_tensors([
      trajectory.last((), (), (), 7., 1.),
      trajectory.last((), (), (), 8., 1.)
  ])
  self._ts = [ts0, ts1, ts2, ts3, ts4, ts5]

def testBatchSizeProvided(self, metric_class, expected_result):
  metric = metric_class(batch_size=2)
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.boundary((), (), (), 0., 1.),
          trajectory.boundary((), (), (), 0., 1.)
      ]))
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.first((), (), (), 1., 1.),
          trajectory.first((), (), (), 1., 1.)
      ]))
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.mid((), (), (), 2., 1.),
          trajectory.last((), (), (), 3., 0.)
      ]))
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.last((), (), (), 3., 0.),
          trajectory.boundary((), (), (), 0., 1.)
      ]))
  metric(
      nest_utils.stack_nested_arrays([
          trajectory.boundary((), (), (), 0., 1.),
          trajectory.first((), (), (), 1., 1.)
      ]))
  self.assertEqual(metric.result(), expected_result)

def setUp(self):
  super(PyDriverTest, self).setUp()
  f0 = np.array(0., dtype=np.float32)
  f1 = np.array(1., dtype=np.float32)
  # Order of args for trajectory methods:
  # (observation, action, policy_info, reward, discount)
  self._trajectories = [
      trajectory.first(0, 1, 2, f1, f1),
      trajectory.last(1, 2, 4, f1, f0),
      trajectory.boundary(3, 1, 2, f0, f1),
      trajectory.first(0, 1, 2, f1, f1),
      trajectory.last(1, 2, 4, f1, f0),
      trajectory.boundary(3, 1, 2, f0, f1),
      trajectory.first(0, 1, 2, f1, f1),
  ]

def testLoss(self):
  cloning_net = DummyNet(self._observation_spec, self._action_spec)
  agent = behavioral_cloning_agent.BehavioralCloningAgent(
      self._time_step_spec,
      self._action_spec,
      cloning_network=cloning_net,
      optimizer=None)
  observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
  actions = tf.constant([0, 1], dtype=tf.int32)
  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
  experience = trajectory.first(
      observation=observations,
      action=actions,
      policy_info=(),
      reward=rewards,
      discount=discounts)
  loss_info = agent._loss(experience)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  total_loss, _ = self.evaluate(loss_info)
  expected_loss = tf.reduce_mean(
      input_tensor=tf.compat.v1.nn.sparse_softmax_cross_entropy_with_logits(
          logits=cloning_net(observations)[0], labels=actions))
  self.assertAllClose(total_loss, expected_loss)

def testZeroEpisodes(self, metric_class, expected_result):
  metric = metric_class()
  # Order of args for trajectory methods:
  # observation, action, policy_info, reward, discount
  # Neither step closes an episode (no LAST is ever seen), so the metric
  # should still report its zero-episode result.
  metric(trajectory.boundary((), (), (), 0., 1.))
  metric(trajectory.first((), (), (), 1., 1.))
  self.assertEqual(expected_result, metric.result())

def testLoss(self):
  cloning_net = get_dummy_net(self._action_spec)
  agent = behavioral_cloning_agent.BehavioralCloningAgent(
      self._time_step_spec,
      self._action_spec,
      cloning_network=cloning_net,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001))
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  actions = tf.constant([0, 1], dtype=tf.int32)
  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
  experience = trajectory.first(
      observation=observations,
      action=actions,
      policy_info=(),
      reward=rewards,
      discount=discounts)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  expected_loss = tf.reduce_mean(
      input_tensor=tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=actions, logits=cloning_net(observations)[0]))
  loss_info = agent.train(experience)
  total_loss = self.evaluate(loss_info.loss)
  self.assertAllClose(total_loss, expected_loss)
  test_util.test_loss_and_train_output(
      test=self,
      expect_equal_loss_values=True,
      agent=agent,
      experience=experience)

def testAverageOneEpisodeWithReset(self, metric_class, expected_result):
  metric = metric_class()
  metric(trajectory.first((), (), (), 0., 1.))
  metric(trajectory.mid((), (), (), 1., 1.))
  metric(trajectory.mid((), (), (), 2., 1.))
  # The episode is reset.
  #
  # This can happen when using the dynamic_episode_driver with
  # parallel_py_environment. When the parallel episodes have different lengths
  # and num_episodes is reached, some episodes are left in "MID". When the
  # driver runs again, all environments are reset at the beginning of the
  # tf.while_loop, so the unfinished episodes see "FIRST" without ever seeing
  # "LAST". (A sketch of this setup follows the test.)
  metric(trajectory.first((), (), (), 3., 1.))
  metric(trajectory.last((), (), (), 4., 1.))
  self.assertEqual(expected_result, metric.result())

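# For context, a hedged sketch (not from the original suite) of the setup the
# comment above describes: several environments stepped in parallel by an
# episode-counting driver, which can leave shorter episodes in "MID". The
# env_ctor argument is a placeholder for any py_environment constructor.
def _make_parallel_driver(env_ctor, policy, observers, num_episodes=2):
  """Sketch: driver setup that can leave some parallel episodes unfinished."""
  env = tf_py_environment.TFPyEnvironment(
      parallel_py_environment.ParallelPyEnvironment([env_ctor] * 2))
  return dynamic_episode_driver.DynamicEpisodeDriver(
      env, policy, observers=observers, num_episodes=num_episodes)
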
def verifyTrainAndRestore(self, loss_fn=None):
  """Helper function for testing correct variable updating and restoring."""
  batch_size = 2
  seq_len = 2
  observations = tensor_spec.sample_spec_nest(
      self._observation_spec, outer_dims=(batch_size, seq_len))
  actions = tensor_spec.sample_spec_nest(
      self._action_spec, outer_dims=(batch_size, seq_len))
  rewards = tf.constant([[10, 10], [20, 20]], dtype=tf.float32)
  discounts = tf.constant([[0.9, 0.9], [0.9, 0.9]], dtype=tf.float32)
  experience = trajectory.first(
      observation=observations,
      action=actions,
      policy_info=(),
      reward=rewards,
      discount=discounts)
  strategy = tf.distribute.get_strategy()
  with strategy.scope():
    q_net = critic_network.CriticNetwork(
        (self._observation_spec, self._action_spec))
    agent = qtopt_agent.QtOptAgent(
        self._time_step_spec,
        self._action_spec,
        q_network=q_net,
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        init_mean_cem=self._mean,
        init_var_cem=self._var,
        num_samples_cem=self._num_samples,
        actions_sampler=self._sampler,
        in_graph_bellman_update=True)

  loss_before_train = agent.loss(experience).loss
  # Check that loss evaluation is stable.
  self.assertEqual(loss_before_train, agent.loss(experience).loss)

  # Train 1 step; verify that the loss decreases for the same input.
  agent.train(experience)
  loss_after_train = agent.loss(experience).loss
  self.assertLessEqual(loss_after_train, loss_before_train)
  # Assert loss evaluation is still stable, i.e. deterministic.
  self.assertLessEqual(loss_after_train, agent.loss(experience).loss)

  # Save a checkpoint.
  ckpt_dir = self.create_tempdir()
  checkpointer = common.Checkpointer(ckpt_dir=ckpt_dir, agent=agent)
  global_step = tf.constant(1)
  checkpointer.save(global_step)

  # Assign all variables to 0.
  for var in tf.nest.flatten(agent.variables):
    var.assign(tf.zeros_like(var))
  loss_after_zero = agent.loss(experience).loss
  self.assertEqual(loss_after_zero, agent.loss(experience).loss)
  self.assertNotEqual(loss_after_zero, loss_after_train)

  # Restore from the checkpoint and verify the trained loss comes back.
  checkpointer._checkpoint.restore(checkpointer._manager.latest_checkpoint)
  loss_after_restore = agent.loss(experience).loss
  self.assertNotEqual(loss_after_restore, loss_after_zero)
  self.assertEqual(loss_after_restore, loss_after_train)

def testFirstArrays(self):
  observation = ()
  action = ()
  policy_info = ()
  reward = np.array([1.0, 1.0, 2.0])
  discount = np.array([1.0, 1.0, 1.0])
  traj = trajectory.first(observation, action, policy_info, reward, discount)
  self.assertFalse(tf.is_tensor(traj.step_type))
  self.assertAllEqual(traj.step_type, [ts.StepType.FIRST] * 3)
  self.assertAllEqual(traj.next_step_type, [ts.StepType.MID] * 3)

def testFirstTensors(self):
  observation = ()
  action = ()
  policy_info = ()
  reward = tf.constant([1.0, 1.0, 2.0])
  discount = tf.constant([1.0, 1.0, 1.0])
  traj = trajectory.first(observation, action, policy_info, reward, discount)
  self.assertTrue(tf.is_tensor(traj.step_type))
  traj_val = self.evaluate(traj)
  self.assertAllEqual(traj_val.step_type, [ts.StepType.FIRST] * 3)
  self.assertAllEqual(traj_val.next_step_type, [ts.StepType.MID] * 3)

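# Sketch (not from the original file): the remaining constructors take the same
# (observation, action, policy_info, reward, discount) argument order and
# differ only in the step-type pair they stamp on the trajectory:
# mid -> (MID, MID), last -> (MID, LAST), boundary -> (LAST, FIRST).
# A hypothetical companion test could verify this the same way:
def testStepTypePairs(self):
  reward = np.array([1.0])
  discount = np.array([1.0])
  mid_traj = trajectory.mid((), (), (), reward, discount)
  self.assertAllEqual(mid_traj.step_type, [ts.StepType.MID])
  self.assertAllEqual(mid_traj.next_step_type, [ts.StepType.MID])
  last_traj = trajectory.last((), (), (), reward, discount)
  self.assertAllEqual(last_traj.step_type, [ts.StepType.MID])
  self.assertAllEqual(last_traj.next_step_type, [ts.StepType.LAST])
  boundary_traj = trajectory.boundary((), (), (), reward, discount)
  self.assertAllEqual(boundary_traj.step_type, [ts.StepType.LAST])
  self.assertAllEqual(boundary_traj.next_step_type, [ts.StepType.FIRST])
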
def verifyTrainAndRestore(self,
                          observation_spec,
                          action_spec,
                          actor_net,
                          loss_fn=None):
  """Helper function for testing correct variable updating and restoring."""
  batch_size = 2
  observations = tensor_spec.sample_spec_nest(
      observation_spec, outer_dims=(batch_size,))
  actions = tensor_spec.sample_spec_nest(
      action_spec, outer_dims=(batch_size,))
  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
  experience = trajectory.first(
      observation=observations,
      action=actions,
      policy_info=(),
      reward=rewards,
      discount=discounts)
  time_step_spec = ts.time_step_spec(observation_spec)
  strategy = tf.distribute.get_strategy()
  with strategy.scope():
    # Use BehavioralCloningAgent instead of AWRAgent to test the network.
    agent = behavioral_cloning_agent.BehavioralCloningAgent(
        time_step_spec,
        action_spec,
        cloning_network=actor_net,
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss_fn=loss_fn)

  loss_before_train = agent.loss(experience).loss
  # Check that loss evaluation is stable.
  self.assertEqual(loss_before_train, agent.loss(experience).loss)

  # Train 1 step; verify that the loss decreases for the same input.
  agent.train(experience)
  loss_after_train = agent.loss(experience).loss
  self.assertLessEqual(loss_after_train, loss_before_train)
  # Assert loss evaluation is still stable, i.e. deterministic.
  self.assertLessEqual(loss_after_train, agent.loss(experience).loss)

  # Save a checkpoint.
  ckpt_dir = self.create_tempdir()
  checkpointer = common.Checkpointer(ckpt_dir=ckpt_dir, agent=agent)
  global_step = tf.constant(1)
  checkpointer.save(global_step)

  # Assign all variables to 0.
  for var in tf.nest.flatten(agent.variables):
    var.assign(tf.zeros_like(var))
  loss_after_zero = agent.loss(experience).loss
  self.assertEqual(loss_after_zero, agent.loss(experience).loss)
  self.assertNotEqual(loss_after_zero, loss_after_train)

  # Restore from the checkpoint and verify the trained loss comes back.
  checkpointer._checkpoint.restore(checkpointer._manager.latest_checkpoint)
  loss_after_restore = agent.loss(experience).loss
  self.assertNotEqual(loss_after_restore, loss_after_zero)
  self.assertEqual(loss_after_restore, loss_after_train)

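# Hypothetical call site (sketch only, not from the original suite): the helper
# above is network-agnostic, so a test can exercise it with any
# logits-producing cloning network. The sequential.Sequential wrapper, the
# bounded specs, and the two-action layout here are illustrative assumptions.
def testTrainAndRestore(self):
  observation_spec = tensor_spec.BoundedTensorSpec([2], tf.float32, -1., 1.)
  action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, 0, 1)
  actor_net = sequential.Sequential([tf.keras.layers.Dense(2)])
  self.verifyTrainAndRestore(observation_spec, action_spec, actor_net)
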
def testAverageTwoEpisode(self, metric_class, expected_result):
  metric = metric_class()
  metric(trajectory.boundary((), (), (), 0., 1.))
  metric(trajectory.first((), (), (), 1., 1.))
  metric(trajectory.mid((), (), (), 2., 1.))
  metric(trajectory.last((), (), (), 3., 0.))
  metric(trajectory.boundary((), (), (), 0., 1.))
  # TODO(kbanoop): Add an optional next_step_type arg to trajectory.first, or
  # implement trajectory.first_last(). (A sketch of such a helper follows this
  # test.)
  metric(
      trajectory.Trajectory(ts.StepType.FIRST, (), (), (), ts.StepType.LAST,
                            -6., 1.))
  self.assertEqual(expected_result, metric.result())

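# The TODO above asks for a trajectory.first_last() convenience. A minimal
# sketch of what such a helper might look like, mirroring the Trajectory
# constructor call used in this test (hypothetical; not part of the TF-Agents
# API):
def first_last(observation, action, policy_info, reward, discount):
  """Single-step episode: FIRST step_type paired with LAST next_step_type."""
  return trajectory.Trajectory(ts.StepType.FIRST, observation, action,
                               policy_info, ts.StepType.LAST, reward, discount)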