def testCreateAgentNestSizeChecks(self, action_spec, expected_error):
  cloning_net = get_dummy_net(action_spec, self._observation_spec)
  if expected_error is not None:
    with self.assertRaisesRegex(ValueError, expected_error):
      behavioral_cloning_agent.BehavioralCloningAgent(
          self._time_step_spec,
          action_spec,
          cloning_network=cloning_net,
          optimizer=None)
  else:
    behavioral_cloning_agent.BehavioralCloningAgent(
        self._time_step_spec,
        action_spec,
        cloning_network=cloning_net,
        optimizer=None)
def test_inference(self):
  test_agent = behavioral_cloning_agent.BehavioralCloningAgent(
      self._time_step_spec,
      self._action_spec,
      self._network,
      tf.compat.v1.train.AdamOptimizer(),
      num_outer_dims=2)
  test_trainer = trainer.Trainer(
      root_dir=self.get_temp_dir(), agent=test_agent)
  inference_batch_size = 1
  random_time_step = tensor_spec.sample_spec_nest(
      self._time_step_spec, outer_dims=(inference_batch_size,))
  initial_policy_state = test_trainer._agent.policy.get_initial_state(
      inference_batch_size)
  action_outputs = test_trainer._agent.policy.action(
      random_time_step, initial_policy_state)
  self.assertAllEqual([inference_batch_size], action_outputs.action.shape)
  action_outputs = test_trainer._agent.policy.action(
      random_time_step, action_outputs.state)
  self.assertAllEqual([inference_batch_size], action_outputs.action.shape)
def testTrainWithSingleOuterDimension(self):
  # Hard code a trajectory shaped (time=6, batch=1, ...).
  traj, time_step_spec, action_spec = create_arbitrary_trajectory()
  # Remove the batch dimension so there is only one outer dimension.
  traj = tf.nest.map_structure(lambda x: tf.squeeze(x, axis=1), traj)
  cloning_net = q_network.QNetwork(time_step_spec.observation, action_spec)
  agent = behavioral_cloning_agent.BehavioralCloningAgent(
      time_step_spec,
      action_spec,
      cloning_network=cloning_net,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.01))
  # Disable clipping to make sure we can see the difference in behavior
  agent.policy._clip = False
  # Remove policy_info, as BehavioralCloningAgent expects none.
  traj = traj.replace(policy_info=())
  # TODO(b/123883319)
  if tf.executing_eagerly():
    train_and_loss = lambda: agent.train(traj)
  else:
    train_and_loss = agent.train(traj)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  for _ in range(TRAIN_ITERATIONS):
    self.evaluate(train_and_loss)
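# Shape sketch for the test above (added commentary, not part of the original
# suite): create_arbitrary_trajectory() emits tensors shaped
# (time=6, batch=1, ...); squeezing axis=1 leaves a single outer dimension,
# which matches the agent's default num_outer_dims=1. For example:
#   x = tf.zeros([6, 1, 4])        # (time=6, batch=1, obs=4)
#   tf.squeeze(x, axis=1).shape    # TensorShape([6, 4]) -- one outer dim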
def testTrainWithRNN(self):
  # Hard code a trajectory shaped (time=6, batch=1, ...).
  traj, time_step_spec, action_spec = create_arbitrary_trajectory()
  cloning_net = q_rnn_network.QRnnNetwork(
      time_step_spec.observation, action_spec)
  agent = behavioral_cloning_agent.BehavioralCloningAgent(
      time_step_spec,
      action_spec,
      cloning_network=cloning_net,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.01),
      num_outer_dims=2)
  # Disable clipping to make sure we can see the difference in behavior
  agent.policy._clip = False
  # Remove policy_info, as BehavioralCloningAgent expects none.
  traj = traj.replace(policy_info=())
  # TODO(b/123883319)
  if tf.executing_eagerly():
    train_and_loss = lambda: agent.train(traj)
  else:
    train_and_loss = agent.train(traj)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  initial_loss = self.evaluate(train_and_loss).loss
  for _ in range(TRAIN_ITERATIONS - 1):
    loss = self.evaluate(train_and_loss).loss
  # We don't necessarily converge to the same actions as in trajectory after
  # 10 steps of an untuned optimizer, but the loss should go down.
  self.assertGreater(initial_loss, loss)
def testLoss(self):
  cloning_net = get_dummy_net(self._action_spec)
  agent = behavioral_cloning_agent.BehavioralCloningAgent(
      self._time_step_spec,
      self._action_spec,
      cloning_network=cloning_net,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001))
  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  actions = tf.constant([0, 1], dtype=tf.int32)
  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
  experience = trajectory.first(
      observation=observations,
      action=actions,
      policy_info=(),
      reward=rewards,
      discount=discounts)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  expected_loss = tf.reduce_mean(
      input_tensor=tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=actions, logits=cloning_net(observations)[0]))
  loss_info = agent.train(experience)
  total_loss = self.evaluate(loss_info.loss)
  self.assertAllClose(total_loss, expected_loss)
  test_util.test_loss_and_train_output(
      test=self,
      expect_equal_loss_values=True,
      agent=agent,
      experience=experience)
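# Loss sketch for the test above (added commentary): with a Q-network-style
# cloning network, the behavioral cloning loss is plain softmax cross-entropy
# between the network logits and the demonstrated actions,
#   loss = mean_i CE(logits(obs_i), action_i),
# which is what `expected_loss` recomputes by hand from the same weights
# before the training step updates them.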
def testLoss(self):
  cloning_net = DummyNet(self._observation_spec, self._action_spec)
  agent = behavioral_cloning_agent.BehavioralCloningAgent(
      self._time_step_spec,
      self._action_spec,
      cloning_network=cloning_net,
      optimizer=None)
  observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
  actions = tf.constant([0, 1], dtype=tf.int32)
  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
  experience = trajectory.first(
      observation=observations,
      action=actions,
      policy_info=(),
      reward=rewards,
      discount=discounts)
  loss_info = agent._loss(experience)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  total_loss, _ = self.evaluate(loss_info)
  expected_loss = tf.reduce_mean(
      input_tensor=tf.compat.v1.nn.sparse_softmax_cross_entropy_with_logits(
          logits=cloning_net(observations)[0], labels=actions))
  self.assertAllClose(total_loss, expected_loss)
def testTrainWithSingleOuterDimension(self):
  # Emits trajectories shaped (batch=1, time=6, ...)
  traj, time_step_spec, action_spec = (
      driver_test_utils.make_random_trajectory())
  # Convert to shapes (batch=6, 1, ...) so this works with a non-RNN model.
  traj = tf.nest.map_structure(common.transpose_batch_time, traj)
  # Remove the time dimension so there is only one outer dimension.
  traj = tf.nest.map_structure(lambda x: tf.squeeze(x, axis=1), traj)
  cloning_net = q_network.QNetwork(time_step_spec.observation, action_spec)
  agent = behavioral_cloning_agent.BehavioralCloningAgent(
      time_step_spec,
      action_spec,
      cloning_network=cloning_net,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.01))
  # Disable clipping to make sure we can see the difference in behavior
  agent.policy._clip = False
  # Remove policy_info, as BehavioralCloningAgent expects none.
  traj = traj.replace(policy_info=())
  # TODO(b/123883319)
  if tf.executing_eagerly():
    train_and_loss = lambda: agent.train(traj)
  else:
    train_and_loss = agent.train(traj)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  for _ in range(TRAIN_ITERATIONS):
    self.evaluate(train_and_loss)
def testLearnerRaiseExceptionOnMismatchingBatchSetup(self):
  obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  time_step_spec = ts.time_step_spec(obs_spec)
  action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, 0, 1)
  flat_action_spec = tf.nest.flatten(action_spec)[0]
  num_actions = flat_action_spec.maximum - flat_action_spec.minimum + 1

  network = sequential.Sequential([
      tf.keras.layers.Dense(num_actions, dtype=tf.float32),
      inner_reshape.InnerReshape([None], [num_actions])
  ])

  agent = behavioral_cloning_agent.BehavioralCloningAgent(
      time_step_spec, action_spec, cloning_network=network, optimizer=None)

  with self.assertRaisesRegex(
      ValueError,
      'All of the Tensors in `value` must have one outer dimension.'):
    learner.Learner(
        root_dir=os.path.join(self.create_tempdir().full_path, 'learner'),
        train_step=train_utils.create_train_step(),
        agent=agent)
def testTrainWithRNN(self):
  # Emits trajectories shaped (batch=1, time=6, ...)
  traj, time_step_spec, action_spec = (
      driver_test_utils.make_random_trajectory())
  cloning_net = q_rnn_network.QRnnNetwork(
      time_step_spec.observation, action_spec)
  agent = behavioral_cloning_agent.BehavioralCloningAgent(
      time_step_spec,
      action_spec,
      cloning_network=cloning_net,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.01),
      num_outer_dims=2)
  # Disable clipping to make sure we can see the difference in behavior
  agent.policy._clip = False
  # Remove policy_info, as BehavioralCloningAgent expects none.
  traj = traj.replace(policy_info=())
  # TODO(b/123883319)
  if tf.executing_eagerly():
    train_and_loss = lambda: agent.train(traj)
  else:
    train_and_loss = agent.train(traj)
  replay = trajectory_replay.TrajectoryReplay(agent.policy)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  initial_actions = self.evaluate(replay.run(traj)[0])
  for _ in range(TRAIN_ITERATIONS):
    self.evaluate(train_and_loss)
  post_training_actions = self.evaluate(replay.run(traj)[0])
  # We don't necessarily converge to the same actions as in trajectory after
  # 10 steps of an untuned optimizer, but the policy does change.
  self.assertFalse(np.all(initial_actions == post_training_actions))
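# Note on the assertion above (added commentary): TrajectoryReplay re-runs the
# current policy over the recorded time steps, so comparing its actions before
# and after training only verifies that training changed the policy's behavior
# at all; it deliberately does not assert convergence to the demonstrated
# actions.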
def testTrain(self):
  # Emits trajectories shaped (batch=1, time=6, ...)
  traj, time_step_spec, action_spec = (
      driver_test_utils.make_random_trajectory())
  # Convert to shapes (batch=6, 1, ...) so this works with a non-RNN model.
  traj = tf.nest.map_structure(common.transpose_batch_time, traj)
  cloning_net = q_network.QNetwork(time_step_spec.observation, action_spec)
  agent = behavioral_cloning_agent.BehavioralCloningAgent(
      time_step_spec,
      action_spec,
      cloning_network=cloning_net,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.01))
  # Remove policy_info, as BehavioralCloningAgent expects none.
  traj = traj.replace(policy_info=())
  train_and_loss = agent.train(traj)
  replay = trajectory_replay.TrajectoryReplay(agent.policy)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  initial_actions = self.evaluate(replay.run(traj)[0])
  for _ in range(TRAIN_ITERATIONS):
    self.evaluate(train_and_loss)
  post_training_actions = self.evaluate(replay.run(traj)[0])
  # We don't necessarily converge to the same actions as in trajectory after
  # 10 steps of an untuned optimizer, but the policy does change.
  self.assertFalse(np.all(initial_actions == post_training_actions))
def testLearnerRaiseExceptionOnMismatchingBatchSetup(self):
  obs_spec = tensor_spec.TensorSpec([2], tf.float32)
  time_step_spec = ts.time_step_spec(obs_spec)
  action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, 0, 1)
  flat_action_spec = tf.nest.flatten(action_spec)[0]
  num_actions = flat_action_spec.maximum - flat_action_spec.minimum + 1

  network = sequential.Sequential([
      tf.keras.layers.Dense(num_actions, dtype=tf.float32),
      inner_reshape.InnerReshape([None], [num_actions])
  ])

  agent = behavioral_cloning_agent.BehavioralCloningAgent(
      time_step_spec, action_spec, cloning_network=network, optimizer=None)

  with self.assertRaisesRegex(
      RuntimeError,
      (r'The slot variable initialization failed. The learner assumes all '
       r'experience tensors required an `outer_rank = \(None, '
       r'agent.train_sequence_length\)`\. If that\'s not the case for your '
       r'agent try setting `run_optimizer_variable_init=False`\.')):
    learner.Learner(
        root_dir=os.path.join(self.create_tempdir().full_path, 'learner'),
        train_step=train_utils.create_train_step(),
        agent=agent)
def testCreateAgent(self):
  cloning_net = DummyNet(self._observation_spec, self._action_spec)
  agent = behavioral_cloning_agent.BehavioralCloningAgent(
      self._time_step_spec,
      self._action_spec,
      cloning_network=cloning_net,
      optimizer=None)
  self.assertIsNotNone(agent.policy)
def testCreateAgentDimChecks(self):
  action_spec = tensor_spec.BoundedTensorSpec([1, 2], tf.int32, 0, 1)
  cloning_net = DummyNet(self._observation_spec, action_spec)
  with self.assertRaisesRegex(NotImplementedError, '.*scalar, unnested.*'):
    behavioral_cloning_agent.BehavioralCloningAgent(
        self._time_step_spec,
        action_spec,
        cloning_network=cloning_net,
        optimizer=None)
def testCreateAgentWithListActionSpec(self):
  action_spec = [tensor_spec.BoundedTensorSpec([1], tf.int32, 0, 1)]
  cloning_net = DummyNet(self._observation_spec, action_spec)
  with self.assertRaisesRegex(ValueError, '.*nested actions.*'):
    behavioral_cloning_agent.BehavioralCloningAgent(
        self._time_step_spec,
        action_spec,
        cloning_network=cloning_net,
        optimizer=None)
def test_trainer_initialization(self):
  test_agent = behavioral_cloning_agent.BehavioralCloningAgent(
      self._time_step_spec,
      self._action_spec,
      self._network,
      tf.compat.v1.train.AdamOptimizer(),
      num_outer_dims=2)
  test_trainer = trainer.Trainer(
      root_dir=self.get_temp_dir(), agent=test_agent)
  self.assertEqual(0, test_trainer._global_step.numpy())
def verifyTrainAndRestore(self,
                          observation_spec,
                          action_spec,
                          actor_net,
                          loss_fn=None):
  """Helper function for testing correct variable updating and restoring."""
  batch_size = 2
  observations = tensor_spec.sample_spec_nest(
      observation_spec, outer_dims=(batch_size,))
  actions = tensor_spec.sample_spec_nest(
      action_spec, outer_dims=(batch_size,))
  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
  experience = trajectory.first(
      observation=observations,
      action=actions,
      policy_info=(),
      reward=rewards,
      discount=discounts)
  time_step_spec = ts.time_step_spec(observation_spec)
  strategy = tf.distribute.get_strategy()
  with strategy.scope():
    # Use BehavioralCloningAgent instead of AWRAgent to test the network.
    agent = behavioral_cloning_agent.BehavioralCloningAgent(
        time_step_spec,
        action_spec,
        cloning_network=actor_net,
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss_fn=loss_fn)
  loss_before_train = agent.loss(experience).loss
  # Check loss is stable.
  self.assertEqual(loss_before_train, agent.loss(experience).loss)

  # Train 1 step, verify that loss is decreased for the same input.
  agent.train(experience)
  loss_after_train = agent.loss(experience).loss
  self.assertLessEqual(loss_after_train, loss_before_train)

  # Assert loss evaluation is still stable, e.g. deterministic.
  self.assertLessEqual(loss_after_train, agent.loss(experience).loss)

  # Save checkpoint.
  ckpt_dir = self.create_tempdir()
  checkpointer = common.Checkpointer(ckpt_dir=ckpt_dir, agent=agent)
  global_step = tf.constant(1)
  checkpointer.save(global_step)

  # Assign all vars to 0.
  for var in tf.nest.flatten(agent.variables):
    var.assign(tf.zeros_like(var))
  loss_after_zero = agent.loss(experience).loss
  self.assertEqual(loss_after_zero, agent.loss(experience).loss)
  self.assertNotEqual(loss_after_zero, loss_after_train)

  # Restore.
  checkpointer._checkpoint.restore(checkpointer._manager.latest_checkpoint)
  loss_after_restore = agent.loss(experience).loss
  self.assertNotEqual(loss_after_restore, loss_after_zero)
  self.assertEqual(loss_after_restore, loss_after_train)
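# Note on the helper above (added commentary): the train -> save -> zero out
# variables -> restore sequence verifies that the checkpoint round-trips every
# variable that affects the loss. The zeroed agent must produce a different
# loss than the trained agent, and the restored agent must reproduce the
# post-training loss exactly, which would fail if any variable were missing
# from the checkpoint.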
def testCreateAgentNestSizeChecks(self):
  action_spec = [
      tensor_spec.BoundedTensorSpec([], tf.int32, 0, 1),
      tensor_spec.BoundedTensorSpec([], tf.int32, 0, 1)
  ]
  cloning_net = get_dummy_net(action_spec)
  with self.assertRaisesRegex(ValueError, 'Only scalar .*'):
    behavioral_cloning_agent.BehavioralCloningAgent(
        self._time_step_spec,
        action_spec,
        cloning_network=cloning_net,
        optimizer=None)
def testCreateAgentNestSizeChecks(self):
  action_spec = [
      tensor_spec.BoundedTensorSpec([], tf.int32, 0, 1),
      tensor_spec.BoundedTensorSpec([], tf.int32, 0, 1)
  ]
  cloning_net = DummyNet(self._observation_spec, action_spec)
  with self.assertRaisesRegex(ValueError, '.*multi-dimensional.*'):
    behavioral_cloning_agent.BehavioralCloningAgent(
        self._time_step_spec,
        action_spec,
        cloning_network=cloning_net,
        optimizer=None)
def _create_behavioral_cloning_agent(time_step_spec, action_spec,
                                     policy_network):
  """Creates a behavioral_cloning_agent."""
  layers = tf.nest.map_structure(
      feature_ops.get_observation_processing_layer_creator(),
      time_step_spec.observation)
  network = policy_network(
      time_step_spec.observation,
      action_spec,
      preprocessing_layers=layers,
      name='QNetwork')
  return behavioral_cloning_agent.BehavioralCloningAgent(
      time_step_spec, action_spec, cloning_network=network, num_outer_dims=2)
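# Note on the factory above (added commentary): no `optimizer` is passed to
# BehavioralCloningAgent here. Since the TF-Agents agent classes are
# gin-configurable, the optimizer is presumably bound externally via a gin
# config; a hypothetical binding would look like:
#   BehavioralCloningAgent.optimizer = @tf.keras.optimizers.Adam()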
def test_training(self):
  test_agent = behavioral_cloning_agent.BehavioralCloningAgent(
      self._time_step_spec,
      self._action_spec,
      self._network,
      tf.compat.v1.train.AdamOptimizer(),
      num_outer_dims=2)
  test_trainer = trainer.Trainer(
      root_dir=self.get_temp_dir(), agent=test_agent)
  self.assertEqual(0, test_trainer._global_step.numpy())
  dataset_iter = _create_test_data(batch_size=3, sequence_length=3)
  test_trainer.train(dataset_iter, num_iterations=10)
  self.assertEqual(10, test_trainer._global_step.numpy())
def testPolicy(self):
  cloning_net = DummyNet(self._observation_spec, self._action_spec)
  agent = behavioral_cloning_agent.BehavioralCloningAgent(
      self._time_step_spec,
      self._action_spec,
      cloning_network=cloning_net,
      optimizer=None)
  observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
  time_steps = ts.restart(observations, batch_size=2)
  policy = agent.policy
  action_step = policy.action(time_steps)
  # Batch size 2.
  self.assertAllEqual(
      [2] + self._action_spec.shape.as_list(), action_step.action.shape)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  actions_ = self.evaluate(action_step.action)
  self.assertTrue(all(actions_ <= self._action_spec.maximum))
  self.assertTrue(all(actions_ >= self._action_spec.minimum))
def testTrainWithNN(self, is_convert, is_distribution_network):
  # Hard code a trajectory shaped (time=6, batch=1, ...).
  traj, time_step_spec, action_spec = create_arbitrary_trajectory()

  if is_convert:
    # Convert to single step trajectory of shapes (batch=6, 1, ...).
    traj = tf.nest.map_structure(common.transpose_batch_time, traj)

  if is_distribution_network:
    cloning_net = sequential.Sequential([
        expand_dims_layer.ExpandDims(-1),
        tf.keras.layers.Dense(action_spec.maximum - action_spec.minimum + 1),
        tf.keras.layers.Lambda(
            lambda t: tfp.distributions.Categorical(logits=t)),
    ])
  else:
    cloning_net = q_network.QNetwork(time_step_spec.observation, action_spec)

  agent = behavioral_cloning_agent.BehavioralCloningAgent(
      time_step_spec,
      action_spec,
      cloning_network=cloning_net,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001),
      num_outer_dims=2)
  # Disable clipping to make sure we can see the difference in behavior
  agent.policy._clip = False
  # TODO(b/123883319)
  if tf.executing_eagerly():
    train_and_loss = lambda: agent.train(traj)
  else:
    train_and_loss = agent.train(traj)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  initial_loss = self.evaluate(train_and_loss).loss
  for _ in range(TRAIN_ITERATIONS - 1):
    loss = self.evaluate(train_and_loss).loss
  # We don't necessarily converge to the same actions as in trajectory after
  # 10 steps of an untuned optimizer, but the loss should go down.
  self.assertGreater(initial_loss, loss)
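# Note on the parameterization above (added commentary): when the cloning
# network emits a tfp.distributions.Categorical rather than raw Q-values, the
# agent's loss is computed (to our understanding) from the distribution's
# negative log-probability of the demonstrated action instead of sparse
# softmax cross-entropy over logits. For a Categorical the two are equal in
# value, so both branches should drive the loss down on the same trajectory.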
def testInitializeRestoreAgent(self):
  cloning_net = DummyNet(self._observation_spec, self._action_spec)
  agent = behavioral_cloning_agent.BehavioralCloningAgent(
      self._time_step_spec,
      self._action_spec,
      cloning_network=cloning_net,
      optimizer=None)
  observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
  time_steps = ts.restart(observations, batch_size=2)
  policy = agent.policy
  action_step = policy.action(time_steps)
  self.evaluate(tf.compat.v1.global_variables_initializer())

  checkpoint = tf.train.Checkpoint(agent=agent)
  latest_checkpoint = tf.train.latest_checkpoint(self.get_temp_dir())
  checkpoint_load_status = checkpoint.restore(latest_checkpoint)

  with self.cached_session() as sess:
    checkpoint_load_status.initialize_or_restore(sess)
    self.assertAllEqual(sess.run(action_step.action), [0, 0])
def verifyVariableAssignAndRestore(self,
                                   observation_spec,
                                   action_spec,
                                   actor_net,
                                   loss_fn=None):
  strategy = tf.distribute.get_strategy()
  time_step_spec = ts.time_step_spec(observation_spec)
  with strategy.scope():
    # Use BehavioralCloningAgent instead of AWRAgent to test the network.
    agent = behavioral_cloning_agent.BehavioralCloningAgent(
        time_step_spec,
        action_spec,
        cloning_network=actor_net,
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss_fn=loss_fn)

  # Assign all vars to 0.
  for var in tf.nest.flatten(agent.variables):
    var.assign(tf.zeros_like(var))

  # Save checkpoint.
  ckpt_dir = self.create_tempdir()
  checkpointer = common.Checkpointer(ckpt_dir=ckpt_dir, agent=agent)
  global_step = tf.constant(0)
  checkpointer.save(global_step)

  # Assign all vars to 1.
  for var in tf.nest.flatten(agent.variables):
    var.assign(tf.ones_like(var))

  # Restore to 0.
  checkpointer._checkpoint.restore(checkpointer._manager.latest_checkpoint)
  for var in tf.nest.flatten(agent.variables):
    value = var.numpy()
    if isinstance(value, np.int64):
      self.assertEqual(value, 0)
    else:
      self.assertAllEqual(
          value,
          np.zeros_like(value),
          msg='{} has var mean {}, expected 0.'.format(var.name, value))
def test_training(self):
  test_agent = behavioral_cloning_agent.BehavioralCloningAgent(
      self._time_step_spec,
      self._action_spec,
      self._network,
      tf.compat.v1.train.AdamOptimizer(),
      num_outer_dims=2)
  test_trainer = trainer.Trainer(
      root_dir=self.get_temp_dir(), agent=test_agent)
  self.assertEqual(0, test_trainer._global_step.numpy())
  dataset_iter = _create_test_data(batch_size=3, sequence_length=3)
  monitor_dict = {'test': 1}

  with mock.patch.object(
      tf.summary, 'scalar', autospec=True) as mock_scalar_summary:
    test_trainer.train(dataset_iter, monitor_dict, num_iterations=10)
    self.assertEqual(
        10,
        sum(1 for c in mock_scalar_summary.mock_calls
            if c[2]['name'] == 'test'))
  self.assertEqual(10, test_trainer._global_step.numpy())
def test_save_policy(self):
  test_agent = behavioral_cloning_agent.BehavioralCloningAgent(
      self._time_step_spec,
      self._action_spec,
      self._network,
      tf.compat.v1.train.AdamOptimizer())
  policy_dict = {
      'saved_policy': test_agent.policy,
      'saved_collect_policy': test_agent.collect_policy
  }
  test_policy_saver = policy_saver.PolicySaver(policy_dict=policy_dict)
  root_dir = self.get_temp_dir()
  test_policy_saver.save(root_dir)
  sub_dirs = tf.io.gfile.listdir(root_dir)
  self.assertCountEqual(['saved_policy', 'saved_collect_policy'], sub_dirs)
  for sub_dir in ['saved_policy', 'saved_collect_policy']:
    self.assertTrue(
        tf.io.gfile.exists(
            os.path.join(root_dir, sub_dir, 'saved_model.pb')))
    self.assertTrue(
        tf.io.gfile.exists(
            os.path.join(root_dir, sub_dir,
                         'variables/variables.data-00000-of-00001')))
    output_signature_fn = os.path.join(root_dir, sub_dir, 'output_spec.json')
    self.assertTrue(tf.io.gfile.exists(output_signature_fn))
    self.assertEqual([{
        'logging_name': 'inlining_decision',
        'tensor_spec': {
            'name': 'StatefulPartitionedCall',
            'port': 0,
            'type': 'int64_t',
            'shape': [1],
        }
    }], json.loads(tf.io.gfile.GFile(output_signature_fn).read()))