def testTrain(self):
  agent = categorical_dqn_agent.CategoricalDqnAgent(
      self._time_step_spec,
      self._action_spec,
      self._dummy_categorical_net,
      self._optimizer)

  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  actions = tf.constant([0, 1], dtype=tf.int32)
  action_steps = policy_step.PolicyStep(actions)
  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
  next_observations = tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
  next_time_steps = ts.transition(next_observations, rewards, discounts)
  experience = test_utils.stacked_trajectory_from_transition(
      time_steps, action_steps, next_time_steps)

  train_step = agent.train(experience, weights=None)

  # Due to the constant initialization of the DummyCategoricalNet, we can
  # expect the same loss every time.
  expected_loss = 2.19525
  self.evaluate(tf.compat.v1.global_variables_initializer())
  evaluated_loss, _ = self.evaluate(train_step)
  self.assertAllClose(evaluated_loss, expected_loss, atol=1e-4)
def testLossWithL2Regularization(self, agent_class):
  q_net = DummyNet(self._observation_spec, self._action_spec,
                   l2_regularization_weight=1.0)
  agent = agent_class(
      self._time_step_spec,
      self._action_spec,
      q_network=q_net,
      optimizer=None)

  observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
  time_steps = ts.restart(observations, batch_size=2)
  actions = tf.constant([[0], [1]], dtype=tf.int32)
  action_steps = policy_step.PolicyStep(actions)
  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
  next_observations = [tf.constant([[5, 6], [7, 8]], dtype=tf.float32)]
  next_time_steps = ts.transition(next_observations, rewards, discounts)
  experience = trajectories_test_utils.stacked_trajectory_from_transition(
      time_steps, action_steps, next_time_steps)

  # See the loss explanation in testLoss above.
  # L2 regularization loss: 2^2 + 1^2 + 1^2 + 1^2 = 7.0
  # Overall loss: 26.0 (from testLoss) + 7.0 = 33.0
  expected_loss = 33.0
  loss, _ = agent._loss(experience)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllClose(self.evaluate(loss), expected_loss)
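# Standalone hand-check (not part of the test suite): a minimal NumPy sketch
# that re-derives expected_loss above. It assumes the kernel [[2, 1], [1, 1]]
# and bias [1, 1] stated in testLoss's comment, and an element-wise Huber loss
# that reduces to |error| - 0.5 for errors larger than 1.
import numpy as np

td_errors = np.array([25.3 - 5.0, 40.7 - 8.0])       # from testLoss: [20.3, 32.7]
huber = np.abs(td_errors) - 0.5                       # [19.8, 32.2]
td_loss = huber.mean()                                # 26.0
l2_loss = 1.0 * (2.0**2 + 1.0**2 + 1.0**2 + 1.0**2)   # weight 1.0 * 7.0
print(td_loss + l2_loss)                              # 33.0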
def testInitialize(self):
  agent = categorical_dqn_agent.CategoricalDqnAgent(
      self._time_step_spec,
      self._action_spec,
      self._categorical_net,
      self._optimizer)

  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  actions = tf.constant([0, 1], dtype=tf.int32)
  action_steps = policy_step.PolicyStep(actions)
  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
  next_time_steps = ts.transition(observations, rewards, discounts)
  experience = test_utils.stacked_trajectory_from_transition(
      time_steps, action_steps, next_time_steps)

  loss_info = agent._loss(experience)
  initialize = agent.initialize()

  self.evaluate(tf.compat.v1.global_variables_initializer())
  losses = self.evaluate(loss_info).loss
  self.assertGreater(losses, 0.0)

  critic_variables = agent._q_network.variables
  target_critic_variables = agent._target_q_network.variables
  self.assertTrue(critic_variables)
  self.assertTrue(target_critic_variables)
  self.evaluate(initialize)
  for s, t in zip(critic_variables, target_critic_variables):
    self.assertAllClose(self.evaluate(s), self.evaluate(t))
def testLossWithMaskedActions(self, agent_class):
  # Observations are now a tuple of the usual observation and an action mask.
  observation_spec_with_mask = (self._observation_spec,
                                tensor_spec.BoundedTensorSpec([2], tf.int32,
                                                              0, 1))
  time_step_spec = ts.time_step_spec(observation_spec_with_mask)
  q_net = DummyNet(observation_spec_with_mask, self._action_spec,
                   mask_split_fn=lambda x: (x[0], x[1]))
  agent = agent_class(
      time_step_spec, self._action_spec, q_network=q_net, optimizer=None)

  # For `observations`, the masks are set up so that all actions are valid.
  observations = ([tf.constant([[1, 2], [3, 4]], dtype=tf.float32)],
                  tf.constant([[1, 1], [1, 1]], dtype=tf.int32))
  time_steps = ts.restart(observations, batch_size=2)
  actions = tf.constant([[0], [1]], dtype=tf.int32)
  action_steps = policy_step.PolicyStep(actions)
  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)

  # For `next_observations`, the masks are set up so that only one action is
  # valid for each element in the batch.
  next_observations = ([tf.constant([[5, 6], [7, 8]], dtype=tf.float32)],
                       tf.constant([[0, 1], [1, 0]], dtype=tf.int32))
  next_time_steps = ts.transition(next_observations, rewards, discounts)
  experience = trajectories_test_utils.stacked_trajectory_from_transition(
      time_steps, action_steps, next_time_steps)

  # Using the kernel initializer [[2, 1], [1, 1]] and bias initializer
  # [[1], [1]] from DummyNet above, we can calculate the following values:
  # Q-value for first observation/action pair: 2 * 1 + 1 * 2 + 1 = 5
  # Q-value for second observation/action pair: 1 * 3 + 1 * 4 + 1 = 8
  # (Here we use the second row of the kernel initializer above, since the
  # chosen action is now 1 instead of 0.)
  #
  # For target Q-values, because of the masks we only have one valid choice
  # of action for each next_observation:
  # Target Q-value for first next_observation (only action 1 is valid):
  #   1 * 5 + 1 * 6 + 1 = 12
  # Target Q-value for second next_observation (only action 0 is valid):
  #   2 * 7 + 1 * 8 + 1 = 23
  # TD targets: 10 + 0.9 * 12 = 20.8 and 20 + 0.9 * 23 = 40.7
  # TD errors: 20.8 - 5 = 15.8 and 40.7 - 8 = 32.7
  # TD loss: 15.3 and 32.2 (Huber loss subtracts 0.5)
  # Overall loss: (15.3 + 32.2) / 2 = 23.75
  expected_loss = 23.75
  loss, _ = agent._loss(experience)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllClose(self.evaluate(loss), expected_loss)
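# Standalone hand-check (not part of the test suite): re-deriving
# expected_loss for the masked case with NumPy. Only the kernel row allowed by
# each next-state mask contributes to the target Q-value, per the comment
# above; the kernel/bias values are the ones stated there.
import numpy as np

kernel = np.array([[2.0, 1.0], [1.0, 1.0]])   # one row per action
bias = 1.0
next_obs = np.array([[5.0, 6.0], [7.0, 8.0]])
valid_action = [1, 0]                         # from masks [[0, 1], [1, 0]]

q_chosen = np.array([5.0, 8.0])               # Q(s, a) for chosen actions [0, 1]
target_q = np.array(
    [kernel[a] @ o + bias for a, o in zip(valid_action, next_obs)])  # [12, 23]
td_targets = np.array([10.0, 20.0]) + 0.9 * target_q                 # [20.8, 40.7]
td_errors = td_targets - q_chosen                                    # [15.8, 32.7]
print((np.abs(td_errors) - 0.5).mean())                              # 23.75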
def testLoss(self):
  q_net = DummyNet((self._observation_spec, self._action_spec))
  agent = qtopt_agent.QtOptAgent(
      self._time_step_spec,
      self._action_spec,
      q_network=q_net,
      optimizer=None,
      init_mean_cem=self._mean,
      init_var_cem=self._var,
      num_samples_cem=self._num_samples,
      actions_sampler=self._sampler)
  agent._target_q_network_delayed = DummyNet(
      (self._observation_spec, self._action_spec), bias=1)

  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  actions = tf.constant([[0.0], [0.0]], dtype=tf.float32)
  action_steps = policy_step.PolicyStep(actions, info=())
  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
  next_observations = tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
  next_time_steps = ts.transition(next_observations, rewards, discounts)
  experience = trajectories_test_utils.stacked_trajectory_from_transition(
      time_steps, action_steps, next_time_steps)

  # Using the kernel initializer [[2, 1], [1, 1]] and bias initializer
  # ([[2], [2]] for q_network/target_network, [[1], [1]] for the delayed
  # target_network) from DummyNet above, we can calculate the following
  # values:
  # Q Network:
  #   Q-value for first observation/action pair: 2 * 1 + 1 * 2 + 2 = 6
  #   Q-value for second observation/action pair: 2 * 3 + 1 * 4 + 2 = 12
  # Target Network:
  #   Q-value for first next_observation: 2 * 5 + 1 * 6 + 2 = 18
  #   Q-value for second next_observation: 2 * 7 + 1 * 8 + 2 = 24
  # Delayed Target Network:
  #   Q-value for first next_observation: 2 * 5 + 1 * 6 + 1 = 17
  #   Q-value for second next_observation: 2 * 7 + 1 * 8 + 1 = 23
  # TD targets: 10 + 0.9 * min(17, 18) = 25.3; 20 + 0.9 * min(23, 24) = 40.7
  # TD errors: 25.3 - 6 = 19.3; 40.7 - 12 = 28.7
  # TD loss: 18.8 and 28.2 (Huber loss subtracts 0.5)
  # Overall loss: (18.8 + 28.2) / 2 = 23.5
  expected_td_loss = 23.5
  loss, loss_info = agent._loss(experience)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllClose(self.evaluate(loss), expected_td_loss)
  self.assertAllClose(
      self.evaluate(tf.reduce_mean(loss_info.td_loss)), expected_td_loss)
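# Standalone hand-check (not part of the test suite): re-deriving
# expected_td_loss with NumPy. The clipped target described in the comment
# above takes the minimum of the live target network and the delayed target
# network before applying the discount.
import numpy as np

target_q = np.array([18.0, 24.0])           # live target network (bias 2)
delayed_target_q = np.array([17.0, 23.0])   # delayed target network (bias 1)
q_values = np.array([6.0, 12.0])            # Q(s, a) from the q_network

td_targets = np.array([10.0, 20.0]) + 0.9 * np.minimum(target_q,
                                                       delayed_target_q)
td_errors = td_targets - q_values           # [19.3, 28.7]
print((np.abs(td_errors) - 0.5).mean())     # 23.5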
def testCriticLossWithMaskedActions(self):
  # Observations are now a tuple of the usual observation and an action mask.
  observation_spec_with_mask = (self._obs_spec,
                                tensor_spec.BoundedTensorSpec([2], tf.int32,
                                                              0, 1))
  time_step_spec = ts.time_step_spec(observation_spec_with_mask)
  dummy_categorical_net = DummyCategoricalNet(self._obs_spec)
  agent = categorical_dqn_agent.CategoricalDqnAgent(
      time_step_spec,
      self._action_spec,
      dummy_categorical_net,
      self._optimizer,
      observation_and_action_constraint_splitter=lambda x: (x[0], x[1]))

  # For `observations`, the masks are set up so that only one action is valid
  # for each element in the batch.
  observations = (tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
                  tf.constant([[1, 0], [0, 1]], dtype=tf.int32))
  time_steps = ts.restart(observations, batch_size=2)
  actions = tf.constant([0, 1], dtype=tf.int32)
  action_steps = policy_step.PolicyStep(actions)
  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)

  # For `next_observations`, the masks are set up so that the opposite
  # actions as before are valid.
  next_observations = (tf.constant([[5, 6], [7, 8]], dtype=tf.float32),
                       tf.constant([[0, 1], [1, 0]], dtype=tf.int32))
  next_time_steps = ts.transition(next_observations, rewards, discounts)
  experience = test_utils.stacked_trajectory_from_transition(
      time_steps, action_steps, next_time_steps)

  # Due to the constant initialization of the DummyCategoricalNet, we can
  # expect the same loss every time. Note this is different from the loss in
  # testCriticLoss above due to previously optimal actions being masked out.
  expected_loss = 5.062895
  loss_info = agent._loss(experience)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  evaluated_loss = self.evaluate(loss_info).loss
  self.assertAllClose(evaluated_loss, expected_loss, atol=1e-4)
def testLossWithChangedOptimalActions(self, agent_class):
  q_net = DummyNet(self._observation_spec, self._action_spec)
  agent = agent_class(
      self._time_step_spec,
      self._action_spec,
      q_network=q_net,
      optimizer=None)

  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  actions = tf.constant([0, 1], dtype=tf.int32)
  action_steps = policy_step.PolicyStep(actions)
  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
  # Note that instead of [[5, 6], [7, 8]] as before, we now have -5 and -7.
  next_observations = tf.constant([[-5, 6], [-7, 8]], dtype=tf.float32)
  next_time_steps = ts.transition(next_observations, rewards, discounts)
  experience = trajectories_test_utils.stacked_trajectory_from_transition(
      time_steps, action_steps, next_time_steps)

  # Using the kernel initializer [[2, 1], [1, 1]] and bias initializer
  # [[1], [1]] from DummyNet above, we can calculate the following values:
  # Q-value for first observation/action pair: 2 * 1 + 1 * 2 + 1 = 5
  # Q-value for second observation/action pair: 1 * 3 + 1 * 4 + 1 = 8
  # (Here we use the second row of the kernel initializer above, since the
  # chosen action is now 1 instead of 0.)
  #
  # For the target Q-values here, note that since we've replaced 5 and 7 with
  # -5 and -7, it is better to use action 1 with a kernel of [1, 1] instead
  # of action 0 with a kernel of [2, 1].
  # Target Q-value for first next_observation: 1 * -5 + 1 * 6 + 1 = 2
  # Target Q-value for second next_observation: 1 * -7 + 1 * 8 + 1 = 2
  # TD targets: 10 + 0.9 * 2 = 11.8 and 20 + 0.9 * 2 = 21.8
  # TD errors: 11.8 - 5 = 6.8 and 21.8 - 8 = 13.8
  # TD loss: 6.3 and 13.3 (Huber loss subtracts 0.5)
  # Overall loss: (6.3 + 13.3) / 2 = 9.8
  expected_loss = 9.8
  loss, _ = agent._loss(experience)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllClose(self.evaluate(loss), expected_loss)
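# Standalone hand-check (not part of the test suite): with the negated first
# feature, the argmax over target Q-values switches to action 1 (kernel row
# [1, 1]). This NumPy sketch, using the kernel/bias stated in the comment
# above, makes the switch explicit.
import numpy as np

kernel = np.array([[2.0, 1.0], [1.0, 1.0]])    # one row per action
bias = 1.0
next_obs = np.array([[-5.0, 6.0], [-7.0, 8.0]])

all_target_q = next_obs @ kernel.T + bias              # [[-3, 2], [-5, 2]]
target_q = all_target_q.max(axis=1)                    # [2, 2], both from action 1
td_targets = np.array([10.0, 20.0]) + 0.9 * target_q   # [11.8, 21.8]
td_errors = td_targets - np.array([5.0, 8.0])          # [6.8, 13.8]
print((np.abs(td_errors) - 0.5).mean())                # 9.8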
def testLoss(self, agent_class, run_mode):
  if tf.executing_eagerly() and run_mode == context.graph_mode:
    self.skipTest('b/123778560')
  with run_mode(), tf.compat.v2.summary.record_if(False):
    q_net = DummyNet(self._observation_spec, self._action_spec)
    agent = agent_class(
        self._time_step_spec,
        self._action_spec,
        q_network=q_net,
        optimizer=None)

    observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
    time_steps = ts.restart(observations, batch_size=2)
    actions = [tf.constant([[0], [1]], dtype=tf.int32)]
    action_steps = policy_step.PolicyStep(actions)
    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
    next_observations = [tf.constant([[5, 6], [7, 8]], dtype=tf.float32)]
    next_time_steps = ts.transition(next_observations, rewards, discounts)
    experience = test_utils.stacked_trajectory_from_transition(
        time_steps, action_steps, next_time_steps)

    # Using the kernel initializer [[2, 1], [1, 1]] and bias initializer
    # [[1], [1]] from DummyNet above, we can calculate the following values:
    # Q-value for first observation/action pair: 2 * 1 + 1 * 2 + 1 = 5
    # Q-value for second observation/action pair: 1 * 3 + 1 * 4 + 1 = 8
    # (Here we use the second row of the kernel initializer above, since the
    # chosen action is now 1 instead of 0.)
    # Q-value for first next_observation: 2 * 5 + 1 * 6 + 1 = 17
    # Q-value for second next_observation: 2 * 7 + 1 * 8 + 1 = 23
    # TD targets: 10 + 0.9 * 17 = 25.3 and 20 + 0.9 * 23 = 40.7
    # TD errors: 25.3 - 5 = 20.3 and 40.7 - 8 = 32.7
    # TD loss: 19.8 and 32.2 (Huber loss subtracts 0.5)
    # Overall loss: (19.8 + 32.2) / 2 = 26
    expected_loss = 26.0
    loss, _ = agent._loss(experience)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAllClose(self.evaluate(loss), expected_loss)
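# Standalone hand-check (not part of the test suite): the full computation in
# the comment above as a NumPy forward pass, assuming DummyNet's kernel
# [[2, 1], [1, 1]] and bias [1, 1].
import numpy as np

kernel = np.array([[2.0, 1.0], [1.0, 1.0]])    # one row per action
bias = np.array([1.0, 1.0])
obs = np.array([[1.0, 2.0], [3.0, 4.0]])
next_obs = np.array([[5.0, 6.0], [7.0, 8.0]])
chosen = np.array([0, 1])

q_all = obs @ kernel.T + bias                         # [[5, 4], [11, 8]]
q_chosen = q_all[np.arange(2), chosen]                # [5, 8]
target_q = (next_obs @ kernel.T + bias).max(axis=1)   # [17, 23]
td_targets = np.array([10.0, 20.0]) + 0.9 * target_q  # [25.3, 40.7]
td_errors = td_targets - q_chosen                     # [20.3, 32.7]
huber = np.abs(td_errors) - 0.5                       # both errors exceed 1
print(huber.mean())                                   # 26.0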
def testUpdateTarget(self):
  agent = categorical_dqn_agent.CategoricalDqnAgent(
      self._time_step_spec,
      self._action_spec,
      self._categorical_net,
      self._optimizer)

  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  actions = tf.constant([0, 1], dtype=tf.int32)
  action_steps = policy_step.PolicyStep(actions)
  experience = test_utils.stacked_trajectory_from_transition(
      time_steps, action_steps, time_steps)

  loss_info = agent._loss(experience)
  update_targets = agent._update_target()

  self.evaluate(tf.compat.v1.global_variables_initializer())
  losses = self.evaluate(loss_info).loss
  self.assertGreater(losses, 0.0)
  self.evaluate(update_targets)
def testLoss(self, agent_class):
  q_net = DummyNet(self._observation_spec, self._action_spec)
  agent = agent_class(
      self._time_step_spec,
      self._action_spec,
      q_network=q_net,
      optimizer=None)

  observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  actions = tf.constant([0, 1], dtype=tf.int32)
  action_steps = policy_step.PolicyStep(actions)
  rewards = tf.constant([10, 20], dtype=tf.float32)
  discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
  next_observations = tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
  next_time_steps = ts.transition(next_observations, rewards, discounts)
  experience = trajectories_test_utils.stacked_trajectory_from_transition(
      time_steps, action_steps, next_time_steps)

  # Using the kernel initializer [[2, 1], [1, 1]] and bias initializer
  # [[1], [1]] from DummyNet above, we can calculate the following values:
  # Q-value for first observation/action pair: 2 * 1 + 1 * 2 + 1 = 5
  # Q-value for second observation/action pair: 1 * 3 + 1 * 4 + 1 = 8
  # (Here we use the second row of the kernel initializer above, since the
  # chosen action is now 1 instead of 0.)
  #
  # For target Q-values, action 0 (kernel [2, 1]) produces a greater Q-value
  # than action 1 (kernel [1, 1]) for both next_observations.
  # Target Q-value for first next_observation: 2 * 5 + 1 * 6 + 1 = 17
  # Target Q-value for second next_observation: 2 * 7 + 1 * 8 + 1 = 23
  # TD targets: 10 + 0.9 * 17 = 25.3 and 20 + 0.9 * 23 = 40.7
  # TD errors: 25.3 - 5 = 20.3 and 40.7 - 8 = 32.7
  # TD loss: 19.8 and 32.2 (Huber loss subtracts 0.5)
  # Overall loss: (19.8 + 32.2) / 2 = 26
  expected_loss = 26.0
  loss, _ = agent._loss(experience)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.assertAllClose(self.evaluate(loss), expected_loss)
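# Standalone hand-check (not part of the test suite): why "Huber loss
# subtracts 0.5" in the comment above. For delta = 1, the element-wise Huber
# loss is 0.5 * e^2 when |e| <= 1 and |e| - 0.5 otherwise; both TD errors here
# exceed 1, so each contributes |e| - 0.5.
import numpy as np

td_errors = np.array([25.3 - 5.0, 40.7 - 8.0])   # [20.3, 32.7]
huber = np.where(np.abs(td_errors) <= 1.0,
                 0.5 * td_errors**2,
                 np.abs(td_errors) - 0.5)        # [19.8, 32.2]
print(huber.mean())                              # 26.0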