# Module-level imports assumed by this function (paths follow the Ray 1.x
# RLlib layout; huber_loss moved to ray.rllib.utils.tf_utils in later
# releases, and compute_q_values is defined in the same simple_q_tf_policy
# module as this loss function).
from typing import Type

from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.tf_action_dist import TFActionDistribution
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.tf_ops import huber_loss
from ray.rllib.utils.typing import TensorType

tf1, tf, tfv = try_import_tf()


def build_q_losses(policy: Policy, model: ModelV2,
                   dist_class: Type[TFActionDistribution],
                   train_batch: SampleBatch) -> TensorType:
    """Constructs the loss for SimpleQTFPolicy.

    Args:
        policy (Policy): The Policy to calculate the loss for.
        model (ModelV2): The Model to calculate the loss for.
        dist_class (Type[ActionDistribution]): The action distribution class.
        train_batch (SampleBatch): The training data.

    Returns:
        TensorType: A single loss tensor.
    """
    # Q-network evaluation on the current observations.
    q_t = compute_q_values(
        policy, policy.model, train_batch[SampleBatch.CUR_OBS], explore=False)

    # Target Q-network evaluation on the next observations.
    q_tp1 = compute_q_values(
        policy, policy.target_model, train_batch[SampleBatch.NEXT_OBS],
        explore=False)

    if not hasattr(policy, "q_func_vars"):
        policy.q_func_vars = model.variables()
        policy.target_q_func_vars = policy.target_model.variables()

    # Q scores for the actions that were actually taken in each state.
    one_hot_selection = tf.one_hot(
        tf.cast(train_batch[SampleBatch.ACTIONS], tf.int32),
        policy.action_space.n)
    q_t_selected = tf.reduce_sum(q_t * one_hot_selection, 1)

    # Compute an estimate of the best possible value starting from the
    # state at t + 1, masked out for terminal states.
    dones = tf.cast(train_batch[SampleBatch.DONES], tf.float32)
    q_tp1_best_one_hot_selection = tf.one_hot(
        tf.argmax(q_tp1, 1), policy.action_space.n)
    q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
    q_tp1_best_masked = (1.0 - dones) * q_tp1_best

    # Compute the RHS of the Bellman equation.
    q_t_selected_target = (
        train_batch[SampleBatch.REWARDS] +
        policy.config["gamma"] * q_tp1_best_masked)

    # Compute the (Huber-clipped) TD error; stop_gradient keeps the target
    # network out of the gradient computation.
    td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
    loss = tf.reduce_mean(huber_loss(td_error))

    # Save the TD error as an attribute for outside access.
    policy.td_error = td_error

    return loss
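
# --- Illustration (not part of the policy code above) ---------------------
# A minimal, standalone numeric check of the TD-target arithmetic, assuming
# nothing from RLlib; all tensor values are made up for the example. The
# target computed above is
#     y_t = r_t + gamma * (1 - done_t) * max_a' Q_target(s_{t+1}, a'),
# and the loss is mean(huber(Q(s_t, a_t) - stop_gradient(y_t))).
import tensorflow as tf

q_t = tf.constant([[1.0, 2.0], [0.5, 0.1]])     # Q(s_t, .) for 2 transitions
q_tp1 = tf.constant([[0.0, 3.0], [1.0, -1.0]])  # Q_target(s_{t+1}, .)
actions = tf.constant([1, 0])
rewards = tf.constant([1.0, 0.0])
dones = tf.constant([0.0, 1.0])                 # 2nd transition is terminal
gamma = 0.99

one_hot = tf.one_hot(actions, 2)
q_t_selected = tf.reduce_sum(q_t * one_hot, 1)  # -> [2.0, 0.5]
# reduce_max gives the same values as the argmax/one-hot/sum combination
# used in the policy code:
q_tp1_best = tf.reduce_max(q_tp1, 1)            # -> [3.0, 1.0]
target = rewards + gamma * (1.0 - dones) * q_tp1_best  # -> [3.97, 0.0]
td_error = q_t_selected - tf.stop_gradient(target)     # -> [-1.97, 0.5]
# ---------------------------------------------------------------------------
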
# Imports assumed by the torch variant (again per the Ray 1.x layout;
# compute_q_values here is the torch version from the same
# simple_q_torch_policy module).
from ray.rllib.utils.framework import try_import_torch
from ray.rllib.utils.torch_ops import huber_loss

torch, nn = try_import_torch()
F = nn.functional


def build_q_losses(policy: Policy, model, dist_class,
                   train_batch: SampleBatch) -> TensorType:
    """Constructs the loss for SimpleQTorchPolicy.

    Args:
        policy (Policy): The Policy to calculate the loss for.
        model (ModelV2): The Model to calculate the loss for.
        dist_class (Type[ActionDistribution]): The action distribution class.
        train_batch (SampleBatch): The training data.

    Returns:
        TensorType: A single loss tensor.
    """
    target_model = policy.target_models[model]

    # Q-network evaluation on the current observations.
    q_t = compute_q_values(
        policy, model, train_batch[SampleBatch.CUR_OBS], explore=False,
        is_training=True)

    # Target Q-network evaluation on the next observations.
    q_tp1 = compute_q_values(
        policy, target_model, train_batch[SampleBatch.NEXT_OBS],
        explore=False, is_training=True)

    # Q scores for the actions that were actually taken in each state.
    one_hot_selection = F.one_hot(
        train_batch[SampleBatch.ACTIONS].long(), policy.action_space.n)
    q_t_selected = torch.sum(q_t * one_hot_selection, 1)

    # Compute an estimate of the best possible value starting from the
    # state at t + 1, masked out for terminal states.
    dones = train_batch[SampleBatch.DONES].float()
    q_tp1_best_one_hot_selection = F.one_hot(
        torch.argmax(q_tp1, 1), policy.action_space.n)
    q_tp1_best = torch.sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
    q_tp1_best_masked = (1.0 - dones) * q_tp1_best

    # Compute the RHS of the Bellman equation.
    q_t_selected_target = (
        train_batch[SampleBatch.REWARDS] +
        policy.config["gamma"] * q_tp1_best_masked)

    # Compute the (square/Huber) TD error; .detach() plays the role of
    # tf.stop_gradient and keeps the target network out of the gradient.
    td_error = q_t_selected - q_t_selected_target.detach()
    loss = torch.mean(huber_loss(td_error))

    # Save the TD error as an attribute for outside access.
    policy.td_error = td_error

    return loss
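
# --- Illustration (not part of the policy code above) ---------------------
# A minimal end-to-end sketch of the torch loss path on dummy tensors,
# standalone rather than RLlib's implementation. huber_loss is re-stated
# here to match RLlib's piecewise definition (delta=1.0) instead of being
# imported; all tensor values are made up for the example.
import torch
import torch.nn.functional as F

def huber_loss(x: torch.Tensor, delta: float = 1.0) -> torch.Tensor:
    # 0.5 * x^2 for |x| < delta, linear beyond the threshold.
    return torch.where(
        torch.abs(x) < delta,
        0.5 * torch.pow(x, 2.0),
        delta * (torch.abs(x) - 0.5 * delta))

q_t = torch.tensor([[1.0, 2.0], [0.5, 0.1]], requires_grad=True)
q_tp1 = torch.tensor([[0.0, 3.0], [1.0, -1.0]])
actions = torch.tensor([1, 0])
rewards = torch.tensor([1.0, 0.0])
dones = torch.tensor([0.0, 1.0])                # 2nd transition is terminal
gamma = 0.99

one_hot = F.one_hot(actions, num_classes=2).float()
q_t_selected = (q_t * one_hot).sum(1)           # -> [2.0, 0.5]
q_tp1_best = q_tp1.max(1).values                # -> [3.0, 1.0]
target = rewards + gamma * (1.0 - dones) * q_tp1_best  # -> [3.97, 0.0]
td_error = q_t_selected - target.detach()       # -> [-1.97, 0.5]
loss = huber_loss(td_error).mean()              # -> ~0.7975
loss.backward()  # gradients flow only into q_t, thanks to .detach()
# ---------------------------------------------------------------------------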