def __init__(
    self,
    q_t_selected: TensorType,
    q_logits_t_selected: TensorType,
    q_tp1_best: TensorType,
    q_dist_tp1_best: TensorType,
    importance_weights: TensorType,
    rewards: TensorType,
    done_mask: TensorType,
    gamma: float = 0.99,
    n_step: int = 1,
    num_atoms: int = 1,
    v_min: float = -10.0,
    v_max: float = 10.0,
):
    if num_atoms > 1:
        # Distributional Q-learning, which corresponds to an entropy loss.
        z = tf.range(num_atoms, dtype=tf.float32)
        z = v_min + z * (v_max - v_min) / float(num_atoms - 1)

        # (batch_size, 1) * (1, num_atoms) = (batch_size, num_atoms)
        r_tau = tf.expand_dims(
            rewards, -1) + gamma**n_step * tf.expand_dims(
                1.0 - done_mask, -1) * tf.expand_dims(z, 0)
        r_tau = tf.clip_by_value(r_tau, v_min, v_max)
        b = (r_tau - v_min) / ((v_max - v_min) / float(num_atoms - 1))
        lb = tf.floor(b)
        ub = tf.math.ceil(b)

        # An indispensable check that is missed in most implementations:
        # when b happens to be an integer, lb == ub, so pr_j(s', a*) would
        # be discarded because (ub - b) == (b - lb) == 0.
        floor_equal_ceil = tf.cast(tf.less(ub - lb, 0.5), tf.float32)

        l_project = tf.one_hot(
            tf.cast(lb, dtype=tf.int32),
            num_atoms)  # (batch_size, num_atoms, num_atoms)
        u_project = tf.one_hot(
            tf.cast(ub, dtype=tf.int32),
            num_atoms)  # (batch_size, num_atoms, num_atoms)
        ml_delta = q_dist_tp1_best * (ub - b + floor_equal_ceil)
        mu_delta = q_dist_tp1_best * (b - lb)
        ml_delta = tf.reduce_sum(
            l_project * tf.expand_dims(ml_delta, -1), axis=1)
        mu_delta = tf.reduce_sum(
            u_project * tf.expand_dims(mu_delta, -1), axis=1)
        m = ml_delta + mu_delta

        # The Rainbow paper claims that using this cross-entropy loss for
        # priority is robust and insensitive to `prioritized_replay_alpha`.
        self.td_error = tf.nn.softmax_cross_entropy_with_logits(
            labels=m, logits=q_logits_t_selected)
        self.loss = tf.reduce_mean(
            self.td_error * tf.cast(importance_weights, tf.float32))
        self.stats = {
            # TODO: better Q stats for dist dqn
            "mean_td_error": tf.reduce_mean(self.td_error),
        }
    else:
        q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best

        # Compute the RHS of the Bellman equation.
        q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked

        # Compute the error (potentially clipped).
        self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        self.loss = tf.reduce_mean(
            tf.cast(importance_weights, tf.float32) *
            huber_loss(self.td_error))
        self.stats = {
            "mean_q": tf.reduce_mean(q_t_selected),
            "min_q": tf.reduce_min(q_t_selected),
            "max_q": tf.reduce_max(q_t_selected),
            "mean_td_error": tf.reduce_mean(self.td_error),
        }
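

# --- Illustrative example (not part of the class above) ---------------------
# A minimal eager-TF2 sketch of the categorical (C51-style) projection used in
# the distributional branch above: the Bellman-shifted support r + gamma * z is
# projected back onto the fixed atoms z. The 5-atom support, the numbers, and
# the helper name `_categorical_projection_sketch` are illustrative only.
def _categorical_projection_sketch():
    import tensorflow as tf

    num_atoms, v_min, v_max = 5, -10.0, 10.0
    z = v_min + tf.range(num_atoms, dtype=tf.float32) * (
        (v_max - v_min) / float(num_atoms - 1))

    # Next-state distribution (uniform here) and the shifted, clipped support.
    p_tp1 = tf.fill([1, num_atoms], 1.0 / num_atoms)
    r_tau = tf.clip_by_value(1.5 + 0.99 * tf.expand_dims(z, 0), v_min, v_max)

    b = (r_tau - v_min) / ((v_max - v_min) / float(num_atoms - 1))
    lb, ub = tf.floor(b), tf.math.ceil(b)
    # Guard for b landing exactly on an atom (lb == ub), as in the class above.
    eq = tf.cast(tf.less(ub - lb, 0.5), tf.float32)

    ml = tf.reduce_sum(
        tf.one_hot(tf.cast(lb, tf.int32), num_atoms) *
        tf.expand_dims(p_tp1 * (ub - b + eq), -1), axis=1)
    mu = tf.reduce_sum(
        tf.one_hot(tf.cast(ub, tf.int32), num_atoms) *
        tf.expand_dims(p_tp1 * (b - lb), -1), axis=1)
    m = ml + mu  # Projected target distribution; each row sums to 1.0.
    return m
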
def sac_actor_critic_loss(
    policy: Policy,
    model: ModelV2,
    dist_class: Type[TFActionDistribution],
    train_batch: SampleBatch,
) -> Union[TensorType, List[TensorType]]:
    """Constructs the loss for the Soft Actor Critic.

    Args:
        policy (Policy): The Policy to calculate the loss for.
        model (ModelV2): The Model to calculate the loss for.
        dist_class (Type[ActionDistribution]): The action distribution class.
        train_batch (SampleBatch): The training data.

    Returns:
        Union[TensorType, List[TensorType]]: A single loss tensor or a list of
            loss tensors.
    """
    # Should be True only for debugging purposes (e.g. test cases)!
    deterministic = policy.config["_deterministic_loss"]
    _is_training = policy._get_is_training_placeholder()

    # Get the base model output from the train batch.
    model_out_t, _ = model(
        SampleBatch(
            obs=train_batch[SampleBatch.CUR_OBS], _is_training=_is_training),
        [],
        None,
    )

    # Get the base model output from the next observations in the train batch.
    model_out_tp1, _ = model(
        SampleBatch(
            obs=train_batch[SampleBatch.NEXT_OBS], _is_training=_is_training),
        [],
        None,
    )

    # Get the target model's base outputs from the next observations in the
    # train batch.
    target_model_out_tp1, _ = policy.target_model(
        SampleBatch(
            obs=train_batch[SampleBatch.NEXT_OBS], _is_training=_is_training),
        [],
        None,
    )

    # Discrete actions case.
    if model.discrete:
        # Get all action probs directly from pi and form their logp.
        log_pis_t = tf.nn.log_softmax(
            model.get_policy_output(model_out_t), -1)
        policy_t = tf.math.exp(log_pis_t)
        log_pis_tp1 = tf.nn.log_softmax(
            model.get_policy_output(model_out_tp1), -1)
        policy_tp1 = tf.math.exp(log_pis_tp1)
        # Q-values.
        q_t = model.get_q_values(model_out_t)
        # Target Q-values.
        q_tp1 = policy.target_model.get_q_values(target_model_out_tp1)
        if policy.config["twin_q"]:
            twin_q_t = model.get_twin_q_values(model_out_t)
            twin_q_tp1 = policy.target_model.get_twin_q_values(
                target_model_out_tp1)
            q_tp1 = tf.reduce_min((q_tp1, twin_q_tp1), axis=0)
        q_tp1 -= model.alpha * log_pis_tp1

        # Actually selected Q-values (from the actions batch).
        one_hot = tf.one_hot(
            train_batch[SampleBatch.ACTIONS], depth=q_t.shape.as_list()[-1])
        q_t_selected = tf.reduce_sum(q_t * one_hot, axis=-1)
        if policy.config["twin_q"]:
            twin_q_t_selected = tf.reduce_sum(twin_q_t * one_hot, axis=-1)
        # Discrete case: "Best" means weighted by the policy (prob) outputs.
        q_tp1_best = tf.reduce_sum(tf.multiply(policy_tp1, q_tp1), axis=-1)
        q_tp1_best_masked = (
            1.0 - tf.cast(train_batch[SampleBatch.DONES],
                          tf.float32)) * q_tp1_best
    # Continuous actions case.
    else:
        # Sample single actions from the distribution.
        action_dist_class = _get_dist_class(policy, policy.config,
                                            policy.action_space)
        action_dist_t = action_dist_class(
            model.get_policy_output(model_out_t), policy.model)
        policy_t = (
            action_dist_t.sample()
            if not deterministic else action_dist_t.deterministic_sample())
        log_pis_t = tf.expand_dims(action_dist_t.logp(policy_t), -1)
        action_dist_tp1 = action_dist_class(
            model.get_policy_output(model_out_tp1), policy.model)
        policy_tp1 = (
            action_dist_tp1.sample()
            if not deterministic else action_dist_tp1.deterministic_sample())
        log_pis_tp1 = tf.expand_dims(action_dist_tp1.logp(policy_tp1), -1)

        # Q-values for the actually selected actions.
        q_t = model.get_q_values(
            model_out_t,
            tf.cast(train_batch[SampleBatch.ACTIONS], tf.float32))
        if policy.config["twin_q"]:
            twin_q_t = model.get_twin_q_values(
                model_out_t,
                tf.cast(train_batch[SampleBatch.ACTIONS], tf.float32))

        # Q-values for the current policy in the given current state.
        q_t_det_policy = model.get_q_values(model_out_t, policy_t)
        if policy.config["twin_q"]:
            twin_q_t_det_policy = model.get_twin_q_values(
                model_out_t, policy_t)
            q_t_det_policy = tf.reduce_min(
                (q_t_det_policy, twin_q_t_det_policy), axis=0)

        # Target Q-network evaluation.
        q_tp1 = policy.target_model.get_q_values(target_model_out_tp1,
                                                 policy_tp1)
        if policy.config["twin_q"]:
            twin_q_tp1 = policy.target_model.get_twin_q_values(
                target_model_out_tp1, policy_tp1)
            # Take the min over both twin-NNs.
            q_tp1 = tf.reduce_min((q_tp1, twin_q_tp1), axis=0)

        q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
        if policy.config["twin_q"]:
            twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
        q_tp1 -= model.alpha * log_pis_tp1

        q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
        q_tp1_best_masked = (
            1.0 - tf.cast(train_batch[SampleBatch.DONES],
                          tf.float32)) * q_tp1_best

    # Compute the RHS of the Bellman equation for the Q-loss (critic(s)).
    q_t_selected_target = tf.stop_gradient(
        tf.cast(train_batch[SampleBatch.REWARDS], tf.float32) +
        policy.config["gamma"]**policy.config["n_step"] * q_tp1_best_masked)

    # Compute the TD-error (potentially clipped).
    base_td_error = tf.math.abs(q_t_selected - q_t_selected_target)
    if policy.config["twin_q"]:
        twin_td_error = tf.math.abs(twin_q_t_selected - q_t_selected_target)
        td_error = 0.5 * (base_td_error + twin_td_error)
    else:
        td_error = base_td_error

    # Calculate one or two critic losses (2 in the twin_q case).
    prio_weights = tf.cast(train_batch[PRIO_WEIGHTS], tf.float32)
    critic_loss = [tf.reduce_mean(prio_weights * huber_loss(base_td_error))]
    if policy.config["twin_q"]:
        critic_loss.append(
            tf.reduce_mean(prio_weights * huber_loss(twin_td_error)))

    # Alpha- and actor losses.
    # Note: In the papers, alpha is used directly; here we take the log.
    # Discrete case: Multiply the action probs as weights with the original
    # loss terms (no expectations needed).
    if model.discrete:
        alpha_loss = tf.reduce_mean(
            tf.reduce_sum(
                tf.multiply(
                    tf.stop_gradient(policy_t),
                    -model.log_alpha *
                    tf.stop_gradient(log_pis_t + model.target_entropy),
                ),
                axis=-1,
            ))
        actor_loss = tf.reduce_mean(
            tf.reduce_sum(
                tf.multiply(
                    # NOTE: No stop_grad around the policy output here
                    # (compare with q_t_det_policy for the continuous case).
                    policy_t,
                    model.alpha * log_pis_t - tf.stop_gradient(q_t),
                ),
                axis=-1,
            ))
    else:
        alpha_loss = -tf.reduce_mean(
            model.log_alpha *
            tf.stop_gradient(log_pis_t + model.target_entropy))
        actor_loss = tf.reduce_mean(model.alpha * log_pis_t - q_t_det_policy)

    # Save for the stats function.
    policy.policy_t = policy_t
    policy.q_t = q_t
    policy.td_error = td_error
    policy.actor_loss = actor_loss
    policy.critic_loss = critic_loss
    policy.alpha_loss = alpha_loss
    policy.alpha_value = model.alpha
    policy.target_entropy = model.target_entropy

    # In a custom apply op we handle the losses separately, but return them
    # combined in one loss here.
    return actor_loss + tf.math.add_n(critic_loss) + alpha_loss
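

# --- Illustrative example (separate from the loss above) --------------------
# Toy eager-TF2 numbers for the soft Bellman target that the critic loss above
# regresses onto (continuous-action path):
#   target = r + gamma^n * (1 - done) * (min(Q1', Q2') - alpha * log_pi').
# All tensor values and the helper name `_soft_bellman_target_sketch` are made
# up for illustration.
def _soft_bellman_target_sketch():
    import tensorflow as tf

    rewards = tf.constant([1.0, 0.5])
    dones = tf.constant([0.0, 1.0])
    q1_tp1 = tf.constant([2.0, 3.0])
    q2_tp1 = tf.constant([1.8, 3.5])          # Twin critic.
    log_pis_tp1 = tf.constant([-1.2, -0.7])
    alpha, gamma, n_step = 0.2, 0.99, 1

    q_tp1 = tf.reduce_min((q1_tp1, q2_tp1), axis=0) - alpha * log_pis_tp1
    q_tp1_masked = (1.0 - dones) * q_tp1
    target = tf.stop_gradient(rewards + gamma**n_step * q_tp1_masked)
    # target ~= [1.0 + 0.99 * (1.8 + 0.24), 0.5]; the 2nd entry is reduced to
    # the bare reward because its transition is terminal.
    return target
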
def ddpg_actor_critic_loss(policy: Policy, model: ModelV2, _,
                           train_batch: SampleBatch) -> TensorType:
    twin_q = policy.config["twin_q"]
    gamma = policy.config["gamma"]
    n_step = policy.config["n_step"]
    use_huber = policy.config["use_huber"]
    huber_threshold = policy.config["huber_threshold"]
    l2_reg = policy.config["l2_reg"]

    input_dict = SampleBatch(
        obs=train_batch[SampleBatch.CUR_OBS], _is_training=True)
    input_dict_next = SampleBatch(
        obs=train_batch[SampleBatch.NEXT_OBS], _is_training=True)

    model_out_t, _ = model(input_dict, [], None)
    model_out_tp1, _ = model(input_dict_next, [], None)
    target_model_out_tp1, _ = policy.target_model(input_dict_next, [], None)

    policy.target_q_func_vars = policy.target_model.variables()

    # Policy network evaluation.
    policy_t = model.get_policy_output(model_out_t)
    policy_tp1 = policy.target_model.get_policy_output(target_model_out_tp1)

    # Action outputs.
    if policy.config["smooth_target_policy"]:
        target_noise_clip = policy.config["target_noise_clip"]
        clipped_normal_sample = tf.clip_by_value(
            tf.random.normal(
                tf.shape(policy_tp1), stddev=policy.config["target_noise"]),
            -target_noise_clip,
            target_noise_clip,
        )
        policy_tp1_smoothed = tf.clip_by_value(
            policy_tp1 + clipped_normal_sample,
            policy.action_space.low * tf.ones_like(policy_tp1),
            policy.action_space.high * tf.ones_like(policy_tp1),
        )
    else:
        # No smoothing, just use deterministic actions.
        policy_tp1_smoothed = policy_tp1

    # Q-net(s) evaluation.
    # prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
    # Q-values for given actions & observations in the given current state.
    q_t = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS])

    # Q-values for the current policy (no noise) in the given current state.
    q_t_det_policy = model.get_q_values(model_out_t, policy_t)

    if twin_q:
        twin_q_t = model.get_twin_q_values(model_out_t,
                                           train_batch[SampleBatch.ACTIONS])

    # Target Q-net(s) evaluation.
    q_tp1 = policy.target_model.get_q_values(target_model_out_tp1,
                                             policy_tp1_smoothed)
    if twin_q:
        twin_q_tp1 = policy.target_model.get_twin_q_values(
            target_model_out_tp1, policy_tp1_smoothed)

    q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
    if twin_q:
        twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
        q_tp1 = tf.minimum(q_tp1, twin_q_tp1)

    q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
    q_tp1_best_masked = (
        1.0 - tf.cast(train_batch[SampleBatch.DONES],
                      tf.float32)) * q_tp1_best

    # Compute the RHS of the Bellman equation.
    q_t_selected_target = tf.stop_gradient(
        tf.cast(train_batch[SampleBatch.REWARDS], tf.float32) +
        gamma**n_step * q_tp1_best_masked)

    # Compute the error (potentially clipped).
    if twin_q:
        td_error = q_t_selected - q_t_selected_target
        twin_td_error = twin_q_t_selected - q_t_selected_target
        if use_huber:
            errors = huber_loss(td_error, huber_threshold) + huber_loss(
                twin_td_error, huber_threshold)
        else:
            errors = 0.5 * tf.math.square(td_error) + 0.5 * tf.math.square(
                twin_td_error)
    else:
        td_error = q_t_selected - q_t_selected_target
        if use_huber:
            errors = huber_loss(td_error, huber_threshold)
        else:
            errors = 0.5 * tf.math.square(td_error)

    critic_loss = tf.reduce_mean(
        tf.cast(train_batch[PRIO_WEIGHTS], tf.float32) * errors)
    actor_loss = -tf.reduce_mean(q_t_det_policy)

    # Add l2-regularization if required.
    if l2_reg is not None:
        for var in policy.model.policy_variables():
            if "bias" not in var.name:
                actor_loss += l2_reg * tf.nn.l2_loss(var)
        for var in policy.model.q_variables():
            if "bias" not in var.name:
                critic_loss += l2_reg * tf.nn.l2_loss(var)

    # Model self-supervised losses.
if policy.config["use_state_preprocessor"]: # Expand input_dict in case custom_loss' need them. input_dict[SampleBatch.ACTIONS] = train_batch[SampleBatch.ACTIONS] input_dict[SampleBatch.REWARDS] = train_batch[SampleBatch.REWARDS] input_dict[SampleBatch.DONES] = train_batch[SampleBatch.DONES] input_dict[SampleBatch.NEXT_OBS] = train_batch[SampleBatch.NEXT_OBS] if log_once("ddpg_custom_loss"): logger.warning( "You are using a state-preprocessor with DDPG and " "therefore, `custom_loss` will be called on your Model! " "Please be aware that DDPG now uses the ModelV2 API, which " "merges all previously separate sub-models (policy_model, " "q_model, and twin_q_model) into one ModelV2, on which " "`custom_loss` is called, passing it " "[actor_loss, critic_loss] as 1st argument. " "You may have to change your custom loss function to handle " "this.") [actor_loss, critic_loss] = model.custom_loss([actor_loss, critic_loss], input_dict) # Store values for stats function. policy.actor_loss = actor_loss policy.critic_loss = critic_loss policy.td_error = td_error policy.q_t = q_t # Return one loss value (even though we treat them separately in our # 2 optimizers: actor and critic). return policy.critic_loss + policy.actor_loss
def r2d2_loss(policy: Policy, model, _,
              train_batch: SampleBatch) -> TensorType:
    """Constructs the loss for R2D2TFPolicy.

    Args:
        policy: The Policy to calculate the loss for.
        model (ModelV2): The Model to calculate the loss for.
        train_batch: The training data.

    Returns:
        TensorType: A single loss tensor.
    """
    config = policy.config

    # Construct internal state inputs.
    i = 0
    state_batches = []
    while "state_in_{}".format(i) in train_batch:
        state_batches.append(train_batch["state_in_{}".format(i)])
        i += 1
    assert state_batches

    # Q-network evaluation (at t).
    q, _, _, _ = compute_q_values(
        policy,
        model,
        train_batch,
        state_batches=state_batches,
        seq_lens=train_batch.get(SampleBatch.SEQ_LENS),
        explore=False,
        is_training=True,
    )

    # Target Q-network evaluation (at t+1).
    q_target, _, _, _ = compute_q_values(
        policy,
        policy.target_model,
        train_batch,
        state_batches=state_batches,
        seq_lens=train_batch.get(SampleBatch.SEQ_LENS),
        explore=False,
        is_training=True,
    )
    if not hasattr(policy, "target_q_func_vars"):
        policy.target_q_func_vars = policy.target_model.variables()

    actions = tf.cast(train_batch[SampleBatch.ACTIONS], tf.int64)
    dones = tf.cast(train_batch[SampleBatch.DONES], tf.float32)
    rewards = train_batch[SampleBatch.REWARDS]
    weights = tf.cast(train_batch[PRIO_WEIGHTS], tf.float32)

    B = tf.shape(state_batches[0])[0]
    T = tf.shape(q)[0] // B

    # Q scores for actions which we know were selected in the given state.
    one_hot_selection = tf.one_hot(actions, policy.action_space.n)
    q_selected = tf.reduce_sum(
        tf.where(q > tf.float32.min, q, tf.zeros_like(q)) *
        one_hot_selection,
        axis=1)

    if config["double_q"]:
        best_actions = tf.argmax(q, axis=1)
    else:
        best_actions = tf.argmax(q_target, axis=1)

    best_actions_one_hot = tf.one_hot(best_actions, policy.action_space.n)
    q_target_best = tf.reduce_sum(
        tf.where(q_target > tf.float32.min, q_target,
                 tf.zeros_like(q_target)) * best_actions_one_hot,
        axis=1,
    )

    if config["num_atoms"] > 1:
        raise ValueError("Distributional R2D2 not supported yet!")
    else:
        q_target_best_masked_tp1 = (1.0 - dones) * tf.concat(
            [q_target_best[1:], tf.constant([0.0])], axis=0)

        if config["use_h_function"]:
            h_inv = h_inverse(q_target_best_masked_tp1,
                              config["h_function_epsilon"])
            target = h_function(
                rewards + config["gamma"]**config["n_step"] * h_inv,
                config["h_function_epsilon"],
            )
        else:
            target = (
                rewards +
                config["gamma"]**config["n_step"] * q_target_best_masked_tp1)

        # Seq-mask all loss-related terms.
        seq_mask = tf.sequence_mask(train_batch[SampleBatch.SEQ_LENS],
                                    T)[:, :-1]
        # Also mask away the burn-in sequence at the beginning.
        burn_in = policy.config["replay_buffer_config"]["replay_burn_in"]
        # Make sure this works for both static-graph and eager mode.
        if burn_in > 0:
            seq_mask = tf.cond(
                pred=tf.convert_to_tensor(burn_in, tf.int32) < T,
                true_fn=lambda: tf.concat(
                    [tf.fill([B, burn_in], False), seq_mask[:, burn_in:]], 1),
                false_fn=lambda: seq_mask,
            )

        def reduce_mean_valid(t):
            return tf.reduce_mean(tf.boolean_mask(t, seq_mask))

        # Make sure to use the correct time indices:
        # Q(t) - [gamma * r + Q^(t+1)]
        q_selected = tf.reshape(q_selected, [B, T])[:, :-1]
        td_error = q_selected - tf.stop_gradient(
            tf.reshape(target, [B, T])[:, :-1])
        td_error = td_error * tf.cast(seq_mask, tf.float32)
        weights = tf.reshape(weights, [B, T])[:, :-1]
        policy._total_loss = reduce_mean_valid(weights * huber_loss(td_error))
        # Store the TD-error per time chunk (b/c we need only one mean
        # prioritized replay weight per stored sequence).
        policy._td_error = tf.reduce_mean(td_error, axis=-1)
        policy._loss_stats = {
            "mean_q": reduce_mean_valid(q_selected),
            "min_q": tf.reduce_min(q_selected),
            "max_q": tf.reduce_max(q_selected),
            "mean_td_error": reduce_mean_valid(td_error),
        }

    return policy._total_loss
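

# --- Illustrative example (separate from the loss above) --------------------
# Sketch of the invertible value rescaling from the R2D2 paper that the
# `h_function` / `h_inverse` calls above rely on. These formulas are re-derived
# from the paper and may differ in detail from the repo's own helpers; the
# names ending in `_sketch` are illustrative only.
def _h_function_sketch(x, epsilon=1e-2):
    import tensorflow as tf
    # h(x) = sign(x) * (sqrt(|x| + 1) - 1) + epsilon * x
    return tf.sign(x) * (tf.sqrt(tf.abs(x) + 1.0) - 1.0) + epsilon * x


def _h_inverse_sketch(x, epsilon=1e-2):
    import tensorflow as tf
    # h^-1(x) = sign(x) * (((sqrt(1 + 4 eps (|x| + 1 + eps)) - 1) / (2 eps))^2 - 1)
    return tf.sign(x) * (tf.math.square(
        (tf.sqrt(1.0 + 4.0 * epsilon * (tf.abs(x) + 1.0 + epsilon)) - 1.0) /
        (2.0 * epsilon)) - 1.0)
# The round trip is (numerically) the identity:
# _h_inverse_sketch(_h_function_sketch(x)) ~= x, e.g. for x in [-10, 0, 3, 100].
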
def build_slateq_losses(
    policy: Policy,
    model: ModelV2,
    _,
    train_batch: SampleBatch,
) -> TensorType:
    """Constructs the choice- and Q-value losses for the SlateQTorchPolicy.

    Args:
        policy: The Policy to calculate the loss for.
        model: The Model to calculate the loss for.
        train_batch: The training data.

    Returns:
        The Q-value loss tensor.
    """
    # B=batch size
    # S=slate size
    # C=num candidates
    # E=embedding size
    # A=number of all possible slates

    # Q-value computations.
    # ---------------------
    observation = train_batch[SampleBatch.OBS]

    # user.shape: [B, E]
    user_obs = observation["user"]
    batch_size = tf.shape(user_obs)[0]
    # doc.shape: [B, C, E]
    doc_obs = list(observation["doc"].values())
    # action.shape: [B, S]
    actions = train_batch[SampleBatch.ACTIONS]

    # click_indicator.shape: [B, S]
    click_indicator = tf.cast(
        tf.stack([k["click"] for k in observation["response"]], 1),
        tf.float32)
    # item_reward.shape: [B, S]
    item_reward = tf.stack(
        [k["watch_time"] for k in observation["response"]], 1)
    # q_values.shape: [B, C]
    q_values = model.get_q_values(user_obs, doc_obs)
    # slate_q_values.shape: [B, S]
    slate_q_values = tf.gather(
        q_values, tf.cast(actions, dtype=tf.int32), batch_dims=-1)
    # Only get the Q from the clicked document.
    # replay_click_q.shape: [B]
    replay_click_q = tf.reduce_sum(
        input_tensor=slate_q_values * click_indicator,
        axis=1,
        name="replay_click_q")

    # Target computations.
    # --------------------
    next_obs = train_batch[SampleBatch.NEXT_OBS]

    # user.shape: [B, E]
    user_next_obs = next_obs["user"]
    # doc.shape: [B, C, E]
    doc_next_obs = list(next_obs["doc"].values())
    # Only compute the watch-time reward of the clicked item.
    reward = tf.reduce_sum(input_tensor=item_reward * click_indicator, axis=1)

    # TODO: Find out whether it's correct here to use obs, not next_obs!
    # Dopamine uses obs, then next_obs only for the score.
    # next_q_values = policy.target_model.get_q_values(
    #     user_next_obs, doc_next_obs)
    next_q_values = policy.target_model.get_q_values(user_obs, doc_obs)
    scores, score_no_click = score_documents(user_next_obs, doc_next_obs)

    # next_q_values_slate.shape: [B, A, S]
    next_q_values_slate = tf.gather(next_q_values, policy.slates, axis=1)
    # scores_slate.shape: [B, A, S]
    scores_slate = tf.gather(scores, policy.slates, axis=1)
    # score_no_click_slate.shape: [B, A]
    score_no_click_slate = tf.reshape(
        tf.tile(score_no_click,
                tf.shape(input=policy.slates)[:1]), [batch_size, -1])

    # next_q_target_slate.shape: [B, A]
    next_q_target_slate = tf.reduce_sum(
        input_tensor=next_q_values_slate * scores_slate, axis=2) / (
            tf.reduce_sum(input_tensor=scores_slate, axis=2) +
            score_no_click_slate)
    next_q_target_max = tf.reduce_max(input_tensor=next_q_target_slate, axis=1)

    target = reward + policy.config["gamma"] * next_q_target_max * (
        1.0 - tf.cast(train_batch["dones"], tf.float32))
    target = tf.stop_gradient(target)

    clicked = tf.reduce_sum(input_tensor=click_indicator, axis=1)
    clicked_indices = tf.squeeze(tf.where(tf.equal(clicked, 1)), axis=1)
    # clicked_indices is a vector and tf.gather selects the batch dimension.
    q_clicked = tf.gather(replay_click_q, clicked_indices)
    target_clicked = tf.gather(target, clicked_indices)

    td_error = tf.where(
        tf.cast(clicked, tf.bool),
        replay_click_q - target,
        tf.zeros_like(train_batch[SampleBatch.REWARDS]),
    )
    if policy.config["use_huber"]:
        loss = huber_loss(td_error, delta=policy.config["huber_threshold"])
    else:
        loss = tf.math.square(td_error)
    loss = tf.reduce_mean(loss)
    td_error = tf.abs(td_error)
    mean_td_error = tf.reduce_mean(td_error)

    policy._q_values = tf.reduce_mean(q_values)
    policy._q_clicked = tf.reduce_mean(q_clicked)
    policy._scores = tf.reduce_mean(scores)
    policy._score_no_click = tf.reduce_mean(score_no_click)
    policy._slate_q_values = tf.reduce_mean(slate_q_values)
    policy._replay_click_q = tf.reduce_mean(replay_click_q)
    policy._bellman_reward = tf.reduce_mean(reward)
    policy._next_q_values = tf.reduce_mean(next_q_values)
    policy._target = tf.reduce_mean(target)
    policy._next_q_target_slate = tf.reduce_mean(next_q_target_slate)
    policy._next_q_target_max = tf.reduce_mean(next_q_target_max)
    policy._target_clicked = tf.reduce_mean(target_clicked)
    policy._q_loss = loss
    policy._td_error = td_error
    policy._mean_td_error = mean_td_error
    policy._mean_actions = tf.reduce_mean(actions)

    return loss
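

# --- Illustrative example (separate from the loss above) --------------------
# Toy eager-TF2 sketch of the slate target computed above: per candidate slate,
# the item Q-values are averaged under the user-choice scores (plus the
# no-click mass), then the maximum over all slates is taken. The shapes
# (B=1 batch, A=2 slates, S=2 items) and all values are illustrative only.
def _slate_target_sketch():
    import tensorflow as tf

    next_q_values_slate = tf.constant([[[1.0, 2.0], [0.5, 4.0]]])   # [B, A, S]
    scores_slate = tf.constant([[[0.3, 0.6], [0.1, 0.2]]])          # [B, A, S]
    score_no_click_slate = tf.constant([[0.1, 0.1]])                # [B, A]

    next_q_target_slate = tf.reduce_sum(
        next_q_values_slate * scores_slate, axis=2) / (
            tf.reduce_sum(scores_slate, axis=2) + score_no_click_slate)
    next_q_target_max = tf.reduce_max(next_q_target_slate, axis=1)  # [B]
    # Slate 0: (0.3 * 1.0 + 0.6 * 2.0) / (0.3 + 0.6 + 0.1) = 1.5
    # Slate 1: (0.1 * 0.5 + 0.2 * 4.0) / (0.1 + 0.2 + 0.1) = 2.125
    # -> next_q_target_max == [2.125]
    return next_q_target_max
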