def __init__(self,
             actions,
             actions_logp,
             actions_entropy,
             dones,
             behaviour_action_logp,
             behaviour_logits,
             target_logits,
             discount,
             rewards,
             values,
             bootstrap_value,
             dist_class,
             model,
             valid_mask,
             config,
             vf_loss_coeff=0.5,
             entropy_coeff=0.01,
             clip_rho_threshold=1.0,
             clip_pg_rho_threshold=1.0):
    """Policy gradient loss with vtrace importance weighting.

    VTraceLoss takes tensors of shape [T, B, ...], where `B` is the
    batch_size. The reason we need to know `B` is for V-trace to properly
    handle episode cut boundaries.

    Args:
        actions: An int|float32 tensor of shape [T, B, ACTION_SPACE].
        actions_logp: A float32 tensor of shape [T, B].
        actions_entropy: A float32 tensor of shape [T, B].
        dones: A bool tensor of shape [T, B].
        behaviour_action_logp: Tensor of shape [T, B].
        behaviour_logits: A list with length of ACTION_SPACE of float32
            tensors of shapes
            [T, B, ACTION_SPACE[0]],
            ...,
            [T, B, ACTION_SPACE[-1]]
        target_logits: A list with length of ACTION_SPACE of float32
            tensors of shapes
            [T, B, ACTION_SPACE[0]],
            ...,
            [T, B, ACTION_SPACE[-1]]
        discount: A float32 scalar.
        rewards: A float32 tensor of shape [T, B].
        values: A float32 tensor of shape [T, B].
        bootstrap_value: A float32 tensor of shape [B].
        dist_class: action distribution class for logits.
        valid_mask: A bool tensor of valid RNN input elements (#2992).
        config: Trainer config dict.
    """

    # Compute vtrace on the CPU for better perf.
    with tf.device("/cpu:0"):
        self.vtrace_returns = vtrace.multi_from_logits(
            behaviour_action_log_probs=behaviour_action_logp,
            behaviour_policy_logits=behaviour_logits,
            target_policy_logits=target_logits,
            actions=tf.unstack(actions, axis=2),
            discounts=tf.to_float(~dones) * discount,
            rewards=rewards,
            values=values,
            bootstrap_value=bootstrap_value,
            dist_class=dist_class,
            model=model,
            clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32),
            clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold,
                                          tf.float32))
        self.value_targets = self.vtrace_returns.vs

    # The policy gradients loss
    self.pi_loss = -tf.reduce_sum(
        tf.boolean_mask(actions_logp * self.vtrace_returns.pg_advantages,
                        valid_mask))

    # The baseline loss
    delta = tf.boolean_mask(values - self.vtrace_returns.vs, valid_mask)
    self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))

    # The entropy loss
    self.entropy = tf.reduce_sum(
        tf.boolean_mask(actions_entropy, valid_mask))

    # The summed weighted loss
    self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
                       self.entropy * entropy_coeff)
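# ---------------------------------------------------------------------------
# Not part of the original loss above: a minimal NumPy sketch with
# hypothetical toy shapes (T=4 timesteps, B=2 batch slots) of how the masked
# reductions behave, assuming `pg_advantages` and `vs` have already been
# produced by V-trace. It only illustrates the [T, B] layout and the boolean
# mask; it is not a substitute for the TF graph built in __init__ above.
import numpy as np

T, B = 4, 2
rng = np.random.default_rng(0)
actions_logp = rng.normal(size=(T, B))      # log pi(a_t | s_t)
pg_advantages = rng.normal(size=(T, B))     # stand-in for V-trace pg_advantages
values = rng.normal(size=(T, B))
vs = rng.normal(size=(T, B))                # stand-in for V-trace value targets
entropy = rng.uniform(size=(T, B))
valid_mask = np.array([[True, True],
                       [True, True],
                       [True, False],       # padded RNN step, masked out
                       [True, False]])

pi_loss = -np.sum((actions_logp * pg_advantages)[valid_mask])
vf_loss = 0.5 * np.sum(((values - vs)[valid_mask]) ** 2)
entropy_bonus = np.sum(entropy[valid_mask])
total_loss = pi_loss + 0.5 * vf_loss - 0.01 * entropy_bonus
# ---------------------------------------------------------------------------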
def __init__(self,
             actions,
             prev_actions_logp,
             actions_logp,
             action_kl,
             actions_entropy,
             dones,
             behaviour_logits,
             target_logits,
             discount,
             rewards,
             values,
             bootstrap_value,
             valid_mask,
             vf_loss_coeff=0.5,
             entropy_coeff=-0.01,
             clip_rho_threshold=1.0,
             clip_pg_rho_threshold=1.0,
             clip_param=0.3):
    """PPO surrogate loss with vtrace importance weighting.

    VTraceLoss takes tensors of shape [T, B, ...], where `B` is the
    batch_size. The reason we need to know `B` is for V-trace to properly
    handle episode cut boundaries.

    Arguments:
        actions: An int32 tensor of shape [T, B, NUM_ACTIONS].
        prev_actions_logp: A float32 tensor of shape [T, B].
        actions_logp: A float32 tensor of shape [T, B].
        action_kl: A float32 tensor of shape [T, B].
        actions_entropy: A float32 tensor of shape [T, B].
        dones: A bool tensor of shape [T, B].
        behaviour_logits: A float32 tensor of shape [T, B, NUM_ACTIONS].
        target_logits: A float32 tensor of shape [T, B, NUM_ACTIONS].
        discount: A float32 scalar.
        rewards: A float32 tensor of shape [T, B].
        values: A float32 tensor of shape [T, B].
        bootstrap_value: A float32 tensor of shape [B].
        valid_mask: A bool tensor of valid RNN input elements (#2992).
    """

    # Compute vtrace on the CPU for better perf.
    with tf.device("/cpu:0"):
        self.vtrace_returns = vtrace.multi_from_logits(
            behaviour_policy_logits=behaviour_logits,
            target_policy_logits=target_logits,
            actions=tf.unstack(tf.cast(actions, tf.int32), axis=2),
            discounts=tf.to_float(~dones) * discount,
            rewards=rewards,
            values=values,
            bootstrap_value=bootstrap_value,
            clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32),
            clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold,
                                          tf.float32))

    logp_ratio = tf.exp(actions_logp - prev_actions_logp)

    advantages = self.vtrace_returns.pg_advantages
    surrogate_loss = tf.minimum(
        advantages * logp_ratio,
        advantages * tf.clip_by_value(logp_ratio, 1 - clip_param,
                                      1 + clip_param))

    self.mean_kl = tf.reduce_mean(action_kl)
    self.pi_loss = -tf.reduce_sum(surrogate_loss)

    # The baseline loss
    delta = tf.boolean_mask(values - self.vtrace_returns.vs, valid_mask)
    self.value_targets = self.vtrace_returns.vs
    self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))

    # The entropy loss. Note that `entropy_coeff` defaults to -0.01 here, so
    # adding `self.entropy * entropy_coeff` below still subtracts an entropy
    # bonus from the total loss.
    self.entropy = tf.reduce_sum(
        tf.boolean_mask(actions_entropy, valid_mask))

    # The summed weighted loss
    self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff +
                       self.entropy * entropy_coeff)
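# ---------------------------------------------------------------------------
# Not part of the original code: a small NumPy sketch, with hypothetical
# numbers, of the clipped PPO surrogate used above. The `minimum` takes the
# more pessimistic of the clipped and unclipped objectives, so the update is
# not rewarded for pushing the ratio outside [1 - clip_param, 1 + clip_param].
import numpy as np

clip_param = 0.3
logp_ratio = np.array([0.5, 1.0, 1.6])    # exp(logp_new - logp_old)
advantages = np.array([1.0, -2.0, 1.0])   # stand-in for V-trace pg_advantages

surrogate = np.minimum(
    advantages * logp_ratio,
    advantages * np.clip(logp_ratio, 1 - clip_param, 1 + clip_param))
# -> [0.5, -2.0, 1.3]: the third entry is capped at 1.3 instead of 1.6.
# ---------------------------------------------------------------------------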
def __init__(self,
             actions,
             prev_actions_logp,
             actions_logp,
             old_policy_actions_logp,
             action_kl,
             actions_entropy,
             dones,
             behaviour_logits,
             old_policy_behaviour_logits,
             target_logits,
             discount,
             rewards,
             values,
             bootstrap_value,
             config,
             dist_class,
             valid_mask,
             vf_loss_coeff=0.5,
             entropy_coeff=0.01,
             clip_rho_threshold=1.0,
             clip_pg_rho_threshold=1.0,
             clip_param=0.3,
             cur_kl_coeff=None,
             use_kl_loss=False):
    """APPO Loss, with IS modifications and V-trace for Advantage Estimation

    VTraceLoss takes tensors of shape [T, B, ...], where `B` is the
    batch_size. The reason we need to know `B` is for V-trace to properly
    handle episode cut boundaries.

    Arguments:
        actions: An int|float32 tensor of shape [T, B, logit_dim].
        prev_actions_logp: A float32 tensor of shape [T, B].
        actions_logp: A float32 tensor of shape [T, B].
        old_policy_actions_logp: A float32 tensor of shape [T, B].
        action_kl: A float32 tensor of shape [T, B].
        actions_entropy: A float32 tensor of shape [T, B].
        dones: A bool tensor of shape [T, B].
        behaviour_logits: A float32 tensor of shape [T, B, logit_dim].
        old_policy_behaviour_logits: A float32 tensor of shape
            [T, B, logit_dim].
        target_logits: A float32 tensor of shape [T, B, logit_dim].
        discount: A float32 scalar.
        rewards: A float32 tensor of shape [T, B].
        values: A float32 tensor of shape [T, B].
        bootstrap_value: A float32 tensor of shape [B].
        config: Trainer config dict.
        dist_class: action distribution class for logits.
        valid_mask: A bool tensor of valid RNN input elements (#2992).
        vf_loss_coeff (float): Coefficient of the value function loss.
        entropy_coeff (float): Coefficient of the entropy regularizer.
        clip_param (float): Clip parameter.
        cur_kl_coeff (float): Coefficient for KL loss.
        use_kl_loss (bool): If true, use KL loss.
    """

    def reduce_mean_valid(t):
        return tf.reduce_mean(tf.boolean_mask(t, valid_mask))

    # Compute vtrace on the CPU for better perf.
    with tf.device("/cpu:0"):
        self.vtrace_returns = vtrace.multi_from_logits(
            behaviour_policy_logits=behaviour_logits,
            target_policy_logits=old_policy_behaviour_logits,
            actions=tf.unstack(actions, axis=2),
            discounts=tf.to_float(~dones) * discount,
            rewards=rewards,
            values=values,
            bootstrap_value=bootstrap_value,
            config=config,
            dist_class=dist_class,
            clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32),
            clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold,
                                          tf.float32))

    self.is_ratio = tf.clip_by_value(
        tf.exp(prev_actions_logp - old_policy_actions_logp), 0.0, 2.0)
    logp_ratio = self.is_ratio * tf.exp(actions_logp - prev_actions_logp)

    advantages = self.vtrace_returns.pg_advantages
    surrogate_loss = tf.minimum(
        advantages * logp_ratio,
        advantages * tf.clip_by_value(logp_ratio, 1 - clip_param,
                                      1 + clip_param))

    self.mean_kl = reduce_mean_valid(action_kl)
    self.pi_loss = -reduce_mean_valid(surrogate_loss)

    # The baseline loss
    delta = values - self.vtrace_returns.vs
    self.value_targets = self.vtrace_returns.vs
    self.vf_loss = 0.5 * reduce_mean_valid(tf.square(delta))

    # The entropy loss
    self.entropy = reduce_mean_valid(actions_entropy)

    # The summed weighted loss
    self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
                       self.entropy * entropy_coeff)

    # Optional additional KL Loss
    if use_kl_loss:
        self.total_loss += cur_kl_coeff * self.mean_kl
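# ---------------------------------------------------------------------------
# Not part of the original code: a NumPy sketch (hypothetical numbers) of the
# APPO importance-sampling correction above. `prev_actions_logp` comes from
# the behaviour policy that generated the data, `old_policy_actions_logp`
# from the old/target policy, and `actions_logp` from the current learner.
import numpy as np

prev_actions_logp = np.array([-1.0, -0.5])        # behaviour policy
old_policy_actions_logp = np.array([-1.2, -2.0])  # old/target policy
actions_logp = np.array([-0.9, -0.6])             # current policy

# Ratio between behaviour and old policy, clipped into [0, 2] for stability.
is_ratio = np.clip(
    np.exp(prev_actions_logp - old_policy_actions_logp), 0.0, 2.0)
# Effective ratio fed into the PPO surrogate. When the clip is inactive,
# is_ratio * exp(actions_logp - prev_actions_logp)
#   == exp(actions_logp - old_policy_actions_logp),
# i.e. the ratio of the current policy to the old policy.
logp_ratio = is_ratio * np.exp(actions_logp - prev_actions_logp)
# ---------------------------------------------------------------------------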
def __init__(
    self,
    actions,
    actions_logp,
    actions_entropy,
    message_entropy,
    dones,
    behaviour_action_logp,
    behaviour_logits,
    target_logits,
    discount,
    rewards,
    values,
    bootstrap_value,
    dist_class,
    model,
    valid_mask,
    config,
    vf_loss_coeff=0.5,
    entropy_coeff=0.01,
    message_entropy_coeff=0.0,
    clip_rho_threshold=1.0,
    clip_pg_rho_threshold=1.0,
    use_cpc=True,
    cpc_ins=None,
    cpc_preds=None,
    cpc_coeff=10.0,
    use_sender_bias=False,
    l_ps_lambda=3.0,
    entropy_target=1.0,
    average_message_entropy=None,
    sender_bias_coeff=0.1,
    use_receiver_bias=False,
    l_ce_coeff=0.001,
    l_pl_coeff=0.01,
    message_p=None,
    no_message_p=None,
    **kwargs,
):
    """See VTraceLoss class.

    Args:
        use_cpc: True if the CPC loss should be added.
        cpc_ins: Input encodings of CPC (shape: [T, B, code_size]).
        cpc_preds: Output encodings of CPC (shape: [T, B, length, code_size]).
        cpc_coeff: Coefficient for the CPC loss.
        use_sender_bias: True if the sender bias loss should be added.
        l_ps_lambda: Weight of the squared (message_entropy - entropy_target)
            term in the sender bias loss.
    """

    # Compute vtrace on the CPU for better perf.
    with tf.device("/cpu:0"):
        self.vtrace_returns = vtrace.multi_from_logits(
            behaviour_action_log_probs=behaviour_action_logp,
            behaviour_policy_logits=behaviour_logits,
            target_policy_logits=target_logits,
            actions=tf.unstack(actions, axis=2),
            discounts=tf.to_float(~dones) * discount,
            rewards=rewards,
            values=values,
            bootstrap_value=bootstrap_value,
            dist_class=dist_class,
            model=model,
            clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32),
            clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold, tf.float32),
        )
        self.value_targets = self.vtrace_returns.vs

    # The policy gradients loss
    self.pi_loss = -tf.reduce_sum(
        tf.boolean_mask(actions_logp * self.vtrace_returns.pg_advantages, valid_mask)
    )

    # The baseline loss
    delta = tf.boolean_mask(values - self.vtrace_returns.vs, valid_mask)
    self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))

    # The entropy losses (action entropy and message entropy)
    self.entropy = tf.reduce_sum(tf.boolean_mask(actions_entropy, valid_mask))
    self.message_entropy = tf.reduce_sum(tf.boolean_mask(message_entropy, valid_mask))

    # The summed weighted loss
    self.total_loss = (
        self.pi_loss
        + self.vf_loss * vf_loss_coeff
        - self.entropy * entropy_coeff
        - self.message_entropy * message_entropy_coeff
    )

    if use_cpc:
        # CPC loss
        with tf.variable_scope("cpc_loss"):
            losses = []
            cpc_length = cpc_preds.shape.as_list()[2]
            T = tf.shape(cpc_preds)[0]
            # Scaling coeff to take mean over k
            scaling_coeff = tf.cast(
                tf.reverse(tf.minimum(tf.range(1, T - 1 + 1), cpc_length), axis=[0]),
                dtype=tf.float32,
            )
            for k in range(1, cpc_length + 1):
                loss = CPCLayer(k, name=f"cpc_{k}")([cpc_ins, cpc_preds[:, :, k - 1]])
                losses.append(tf.reduce_sum(loss / scaling_coeff[: T - k]))
            self.cpc_loss = tf.reduce_sum(tf.stack(losses), name="cpc_loss")
            self.total_loss += self.cpc_loss * cpc_coeff
    else:
        self.cpc_loss = tf.constant(np.nan)

    if use_sender_bias:
        # Sender bias loss
        with tf.variable_scope("sender_bias"):
            self.average_message_entropy = average_message_entropy
            self.sender_bias_loss = (
                tf.reduce_sum(l_ps_lambda * (message_entropy - entropy_target) ** 2)
                - average_message_entropy
            )
            self.total_loss += self.sender_bias_loss * sender_bias_coeff
    else:
        self.average_message_entropy = tf.constant(np.nan)
        self.sender_bias_loss = tf.constant(np.nan)

    if use_receiver_bias:
        # Receiver bias loss
        with tf.variable_scope("receiver_bias"):
            self.l_ce = -tf.reduce_sum(
                tf.stop_gradient(message_p) * tf.log(no_message_p)
            )
            self.l_pl = tf.reduce_sum(
                tf.abs(message_p - tf.stop_gradient(no_message_p))
            )
            self.total_loss += self.l_ce * l_ce_coeff - self.l_pl * l_pl_coeff
    else:
        self.l_ce = tf.constant(np.nan)
        self.l_pl = tf.constant(np.nan)
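# ---------------------------------------------------------------------------
# Not part of the original code: a NumPy sketch (toy sizes T=6, cpc_length=3)
# of the `scaling_coeff` used in the CPC branch above. Per the original
# comment ("Scaling coeff to take mean over k"), each timestep is predicted
# by up to `cpc_length` different offsets k, and dividing the per-step CPC
# losses by this count turns the sum over k into a mean over k.
import numpy as np

T, cpc_length = 6, 3
scaling_coeff = np.minimum(np.arange(1, T), cpc_length)[::-1].astype(np.float32)
# -> [3., 3., 3., 2., 1.]
# For offset k, the k-step CPC loss has T - k entries and is divided
# elementwise by scaling_coeff[:T - k].
# ---------------------------------------------------------------------------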
def __init__(self,
             actions,
             actions_logp,
             actions_entropy,
             dones,
             behaviour_action_logp,
             behaviour_logits,
             target_logits,
             discount,
             rewards,
             values,
             bootstrap_value,
             dist_class,
             model,
             valid_mask,
             config,
             vf_loss_coeff=0.5,
             entropy_coeff=0.01,
             clip_rho_threshold=1.0,
             clip_pg_rho_threshold=1.0):
    # Compute vtrace on the CPU for better perf.
    with tf.device("/cpu:0"):
        self.vtrace_returns = vtrace.multi_from_logits(
            behaviour_action_log_probs=behaviour_action_logp,
            behaviour_policy_logits=behaviour_logits,
            target_policy_logits=target_logits,
            actions=tf.unstack(actions, axis=2),
            discounts=tf.to_float(~dones) * discount,
            rewards=rewards,
            values=values,
            bootstrap_value=bootstrap_value,
            dist_class=dist_class,
            model=model,
            clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32),
            clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold,
                                          tf.float32))
        self.value_targets = self.vtrace_returns.vs

    advantages = self.vtrace_returns.pg_advantages
    # `advantages` has shape [T, B], where B is the number of sample batches
    # in the train batch. Normalize the advantages over the whole train batch.
    advantages = (advantages - tf.reduce_mean(advantages)) / \
        tf.maximum(1e-4, tf.math.reduce_std(advantages))

    # The policy gradients loss
    self.pi_loss = -tf.reduce_sum(
        tf.boolean_mask(actions_logp * advantages, valid_mask))

    # The baseline loss
    delta = tf.boolean_mask(values - self.vtrace_returns.vs, valid_mask)
    self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))

    # The entropy loss
    self.entropy = tf.reduce_sum(
        tf.boolean_mask(actions_entropy, valid_mask))

    # The summed weighted loss
    self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
                       self.entropy * entropy_coeff)
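# ---------------------------------------------------------------------------
# Not part of the original code: a NumPy sketch of the batch-wide advantage
# normalization used above, with hypothetical [T, B] values. The 1e-4 floor
# on the standard deviation avoids dividing by (near) zero when all
# advantages in the train batch are nearly equal.
import numpy as np

advantages = np.array([[0.5, -1.0],
                       [2.0, 0.1]])
normalized = (advantages - advantages.mean()) / max(1e-4, advantages.std())
# ---------------------------------------------------------------------------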
def __init__(self,
             actions,
             actions_logp,
             actions_entropy,
             dones,
             behaviour_logits,
             target_logits,
             discount,
             rewards,
             values,
             bootstrap_value,
             valid_mask,
             vf_loss_coeff=0.5,
             entropy_coeff=-0.01,
             clip_rho_threshold=1.0,
             clip_pg_rho_threshold=1.0):
    """Policy gradient loss with vtrace importance weighting.

    VTraceLoss takes tensors of shape [T, B, ...], where `B` is the
    batch_size. The reason we need to know `B` is for V-trace to properly
    handle episode cut boundaries.

    Args:
        actions: An int32 tensor of shape [T, B, ACTION_SPACE].
        actions_logp: A float32 tensor of shape [T, B].
        actions_entropy: A float32 tensor of shape [T, B].
        dones: A bool tensor of shape [T, B].
        behaviour_logits: A list with length of ACTION_SPACE of float32
            tensors of shapes
            [T, B, ACTION_SPACE[0]],
            ...,
            [T, B, ACTION_SPACE[-1]]
        target_logits: A list with length of ACTION_SPACE of float32
            tensors of shapes
            [T, B, ACTION_SPACE[0]],
            ...,
            [T, B, ACTION_SPACE[-1]]
        discount: A float32 scalar.
        rewards: A float32 tensor of shape [T, B].
        values: A float32 tensor of shape [T, B].
        bootstrap_value: A float32 tensor of shape [B].
        valid_mask: A bool tensor of valid RNN input elements (#2992).
    """

    # Compute vtrace on the CPU for better perf.
    with tf.device("/cpu:0"):
        self.vtrace_returns = vtrace.multi_from_logits(
            behaviour_policy_logits=behaviour_logits,
            target_policy_logits=target_logits,
            actions=tf.unstack(tf.cast(actions, tf.int32), axis=2),
            discounts=tf.to_float(~dones) * discount,
            rewards=rewards,
            values=values,
            bootstrap_value=bootstrap_value,
            clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32),
            clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold,
                                          tf.float32))

    # The policy gradients loss
    self.pi_loss = -tf.reduce_sum(
        tf.boolean_mask(actions_logp * self.vtrace_returns.pg_advantages,
                        valid_mask))

    # The baseline loss
    delta = tf.boolean_mask(values - self.vtrace_returns.vs, valid_mask)
    self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))

    # The entropy loss
    self.entropy = tf.reduce_sum(
        tf.boolean_mask(actions_entropy, valid_mask))

    # The summed weighted loss
    self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff +
                       self.entropy * entropy_coeff)
def __init__(self,
             actions,
             prev_actions_logp,
             actions_logp,
             old_policy_actions_logp,
             action_kl,
             actions_entropy,
             dones,
             behaviour_action_log_probs,
             behaviour_logits,
             old_policy_behaviour_logits,
             target_logits,
             discount,
             rewards,
             values,
             bootstrap_value,
             dist_class,
             model,
             valid_mask,
             vf_loss_coeff=0.5,
             entropy_coeff=0.01,
             clip_rho_threshold=1.0,
             clip_pg_rho_threshold=1.0,
             clip_param=0.3,
             cur_kl_coeff=None,
             normalize_advantage=True  # use_kl_loss=False
             ):
    """APPO Loss, with IS modifications and V-trace for Advantage Estimation

    VTraceLoss takes tensors of shape [T, B, ...], where `B` is the
    batch_size. The reason we need to know `B` is for V-trace to properly
    handle episode cut boundaries.

    Arguments:
        actions: An int|float32 tensor of shape [T, B, logit_dim].
        prev_actions_logp: A float32 tensor of shape [T, B].
        actions_logp: A float32 tensor of shape [T, B].
        old_policy_actions_logp: A float32 tensor of shape [T, B].
        action_kl: A float32 tensor of shape [T, B].
        actions_entropy: A float32 tensor of shape [T, B].
        dones: A bool tensor of shape [T, B].
        behaviour_logits: A float32 tensor of shape [T, B, logit_dim].
        old_policy_behaviour_logits: A float32 tensor of shape
            [T, B, logit_dim].
        target_logits: A float32 tensor of shape [T, B, logit_dim].
        discount: A float32 scalar.
        rewards: A float32 tensor of shape [T, B].
        values: A float32 tensor of shape [T, B].
        bootstrap_value: A float32 tensor of shape [B].
        dist_class: action distribution class for logits.
        model: backing ModelV2 instance.
        valid_mask: A bool tensor of valid RNN input elements (#2992).
        vf_loss_coeff (float): Coefficient of the value function loss.
        entropy_coeff (float): Coefficient of the entropy regularizer.
        clip_param (float): Clip parameter.
        cur_kl_coeff (float): Coefficient for KL loss.
        normalize_advantage (bool): If true, normalize the V-trace
            advantages over the whole train batch.
    """

    def reduce_mean_valid(t):
        return tf.reduce_mean(tf.boolean_mask(t, valid_mask))

    # Compute vtrace on the CPU for better perf.
    with tf.device("/cpu:0"):
        self.vtrace_returns = vtrace.multi_from_logits(
            behaviour_action_log_probs=behaviour_action_log_probs,
            behaviour_policy_logits=behaviour_logits,
            target_policy_logits=old_policy_behaviour_logits,
            actions=tf.unstack(actions, axis=2),
            discounts=tf.to_float(~dones) * discount,
            rewards=rewards,
            values=values,
            bootstrap_value=bootstrap_value,
            dist_class=dist_class,
            model=model,
            clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32),
            clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold,
                                          tf.float32))

    # Importance-sampling correction between the behaviour policy and the
    # old (target) policy, clipped for stability.
    self.is_ratio = tf.clip_by_value(
        tf.exp(prev_actions_logp - old_policy_actions_logp), 0.0, 2.0)
    logp_ratio = self.is_ratio * tf.exp(actions_logp - prev_actions_logp)

    advantages = self.vtrace_returns.pg_advantages
    if normalize_advantage:
        advantages = (advantages - tf.reduce_mean(advantages)) / tf.maximum(
            1e-4, tf.math.reduce_std(advantages))
    self.advantage = advantages
    self.debug_ratio = logp_ratio

    surrogate_loss = tf.minimum(
        advantages * logp_ratio,
        advantages * tf.clip_by_value(logp_ratio, 1 - clip_param,
                                      1 + clip_param))

    self.mean_kl = reduce_mean_valid(action_kl)
    self.mean_policy_loss = -reduce_mean_valid(surrogate_loss)

    # The baseline loss
    delta = values - self.vtrace_returns.vs
    self.value_targets = self.vtrace_returns.vs
    vf_loss = tf.square(delta)
    self.mean_vf_loss = reduce_mean_valid(vf_loss)

    # The entropy loss
    self.mean_entropy = reduce_mean_valid(actions_entropy)

    # The summed weighted loss. The valid-element mean is taken once over the
    # combined per-timestep term (surrogate, KL penalty, value error, entropy)
    # instead of combining the individual masked means.
    self.loss = reduce_mean_valid(-surrogate_loss +
                                  cur_kl_coeff * action_kl +
                                  vf_loss_coeff * vf_loss -
                                  entropy_coeff * actions_entropy)