def loss_fn(self):
    adv = tf.placeholder(tf.float32, [None], name="advantages")
    returns = tf.placeholder(tf.float32, [None], name="returns")

    policy_loss = -tf.reduce_mean(self.policy.logli * adv)
    value_loss = tf.reduce_mean((self.value - returns)**2) * self.value_coef
    entropy_loss = tf.reduce_mean(self.policy.entropy) * self.entropy_coef
    # we want to reduce policy and value errors, and maximize entropy,
    # but since the optimizer is minimizing, the signs are opposite
    full_loss = policy_loss + value_loss - entropy_loss

    try:
        with open("loss_fn.txt", "x+") as f:
            f.write("out\n")
            f.write("full_loss: {0} type: {1}\n".format(
                type(full_loss), full_loss.dtype))
            f.write("policy_loss: {0} type: {1}\n".format(
                type(policy_loss), policy_loss.dtype))
            f.write("value_loss: {0} type: {1}\n".format(
                type(value_loss), value_loss.dtype))
            f.write("entropy_loss: {0} type: {1}\n".format(
                type(entropy_loss), entropy_loss.dtype))
            f.write("adv: {0} type: {1}\n".format(type(adv), adv.dtype))
            f.write("returns: {0} type: {1}\n".format(
                type(returns), returns.dtype))
    except FileExistsError:
        print("")

    return full_loss, [policy_loss, value_loss, entropy_loss], [adv, returns]
def true_fn(images):
    if augment_entire_batch:
        image_2 = images
        mean_color = tf.reduce_mean(image_2, axis=[1, 2], keepdims=True)
        print(mean_color.shape)
    else:
        image_1, image_2 = tf.unstack(images)
        mean_color = tf.reduce_mean(image_2, axis=[0, 1], keepdims=True)

    def body(var_img, mean_color):
        x0 = tf.random.uniform([], 0, width, dtype=tf.int32)
        y0 = tf.random.uniform([], 0, height, dtype=tf.int32)
        dx = tf.random.uniform([], min_size, max_size, dtype=tf.int32)
        dy = tf.random.uniform([], min_size, max_size, dtype=tf.int32)
        x = tf.range(width)
        x_mask = (x0 <= x) & (x < x0 + dx)
        y = tf.range(height)
        y_mask = (y0 <= y) & (y < y0 + dy)
        mask = x_mask & y_mask[:, tf.newaxis]
        mask = tf.cast(mask[:, :, tf.newaxis], image_2.dtype)
        result = var_img * (1 - mask) + mean_color * mask
        return result

    # Perform at least one erase operation.
    image_2 = body(image_2, mean_color)
    # Perform additional erase operations.
    for _ in range(max_operations - 1):
        perform_erase = tf.less(
            tf.random.uniform([]), probability_additional_operations)
        image_2 = tf.cond(perform_erase,
                          lambda: body(image_2, mean_color),
                          lambda: image_2)
    if augment_entire_batch:
        images = image_2
    else:
        images = tf.stack([image_1, image_2])
    return images
def _mine(self, x_in, y_in):
    """Mutual Information Neural Estimator.

    Implements the mutual information neural estimator from
    Belghazi et al., "Mutual Information Neural Estimation",
    http://proceedings.mlr.press/v80/belghazi18a/belghazi18a.pdf

    'DV': sup_T E_P(T) - log E_Q(exp(T))
    where P is the joint distribution of X and Y, and Q is the product of
    the marginal distributions of P. DV is a lower bound for
    KLD(P||Q) = MI(X, Y).
    """
    y_in_tran = transpose2(y_in, 1, 0)
    y_shuffle_tran = math_ops.shuffle(y_in_tran)
    y_shuffle = transpose2(y_shuffle_tran, 1, 0)

    # propagate the forward pass
    T_xy, _ = self._network([x_in, y_in])
    T_x_y, _ = self._network([x_in, y_shuffle])

    # compute the negative loss (maximize loss == minimize -loss)
    mean_exp_T_x_y = tf.reduce_mean(tf.math.exp(T_x_y), axis=1)
    loss = tf.reduce_mean(T_xy, axis=1) - tf.math.log(mean_exp_T_x_y)
    loss = tf.squeeze(loss, axis=-1)  # Mutual Information
    return loss
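# A minimal, self-contained sketch of the Donsker-Varadhan (DV) bound used in
# _mine above, assuming the critic scores T(x, y) are already available as
# plain tensors. The names `joint_scores` and `marginal_scores` are
# hypothetical stand-ins, not part of the original code:
#   MI(X, Y) >= E_P[T] - log E_Q[exp(T)]
import tensorflow as tf


def dv_lower_bound(joint_scores, marginal_scores):
    """DV lower bound on MI from critic scores of shape [batch]."""
    return (tf.reduce_mean(joint_scores)
            - tf.math.log(tf.reduce_mean(tf.math.exp(marginal_scores))))


# Usage sketch with dummy scores (replace with real critic outputs):
# mi_estimate = dv_lower_bound(t_on_joint_samples, t_on_shuffled_samples)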
def loss_fn(self):
    adv = tf.placeholder(tf.float32, [None], name="advantages")
    returns = tf.placeholder(tf.float32, [None], name="returns")
    logli_old = tf.placeholder(tf.float32, [None], name="logli_old")
    value_old = tf.placeholder(tf.float32, [None], name="value_old")

    ratio = tf.exp(self.policy.logli - logli_old)
    clipped_ratio = tf.clip_by_value(ratio, 1 - self.clip_ratio,
                                     1 + self.clip_ratio)

    value_err = (self.value - returns)**2
    if self.clip_value > 0.0:
        clipped_value = tf.clip_by_value(self.value,
                                         value_old - self.clip_value,
                                         value_old + self.clip_value)
        clipped_value_err = (clipped_value - returns)**2
        value_err = tf.maximum(value_err, clipped_value_err)

    policy_loss = -tf.reduce_mean(tf.minimum(adv * ratio, adv * clipped_ratio))
    value_loss = tf.reduce_mean(value_err) * self.value_coef
    entropy_loss = tf.reduce_mean(self.policy.entropy) * self.entropy_coef

    # we want to reduce policy and value errors, and maximize entropy,
    # but since the optimizer is minimizing, the signs are opposite
    full_loss = policy_loss + value_loss - entropy_loss

    return full_loss, [policy_loss, value_loss, entropy_loss], \
        [adv, returns, logli_old, value_old]
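# A minimal eager-mode sketch of the same clipped PPO objective, assuming the
# log-probabilities, values and targets are already plain tensors. All
# argument names below are hypothetical, not part of the class above.
import tensorflow as tf


def ppo_losses(logp, logp_old, adv, value, value_old, returns,
               clip_ratio=0.2, clip_value=0.0,
               value_coef=0.5, entropy_coef=0.01, entropy=None):
    ratio = tf.exp(logp - logp_old)
    clipped_ratio = tf.clip_by_value(ratio, 1 - clip_ratio, 1 + clip_ratio)
    policy_loss = -tf.reduce_mean(tf.minimum(adv * ratio, adv * clipped_ratio))

    value_err = (value - returns) ** 2
    if clip_value > 0.0:
        clipped_value = tf.clip_by_value(value, value_old - clip_value,
                                         value_old + clip_value)
        value_err = tf.maximum(value_err, (clipped_value - returns) ** 2)
    value_loss = tf.reduce_mean(value_err) * value_coef

    entropy_loss = 0.0 if entropy is None else tf.reduce_mean(entropy) * entropy_coef
    return policy_loss + value_loss - entropy_loss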
def _summary():
    with tf.name_scope('ActorCriticLoss'):
        tf.summary.scalar("values", tf.reduce_mean(value))
        tf.summary.scalar("returns", tf.reduce_mean(returns))
        tf.summary.scalar("advantages", tf.reduce_mean(advantages))
        tf.summary.scalar("explained_variance_of_return_by_value",
                          common.explained_variance(value, returns))
def quantile_loss(y, y_hat, k=4):
    k = np.linspace(0., 1., k)
    loss = 0.
    y = tf.squeeze(y, axis=2)
    for idx, q in enumerate(k):
        error = tf.subtract(y, y_hat[:, :, idx])
        # pinball loss: max(q * error, (q - 1) * error)
        loss += tf.reduce_mean(tf.maximum(q * error, (q - 1) * error), axis=-1)
    return tf.reduce_mean(loss)
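# Usage sketch for quantile_loss, assuming y has shape [batch, time, 1] and
# y_hat carries one channel per quantile, shape [batch, time, k]. These shapes
# are inferred from the squeeze/indexing above, not stated in the original.
import numpy as np
import tensorflow as tf

y_true = tf.random.normal([8, 24, 1])
y_pred = tf.random.normal([8, 24, 4])   # 4 quantiles: 0.0, 1/3, 2/3, 1.0
loss_value = quantile_loss(y_true, y_pred, k=4)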
def _loss_op(self):
    with tf.name_scope("loss_op"):
        self.d_loss = tf.reduce_mean(self._fake_d) - tf.reduce_mean(self._true_d)
        self.g_loss = -tf.reduce_mean(self._fake_d)
        # reg = self._reg(tf.shape(self.x)[0], self.d, self.x, self.x_fake)
        # self.d_loss += reg
        self.loss = [self.d_loss, self.g_loss]
def _summary():
    with self.name_scope:
        tf.summary.scalar("values", tf.reduce_mean(value))
        tf.summary.scalar("returns", tf.reduce_mean(returns))
        tf.summary.scalar("advantages/mean", tf.reduce_mean(advantages))
        tf.summary.histogram("advantages/value", advantages)
        tf.summary.scalar("explained_variance_of_return_by_value",
                          common.explained_variance(value, returns))
def _variable_summaries(var):
    """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)
def loss_fn(self):
    adv = tf.placeholder(tf.float32, [None], name="advantages")
    returns = tf.placeholder(tf.float32, [None], name="returns")

    policy_loss = -tf.reduce_mean(self.policy.logli * adv)
    value_loss = tf.reduce_mean((self.value - returns)**2) * self.value_coef
    entropy_loss = tf.reduce_mean(self.policy.entropy) * self.entropy_coef
    # we want to reduce policy and value errors, and maximize entropy,
    # but since the optimizer is minimizing, the signs are opposite
    full_loss = policy_loss + value_loss - entropy_loss
    return full_loss, [policy_loss, value_loss, entropy_loss], [adv, returns]
def summarize_stats(stats):
    """Summarize a dictionary of variables.

    Args:
        stats: a dictionary of {name: tensor} to compute stats over.
    """
    for name, stat in stats.items():
        mean = tf.reduce_mean(stat)
        tf.summary.scalar('mean_%s' % name, mean)
        tf.summary.scalar('max_%s' % name, tf.reduce_max(stat))
        tf.summary.scalar('min_%s' % name, tf.reduce_min(stat))
        std = tf.sqrt(tf.reduce_mean(tf.square(stat)) - tf.square(mean) + 1e-10)
        tf.summary.scalar('std_%s' % name, std)
        tf.summary.histogram(name, stat)
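# The std above uses the identity Var[x] = E[x^2] - (E[x])^2 plus a small
# 1e-10 floor for numerical safety. A quick eager-mode sanity check against
# tf.math.reduce_std; this check is an illustration, not part of the original.
import tensorflow as tf

x = tf.random.normal([1000])
std_manual = tf.sqrt(tf.reduce_mean(tf.square(x))
                     - tf.square(tf.reduce_mean(x)) + 1e-10)
std_builtin = tf.math.reduce_std(x)
# The two agree up to the 1e-10 damping term.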
def train_step(self, time_step: ActionTimeStep, state, calc_intrinsic_reward=True):
    """
    Args:
        time_step (ActionTimeStep): input time_step data for ICM
        state (Tensor): state for ICM (previous observation)
        calc_intrinsic_reward (bool): if False, only return the losses
    Returns:
        AlgorithmStep:
            outputs: empty tuple ()
            state: observation
            info (ICMInfo):
    """
    feature = time_step.observation
    prev_action = time_step.prev_action
    if self._encoding_net is not None:
        feature, _ = self._encoding_net(feature)
    prev_feature = state
    prev_action = self._encode_action(prev_action)

    forward_pred, _ = self._forward_net(
        inputs=[tf.stop_gradient(prev_feature), prev_action])
    forward_loss = 0.5 * tf.reduce_mean(
        tf.square(tf.stop_gradient(feature) - forward_pred), axis=-1)

    action_pred, _ = self._inverse_net(inputs=[prev_feature, feature])

    if tensor_spec.is_discrete(self._action_spec):
        inverse_loss = tf.nn.softmax_cross_entropy_with_logits(
            labels=prev_action, logits=action_pred)
    else:
        inverse_loss = 0.5 * tf.reduce_mean(
            tf.square(prev_action - action_pred), axis=-1)

    intrinsic_reward = ()
    if calc_intrinsic_reward:
        intrinsic_reward = tf.stop_gradient(forward_loss)
        intrinsic_reward = self._reward_normalizer.normalize(intrinsic_reward)

    return AlgorithmStep(
        outputs=(),
        state=feature,
        info=ICMInfo(
            reward=intrinsic_reward,
            loss=LossInfo(
                loss=forward_loss + inverse_loss,
                extra=dict(forward_loss=forward_loss,
                           inverse_loss=inverse_loss))))
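# A compact, self-contained sketch of the ICM idea above: the intrinsic reward
# is the (stop-gradient) forward-model prediction error in feature space. The
# tiny Dense network and shapes below are illustrative assumptions, not the
# networks used by the class above.
import tensorflow as tf

feature_dim, action_dim, batch = 16, 4, 32
forward_net = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(feature_dim),
])

prev_feature = tf.random.normal([batch, feature_dim])
action_onehot = tf.one_hot(
    tf.random.uniform([batch], 0, action_dim, dtype=tf.int32), action_dim)
feature = tf.random.normal([batch, feature_dim])

forward_pred = forward_net(
    tf.concat([tf.stop_gradient(prev_feature), action_onehot], axis=-1))
forward_loss = 0.5 * tf.reduce_mean(
    tf.square(tf.stop_gradient(feature) - forward_pred), axis=-1)
intrinsic_reward = tf.stop_gradient(forward_loss)  # shape [batch]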
def test_ppo(self):
    env_class = PolicyUnittestEnv
    learning_rate = 1e-1
    iterations = 20
    batch_size = 100
    steps_per_episode = 13
    env = env_class(batch_size, steps_per_episode)
    env = TFPyEnvironment(env)

    eval_env = env_class(batch_size, steps_per_episode)
    eval_env = TFPyEnvironment(eval_env)

    algorithm = create_algorithm(env, learning_rate=learning_rate)
    driver = SyncOffPolicyDriver(env,
                                 algorithm,
                                 debug_summaries=DEBUGGING,
                                 summarize_grads_and_vars=DEBUGGING)
    replayer = driver.exp_replayer
    eval_driver = OnPolicyDriver(eval_env,
                                 algorithm,
                                 training=False,
                                 greedy_predict=True)

    env.reset()
    eval_env.reset()
    time_step = driver.get_initial_time_step()
    policy_state = driver.get_initial_policy_state()
    for i in range(iterations):
        time_step, policy_state = driver.run(
            max_num_steps=batch_size * steps_per_episode,
            time_step=time_step,
            policy_state=policy_state)

        experience = replayer.replay_all()
        driver.train(experience, num_updates=4, mini_batch_size=25)
        replayer.clear()
        eval_env.reset()
        eval_time_step, _ = eval_driver.run(
            max_num_steps=(steps_per_episode - 1) * batch_size)
        logging.info("%d reward=%f", i,
                     float(tf.reduce_mean(eval_time_step.reward)))

    eval_env.reset()
    eval_time_step, _ = eval_driver.run(
        max_num_steps=(steps_per_episode - 1) * batch_size)
    logging.info("reward=%f", float(tf.reduce_mean(eval_time_step.reward)))
    self.assertAlmostEqual(1.0,
                           float(tf.reduce_mean(eval_time_step.reward)),
                           delta=1e-1)
def fit_gaussian(embeddings, damping=1e-7, full_covariance=False):
    """Fits a unimodal Gaussian distribution to `embeddings`.

    Args:
        embeddings: A [batch_size, embedding_dim] tf.Tensor of embeddings.
        damping: The scale of the covariance damping coefficient.
        full_covariance: Whether to use a full or diagonal covariance.

    Returns:
        Parameter estimates (means and log variances) for a Gaussian model.
    """
    if full_covariance:
        num, dim = tf.split(tf.shape(input=embeddings), num_or_size_splits=2)
        num, dim = tf.squeeze(num), tf.squeeze(dim)
        sample_mean = tf.reduce_mean(input_tensor=embeddings, axis=0)
        centered_embeddings = embeddings - sample_mean
        sample_covariance = tf.einsum('ij,ik->kj', centered_embeddings,
                                      centered_embeddings)  # Outer product.
        sample_covariance += damping * tf.eye(dim)  # Positive definiteness.
        sample_covariance /= tf.cast(num, dtype=tf.float32)  # Scale by N.
        return sample_mean, sample_covariance
    else:
        # Moments over the batch dimension (axes=[0] is required by tf.nn.moments).
        sample_mean, sample_variances = tf.nn.moments(x=embeddings, axes=[0])
        log_variances = tf.math.log(sample_variances +
                                    damping * tf.ones_like(sample_variances))
        return sample_mean, log_variances
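# Usage sketch (eager mode): fit diagonal and full-covariance Gaussians to a
# batch of random embeddings. The shapes below are illustrative assumptions.
import tensorflow as tf

embeddings = tf.random.normal([64, 8])           # [batch_size, embedding_dim]
mean_diag, log_var = fit_gaussian(embeddings)                     # [8], [8]
mean_full, cov = fit_gaussian(embeddings, full_covariance=True)   # [8], [8, 8]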
def get_train_op():
    loss = tf.reduce_mean(input_tensor=tf.square(q_clicked - target_clicked))
    if self.summary_writer is not None:
        with tf.variable_scope('Losses'):
            tf.summary.scalar('Loss', loss)
    return loss
def fn():
    """Loss function for when number of input and output boxes is positive."""
    if is_balanced:
        weights = loss_utils.get_balanced_loss_weights_multiclass(
            labels=input_boxes_instance_id)
    else:
        weights = tf.ones([tf.shape(input_boxes_instance_id)[0], 1],
                          dtype=tf.float32)
    gt_length = tf.reshape(input_boxes_length, [-1, 1])
    gt_height = tf.reshape(input_boxes_height, [-1, 1])
    gt_width = tf.reshape(input_boxes_width, [-1, 1])
    predicted_length = tf.reshape(output_boxes_length, [-1, 1])
    predicted_height = tf.reshape(output_boxes_height, [-1, 1])
    predicted_width = tf.reshape(output_boxes_width, [-1, 1])
    predicted_length /= gt_length
    predicted_height /= gt_height
    predicted_width /= gt_width
    predicted_size = tf.concat(
        [predicted_length, predicted_height, predicted_width], axis=1)
    gt_size = tf.ones_like(predicted_size)
    if loss_type == 'huber':
        loss_fn = tf.keras.losses.Huber(
            delta=delta, reduction=tf.keras.losses.Reduction.NONE)
    elif loss_type == 'absolute_difference':
        loss_fn = tf.keras.losses.MeanAbsoluteError(
            reduction=tf.keras.losses.Reduction.NONE)
    else:
        raise ValueError('Unknown loss type %s.' % loss_type)
    size_losses = loss_fn(y_true=gt_size, y_pred=predicted_size)
    return tf.reduce_mean(size_losses * tf.reshape(weights, [-1]))
def train_step(self, inputs, state, calc_intrinsic_reward=True):
    """
    Args:
        inputs (tuple): observation
        state (tuple): empty tuple ()
        calc_intrinsic_reward (bool): if False, only return the losses
    Returns:
        AlgorithmStep:
            outputs: empty tuple ()
            state: empty tuple ()
            info: RNDInfo
    """
    observation, _ = inputs
    if self._observation_normalizer is not None:
        observation = self._observation_normalizer.normalize(observation)

    pred_embedding, _ = self._predictor_net(observation)
    target_embedding, _ = self._target_net(observation)

    loss = 0.5 * tf.reduce_mean(
        tf.square(pred_embedding - tf.stop_gradient(target_embedding)),
        axis=-1)

    intrinsic_reward = ()
    if calc_intrinsic_reward:
        intrinsic_reward = tf.stop_gradient(loss)
        intrinsic_reward = self._reward_normalizer.normalize(intrinsic_reward)

    return AlgorithmStep(
        outputs=(),
        state=(),
        info=RNDInfo(reward=intrinsic_reward, loss=LossInfo(loss=loss)))
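# A self-contained sketch of the RND mechanism above: a fixed, randomly
# initialized target network and a trained predictor network; the per-sample
# prediction error serves as the intrinsic reward. The small Dense networks
# and shapes below are illustrative assumptions, not the class's networks.
import tensorflow as tf

obs_dim, embed_dim = 12, 32
target_net = tf.keras.Sequential([tf.keras.layers.Dense(64, activation='relu'),
                                  tf.keras.layers.Dense(embed_dim)])
predictor_net = tf.keras.Sequential([tf.keras.layers.Dense(64, activation='relu'),
                                     tf.keras.layers.Dense(embed_dim)])
target_net.trainable = False  # the target stays fixed throughout training

observation = tf.random.normal([16, obs_dim])
pred = predictor_net(observation)
target = target_net(observation)
loss = 0.5 * tf.reduce_mean(tf.square(pred - tf.stop_gradient(target)), axis=-1)
intrinsic_reward = tf.stop_gradient(loss)  # shape [16]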
def _summary_op(self):
    with tf.name_scope("summary_op"):
        # self._summary_list += tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        metrics = regr_metrics(y=self.y, y_hat=self.y_hat)
        metrics = {k: tf.reduce_mean(v) for k, v in metrics.items()}
        self._summary_dict.update(metrics)
        self.summary = summary_op(t_dict=self._summary_dict)
def _forward_pass(iteration_idx_, variables_mapping_, images_, onehot_labels_):
    """Helper function to compute the outputs of a forward pass."""

    with self.embedding_fn.reparameterize(variables_mapping_):
        # TODO(eringrant): Implement non-transductive batch normalization (i.e.,
        # pass the support set statistics through the query set forward pass).
        embeddings_ = self.embedding_fn(images_, training=True)

    # TODO(eringrant): `head_fn` is an attribute of the subclass.
    with self.head_fn.reparameterize(variables_mapping_):
        predictions_ = self.head_fn(embeddings_)[:, :data.way]

    accuracy_ = tf.reduce_mean(input_tensor=self.compute_accuracy(
        onehot_labels=onehot_labels_, predictions=predictions_))
    inner_objective_ = self.inner_objective(
        onehot_labels=onehot_labels_,
        predictions=predictions_,
        iteration_idx=iteration_idx_)
    outer_objective_ = self.outer_objective(
        onehot_labels=onehot_labels_,
        predictions=predictions_)

    return ForwardPass(
        embeddings=embeddings_,
        predictions=predictions_,
        inner_objective_value=inner_objective_,
        outer_objective_value=outer_objective_,
        accuracy=accuracy_,
    )
def _compute_prototype_loss(self,
                            embeddings,
                            labels,
                            labels_one_hot,
                            prototypes=None):
    """Computes the loss and accuracy on an episode."""
    labels_dense = labels
    if prototypes is None:
        # Compute protos.
        labels = tf.cast(labels_one_hot, tf.float32)
        # [num examples, 1, embedding size].
        embeddings_ = tf.expand_dims(embeddings, 1)
        # [num examples, num classes, 1].
        labels = tf.expand_dims(labels, 2)
        # Sums each class' embeddings. [num classes, embedding size].
        class_sums = tf.reduce_sum(labels * embeddings_, 0)
        # The prototype of each class is the averaged embedding of its examples.
        class_num_images = tf.reduce_sum(labels, 0)  # [way].
        prototypes = class_sums / class_num_images  # [way, embedding size].

    # Compute logits.
    embeddings = tf.nn.l2_normalize(embeddings, 1, epsilon=1e-3)
    prototypes = tf.nn.l2_normalize(prototypes, 1, epsilon=1e-3)
    logits = tf.matmul(embeddings, prototypes, transpose_b=True)

    loss = self.compute_loss(labels_one_hot, logits)
    acc = tf.reduce_mean(self.compute_accuracy(labels_dense, logits))
    return loss, acc, prototypes, logits
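# A minimal eager-mode sketch of the prototype computation above: class
# prototypes are per-class mean embeddings, obtained here by broadcasting a
# one-hot label mask. Shapes and values are illustrative assumptions.
import tensorflow as tf

num_examples, way, embed_dim = 20, 5, 8
embeddings = tf.random.normal([num_examples, embed_dim])
labels = tf.repeat(tf.range(way), num_examples // way)          # 4 per class
labels_one_hot = tf.one_hot(labels, way)                        # [N, way]

masked = tf.expand_dims(labels_one_hot, 2) * tf.expand_dims(embeddings, 1)
class_sums = tf.reduce_sum(masked, axis=0)                      # [way, embed_dim]
class_counts = tf.reduce_sum(labels_one_hot, axis=0)[:, None]   # [way, 1]
prototypes = class_sums / class_counts                          # [way, embed_dim]

# Cosine logits against the prototypes, as in the method above.
logits = tf.matmul(tf.nn.l2_normalize(embeddings, 1),
                   tf.nn.l2_normalize(prototypes, 1), transpose_b=True)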
def _build_train_op(self):
    """Builds the training op for Rainbow.

    Returns:
        train_op: An op performing one step of training.
    """
    target_distribution = tf.stop_gradient(self._build_target_distribution())

    # size of indices: batch_size x 1.
    indices = tf.range(tf.shape(self._replay_logits)[0])[:, None]
    # size of reshaped_actions: batch_size x 2.
    reshaped_actions = tf.concat([indices, self._replay.actions[:, None]], 1)
    # For each element of the batch, fetch the logits for its selected action.
    chosen_action_logits = tf.gather_nd(self._replay_logits, reshaped_actions)

    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=target_distribution, logits=chosen_action_logits)

    optimizer = tf.train.AdamOptimizer(
        learning_rate=self.learning_rate, epsilon=self.optimizer_epsilon)

    update_priorities_op = self._replay.tf_set_priority(
        self._replay.indices, tf.sqrt(loss + 1e-10))

    target_priorities = self._replay.tf_get_priority(self._replay.indices)
    target_priorities = tf.math.add(target_priorities, 1e-10)
    target_priorities = 1.0 / tf.sqrt(target_priorities)
    target_priorities /= tf.reduce_max(target_priorities)

    weighted_loss = target_priorities * loss

    with tf.control_dependencies([update_priorities_op]):
        return optimizer.minimize(tf.reduce_mean(weighted_loss)), weighted_loss
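# A small eager-mode sketch of the prioritized-replay importance weighting
# used above: priorities are written back as sqrt(loss + eps), and sampled
# transitions are down-weighted by 1 / sqrt(priority), normalized by the batch
# maximum. The constant tensors below are illustrative stand-ins for the
# replay buffer values, not part of the original code.
import tensorflow as tf

loss = tf.constant([0.04, 0.25, 1.0, 4.0])        # per-transition losses
new_priorities = tf.sqrt(loss + 1e-10)            # what gets written back

sampled_priorities = tf.constant([0.2, 0.5, 1.0, 2.0])
weights = 1.0 / tf.sqrt(sampled_priorities + 1e-10)
weights /= tf.reduce_max(weights)                 # in (0, 1], largest for rare samples
weighted_loss = weights * loss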
def build_graph(self):
    """Builds the neural network graph."""
    # define graph
    self.g = tf.Graph()
    with self.g.as_default():
        # create and store a new session for the graph
        self.sess = tf.Session()

        # define placeholders
        self.x = tf.placeholder(shape=[None, self.dim_input], dtype=tf.float32)
        self.y = tf.placeholder(shape=[None, self.num_classes], dtype=tf.float32)

        # define simple model
        with tf.variable_scope('last_layer'):
            self.z = tf.layers.dense(inputs=self.x, units=self.num_classes)

        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.y,
                                                       logits=self.z))
        self.output_probs = tf.nn.softmax(self.z)

        # Variables of the last layer
        self.ll_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        self.ll_vars_concat = tf.concat(
            [self.ll_vars[0], tf.expand_dims(self.ll_vars[1], axis=0)], 0)

        # Summary
        _variable_summaries(self.ll_vars_concat)

        # saving the weights of last layer when running bootstrap algorithm
        self.saver = tf.train.Saver(var_list=self.ll_vars)

        self.gd_opt = tf.train.GradientDescentOptimizer(self.step_size)

        # SGD optimizer for the last layer
        grads_vars_sgd = self.gd_opt.compute_gradients(self.loss)
        self.train_op = self.gd_opt.apply_gradients(grads_vars_sgd)

        for g, v in grads_vars_sgd:
            if g is not None:
                s = list(v.name)
                s[v.name.rindex(':')] = '_'
                tf.summary.histogram(''.join(s) + '/grad_hist_boot_sgd', g)

        # Merge all the summaries and write them out
        self.all_summaries = tf.summary.merge_all()
        location = os.path.join(self.working_dir, 'logs')
        self.writer = tf.summary.FileWriter(location, graph=self.g)

        saver_network = tf.train.Saver(var_list=self.ll_vars)
        print('Loading the network...')
        # Restores from checkpoint
        saver_network.restore(self.sess, self.model_dir)
        print('Graph successfully loaded.')
def spatial_loss(truth_features, predicted_features, space_desc):
    feature_losses = []
    for truth, prediction, spec in zip(truth_features, predicted_features,
                                       space_desc.features):
        if spec.type == FeatureType.CATEGORICAL:
            truth = tf.transpose(truth, (0, 2, 3, 1))
            prediction = tf.transpose(prediction, (0, 2, 3, 1))
            feature_losses.append(
                tf.losses.softmax_cross_entropy(truth, prediction))

            summary_image = tf.argmax(tf.concat([truth, prediction], 2), 3)
            summary_image = tf.gather(
                palette[space_desc.index][spec.index], summary_image)
            tf.summary.image(spec.name, summary_image)
        else:
            feature_losses.append(
                tf.losses.mean_squared_error(truth, prediction))

            summary_image = tf.concat([truth, prediction], 3)
            tf.summary.image(spec.name,
                             tf.transpose(summary_image, (0, 2, 3, 1)))

        tf.summary.scalar(spec.name, feature_losses[-1])

    return tf.reduce_mean(tf.stack(feature_losses))
def safety_critic_loss(tf_agent,
                       safety_critic,
                       time_steps,
                       actions,
                       next_time_steps,
                       safety_rewards,
                       weights=None):
    """Returns a critic loss with safety."""
    next_actions, next_log_pis = tf_agent._actions_and_log_probs(  # pylint: disable=protected-access
        next_time_steps)
    del next_log_pis
    target_input = (next_time_steps.observation[0], next_actions[0])
    target_q_values, unused_network_state1 = safety_critic(
        target_input, next_time_steps.step_type[0])
    target_q_values = tf.nn.sigmoid(target_q_values)
    safety_rewards = tf.to_float(safety_rewards)
    td_targets = tf.stop_gradient(safety_rewards + (1 - safety_rewards) *
                                  next_time_steps.discount * target_q_values)
    td_targets = tf.squeeze(td_targets)

    pred_input = (time_steps.observation[0], actions[0])
    pred_td_targets, unused_network_state1 = safety_critic(
        pred_input, time_steps.step_type[0])
    loss = tf.losses.sigmoid_cross_entropy(td_targets, pred_td_targets)

    if weights is not None:
        loss *= tf.to_float(tf.squeeze(weights))

    # Take the mean across the batch.
    loss = tf.reduce_mean(input_tensor=loss)
    return loss
def _build_train_op(self):
    """Builds the training op for Rainbow.

    Returns:
        train_op: An op performing one step of training.
    """
    replay_action_one_hot = tf.one_hot(self._replay.actions,
                                       self.num_actions,
                                       1.,
                                       0.,
                                       name='action_one_hot')
    replay_chosen_q = tf.reduce_sum(self._replay_qs * replay_action_one_hot,
                                    reduction_indices=1,
                                    name='replay_chosen_q')
    target = tf.stop_gradient(self._build_target_q_op())
    loss = tf.losses.huber_loss(target,
                                replay_chosen_q,
                                reduction=tf.losses.Reduction.NONE)
    update_priorities_op = self._replay.tf_set_priority(
        self._replay.indices, tf.sqrt(loss + 1e-10))
    target_priorities = self._replay.tf_get_priority(self._replay.indices)
    target_priorities = tf.math.add(target_priorities, 1e-10)
    target_priorities = 1.0 / tf.sqrt(target_priorities)
    target_priorities /= tf.reduce_max(target_priorities)
    weighted_loss = target_priorities * loss
    with tf.control_dependencies([update_priorities_op]):
        return self.optimizer.minimize(
            tf.reduce_mean(weighted_loss)), weighted_loss
def loss_fn(self):
    adv = tf.placeholder(tf.float32, [None], name="advantages")
    returns = tf.placeholder(tf.float32, [None], name="returns")
    logli_old = tf.placeholder(tf.float32, [None], name="logli_old")

    ratio = tf.exp(self.policy.logli - logli_old)
    clipped_ratio = tf.clip_by_value(ratio, 1 - self.clip_ratio,
                                     1 + self.clip_ratio)

    policy_loss = -tf.reduce_mean(tf.minimum(adv * ratio, adv * clipped_ratio))
    # TODO clip value loss
    value_loss = tf.reduce_mean((self.value - returns)**2) * self.value_coef
    entropy_loss = tf.reduce_mean(self.policy.entropy) * self.entropy_coef
    # we want to reduce policy and value errors, and maximize entropy,
    # but since the optimizer is minimizing, the signs are opposite
    full_loss = policy_loss + value_loss - entropy_loss
    return full_loss, [policy_loss, value_loss, entropy_loss], \
        [adv, returns, logli_old]
def tanh_similarity(states,
                    actions,
                    rewards,
                    next_states,
                    contexts,
                    mse_scale=1.0,
                    state_scales=1.0,
                    goal_scales=1.0,
                    summarize=False):
    """Returns the similarity between next_states and contexts using tanh and mse.

    Args:
        states: A [batch_size, num_state_dims] Tensor representing a batch
            of states.
        actions: A [batch_size, num_action_dims] Tensor representing a batch
            of actions.
        rewards: A [batch_size] Tensor representing a batch of rewards.
        next_states: A [batch_size, num_state_dims] Tensor representing a batch
            of next states.
        contexts: A list of [batch_size, num_context_dims] Tensor representing
            a batch of contexts.
        mse_scale: A float, to scale mse before tanh.
        state_scales: multiplicative scale for (next) states. A scalar or 1D
            tensor, must be broadcastable to number of state dimensions.
        goal_scales: multiplicative scale for contexts. A scalar or 1D tensor,
            must be broadcastable to number of goal dimensions.
        summarize: (boolean) enable summary ops.

    Returns:
        A new tf.float32 [batch_size] rewards Tensor, and a
        tf.float32 [batch_size] discounts tensor.
    """
    del states, actions, rewards  # Unused
    mse = tf.reduce_mean(
        tf.squared_difference(next_states * state_scales,
                              contexts[0] * goal_scales), -1)
    tanh = tf.tanh(mse_scale * mse)
    if summarize:
        with tf.name_scope('RewardFn/'):
            tf.summary.scalar('mean_mse', tf.reduce_mean(mse))
            tf.summary.histogram('mse', mse)
            tf.summary.scalar('mean_tanh', tf.reduce_mean(tanh))
            tf.summary.histogram('tanh', tanh)
    rewards = tf.to_float(1 - tanh)
    return rewards, tf.ones_like(rewards)
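# A short eager-mode sketch of the reward shaping above: the reward is
# 1 - tanh(mse_scale * MSE(next_state, goal)), so it approaches 1 as the state
# reaches the goal and decays smoothly toward 0 far from it. The shapes and
# the use of tf.math.squared_difference below are illustrative assumptions.
import tensorflow as tf

next_states = tf.random.normal([32, 6])
goals = tf.random.normal([32, 6])
mse = tf.reduce_mean(tf.math.squared_difference(next_states, goals), axis=-1)
rewards = 1.0 - tf.tanh(1.0 * mse)      # mse_scale = 1.0
discounts = tf.ones_like(rewards)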
def fn():
    """Loss function for when number of input and output boxes is positive."""
    if is_balanced:
        weights = loss_utils.get_balanced_loss_weights_multiclass(
            labels=input_boxes_instance_id)
    else:
        weights = tf.ones([tf.shape(input_boxes_instance_id)[0], 1],
                          dtype=tf.float32)

    normalized_box_size = 5.0
    predicted_boxes_length = output_boxes_length
    predicted_boxes_height = output_boxes_height
    predicted_boxes_width = output_boxes_width
    predicted_boxes_center = output_boxes_center
    predicted_boxes_rotation_matrix = output_boxes_rotation_matrix
    gt_boxes_length = input_boxes_length
    gt_boxes_height = input_boxes_height
    gt_boxes_width = input_boxes_width
    gt_boxes_center = input_boxes_center
    gt_boxes_rotation_matrix = input_boxes_rotation_matrix
    if loss_type in ['normalized_huber', 'normalized_euclidean']:
        predicted_boxes_length /= (gt_boxes_length / normalized_box_size)
        predicted_boxes_height /= (gt_boxes_height / normalized_box_size)
        predicted_boxes_width /= (gt_boxes_width / normalized_box_size)
        gt_boxes_length = tf.ones_like(
            gt_boxes_length, dtype=tf.float32) * normalized_box_size
        gt_boxes_height = tf.ones_like(
            gt_boxes_height, dtype=tf.float32) * normalized_box_size
        gt_boxes_width = tf.ones_like(
            gt_boxes_width, dtype=tf.float32) * normalized_box_size
    gt_box_corners = box_utils.get_box_corners_3d(
        boxes_length=gt_boxes_length,
        boxes_height=gt_boxes_height,
        boxes_width=gt_boxes_width,
        boxes_rotation_matrix=gt_boxes_rotation_matrix,
        boxes_center=gt_boxes_center)
    predicted_box_corners = box_utils.get_box_corners_3d(
        boxes_length=predicted_boxes_length,
        boxes_height=predicted_boxes_height,
        boxes_width=predicted_boxes_width,
        boxes_rotation_matrix=predicted_boxes_rotation_matrix,
        boxes_center=predicted_boxes_center)
    corner_weights = tf.tile(weights, [1, 8])
    if loss_type in ['huber', 'normalized_huber']:
        loss_fn = tf.keras.losses.Huber(
            delta=delta, reduction=tf.keras.losses.Reduction.NONE)
    elif loss_type in ['normalized_absolute_difference', 'absolute_difference']:
        loss_fn = tf.keras.losses.MeanAbsoluteError(
            reduction=tf.keras.losses.Reduction.NONE)
    else:
        raise ValueError('Unknown loss type %s.' % loss_type)
    box_corner_losses = loss_fn(
        y_true=tf.reshape(gt_box_corners, [-1, 3]),
        y_pred=tf.reshape(predicted_box_corners, [-1, 3]))
    return tf.reduce_mean(box_corner_losses * tf.reshape(corner_weights, [-1]))
def _reg(cls, batch_size, d, x, x_fake, beta=1e-1):
    alpha = tf.random_uniform(shape=[batch_size, 1], minval=0., maxval=1.)
    interpolates = alpha * x + (1 - alpha) * x_fake
    int_d = d(interpolates)
    gradients = tf.gradients(int_d, [interpolates])[0]
    slopes = tf.sqrt(tf.reduce_sum(tf.square(gradients), reduction_indices=[1]))
    return beta * tf.reduce_mean((slopes - 1)**2)
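# An eager-mode (TF2) sketch of the same WGAN-style gradient penalty using
# tf.GradientTape instead of tf.gradients; the toy critic in the usage comment
# is an illustrative assumption, not part of the original class.
import tensorflow as tf


def gradient_penalty(critic, x_real, x_fake, beta=1e-1):
    alpha = tf.random.uniform([tf.shape(x_real)[0], 1], 0., 1.)
    interpolates = alpha * x_real + (1 - alpha) * x_fake
    with tf.GradientTape() as tape:
        tape.watch(interpolates)
        critic_out = critic(interpolates)
    grads = tape.gradient(critic_out, interpolates)
    slopes = tf.sqrt(tf.reduce_sum(tf.square(grads), axis=1))
    return beta * tf.reduce_mean((slopes - 1.0) ** 2)


# Usage sketch with a toy linear critic:
# critic = tf.keras.layers.Dense(1)
# penalty = gradient_penalty(critic, tf.random.normal([16, 8]),
#                            tf.random.normal([16, 8]))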
def entropy_loss(self):
    with tf.name_scope('entropy_loss'):
        entropies = [dist.entropy() for name, dist in self.model.policy.items()]
        entropy = tf.reduce_mean(tf.add_n(entropies))
        entropy_loss = -entropy * self.entropy_factor

        entropy_masked = tf.stack(entropies, axis=-1) * tf.gather(
            self.function_args_mask, self.input_actions['function_id'])
        entropy_masked = tf.reduce_mean(tf.reduce_sum(entropy_masked, axis=-1))

        tf.summary.scalar('policy_entropy', entropy, family='entropy')
        tf.summary.scalar('policy_entropy_masked',
                          entropy_masked,
                          family='entropy')
        tf.summary.scalar('entropy_loss', entropy_loss, family='losses')

    return entropy_loss
def loss_fn(self):
    """
    Sample trajectories and fit a cost function C. Form the gradient estimate
    with C and take a TRPO step for the next policy.
    """
    adv = tf.placeholder(tf.float32, [None], name="advantages")
    returns = tf.placeholder(tf.float32, [None], name="returns")

    policy_loss = -tf.reduce_mean(self.policy.logli * adv)
    # value_loss = tf.reduce_mean((self.value - returns)**2) * self.value_coef
    value_loss = tf.reduce_mean(self.value - returns)
    entropy_loss = tf.reduce_mean(self.policy.entropy) * self.entropy_coef
    # we want to reduce policy and value errors, and maximize entropy,
    # but since the optimizer is minimizing, the signs are opposite
    full_loss = policy_loss + value_loss - entropy_loss
    return value_loss