def _True(anchor, bboxes):
  """True branch when num of bboxes is non-zero."""
  n = tf.shape(bboxes)[0]
  centroid = BBoxesCentroid(bboxes)

  # Compute dot products between the centroids and the anchor point.
  dot = tf.squeeze(tf.matmul(centroid, tf.expand_dims(anchor, 1)), axis=1)
  # Normalize dot to get the cosine of the angles.
  norm = tf.norm(anchor) * tf.norm(centroid, axis=1)
  cosine = tf.where(
      tf.greater(norm, 0), dot / norm, tf.zeros([n], norm.dtype))

  # Disambiguates whether the angle anchor--O--point is positive or negative
  # by the sign of the cross products between the anchor and the points.
  # tf.linalg.cross takes 3-vectors (x, y, z), so we set z to 0.
  # tf.linalg.cross does not support broadcasting, so we tile anchor to
  # shape [n, 3].
  cross = tf.linalg.cross(
      tf.tile(tf.pad(tf.expand_dims(anchor, 0), [[0, 0], [0, 1]]), [n, 1]),
      tf.pad(centroid, [[0, 0], [0, 1]]))

  # If the sign is positive, the points lie on the clockwise side of
  # O-->anchor. Hence, -1 - cosine moves the cosine values to [-2, 0]. If
  # the sign is negative, the points lie on the counter-clockwise side of
  # O-->anchor. 1 + cosine moves the cosine values to [0, 2].
  #
  # The car dataset shows that the points are scanned in a counter-clockwise
  # fashion. Therefore, top-k orders the points in the same order in which
  # the bboxes appear in the spin.
  score = tf.where(tf.greater(cross, 0)[:, 2], -1 - cosine, 1 + cosine)
  _, indices = tf.nn.top_k(score, n, sorted=True)
  return indices
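
# A hedged, self-contained sketch of the angular scoring trick in _True
# above, with raw 2-D points standing in for BBoxesCentroid output. The
# helper name and sample points below are illustrative, not part of the
# library.
def _ExampleAngularOrdering():
  """Orders three 2-D points around the origin w.r.t. an anchor ray."""
  anchor = tf.constant([1.0, 0.0])
  points = tf.constant([[0.0, 1.0],    # 90 degrees from the anchor.
                        [1.0, 1.0],    # 45 degrees from the anchor.
                        [0.0, -1.0]])  # -90 degrees from the anchor.
  n = tf.shape(points)[0]
  dot = tf.squeeze(tf.matmul(points, tf.expand_dims(anchor, 1)), axis=1)
  norm = tf.norm(anchor) * tf.norm(points, axis=1)
  cosine = tf.where(
      tf.greater(norm, 0), dot / norm, tf.zeros([n], norm.dtype))
  cross = tf.linalg.cross(
      tf.tile(tf.pad(tf.expand_dims(anchor, 0), [[0, 0], [0, 1]]), [n, 1]),
      tf.pad(points, [[0, 0], [0, 1]]))
  score = tf.where(tf.greater(cross, 0)[:, 2], -1 - cosine, 1 + cosine)
  # scores == [-1.0, -1.707, 1.0], so top_k returns indices [2, 0, 1].
  _, indices = tf.nn.top_k(score, n, sorted=True)
  return indices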
def _internal_apply_dense(self, grad, var, magnitude_optimizer_apply_fn,
                          direction_optimizer_apply_fn):  # pylint: disable=g-doc-args
  """Main optimization logic of AdaGraft, which calls the child optimizers.

  Args:
    grad: Tensor containing gradients.
    var: Tensor containing parameter values.
    magnitude_optimizer_apply_fn: Apply magnitude optimizer.
    direction_optimizer_apply_fn: Apply direction optimizer.

  Returns:
    The final update op, which increments var by the grafted step.

  Pseudocode:
  - Copy weights into scratch space 'scratch_copy'.
  - Run magnitude_optimizer in-place.
  - Use scratch copy to figure out how far we moved ('magnitude_step').
  - Copy weights back.
  - Run direction_optimizer in-place.
  - Move weights along the line segment with scratch_copy.
  """
  if self.use_global_norm:
    self._variables.append(var)

  # Slot with current parameter values.
  scratch_slot = self.get_slot(var, "scratch_copy")
  old_var = tf.assign(scratch_slot, var)

  with tf.control_dependencies([old_var]):
    m_updated_var = magnitude_optimizer_apply_fn(grad, var)  # pylint: disable=protected-access

  # Run magnitude optimizer and compute the norm of the update.
  with tf.control_dependencies([m_updated_var]):
    m_step = var - old_var
    m_step_norm = tf.norm(m_step)
    if self.diagnostic or self.use_global_norm:
      m_step_norm = tf.assign(self.get_slot(var, "m_step_norm"), m_step_norm)

  # Run direction optimizer and compute its norm, and the direction.
  with tf.control_dependencies([m_step_norm]):
    flushed_var = tf.assign(var, old_var)
    with tf.control_dependencies([flushed_var]):
      d_updated_var = direction_optimizer_apply_fn(grad, var)  # pylint: disable=protected-access

  # Run an update of the direction optimizer with magnitude optimizer norm.
  with tf.control_dependencies([d_updated_var]):
    d_step = var - old_var
    d_step_norm = tf.norm(d_step)
    if self.diagnostic or self.use_global_norm:
      d_step_norm = tf.assign(self.get_slot(var, "d_step_norm"), d_step_norm)
    if self.use_global_norm:
      flushed_var = tf.assign(var, old_var)
      with tf.control_dependencies([d_step_norm, flushed_var]):
        return tf.assign(scratch_slot, d_step)
    step = tf.where(
        tf.greater(d_step_norm, 0),
        (m_step_norm / tf.maximum(d_step_norm, 1e-30)) * d_step,
        tf.zeros_like(d_step))
    return tf.assign(var, old_var + self._learning_rate_tensor * step)
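
# Hedged illustration of the grafting rule used above: take the step
# *length* from the magnitude optimizer and the step *direction* from the
# direction optimizer. The helper name and tensors below are illustrative;
# no slots or control dependencies are involved.
def _example_grafted_step():
  m_step = tf.constant([0.3, -0.4])  # Magnitude optimizer's update; norm 0.5.
  d_step = tf.constant([10.0, 0.0])  # Direction optimizer's update; norm 10.
  m_step_norm = tf.norm(m_step)
  d_step_norm = tf.norm(d_step)
  # Rescale d_step to have m_step's length: the result is [0.5, 0.0].
  return tf.where(
      tf.greater(d_step_norm, 0),
      (m_step_norm / tf.maximum(d_step_norm, 1e-30)) * d_step,
      tf.zeros_like(d_step))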
def CornerLoss(self, gt_bboxes, predicted_bboxes, symmetric=True):
  """Corner regularization loss.

  This function computes the corner loss, an alternative regression loss
  for box residuals. This was used in the Frustum-PointNets paper [1]. We
  compute the predicted bboxes (all 8 corners) and compute a SmoothedL1
  loss between the corners of the predicted boxes and ground truth. Hence,
  this loss can help encourage the model to maximize the IoU of the
  predictions.

  [1] Frustum PointNets for 3D Object Detection from RGB-D Data
      https://arxiv.org/pdf/1711.08488.pdf

  Args:
    gt_bboxes: tf.float32 of shape [..., 7] which contains (x, y, z, dx, dy,
      dz, phi), corresponding to ground truth bbox parameters.
    predicted_bboxes: tf.float32 of same shape as gt_bboxes containing
      predicted bbox parameters.
    symmetric: boolean. If True, computes the minimum of the corner loss
      with respect to both the gt box and the gt box rotated 180 degrees.

  Returns:
    tf.float32 Tensor of shape [...] where each entry contains the corner
    loss for the corresponding bbox.
  """
  bbox_shape = py_utils.GetShape(gt_bboxes)
  batch_size = bbox_shape[0]

  gt_bboxes = tf.reshape(gt_bboxes, [batch_size, -1, 7])
  predicted_bboxes = tf.reshape(predicted_bboxes, [batch_size, -1, 7])

  gt_corners = geometry.BBoxCorners(gt_bboxes)
  predicted_corners = geometry.BBoxCorners(predicted_bboxes)
  corner_dist = tf.norm(predicted_corners - gt_corners, axis=-1)
  huber_loss = self.ScaledHuberLoss(
      labels=tf.zeros_like(corner_dist), predictions=corner_dist)
  huber_loss = tf.reduce_sum(huber_loss, axis=-1)

  if symmetric:
    # Compute the loss assuming the ground truth is flipped 180 degrees,
    # and take the minimum of the two losses.
    rot = tf.constant([[[0., 0., 0., 0., 0., 0., np.pi]]], dtype=tf.float32)
    rotated_gt_bboxes = gt_bboxes + rot
    rotated_gt_corners = geometry.BBoxCorners(rotated_gt_bboxes)
    rotated_corner_dist = tf.norm(
        predicted_corners - rotated_gt_corners, axis=-1)
    rotated_huber_loss = self.ScaledHuberLoss(
        labels=tf.zeros_like(rotated_corner_dist),
        predictions=rotated_corner_dist)
    rotated_huber_loss = tf.reduce_sum(rotated_huber_loss, axis=-1)
    huber_loss = tf.minimum(huber_loss, rotated_huber_loss)

  huber_loss = tf.reshape(huber_loss, bbox_shape[:-1])
  return huber_loss
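
# Hedged usage sketch for CornerLoss above; 'builder' stands for an instance
# of the enclosing loss class (hypothetical name), and only the shapes
# matter:
#
#   gt_bboxes = tf.random.uniform([2, 16, 7])         # [batch, boxes, 7].
#   predicted = gt_bboxes + 0.01 * tf.random.normal([2, 16, 7])
#   loss = builder.CornerLoss(gt_bboxes, predicted)   # => shape [2, 16].
#
# With symmetric=True, a prediction whose heading differs from the ground
# truth by exactly pi incurs no extra corner penalty, since the minimum
# against the rotated ground truth is taken.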
def inlined_matrix_inverse_pth_root(mat_g,
                                    mat_g_size,
                                    alpha,
                                    iter_count=100,
                                    error_tolerance=1e-6,
                                    ridge_epsilon=1e-6):
  """Computes mat_g^alpha, where alpha = -1/p, p is one of 2, 4, or 8.

  We use an iterative Schur-Newton method from equation 3.2 on page 9 of:

  A Schur-Newton Method for the Matrix p-th Root and its Inverse
  by Chun-Hua Guo and Nicholas J. Higham
  SIAM Journal on Matrix Analysis and Applications,
  2006, Vol. 28, No. 3 : pp. 788-804
  https://pdfs.semanticscholar.org/0abe/7f77433cf5908bfe2b79aa91af881da83858.pdf

  Args:
    mat_g: the symmetric PSD matrix whose power is to be computed.
    mat_g_size: size of mat_g.
    alpha: exponent, must be -1/p for p a positive integer.
    iter_count: Maximum number of iterations.
    error_tolerance: Error indicator, useful for early termination.
    ridge_epsilon: Ridge epsilon added to make the matrix positive definite.

  Returns:
    mat_g^alpha
  """
  alpha = tf.cast(alpha, tf.float64)
  neg_alpha = -1.0 * alpha
  exponent = 1.0 / neg_alpha
  identity = tf.eye(tf.cast(mat_g_size, tf.int32), dtype=tf.float64)

  def _unrolled_mat_pow_2(mat_m):
    """Computes mat_m^2."""
    return tf.matmul(mat_m, mat_m)

  def _unrolled_mat_pow_4(mat_m):
    """Computes mat_m^4."""
    mat_pow_2 = _unrolled_mat_pow_2(mat_m)
    return tf.matmul(mat_pow_2, mat_pow_2)

  def _unrolled_mat_pow_8(mat_m):
    """Computes mat_m^8."""
    mat_pow_4 = _unrolled_mat_pow_4(mat_m)
    return tf.matmul(mat_pow_4, mat_pow_4)

  def mat_power(mat_m, p):
    """Computes mat_m^p, for p == 2, 4 or 8.

    Args:
      mat_m: a square matrix
      p: a positive integer

    Returns:
      mat_m^p
    """
    branch_index = tf.cast(p / 2 - 1, tf.int32)
    return tf.switch_case(
        branch_index, {
            0: functools.partial(_unrolled_mat_pow_2, mat_m),
            1: functools.partial(_unrolled_mat_pow_4, mat_m),
            2: functools.partial(_unrolled_mat_pow_8, mat_m),
        })

  def _iter_condition(i, unused_mat_m, unused_mat_h, unused_old_mat_h, error,
                      run_step):
    return tf.math.logical_and(
        tf.math.logical_and(i < iter_count, error > error_tolerance),
        run_step)

  def _iter_body(i, mat_m, mat_h, unused_old_mat_h, error, unused_run_step):
    mat_m_i = (1 - alpha) * identity + alpha * mat_m
    new_mat_m = tf.matmul(mat_power(mat_m_i, exponent), mat_m)
    new_mat_h = tf.matmul(mat_h, mat_m_i)
    new_error = tf.reduce_max(tf.abs(new_mat_m - identity))
    return (i + 1, new_mat_m, new_mat_h, mat_h, new_error,
            new_error < error)

  if mat_g_size == 1:
    # 1x1 case: compute the power directly; there is no iteration error.
    resultant_mat_h = tf.pow(mat_g + ridge_epsilon, alpha)
    error = tf.constant(0.0, tf.float64)
  else:
    damped_mat_g = mat_g + ridge_epsilon * identity
    z = (1 - 1 / alpha) / (2 * tf.norm(damped_mat_g))
    # The best value for z is
    #   (1 - 1/alpha) * (c_max^{-alpha} - c_min^{-alpha}) /
    #                   (c_max^{1-alpha} - c_min^{1-alpha})
    # where c_max and c_min are the largest and smallest singular values of
    # damped_mat_g.
    # The above estimate assumes that c_max > c_min * 2^p (p = -1/alpha).
    # We can replace the line above with the one below, but it is less
    # accurate, hence needs more iterations to converge:
    #   z = (1 - 1/alpha) / tf.trace(damped_mat_g)
    # If we want the method to always converge, use z = 1/norm(damped_mat_g)
    # or z = 1/tf.trace(damped_mat_g), but these can result in many extra
    # iterations.
    new_mat_m_0 = damped_mat_g * z
    new_error = tf.reduce_max(tf.abs(new_mat_m_0 - identity))
    new_mat_h_0 = identity * tf.pow(z, neg_alpha)
    _, mat_m, mat_h, old_mat_h, error, convergence = tf.while_loop(
        _iter_condition, _iter_body,
        [0, new_mat_m_0, new_mat_h_0, new_mat_h_0, new_error, True])
    error = tf.reduce_max(tf.abs(mat_m - identity))
    is_converged = tf.cast(convergence, old_mat_h.dtype)
    resultant_mat_h = is_converged * mat_h + (1 - is_converged) * old_mat_h
  return resultant_mat_h, error
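
# Hedged sanity check for inlined_matrix_inverse_pth_root: compare G^{-1/2}
# from the Schur-Newton iteration against an eigendecomposition-based
# reference on a random symmetric positive definite matrix. The helper name
# is illustrative.
def _example_inverse_sqrt_check():
  n = 4
  a = tf.random.normal([n, n], dtype=tf.float64)
  mat_g = tf.matmul(a, a, transpose_b=True) + tf.eye(n, dtype=tf.float64)
  mat_h, error = inlined_matrix_inverse_pth_root(mat_g, n, alpha=-0.5)
  # Reference: (G + ridge_epsilon * I)^{-1/2} via eigendecomposition, with
  # the same default ridge_epsilon=1e-6 the function applies internally.
  e, v = tf.linalg.eigh(mat_g + 1e-6 * tf.eye(n, dtype=tf.float64))
  reference = tf.matmul(v * tf.pow(e, -0.5), v, transpose_b=True)
  max_diff = tf.reduce_max(tf.abs(mat_h - reference))
  return max_diff, error  # Expect max_diff on the order of error_tolerance.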
def FProp(self, theta, x, paddings=None, update=False):
  """Computes distances of the given input 'x' to all centroids.

  This implementation applies layer normalization on 'x' internally first,
  and the returned 'dists' is computed using the normalized 'x'.

  Args:
    theta: A `.NestedMap` of weights' values of this layer.
    x: A tensor of shape [B, L, N, H].
    paddings: If not None, a tensor of shape [B, L].
    update: bool, whether to update centroids using x.

  Returns:
    dists: "distances" of the given input 'x' to all centroids. Shape
      [B, L, N, K].
    k_means_loss: the average squared Euclidean distances to the closest
      centroid, a scalar.
  """
  p = self.params
  if paddings is None:
    paddings = tf.zeros_like(x[:, :, 0, 0])
  # Shape [B, L, 1, 1].
  paddings_4d = paddings[:, :, None, None]

  if p.apply_layer_norm:
    x = KMeansClusteringForAtten.LayerNorm(x, p.epsilon)

  # Since 'x' is normalized (but theta.means is not), we use the negative
  # dot product to approximate the Euclidean distance here.
  dists = -tf.einsum('BLNH, NKH -> BLNK', x, theta.means)

  # For padded positions we update the distances to very large numbers.
  very_large_dists = tf.ones_like(dists) * tf.constant(
      0.1, dtype=dists.dtype) * dists.dtype.max
  paddings_tiled = tf.tile(paddings_4d, [1, 1, p.num_heads, p.num_clusters])
  dists = tf.where(paddings_tiled > 0.0, very_large_dists, dists)

  # Shape [B, L, N, K], the same as 'dists' above.
  nearest_one_hot = tf.one_hot(
      tf.math.argmin(dists, axis=-1),
      p.num_clusters,
      dtype=py_utils.FPropDtype(p))
  # Same shape as the input 'x'.
  nearest_centroid = tf.einsum('BLNK, NKH -> BLNH', nearest_one_hot,
                               theta.means)

  diff = tf.math.squared_difference(x, tf.stop_gradient(nearest_centroid))
  diff = py_utils.ApplyPadding(paddings_4d, diff)
  diff = tf.math.reduce_mean(diff, axis=2)
  # The commitment loss, which when backpropagated against encourages the
  # 'x' values to commit to their chosen centroids.
  k_means_loss = tf.math.reduce_sum(diff) / tf.math.reduce_sum(1.0 - paddings)
  summary_utils.scalar('k_means/squared_distance_loss', k_means_loss)

  # TODO(zhouwk): investigate normalizing theta.means after each update.
  # Note: per-centroid norms (axis=-1), so min/mean below are meaningful.
  means_norm = tf.norm(theta.means, axis=-1)
  summary_utils.scalar('k_means/centroid_l2_norm/min',
                       tf.math.reduce_min(means_norm))
  summary_utils.scalar('k_means/centroid_l2_norm/mean',
                       tf.math.reduce_mean(means_norm))

  if not update:
    return dists, k_means_loss

  # To update the centroids (self.vars.means), we apply gradient descent on
  # the mini-batch of input 'x', which yields the following:
  #   new_centroid = centroid + (1 - decay) * (x_mean - centroid)
  # where x_mean is the average over all the input vectors closest to this
  # centroid.
  #
  # Note that this approach is equivalent to backprop via
  #   loss = tf.math.reduce_mean(
  #       tf.math.squared_difference(tf.stop_gradient(x), nearest_centroid))
  # except that here the learning rate is independently set via 'decay'.

  # Ensure that the padded positions are not used to update the centroids.
  nearest_one_hot = py_utils.ApplyPadding(paddings_4d, nearest_one_hot)

  # Sum away batch and sequence length dimensions to get per cluster count.
  # Shape: [N, K]
  per_cluster_count = tf.reduce_sum(nearest_one_hot, axis=[0, 1])
  summary_utils.histogram('k_means/per_cluster_vec_count', per_cluster_count)

  # Sum of the input 'x' per each closest centroid.
  sum_x = tf.einsum('BLNK, BLNH -> NKH', nearest_one_hot, x)

  if py_utils.use_tpu():
    per_cluster_count = tf.tpu.cross_replica_sum(per_cluster_count)
    sum_x = tf.tpu.cross_replica_sum(sum_x)

  # If per_cluster_count for a cluster is 0, then 'nearest_one_hot' in that
  # cluster's position will always be 0, hence 'sum_x' in that dimension
  # will be 0.
  new_means = sum_x / tf.maximum(
      tf.constant(1.0, dtype=per_cluster_count.dtype),
      tf.expand_dims(per_cluster_count, axis=-1))

  # We use an exponential moving average. TODO(zhouwk): investigate
  # smoothing this over an exponentially moving averaged per cluster count.
  #
  # Note that we intentionally do not normalize the means after this update
  # as empirically this works better.
  update_means_diff = tf.cast((1.0 - p.decay) * (new_means - theta.means),
                              self.vars.means.dtype)
  return py_utils.with_dependencies(
      [tf.assign_add(self.vars.means, update_means_diff)],
      dists), k_means_loss
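
# Hedged numeric sketch of the EMA centroid update performed above: with
# decay = 0.999, each step moves a centroid 0.1% of the way toward the mean
# of the vectors currently assigned to it. The helper name and tensors are
# illustrative.
def _ExampleEmaCentroidUpdate():
  decay = 0.999
  centroid = tf.constant([0.0, 0.0])
  assigned_x = tf.constant([[1.0, 1.0], [3.0, 1.0]])  # Members of the cluster.
  x_mean = tf.reduce_mean(assigned_x, axis=0)  # => [2.0, 1.0]
  # Same rule as the assign_add of (1 - decay) * (new_means - theta.means).
  return centroid + (1.0 - decay) * (x_mean - centroid)  # => [0.002, 0.001]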
def CornerLoss(self, gt_bboxes, predicted_bboxes):
  """Corner regularization loss.

  This function computes the corner loss, an alternative regression loss
  for box residuals. This was used in the Frustum-PointNets paper [1]. We
  compute the predicted bboxes (all 8 corners) and compute a SmoothedL1
  loss between the corners of the predicted boxes and ground truth. Hence,
  this loss can help encourage the model to maximize the IoU of the
  predictions.

  [1] Frustum PointNets for 3D Object Detection from RGB-D Data
      https://arxiv.org/pdf/1711.08488.pdf

  TODO(bcyang): support arbitrary input shapes [..., 7].

  Args:
    gt_bboxes: tf.float32 of shape [batch_size, num_centers,
      num_anchor_bboxes_per_center, 7] which contains (x, y, z, dx, dy, dz,
      phi), corresponding to ground truth bbox parameters.
    predicted_bboxes: tf.float32 of same shape as gt_bboxes containing
      predicted bbox parameters.

  Returns:
    tf.float32 Tensor of shape [batch_size, num_centers,
    num_anchor_bboxes_per_center] where each entry contains the corner loss
    for the corresponding bbox.
  """
  batch_size, num_centers, num_anchor_bboxes_per_center = py_utils.GetShape(
      gt_bboxes, 3)
  gt_bboxes = py_utils.HasShape(
      gt_bboxes, [batch_size, num_centers, num_anchor_bboxes_per_center, 7])
  predicted_bboxes = py_utils.HasShape(
      predicted_bboxes,
      [batch_size, num_centers, num_anchor_bboxes_per_center, 7])

  gt_bboxes = tf.reshape(
      gt_bboxes, [batch_size, num_centers * num_anchor_bboxes_per_center, 7])
  predicted_bboxes = tf.reshape(
      predicted_bboxes,
      [batch_size, num_centers * num_anchor_bboxes_per_center, 7])
  rot = tf.constant([[[0., 0., 0., 0., 0., 0., np.pi]]], dtype=tf.float32)
  rotated_gt_bboxes = gt_bboxes + rot

  gt_corners = geometry.BBoxCorners(gt_bboxes)
  rotated_gt_corners = geometry.BBoxCorners(rotated_gt_bboxes)
  predicted_corners = geometry.BBoxCorners(predicted_bboxes)

  corner_dist = tf.norm(predicted_corners - gt_corners, axis=-1)
  rotated_corner_dist = tf.norm(
      predicted_corners - rotated_gt_corners, axis=-1)
  total_dist = tf.reduce_sum(corner_dist, axis=-1)
  rotated_total_dist = tf.reduce_sum(rotated_corner_dist, axis=-1)
  min_dist = tf.minimum(total_dist, rotated_total_dist)

  huber_loss = self.ScaledHuberLoss(
      labels=tf.zeros_like(total_dist), predictions=min_dist)
  huber_loss = tf.reshape(
      huber_loss, [batch_size, num_centers, num_anchor_bboxes_per_center])
  return huber_loss
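
# Hedged usage sketch; 'builder' again stands for an instance of the
# enclosing class (hypothetical name). Unlike the [..., 7] variant above,
# this version requires the fixed rank-4 input shape and sums the corner
# distances *before* applying the Huber loss, whereas the variant above
# applies the Huber loss per corner and then sums:
#
#   gt = tf.random.uniform([2, 32, 4, 7])    # [batch, centers, anchors, 7].
#   pred = gt + 0.01 * tf.random.normal([2, 32, 4, 7])
#   loss = builder.CornerLoss(gt, pred)      # => shape [2, 32, 4].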