def CollectVarHistogram(vs_gs):
  """Adds histogram summaries for variables and gradients."""
  for name, (var, grad) in vs_gs.FlattenItems():
    with tf.device(var.device), tf.name_scope(name + '/summary'):
      if isinstance(grad, tf.IndexedSlices):
        var = tf.gather(var, grad.indices)
        grad = grad.values
      if var.dtype.is_complex:
        var = tf.abs(var)
        grad = tf.abs(grad)
      histogram('var_hist/' + name, var)
      histogram('grad_hist/' + name, grad)
def _MelSpectrogram(self, signal):
  """Computes the mel spectrogram from a waveform signal.

  Args:
    signal: f32 Tensor, shaped [batch_size, num_samples]

  Returns:
    f32 features Tensor, shaped [batch_size, num_frames, mel_channels]
  """
  p = self.params
  # FFT.
  real_frequency_spectrogram = tf.signal.rfft(signal, [self._fft_size])
  magnitude_spectrogram = tf.abs(real_frequency_spectrogram)
  # Shape of magnitude_spectrogram is num_frames x (fft_size/2+1).
  # mel_weight_matrix is [num_spectrogram_bins, num_mel_bins].
  mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
      num_mel_bins=p.num_bins,
      num_spectrogram_bins=self._fft_size // 2 + 1,
      sample_rate=p.sample_rate,
      lower_edge_hertz=p.lower_edge_hertz,
      upper_edge_hertz=p.upper_edge_hertz,
      dtype=tf.float32)
  # Weight matrix applied in the magnitude domain.
  batch_size, num_frames, fft_channels = py_utils.GetShape(
      magnitude_spectrogram, 3)
  mel_spectrogram = tf.matmul(
      tf.reshape(magnitude_spectrogram,
                 [batch_size * num_frames, fft_channels]),
      mel_weight_matrix)
  mel_spectrogram = tf.reshape(mel_spectrogram,
                               [batch_size, num_frames, p.num_bins])
  return mel_spectrogram
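# A minimal standalone sketch of the same magnitude-spectrogram -> mel pipeline
# using tf.signal directly, without the params object or the framing done by
# the surrounding class. The helper name `_mel_spectrogram_sketch` and the
# sample rate, frame sizes, and bin counts below are illustrative assumptions,
# not the values used by the class above.
import tensorflow as tf


def _mel_spectrogram_sketch(waveform, sample_rate=16000.0):
  # Frame and window the waveform, then take the magnitude of the FFT.
  stft = tf.signal.stft(
      waveform, frame_length=400, frame_step=160, fft_length=512)
  magnitude = tf.abs(stft)  # [batch, num_frames, 512 // 2 + 1]
  # Project the linear-frequency bins onto 80 mel bins.
  mel_matrix = tf.signal.linear_to_mel_weight_matrix(
      num_mel_bins=80,
      num_spectrogram_bins=512 // 2 + 1,
      sample_rate=sample_rate,
      lower_edge_hertz=125.0,
      upper_edge_hertz=7600.0)
  return tf.tensordot(magnitude, mel_matrix, 1)  # [batch, num_frames, 80]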
def _iter_body(i, mat_m, mat_h, unused_old_mat_h, error, unused_run_step):
  mat_m_i = (1 - alpha) * identity + alpha * mat_m
  new_mat_m = tf.matmul(mat_power(mat_m_i, exponent), mat_m)
  new_mat_h = tf.matmul(mat_h, mat_m_i)
  new_error = tf.reduce_max(tf.abs(new_mat_m - identity))
  return (i + 1, new_mat_m, new_mat_h, mat_h, new_error, new_error < error)
def _SampleGumbelWithMax(phi, target_max, batch_seed, time_step, src_ids,
                         src_paddings):
  """Samples a set of Gumbel noises with a specified maximum value.

  A set of values are sampled from Gumbel distributions with location
  parameters `phi` under the condition that their maximum is equal to
  `target_max`. The numerically stable implementation from Appendix B.3 of
  https://arxiv.org/pdf/1903.06059.pdf is used.

  Args:
    phi: A float tensor of shape [tgt_batch, k] that represents location
      parameters of Gumbel distributions.
    target_max: A float tensor of shape [tgt_batch, 1] that represents the
      target max values.
    batch_seed: An int tensor of shape [src_batch] that holds a seed value for
      each batch item. src_batch must be equal to tgt_batch /
      num_hyps_per_beam. The same seed is used within each consecutive
      num_hyps_per_beam items along the tgt_batch axis.
    time_step: A float tensor used as a secondary seed.
    src_ids: An int tensor of shape [src_batch, src_seq] that represents source
      IDs. Used for turning the random seed into a function of source IDs.
    src_paddings: A 0/1 float tensor of shape [src_batch, src_seq] where 1
      means that the corresponding element of src_ids is a padding.

  Returns:
    A float tensor like `phi` whose maximum values along the second axis are
    (almost) equal to `target_max`.
  """
  dtype = phi.dtype
  tgt_batch = tf.shape(phi)[0]
  k = tf.shape(phi)[1]
  src_batch = tf.shape(batch_seed)[0]
  num_hyps_per_beam = tgt_batch // src_batch

  # Sample noises from Gumbel distributions with location parameters `phi`.
  # shape: [src_batch, num_hyps_per_beam, k]
  gumbel_noises = _BatchSampleGumbel(batch_seed, time_step, src_ids,
                                     src_paddings, [num_hyps_per_beam, k],
                                     dtype)
  # shape: [num_hyps_per_beam, src_batch, k]
  gumbel_noises = tf.transpose(gumbel_noises, perm=[1, 0, 2])
  # shape: [tgt_batch, k]
  gumbel_noises = tf.reshape(gumbel_noises, tf.shape(phi))

  # shape: [tgt_batch, k]
  g_phi = phi + gumbel_noises
  # shape: [tgt_batch, 1]
  z = tf.reduce_max(g_phi, axis=1, keepdims=True)
  # Equation (23).
  # shape: [tgt_batch, k]
  v = target_max - g_phi + tf.math.log1p(
      # Without taking max, sometimes the result of log1p would become NaN on
      # TPU.
      tf.maximum(-tf.exp(g_phi - z), tf.constant(-1., dtype=dtype)))
  # Equation (24).
  return target_max - tf.nn.relu(v) - tf.math.log1p(tf.exp(-tf.abs(v)))
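# A small NumPy sketch of the truncation trick in equations (23)-(24): given
# Gumbel samples with location `phi`, shift them so that their row-wise
# maximum equals `target_max`. NumPy's Gumbel sampler stands in for the seeded
# _BatchSampleGumbel helper above; the function name and toy inputs are
# illustrative assumptions.
import numpy as np


def sample_gumbel_with_max_sketch(phi, target_max, rng):
  g_phi = phi + rng.gumbel(size=phi.shape)  # Gumbel(phi) samples.
  z = g_phi.max(axis=1, keepdims=True)      # Current row-wise maxima.
  with np.errstate(divide='ignore'):        # log1p(-1) = -inf at the argmax.
    v = target_max - g_phi + np.log1p(-np.exp(g_phi - z))  # Eq. (23).
  # Eq. (24); the argmax entry maps exactly to target_max.
  return target_max - np.maximum(v, 0.0) - np.log1p(np.exp(-np.abs(v)))


rng = np.random.default_rng(0)
phi = rng.normal(size=(3, 5))
target_max = np.full((3, 1), 2.0)
out = sample_gumbel_with_max_sketch(phi, target_max, rng)
assert np.allclose(out.max(axis=1, keepdims=True), target_max)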
def CollectVarHistogram(vs_gs):
  """Adds histogram summaries for variables and gradients."""
  for name, (var, grad) in vs_gs.FlattenItems():
    name = py_utils.SanitizeScopeKey(name)
    with tf.device(var.device), tf.name_scope(name + '/summary'):
      if isinstance(grad, tf.IndexedSlices):
        var = tf.gather(var, grad.indices)
        grad = grad.values
      if var.dtype.is_complex:
        var = tf.abs(var)
        grad = tf.abs(grad)
      if py_utils.IsEagerMode():
        histogram_v2(f'var_hist/{name}', var)
        histogram_v2(f'grad_hist/{name}', grad)
      else:
        histogram(f'var_hist/{name}', var)
        histogram(f'grad_hist/{name}', grad)
def _generalized_inverse_pth_root(self, input_t, exponent, epsilon=1e-12):
  input_t_f64 = tf.cast(input_t, tf.float64)
  s, u, v = tf.linalg.svd(
      input_t_f64 +
      tf.eye(tf.shape(input_t_f64)[0], dtype=tf.float64) * epsilon,
      full_matrices=True)
  inv_s = tf.reshape(
      tf.pow(tf.maximum(s, epsilon), tf.cast(exponent, tf.float64)), [1, -1])
  val = tf.matmul(u * inv_s, v, adjoint_b=True)
  return tf.cast(val, tf.float32), tf.reduce_max(tf.abs(u - v))
def inverse_pth_root(self, input_t, exponent, epsilon=1e-12):
  input_t_f64 = tf.cast(input_t, tf.float64)
  s, u, v = tf.linalg.svd(
      input_t_f64 +
      tf.eye(tf.shape(input_t_f64)[0], dtype=tf.float64) * epsilon,
      full_matrices=True)
  val = tf.matmul(
      tf.matmul(
          u,
          tf.linalg.tensor_diag(
              tf.pow(tf.maximum(s, epsilon), tf.cast(exponent, tf.float64)))),
      tf.transpose(v))
  return tf.cast(val, tf.float32), tf.reduce_max(tf.abs(u - v))
def compute_relative_changes(eps, u, v, w, new_eps, new_u, new_v, new_w):
  prev_sum_uvw = tf.stop_gradient((u + v + w) / eps)
  sum_uvw = tf.stop_gradient((new_u + new_v + new_w) / new_eps)

  # Compute the relative changes on margins of P.
  # This will be used for stopping criteria.
  # Note the last update on w would guarantee the margin constraint c is
  # satisfied, so we don't need to check it here.
  p = tf.exp(tf.stop_gradient(score_ / new_eps + sum_uvw))
  p_a = tf.reduce_sum(p, axis=-1, keepdims=True)
  p_b = tf.reduce_sum(p, axis=-2, keepdims=True)
  delta_a = tf.abs(a - p_a) / (a + 1e-6)
  delta_b = tf.abs(b - p_b) / (b + 1e-6)
  new_delta = tf.reduce_max(delta_a)
  new_delta = tf.maximum(new_delta, tf.reduce_max(delta_b))

  # Compute the relative changes on assignment solution P.
  # This will be used for stopping criteria.
  delta_p = tf.abs(tf.exp(prev_sum_uvw) -
                   tf.exp(sum_uvw)) / (tf.exp(sum_uvw) + 1e-6)
  new_delta = tf.maximum(new_delta, tf.reduce_max(delta_p))
  return new_delta
def _testGradDrop(self, graddrop_params):
  batch_size, dims = 4, 5
  gd_layer = graddrop_params.Set(name='test_gd_layer').Instantiate()
  linear_layer = builder_layers.LinearLayer.Params().Set(
      name='test_linear_layer', input_dims=dims,
      output_dims=dims).Instantiate()

  x = tf.random.uniform((batch_size, dims))
  x = linear_layer.FPropDefaultTheta(x)

  # Make a copy of x after graddrop.
  x_gd = gd_layer.FPropDefaultTheta(x)

  # Compute a loss based on graddrop's version of x.
  gd_loss_0 = tf.reduce_sum(x_gd**2)
  gd_loss_1 = tf.reduce_sum(-tf.abs(x_gd))
  gd_layer.SetLosses([
      (gd_loss_0, 0.1),
      (gd_loss_1, 0.2),
  ])
  gd_total_loss = gd_loss_0 + gd_loss_1
  gd_grad = tf.gradients(gd_total_loss, x)

  # Compute the same loss based on the regular version of x.
  loss_0 = tf.reduce_sum(x**2)
  loss_1 = tf.reduce_sum(-tf.abs(x))
  total_loss = loss_0 + loss_1
  grad = tf.gradients(total_loss, x)

  with self.session() as sess:
    sess.run(tf.global_variables_initializer())
    actual_total_loss, actual_grad, actual_gd_total_loss, actual_gd_grad = (
        sess.run([total_loss, grad, gd_total_loss, gd_grad]))

    # Verify that losses are similar, but the gradients are different.
    self.assertAllClose(actual_total_loss, actual_gd_total_loss)
    self.assertNotAllClose(actual_grad, actual_gd_grad)
def _update_mask(self, weights, threshold):
  """Updates the mask for a given weight tensor.

  This function first computes the CDF of the weight tensor, and estimates
  the threshold value such that a 'desired_sparsity' fraction of weights has
  magnitude less than the threshold.

  Args:
    weights: The weight tensor that needs to be masked.
    threshold: The current threshold value. The function will compute a new
      threshold and return the exponential moving average using the current
      value of threshold.

  Returns:
    new_threshold: The new value of the threshold based on weights, and
      sparsity at the current global_step.
    new_mask: A tensor of the same size and shape as weights containing 0 or 1
      to indicate which of the values in weights falls below the threshold.

  Raises:
    ValueError: if sparsity is not defined.
  """
  if self._sparsity is None:
    raise ValueError('Sparsity variable undefined')

  sparsity = self._get_sparsity(weights.op.name)
  with tf.name_scope(weights.op.name + '_pruning_ops'):
    abs_weights = tf.abs(weights)
    k = tf.cast(
        tf.round(tf.cast(tf.size(abs_weights), tf.float32) * (1 - sparsity)),
        tf.int32)
    # Sort the entire array.
    values, _ = tf.nn.top_k(
        tf.reshape(abs_weights, [-1]), k=tf.size(abs_weights))
    # Grab the (k-1)th value.
    current_threshold = tf.gather(values, k - 1)
    smoothed_threshold = tf.add_n([
        tf.multiply(current_threshold, 1 - self._spec.threshold_decay),
        tf.multiply(threshold, self._spec.threshold_decay)
    ])

    new_mask = tf.cast(
        tf.greater_equal(abs_weights, smoothed_threshold), tf.float32)
  return smoothed_threshold, new_mask
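# A NumPy sketch of the threshold selection above (without the exponential
# moving average): keep the (1 - sparsity) fraction of weights with the
# largest magnitude and zero out the rest. The toy weights and the sparsity
# value are illustrative assumptions.
import numpy as np

weights = np.array([[0.3, -0.05, 0.8], [-0.6, 0.02, -0.1]])
sparsity = 0.5
abs_w = np.abs(weights)
k = int(round(abs_w.size * (1 - sparsity)))          # Number of survivors.
threshold = np.sort(abs_w, axis=None)[::-1][k - 1]   # (k-1)th largest |w|.
mask = (abs_w >= threshold).astype(np.float32)
# mask keeps the k = 3 largest-magnitude entries: 0.8, -0.6 and 0.3.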
def testGradDropSetLossesTwiceRaisesError(self):
  batch_size, dims = 4, 5
  gd_layer = graddrop.GradDrop.Params().Set(
      name='test_gd_layer').Instantiate()
  x = tf.random.uniform((batch_size, dims))
  x_gd = gd_layer.FPropDefaultTheta(x)
  gd_loss_0 = tf.reduce_sum(x_gd**2)
  gd_loss_1 = tf.reduce_sum(-tf.abs(x_gd))
  gd_layer.SetLosses([
      (gd_loss_0, 0.1),
      (gd_loss_1, 0.2),
  ])
  with self.assertRaisesRegex(ValueError, r'.*Losses already set.*'):
    gd_layer.SetLosses([
        (gd_loss_0, 0.1),
        (gd_loss_1, 0.2),
    ])
def _BBoxArea(bbox):
  """Computes the area of a 2-d bbox.

  Vertices must be ordered clockwise or counter-clockwise. This function can
  technically handle any kind of convex polygon.

  Args:
    bbox: a float Tensor of shape [..., 4, 2] of bboxes. The last two
      dimensions hold the four (x, y) corners of the bbox, given in
      counter-clockwise order.

  Returns:
    Area of the bbox. Tensor of shape [..., 1].
  """
  bbox_roll = tf.roll(bbox, shift=1, axis=-2)
  det = tf.reduce_sum(
      bbox[..., 0] * bbox_roll[..., 1] - bbox[..., 1] * bbox_roll[..., 0],
      axis=-1,
      keepdims=True) / 2.0
  return tf.abs(det)
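# A quick NumPy sanity check of the shoelace formula used above: a 2x3
# axis-aligned rectangle has area 6; the absolute value makes the result
# independent of the winding direction. The toy corners are illustrative.
import numpy as np

rect = np.array([[0., 0.], [2., 0.], [2., 3.], [0., 3.]])  # [4, 2] corners.
rolled = np.roll(rect, shift=1, axis=0)
area = np.abs(np.sum(rect[:, 0] * rolled[:, 1] -
                     rect[:, 1] * rolled[:, 0])) / 2.0
assert area == 6.0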
def inlined_matrix_inverse_pth_root(mat_g,
                                    mat_g_size,
                                    alpha,
                                    iter_count=100,
                                    error_tolerance=1e-6,
                                    ridge_epsilon=1e-6):
  """Computes mat_g^alpha, where alpha = -1/p, p is one of 2, 4, or 8.

  We use an iterative Schur-Newton method from equation 3.2 on page 9 of:

  A Schur-Newton Method for the Matrix p-th Root and its Inverse
  by Chun-Hua Guo and Nicholas J. Higham
  SIAM Journal on Matrix Analysis and Applications,
  2006, Vol. 28, No. 3 : pp. 788-804
  https://pdfs.semanticscholar.org/0abe/7f77433cf5908bfe2b79aa91af881da83858.pdf

  Args:
    mat_g: the symmetric PSD matrix whose power is to be computed.
    mat_g_size: size of mat_g.
    alpha: exponent, must be -1/p for p a positive integer.
    iter_count: Maximum number of iterations.
    error_tolerance: Error indicator, useful for early termination.
    ridge_epsilon: Ridge epsilon added to make the matrix positive definite.

  Returns:
    mat_g^alpha
  """
  alpha = tf.cast(alpha, tf.float64)
  neg_alpha = -1.0 * alpha
  exponent = 1.0 / neg_alpha
  identity = tf.eye(tf.cast(mat_g_size, tf.int32), dtype=tf.float64)

  def _unrolled_mat_pow_2(mat_m):
    """Computes mat_m^2."""
    return tf.matmul(mat_m, mat_m)

  def _unrolled_mat_pow_4(mat_m):
    """Computes mat_m^4."""
    mat_pow_2 = _unrolled_mat_pow_2(mat_m)
    return tf.matmul(mat_pow_2, mat_pow_2)

  def _unrolled_mat_pow_8(mat_m):
    """Computes mat_m^8."""
    mat_pow_4 = _unrolled_mat_pow_4(mat_m)
    return tf.matmul(mat_pow_4, mat_pow_4)

  def mat_power(mat_m, p):
    """Computes mat_m^p, for p == 2 or 4 or 8.

    Args:
      mat_m: a square matrix
      p: a positive integer

    Returns:
      mat_m^p
    """
    branch_index = tf.cast(p / 2 - 1, tf.int32)
    return tf.switch_case(
        branch_index, {
            0: functools.partial(_unrolled_mat_pow_2, mat_m),
            1: functools.partial(_unrolled_mat_pow_4, mat_m),
            2: functools.partial(_unrolled_mat_pow_8, mat_m),
        })

  def _iter_condition(i, unused_mat_m, unused_mat_h, unused_old_mat_h, error,
                      run_step):
    return tf.math.logical_and(
        tf.math.logical_and(i < iter_count, error > error_tolerance),
        run_step)

  def _iter_body(i, mat_m, mat_h, unused_old_mat_h, error, unused_run_step):
    mat_m_i = (1 - alpha) * identity + alpha * mat_m
    new_mat_m = tf.matmul(mat_power(mat_m_i, exponent), mat_m)
    new_mat_h = tf.matmul(mat_h, mat_m_i)
    new_error = tf.reduce_max(tf.abs(new_mat_m - identity))
    return (i + 1, new_mat_m, new_mat_h, mat_h, new_error, new_error < error)

  if mat_g_size == 1:
    mat_h = tf.pow(mat_g + ridge_epsilon, alpha)
  else:
    damped_mat_g = mat_g + ridge_epsilon * identity
    z = (1 - 1 / alpha) / (2 * tf.norm(damped_mat_g))
    # The best value for z is
    #   (1 - 1/alpha) * (c_max^{-alpha} - c_min^{-alpha}) /
    #                   (c_max^{1-alpha} - c_min^{1-alpha})
    # where c_max and c_min are the largest and smallest singular values of
    # damped_mat_g.
    # The above estimate assumes that c_max > c_min * 2^p (p = -1/alpha).
    # Can replace the above line by the one below, but it is less accurate,
    # hence needs more iterations to converge:
    #   z = (1 - 1/alpha) / tf.trace(damped_mat_g)
    # If we want the method to always converge, use z = 1 / norm(damped_mat_g)
    # or z = 1 / tf.trace(damped_mat_g), but these can result in many extra
    # iterations.
    new_mat_m_0 = damped_mat_g * z
    new_error = tf.reduce_max(tf.abs(new_mat_m_0 - identity))
    new_mat_h_0 = identity * tf.pow(z, neg_alpha)
    _, mat_m, mat_h, old_mat_h, error, convergence = tf.while_loop(
        _iter_condition, _iter_body,
        [0, new_mat_m_0, new_mat_h_0, new_mat_h_0, new_error, True])
    error = tf.reduce_max(tf.abs(mat_m - identity))
    is_converged = tf.cast(convergence, old_mat_h.dtype)
    resultant_mat_h = is_converged * mat_h + (1 - is_converged) * old_mat_h
  return resultant_mat_h, error
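# A hedged usage sketch of the Schur-Newton routine above: on a small,
# well-conditioned SPD matrix the result should match the eigendecomposition-
# based inverse 4th root up to a loose tolerance. Assumes TF2 eager execution
# and that `functools`/`tensorflow` are imported at module level as in the
# original file; the matrix, ridge, and tolerances are illustrative.
import numpy as np
import tensorflow as tf

rng = np.random.default_rng(0)
a = rng.normal(size=(6, 6))
mat_g = tf.constant(a @ a.T + 0.1 * np.eye(6), dtype=tf.float64)  # SPD.

approx, error = inlined_matrix_inverse_pth_root(mat_g, 6, alpha=-1.0 / 4)

# Reference: G^{-1/4} via eigendecomposition.
eigvals, eigvecs = np.linalg.eigh(mat_g.numpy())
reference = eigvecs @ np.diag(eigvals**-0.25) @ eigvecs.T
np.testing.assert_allclose(approx.numpy(), reference, rtol=1e-2, atol=1e-2)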
def max_assignment(score: tf.Tensor,
                   *,
                   elementwise_upper_bound: tf.Tensor,
                   row_sums: tf.Tensor,
                   col_sums: tf.Tensor,
                   epsilon: float = 0.1,
                   num_iterations: int = 50,
                   use_epsilon_scaling: bool = True):
  """Differentiable max assignment with margin and upper bound constraints.

  Args:
    score: a 3D tensor of size [batch_size, n_rows, n_columns]. score[i, j, k]
      denotes the weight if the assignment on this entry is non-zero.
    elementwise_upper_bound: a 3D tensor of size [batch_size, n_rows,
      n_columns]. Each entry denotes the maximum value assignment[i, j, k] can
      take and must be a non-negative value. For example,
      upper_bound[i, j, k]=1.0 for a binary assignment problem.
    row_sums: a 2D tensor of size [batch_size, n_rows]. The row sum
      constraint. The output assignment p[i, j, :] must sum to row_sums[i, j].
    col_sums: a 2D tensor of size [batch_size, n_columns]. The column sum
      constraint. The output assignment p[i, :, k] must sum to col_sums[i, k].
    epsilon: the epsilon coefficient of entropy regularization. The value
      should be within the range (0, 1]. `0.01` might work better than `0.1`;
      `0.1` may not make the assignment close enough to 0 or 1.
    num_iterations: the maximum number of iterations to perform.
    use_epsilon_scaling: whether to use epsilon scaling. In practice, the
      convergence of the iterative algorithm is much better if we start by
      solving the optimization with a larger epsilon value and re-use the
      solution (i.e. dual variables) for the instance with a smaller epsilon.
      This is called the epsilon scaling trick. See [Schmitzer 2019]
      (https://arxiv.org/pdf/1610.06519.pdf) as a reference. Here if
      use_epsilon_scaling=True, after each iteration we decrease the running
      epsilon by a constant factor until it reaches the target epsilon value.
      We found this to work well for gradient backward propagation, while the
      original scaling trick doesn't.

  Returns:
    A tuple with the following values.
      - assignment: a 3D tensor of size [batch_size, n_rows, n_columns]. The
        output assignment.
      - used_iter: a scalar tensor indicating the number of iterations used.
      - eps: a scalar tensor indicating the stopping epsilon value.
      - delta: a scalar tensor indicating the stopping delta value (the
        relative change on the margins of assignment p in the last iteration).
  """
  # Check if all shapes are correct.
  score_shape = score.shape
  bsz = score_shape[0]
  n = score_shape[1]
  m = score_shape[2]
  score = tf.ensure_shape(score, [bsz, n, m])
  elementwise_upper_bound = tf.ensure_shape(elementwise_upper_bound,
                                            [bsz, n, m])
  row_sums = tf.ensure_shape(tf.expand_dims(row_sums, axis=2), [bsz, n, 1])
  col_sums = tf.ensure_shape(tf.expand_dims(col_sums, axis=1), [bsz, 1, m])

  # The total sum of row sums must be equal to the total sum of column sums.
  sum_diff = tf.reduce_sum(row_sums, axis=1) - tf.reduce_sum(col_sums, axis=2)
  sum_diff = tf.abs(sum_diff)
  tf.Assert(tf.reduce_all(sum_diff < 1e-6), [sum_diff])

  # Convert the upper_bound constraint into another margin constraint
  # by adding auxiliary variables & scores. Tensors `a`, `b` and `c`
  # represent the margins (i.e. reduced sums) of the 3 axes respectively.
  #
  max_row_sums = tf.reduce_sum(elementwise_upper_bound, axis=-1, keepdims=True)
  max_col_sums = tf.reduce_sum(elementwise_upper_bound, axis=-2, keepdims=True)
  score_ = tf.stack([score, tf.zeros_like(score)], axis=1)  # (bsz, 2, n, m)
  a = tf.stack([row_sums, max_row_sums - row_sums], axis=1)  # (bsz, 2, n, 1)
  b = tf.stack([col_sums, max_col_sums - col_sums], axis=1)  # (bsz, 2, 1, m)
  c = tf.expand_dims(elementwise_upper_bound, axis=1)  # (bsz, 1, n, m)

  # Clip log(0) to a large negative value -1e+36 to avoid getting inf or NaN
  # values in computation. Cannot use larger values because float32 would
  # produce `-inf` automatically.
  #
  tf.Assert(tf.reduce_all(a >= 0), [a])
  tf.Assert(tf.reduce_all(b >= 0), [b])
  tf.Assert(tf.reduce_all(c >= 0), [c])
  log_a = tf.maximum(tf.math.log(a), -1e+36)
  log_b = tf.maximum(tf.math.log(b), -1e+36)
  log_c = tf.maximum(tf.math.log(c), -1e+36)

  # Initialize the dual variables of the margin constraints.
  u = tf.zeros_like(a)
  v = tf.zeros_like(b)
  w = tf.zeros_like(c)

  eps = tf.constant(1.0 if use_epsilon_scaling else epsilon,
                    dtype=score.dtype)
  epsilon = tf.constant(epsilon, dtype=score.dtype)

  def do_updates(cur_iter, eps, u, v, w):  # pylint: disable=unused-argument
    # Epsilon scaling, i.e. gradually decreasing `eps` until it reaches the
    # target `epsilon` value.
    cur_iter = tf.cast(cur_iter, u.dtype)
    scaling = tf.minimum(0.6 * 1.04**cur_iter, 0.85)
    eps = tf.maximum(epsilon, eps * scaling)
    score_div_eps = score_ / eps

    # Update u.
    log_q_1 = score_div_eps + (w + v) / eps
    log_q_1 = tf.reduce_logsumexp(log_q_1, axis=-1, keepdims=True)
    new_u = (log_a - tf.maximum(log_q_1, -1e+30)) * eps

    # Update v.
    log_q_2 = score_div_eps + (w + new_u) / eps
    log_q_2 = tf.reduce_logsumexp(log_q_2, axis=-2, keepdims=True)
    new_v = (log_b - tf.maximum(log_q_2, -1e+30)) * eps

    # Update w.
    log_q_3 = score_div_eps + (new_u + new_v) / eps
    log_q_3 = tf.reduce_logsumexp(log_q_3, axis=-3, keepdims=True)
    new_w = (log_c - tf.maximum(log_q_3, -1e+30)) * eps
    return eps, new_u, new_v, new_w

  def compute_relative_changes(eps, u, v, w, new_eps, new_u, new_v, new_w):
    prev_sum_uvw = tf.stop_gradient((u + v + w) / eps)
    sum_uvw = tf.stop_gradient((new_u + new_v + new_w) / new_eps)

    # Compute the relative changes on the margins of P.
    # This will be used for the stopping criteria.
    # Note the last update on w guarantees that the margin constraint c is
    # satisfied, so we don't need to check it here.
    p = tf.exp(tf.stop_gradient(score_ / new_eps + sum_uvw))
    p_a = tf.reduce_sum(p, axis=-1, keepdims=True)
    p_b = tf.reduce_sum(p, axis=-2, keepdims=True)
    delta_a = tf.abs(a - p_a) / (a + 1e-6)
    delta_b = tf.abs(b - p_b) / (b + 1e-6)
    new_delta = tf.reduce_max(delta_a)
    new_delta = tf.maximum(new_delta, tf.reduce_max(delta_b))

    # Compute the relative changes on the assignment solution P.
    # This will be used for the stopping criteria.
    delta_p = tf.abs(tf.exp(prev_sum_uvw) -
                     tf.exp(sum_uvw)) / (tf.exp(sum_uvw) + 1e-6)
    new_delta = tf.maximum(new_delta, tf.reduce_max(delta_p))
    return new_delta

  for cur_iter in tf.range(num_iterations):
    prev_eps, prev_u, prev_v, prev_w = eps, u, v, w
    eps, u, v, w = do_updates(cur_iter, eps, u, v, w)
    delta = compute_relative_changes(prev_eps, prev_u, prev_v, prev_w, eps, u,
                                     v, w)
  cur_iter = num_iterations
  assignment = tf.exp((score_ + u + v + w) / eps)
  assignment = assignment[:, 0]
  return assignment, cur_iter, eps, delta
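# A hedged usage sketch of max_assignment on a toy 2x2 binary assignment
# problem: the higher scores sit on the diagonal, so with unit row/column sums
# the relaxed assignment should concentrate near the identity permutation.
# The epsilon, iteration count and tolerance are illustrative choices and
# assume the iterations converge for this small instance.
import numpy as np
import tensorflow as tf

score = tf.constant([[[5.0, 1.0],
                      [1.0, 5.0]]])          # [1, 2, 2]
upper_bound = tf.ones_like(score)            # Binary assignment.
row_sums = tf.constant([[1.0, 1.0]])         # [1, 2]
col_sums = tf.constant([[1.0, 1.0]])         # [1, 2]

assignment, used_iter, eps, delta = max_assignment(
    score,
    elementwise_upper_bound=upper_bound,
    row_sums=row_sums,
    col_sums=col_sums,
    epsilon=0.01,
    num_iterations=200)
np.testing.assert_allclose(assignment.numpy(), np.eye(2)[None], atol=0.05)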
def _SmoothL1Norm(a):
  """Smoothed L1 norm."""
  # F&F paper formula (3).
  # http://openaccess.thecvf.com/content_cvpr_2018/papers/Luo_Fast_and_Furious_CVPR_2018_paper.pdf
  return tf.where(tf.abs(a) < 1, 0.5 * tf.square(a), tf.abs(a) - 0.5)
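# A quick NumPy check of the Huber-style piecewise form above: below |a| = 1
# the loss is quadratic (0.5 * a^2), above it is linear (|a| - 0.5), and the
# two pieces meet at |a| = 1 with value 0.5. The sample points are illustrative.
import numpy as np

a = np.array([-2.0, -1.0, -0.5, 0.0, 0.5, 1.0, 2.0])
smooth_l1 = np.where(np.abs(a) < 1, 0.5 * np.square(a), np.abs(a) - 0.5)
np.testing.assert_allclose(smooth_l1, [1.5, 0.5, 0.125, 0.0, 0.125, 0.5, 1.5])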
def _Gradient(inputs, _, original_grad):
  # Compute the gradients for each loss w.r.t. the inputs.
  # TODO(jngiam): Look into whether TF dedups this computation.
  per_loss_grads = []
  for loss, _ in self._losses:
    per_loss_grad = tf.gradients(loss, self._output_tensor)[0]
    if per_loss_grad is None:
      tf.logging.warning(
          'Loss %s did not result in a gradient during '
          'GradDrop computation.', loss)
    else:
      per_loss_grads.append(per_loss_grad)

  if not per_loss_grads:
    raise ValueError('No valid gradients for GradDrop.')

  # Multiply the gradients with the inputs.
  grads = per_loss_grads
  if p.use_input_sign_only:
    input_abs = tf.abs(
        tf.cast(tf.abs(inputs) <= p.epsilon, tf.float32) + inputs)
    grads = [grad * ((inputs) / (input_abs)) for grad in grads]
  else:
    grads = [grad * inputs for grad in grads]

  # Sum gradient over batch, assuming that batch is always on dim 0.
  if p.marginalize_batch_dim:
    grads = [tf.reduce_sum(grad, axis=0, keepdims=True) for grad in grads]

  # First discretize all gradients into their sign values.
  grad_sign_positive = [tf.cast(grad > 0.0, tf.float32) for grad in grads]
  grad_sign_negative = [tf.cast(grad < 0.0, tf.float32) for grad in grads]

  # Calculate the probability of positive gradients based on equation (1)
  # in the GradDrop paper.
  grad_abs_sum = tf.add_n([tf.abs(grad) for grad in grads])
  prob_pos = (tf.add_n(grads) / (2. * grad_abs_sum + p.epsilon))
  # Implementation of different scales for the keep function. Larger
  # scales result in steeper keep functions.
  prob_pos *= p.keep_prob_function_scale

  if p.keep_prob_function == 'sigmoid':
    # Standard sigmoid has derivative of 0.25 at 0 so the factor of 4.0
    # allows the function scale in sigmoid to be compatible with the
    # function scale in the linear case.
    prob_pos = tf.sigmoid(4.0 * prob_pos)
  elif p.keep_prob_function == 'linear':
    prob_pos += 0.5

  # The main, default mode of GradDrop. Only gradients of one sign are kept,
  # and which sign is calculated via equation (1) of the main paper.
  prob_pos = tf.cast(prob_pos >= tf.random.uniform(prob_pos.shape),
                     tf.float32) - 0.5
  grad_masks = [
      (gsp - gsn) * prob_pos >= 0
      for (gsn, gsp) in zip(grad_sign_negative, grad_sign_positive)
  ]

  # This diag value gives us the percentage of grads which are kept.
  gradmask_diag = [tf.cast(gm, tf.float32) for gm in grad_masks]
  diag = tf.reduce_mean(tf.add_n(gradmask_diag) / len(grad_masks))
  summary_utils.scalar('average_grad_mask', diag)

  leak_ratios = [leak_ratio for _, leak_ratio in self._losses]
  transformed_per_loss_grads = [
      grad * (leak + (1.0 - leak) * tf.cast(grad_mask, tf.float32))
      for (leak, grad, grad_mask
          ) in zip(leak_ratios, per_loss_grads, grad_masks)
  ]

  transformed_grad = tf.cast(
      tf.add_n(transformed_per_loss_grads), original_grad.dtype)

  if not p.keep_gradnorm_constant:
    return transformed_grad

  transformed_grad_norm = tf.sqrt(tf.reduce_sum(transformed_grad**2))
  original_grad_norm = tf.sqrt(tf.reduce_sum(original_grad**2))
  return transformed_grad * original_grad_norm / (
      transformed_grad_norm + p.epsilon)
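# A toy NumPy illustration of equation (1) as used above with the 'linear'
# keep function: when the per-loss gradients agree in sign the keep
# probability for that sign approaches 1, and when they cancel exactly it is
# 0.5. The gradient values and the small epsilon are made up for illustration.
import numpy as np

grads = [np.array([0.4, -0.3]), np.array([0.6, 0.3])]   # Two losses.
grad_abs_sum = sum(np.abs(g) for g in grads)
prob_pos = sum(grads) / (2.0 * grad_abs_sum + 1e-8) + 0.5
# First coordinate: both gradients positive -> prob_pos ~ 1.0 (keep positive).
# Second coordinate: gradients cancel exactly -> prob_pos = 0.5 (coin flip).
np.testing.assert_allclose(prob_pos, [1.0, 0.5], atol=1e-6)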
def _maybe_update_block_mask(self, weights, threshold):
  """Performs block-granular masking of the weights.

  Block pruning occurs only if the block_height or block_width is > 1 and
  if the weight tensor, when squeezed, has ndims = 2. Otherwise, elementwise
  pruning occurs.

  Args:
    weights: The weight tensor that needs to be masked.
    threshold: The current threshold value. The function will compute a new
      threshold and return the exponential moving average using the current
      value of threshold.

  Returns:
    new_threshold: The new value of the threshold based on weights, and
      sparsity at the current global_step.
    new_mask: A tensor of the same size and shape as weights containing 0 or 1
      to indicate which of the values in weights falls below the threshold.

  Raises:
    ValueError: if the block pooling function is not AVG or MAX.
  """
  block_dims = self._get_block_dims(weights.op.name)
  squeezed_weights = tf.squeeze(weights)
  if squeezed_weights.get_shape().ndims != 2 or block_dims == [1, 1]:
    return self._update_mask(weights, threshold)

  for i in range(2):
    if block_dims[i] == -1:
      block_dims[i] = squeezed_weights.get_shape()[i]

  if self._block_pooling_function not in ['AVG', 'MAX']:
    raise ValueError('Unknown pooling function for block sparsity: %s' %
                     self._block_pooling_function)

  with tf.name_scope(weights.op.name + '_pruning_ops'):
    abs_weights = tf.abs(squeezed_weights)

    pool_window = block_dims
    pool_fn = pruning_utils.factorized_pool
    squeeze_axis = None
    if not self._spec.use_tpu:
      pool_fn = tf.nn.pool
      abs_weights = tf.reshape(
          abs_weights,
          [1, abs_weights.get_shape()[0],
           abs_weights.get_shape()[1], 1])
      squeeze_axis = [0, 3]

    pooled_weights = pool_fn(
        abs_weights,
        window_shape=pool_window,
        pooling_type=self._block_pooling_function,
        strides=pool_window,
        padding='SAME',
        name=weights.op.name + '_pooled')

    if pooled_weights.get_shape().ndims != 2:
      pooled_weights = tf.squeeze(pooled_weights, axis=squeeze_axis)

    smoothed_threshold, new_mask = self._update_mask(pooled_weights,
                                                     threshold)

    updated_mask = pruning_utils.expand_tensor(new_mask, block_dims)
    sliced_mask = tf.slice(
        updated_mask, [0, 0],
        [squeezed_weights.get_shape()[0],
         squeezed_weights.get_shape()[1]])

  return smoothed_threshold, tf.reshape(sliced_mask, tf.shape(weights))
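# A NumPy sketch of the block-granular path above: pool |W| over 2x2 blocks
# with MAX, threshold the pooled values (a fixed illustrative threshold stands
# in for _update_mask), and expand the block mask back to the weight shape so
# each 2x2 block is kept or dropped as a unit. The toy weights, block size and
# threshold are illustrative assumptions.
import numpy as np

w = np.arange(16, dtype=np.float32).reshape(4, 4) - 8.0   # Toy 4x4 weights.
bh, bw = 2, 2
abs_w = np.abs(w)
# MAX-pool over non-overlapping 2x2 blocks.
pooled = abs_w.reshape(2, bh, 2, bw).max(axis=(1, 3))     # [2, 2] block maxima.
block_mask = (pooled >= 5.5).astype(np.float32)           # Fixed toy threshold.
# Expand each block decision back to the full 4x4 mask.
mask = np.kron(block_mask, np.ones((bh, bw), np.float32))
assert mask.shape == w.shape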