def test_softmax_noncausal_attention_block_output(self):
  batch_size = 1
  length = 2
  num_heads = 1
  dim = 8
  num_random_features = 1000
  query = tf.random.normal([batch_size, length, num_heads, dim])
  key = tf.random.normal([batch_size, length, num_heads, dim])
  value = tf.random.normal([batch_size, length, num_heads, dim])
  kernel_transformation = favor.softmax_kernel_transformation
  projection_matrix = favor.create_projection_matrix(num_random_features, dim)
  query = tf.cast(query, tf.float64)
  key = tf.cast(key, tf.float64)
  value = tf.cast(value, tf.float64)
  projection_matrix = tf.cast(projection_matrix, tf.float64)
  attention_block_output = favor.favor_attention(query, key, value, None,
                                                 kernel_transformation, False,
                                                 projection_matrix)

  # Exact softmax attention, used as the ground truth for comparison.
  query = tf.multiply(query, 1.0 / math.sqrt(float(dim)))
  attention_scores = tf.einsum("BXHD,BYHD->BXYH", query, key)
  attention_scores = tf.nn.softmax(attention_scores, axis=2)
  exact_attention_block_output = tf.einsum("BXYH,BYHD->BXHD",
                                           attention_scores, value)
  max_error = 0.5
  with self.session(use_gpu=False) as sess:
    favor_output, groundtruth_output = sess.run(
        [attention_block_output, exact_attention_block_output])
    # Maximum relative error of the FAVOR approximation w.r.t. exact softmax
    # attention.
    error = np.max(
        np.abs((groundtruth_output - favor_output) / groundtruth_output))
    self.assertLess(error, max_error)
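# Minimal NumPy sketch (illustration only, not the library's implementation) of
# the positive random features behind favor.softmax_kernel_transformation:
# for w ~ N(0, I_d), phi(x) = exp(w.x - ||x||^2 / 2) / sqrt(m) satisfies
# E[phi(q) . phi(k)] = exp(q . k), the unnormalized softmax kernel the test
# above compares against. All names below are local to this sketch.
import numpy as np

rng = np.random.default_rng(0)
d, m = 8, 100000                      # feature dim, number of random features
q = 0.5 * rng.normal(size=d)
k = 0.5 * rng.normal(size=d)
w = rng.normal(size=(m, d))           # each row drawn from N(0, I_d)

def phi(x):
  return np.exp(w @ x - x @ x / 2.0) / np.sqrt(m)

print(phi(q) @ phi(k), np.exp(q @ k))  # Monte-Carlo estimate vs. exact value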
def apply_mask(x, scope=''):
  """Apply mask to a given weight tensor.

  Args:
    x: Input weight tensor
    scope: The current variable scope. Defaults to "".
  Returns:
    Tensor representing masked_weights
  """
  mask = pruning_utils.weight_mask_variable(x, scope)
  threshold = pruning_utils.weight_threshold_variable(x, scope)
  # Add masked_weights in the weights name scope so as to make it easier
  # for the quantization library to add quant ops.
  masked_weights = tf.multiply(mask, x, _MASKED_WEIGHT_NAME)

  # Make sure the mask for a given variable is not added multiple times to the
  # collection. This is particularly important when applying the mask to an
  # RNN's weight variables.
  if mask not in tf.get_collection_ref(_MASK_COLLECTION):
    tf.add_to_collection(_THRESHOLD_COLLECTION, threshold)
    tf.add_to_collection(_MASK_COLLECTION, mask)
    tf.add_to_collection(_MASKED_WEIGHT_COLLECTION, masked_weights)
    tf.add_to_collection(_WEIGHT_COLLECTION, x)
  return masked_weights
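# Illustrative usage sketch (not part of the library): wrapping a dense layer's
# kernel with apply_mask inside its variable scope. The layer name, shape, and
# `inputs` tensor are hypothetical; TF1-style graph mode with variable scopes
# is assumed.
def _example_masked_dense(inputs):
  with tf.variable_scope('example_masked_dense') as scope:
    weights = tf.get_variable('weights', shape=[inputs.shape[-1], 256])
    # The layer consumes the masked product; the dense `weights` variable is
    # pruned indirectly through the mask variable created by apply_mask.
    return tf.matmul(inputs, apply_mask(weights, scope))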
def _update_mask(self, weights, threshold):
  """Updates the mask for a given weight tensor.

  This function first computes the cdf of the weight tensor and estimates
  the threshold value such that a 'desired_sparsity' fraction of weights
  have magnitude less than the threshold.

  Args:
    weights: The weight tensor that needs to be masked.
    threshold: The current threshold value. The function will compute a new
      threshold and return the exponential moving average using the current
      value of threshold.

  Returns:
    new_threshold: The new value of the threshold based on weights, and
      sparsity at the current global_step.
    new_mask: A tensor of the same size and shape as weights containing
      0 or 1 to indicate which of the values in weights falls below
      the threshold.

  Raises:
    ValueError: if sparsity is not defined.
  """
  if self._sparsity is None:
    raise ValueError('Sparsity variable undefined')

  sparsity = self._get_sparsity(weights.op.name)
  with tf.name_scope(weights.op.name + '_pruning_ops'):
    abs_weights = tf.abs(weights)
    k = tf.cast(
        tf.round(tf.cast(tf.size(abs_weights), tf.float32) * (1 - sparsity)),
        tf.int32)
    # Sort the entire array.
    values, _ = tf.nn.top_k(
        tf.reshape(abs_weights, [-1]), k=tf.size(abs_weights))
    # Grab the (k-1)th value.
    current_threshold = tf.gather(values, k - 1)
    smoothed_threshold = tf.add_n([
        tf.multiply(current_threshold, 1 - self._spec.threshold_decay),
        tf.multiply(threshold, self._spec.threshold_decay)
    ])

    new_mask = tf.cast(
        tf.greater_equal(abs_weights, smoothed_threshold), tf.float32)
  return smoothed_threshold, new_mask
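# Minimal NumPy sketch (assumed equivalent to the TF op graph above, names are
# local to this sketch) of one mask update: keep the k largest-magnitude
# weights for a given sparsity and smooth the threshold with an exponential
# moving average.
import numpy as np

def _reference_update_mask(weights, old_threshold, sparsity, threshold_decay=0.0):
  abs_weights = np.abs(weights).ravel()
  k = int(round(abs_weights.size * (1 - sparsity)))        # number of weights to keep
  current_threshold = np.sort(abs_weights)[::-1][k - 1]    # k-th largest magnitude
  smoothed = ((1 - threshold_decay) * current_threshold +
              threshold_decay * old_threshold)
  new_mask = (np.abs(weights) >= smoothed).astype(np.float32)
  return smoothed, new_mask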
def _get_sparsity(self, weight_name):
  """Returns target sparsity for the given layer/weight name."""
  target_sparsity = [
      sparsity for regexp, sparsity in self._weight_sparsity_map.items()
      if regexp.search(weight_name)
  ]
  if not target_sparsity:
    return self._sparsity

  if len(target_sparsity) > 1:
    raise ValueError(
        'Multiple matches in weight_sparsity_map for weight %s' % weight_name)
  # TODO(suyoggupta): This will work when initial_sparsity = 0. Generalize
  # to handle other cases as well.
  return tf.multiply(
      self._sparsity,
      tf.div(target_sparsity[0], self._spec.target_sparsity))
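# Worked example of the scaling above (hypothetical numbers): with the global
# schedule at self._sparsity = 0.25, spec target_sparsity = 0.5, and a
# weight_sparsity_map entry of 0.8 matching this weight, the returned value is
# 0.25 * (0.8 / 0.5) = 0.4, i.e. the layer follows the global schedule scaled
# toward its own final target of 0.8.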
def GetMixResult(cls, theta, concat, lstmobj):  # pylint:disable=unused-argument
  """Compute the mix result.

  Args:
    theta: a theta object in the LSTM cells;
    concat: Tensor, concat of previous output and current state vector;
    lstmobj: an LSTM cell object.

  Returns:
    result Tensor.

  Raises:
    NotImplementedError: if prune_option is not 'weight',
      'first_order_gradient', or 'second_order_gradient'.
  """
  return tf.matmul(
      concat,
      lstmobj.QWeight(tf.multiply(theta.wm, theta.mask, 'masked_weight')))
def _setup_sparsity(self):
  begin_step = self._spec.sparsity_function_begin_step
  end_step = self._spec.sparsity_function_end_step
  initial_sparsity = self._spec.initial_sparsity
  target_sparsity = self._spec.target_sparsity
  exponent = self._spec.sparsity_function_exponent

  with tf.name_scope(self._spec.name):
    # Fraction of the [begin_step, end_step] schedule completed, clipped to
    # [0, 1].
    p = tf.minimum(
        1.0,
        tf.maximum(
            0.0,
            tf.div(
                tf.cast(self._global_step - begin_step, tf.float32),
                end_step - begin_step)))
    sparsity = tf.add(
        tf.multiply(initial_sparsity - target_sparsity,
                    tf.pow(1 - p, exponent)),
        target_sparsity,
        name='sparsity')

  return sparsity
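# Minimal NumPy sketch (assumed equivalent to the graph ops above, names are
# local to this sketch) of the polynomial sparsity schedule: progress p is
# clipped to [0, 1] and sparsity interpolates from initial_sparsity to
# target_sparsity as target + (initial - target) * (1 - p) ** exponent.
import numpy as np

def _reference_sparsity(global_step, begin_step, end_step,
                        initial_sparsity, target_sparsity, exponent=3.0):
  p = np.clip((global_step - begin_step) / float(end_step - begin_step),
              0.0, 1.0)
  return target_sparsity + (initial_sparsity - target_sparsity) * (1 - p) ** exponent

# E.g. _reference_sparsity(5000, 0, 10000, 0.0, 0.9) == 0.7875
# (hypothetical numbers: halfway through the ramp with a cubic exponent).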