def testWeightSpecificSparsity(self):
    param_list = [
        "begin_pruning_step=1", "pruning_frequency=1", "end_pruning_step=100",
        "target_sparsity=0.5", "weight_sparsity_map=[layer2/weights:0.75]",
        "threshold_decay=0.0"
    ]
    test_spec = ",".join(param_list)
    pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)

    with variable_scope.variable_scope("layer1"):
        w1 = variables.Variable(
            math_ops.linspace(1.0, 100.0, 100), name="weights")
        _ = pruning.apply_mask(w1)
    with variable_scope.variable_scope("layer2"):
        w2 = variables.Variable(
            math_ops.linspace(1.0, 100.0, 100), name="weights")
        _ = pruning.apply_mask(w2)

    p = pruning.Pruning(pruning_hparams)
    mask_update_op = p.conditional_mask_update_op()
    increment_global_step = state_ops.assign_add(self.global_step, 1)

    with self.cached_session() as session:
        variables.global_variables_initializer().run()
        for _ in range(110):
            session.run(mask_update_op)
            session.run(increment_global_step)

        self.assertAllEqual(
            session.run(pruning.get_weight_sparsity()), [0.5, 0.75])
def convolution(input, group, shape, trainable, name, **kwargs):
    w = tf.get_variable(initializer=tf.truncated_normal(shape, stddev=0.1),
                        trainable=trainable, name=name + "_weight")
    if group == 1:
        layer = tf.nn.convolution(input,
                                  pruning.apply_mask(w, name + "_weight"),
                                  **kwargs)
    else:
        weight_groups = tf.split(w, num_or_size_splits=group, axis=-1)
        xs = tf.split(input, num_or_size_splits=group, axis=-1)
        convolved = [
            tf.nn.convolution(
                x, pruning.apply_mask(weight, name + "_weight_groups"),
                **kwargs)
            for (x, weight) in zip(xs, weight_groups)
        ]
        layer = tf.concat(convolved, axis=-1)
    if name.endswith('_sc'):
        b = tf.get_variable(
            initializer=tf.truncated_normal(input.get_shape().as_list()[-1:],
                                            stddev=0.1),
            trainable=trainable, name=name + "_bias")
        layer = layer + b
    return layer
def testPerLayerBlockSparsity(self):
    param_list = [
        "block_dims_map=[layer1/weights:1x1,layer2/weights:1x2]",
        "block_pooling_function=AVG", "threshold_decay=0.0"
    ]
    test_spec = ",".join(param_list)
    pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)

    with variable_scope.variable_scope("layer1"):
        w1 = constant_op.constant([[-0.1, 0.1], [-0.2, 0.2]], name="weights")
        pruning.apply_mask(w1)

    with variable_scope.variable_scope("layer2"):
        w2 = constant_op.constant([[0.1, 0.1, 0.3, 0.3], [0.2, 0.2, 0.4, 0.4]],
                                  name="weights")
        pruning.apply_mask(w2)

    sparsity = variables.VariableV1(0.5, name="sparsity")
    p = pruning.Pruning(pruning_hparams, sparsity=sparsity)
    mask_update_op = p.mask_update_op()

    with self.cached_session() as session:
        variables.global_variables_initializer().run()
        session.run(mask_update_op)
        mask1_eval = session.run(pruning.get_masks()[0])
        mask2_eval = session.run(pruning.get_masks()[1])
        self.assertAllEqual(
            session.run(pruning.get_weight_sparsity()), [0.5, 0.5])
        self.assertAllEqual(mask1_eval, [[0.0, 0.0], [1., 1.]])
        self.assertAllEqual(mask2_eval, [[0, 0, 1., 1.], [0, 0, 1., 1.]])
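# The tests above exercise the full tf.contrib.model_pruning flow: wrap weights
# with pruning.apply_mask(), parse an hparam spec string, and run the mask
# update op as the global step advances. The snippet below is a minimal,
# self-contained sketch of that wiring (assumptions: TF 1.x graph mode, the
# same contrib pruning module imported as `pruning`, and a toy loss plus
# made-up schedule values purely for illustration).
import tensorflow as tf
from tensorflow.contrib.model_pruning.python import pruning

global_step = tf.train.get_or_create_global_step()
weights = tf.get_variable("weights", initializer=tf.linspace(1.0, 100.0, 100))
masked_weights = pruning.apply_mask(weights)  # adds mask and threshold vars
loss = tf.reduce_sum(tf.square(masked_weights))  # toy objective

spec = ",".join([
    "begin_pruning_step=100", "end_pruning_step=1000",
    "pruning_frequency=10", "target_sparsity=0.75"
])
p = pruning.Pruning(pruning.get_pruning_hparams().parse(spec),
                    global_step=global_step)
mask_update_op = p.conditional_mask_update_op()

train_op = tf.train.GradientDescentOptimizer(1e-4).minimize(
    loss, global_step=global_step)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(1500):
        sess.run(train_op)
        sess.run(mask_update_op)  # no-op until the pruning schedule is active
    print(sess.run(pruning.get_weight_sparsity()))  # e.g. [0.75]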
def _build_fc_layer(self, inputs, scope, weight_init, shape, activation=None):
    weights = self._variable_with_weight_decay(
        'weights', shape=shape, initialization=weight_init, wd=0.0)
    biases = self._variable_on_cpu(
        'biases', shape[1], initializer=tf.constant_initializer(0.001))
    if activation is not None:
        return activation(
            tf.matmul(inputs, pruning.apply_mask(weights, scope)) + biases,
            name=scope.name)
    else:
        return tf.matmul(inputs, pruning.apply_mask(weights, scope)) + biases
def quant_fc(self, input_tensor, shape, init_func=msra_init, enable_bias=True,
             quant_bits=8, quant_input=True, enable_prune=False):
    W = init_func(shape, "fc_weights")
    if enable_prune:
        W = pruning.apply_mask(W)
        self.prune_weights.append(W)
    W = fake_quant(W, min=tf.reduce_min(W), max=tf.reduce_max(W),
                   num_bits=quant_bits)
    if quant_input:
        input_tensor = fake_quant(input_tensor,
                                  min=tf.reduce_min(input_tensor),
                                  max=tf.reduce_max(input_tensor),
                                  num_bits=quant_bits)
    current_tensor = tf.matmul(input_tensor, W)
    if enable_bias:
        bias = zeros_init([shape[1]], "fc_bias")
        current_tensor = tf.add(current_tensor, bias)
    return current_tensor
def quant_conv2d(self, input_tensor, kernel, stride, init_func=msra_init,
                 quant_bits=8, padding="SAME", quant_input=True,
                 enable_prune=False):
    assert len(kernel) == 4
    assert len(stride) == 2
    W = init_func(kernel, "conv_weights")
    if enable_prune:
        W = pruning.apply_mask(W)
        self.prune_weights.append(W)
    W = fake_quant(W, min=tf.reduce_min(W), max=tf.reduce_max(W),
                   num_bits=quant_bits)
    if quant_input:
        input_tensor = fake_quant(input_tensor,
                                  min=tf.reduce_min(input_tensor),
                                  max=tf.reduce_max(input_tensor),
                                  num_bits=quant_bits)
    conv_res = tf.nn.conv2d(input_tensor, W, [1, stride[0], stride[1], 1],
                            padding=padding)
    return conv_res
def fc(x, num_in, num_out, name, prune=True, relu=True):
    """Create a fully connected layer."""
    with tf.variable_scope(name) as scope:
        # Create tf variables for the weights and biases
        weights = tf.get_variable('weights', shape=[num_in, num_out],
                                  trainable=True)
        biases = tf.get_variable('biases', [num_out], trainable=True)

        # Matrix multiply weights and inputs and add bias
        if prune:
            act = tf.nn.xw_plus_b(x, pruning.apply_mask(weights, scope),
                                  biases, name=scope.name)
        else:
            act = tf.nn.xw_plus_b(x, weights, biases, name=scope.name)

    if relu:
        # Apply ReLU non-linearity
        relu = tf.nn.relu(act)
        return relu
    else:
        return act
def _dense(inputs, weights=None, num_classes=1000, bias=0.0, l2_strength=0.0,
           initializer=tf.contrib.layers.xavier_initializer(),
           is_pruning=False, name="fully_connect"):
    last_shape = inputs.get_shape()[-1].value
    with tf.variable_scope(name):
        if weights is None:
            weights = _variable_with_weight_decay([last_shape, num_classes],
                                                  initializer, wd=l2_strength)
        if isinstance(bias, float):
            bias = tf.get_variable("bias", [num_classes], dtype=tf.float32,
                                   initializer=tf.constant_initializer(bias))
        if is_pruning:
            weights = pruning.apply_mask(weights)

        _variable_summaries(weights)
        _variable_summaries(bias)

        out = tf.nn.bias_add(tf.matmul(inputs, weights), bias)
        return out
def testConditionalMaskUpdate(self):
    param_list = [
        "pruning_frequency=2", "begin_pruning_step=1", "end_pruning_step=6"
    ]
    test_spec = ",".join(param_list)
    pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)
    weights = variables.Variable(
        math_ops.linspace(1.0, 100.0, 100), name="weights")
    masked_weights = pruning.apply_mask(weights)
    sparsity = variables.Variable(0.00, name="sparsity")
    # Set up pruning
    p = pruning.Pruning(pruning_hparams, sparsity=sparsity)
    p._spec.threshold_decay = 0.0
    mask_update_op = p.conditional_mask_update_op()
    sparsity_val = math_ops.linspace(0.0, 0.9, 10)
    increment_global_step = state_ops.assign_add(self.global_step, 1)
    non_zero_count = []

    with self.test_session() as session:
        variables.global_variables_initializer().run()
        for i in range(10):
            session.run(state_ops.assign(sparsity, sparsity_val[i]))
            session.run(mask_update_op)
            session.run(increment_global_step)
            non_zero_count.append(np.count_nonzero(masked_weights.eval()))

        # Weights are pruned at steps 0, 2, 4, and 6.
        expected_non_zero_count = [100, 100, 80, 80, 60, 60, 40, 40, 40, 40]
        self.assertAllEqual(expected_non_zero_count, non_zero_count)
def _build_conv_layer(self, inputs, scope, weight_init, filter_height,
                      filter_width, channel_in, channel_out, strides,
                      padding='SAME'):
    kernel = self._variable_with_weight_decay(
        'weights',
        shape=[filter_height, filter_width, channel_in, channel_out],
        initialization=weight_init,
        wd=0.0)
    conv = tf.nn.conv2d(
        input=inputs,
        filter=pruning.apply_mask(kernel, scope),
        padding=padding,
        strides=strides)
    biases = self._variable_on_cpu(
        'biases', channel_out, initializer=tf.constant_initializer(0.001))
    pre_activation = tf.nn.bias_add(conv, biases)
    return pre_activation
def masked_conv2d(x, W, b, strides=1, name=None):
    with tf.variable_scope(name) as scope:
        x = tf.nn.conv2d(x, pruning.apply_mask(W),
                         strides=[1, strides, strides, 1],
                         padding='SAME', name=scope.name)
        x = tf.nn.bias_add(x, b)
        return tf.nn.relu(x)
def testCreateMask2D(self):
    width = 10
    height = 20
    with self.test_session():
        weights = variables.Variable(
            random_ops.random_normal([width, height], stddev=1),
            name="weights")
        masked_weights = pruning.apply_mask(
            weights, variable_scope.get_variable_scope())
        variables.global_variables_initializer().run()
        weights_val = weights.eval()
        masked_weights_val = masked_weights.eval()
        self.assertAllEqual(weights_val, masked_weights_val)
def testUpdateSingleMask(self):
    with self.test_session() as session:
        weights = variables.Variable(
            math_ops.linspace(1.0, 100.0, 100), name="weights")
        masked_weights = pruning.apply_mask(weights)
        sparsity = variables.Variable(0.5, name="sparsity")
        p = pruning.Pruning(sparsity=sparsity)
        p._spec.threshold_decay = 0.0
        mask_update_op = p.mask_update_op()
        variables.global_variables_initializer().run()
        masked_weights_val = masked_weights.eval()
        self.assertAllEqual(np.count_nonzero(masked_weights_val), 100)
        session.run(mask_update_op)
        masked_weights_val = masked_weights.eval()
        self.assertAllEqual(np.count_nonzero(masked_weights_val), 51)
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False):
    """Looks up word embeddings for an id tensor.

    Args:
      input_ids: int32 Tensor of shape [batch_size, seq_length] containing
        word ids.
      vocab_size: int. Size of the embedding vocabulary.
      embedding_size: int. Width of the word embeddings.
      initializer_range: float. Embedding initialization range.
      word_embedding_name: string. Name of the embedding table.
      use_one_hot_embeddings: bool. If True, use one-hot method for word
        embeddings. If False, use `tf.gather()`.

    Returns:
      float Tensor of shape [batch_size, seq_length, embedding_size].
    """
    # This function assumes that the input is of shape [batch_size,
    # seq_length, num_inputs].
    #
    # If the input is a 2D tensor of shape [batch_size, seq_length], we
    # reshape to [batch_size, seq_length, 1].
    if input_ids.shape.ndims == 2:
        input_ids = tf.expand_dims(input_ids, axis=[-1])

    embedding_table = apply_mask(
        tf.get_variable(
            name=word_embedding_name,
            shape=[vocab_size, embedding_size],
            initializer=create_initializer(initializer_range)),
        scope='embed_mask')

    flat_input_ids = tf.reshape(input_ids, [-1])
    if use_one_hot_embeddings:
        one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
        output = tf.matmul(one_hot_input_ids, embedding_table)
    else:
        output = tf.gather(embedding_table, flat_input_ids)

    input_shape = get_shape_list(input_ids)

    output = tf.reshape(
        output, input_shape[0:-1] + [input_shape[-1] * embedding_size])
    return (output, embedding_table)
def testPartitionedVariableMasking(self):
    partitioner = partitioned_variables.variable_axis_size_partitioner(40)
    with self.test_session() as session:
        with variable_scope.variable_scope("", partitioner=partitioner):
            sparsity = variables.Variable(0.5, name="Sparsity")
            weights = variable_scope.get_variable(
                "weights", initializer=math_ops.linspace(1.0, 100.0, 100))
            masked_weights = pruning.apply_mask(
                weights, scope=variable_scope.get_variable_scope())
        p = pruning.Pruning(sparsity=sparsity, partitioner=partitioner)
        p._spec.threshold_decay = 0.0
        mask_update_op = p.mask_update_op()
        variables.global_variables_initializer().run()
        masked_weights_val = masked_weights.eval()
        session.run(mask_update_op)
        masked_weights_val = masked_weights.eval()
        self.assertAllEqual(np.count_nonzero(masked_weights_val), 51)
def conv2d(self, input_tensor, kernel, stride, init_func=msra_init,
           padding="SAME", enable_prune=False):
    assert len(kernel) == 4
    assert len(stride) == 2
    W = init_func(kernel, "conv_weights")
    if enable_prune:
        W = pruning.apply_mask(W)
        self.prune_weights.append(W)
    conv_res = tf.nn.conv2d(input_tensor, W, [1, stride[0], stride[1], 1],
                            padding=padding)
    return conv_res
def testPartitionedVariableMasking(self):
    partitioner = partitioned_variables.variable_axis_size_partitioner(40)
    with self.test_session() as session:
        with variable_scope.variable_scope("", partitioner=partitioner):
            sparsity = variables.Variable(0.5, name="Sparsity")
            weights = variable_scope.get_variable(
                "weights", initializer=math_ops.linspace(1.0, 100.0, 100))
            masked_weights = pruning.apply_mask(
                weights, scope=variable_scope.get_variable_scope())
        p = pruning.Pruning(sparsity=sparsity)
        p._spec.threshold_decay = 0.0
        mask_update_op = p.mask_update_op()
        variables.global_variables_initializer().run()
        masked_weights_val = masked_weights.eval()
        session.run(mask_update_op)
        masked_weights_val = masked_weights.eval()
        self.assertAllEqual(np.count_nonzero(masked_weights_val), 51)
def _batch_norm(inputs,
                decay=0.999,
                center=True,
                scale=False,
                epsilon=0.001,
                activation_fn=None,
                param_initializers=None,
                param_regularizers=None,
                updates_collections=tf.GraphKeys.UPDATE_OPS,
                is_training=True,
                reuse=None,
                variables_collections=None,
                outputs_collections=None,
                trainable=True,
                batch_weights=None,
                fused=None,
                data_format='NHWC',
                zero_debias_moving_mean=False,
                scope=None,
                renorm=False,
                renorm_clipping=None,
                renorm_decay=0.99,
                adjustment=None):
    print("_batch_norm:center:", center)
    print("_batch_norm:scale :", scale)
    # Run slim's batch norm with center/scale disabled; gamma and beta are
    # re-created below so that gamma can be masked by the pruning library.
    bn_with_scale_false = slim.batch_norm(
        inputs, decay, False, False, epsilon, activation_fn,
        param_initializers, param_regularizers, updates_collections,
        is_training, reuse, variables_collections, outputs_collections,
        trainable, batch_weights, fused, data_format, zero_debias_moving_mean,
        scope, renorm, renorm_clipping, renorm_decay, adjustment)
    with tf.variable_scope('XBatchNorm') as scbn:
        gamma = slim.model_variable(
            'gamma',
            shape=[inputs.shape[-1]],
            initializer=tf.ones_initializer(),
            regularizer=slim.l1_regularizer(0.0001))  # slim.l1_regularizer!!!
        beta = slim.model_variable(
            'beta',
            shape=[inputs.shape[-1]],
            initializer=tf.zeros_initializer())
        bn = tf.multiply(bn_with_scale_false, pruning.apply_mask(gamma, scbn))
        bn = tf.add(bn, beta)
    return bn
def bottom_simple(x, model_hparams, vocab_size, name, reuse):
    """Bottom transformation."""
    with tf.variable_scope(name, reuse=reuse):
        # Ensure the inputs are 3-D
        if len(x.get_shape()) == 4:
            x = tf.squeeze(x, axis=3)
        while len(x.get_shape()) < 3:
            x = tf.expand_dims(x, axis=-1)

        var = _get_weights(model_hparams, vocab_size)
        x = common_layers.dropout_no_scaling(
            x, 1.0 - model_hparams.symbol_dropout)

        sparsity_technique = model_hparams.get("sparsity_technique")
        training = model_hparams.get("mode") == tf.estimator.ModeKeys.TRAIN
        if sparsity_technique == "variational_dropout":
            if training:
                ret = vd.nn.embedding_lookup_train(
                    var, x, clip_alpha=model_hparams.get("clip_log_alpha"))
            else:
                threshold = model_hparams.get("log_alpha_threshold")
                ret = vd.nn.embedding_lookup_eval(var, x, threshold=threshold)
        elif sparsity_technique == "l0_regularization":
            if training:
                ret = l0.nn.embedding_lookup_train(var, x)
            else:
                ret = l0.nn.embedding_lookup_eval(var, x)
        elif (sparsity_technique == "magnitude_pruning" or
              sparsity_technique == "random_pruning"):
            ret = common_layers.gather(pruning.apply_mask(var), x)
        else:
            ret = common_layers.gather(var, x)

        # post-process the embedding vectors
        if model_hparams.multiply_embedding_mode == "sqrt_depth":
            ret *= model_hparams.hidden_size**0.5
        ret *= tf.expand_dims(tf.to_float(tf.not_equal(x, 0)), -1)
        return ret
def _conv2d(inputs,
            weights=None,
            num_filters=16,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding='SAME',
            initializer=tf.contrib.layers.xavier_initializer(),
            l2_strenth=0.0,
            bias=0.0,
            name="conv2d",
            is_pruning=False):
    with tf.variable_scope(name):
        stride = [1, strides[0], strides[1], 1]
        kernel_shape = [
            kernel_size[0], kernel_size[1], inputs.shape[-1], num_filters
        ]
        # with tf.name_scope(name):
        if weights is None:
            weights = _variable_with_weight_decay(kernel_shape,
                                                  initializier=initializer,
                                                  wd=l2_strenth,
                                                  name="weights")
        if isinstance(bias, float):
            bias = tf.get_variable("bias", [num_filters],
                                   initializer=tf.constant_initializer(bias))

        _variable_summaries(weights)
        _variable_summaries(bias)

        if is_pruning:
            weights = pruning.apply_mask(weights)

        conv2d_out = tf.nn.conv2d(inputs, weights, strides=stride,
                                  padding=padding)
        conv2d_out = tf.nn.bias_add(conv2d_out, bias)
        _variable_summaries(conv2d_out)
        return conv2d_out
def fc(self, input_tensor, shape, init_func=msra_init, enable_bn=False,
       enable_bias=True, enable_prune=False):
    W = init_func(shape, "fc_weights")
    if enable_prune:
        W = pruning.apply_mask(W)
        self.prune_weights.append(W)
    current_tensor = tf.matmul(input_tensor, W)
    if enable_bias:
        bias = zeros_init([shape[1]], "fc_bias")
        current_tensor = tf.add(current_tensor, bias)
    if enable_bn:
        scale = ones_init([shape[1]], "bn_scale")
        bias = zeros_init([shape[1]], "bn_bias")
        current_tensor = self.bn(current_tensor, scale, bias, is_conv=False)
    return current_tensor
def _build_conv_layer(self, input, scope, weight_init, filter_hight,
                      filter_width, channel_in, channel_out, activation=None):
    kernel = self._variable_with_weight_decay(
        'weights',
        shape=[filter_hight, filter_width, channel_in, channel_out],
        initialization=weight_init,
        wd=self.wd)
    conv = tf.nn.conv2d(input, pruning.apply_mask(kernel, scope),
                        [1, 1, 1, 1], padding='SAME')
    biases = self._variable_on_cpu('biases', channel_out,
                                   tf.constant_initializer(0.0))
    pre_activation = tf.nn.bias_add(conv, biases)
    if activation:
        return activation(pre_activation, name=scope.name)
    else:
        return pre_activation
def inference(images):
    """Build the CIFAR-10 model.

    Args:
      images: Images returned from distorted_inputs() or inputs().

    Returns:
      Logits.
    """
    # We instantiate all variables using tf.get_variable() instead of
    # tf.Variable() in order to share variables across multiple GPU training
    # runs. If we only ran this model on a single GPU, we could simplify this
    # function by replacing all instances of tf.get_variable() with
    # tf.Variable().
    #
    # While instantiating conv and local layers, we add mask and threshold
    # variables to the layer by calling the pruning.apply_mask() function.
    # Note that the masks are applied only to the weight tensors.

    # conv1
    with tf.variable_scope('conv1') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 3, 64],
                                             stddev=5e-2,
                                             wd=0.0)
        conv = tf.nn.conv2d(images, pruning.apply_mask(kernel, scope),
                            [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv1)

    # pool1
    pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool1')
    # norm1
    norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm1')

    # conv2
    with tf.variable_scope('conv2') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 64, 64],
                                             stddev=5e-2,
                                             wd=0.0)
        conv = tf.nn.conv2d(norm1, pruning.apply_mask(kernel, scope),
                            [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv2)

    # norm2
    norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                      name='norm2')
    # pool2
    pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool2')

    # local3
    with tf.variable_scope('local3') as scope:
        # Move everything into depth so we can perform a single matrix multiply.
        reshape = tf.reshape(pool2, [BATCH_SIZE, -1])
        dim = reshape.get_shape()[1].value
        weights = _variable_with_weight_decay('weights',
                                              shape=[dim, 384],
                                              stddev=0.04,
                                              wd=0.004)
        biases = _variable_on_cpu('biases', [384],
                                  tf.constant_initializer(0.1))
        local3 = tf.nn.relu(
            tf.matmul(reshape, pruning.apply_mask(weights, scope)) + biases,
            name=scope.name)
        _activation_summary(local3)

    # local4
    with tf.variable_scope('local4') as scope:
        weights = _variable_with_weight_decay('weights',
                                              shape=[384, 192],
                                              stddev=0.04,
                                              wd=0.004)
        biases = _variable_on_cpu('biases', [192],
                                  tf.constant_initializer(0.1))
        local4 = tf.nn.relu(
            tf.matmul(local3, pruning.apply_mask(weights, scope)) + biases,
            name=scope.name)
        _activation_summary(local4)

    # linear layer (WX + b)
    # We don't apply softmax here because
    # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled
    # logits and performs the softmax internally for efficiency.
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES],
                                              stddev=1 / 192.0, wd=0.0)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0))
        softmax_linear = tf.add(
            tf.matmul(local4, pruning.apply_mask(weights, scope)), biases,
            name=scope.name)
        _activation_summary(softmax_linear)

    return softmax_linear
def conv(x, filter_height, filter_width, num_filters, stride_y, stride_x,
         name, prune=True, padding='SAME', groups=1):
    """Create a convolution layer.

    Adapted from: https://github.com/ethereon/caffe-tensorflow
    """
    # Get number of input channels
    input_channels = int(x.get_shape()[-1])

    # Create lambda function for the convolution
    if prune:
        convolve = lambda i, k, sc: tf.nn.conv2d(
            i,
            pruning.apply_mask(k, sc),  # add mask for model pruning
            strides=[1, stride_y, stride_x, 1],
            padding=padding)
    else:
        convolve = lambda i, k, sc: tf.nn.conv2d(
            i, k, strides=[1, stride_y, stride_x, 1], padding=padding)

    with tf.variable_scope(name) as scope:
        # Create tf variables for the weights and biases of the conv layer.
        # Integer division keeps the kernel shape valid when groups > 1.
        weights = tf.get_variable(
            'weights',
            shape=[filter_height, filter_width,
                   input_channels // groups, num_filters])
        biases = tf.get_variable('biases', shape=[num_filters])

        if groups == 1:
            conv = convolve(x, weights, scope)
        else:
            # In the case of multiple groups, split inputs & weights and
            # convolve them separately.
            input_groups = tf.split(axis=3, num_or_size_splits=groups, value=x)
            weight_groups = tf.split(axis=3, num_or_size_splits=groups,
                                     value=weights)
            num = 1
            output_groups = list()
            for i, k in zip(input_groups, weight_groups):
                with tf.variable_scope(str(num)) as scope_next:
                    output_groups.append(convolve(i, k, scope_next))
                num += 1
            # Concat the convolved output together again
            conv = tf.concat(axis=3, values=output_groups)

        # Add biases
        bias = tf.reshape(tf.nn.bias_add(conv, biases), tf.shape(conv))
        # Apply ReLU non-linearity
        relu = tf.nn.relu(bias, name=scope.name)

        return relu
def compute_attention_component(antecedent,
                                total_depth,
                                filter_width=1,
                                padding="VALID",
                                name="c",
                                vars_3d_num_heads=0,
                                sparsity_technique=None,
                                threshold=3.0,
                                training=True,
                                clip_alpha=None,
                                initial_sparsity=None,
                                split_heads=False,
                                num_heads=None):
    """Computes attention component (query, key or value).

    Args:
      antecedent: a Tensor with shape [batch, length, channels]
      total_depth: an integer
      filter_width: An integer specifying how wide you want the attention
        component to be.
      padding: One of "VALID", "SAME" or "LEFT". Default is VALID: No padding.
      name: a string specifying scope name.
      vars_3d_num_heads: an optional integer (if we want to use 3d variables)
      sparsity_technique: technique used for sparsifying weights.
      threshold: log alpha threshold used for evaluation with variational
        dropout.
      training: whether model is being trained or not.
      clip_alpha: alpha clipping threshold for variational dropout.
      initial_sparsity: initial sparsity level for lottery ticket & scratch
        experiments.
      split_heads: Whether to prune each head separately.
      num_heads: The number of heads in the attention module.

    Returns:
      c : [batch, length, depth] tensor
    """
    # We don't support 3d attention variables or filter_width > 1 with
    # sparsity techniques.
    assert not sparsity_technique or (not vars_3d_num_heads and
                                      filter_width == 1)

    if vars_3d_num_heads > 0:
        assert filter_width == 1
        input_depth = antecedent.get_shape().as_list()[-1]
        depth_per_head = total_depth // vars_3d_num_heads
        initializer_stddev = input_depth**-0.5
        if "q" in name:
            initializer_stddev *= depth_per_head**-0.5
        var = tf.get_variable(
            name,
            [input_depth, vars_3d_num_heads, total_depth // vars_3d_num_heads],
            initializer=tf.random_normal_initializer(
                stddev=initializer_stddev))
        var = tf.cast(var, antecedent.dtype)
        var = tf.reshape(var, [input_depth, total_depth])
        return tf.tensordot(antecedent, var, axes=1)

    if filter_width == 1:
        if sparsity_technique:
            if split_heads:
                # Prune each head's weights separately so that they are free
                # to have different weight magnitude distributions.
                if num_heads is None:
                    raise ValueError(
                        "`num_heads` must be set for split head pruning.")
                if total_depth % num_heads != 0:
                    raise ValueError(
                        "`total_depth` must be divisible by `num_heads`.")
                input_depth = antecedent.get_shape().as_list()[-1]
                depth_per_head = int(total_depth / num_heads)
                masked_head_weights = []
                for head_id in range(num_heads):
                    head_name = name + "_shard_{}".format(head_id)
                    with tf.variable_scope(head_name) as vs:
                        head_weights = tf.get_variable(
                            "kernel", [input_depth, depth_per_head])
                        masked_head_weights.append(
                            pruning.apply_mask(head_weights, vs))
                component_weights = tf.concat(masked_head_weights, axis=1)

                # compute the full component result
                return tf.tensordot(antecedent, component_weights, axes=1)
            else:
                return common_sparse.dense(
                    antecedent,
                    total_depth,
                    use_bias=False,
                    sparsity_technique=sparsity_technique,
                    threshold=threshold,
                    training=training,
                    clip_alpha=clip_alpha,
                    name=name,
                    initial_sparsity=initial_sparsity)
        else:
            return common_layers.dense(
                antecedent, total_depth, use_bias=False, name=name)
    else:
        return common_layers.conv1d(
            antecedent, total_depth, filter_width, padding=padding, name=name)
def top(body_output, targets, model_hparams, vocab_size):
    """Generate logits.

    Args:
      body_output: A Tensor with shape [batch, p0, p1, body_input_depth]
      targets: Unused.
      model_hparams: tf.HParams, model hyperparameters.
      vocab_size: int, vocabulary size.

    Returns:
      logits: A Tensor with shape [batch, p0, p1, ?, vocab_size].
    """
    del targets  # unused arg

    # Sparsity techniques only support shared weight matrices for now
    sparsity_technique = model_hparams.get("sparsity_technique")
    assert (not sparsity_technique or
            model_hparams.shared_embedding_and_softmax_weights)

    if model_hparams.shared_embedding_and_softmax_weights:
        scope_name = "shared"
        reuse = tf.AUTO_REUSE
    else:
        scope_name = "softmax"
        reuse = False

    with tf.variable_scope(scope_name, reuse=reuse):
        body_output_shape = common_layers.shape_list(body_output)
        var = _get_weights(model_hparams, vocab_size, body_output_shape[-1])
        if (model_hparams.factored_logits and
                model_hparams.mode == tf.estimator.ModeKeys.TRAIN):
            # Sparsity techniques only support non-factored logits for now
            assert not sparsity_technique

            # insert channels dimension
            body_output = tf.expand_dims(body_output, 3)
            return common_layers.FactoredTensor(body_output, var)
        else:
            body_output = tf.reshape(body_output, [-1, body_output_shape[-1]])

            training = model_hparams.get("mode") == tf.estimator.ModeKeys.TRAIN
            if sparsity_technique == "variational_dropout":
                if training:
                    logits = vd.nn.matmul_train(
                        body_output,
                        var,
                        transpose_b=True,
                        clip_alpha=model_hparams.get("clip_log_alpha"))
                else:
                    threshold = model_hparams.get("log_alpha_threshold")
                    logits = vd.nn.matmul_eval(
                        body_output, var, transpose_b=True,
                        threshold=threshold)
            elif sparsity_technique == "l0_regularization":
                if training:
                    logits = l0.nn.matmul_train(
                        body_output, var, transpose_b=True)
                else:
                    logits = l0.nn.matmul_eval(
                        body_output, var, transpose_b=True)
            elif (sparsity_technique == "magnitude_pruning" or
                  sparsity_technique == "random_pruning"):
                logits = tf.matmul(
                    body_output, pruning.apply_mask(var), transpose_b=True)
            else:
                logits = tf.matmul(body_output, var, transpose_b=True)

            return tf.reshape(
                logits, body_output_shape[:-1] + [1, vocab_size])
def MaskedConv2D(
        inputs,
        filters,
        kernel_size,
        strides=(1, 1),
        padding='same',
        data_format='channels_last',
        dilation_rate=(1, 1),
        activation=None,
        use_bias=True,
        kernel_initializer=None,
        bias_initializer=tf.zeros_initializer(),
        kernel_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        split=1,
        masking=False):
    """
    A wrapper around `tf.layers.Conv2D`.
    Some differences to maintain backward-compatibility:

    1. Default kernel initializer is variance_scaling_initializer(2.0).
    2. Default padding is 'same'.
    3. Support 'split' argument to do group conv.

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    if kernel_initializer is None:
        if get_tf_version_tuple() <= (1, 12):
            kernel_initializer = tf.contrib.layers.variance_scaling_initializer(2.0)
        else:
            kernel_initializer = tf.keras.initializers.VarianceScaling(
                2.0, distribution='untruncated_normal')
    dilation_rate = shape2d(dilation_rate)

    if (not masking) and (split == 1) and (dilation_rate == [1, 1]):
        # tf.layers.Conv2D has bugs with dilations
        # (https://github.com/tensorflow/tensorflow/issues/26797)
        with rename_get_variable({'kernel': 'W', 'bias': 'b'}):
            layer = tf.layers.Conv2D(
                filters,
                kernel_size,
                strides=strides,
                padding=padding,
                data_format=data_format,
                dilation_rate=dilation_rate,
                activation=activation,
                use_bias=use_bias,
                kernel_initializer=kernel_initializer,
                bias_initializer=bias_initializer,
                kernel_regularizer=kernel_regularizer,
                bias_regularizer=bias_regularizer,
                activity_regularizer=activity_regularizer,
                _reuse=tf.get_variable_scope().reuse)
            ret = layer.apply(inputs, scope=tf.get_variable_scope())
            ret = tf.identity(ret, name='output')

        ret.variables = VariableHolder(W=layer.kernel)
        if use_bias:
            ret.variables.b = layer.bias
    else:
        if masking:
            assert split == 1, "Pruning group conv is not supported yet"

        # group conv implementation
        data_format = get_data_format(data_format, keras_mode=False)
        in_shape = inputs.get_shape().as_list()
        channel_axis = 3 if data_format == 'NHWC' else 1
        in_channel = in_shape[channel_axis]
        assert in_channel is not None, "[Conv2D] Input cannot have unknown channel!"
        assert in_channel % split == 0

        assert kernel_regularizer is None and bias_regularizer is None and \
            activity_regularizer is None, \
            "Not supported by group conv or dilated conv!"

        out_channel = filters
        assert out_channel % split == 0
        assert dilation_rate == [1, 1] or get_tf_version_tuple() >= (1, 5), \
            'TF>=1.5 required for dilated conv.'

        kernel_shape = shape2d(kernel_size)
        filter_shape = kernel_shape + [in_channel // split, out_channel]
        stride = shape4d(strides, data_format=data_format)

        kwargs = dict(data_format=data_format)
        if get_tf_version_tuple() >= (1, 5):
            kwargs['dilations'] = shape4d(dilation_rate,
                                          data_format=data_format)

        W = tf.get_variable('W', filter_shape,
                            initializer=kernel_initializer)

        if use_bias:
            b = tf.get_variable('b', [out_channel],
                                initializer=bias_initializer)

        if split == 1:
            if masking:
                W = pruning.apply_mask(W)
            conv = tf.nn.conv2d(inputs, W, stride, padding.upper(), **kwargs)
        else:
            conv = None
            if get_tf_version_tuple() >= (1, 13):
                try:
                    conv = tf.nn.conv2d(inputs, W, stride, padding.upper(),
                                        **kwargs)
                except ValueError:
                    log_once(
                        "CUDNN group convolution support is only available with "
                        "https://github.com/tensorflow/tensorflow/pull/25818 . "
                        "Will fall back to a loop-based slow implementation instead!",
                        'warn')
            if conv is None:
                inputs = tf.split(inputs, split, channel_axis)
                kernels = tf.split(W, split, 3)
                outputs = [tf.nn.conv2d(i, k, stride, padding.upper(), **kwargs)
                           for i, k in zip(inputs, kernels)]
                conv = tf.concat(outputs, channel_axis)

        ret = tf.nn.bias_add(conv, b, data_format=data_format) \
            if use_bias else conv
        if activation is not None:
            ret = activation(ret)
        ret = tf.identity(ret, name='output')

        ret.variables = VariableHolder(W=W)
        if use_bias:
            ret.variables.b = b
    return ret