Example #1
  def testWeightSpecificSparsity(self):
    param_list = [
        "begin_pruning_step=1", "pruning_frequency=1", "end_pruning_step=100",
        "target_sparsity=0.5", "weight_sparsity_map=[layer2/weights:0.75]",
        "threshold_decay=0.0"
    ]
    test_spec = ",".join(param_list)
    pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)

    with variable_scope.variable_scope("layer1"):
      w1 = variables.Variable(
          math_ops.linspace(1.0, 100.0, 100), name="weights")
      _ = pruning.apply_mask(w1)
    with variable_scope.variable_scope("layer2"):
      w2 = variables.Variable(
          math_ops.linspace(1.0, 100.0, 100), name="weights")
      _ = pruning.apply_mask(w2)

    p = pruning.Pruning(pruning_hparams)
    mask_update_op = p.conditional_mask_update_op()
    increment_global_step = state_ops.assign_add(self.global_step, 1)

    with self.cached_session() as session:
      variables.global_variables_initializer().run()
      for _ in range(110):
        session.run(mask_update_op)
        session.run(increment_global_step)

      self.assertAllEqual(
          session.run(pruning.get_weight_sparsity()), [0.5, 0.75])
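Below is a minimal end-to-end sketch of how the pieces exercised by the test above are usually wired into a training loop. It assumes TF 1.x with tf.contrib.model_pruning available; the layer name, hparam string, and toy data are illustrative, not taken from the test.

import numpy as np
import tensorflow as tf
from tensorflow.contrib.model_pruning.python import pruning

tf.reset_default_graph()
global_step = tf.train.get_or_create_global_step()

# Illustrative schedule; per-layer targets would go into weight_sparsity_map.
hparams = pruning.get_pruning_hparams().parse(
    "begin_pruning_step=0,end_pruning_step=200,"
    "pruning_frequency=10,target_sparsity=0.5")

x = tf.placeholder(tf.float32, [None, 10])
with tf.variable_scope("dense"):
    w = tf.get_variable("weights", [10, 1])
    y = tf.matmul(x, pruning.apply_mask(w))  # creates mask/threshold variables

loss = tf.reduce_mean(tf.square(y))
train_op = tf.train.GradientDescentOptimizer(0.1).minimize(
    loss, global_step=global_step)

p = pruning.Pruning(hparams, global_step=global_step)
mask_update_op = p.conditional_mask_update_op()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch = np.random.rand(8, 10).astype(np.float32)
    for _ in range(200):
        sess.run(train_op, feed_dict={x: batch})
        sess.run(mask_update_op)
    print(sess.run(pruning.get_weight_sparsity()))  # roughly [0.5]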
Example #2
def convolution(input, group, shape, trainable, name, **kwargs):
    w = tf.get_variable(initializer=tf.truncated_normal(shape, stddev=0.1),
                        trainable=trainable,
                        name=name + "_weight")
    if group == 1:
        layer = tf.nn.convolution(input,
                                  pruning.apply_mask(w, name + "_weight"),
                                  **kwargs)
    else:
        weight_groups = tf.split(w, num_or_size_splits=group, axis=-1)
        xs = tf.split(input, num_or_size_splits=group, axis=-1)
        convolved = [
            tf.nn.convolution(
                x, pruning.apply_mask(weight, name + "_weight_groups"),
                **kwargs) for (x, weight) in zip(xs, weight_groups)
        ]
        layer = tf.concat(convolved, axis=-1)

    if name.endswith('_sc'):
        b = tf.get_variable(
            initializer=tf.truncated_normal(
                input.get_shape().as_list()[-1:], stddev=0.1),
            trainable=trainable,
            name=name + "_bias")
        layer = layer + b
    return layer
Example #3
    def testWeightSpecificSparsity(self):
        param_list = [
            "begin_pruning_step=1", "pruning_frequency=1",
            "end_pruning_step=100", "target_sparsity=0.5",
            "weight_sparsity_map=[layer2/weights:0.75]", "threshold_decay=0.0"
        ]
        test_spec = ",".join(param_list)
        pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)

        with variable_scope.variable_scope("layer1"):
            w1 = variables.Variable(math_ops.linspace(1.0, 100.0, 100),
                                    name="weights")
            _ = pruning.apply_mask(w1)
        with variable_scope.variable_scope("layer2"):
            w2 = variables.Variable(math_ops.linspace(1.0, 100.0, 100),
                                    name="weights")
            _ = pruning.apply_mask(w2)

        p = pruning.Pruning(pruning_hparams)
        mask_update_op = p.conditional_mask_update_op()
        increment_global_step = state_ops.assign_add(self.global_step, 1)

        with self.test_session() as session:
            variables.global_variables_initializer().run()
            for _ in range(110):
                session.run(mask_update_op)
                session.run(increment_global_step)

            self.assertAllEqual(session.run(pruning.get_weight_sparsity()),
                                [0.5, 0.75])
Example #4
    def testPerLayerBlockSparsity(self):
        param_list = [
            "block_dims_map=[layer1/weights:1x1,layer2/weights:1x2]",
            "block_pooling_function=AVG", "threshold_decay=0.0"
        ]

        test_spec = ",".join(param_list)
        pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)

        with variable_scope.variable_scope("layer1"):
            w1 = constant_op.constant([[-0.1, 0.1], [-0.2, 0.2]],
                                      name="weights")
            pruning.apply_mask(w1)

        with variable_scope.variable_scope("layer2"):
            w2 = constant_op.constant(
                [[0.1, 0.1, 0.3, 0.3], [0.2, 0.2, 0.4, 0.4]], name="weights")
            pruning.apply_mask(w2)

        sparsity = variables.VariableV1(0.5, name="sparsity")

        p = pruning.Pruning(pruning_hparams, sparsity=sparsity)
        mask_update_op = p.mask_update_op()
        with self.cached_session() as session:
            variables.global_variables_initializer().run()
            session.run(mask_update_op)
            mask1_eval = session.run(pruning.get_masks()[0])
            mask2_eval = session.run(pruning.get_masks()[1])

            self.assertAllEqual(session.run(pruning.get_weight_sparsity()),
                                [0.5, 0.5])

            self.assertAllEqual(mask1_eval, [[0.0, 0.0], [1., 1.]])
            self.assertAllEqual(mask2_eval, [[0, 0, 1., 1.], [0, 0, 1., 1.]])
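The block pattern asserted above can be reproduced with a few lines of NumPy. This is only an illustration of what block_dims_map=[layer2/weights:1x2] with AVG pooling means at 50% sparsity, not the library's implementation:

import numpy as np

w2 = np.array([[0.1, 0.1, 0.3, 0.3],
               [0.2, 0.2, 0.4, 0.4]])
block_scores = np.abs(w2).reshape(2, 2, 2).mean(axis=-1)  # AVG over 1x2 blocks
threshold = np.median(block_scores)                       # 50% of blocks survive
block_mask = (block_scores > threshold).astype(float)
mask = np.repeat(block_mask, 2, axis=-1)                  # expand back to 1x2 blocks
print(mask)  # [[0. 0. 1. 1.]
             #  [0. 0. 1. 1.]]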
Example #5
def _build_fc_layer(self, inputs, scope, weight_init, shape, activation=None):
    weights = self._variable_with_weight_decay(
        'weights', shape=shape, initialization=weight_init, wd=0.0)
    biases = self._variable_on_cpu('biases', shape[1], initializer=tf.constant_initializer(0.001))
    if activation is not None:
        return activation(
            tf.matmul(inputs, pruning.apply_mask(weights, scope)) + biases,
            name=scope.name)
    else:
        return tf.matmul(inputs, pruning.apply_mask(weights, scope)) + biases
Example #6
 def quant_fc(self,
              input_tensor,
              shape,
              init_func=msra_init,
              enable_bias=True,
              quant_bits=8,
              quant_input=True,
              enable_prune=False):
     W = init_func(shape, "fc_weights")
     if enable_prune:
         W = pruning.apply_mask(W)
         self.prune_weights.append(W)
     W = fake_quant(W,
                    min=tf.reduce_min(W),
                    max=tf.reduce_max(W),
                    num_bits=quant_bits)
      if quant_input:
         input_tensor = fake_quant(input_tensor,
                                   min=tf.reduce_min(input_tensor),
                                   max=tf.reduce_max(input_tensor),
                                   num_bits=quant_bits)
     current_tensor = tf.matmul(input_tensor, W)
     if enable_bias:
         bias = zeros_init([shape[1]], "fc_bias")
         current_tensor = tf.add(current_tensor, bias)
     return current_tensor
Example #7
 def quant_conv2d(self,
                  input_tensor,
                  kernel,
                  stride,
                  init_func=msra_init,
                  quant_bits=8,
                  padding="SAME",
                  quant_input=True,
                  enable_prune=False):
     assert (len(kernel) == 4)
     assert (len(stride) == 2)
     W = init_func(kernel, "conv_weights")
     if enable_prune:
         W = pruning.apply_mask(W)
         self.prune_weights.append(W)
     W = fake_quant(W,
                    min=tf.reduce_min(W),
                    max=tf.reduce_max(W),
                    num_bits=quant_bits)
      if quant_input:
         input_tensor = fake_quant(input_tensor,
                                   min=tf.reduce_min(input_tensor),
                                   max=tf.reduce_max(input_tensor),
                                   num_bits=quant_bits)
     conv_res = tf.nn.conv2d(input_tensor,
                             W, [1, stride[0], stride[1], 1],
                             padding=padding)
     return conv_res
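The two helpers above depend on a project-specific fake_quant() and init helpers. A self-contained sketch of the same prune-then-fake-quantize ordering with the stock TF 1.x op (shapes and names are assumptions):

import tensorflow as tf
from tensorflow.contrib.model_pruning.python import pruning

tf.reset_default_graph()
x = tf.placeholder(tf.float32, [None, 64])
with tf.variable_scope("fc"):
    w = tf.get_variable("weights", [64, 10])
    masked_w = pruning.apply_mask(w)  # prune first
quant_w = tf.quantization.fake_quant_with_min_max_vars(
    masked_w,
    min=tf.reduce_min(masked_w),
    max=tf.reduce_max(masked_w),
    num_bits=8)  # then simulate 8-bit weights
logits = tf.matmul(x, quant_w)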
Example #8
def fc(x, num_in, num_out, name, prune=True, relu=True):
    """Create a fully connected layer."""
    with tf.variable_scope(name) as scope:

        # Create tf variables for the weights and biases
        weights = tf.get_variable('weights',
                                  shape=[num_in, num_out],
                                  trainable=True)
        biases = tf.get_variable('biases', [num_out], trainable=True)

        # Matrix multiply weights and inputs and add bias
        if prune:
            act = tf.nn.xw_plus_b(x,
                                  pruning.apply_mask(weights, scope),
                                  biases,
                                  name=scope.name)
        else:
            act = tf.nn.xw_plus_b(x, weights, biases, name=scope.name)

    if relu:
        # Apply ReLu non linearity
        relu = tf.nn.relu(act)
        return relu
    else:
        return act
Example #9
def _dense(inputs,
           weights=None,
           num_classes=1000,
           bias=0.0,
           l2_strength=0.0,
           initializer=tf.contrib.layers.xavier_initializer(),
           is_pruning=False,
           name="fully_connect"):

    last_shape = inputs.get_shape()[-1].value
    with tf.variable_scope(name):
        if weights is None:
            weights = _variable_with_weight_decay([last_shape, num_classes],
                                                  initializer,
                                                  wd=l2_strength)
        if isinstance(bias, float):
            bias = tf.get_variable("bias", [num_classes],
                                   dtype=tf.float32,
                                   initializer=tf.constant_initializer(bias))
        if is_pruning:
            weights = pruning.apply_mask(weights)
        _variable_summaries(weights)
        _variable_summaries(bias)

        out = tf.nn.bias_add(tf.matmul(inputs, weights), bias)

    return out
Example #10
 def testConditionalMaskUpdate(self):
     param_list = [
         "pruning_frequency=2", "begin_pruning_step=1", "end_pruning_step=6"
     ]
     test_spec = ",".join(param_list)
     pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)
     weights = variables.Variable(math_ops.linspace(1.0, 100.0, 100),
                                  name="weights")
     masked_weights = pruning.apply_mask(weights)
     sparsity = variables.Variable(0.00, name="sparsity")
     # Set up pruning
     p = pruning.Pruning(pruning_hparams, sparsity=sparsity)
     p._spec.threshold_decay = 0.0
     mask_update_op = p.conditional_mask_update_op()
     sparsity_val = math_ops.linspace(0.0, 0.9, 10)
     increment_global_step = state_ops.assign_add(self.global_step, 1)
     non_zero_count = []
     with self.test_session() as session:
         variables.global_variables_initializer().run()
         for i in range(10):
             session.run(state_ops.assign(sparsity, sparsity_val[i]))
             session.run(mask_update_op)
             session.run(increment_global_step)
             non_zero_count.append(np.count_nonzero(masked_weights.eval()))
      # Weights are pruned at steps 0, 2, 4, and 6.
     expected_non_zero_count = [100, 100, 80, 80, 60, 60, 40, 40, 40, 40]
     self.assertAllEqual(expected_non_zero_count, non_zero_count)
Example #11
 def testConditionalMaskUpdate(self):
   param_list = [
       "pruning_frequency=2", "begin_pruning_step=1", "end_pruning_step=6"
   ]
   test_spec = ",".join(param_list)
   pruning_hparams = pruning.get_pruning_hparams().parse(test_spec)
   weights = variables.Variable(
       math_ops.linspace(1.0, 100.0, 100), name="weights")
   masked_weights = pruning.apply_mask(weights)
   sparsity = variables.Variable(0.00, name="sparsity")
   # Set up pruning
   p = pruning.Pruning(pruning_hparams, sparsity=sparsity)
   p._spec.threshold_decay = 0.0
   mask_update_op = p.conditional_mask_update_op()
   sparsity_val = math_ops.linspace(0.0, 0.9, 10)
   increment_global_step = state_ops.assign_add(self.global_step, 1)
   non_zero_count = []
   with self.test_session() as session:
     variables.global_variables_initializer().run()
     for i in range(10):
       session.run(state_ops.assign(sparsity, sparsity_val[i]))
       session.run(mask_update_op)
       session.run(increment_global_step)
       non_zero_count.append(np.count_nonzero(masked_weights.eval()))
    # Weights are pruned at steps 0, 2, 4, and 6.
   expected_non_zero_count = [100, 100, 80, 80, 60, 60, 40, 40, 40, 40]
   self.assertAllEqual(expected_non_zero_count, non_zero_count)
Example #12
def _build_conv_layer(self, inputs, scope, weight_init, filter_height,
                      filter_width, channel_in, channel_out, strides, padding='SAME'):
    kernel = self._variable_with_weight_decay(
        'weights', shape=[filter_height, filter_width, channel_in, channel_out], initialization=weight_init, wd=0.0)
    conv = tf.nn.conv2d(
        input=inputs, filter=pruning.apply_mask(kernel, scope), padding=padding, strides=strides)
    biases = self._variable_on_cpu('biases', channel_out, initializer=tf.constant_initializer(0.001))
    pre_activation = tf.nn.bias_add(conv, biases)
    return pre_activation
Example #13
def masked_conv2d(x, W, b, strides=1, name=None):
    with tf.variable_scope(name) as scope:
        x = tf.nn.conv2d(x,
                         pruning.apply_mask(W),
                         strides=[1, strides, strides, 1],
                         padding='SAME',
                         name=scope.name)
        x = tf.nn.bias_add(x, b)
    return tf.nn.relu(x)
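Hand-rolled wrappers like the one above are optional: tf.contrib.model_pruning also ships ready-made masked layers. A minimal sketch (assuming TF 1.x; the layer sizes are illustrative):

import tensorflow as tf
from tensorflow.contrib import model_pruning

tf.reset_default_graph()
images = tf.placeholder(tf.float32, [None, 28, 28, 1])
net = model_pruning.masked_conv2d(images, 32, [3, 3], scope='conv1')
net = tf.layers.flatten(net)
logits = model_pruning.masked_fully_connected(net, 10, activation_fn=None,
                                              scope='fc1')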
Example #14
 def testCreateMask2D(self):
   width = 10
   height = 20
   with self.test_session():
     weights = variables.Variable(
         random_ops.random_normal([width, height], stddev=1), name="weights")
     masked_weights = pruning.apply_mask(weights,
                                         variable_scope.get_variable_scope())
     variables.global_variables_initializer().run()
     weights_val = weights.eval()
     masked_weights_val = masked_weights.eval()
     self.assertAllEqual(weights_val, masked_weights_val)
Example #15
 def testCreateMask2D(self):
   width = 10
   height = 20
   with self.test_session():
     weights = variables.Variable(
         random_ops.random_normal([width, height], stddev=1), name="weights")
     masked_weights = pruning.apply_mask(weights,
                                         variable_scope.get_variable_scope())
     variables.global_variables_initializer().run()
     weights_val = weights.eval()
     masked_weights_val = masked_weights.eval()
     self.assertAllEqual(weights_val, masked_weights_val)
Example #16
 def testUpdateSingleMask(self):
     with self.test_session() as session:
         weights = variables.Variable(math_ops.linspace(1.0, 100.0, 100),
                                      name="weights")
         masked_weights = pruning.apply_mask(weights)
         sparsity = variables.Variable(0.5, name="sparsity")
         p = pruning.Pruning(sparsity=sparsity)
         p._spec.threshold_decay = 0.0
         mask_update_op = p.mask_update_op()
         variables.global_variables_initializer().run()
         masked_weights_val = masked_weights.eval()
         self.assertAllEqual(np.count_nonzero(masked_weights_val), 100)
         session.run(mask_update_op)
         masked_weights_val = masked_weights.eval()
         self.assertAllEqual(np.count_nonzero(masked_weights_val), 51)
Example #17
 def testUpdateSingleMask(self):
   with self.test_session() as session:
     weights = variables.Variable(
         math_ops.linspace(1.0, 100.0, 100), name="weights")
     masked_weights = pruning.apply_mask(weights)
     sparsity = variables.Variable(0.5, name="sparsity")
     p = pruning.Pruning(sparsity=sparsity)
     p._spec.threshold_decay = 0.0
     mask_update_op = p.mask_update_op()
     variables.global_variables_initializer().run()
     masked_weights_val = masked_weights.eval()
     self.assertAllEqual(np.count_nonzero(masked_weights_val), 100)
     session.run(mask_update_op)
     masked_weights_val = masked_weights.eval()
     self.assertAllEqual(np.count_nonzero(masked_weights_val), 51)
Example #18
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False):
    """Looks up words embeddings for id tensor.

  Args:
    input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
      ids.
    vocab_size: int. Size of the embedding vocabulary.
    embedding_size: int. Width of the word embeddings.
    initializer_range: float. Embedding initialization range.
    word_embedding_name: string. Name of the embedding table.
    use_one_hot_embeddings: bool. If True, use one-hot method for word
      embeddings. If False, use `tf.gather()`.

  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size].
  """
    # This function assumes that the input is of shape [batch_size, seq_length,
    # num_inputs].
    #
    # If the input is a 2D tensor of shape [batch_size, seq_length], we
    # reshape to [batch_size, seq_length, 1].
    if input_ids.shape.ndims == 2:
        input_ids = tf.expand_dims(input_ids, axis=[-1])

    embedding_table = apply_mask(
        tf.get_variable(
            name=word_embedding_name,
            shape=[vocab_size, embedding_size],
            initializer=create_initializer(initializer_range)),
        scope='embed_mask')

    flat_input_ids = tf.reshape(input_ids, [-1])
    if use_one_hot_embeddings:
        one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
        output = tf.matmul(one_hot_input_ids, embedding_table)
    else:
        output = tf.gather(embedding_table, flat_input_ids)

    input_shape = get_shape_list(input_ids)

    output = tf.reshape(output,
                        input_shape[0:-1] + [input_shape[-1] * embedding_size])
    return (output, embedding_table)
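Stripped of the surrounding BERT plumbing, the masked lookup above reduces to masking the table and gathering rows. A self-contained sketch (vocab size and width are illustrative):

import tensorflow as tf
from tensorflow.contrib.model_pruning.python import pruning

tf.reset_default_graph()
input_ids = tf.constant([[1, 2, 3]], dtype=tf.int32)  # [batch_size, seq_length]
with tf.variable_scope("embeddings"):
    table = tf.get_variable("word_embeddings", [100, 16])
    masked_table = pruning.apply_mask(table)
flat_ids = tf.reshape(input_ids, [-1])
output = tf.reshape(tf.gather(masked_table, flat_ids), [1, 3, 16])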
Example #19
 def testPartitionedVariableMasking(self):
     partitioner = partitioned_variables.variable_axis_size_partitioner(40)
     with self.test_session() as session:
         with variable_scope.variable_scope("", partitioner=partitioner):
             sparsity = variables.Variable(0.5, name="Sparsity")
             weights = variable_scope.get_variable(
                 "weights", initializer=math_ops.linspace(1.0, 100.0, 100))
             masked_weights = pruning.apply_mask(
                 weights, scope=variable_scope.get_variable_scope())
         p = pruning.Pruning(sparsity=sparsity, partitioner=partitioner)
         p._spec.threshold_decay = 0.0
         mask_update_op = p.mask_update_op()
         variables.global_variables_initializer().run()
         masked_weights_val = masked_weights.eval()
         session.run(mask_update_op)
         masked_weights_val = masked_weights.eval()
         self.assertAllEqual(np.count_nonzero(masked_weights_val), 51)
Example #20
 def conv2d(self,
            input_tensor,
            kernel,
            stride,
            init_func=msra_init,
            padding="SAME",
            enable_prune=False):
     assert (len(kernel) == 4)
     assert (len(stride) == 2)
     W = init_func(kernel, "conv_weights")
     if enable_prune:
         W = pruning.apply_mask(W)
         self.prune_weights.append(W)
     conv_res = tf.nn.conv2d(input_tensor,
                             W, [1, stride[0], stride[1], 1],
                             padding=padding)
     return conv_res
Example #21
 def testPartitionedVariableMasking(self):
   partitioner = partitioned_variables.variable_axis_size_partitioner(40)
   with self.test_session() as session:
     with variable_scope.variable_scope("", partitioner=partitioner):
       sparsity = variables.Variable(0.5, name="Sparsity")
       weights = variable_scope.get_variable(
           "weights", initializer=math_ops.linspace(1.0, 100.0, 100))
       masked_weights = pruning.apply_mask(
           weights, scope=variable_scope.get_variable_scope())
     p = pruning.Pruning(sparsity=sparsity)
     p._spec.threshold_decay = 0.0
     mask_update_op = p.mask_update_op()
     variables.global_variables_initializer().run()
     masked_weights_val = masked_weights.eval()
     session.run(mask_update_op)
     masked_weights_val = masked_weights.eval()
     self.assertAllEqual(np.count_nonzero(masked_weights_val), 51)
Example #22
 def _batch_norm(inputs,
                 decay=0.999,
                 center=True,
                 scale=False,
                 epsilon=0.001,
                 activation_fn=None,
                 param_initializers=None,
                 param_regularizers=None,
                 updates_collections=tf.GraphKeys.UPDATE_OPS,
                 is_training=True,
                 reuse=None,
                 variables_collections=None,
                 outputs_collections=None,
                 trainable=True,
                 batch_weights=None,
                 fused=None,
                 data_format='NHWC',
                 zero_debias_moving_mean=False,
                 scope=None,
                 renorm=False,
                 renorm_clipping=None,
                 renorm_decay=0.99,
                 adjustment=None):
     print("_batch_norm:center:", center)
     print("_batch_norm:scale :", scale)
     bn_with_scale_false = slim.batch_norm(
         inputs, decay, False, False, epsilon, activation_fn,
         param_initializers, param_regularizers, updates_collections,
         is_training, reuse, variables_collections, outputs_collections,
         trainable, batch_weights, fused, data_format,
         zero_debias_moving_mean, scope, renorm, renorm_clipping,
         renorm_decay, adjustment)
     with tf.variable_scope('XBatchNorm') as scbn:
         gamma = slim.model_variable('gamma',
                                     shape=[inputs.shape[-1]],
                                     initializer=tf.ones_initializer(),
                                     regularizer=slim.l1_regularizer(
                                         0.0001))  #slim.l1_regularizer!!!
         beta = slim.model_variable('beta',
                                    shape=[inputs.shape[-1]],
                                    initializer=tf.zeros_initializer())
     bn = tf.multiply(bn_with_scale_false, pruning.apply_mask(gamma, scbn))
     bn = tf.add(bn, beta)
     return bn
Example #23
def bottom_simple(x, model_hparams, vocab_size, name, reuse):
  """Bottom transformation."""
  with tf.variable_scope(name, reuse=reuse):
    # Ensure the inputs are 3-D
    if len(x.get_shape()) == 4:
      x = tf.squeeze(x, axis=3)
    while len(x.get_shape()) < 3:
      x = tf.expand_dims(x, axis=-1)

    var = _get_weights(model_hparams, vocab_size)
    x = common_layers.dropout_no_scaling(
        x, 1.0 - model_hparams.symbol_dropout)

    sparsity_technique = model_hparams.get("sparsity_technique")
    training = model_hparams.get("mode") == tf.estimator.ModeKeys.TRAIN
    if sparsity_technique == "variational_dropout":
      if training:
        ret = vd.nn.embedding_lookup_train(
            var,
            x,
            clip_alpha=model_hparams.get("clip_log_alpha"))
      else:
        threshold = model_hparams.get("log_alpha_threshold")
        ret = vd.nn.embedding_lookup_eval(
            var,
            x,
            threshold=threshold)
    elif sparsity_technique == "l0_regularization":
      if training:
        ret = l0.nn.embedding_lookup_train(var, x)
      else:
        ret = l0.nn.embedding_lookup_eval(var, x)
    elif (sparsity_technique == "magnitude_pruning" or
          sparsity_technique == "random_pruning"):
      ret = common_layers.gather(pruning.apply_mask(var), x)
    else:
      ret = common_layers.gather(var, x)

    # post-process the embedding vectors
    if model_hparams.multiply_embedding_mode == "sqrt_depth":
      ret *= model_hparams.hidden_size**0.5
    ret *= tf.expand_dims(tf.to_float(tf.not_equal(x, 0)), -1)
    return ret
Example #24
def _conv2d(inputs,
            weights=None,
            num_filters=16,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding='SAME',
            initializer=tf.contrib.layers.xavier_initializer(),
            l2_strenth=0.0,
            bias=0.0,
            name="conv2d",
            is_pruning=False):

    with tf.variable_scope(name):
        stride = [1, strides[0], strides[1], 1]
        kernel_shape = [
            kernel_size[0], kernel_size[1], inputs.shape[-1], num_filters
        ]

        #with tf.name_scope(name):
        if weights is None:
            weights = _variable_with_weight_decay(kernel_shape,
                                                  initializier=initializer,
                                                  wd=l2_strenth,
                                                  name="weights")
        if isinstance(bias, float):
            bias = tf.get_variable("bias", [num_filters],
                                   initializer=tf.constant_initializer(bias))

        _variable_summaries(weights)
        _variable_summaries(bias)
        if is_pruning:
            weights = pruning.apply_mask(weights)
        conv2d_out = tf.nn.conv2d(inputs,
                                  weights,
                                  strides=stride,
                                  padding=padding)
        conv2d_out = tf.nn.bias_add(conv2d_out, bias)

        _variable_summaries(conv2d_out)
    return conv2d_out
Example #25
 def fc(self,
        input_tensor,
        shape,
        init_func=msra_init,
        enable_bn=False,
        enable_bias=True,
        enable_prune=False):
     W = init_func(shape, "fc_weights")
     if enable_prune:
         W = pruning.apply_mask(W)
         self.prune_weights.append(W)
     current_tensor = tf.matmul(input_tensor, W)
     if enable_bias:
         bias = zeros_init([shape[1]], "fc_bias")
         current_tensor = tf.add(current_tensor, bias)
     if enable_bn:
         scale = ones_init([shape[1]], "bn_scale")
         bias = zeros_init([shape[1]], "bn_bias")
         current_tensor = self.bn(current_tensor,
                                  scale,
                                  bias,
                                  is_conv=False)
     return current_tensor
Example #26
 def _build_conv_layer(self,
                       input,
                       scope,
                       weight_init,
                       filter_hight,
                       filter_width,
                       channel_in,
                       channel_out,
                       activation=None):
     kernel = self._variable_with_weight_decay(
         'weights',
         shape=[filter_hight, filter_width, channel_in, channel_out],
         initialization=weight_init,
         wd=self.wd)
     conv = tf.nn.conv2d(input,
                         pruning.apply_mask(kernel, scope), [1, 1, 1, 1],
                         padding='SAME')
     biases = self._variable_on_cpu('biases', channel_out,
                                    tf.constant_initializer(0.0))
     pre_activation = tf.nn.bias_add(conv, biases)
     if activation:
         return activation(pre_activation, name=scope.name)
     else:
         return pre_activation
Example #27
def inference(images):
  """Build the CIFAR-10 model.

  Args:
    images: Images returned from distorted_inputs() or inputs().

  Returns:
    Logits.
  """
  # We instantiate all variables using tf.get_variable() instead of
  # tf.Variable() in order to share variables across multiple GPU training runs.
  # If we only ran this model on a single GPU, we could simplify this function
  # by replacing all instances of tf.get_variable() with tf.Variable().
  #
  # While instantiating conv and local layers, we add mask and threshold
  # variables to the layer by calling the pruning.apply_mask() function.
  # Note that the masks are applied only to the weight tensors
  # conv1
  with tf.variable_scope('conv1') as scope:
    kernel = _variable_with_weight_decay('weights',
                                         shape=[5, 5, 3, 64],
                                         stddev=5e-2,
                                         wd=0.0)

    conv = tf.nn.conv2d(
        images, pruning.apply_mask(kernel, scope), [1, 1, 1, 1], padding='SAME')
    biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
    pre_activation = tf.nn.bias_add(conv, biases)
    conv1 = tf.nn.relu(pre_activation, name=scope.name)
    _activation_summary(conv1)

  # pool1
  pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                         padding='SAME', name='pool1')
  # norm1
  norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                    name='norm1')

  # conv2
  with tf.variable_scope('conv2') as scope:
    kernel = _variable_with_weight_decay('weights',
                                         shape=[5, 5, 64, 64],
                                         stddev=5e-2,
                                         wd=0.0)
    conv = tf.nn.conv2d(
        norm1, pruning.apply_mask(kernel, scope), [1, 1, 1, 1], padding='SAME')
    biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
    pre_activation = tf.nn.bias_add(conv, biases)
    conv2 = tf.nn.relu(pre_activation, name=scope.name)
    _activation_summary(conv2)

  # norm2
  norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75,
                    name='norm2')
  # pool2
  pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1],
                         strides=[1, 2, 2, 1], padding='SAME', name='pool2')

  # local3
  with tf.variable_scope('local3') as scope:
    # Move everything into depth so we can perform a single matrix multiply.
    reshape = tf.reshape(pool2, [BATCH_SIZE, -1])
    dim = reshape.get_shape()[1].value
    weights = _variable_with_weight_decay('weights', shape=[dim, 384],
                                          stddev=0.04, wd=0.004)
    biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1))
    local3 = tf.nn.relu(
        tf.matmul(reshape, pruning.apply_mask(weights, scope)) + biases,
        name=scope.name)
    _activation_summary(local3)

  # local4
  with tf.variable_scope('local4') as scope:
    weights = _variable_with_weight_decay('weights', shape=[384, 192],
                                          stddev=0.04, wd=0.004)
    biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1))
    local4 = tf.nn.relu(
        tf.matmul(local3, pruning.apply_mask(weights, scope)) + biases,
        name=scope.name)
    _activation_summary(local4)

  # linear layer(WX + b),
  # We don't apply softmax here because
  # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled logits
  # and performs the softmax internally for efficiency.
  with tf.variable_scope('softmax_linear') as scope:
    weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES],
                                          stddev=1/192.0, wd=0.0)
    biases = _variable_on_cpu('biases', [NUM_CLASSES],
                              tf.constant_initializer(0.0))
    softmax_linear = tf.add(
        tf.matmul(local4, pruning.apply_mask(weights, scope)),
        biases,
        name=scope.name)
    _activation_summary(softmax_linear)

  return softmax_linear
Example #28
def conv(x,
         filter_height,
         filter_width,
         num_filters,
         stride_y,
         stride_x,
         name,
         prune=True,
         padding='SAME',
         groups=1):
    """Create a convolution layer.

    Adapted from: https://github.com/ethereon/caffe-tensorflow
    """
    # Get number of input channels
    input_channels = int(x.get_shape()[-1])

    # Create lambda function for the convolution
    if prune:
        convolve = lambda i, k, sc: tf.nn.conv2d(
            i,
            pruning.apply_mask(k, sc),  #add mask for model pruning
            strides=[1, stride_y, stride_x, 1],
            padding=padding)
    else:
        convolve = lambda i, k, sc: tf.nn.conv2d(
            i, k, strides=[1, stride_y, stride_x, 1], padding=padding)

    with tf.variable_scope(name) as scope:
        # Create tf variables for the weights and biases of the conv layer
        weights = tf.get_variable('weights',
                                  shape=[
                                      filter_height, filter_width,
                                      input_channels // groups, num_filters
                                  ])
        biases = tf.get_variable('biases', shape=[num_filters])

        if groups == 1:
            conv = convolve(x, weights, scope)

        else:
            # In the case of multiple groups, split input and weights and
            # convolve them separately
            input_groups = tf.split(axis=3, num_or_size_splits=groups, value=x)
            weight_groups = tf.split(axis=3,
                                     num_or_size_splits=groups,
                                     value=weights)
            num = 1
            output_groups = list()
            for i, k in zip(input_groups, weight_groups):
                with tf.variable_scope(str(num)) as scope_next:
                    output_groups.append(convolve(i, k, scope_next))
                    num += 1

            # Concat the convolved output together again
            conv = tf.concat(axis=3, values=output_groups)

    # Add biases
    bias = tf.reshape(tf.nn.bias_add(conv, biases), tf.shape(conv))

    # Apply relu function
    relu = tf.nn.relu(bias, name=scope.name)

    return relu
Example #29
def compute_attention_component(antecedent,
                                total_depth,
                                filter_width=1,
                                padding="VALID",
                                name="c",
                                vars_3d_num_heads=0,
                                sparsity_technique=None,
                                threshold=3.0,
                                training=True,
                                clip_alpha=None,
                                initial_sparsity=None,
                                split_heads=False,
                                num_heads=None):
    """Computes attention compoenent (query, key or value).

  Args:
    antecedent: a Tensor with shape [batch, length, channels]
    total_depth: an integer
    filter_width: An integer specifying how wide you want the attention
      component to be.
    padding: One of "VALID", "SAME" or "LEFT". Default is VALID: No padding.
    name: a string specifying scope name.
    vars_3d_num_heads: an optional integer (if we want to use 3d variables)
    sparsity_technique: technique used for sparsifying weights.
    threshold: log alpha threshold used for evaluation with variational dropout.
    training: whether model is being trained or not.
    clip_alpha: alpha clipping threshold for variational dropout.
    initial_sparsity: initial sparsity level for lottery ticket &
      scratch experiments.
    split_heads: Whether to prune each head separately.
    num_heads: The number of heads in the attention module.

  Returns:
    c : [batch, length, depth] tensor
  """
    # We don't support 3d attention variables or filter_width > 1 with sparsity
    # techniques
    assert not sparsity_technique or (not vars_3d_num_heads
                                      and filter_width == 1)

    if vars_3d_num_heads > 0:
        assert filter_width == 1
        input_depth = antecedent.get_shape().as_list()[-1]
        depth_per_head = total_depth // vars_3d_num_heads
        initializer_stddev = input_depth**-0.5
        if "q" in name:
            initializer_stddev *= depth_per_head**-0.5
        var = tf.get_variable(
            name,
            [input_depth, vars_3d_num_heads, total_depth // vars_3d_num_heads],
            initializer=tf.random_normal_initializer(
                stddev=initializer_stddev))
        var = tf.cast(var, antecedent.dtype)
        var = tf.reshape(var, [input_depth, total_depth])
        return tf.tensordot(antecedent, var, axes=1)
    if filter_width == 1:
        if sparsity_technique:
            if split_heads:
                # Prune each heads weights separately so that they are free
                # to have different weight magnitude distributions.
                if num_heads is None:
                    raise ValueError(
                        "`num_heads` must be set for split head pruning.")
                if total_depth % num_heads != 0:
                    raise ValueError(
                        "`total_depth` must be divisible by `num_heads`.")
                input_depth = antecedent.get_shape().as_list()[-1]
                depth_per_head = int(total_depth / num_heads)
                masked_head_weights = []
                for head_id in range(num_heads):
                    head_name = name + "_shard_{}".format(head_id)
                    with tf.variable_scope(head_name) as vs:
                        head_weights = tf.get_variable(
                            "kernel", [input_depth, depth_per_head])
                        masked_head_weights.append(
                            pruning.apply_mask(head_weights, vs))
                component_weights = tf.concat(masked_head_weights, axis=1)

                # compute the full component result
                return tf.tensordot(antecedent, component_weights, axes=1)
            else:
                return common_sparse.dense(
                    antecedent,
                    total_depth,
                    use_bias=False,
                    sparsity_technique=sparsity_technique,
                    threshold=threshold,
                    training=training,
                    clip_alpha=clip_alpha,
                    name=name,
                    initial_sparsity=initial_sparsity)
        else:
            return common_layers.dense(antecedent,
                                       total_depth,
                                       use_bias=False,
                                       name=name)
    else:
        return common_layers.conv1d(antecedent,
                                    total_depth,
                                    filter_width,
                                    padding=padding,
                                    name=name)
Example #30
def inference(images):
    """Build the CIFAR-10 model.

  Args:
    images: Images returned from distorted_inputs() or inputs().

  Returns:
    Logits.
  """
    # We instantiate all variables using tf.compat.v1.get_variable() instead of
    # tf.Variable() in order to share variables across multiple GPU training runs.
    # If we only ran this model on a single GPU, we could simplify this function
    # by replacing all instances of tf.compat.v1.get_variable() with tf.Variable().
    #
    # While instantiating conv and local layers, we add mask and threshold
    # variables to the layer by calling the pruning.apply_mask() function.
    # Note that the masks are applied only to the weight tensors
    # conv1
    with tf.variable_scope('conv1') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 3, 64],
                                             stddev=5e-2,
                                             wd=0.0)

        conv = tf.nn.conv2d(images,
                            pruning.apply_mask(kernel, scope), [1, 1, 1, 1],
                            padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv1)

    # pool1
    pool1 = tf.nn.max_pool(conv1,
                           ksize=[1, 3, 3, 1],
                           strides=[1, 2, 2, 1],
                           padding='SAME',
                           name='pool1')
    # norm1
    norm1 = tf.nn.lrn(pool1,
                      4,
                      bias=1.0,
                      alpha=0.001 / 9.0,
                      beta=0.75,
                      name='norm1')

    # conv2
    with tf.variable_scope('conv2') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 64, 64],
                                             stddev=5e-2,
                                             wd=0.0)
        conv = tf.nn.conv2d(norm1,
                            pruning.apply_mask(kernel, scope), [1, 1, 1, 1],
                            padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv2)

    # norm2
    norm2 = tf.nn.lrn(conv2,
                      4,
                      bias=1.0,
                      alpha=0.001 / 9.0,
                      beta=0.75,
                      name='norm2')
    # pool2
    pool2 = tf.nn.max_pool(norm2,
                           ksize=[1, 3, 3, 1],
                           strides=[1, 2, 2, 1],
                           padding='SAME',
                           name='pool2')

    # local3
    with tf.variable_scope('local3') as scope:
        # Move everything into depth so we can perform a single matrix multiply.
        reshape = tf.reshape(pool2, [BATCH_SIZE, -1])
        dim = reshape.get_shape()[1].value
        weights = _variable_with_weight_decay('weights',
                                              shape=[dim, 384],
                                              stddev=0.04,
                                              wd=0.004)
        biases = _variable_on_cpu('biases', [384],
                                  tf.constant_initializer(0.1))
        local3 = tf.nn.relu(
            tf.matmul(reshape, pruning.apply_mask(weights, scope)) + biases,
            name=scope.name)
        _activation_summary(local3)

    # local4
    with tf.variable_scope('local4') as scope:
        weights = _variable_with_weight_decay('weights',
                                              shape=[384, 192],
                                              stddev=0.04,
                                              wd=0.004)
        biases = _variable_on_cpu('biases', [192],
                                  tf.constant_initializer(0.1))
        local4 = tf.nn.relu(
            tf.matmul(local3, pruning.apply_mask(weights, scope)) + biases,
            name=scope.name)
        _activation_summary(local4)

    # linear layer(WX + b),
    # We don't apply softmax here because
    # tf.nn.sparse_softmax_cross_entropy_with_logits accepts the unscaled logits
    # and performs the softmax internally for efficiency.
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES],
                                              stddev=1 / 192.0,
                                              wd=0.0)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0))
        softmax_linear = tf.add(tf.matmul(local4,
                                          pruning.apply_mask(weights, scope)),
                                biases,
                                name=scope.name)
        _activation_summary(softmax_linear)

    return softmax_linear
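The inference() graph above only creates the masks; a training script still has to drive them. A hedged sketch of that counterpart, loosely following the layout of the model_pruning CIFAR-10 training example (the dummy loss, hparam string, and step count are assumptions):

import tensorflow as tf
from tensorflow.contrib.model_pruning.python import pruning

tf.reset_default_graph()
global_step = tf.train.get_or_create_global_step()

with tf.variable_scope('dummy'):
    w = tf.get_variable('weights', [8, 8])
    loss = tf.reduce_sum(tf.square(pruning.apply_mask(w)))
train_op = tf.train.GradientDescentOptimizer(0.01).minimize(
    loss, global_step=global_step)

hparams = pruning.get_pruning_hparams().parse(
    'begin_pruning_step=0,end_pruning_step=1000,'
    'pruning_frequency=100,target_sparsity=0.75')
p = pruning.Pruning(hparams, global_step=global_step)
mask_update_op = p.conditional_mask_update_op()
p.add_pruning_summaries()  # adds sparsity/threshold scalar summaries to the graph

with tf.train.MonitoredTrainingSession() as sess:
    for _ in range(1000):
        sess.run(train_op)
        sess.run(mask_update_op)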
Example #31
def top(body_output, targets, model_hparams, vocab_size):
    """Generate logits.

  Args:
    body_output: A Tensor with shape [batch, p0, p1, body_input_depth]
    targets: Unused.
    model_hparams: tf.HParams, model hyperparameters.
    vocab_size: int, vocabulary size.

  Returns:
    logits: A Tensor with shape  [batch, p0, p1, ?, vocab_size].
  """
    del targets  # unused arg
    # Sparsity techniques only support shared weight matrices for now
    sparsity_technique = model_hparams.get("sparsity_technique")
    assert (not sparsity_technique
            or model_hparams.shared_embedding_and_softmax_weights)
    if model_hparams.shared_embedding_and_softmax_weights:
        scope_name = "shared"
        reuse = tf.AUTO_REUSE
    else:
        scope_name = "softmax"
        reuse = False

    with tf.variable_scope(scope_name, reuse=reuse):
        body_output_shape = common_layers.shape_list(body_output)
        var = _get_weights(model_hparams, vocab_size, body_output_shape[-1])
        if (model_hparams.factored_logits
                and model_hparams.mode == tf.estimator.ModeKeys.TRAIN):
            # Sparsity techniques only support non-factored logits for now
            assert not sparsity_technique

            # insert channels dimension
            body_output = tf.expand_dims(body_output, 3)
            return common_layers.FactoredTensor(body_output, var)
        else:
            body_output = tf.reshape(body_output, [-1, body_output_shape[-1]])

            training = model_hparams.get("mode") == tf.estimator.ModeKeys.TRAIN
            if sparsity_technique == "variational_dropout":
                if training:
                    logits = vd.nn.matmul_train(
                        body_output,
                        var,
                        transpose_b=True,
                        clip_alpha=model_hparams.get("clip_log_alpha"))
                else:
                    threshold = model_hparams.get("log_alpha_threshold")
                    logits = vd.nn.matmul_eval(body_output,
                                               var,
                                               transpose_b=True,
                                               threshold=threshold)
            elif sparsity_technique == "l0_regularization":
                if training:
                    logits = l0.nn.matmul_train(body_output,
                                                var,
                                                transpose_b=True)
                else:
                    logits = l0.nn.matmul_eval(body_output,
                                               var,
                                               transpose_b=True)
            elif (sparsity_technique == "magnitude_pruning"
                  or sparsity_technique == "random_pruning"):
                logits = tf.matmul(body_output,
                                   pruning.apply_mask(var),
                                   transpose_b=True)
            else:
                logits = tf.matmul(body_output, var, transpose_b=True)

            return tf.reshape(logits, body_output_shape[:-1] + [1, vocab_size])
Example #32
def MaskedConv2D(
        inputs,
        filters,
        kernel_size,
        strides=(1, 1),
        padding='same',
        data_format='channels_last',
        dilation_rate=(1, 1),
        activation=None,
        use_bias=True,
        kernel_initializer=None,
        bias_initializer=tf.zeros_initializer(),
        kernel_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        split=1,
        masking=False):
    """
    A wrapper around `tf.layers.Conv2D`.
    Some differences to maintain backward-compatibility:

    1. Default kernel initializer is variance_scaling_initializer(2.0).
    2. Default padding is 'same'.
    3. Support 'split' argument to do group conv.

    Variable Names:

    * ``W``: weights
    * ``b``: bias
    """
    if kernel_initializer is None:
        if get_tf_version_tuple() <= (1, 12):
            kernel_initializer = tf.contrib.layers.variance_scaling_initializer(2.0)
        else:
            kernel_initializer = tf.keras.initializers.VarianceScaling(2.0, distribution='untruncated_normal')
    dilation_rate = shape2d(dilation_rate)

    if not masking and split == 1 and dilation_rate == [1, 1]:
        # tf.layers.Conv2D has bugs with dilations (https://github.com/tensorflow/tensorflow/issues/26797)
        with rename_get_variable({'kernel': 'W', 'bias': 'b'}):
            layer = tf.layers.Conv2D(
                filters,
                kernel_size,
                strides=strides,
                padding=padding,
                data_format=data_format,
                dilation_rate=dilation_rate,
                activation=activation,
                use_bias=use_bias,
                kernel_initializer=kernel_initializer,
                bias_initializer=bias_initializer,
                kernel_regularizer=kernel_regularizer,
                bias_regularizer=bias_regularizer,
                activity_regularizer=activity_regularizer,
                _reuse=tf.get_variable_scope().reuse)
            ret = layer.apply(inputs, scope=tf.get_variable_scope())
            ret = tf.identity(ret, name='output')

        ret.variables = VariableHolder(W=layer.kernel)
        if use_bias:
            ret.variables.b = layer.bias

    else:
        if masking:
            assert split == 1, "Pruning group conv is not supported yet"

        # group conv implementation
        data_format = get_data_format(data_format, keras_mode=False)
        in_shape = inputs.get_shape().as_list()
        channel_axis = 3 if data_format == 'NHWC' else 1
        in_channel = in_shape[channel_axis]
        assert in_channel is not None, "[Conv2D] Input cannot have unknown channel!"
        assert in_channel % split == 0

        assert kernel_regularizer is None and bias_regularizer is None and activity_regularizer is None, \
            "Not supported by group conv or dilated conv!"

        out_channel = filters
        assert out_channel % split == 0
        assert dilation_rate == [1, 1] or get_tf_version_tuple() >= (1, 5), 'TF>=1.5 required for dilated conv.'

        kernel_shape = shape2d(kernel_size)
        filter_shape = kernel_shape + [in_channel // split, out_channel]
        stride = shape4d(strides, data_format=data_format)

        kwargs = dict(data_format=data_format)
        if get_tf_version_tuple() >= (1, 5):
            kwargs['dilations'] = shape4d(dilation_rate, data_format=data_format)

        W = tf.get_variable(
            'W', filter_shape, initializer=kernel_initializer)

        if use_bias:
            b = tf.get_variable('b', [out_channel], initializer=bias_initializer)

        if split == 1:
            if masking:
                W = pruning.apply_mask(W)
            conv = tf.nn.conv2d(inputs, W, stride, padding.upper(), **kwargs)
        else:
            conv = None
            if get_tf_version_tuple() >= (1, 13):
                try:
                    conv = tf.nn.conv2d(inputs, W, stride, padding.upper(), **kwargs)
                except ValueError:
                    log_once("CUDNN group convolution support is only available with "
                             "https://github.com/tensorflow/tensorflow/pull/25818 . "
                             "Will fall back to a loop-based slow implementation instead!", 'warn')
            if conv is None:
                inputs = tf.split(inputs, split, channel_axis)
                kernels = tf.split(W, split, 3)
                outputs = [tf.nn.conv2d(i, k, stride, padding.upper(), **kwargs)
                           for i, k in zip(inputs, kernels)]
                conv = tf.concat(outputs, channel_axis)

        ret = tf.nn.bias_add(conv, b, data_format=data_format) if use_bias else conv
        if activation is not None:
            ret = activation(ret)
        ret = tf.identity(ret, name='output')

        ret.variables = VariableHolder(W=W)
        if use_bias:
            ret.variables.b = b
    return ret