    def testSparseGlorotUniform_OutputShape(self):
        initializer = common_init.SparseGlorotUniform(0.5)
        x = tf.get_variable("x",
                            shape=[512, 1024],
                            initializer=initializer,
                            dtype=tf.float32)
        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            res = sess.run(x)
        self.assertEqual(res.shape, (512, 1024))

    def testSparseGlorotUniform_NoSparsity(self):
        initializer = common_init.SparseGlorotUniform(0, seed=5)
        initializer_base = tf.glorot_uniform_initializer(seed=5)

        x = tf.get_variable("x",
                            shape=[512, 1024],
                            initializer=initializer,
                            dtype=tf.float32)
        y = tf.get_variable("y",
                            shape=[512, 1024],
                            initializer=initializer_base,
                            dtype=tf.float32)

        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            res_x = sess.run(x)
            res_y = sess.run(y)
        self.assertEqual(res_x.shape, (512, 1024))
        self.assertEqual(res_y.shape, (512, 1024))
        self.assertAllEqual(res_x, res_y)
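
    # A minimal additional check in the same style as the tests above (a
    # sketch: the method name, shape, and dtype below are illustrative, not
    # taken from the existing suite): the initializer should also work with a
    # non-default dtype, since dense() passes one through.
    def testSparseGlorotUniform_Float64(self):
        initializer = common_init.SparseGlorotUniform(0.5, dtype=tf.float64)
        x = tf.get_variable("x",
                            shape=[256, 128],
                            initializer=initializer,
                            dtype=tf.float64)
        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            res = sess.run(x)
        self.assertEqual(res.shape, (256, 128))
        self.assertEqual(res.dtype, tf.float64.as_numpy_dtype)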
def dense(x,
          units,
          activation=None,
          use_bias=True,
          kernel_initializer="glorot_uniform",
          bias_initializer="zeros",
          sparsity_technique="variational_dropout",
          auxiliary_initializer=None,
          threshold=3.0,
          clip_alpha=None,
          training=True,
          dtype=tf.float32,
          name=None,
          initial_sparsity=None):
    """Matmul & bias add that supports broadcasting for batched gemm.

  Supports a contrained set of functionality provided by tf.layers.dense.

  Args:
    x: input tensor.
    units: number of units in the dense layer.
    activation: activation function to use in the layer.
    use_bias: whether or not to add a bias to the output.
    kernel_initializer: weight initializer for the layer.
    bias_initializer: weight initializer for the bias.
    sparsity_technique: sparsification technique to apply to the weights.
    auxiliary_initializer: initializer for auxiliary variables use in
      variational dropout and l0 regularization.
    threshold: log-alpha threshold for variational dropout.
    clip_alpha: whether to clip the alpha values for variational dropout.
    training: whether this run is training or evaluation the model.
    dtype: data type for the weights and computation.
    name: name for the layer.
    initial_sparsity: initial weight sparsity at the start of training.

  Returns:
    Tensor representing the output of the layer.
  """
    activation = activations.get(activation)
    kernel_initializer = initializers.get(kernel_initializer)
    bias_initializer = initializers.get(bias_initializer)

    if sparsity_technique in ("magnitude_pruning", "random_pruning"):
        if initial_sparsity is not None:
            # If the initial sparsity value is passed in, use the sparse glorot
            # uniform initializer to account for the zero valued weights.
            kernel_initializer = common_init.SparseGlorotUniform(
                initial_sparsity, dtype=dtype)
            tf.logging.info(
                "Using sparse initialization with sparsity {} for variable {}".
                format(initial_sparsity,
                       tf.get_variable_scope().name))

        # If the sparsity technique is magnitude_pruning or random_pruning,
        # use the model_pruning masked_fully_connected layer.
        #
        # masked_fully_connected doesn't take a use_bias arg, so pass None as
        # the bias initializer if we don't want a bias variable.
        bias_initializer = bias_initializer if use_bias else None
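        # Note (a sketch, not part of this function): the masks created by
        # masked_fully_connected below are updated outside of this layer,
        # assuming the tf.contrib.model_pruning API; the names here are
        # illustrative.
        #
        #   hparams = pruning.get_pruning_hparams().parse(pruning_hparam_str)
        #   p = pruning.Pruning(hparams, global_step=global_step)
        #   mask_update_op = p.conditional_mask_update_op()
        #   # run mask_update_op alongside the train op to sparsify the kernel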
        with tf.variable_scope(name, default_name="dense"):
            return pruning_layers.masked_fully_connected(
                inputs=x,
                num_outputs=units,
                activation_fn=activation,
                weights_initializer=kernel_initializer,
                biases_initializer=bias_initializer)
    if initial_sparsity is not None:
        raise ValueError("initial_sparsity is only supported for "
                         "magnitude_pruning and random_pruning")

    input_shape = x.get_shape().as_list()
    if input_shape[-1] is None:
        raise ValueError("The last dimension of the inputs to `Dense` "
                         "should be defined. Found `None`.")

    with tf.variable_scope(name, default_name="dense") as vs:
        kernel = tf.get_variable("kernel",
                                 shape=[input_shape[-1], units],
                                 initializer=kernel_initializer,
                                 dtype=dtype,
                                 trainable=True)

        bias = None
        if use_bias:
            bias = tf.get_variable("bias",
                                   shape=[units],
                                   initializer=bias_initializer,
                                   dtype=dtype,
                                   trainable=True)

    # Compute the dense layer
    if sparsity_technique == "variational_dropout":
        log_sigma2_initializer = initializers.get(auxiliary_initializer)

        if not log_sigma2_initializer:
            log_sigma2_initializer = tf.constant_initializer(value=-10,
                                                             dtype=dtype)
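            # (The -10 default gives \sigma^2 = e^{-10} ~= 4.5e-5, so the
            # multiplicative noise is negligible at the start of training and
            # the layer initially behaves like a plain dense layer.)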

        with tf.variable_scope(vs, auxiliary_name_scope=False) as vs1:
            with tf.name_scope(vs1.original_name_scope):
                log_sigma2 = tf.get_variable(
                    "log_sigma2",
                    shape=[input_shape[-1], units],
                    initializer=log_sigma2_initializer,
                    dtype=dtype,
                    trainable=True)

        variational_parameters = (kernel, log_sigma2)
        tf.add_to_collection(VARIATIONAL_DROPOUT_PARAMETERS,
                             variational_parameters)
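        # At evaluation time the *_eval matmuls below drop weights whose
        # log \alpha = log \sigma^2 - log \theta^2 exceeds `threshold`, which
        # is what makes the evaluated model sparse.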

        input_rank = x.get_shape().ndims
        if input_rank > 2:
            if training:
                outputs = vd.nn.broadcast_matmul_train(x,
                                                       variational_parameters,
                                                       clip_alpha=clip_alpha)
            else:
                outputs = vd.nn.broadcast_matmul_eval(x,
                                                      variational_parameters,
                                                      threshold)
        else:
            if training:
                outputs = vd.nn.matmul_train(x,
                                             variational_parameters,
                                             clip_alpha=clip_alpha)
            else:
                outputs = vd.nn.matmul_eval(x, variational_parameters,
                                            threshold)
    else:
        if sparsity_technique != "l0_regularization":
            raise ValueError(
                "Unsupported sparsity technique {}".format(sparsity_technique))
        log_alpha_initializer = initializers.get(auxiliary_initializer)

        if not log_alpha_initializer:
            # Default to \alpha / (\alpha + 1) equal to 0.9, i.e. gates that
            # start mostly open: sigmoid(2.197) = 1 / (1 + exp(-2.197)) ~= 0.9.
            log_alpha_initializer = tf.random_normal_initializer(mean=2.197,
                                                                 stddev=0.01,
                                                                 dtype=dtype)

        with tf.variable_scope(vs, auxiliary_name_scope=False) as vs1:
            with tf.name_scope(vs1.original_name_scope):
                log_alpha = tf.get_variable("log_alpha",
                                            shape=[input_shape[-1], units],
                                            initializer=log_alpha_initializer,
                                            dtype=dtype,
                                            trainable=True)

        weight_parameters = (kernel, log_alpha)
        tf.add_to_collection(L0_REGULARIZATION_PARAMETERS, weight_parameters)
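        # The (kernel, log_alpha) pairs collected here are typically read back
        # with tf.get_collection(L0_REGULARIZATION_PARAMETERS) to build the
        # expected-L0 penalty added to the training objective.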

        input_rank = x.get_shape().ndims
        if input_rank > 2:
            if training:
                outputs = l0.nn.broadcast_matmul_train(x, weight_parameters)
            else:
                outputs = l0.nn.broadcast_matmul_eval(x, weight_parameters)
        else:
            if training:
                outputs = l0.nn.matmul_train(x, weight_parameters)
            else:
                outputs = l0.nn.matmul_eval(x, weight_parameters)

    # Handle the bias and activation
    if use_bias:
        outputs = tf.nn.bias_add(outputs, bias)
    if activation is not None:
        return activation(outputs)
    return outputs
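
# A minimal usage sketch (illustrative only, not part of this module; the
# tensor names and sizes below are assumptions):
#
#   inputs = tf.placeholder(tf.float32, shape=[None, 784])
#   hidden = dense(
#       inputs,
#       units=300,
#       activation=tf.nn.relu,
#       sparsity_technique="variational_dropout",
#       clip_alpha=8.0,
#       training=True,
#       name="hidden0")
#
# The (kernel, log_sigma2) pairs the layer registers can then be fetched with
# tf.get_collection(VARIATIONAL_DROPOUT_PARAMETERS) to build the KL
# regularization term for the training objective.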