Code example #1
# Keras 2.x imports
from keras import backend as K
from keras import initializers, regularizers, constraints
from keras.engine.topology import Layer


class Attention(Layer):
    def __init__(self,
                 step_dim,
                 W_regularizer=None,
                 b_regularizer=None,
                 W_constraint=None,
                 b_constraint=None,
                 bias=True,
                 **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Example:
            # 1
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
            # next add a Dense layer (for classification/regression) or whatever...
            # 2
            hidden = LSTM(64, return_sequences=True)(words)
            sentence = Attention()(hidden)
            # next add a Dense layer (for classification/regression) or whatever...
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0

        super(Attention, self).__init__(**kwargs)
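
The constructor above only stores the layer configuration. For context, the following is a minimal sketch of the build/compute_mask/call methods that typically complete such a layer, implementing the feed-forward attention of Raffel et al.: each time step is scored with a tanh projection, the scores are softmax-normalised over the steps (respecting the mask), and the output is the attention-weighted sum over time. This is an illustrative sketch rather than the original implementation; K refers to keras.backend as imported above.

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.features_dim = input_shape[-1]

        # One scoring weight per feature, shared across all time steps.
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            # One bias per time step.
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zeros',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None
        self.built = True

    def compute_mask(self, inputs, mask=None):
        # The time dimension is reduced away, so no mask is propagated.
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        # e_t = tanh(h_t . W + b): one unnormalised score per time step.
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                              K.reshape(self.W, (features_dim, 1))),
                        (-1, step_dim))
        if self.bias:
            eij += self.b
        eij = K.tanh(eij)

        # a_t = softmax(e_t), written out so masked steps can be zeroed.
        a = K.exp(eij)
        if mask is not None:
            a *= K.cast(mask, K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Attention-weighted sum over time:
        # (samples, steps, features) -> (samples, features).
        weighted_input = x * K.expand_dims(a)
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], self.features_dim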
Code example #2
# TensorFlow 1.x style. The `vd`, `l0`, and `common_init` modules come from the
# surrounding sparsity codebase and are not shown here; the two collection keys
# below are assumed names.
import tensorflow as tf
from tensorflow.contrib.model_pruning.python.layers import layers as pruning_layers
from tensorflow.python.keras import activations
from tensorflow.python.keras import initializers

VARIATIONAL_DROPOUT_PARAMETERS = "variational_dropout_parameters"
L0_REGULARIZATION_PARAMETERS = "l0_regularization_parameters"


def dense(
    x,
    units,
    activation=None,
    use_bias=True,
    kernel_initializer="glorot_uniform",
    bias_initializer="zeros",
    sparsity_technique="variational_dropout",
    auxiliary_initializer=None,
    threshold=3.0,
    clip_alpha=None,
    training=True,
    dtype=tf.float32,
    name=None,
    initial_sparsity=None):
  """Matmul & bias add that supports broadcasting for batched gemm.

  Supports a constrained subset of the functionality provided by tf.layers.dense.

  Args:
    x: input tensor.
    units: number of units in the dense layer.
    activation: activation function to use in the layer.
    use_bias: whether or not to add a bias to the output.
    kernel_initializer: weight initializer for the layer.
    bias_initializer: weight initializer for the bias.
    sparsity_technique: sparsification technique to apply to the weights.
    auxiliary_initializer: initializer for auxiliary variables used in
      variational dropout and l0 regularization.
    threshold: log-alpha threshold for variational dropout.
    clip_alpha: bound at which to clip the log-alpha values for variational
      dropout, or None to disable clipping.
    training: whether this run is training or evaluating the model.
    dtype: data type for the weights and computation.
    name: name for the layer.
    initial_sparsity: initial weight sparsity at the start of training.

  Returns:
    Tensor representing the output of the layer.
  """
  activation = activations.get(activation)
  kernel_initializer = initializers.get(kernel_initializer)
  bias_initializer = initializers.get(bias_initializer)

  if (sparsity_technique == "magnitude_pruning" or
      sparsity_technique == "random_pruning"):
    if initial_sparsity is not None:
      # If an initial sparsity value is passed in, use the sparse Glorot
      # uniform initializer to account for the zero-valued weights.
      kernel_initializer = common_init.SparseGlorotUniform(
          initial_sparsity, dtype=dtype)
      tf.logging.info(
          "Using sparse initialization with sparsity {} for variable {}"
          .format(initial_sparsity, tf.get_variable_scope().name))

    # If the sparsity technique is magnitude_pruning or random_pruning,
    # use the model_pruning masked_fully_connected layer.
    #
    # masked_fully_connected doesn't take a use_bias arg; pass None for the
    # bias initializer if we don't want a bias variable.
    bias_initializer = bias_initializer if use_bias else None
    with tf.variable_scope(name, default_name="dense"):
      return pruning_layers.masked_fully_connected(
          inputs=x,
          num_outputs=units,
          activation_fn=activation,
          weights_initializer=kernel_initializer,
          biases_initializer=bias_initializer)
  if initial_sparsity is not None:
    raise ValueError("initial_sparsity only supported for mp & rp")


  input_shape = x.get_shape().as_list()
  if input_shape[-1] is None:
    raise ValueError("The last dimension of the inputs to `Dense` "
                     "should be defined. Found `None`.")

  with tf.variable_scope(name, default_name="dense") as vs:
    kernel = tf.get_variable(
        "kernel",
        shape=[input_shape[-1], units],
        initializer=kernel_initializer,
        dtype=dtype,
        trainable=True)

    bias = None
    if use_bias:
      bias = tf.get_variable(
          "bias",
          shape=[units,],
          initializer=bias_initializer,
          dtype=dtype,
          trainable=True)

  # Compute the dense layer
  if sparsity_technique == "variational_dropout":
    log_sigma2_initializer = initializers.get(auxiliary_initializer)

    if not log_sigma2_initializer:
      log_sigma2_initializer = tf.constant_initializer(value=-10, dtype=dtype)

    with tf.variable_scope(vs, auxiliary_name_scope=False) as vs1:
      with tf.name_scope(vs1.original_name_scope):
        log_sigma2 = tf.get_variable(
            "log_sigma2",
            shape=[input_shape[-1], units],
            initializer=log_sigma2_initializer,
            dtype=dtype,
            trainable=True)

    variational_parameters = (kernel, log_sigma2)
    tf.add_to_collection(
        VARIATIONAL_DROPOUT_PARAMETERS,
        variational_parameters)

    input_rank = x.get_shape().ndims
    if input_rank > 2:
      if training:
        outputs = vd.nn.broadcast_matmul_train(
            x,
            variational_parameters,
            clip_alpha=clip_alpha)
      else:
        outputs = vd.nn.broadcast_matmul_eval(
            x,
            variational_parameters,
            threshold)
    else:
      if training:
        outputs = vd.nn.matmul_train(
            x,
            variational_parameters,
            clip_alpha=clip_alpha)
      else:
        outputs = vd.nn.matmul_eval(
            x,
            variational_parameters,
            threshold)
  else:
    if sparsity_technique != "l0_regularization":
      raise ValueError("Unsupported sparsity technique {}"
                       .format(sparsity_technique))
    log_alpha_initializer = initializers.get(auxiliary_initializer)

    if not log_alpha_initializer:
      # Default to \alpha / (\alpha + 1) ~= 0.9, i.e. log_alpha ~= ln(9) ~= 2.197,
      # so the gates start mostly on.
      log_alpha_initializer = tf.random_normal_initializer(
          mean=2.197, stddev=0.01, dtype=dtype)

    with tf.variable_scope(vs, auxiliary_name_scope=False) as vs1:
      with tf.name_scope(vs1.original_name_scope):
        log_alpha = tf.get_variable(
            "log_alpha",
            shape=[input_shape[-1], units],
            initializer=log_alpha_initializer,
            dtype=dtype,
            trainable=True)

    weight_parameters = (kernel, log_alpha)
    tf.add_to_collection(
        L0_REGULARIZATION_PARAMETERS,
        weight_parameters)

    input_rank = x.get_shape().ndims
    if input_rank > 2:
      if training:
        outputs = l0.nn.broadcast_matmul_train(x, weight_parameters)
      else:
        outputs = l0.nn.broadcast_matmul_eval(x, weight_parameters)
    else:
      if training:
        outputs = l0.nn.matmul_train(x, weight_parameters)
      else:
        outputs = l0.nn.matmul_eval(x, weight_parameters)

  # Handle the bias and activation
  if use_bias:
    outputs = tf.nn.bias_add(outputs, bias)
  if activation is not None:
    return activation(outputs)
  return outputs
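
A minimal usage sketch under the assumed imports above: a small TF 1.x graph with two dense layers sparsified by variational dropout. The placeholder shape, unit counts, and the clip_alpha value are illustrative choices, not values prescribed by the function.

# Hypothetical usage; shapes, unit counts, and clip_alpha are illustrative.
inputs = tf.placeholder(tf.float32, shape=[None, 784], name="inputs")

hidden = dense(
    inputs,
    units=256,
    activation=tf.nn.relu,
    sparsity_technique="variational_dropout",
    clip_alpha=8.0,   # assumed clipping bound for the log-alpha values
    training=True,
    name="hidden")

logits = dense(
    hidden,
    units=10,
    sparsity_technique="variational_dropout",
    training=True,
    name="logits")

# The (kernel, log_sigma2) pairs registered above can be retrieved to build
# the KL regularization term that is added to the training loss.
variational_params = tf.get_collection(VARIATIONAL_DROPOUT_PARAMETERS)

At evaluation time the same layers are built with training=False inside a reusing variable scope, so that weights whose log-alpha exceeds threshold are treated as zero.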