def __init__(self, step_dim,
             W_regularizer=None, b_regularizer=None,
             W_constraint=None, b_constraint=None,
             bias=True, **kwargs):
    """
    Keras Layer that implements an Attention mechanism for temporal data.
    Supports Masking.
    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    :param step_dim: number of timesteps (the `steps` dimension of the input).
    :param kwargs: passed through to the base Layer.

    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with
    return_sequences=True. The feature dimension is inferred from the output
    shape of the RNN; the step dimension must be passed explicitly.

    Example:
        # 1
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention(step_dim))
        # next add a Dense layer (for classification/regression) or whatever...

        # 2
        hidden = LSTM(64, return_sequences=True)(words)
        sentence = Attention(step_dim)(hidden)
        # next add a Dense layer (for classification/regression) or whatever...
    """
    self.supports_masking = True
    self.init = initializers.get('glorot_uniform')

    self.W_regularizer = regularizers.get(W_regularizer)
    self.b_regularizer = regularizers.get(b_regularizer)

    self.W_constraint = constraints.get(W_constraint)
    self.b_constraint = constraints.get(b_constraint)

    self.bias = bias
    self.step_dim = step_dim
    self.features_dim = 0
    super(Attention, self).__init__(**kwargs)
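
# For reference, the feed-forward attention this layer applies (per Raffel et
# al.) written out in plain NumPy so the shapes are explicit. This is an
# illustrative sketch of the math only; `feedforward_attention` is a
# hypothetical helper, not part of the Attention layer, and W / b stand in for
# the weights the layer builds (shapes (features,) and (steps,)).
import numpy as np


def feedforward_attention(h, W, b):
    """e_t = tanh(h_t . W + b_t); a = softmax(e) over steps; out = sum_t a_t h_t."""
    # h: (samples, steps, features), W: (features,), b: (steps,)
    e = np.tanh(h @ W + b)                        # scores, shape (samples, steps)
    a = np.exp(e - e.max(axis=1, keepdims=True))  # numerically stable softmax
    a /= a.sum(axis=1, keepdims=True)             #   over the step dimension
    return (h * a[..., None]).sum(axis=1)         # weighted sum, (samples, features)
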
def dense(x,
          units,
          activation=None,
          use_bias=True,
          kernel_initializer="glorot_uniform",
          bias_initializer="zeros",
          sparsity_technique="variational_dropout",
          auxiliary_initializer=None,
          threshold=3.0,
          clip_alpha=None,
          training=True,
          dtype=tf.float32,
          name=None,
          initial_sparsity=None):
  """Matmul & bias add that supports broadcasting for batched gemm.

  Supports a constrained subset of the functionality provided by
  tf.layers.dense.

  Args:
    x: input tensor.
    units: number of units in the dense layer.
    activation: activation function to use in the layer.
    use_bias: whether or not to add a bias to the output.
    kernel_initializer: weight initializer for the layer.
    bias_initializer: weight initializer for the bias.
    sparsity_technique: sparsification technique to apply to the weights.
    auxiliary_initializer: initializer for auxiliary variables used in
      variational dropout and l0 regularization.
    threshold: log-alpha threshold for variational dropout.
    clip_alpha: whether to clip the alpha values for variational dropout.
    training: whether this run is training or evaluating the model.
    dtype: data type for the weights and computation.
    name: name for the layer.
    initial_sparsity: initial weight sparsity at the start of training.

  Returns:
    Tensor representing the output of the layer.
  """
  activation = activations.get(activation)
  kernel_initializer = initializers.get(kernel_initializer)
  bias_initializer = initializers.get(bias_initializer)

  if (sparsity_technique == "magnitude_pruning" or
      sparsity_technique == "random_pruning"):
    if initial_sparsity is not None:
      # If the initial sparsity value is passed in, use the sparse glorot
      # uniform initializer to account for the zero valued weights.
      kernel_initializer = common_init.SparseGlorotUniform(
          initial_sparsity, dtype=dtype)
      tf.logging.info(
          "Using sparse initialization with sparsity {} for variable {}"
          .format(initial_sparsity, tf.get_variable_scope().name))

    # If the sparsity technique is magnitude_pruning or random_pruning,
    # use the model_pruning masked_fully_connected layer.
    #
    # masked_fully_connected doesn't take a use_bias arg; pass None for the
    # bias initializer if we don't want a bias variable.
    bias_initializer = bias_initializer if use_bias else None
    with tf.variable_scope(name, default_name="dense"):
      return pruning_layers.masked_fully_connected(
          inputs=x,
          num_outputs=units,
          activation_fn=activation,
          weights_initializer=kernel_initializer,
          biases_initializer=bias_initializer)
  if initial_sparsity is not None:
    raise ValueError("initial_sparsity only supported for mp & rp")

  # layer_name = "%s_{}" % name if name else "{}"
  input_shape = x.get_shape().as_list()
  if input_shape[-1] is None:
    raise ValueError("The last dimension of the inputs to `Dense` "
                     "should be defined. Found `None`.")
Found `None`.") with tf.variable_scope(name, default_name="dense") as vs: kernel = tf.get_variable( "kernel", shape=[input_shape[-1], units], initializer=kernel_initializer, dtype=dtype, trainable=True) bias = None if use_bias: bias = tf.get_variable( "bias", shape=[units,], initializer=bias_initializer, dtype=dtype, trainable=True) # Compute the dense layer if sparsity_technique == "variational_dropout": log_sigma2_initializer = initializers.get(auxiliary_initializer) if not log_sigma2_initializer: log_sigma2_initializer = tf.constant_initializer(value=-10, dtype=dtype) with tf.variable_scope(vs, auxiliary_name_scope=False) as vs1: with tf.name_scope(vs1.original_name_scope): log_sigma2 = tf.get_variable( "log_sigma2", shape=[input_shape[-1], units], initializer=log_sigma2_initializer, dtype=dtype, trainable=True) variational_parameters = (kernel, log_sigma2) tf.add_to_collection( VARIATIONAL_DROPOUT_PARAMETERS, variational_parameters) input_rank = x.get_shape().ndims if input_rank > 2: if training: outputs = vd.nn.broadcast_matmul_train( x, variational_parameters, clip_alpha=clip_alpha) else: outputs = vd.nn.broadcast_matmul_eval( x, variational_parameters, threshold) else: if training: outputs = vd.nn.matmul_train( x, variational_parameters, clip_alpha=clip_alpha) else: outputs = vd.nn.matmul_eval( x, variational_parameters, threshold) else: if sparsity_technique != "l0_regularization": raise ValueError("Unsupported sparsity technique {}" .format(sparsity_technique)) log_alpha_initializer = initializers.get(auxiliary_initializer) if not log_alpha_initializer: # Default to \alpha / (\alpha + 1) equal to 0.5 # Default to \alpha / (\alpha + 1) = .1 log_alpha_initializer = tf.random_normal_initializer( mean=2.197, stddev=0.01, dtype=dtype) with tf.variable_scope(vs, auxiliary_name_scope=False) as vs1: with tf.name_scope(vs1.original_name_scope): log_alpha = tf.get_variable( "log_alpha", shape=[input_shape[-1], units], initializer=log_alpha_initializer, dtype=dtype, trainable=True) weight_parameters = (kernel, log_alpha) tf.add_to_collection( L0_REGULARIZATION_PARAMETERS, weight_parameters) input_rank = x.get_shape().ndims if input_rank > 2: if training: outputs = l0.nn.broadcast_matmul_train(x, weight_parameters) else: outputs = l0.nn.broadcast_matmul_eval(x, weight_parameters) else: if training: outputs = l0.nn.matmul_train(x, weight_parameters) else: outputs = l0.nn.matmul_eval(x, weight_parameters) # Handle the bias and activation if use_bias: outputs = tf.nn.bias_add(outputs, bias) if activation is not None: return activation(outputs) return outputs