def self_attention(output, seq_len, out_size, activation_fn=tf.tanh,
                   dropout=None, is_train=False):
    '''
    :param output: shape(b_sz, tstp, dim), e.g. RNN outputs to attend over
    :param seq_len: shape(b_sz,), true (unpadded) length of each sequence
    :param out_size: scalar, width of the attention MLP and context vector
    :param activation_fn: activation for the MLP layer
    :param dropout: dropout rate applied to the attended context, or None
    :param is_train: whether dropout is active
    :return: (raw attention logits, normalized attention weights, attended context)
    '''
    with tf.variable_scope('self-attention'):
        context_vector = tf.get_variable(name='context_vector', shape=[out_size],
                                         dtype=tf.float32)
        mlp = tf.layers.dense(output, out_size, activation=activation_fn,
                              name='mlp')                          # shape(b_sz, tstp, out_size)
        attn = tf.tensordot(mlp, context_vector, axes=[[2], [0]])  # shape(b_sz, tstp)
        attn_normed1 = masked_softmax(attn, seq_len)               # shape(b_sz, tstp)
        attn_normed = tf.expand_dims(attn_normed1, axis=-1)        # shape(b_sz, tstp, 1)
        attn_ctx = tf.matmul(mlp, attn_normed,
                             transpose_a=True)                     # shape(b_sz, out_size, 1)
        attn_ctx = tf.squeeze(attn_ctx, axis=[2])                  # shape(b_sz, out_size)
        if dropout is not None:
            attn_ctx = tf.layers.dropout(attn_ctx, rate=dropout, training=is_train)
        return attn, attn_normed1, attn_ctx
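# The attention routines in this section call a masked_softmax helper that is
# not shown here. The following is a minimal sketch of what it presumably does
# (the standard length-masking trick); the actual implementation may differ.
import tensorflow as tf

def masked_softmax(logits, seq_len):
    '''logits: shape(b_sz, tstp); seq_len: shape(b_sz,) -> shape(b_sz, tstp).'''
    # Push padded positions to a large negative value so softmax assigns them
    # (numerically) zero weight, then normalize over the time dimension.
    mask = tf.sequence_mask(seq_len, maxlen=tf.shape(logits)[1], dtype=tf.float32)
    masked_logits = logits * mask + (1.0 - mask) * (-1e30)
    return tf.nn.softmax(masked_logits)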
def task_specific_attention(in_x, xLen, out_sz, activation_fn=tf.tanh,
                            dropout=None, is_train=False, scope=None):
    '''
    :param in_x: shape(b_sz, tstp, dim)
    :param xLen: shape(b_sz,)
    :param out_sz: scalar
    :param activation_fn: activation
    :param dropout: dropout rate applied to the attended context, or None
    :param is_train: whether dropout is active
    :param scope: variable scope name
    :return: attended context vector, shape(b_sz, out_sz)
    '''
    assert len(in_x.get_shape()) == 3 and in_x.get_shape()[-1].value is not None

    with tf.variable_scope(scope or 'attention') as scope:
        context_vector = tf.get_variable(name='context_vector', shape=[out_sz],
                                         dtype=tf.float32)          # q
        # h, https://www.tensorflow.org/api_docs/python/tf/layers/dense
        in_x_mlp = tf.layers.dense(in_x, out_sz, activation=activation_fn,
                                   name='mlp')                      # shape(b_sz, tstp, out_sz)
        attn = tf.tensordot(in_x_mlp, context_vector,
                            axes=[[2], [0]])                        # shape(b_sz, tstp), u = q*h
        attn_normed = masked_softmax(attn, xLen)                    # ai = exp(ui) / sum(exp(ui))
        attn_normed = tf.expand_dims(attn_normed, axis=-1)          # shape(b_sz, tstp, 1)
        attn_ctx = tf.matmul(in_x_mlp, attn_normed,
                             transpose_a=True)                      # shape(b_sz, out_sz, 1), e = ai * hi
        attn_ctx = tf.squeeze(attn_ctx, axis=[2])                   # shape(b_sz, out_sz)
        if dropout is not None:
            attn_ctx = tf.layers.dropout(attn_ctx, rate=dropout, training=is_train)
        return attn_ctx
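# Hypothetical usage sketch: attend over the timesteps of RNN outputs to get one
# vector per sequence. The names (rnn_outputs, sent_lengths) and sizes below are
# illustrative only and do not come from the repository.
import tensorflow as tf

tstp, dim = 40, 300                                             # illustrative sizes
rnn_outputs = tf.placeholder(tf.float32, [None, tstp, dim])     # shape(b_sz, tstp, dim)
sent_lengths = tf.placeholder(tf.int32, [None])                 # shape(b_sz,)
is_training = tf.placeholder_with_default(False, shape=[])

# One attended context vector per sequence; padded timesteps get zero weight.
sent_vec = task_specific_attention(rnn_outputs, sent_lengths, out_sz=dim,
                                   dropout=0.5, is_train=is_training,
                                   scope='sentence_attention')  # shape(b_sz, dim)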
def __call__(self, inputs, state, scope=None):
    """Word-by-word attention over the premise outputs, conditioned on the
    current state: next_state = sum_i(alpha_i * Y_i) + tanh(W * state).

    inputs: shape(b_sz, emb)
    state:  shape(b_sz, h_sz)
    """
    with vs.variable_scope(scope or "attention_cell"):
        tmp = _linear([inputs, state], self._num_units, bias=False,
                      scope='attn_linear')                          # shape(b_sz, h_sz)
        tmp = tf.tile(tf.expand_dims(tmp, axis=1),
                      [1, self.tstp_pre, 1])                        # shape(b_sz, tstp_pre, h_sz)
        M_t = tanh(self.Premise_Linear + tmp)                       # shape(b_sz, tstp_pre, h_sz)
        Mt_linear = tf.squeeze(
            last_dim_linear(M_t, 1, bias=False, scope='M_t_linear'),
            [2])                                                    # shape(b_sz, tstp_pre)
        Alpha_t = masked_softmax(Mt_linear, self.Premise_seqLen)    # shape(b_sz, tstp_pre)
        tmp1 = tf.reduce_sum(tf.expand_dims(Alpha_t, 2) * self.Premise_out,
                             axis=1)                                # shape(b_sz, h_sz)
        tmp2 = tanh(_linear(state, self._num_units, bias=False,
                            scope='final_linear'))                  # shape(b_sz, h_sz)
        next_state = tmp1 + tmp2
        return next_state, next_state
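# The cell above also relies on _linear and last_dim_linear helpers (plus the
# precomputed self.Premise_Linear, self.Premise_out, self.Premise_seqLen and
# self.tstp_pre attributes set up elsewhere in the class). Neither helper is
# defined in this section; the sketches below are assumptions: _linear is taken
# to concatenate its inputs and apply a single weight matrix, and
# last_dim_linear to apply a linear map over the last axis of a 3-D tensor.
import tensorflow as tf

def _linear(args, output_size, bias=True, scope=None):
    '''args: a 2-D tensor or list of 2-D tensors, shape(b_sz, *) -> shape(b_sz, output_size).'''
    if not isinstance(args, (list, tuple)):
        args = [args]
    with tf.variable_scope(scope or 'linear'):
        return tf.layers.dense(tf.concat(args, axis=1), output_size,
                               use_bias=bias, name='dense')

def last_dim_linear(inputs, output_size, bias=True, scope=None):
    '''inputs: shape(b_sz, tstp, dim) -> shape(b_sz, tstp, output_size).'''
    with tf.variable_scope(scope or 'last_dim_linear'):
        return tf.layers.dense(inputs, output_size, use_bias=bias, name='dense')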