Example #1
    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim
        #xw = K.reshape(K.dot(x[0], K.reshape(self.W, (features_dim, features_dim))), (-1, features_dim))
        #yavg=K.reshape(K.mean(K.mean(x[1], axis=1, keepdims=True),axis=0, keepdims=True), (features_dim,-1))
        xw1 = K.dot(x[0], K.reshape(self.W1, (features_dim, features_dim)))
        xw2 = K.dot(x[1], K.reshape(self.W2, (features_dim, features_dim)))
        xw1t = K.permute_dimensions(xw1, [0, 2, 1])
        xw2t = K.permute_dimensions(xw2, [0, 2, 1])
        xw11 = K.batch_dot(xw1, xw1t) / (step_dim**0.5)
        xw12 = K.batch_dot(xw1, xw2t) / (step_dim**0.5)

        s11 = self.ll * K.softmax(xw11)
        s12 = (1 - self.ll) * K.softmax(xw12)

        eij = s11 + s12
        print(eij.get_shape())
        V = x[0] * K.mean(eij, axis=2, keepdims=True)
        if self.get_alpha:
            return eij
        else:
            if self.get_sequence:
                return V
            else:
                return K.sum(V, axis=1)
Example #2
 def call(self, inputs, training=None):
     def _l2normalize(v, eps=1e-12):
         return v / (K.sum(v ** 2) ** 0.5 + eps)
     def power_iteration(W, u):
         _u = u
         _v = _l2normalize(K.dot(_u, K.transpose(W)))
         _u = _l2normalize(K.dot(_v, W))
         return _u, _v
     W_shape = self.kernel.shape.as_list()
     #Flatten the Tensor
     W_reshaped = K.reshape(self.kernel, [-1, W_shape[-1]])
     _u, _v = power_iteration(W_reshaped, self.u)
     # Calculate sigma
     sigma = K.dot(_v, W_reshaped)
     sigma = K.dot(sigma, K.transpose(_u))
     # normalize it
     W_bar = W_reshaped / sigma
     #reshape weight tensor
     if training in {0, False}:
         W_bar = K.reshape(W_bar, W_shape)
     else:
         with tf.control_dependencies([self.u.assign(_u)]):
              W_bar = K.reshape(W_bar, W_shape)  
     output = K.dot(inputs, W_bar)
     if self.use_bias:
         output = K.bias_add(output, self.bias, data_format='channels_last')
     if self.activation is not None:
         output = self.activation(output)
     return output 
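
This example (and the similar Examples #17 and #18 below) estimates the largest singular value (spectral norm) of the flattened kernel with power iteration and divides the kernel by it. A standalone sketch of that estimate, assuming TensorFlow 2 and comparing against tf.linalg.svd; the shapes and the number of iterations are illustrative assumptions, not part of the original layer:

import tensorflow as tf

W = tf.random.normal((64, 32))                       # flattened kernel, shape (rows, cols)
u = tf.math.l2_normalize(tf.random.normal((1, 32)))
for _ in range(5):                                   # a few rounds give a tighter estimate than one
    v = tf.math.l2_normalize(tf.matmul(u, W, transpose_b=True))   # (1, rows)
    u = tf.math.l2_normalize(tf.matmul(v, W))                     # (1, cols)
sigma = tf.matmul(tf.matmul(v, W), u, transpose_b=True)           # approx. largest singular value
print(float(tf.squeeze(sigma)),
      float(tf.reduce_max(tf.linalg.svd(W, compute_uv=False))))   # the two values should be close
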
Example #3
    def call(self, x, mask=None):
        '''
        shape = (batch_size, new_time_step, filters)
        x_cont = Tensor("layer_dropout_5/cond/Identity:0", shape=(None, None, 128), dtype=float32)
        x_ques = Tensor("layer_dropout_11/cond/Identity:0", shape=(None, None, 128), dtype=float32)
        c_mask = Tensor("batch_slice_4/Slice:0", shape=(None, None), dtype=bool)
        q_mask = Tensor("batch_slice_5/Slice:0", shape=(None, None), dtype=bool)
        '''
        x_cont, x_ques, c_mask, q_mask = x
        # get similarity matrix S
        # shape change of K.dot(x_cont, self.W0): [batch_size, time_step, dim] * [dim, 1] = [batch_size, time_step, 1]
        subres0 = K.tile(K.dot(x_cont, self.W0), [1, 1, self.q_maxlen])
        subres1 = K.tile(
            K.permute_dimensions(K.dot(x_ques, self.W1), pattern=(0, 2, 1)),
            [1, self.c_maxlen, 1])
        subres2 = K.batch_dot(x_cont * self.W2,
                              K.permute_dimensions(x_ques, pattern=(0, 2, 1)))
        S = subres0 + subres1 + subres2
        S += self.bias
        q_mask = tf.expand_dims(q_mask, 1)
        # softmax is applied over the last dimension by default, i.e. axis=-1
        S_ = tf.nn.softmax(self.mask_logits(S, q_mask))
        c_mask = tf.expand_dims(c_mask, 2)
        S_T = K.permute_dimensions(
            tf.nn.softmax(self.mask_logits(S, c_mask), axis=1), (0, 2, 1))
        c2q = tf.matmul(S_, x_ques)
        q2c = tf.matmul(tf.matmul(S_, S_T), x_cont)
        result = K.concatenate([x_cont, c2q, x_cont * c2q, x_cont * q2c],
                               axis=-1)

        return result
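
Example #3 calls self.mask_logits, which is not shown. A minimal sketch of the usual QANet-style masking helper it presumably corresponds to (the name, signature and mask_value here are assumptions):

import tensorflow as tf

def mask_logits(inputs, mask, mask_value=-1e30):
    # keep scores where mask is True, push masked positions towards -inf before softmax
    mask = tf.cast(mask, tf.float32)
    return inputs * mask + mask_value * (1.0 - mask)
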
Example #4
        def energy_step(inputs, states):

            assert_msg = "States must be a list. However states {} is of type {}".format(
                states, type(states))

            assert isinstance(states, list) or isinstance(states,
                                                          tuple), assert_msg

            en_seq_len, en_hidden = encoder_out_seq.shape[
                1], encoder_out_seq.shape[2]
            de_hidden = inputs.shape[-1]

            reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden))
            W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a),
                                  (-1, en_seq_len, en_hidden))
            if verbose:
                print('wa.s > ', W_a_dot_s.shape)

            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a),
                                      1)  # (batch_size, 1, latent_dim)
            if verbose:
                print('Ua.h > ', U_a_dot_h.shape)

            reshaped_Ws_plus_Uh = K.tanh(
                K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))
            if verbose:
                print('Ws+Uh > ', reshaped_Ws_plus_Uh.shape)

            e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a),
                            (-1, en_seq_len))
            e_i = K.softmax(e_i)
            if verbose:
                print('ei > ', e_i.shape)

            return e_i, [e_i]
Example #5
 def call(self, inputs, **kwargs):
     if not (isinstance(inputs, list) and len(inputs) == 2):
         raise ValueError(
             'You can call this layer only with a list of two tensors '
             '(for keys/values and queries)')
     key_values_input, query_input = inputs
     _, value_seq_len, d_model = K.int_shape(key_values_input)
     query_seq_len = K.int_shape(inputs[1])[-2]
     # The first thing we need to do is to perform affine transformations
     # of the inputs to get the Queries, the Keys and the Values.
     kv = K.dot(K.reshape(key_values_input, [-1, d_model]), self.kv_weights)
     # splitting the keys, the values and the queries before further
     # processing
     pre_k, pre_v = [
         K.reshape(
             # K.slice(kv, (0, i * d_model), (-1, d_model)),
             kv[:, i * d_model: (i + 1) * d_model],
             (-1, value_seq_len,
              self.num_heads, d_model // self.num_heads))
         for i in range(2)]
     pre_q = K.reshape(
         K.dot(K.reshape(query_input, [-1, d_model]), self.q_weights),
         (-1, query_seq_len, self.num_heads, d_model // self.num_heads))
     return self.attention(pre_q, pre_v, pre_k, query_seq_len, d_model,
                           training=kwargs.get('training'))
Example #6
    def _compute_carry_and_output(self, x, h_tm1, c_tm1):
        """Computes carry and output using split kernels."""
        # x, h_tm1, c_tm1 : complex64
        # c_tm1 = c_ops.check_nan(c_tm1, 'carry c_tm1')
        x_i, x_f, x_c, x_o = x
        h_tm1_i, h_tm1_f, h_tm1_c, h_tm1_o = h_tm1
        recurrent_kernel_complex = c_ops.tf_to_complex(
            self.recurrent_kernel_real, self.recurrent_kernel_imag)
        i = x_i + K.dot(h_tm1_i, recurrent_kernel_complex[:, :self.units])
        i = self.activate_complex_real_imag_independently(
            self.recurrent_activation, i)
        # i = c_ops.check_nan(i, 'carry i')
        i *= tf.complex(0.5, 0.0)

        f = self.activate_complex_real_imag_independently(
            self.recurrent_activation, x_f +
            K.dot(h_tm1_f,
                  recurrent_kernel_complex[:, self.units:self.units * 2]))
        # f = c_ops.check_nan(f, 'carry f')
        f *= tf.complex(0.5, 0.0)

        c = f * c_tm1 + i * self.activate_complex_real_imag_independently(
            self.activation, x_c +
            K.dot(h_tm1_c,
                  recurrent_kernel_complex[:, self.units * 2:self.units * 3]))
        # c = c_ops.check_nan(c, 'carry c')

        o = self.activate_complex_real_imag_independently(
            self.recurrent_activation,
            x_o + K.dot(h_tm1_o, recurrent_kernel_complex[:, self.units * 3:]))
        # o = c_ops.check_nan(o, 'carry o')

        return c, o  # complex
Example #7
    def call(self, x):
        eij1 = K.reshape(
            K.dot(K.reshape(x[:, :, 0:768], (-1, self.features_dim)), K.reshape(self.W, (self.features_dim, 1))),
            (-1, self.step_dim))
        eij1 += self.b
        eij1 = K.expand_dims(eij1)

        eij2 = K.reshape(
            K.dot(K.reshape(x[:, :, 768:768*2], (-1, self.features_dim)), K.reshape(self.W, (self.features_dim, 1))),
            (-1, self.step_dim))
        eij2 += self.b
        eij2 = K.expand_dims(eij2)

        eij3 = K.reshape(
            K.dot(K.reshape(x[:, :, 768*2:768*3], (-1, self.features_dim)), K.reshape(self.W, (self.features_dim, 1))),
            (-1, self.step_dim))
        eij3 += self.b
        eij3 = K.expand_dims(eij3)


        eij = keras.layers.concatenate([eij1, eij2, eij3], axis=2)
        print(eij)
        eij = K.tanh(eij)
        a = K.exp(eij)
        a /= K.cast(K.sum(a, axis=2, keepdims=True) + K.epsilon(), K.floatx())
        print(a)
        temp = a[:,:,0:1] * x[:, :, 0:768] + a[:,:,1:2] * x[:, :, 768:768*2] + a[:,:,2:3] * x[:, :, 768*2:768*3]
        print(temp)

        return temp
Example #8
 def call(self, inputs, **kwargs):
     main_input, embedding_matrix = inputs
     input_shape_tensor = K.shape(main_input)
     last_input_dim = K.int_shape(main_input)[-1]
     emb_input_dim, emb_output_dim = K.int_shape(embedding_matrix)
     projected = K.dot(K.reshape(main_input, (-1, last_input_dim)),
                       self.embedding_weights['projection'])
     if self.add_biases:
         projected = K.bias_add(projected,
                                self.embedding_weights['biases'],
                                data_format='channels_last')
     if 0 < self.projection_dropout < 1:
         projected = K.in_train_phase(
             lambda: K.dropout(projected, self.projection_dropout),
             projected,
             training=kwargs.get('training'))
     attention = K.dot(projected, K.transpose(embedding_matrix))
     if self.scaled_attention:
         # scaled dot-product attention, described in
         # "Attention is all you need" (https://arxiv.org/abs/1706.03762)
         sqrt_d = K.constant(math.sqrt(emb_output_dim), dtype=K.floatx())
         attention = attention / sqrt_d
     result = K.reshape(
         self.activation(attention),
         (input_shape_tensor[0], input_shape_tensor[1], emb_input_dim))
     return result
Example #9
    def call(self, inputs, states, training=None):
        vh = states[0]

        dp_mask = self.get_dropout_mask_for_cell(inputs, training, count=2)
        rec_dp_mask = self.get_recurrent_dropout_mask_for_cell(vh,
                                                               training,
                                                               count=2)

        if 0. < self.dropout < 1.:
            input1 = inputs * dp_mask[0]
            input2 = inputs * dp_mask[1]
        else:
            input1 = inputs
            input2 = inputs

        p11 = K.dot(input1, self.kernel[:, :self.units])
        p21 = K.dot(input2, self.kernel[:, self.units:])
        if self.use_bias:
            p11 = K.bias_add(p11, self.bias[:self.units])
            p21 = K.bias_add(p21, self.bias[self.units:])
        if 0. < self.recurrent_dropout < 1.:
            vh1 = vh * rec_dp_mask[0]
            vh2 = vh * rec_dp_mask[1]
        else:
            vh1 = vh
            vh2 = vh

        v1 = self.recurrent_activation(
            p11 + K.dot(vh1, self.recurrent_kernel[:, :self.units]))
        v2 = self.activation(p21 +
                             K.dot(vh2 *
                                   v1, self.recurrent_kernel[:, self.units:]))
        vh = (1 - v1) * vh + v1 * v2
        return vh, [vh]
Example #10
    def call(self, inputs, states, training=None):
        # get the standard hidden state from super
        output = super(STTAUCell, self).call(inputs, states)
        h_before = output[0]
        c = output[1][1]

        # the following part modifies the hidden state to create STTAU
        # sizes: B = batch size, H = hidden dimension size,
        # C = number of centroids
        # BxC = BxH & HxC
        unnormalized_probs = K.dot(h_before, self.centroid_kernel)

        # Gumbel-Softmax sample with (learnt) temperature & unnormalized_probs
        q_y = tfp.distributions.RelaxedOneHotCategorical(
            self.temperature_weight, unnormalized_probs)

        # BxC
        y = q_y.sample()
        if self.hard_sample is True:
            # y_hard is a one-hot vector with BxC
            y_hard = tf.cast(tf.one_hot(tf.argmax(y, -1), self.centroids),
                             y.dtype)
            y = tf.stop_gradient(y_hard - y) + y

        # BxH = BxC & CxH
        h_after = K.dot(y, K.transpose(self.centroid_kernel))
        # end of STTAU modification

        if 0 < self.dropout + self.recurrent_dropout:
            if training is None:
                h_after._uses_learning_phase = True
        return h_before, [h_after, c]
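
The line y = tf.stop_gradient(y_hard - y) + y in the example above is the straight-through estimator: the forward pass uses the hard one-hot sample while gradients flow through the relaxed (soft) sample. In isolation (a sketch with made-up logits and temperature, not the original layer):

import tensorflow as tf

logits = tf.random.normal((4, 10))
y_soft = tf.nn.softmax(logits / 0.5)                          # relaxed sample at temperature 0.5
y_hard = tf.one_hot(tf.argmax(y_soft, axis=-1), 10, dtype=y_soft.dtype)
y = tf.stop_gradient(y_hard - y_soft) + y_soft                # equals y_hard forward, gradient of y_soft backward
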
Example #11
    def call(self, x):
        print(x)
        features_dim = x.shape[-1].value
        step_dim = x.shape[-2].value
        print(K.reshape(self.kernel, (-1, features_dim)))  # n, d
        print(K.reshape(self.W, (features_dim, 1)))  # w= dx1
        print(K.dot(K.reshape(self.kernel, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))))  # nx1

        eij = K.reshape(K.dot(K.reshape(self.kernel, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))),
                        (-1, step_dim))  # batch,step
        print(eij)

        eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)


        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = tf.transpose(a,(1,0))
        print(a)

        print("x:")
        print(self.kernel)
        weighted_input = self.kernel * a  # broadcasting multiplies element-wise, N T K
        print(weighted_input.shape)
        temp = K.sum(weighted_input, axis=0)  # N K, sum of the weighted rows
        temp = K.tile(K.expand_dims(temp, 0), [step_dim, 1])
        temp = keras.layers.concatenate([self.kernel, temp])
        temp = K.dot(temp, self.W2) + self.b2
        return x + temp
Example #12
        def energy_step(inputs, states):
            """ Step function for computing energy for a single decoder state """

            # input: (batch_size, latent_dim)
            assert_msg = "States must be a list. However states {} is of type {}".format(
                states, type(states))
            assert isinstance(states, list) or isinstance(states,
                                                          tuple), assert_msg
            """ Computing sj.Ua """
            # (batch_size, 1, d3)
            U_a_dot_s = K.expand_dims(K.dot(inputs, self.U_a), 1)
            if verbose:
                print('Ua.h>', K.int_shape(U_a_dot_s))
            """ tanh(h.Wa + s.Ua) """
            # (batch_size, h1*h2*...*hn, d3) = (batch_size, h1*h2*...*hn, d3) + (batch_size, 1, d3)
            Wh_plus_Us = K.tanh(W_hi + U_a_dot_s)
            # (batch_size, d3, h1*h2*...*hn)
            Wh_plus_Us = K.permute_dimensions(Wh_plus_Us, (0, 2, 1))
            if verbose:
                print('Wh+Us>', K.int_shape(Wh_plus_Us))
            """ softmax(va.tanh(S.Wa + hj.Ua)) """
            # (1, batch_size, h1*h2*...*hn) = (1, d3) . (batch_size, d3, h1*h2*...*hn)
            Wh_plus_Us_dot_Va = K.dot(self.V_a, Wh_plus_Us)
            # (batch_size, h1*h2*...*hn)
            e_i = K.squeeze(Wh_plus_Us_dot_Va, 0)
            e_i = K.softmax(e_i)

            if verbose:
                print('ei>', K.int_shape(e_i))

            # (batch_size, h1*h2*...*hn)
            return e_i, states
Example #13
        def energy_step(inputs, states):
            """ Step function for computing energy for a single decoder state
            inputs: (batchsize * 1 * de_in_dim)
            states: (batchsize * 1 * de_latent_dim)
            """
            """ Some parameters required for shaping tensors"""
            en_seq_len, en_hidden = encoder_out_seq.shape[
                1], encoder_out_seq.shape[2]
            de_hidden = inputs.shape[-1]
            """ Computing S.Wa where S=[s0, s1, ..., si]"""
            # <= batch size * en_seq_len * latent_dim
            W_a_dot_s = K.dot(encoder_out_seq, self.W_a)
            """ Computing hj.Ua """
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a),
                                      1)  # <= batch_size, 1, latent_dim
            """ tanh(S.Wa + hj.Ua) """
            # <= batch_size*en_seq_len, latent_dim
            Ws_plus_Uh = K.tanh(W_a_dot_s + U_a_dot_h)
            """ softmax(va.tanh(S.Wa + hj.Ua)) """
            # <= batch_size, en_seq_len
            e_i = K.squeeze(K.dot(Ws_plus_Uh, self.V_a), axis=-1)
            # <= batch_size, en_seq_len
            e_i = K.softmax(e_i)

            return e_i, [e_i]
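
Examples #4, #12, #13 (and #14 below) all compute the same additive, Bahdanau-style score, e_i = softmax(V_a^T tanh(S.W_a + h_j.U_a)). A standalone sketch with explicit shapes; all dimensions here are illustrative assumptions:

import tensorflow as tf

batch, en_seq_len, latent = 2, 5, 8
S   = tf.random.normal((batch, en_seq_len, latent))     # encoder output sequence
h_j = tf.random.normal((batch, latent))                 # one decoder state
W_a = tf.random.normal((latent, latent))
U_a = tf.random.normal((latent, latent))
V_a = tf.random.normal((latent, 1))

W_a_dot_s = tf.tensordot(S, W_a, axes=[[2], [0]])       # (batch, en_seq_len, latent)
U_a_dot_h = tf.expand_dims(tf.matmul(h_j, U_a), 1)      # (batch, 1, latent)
scores = tf.squeeze(tf.tensordot(tf.tanh(W_a_dot_s + U_a_dot_h), V_a,
                                 axes=[[2], [0]]), -1)  # (batch, en_seq_len)
e_i = tf.nn.softmax(scores, axis=-1)
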
Example #14
        def energy_step(decode_outs, states):  # decode_outs(batch,dim)

            # decoder_seq [N,30,512], where 30 is the sequence (string) length
            en_seq_len, en_hidden = encoder_out_seq.shape[
                1], encoder_out_seq.shape[2]  # 30, 512
            de_hidden = decode_outs.shape[-1]
            #  W * h_j
            reshaped_enc_outputs = K.reshape(
                encoder_out_seq, (-1, en_hidden))  #[b,64,512]=> [b*64,512]

            # W_a[512x512],reshaped_enc_outputs[b*64,512] => [b*64,512] => [b,64,512]
            W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a),
                                  (-1, en_seq_len, en_hidden))

            # U * S_t - 1,decode_outs[b,512],U_a[512,512] => [b,512]    => [b,1,512]
            U_a_dot_h = K.expand_dims(K.dot(decode_outs, self.U_a),
                                      axis=1)  # <= batch_size, 1, latent_dim

            # Subtle detail: this broadcasts the decoder output across the time dimension (64 copies) and adds it to the encoder outputs [64, 512]

            # tanh ( W * h_j + U * S_t-1 + b ),[b,64,512] = [b*64,512]
            reshaped_Ws_plus_Uh = K.tanh(
                K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))

            # V * tanh ( W * h_j + U * S_t-1 + b ), [b*64,512]*[512,1] => [b*64,1] => [b,64]
            e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a),
                            (-1, en_seq_len))

            e_i = K.softmax(e_i)

            return e_i, [e_i]
Example #15
    def call(self, inputs, prev_projection, states, training=None):
        prev_output = states[0]

        dp_mask = self.get_dropout_mask_for_cell(inputs, training)
        rec_dp_mask = self.get_recurrent_dropout_mask_for_cell(
            prev_output, training)

        if dp_mask is not None:
            inputs = inputs * dp_mask
        output = K.dot(inputs, self.kernel)

        if self.use_recurrent:
            if rec_dp_mask is not None:
                prev_output = prev_output * rec_dp_mask
            output += K.dot(prev_output, self.recurrent_kernel)

        if self.use_feedback:
            if self.projection_activation is not None:
                prev_projection = self.projection_activation(prev_projection)
            output += K.dot(prev_projection, self.feedback_kernel)

        if self.bias is not None:
            output = K.bias_add(output, self.bias)

        if self.activation is not None:
            output = self.activation(output)

        projection = K.dot(output, self.projection_kernel)

        if self.projection_bias is not None:
            projection = K.bias_add(projection, self.projection_bias)

        return output, projection, [output]
Example #16
    def generator(self, src_enc):
        G_h = K.bias_add(K.dot(src_enc, self.G_w1), self.G_b1)
        G_h_relu = tf.nn.relu(G_h)
        G_log_prob = K.bias_add(K.dot(G_h_relu, self.G_w2), self.G_b2)
        G_prob = tf.nn.sigmoid(G_log_prob)

        return G_prob
Example #17
 def call(self, inputs, training=None):
     if K.dtype(inputs) != 'int32':
         inputs = K.cast(inputs, 'int32')
         
     def _l2normalize(v, eps=1e-12):
         return v / (K.sum(v ** 2) ** 0.5 + eps)
     def power_iteration(W, u):
          # According to the paper, we only need to do one power iteration.
         _u = u
         _v = _l2normalize(K.dot(_u, K.transpose(W)))
         _u = _l2normalize(K.dot(_v, W))
         return _u, _v
     W_shape = self.embeddings.shape.as_list()
     #Flatten the Tensor
     W_reshaped = K.reshape(self.embeddings, [-1, W_shape[-1]])
     _u, _v = power_iteration(W_reshaped, self.u)
     # Calculate sigma
     sigma = K.dot(_v, W_reshaped)
     sigma = K.dot(sigma, K.transpose(_u))
     # normalize it
     W_bar = W_reshaped / sigma
     #reshape weight tensor
     if training in {0, False}:
         W_bar = K.reshape(W_bar, W_shape)
     else:
         with tf.control_dependencies([self.u.assign(_u)]):
             W_bar = K.reshape(W_bar, W_shape)
     self.embeddings = W_bar
         
     out = K.gather(self.embeddings, inputs)
     return out 
Example #18
    def call(self, inputs, training=None):
        def _l2normalize(v, eps=1e-12):
            return v / (K.sum(v**2)**0.5 + eps)

        def power_iteration(W, u):
            _u = u
            _v = _l2normalize(K.dot(_u, K.transpose(W)))
            _u = _l2normalize(K.dot(_v, W))
            return _u, _v

        if self.spectral_normalization:
            W_shape = self.kernel.shape.as_list()
            # Flatten the Tensor
            W_reshaped = K.reshape(self.kernel, [-1, W_shape[-1]])
            _u, _v = power_iteration(W_reshaped, self.u)
            # Calculate Sigma
            sigma = K.dot(_v, W_reshaped)
            sigma = K.dot(sigma, K.transpose(_u))
            # normalize it
            W_bar = W_reshaped / sigma
            # reshape weight tensor
            if training in {0, False}:
                W_bar = K.reshape(W_bar, W_shape)
            else:
                with tf.control_dependencies([self.u.assign(_u)]):
                    W_bar = K.reshape(W_bar, W_shape)

            # update weight
            self.kernel = W_bar

        if self.rank == 1:
            outputs = K.conv1d(inputs,
                               self.kernel,
                               strides=self.strides[0],
                               padding=self.padding,
                               data_format=self.data_format,
                               dilation_rate=self.dilation_rate[0])
        if self.rank == 2:
            outputs = K.conv2d(inputs,
                               self.kernel,
                               strides=self.strides,
                               padding=self.padding,
                               data_format=self.data_format,
                               dilation_rate=self.dilation_rate)
        if self.rank == 3:
            outputs = K.conv3d(inputs,
                               self.kernel,
                               strides=self.strides,
                               padding=self.padding,
                               data_format=self.data_format,
                               dilation_rate=self.dilation_rate)

        if self.use_bias:
            outputs = K.bias_add(outputs,
                                 self.bias,
                                 data_format=self.data_format)

        if self.activation is not None:
            return self.activation(outputs)
        return outputs
Example #19
 def power_iteration(self, u, W):
     '''
     According to the paper, we only need to do one power iteration.
     '''
     v = self._l2normalize(K.dot(u, K.transpose(W)))
     u = self._l2normalize(K.dot(v, W))
     return u, v
Example #20
    def call(self, inputs):
        """
            Args:
                (query, context) ->
                      query: a Tensor with shape [batch_size, query_length, channels]
                      context: a Tensor with shape [batch_size, context_length, channels]

            Returns:
                similarity: a Tensor with shape [batch_size, context_length, query_length]
        """
        query, context = inputs
        if self.dropout:
            query = self.dropout(query)
            context = self.dropout(context)

        # context_weighted -> Tensor with shape [batch_size, context_length, 1]
        context_weighted = K.dot(context, self.context_weights)

        # query_weighted -> Tensor with shape [batch_size, 1, query_length]
        query_weighted = tf.transpose(
            K.dot(query, self.query_weights), (0, 2, 1))

        # weighted_context_query -> Tensor with shape [batch_size, context_length, query_length]
        weighted_context_query = tf.matmul(
            K.dot(context, self.dot_weights), query, transpose_b=True)

        similarity = weighted_context_query + context_weighted + query_weighted
        return similarity
Example #21
    def call(self, inputs,**kwargs):

        if K.ndim(inputs[0]) != 3:
            raise ValueError("Unexpected inputs dimensions %d, expect to be 3 dimensions" % (K.ndim(inputs)))

        embeds_vec_list = inputs
        row = []
        col = []
        num_inputs = len(embeds_vec_list)
        for i in range(num_inputs - 1):
            for j in range(i + 1, num_inputs):
                row.append(i)
                col.append(j)
        p = concatenate([embeds_vec_list[idx] for idx in row],axis=1)# batch num_pairs k
        q = concatenate([embeds_vec_list[idx] for idx in col],axis=1)  # Reshape([num_pairs, self.embedding_size])
        inner_product = p * q

        bi_interaction = inner_product

        attention_temp = Dense(self.attention_factor, 'relu',
                               kernel_regularizer=l2(self.l2_reg_w))(bi_interaction)
        attention_weight = softmax(K.dot(attention_temp, self.projection_h), axis=1)

        attention_output = K.sum(attention_weight * bi_interaction, axis=1)
        attention_output = tf.nn.dropout(attention_output, self.keep_prob, seed=1024)
        # Dropout(1 - self.keep_prob)(attention_output)
        afm_out = K.dot(attention_output, self.projection_p)

        return afm_out
Example #22
def step_gru(cell_inputs, cell_state, kernel, recurrent_kernel, input_bias,
             recurrent_bias):
    """Step function that will be used by Keras RNN backend."""
    h_tm1 = cell_state

    # inputs projected by all gate matrices at once
    matrix_x = K.dot(cell_inputs, kernel)
    matrix_x = K.bias_add(matrix_x, input_bias)

    x_z, x_r, x_h = array_ops.split(matrix_x, 3, axis=1)

    # hidden state projected by all gate matrices at once
    matrix_inner = K.dot(h_tm1, recurrent_kernel)
    matrix_inner = K.bias_add(matrix_inner, recurrent_bias)

    recurrent_z, recurrent_r, recurrent_h = array_ops.split(matrix_inner,
                                                            3,
                                                            axis=1)
    z = nn.sigmoid(x_z + recurrent_z)
    r = nn.sigmoid(x_r + recurrent_r)
    hh = nn.tanh(x_h + r * recurrent_h)

    # previous and candidate state mixed by update gate
    h = z * h_tm1 + (1 - z) * hh
    return h, [h]
Example #23
 def call(self, inputs, states):
     prev_output = states[0]
     h = K.dot(inputs, self.kernel)
     output = h + K.dot(prev_output, self.recurrent_kernel)
     activation = activations.get(self.activation)
     output = activation(output)
     return output, [output]
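
A cell like the one above is normally wrapped in keras.layers.RNN to unroll it over time. A self-contained sketch of that pattern; the class name, build() and state_size shown here are assumptions filled in around the call() from the example:

import tensorflow as tf
from tensorflow.keras import activations, backend as K

class MinimalRNNCell(tf.keras.layers.Layer):
    def __init__(self, units, activation='tanh', **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.state_size = units
        self.activation = activation

    def build(self, input_shape):
        self.kernel = self.add_weight(name='kernel',
                                      shape=(input_shape[-1], self.units))
        self.recurrent_kernel = self.add_weight(name='recurrent_kernel',
                                                shape=(self.units, self.units))

    def call(self, inputs, states):
        prev_output = states[0]
        h = K.dot(inputs, self.kernel)
        output = h + K.dot(prev_output, self.recurrent_kernel)
        output = activations.get(self.activation)(output)
        return output, [output]

layer = tf.keras.layers.RNN(MinimalRNNCell(32))
y = layer(tf.random.normal((8, 10, 16)))   # (batch, time, features) -> (batch, 32)
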
Example #24
    def call(self, inputs, mask=None):
        # output = softmax(score)
        k, q = inputs
        if len(q.shape) == 2:
            q = K.expand_dims(q, axis=1)
        # k: (?, K_LEN, EMBED_DIM,)
        # q: (?, Q_LEN, EMBED_DIM,)
        # score: (?, Q_LEN, K_LEN,)
        if self.score_function == 'scaled_dot_product':
            kt = K.permute_dimensions(k, (0, 2, 1))
            qkt = K.batch_dot(q, kt)
            score = qkt / self.EMBED_DIM
        elif self.score_function == 'mlp':
            kq = K.concatenate([k, q], axis=1)
            kqw2 = K.tanh(K.dot(kq, self.W2))
            score = K.permute_dimensions(K.dot(self.W1, kqw2), (1, 0, 2))
        elif self.score_function == 'bi_linear':
            qw = K.dot(q, self.W)
            kt = K.permute_dimensions(k, (0, 2, 1))
            score = K.batch_dot(qw, kt)
        else:
            raise RuntimeError('invalid score_function')
        score = K.softmax(score)
        # if mask is not None:
        #     score *= K.cast(mask[0], K.floatx())
        # output: (?, Q_LEN, EMBED_DIM,)
        output = K.batch_dot(score, k)

        return output
Example #25
    def call(self, inputs):
        X = inputs[0]  # Node features (N x F)
        A = inputs[1]  # Adjacency matrix (N x N)

        outputs = []
        for head in range(self.attn_heads):
            kernel = self.kernels[head]  # W in the paper (F x F')
            attention_kernel = self.attn_kernels[
                head]  # Attention kernel a in the paper (2F' x 1)

            # Compute inputs to attention network
            features = K.dot(X, kernel)  # (N x F')

            # Compute feature combinations
            # Note: [[a_1], [a_2]]^T [[Wh_i], [Wh_2]] = [a_1]^T [Wh_i] + [a_2]^T [Wh_j]
            attn_for_self = K.dot(
                features, attention_kernel[0])  # (N x 1), [a_1]^T [Wh_i]
            attn_for_neighs = K.dot(
                features, attention_kernel[1])  # (N x 1), [a_2]^T [Wh_j]

            # Attention head a(Wh_i, Wh_j) = a^T [[Wh_i], [Wh_j]]
            dense = attn_for_self + K.transpose(
                attn_for_neighs)  # (N x N) via broadcasting

            # Add nonlinearty
            dense = LeakyReLU(alpha=0.2)(dense)

            # Mask values before activation (Vaswani et al., 2017)
            mask = -10e9 * (1.0 - A)
            dense += mask

            # Apply softmax to get attention coefficients
            dense = K.softmax(dense)  # (N x N)

            # Apply dropout to features and attention coefficients
            dropout_attn = Dropout(self.dropout_rate)(dense)  # (N x N)
            dropout_feat = Dropout(self.dropout_rate)(features)  # (N x F')

            # Linear combination with neighbors' features
            node_features = K.dot(dropout_attn, dropout_feat)  # (N x F')

            if self.use_bias:
                node_features = K.bias_add(node_features, self.biases[head])

            if self.attn_heads_reduction == "concat":
                # If "concat", compute the activation here (Eq. 5)
                node_features = self.activation(node_features)

            # Add output of attention head to final output
            outputs.append(node_features)

        # Aggregate the heads' output according to the reduction method
        if self.attn_heads_reduction == "concat":
            output = K.concatenate(outputs)  # (N x KF')
        else:
            output = K.mean(K.stack(outputs), axis=0)  # (N x F')

        output = self.activation(output)
        return output
Example #26
    def call(self, inputs):
        X = inputs[0]  # Node features (B x N x F)
        A = inputs[1]  # Adjacency matrix (B x N x N)

        X_dims = X.get_shape().as_list()
        B, N, F = X_dims

        outputs = []
        attentions = []
        for head in range(self.attn_heads):
            # W in the paper (F x F')
            kernel = self.kernels[head]

            # Compute inputs to attention network
            features = K.dot(X, kernel)  # (B x N x F')
            dropout_feat = Dropout(self.dropout_rate)(features)  # (B x N x F')

            neighbor_kernel = self.neighbor_kernels[head]
            attn_kernel = self.attn_kernels[head]

            neighbor_features = K.dot(X, neighbor_kernel)
            dropout_neighbor = Dropout(self.dropout_rate)(neighbor_features)

            merged = tf.matmul(K.dot(dropout_feat, attn_kernel),
                               tf.transpose(dropout_neighbor, (0, 2, 1)))

            attention = tf.nn.tanh(merged)
            attention = K.reshape(attention, (-1, N, N))

            mask = -10e9 * (1.0 - A)
            attention += mask

            attention = tf.nn.softmax(attention)
            dropout_attn = Dropout(self.dropout_rate)(attention)

            node_features = tf.matmul(dropout_attn, dropout_feat)

            if self.use_bias:
                node_features = K.bias_add(node_features, self.biases[head])

            if self.return_attention:
                attentions.append(attention)
            # Add output of attention head to final output
            outputs.append(node_features)

        # Aggregate the heads' output according to the reduction method
        if self.attn_heads_reduction == "concat":
            output = K.concatenate(outputs, axis=-1)  # (B x N x KF')
        else:
            output = K.mean(K.stack(outputs), axis=0)  # (B x N x F')
            # If "average", compute the activation here (Eq. 6)

        output = self.activation(output)

        if self.return_attention:
            attentions = K.stack(attentions, axis=1)
            return (output, attentions)
        else:
            return output
Example #27
    def call(self, inputs, mask=None, training=None):
        inputs, relatives, memories, bias_context, bias_relative = inputs
        full = K.concatenate([memories, inputs], axis=1)      # (batch, prev_len + seq_len, units)
        w_q = K.dot(inputs, self.kernel_q)                    # (batch, seq_len, units)
        w_kv = K.dot(full, self.kernel_kv)                    # (batch, prev_len + seq_len, units * 2)
        w_r = K.dot(relatives, self.kernel_r)                 # (batch, prev_len + seq_len, units)
        if self.use_bias:
            w_q = K.bias_add(w_q, self.bias_q)
            w_kv = K.bias_add(w_kv, self.bias_kv)
            w_r = K.bias_add(w_r, self.bias_r)
        if self.activation is not None:
            w_q = self.activation(w_q)
            w_kv = self.activation(w_kv)
            w_r = self.activation(w_r)

        w_k = w_kv[:, :, :self.units]                         # (batch, prev_len + seq_len, units)
        w_v = w_kv[:, :, self.units:]                         # (batch, prev_len + seq_len, units)

        w_qc = K.bias_add(w_q, bias_context)
        w_qc = self._reshape_to_batches(w_qc)                 # (batch * n_head, seq_len, units_head)
        w_k = self._reshape_to_batches(w_k)                   # (batch * n_head, prev_len + seq_len, units_head)
        a_context = K.batch_dot(w_qc, w_k, axes=2)            # (batch * n_head, seq_len, prev_len + seq_len)

        w_qr = K.bias_add(w_q, bias_relative)
        w_qr = self._reshape_to_batches(w_qr)                 # (batch * n_head, seq_len, units_head)
        w_r = self._reshape_to_batches(w_r)                   # (batch * n_head, prev_len + seq_len, units_head)
        a_relative = K.batch_dot(w_qr, w_r, axes=2)           # (batch * n_head, seq_len, prev_len + seq_len)
        a_relative = self._relative_shift(a_relative)         # (batch * n_head, seq_len, prev_len + seq_len)

        att = (a_context + a_relative) / K.sqrt(K.constant(self.units_head, dtype=K.floatx()))
        exp = K.exp(att - K.max(att, axis=-1, keepdims=True))

        q_len, k_len = K.shape(w_q)[1], K.shape(w_k)[1]
        indices = K.expand_dims(K.arange(0, k_len), axis=0)
        upper = K.expand_dims(K.arange(k_len - q_len, k_len), axis=-1)
        exp *= K.expand_dims(K.cast(indices <= upper, K.floatx()), axis=0)
        if mask is not None and mask[0] is not None:
            mask = K.cast(mask[0], K.floatx())
            mask = K.concatenate([K.ones_like(memories[:, :, 0]), mask], axis=1)
            exp *= K.expand_dims(self._reshape_mask(mask), axis=1)

        att = exp / K.sum(exp, axis=-1, keepdims=True)
        if self.att_drop_layer is not None:
            att = self.att_drop_layer(att, training=training)
        w_v = self._reshape_to_batches(w_v)                   # (batch * n_head, prev_len + seq_len, units_head)
        w_o = K.batch_dot(att, w_v)                           # (batch * n_head, seq_len, units_head)

        w_o = self._reshape_from_batches(w_o)                 # (batch, seq_len, units)
        w_o = K.dot(w_o, self.kernel_o)                       # (batch, seq_len, units)
        if self.use_bias:
            w_o = K.bias_add(w_o, self.bias_o)
        if self.activation is not None:
            w_o = self.activation(w_o)

        # Add shape information to tensor when using `tf.keras`
        input_shape = K.int_shape(inputs)
        if input_shape[1] is not None:
            w_o = K.reshape(w_o, (-1,) + input_shape[1:])
        return w_o
Example #28
 def call(self, x, mask=None):
     energy = self.activation(K.dot(x, self.W0) + self.b0)
     #energy=self.activation(K.dot(energy, self.W) + self.b)
     energy = K.dot(energy, self.W) + self.b
     energy = K.reshape(energy, (-1, self.input_length))
     energy = K.softmax(energy)
     xx = K.batch_dot(energy, x, axes=(1, 1))
     all = K.concatenate([xx, energy])
     return all
Example #29
def power_iteration(W, u, rounds=1):
    '''
    According to the paper, we only need to do one power iteration.
    '''
    _u = u

    for i in range(rounds):
        _v = _l2normalizer(K.dot(_u, W))
        _u = _l2normalizer(K.dot(_v, K.transpose(W)))

    W_sn = K.sum(K.dot(_u, W) * _v)
    return W_sn, _u, _v
Example #30
    def call(self, inputs, **kwargs):
        W = K.tanh(self.W_hat) * K.sigmoid(self.M_hat)
        a = K.dot(inputs, W)

        if self.nac_only:
            outputs = a
        else:
            m = K.exp(K.dot(K.log(K.abs(inputs) + self.epsilon), W))
            g = K.sigmoid(K.dot(inputs, self.G))
            outputs = g * a + (1. - g) * m

        return outputs
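
Example #30 is a NAC/NALU unit: W = tanh(W_hat) * sigmoid(M_hat) keeps the effective weights near {-1, 0, 1}, the a path adds inputs, the exp(log(...)) path multiplies their magnitudes, and the gate g mixes the two. A rough numeric illustration of the two paths with idealised weights (a standalone sketch, not the original layer):

import tensorflow as tf

x = tf.constant([[3.0, 5.0]])
W = tf.constant([[1.0], [1.0]])                             # idealised NAC weights: select both inputs
print(tf.matmul(x, W))                                      # additive path       -> [[8.]]
print(tf.exp(tf.matmul(tf.math.log(tf.abs(x) + 1e-7), W)))  # multiplicative path -> ~[[15.]]
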
Example #31
  def step(cell_inputs, cell_states):
    """Step function that will be used by Keras RNN backend."""
    h_tm1 = cell_states[0]  # previous memory state
    c_tm1 = cell_states[1]  # previous carry state

    z = K.dot(cell_inputs, kernel)
    z += K.dot(h_tm1, recurrent_kernel)
    z = K.bias_add(z, bias)

    z0, z1, z2, z3 = array_ops.split(z, 4, axis=1)

    i = recurrent_activation(z0)
    f = recurrent_activation(z1)
    c = f * c_tm1 + i * activation(z2)
    o = recurrent_activation(z3)

    h = o * activation(c)
    return h, [h, c]
Example #32
  def step(cell_inputs, cell_states):
    h_tm1 = cell_states[0]  # previous memory state
    c_tm1 = cell_states[1]  # previous carry state

    # Only use the second half of the bias weights.
    _, real_bias = array_ops.split(bias, 2)

    z = K.dot(cell_inputs, kernel)
    z += K.dot(h_tm1, recurrent_kernel)
    z = K.bias_add(z, real_bias)

    z0 = z[:, :units]
    z1 = z[:, units:2 * units]
    z2 = z[:, 2 * units:3 * units]
    z3 = z[:, 3 * units:]

    i = recurrent_activation(z0)
    f = recurrent_activation(z1)
    c = f * c_tm1 + i * activation(z2)
    o = recurrent_activation(z3)

    h = o * activation(c)
    return h, [h, c]
Example #33
  def step(cell_inputs, cell_states):
    """Step function that will be used by Keras RNN backend."""
    h_tm1 = cell_states[0]

    # inputs projected by all gate matrices at once
    matrix_x = K.dot(cell_inputs, kernel)
    matrix_x = K.bias_add(matrix_x, input_bias)

    x_z, x_r, x_h = array_ops.split(matrix_x, 3, axis=1)

    # hidden state projected by all gate matrices at once
    matrix_inner = K.dot(h_tm1, recurrent_kernel)
    matrix_inner = K.bias_add(matrix_inner, recurrent_bias)

    recurrent_z, recurrent_r, recurrent_h = array_ops.split(matrix_inner, 3,
                                                            axis=1)
    z = recurrent_activation(x_z + recurrent_z)
    r = recurrent_activation(x_r + recurrent_r)
    hh = activation(x_h + r * recurrent_h)

    # previous and candidate state mixed by update gate
    h = z * h_tm1 + (1 - z) * hh
    return h, [h]