Example 1
    def get_constants(self, x):
        '''
        get_constants is called by the parent LSTM class; it defines the components
        that live outside the step function, so they do not have to be recomputed
        for every timestep of the input sequence.
        '''
        constants = super(AttentionLSTM, self).get_constants(x)
        constants.append(K.dot(self.attention_vec, self.U_m) + self.b_m)
        return constants
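In Keras 1.x, the constants returned here are appended to the state list that the parent RNN passes to step, after the recurrent states and the dropout masks; that is why the attention term shows up as states[4] in the step implementations below. A rough sketch of the ordering (inferred from the examples, not part of the original code):

    # State ordering inside step(), assuming the standard Keras 1.x LSTM:
    #   states[0] -> h_tm1  (previous hidden state)
    #   states[1] -> c_tm1  (previous cell state)
    #   states[2] -> B_U    (recurrent dropout masks)
    #   states[3] -> B_W    (input dropout masks)
    #   states[4] -> K.dot(attention_vec, U_m) + b_m  (the constant appended above)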
Example 2
    def call(self, inputs, mask=None):
        # w_c = K.repeat(self.W_c, self.input_num)
        # w_m = K.repeat(self.W_m, self.input_num)

        x = inputs[0]
        mem_vector = inputs[1]

        c = K.dot(x, self.W_c) + self.b_c  # context vectors
        m = K.dot(x, self.W_m) + self.b_m  # memory vectors
        mem_vec = K.repeat(mem_vector, self.input_num)  # tile the question vector for the inner product
        m = K.sum(m * mem_vec, axis=2, keepdims=False)
        s = K.softmax(m)  # attention weights over the input positions
        s = K.reshape(s, (-1, self.input_num, 1))
        ctx = self.activation(c * s)

        return ctx  # alternatively: self.activation(ctx)
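For reference, the shapes implied by this call method (inferred from the code; names such as input_dim and units are assumptions): x is a sequence of vectors, mem_vector is a single question/memory vector, and the output keeps the sequence length while weighting each position by its attention score.

    # Shape walkthrough, assuming x: (batch, input_num, input_dim),
    # mem_vector: (batch, units), W_c / W_m: (input_dim, units):
    #   c       -> (batch, input_num, units)
    #   m       -> (batch, input_num, units)
    #   mem_vec -> (batch, input_num, units)   K.repeat tiles along a new axis 1
    #   m (sum) -> (batch, input_num)          per-position dot product with the question
    #   s       -> (batch, input_num, 1)       softmax over the positions
    #   ctx     -> (batch, input_num, units)   attention-weighted context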
Example 3
    def get_constants(self, inputs):
        '''
        get_constants is called by the parent LSTM class; it defines the components
        that live outside the step function, so they do not have to be recomputed
        for every timestep of the input sequence.
        '''
        x = inputs[0]
        attention_vec = inputs[1]
        constants = super(AttentionLSTM, self).get_constants(x)
        constants.append(K.dot(attention_vec, self.U_m) + self.b_m)
        return constants
Example 4
    def step(self, x, states):
        '''
        step is called by the parent RNN class; it defines the computation applied
        to each timestep as it propagates through the network.
        states[4] holds the attention_vec projection produced by get_constants.
        '''
        h, [h, c] = super(AttentionLSTM, self).step(x, states)
        attention = states[4]

        m = self.attn_inner_activation(
            K.dot(h, self.U_a) * attention + self.b_a)
        # Intuitively it makes more sense to use a sigmoid (was getting some NaN problems
        # which I think might have been caused by the exponential function -> gradients blow up)
        s = self.attn_activation(K.dot(m, self.U_s) + self.b_s)

        if self.single_attention_param:
            h = h * K.repeat_elements(s, self.output_dim, axis=1)
        else:
            h = h * s
        return h, [h, c]
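When single_attention_param is set, the attention head presumably produces a single score per sample that is shared across all hidden units; otherwise each unit gets its own gate. The repeat in the branch above makes this explicit (shapes inferred from the code, not stated in the source):

    # single_attention_param == True:  s has shape (batch, 1); K.repeat_elements
    # tiles it output_dim times along axis 1 to (batch, output_dim), so every
    # hidden unit is scaled by the same attention value.
    # single_attention_param == False: s already has shape (batch, output_dim)
    # and each hidden unit is scaled individually.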
Example 5
    def keras_loss(y_true, y_pred):
        # Negative canonical correlation (deep CCA-style loss): y_pred stacks the
        # two learned views side by side, and the loss rewards correlation between
        # them. `k_singular_values` and `representation_size` are taken from the
        # enclosing scope.

        regularization_constant_1 = regularization_constant_2 = 1e-4
        epsilon = 1e-12

        o1 = o2 = int(y_pred.shape[1] // 2)

        # split the prediction into the two views
        h_1 = y_pred[:, 0:o1]
        h_2 = y_pred[:, o1:o1 + o2]

        h_1 = tf.transpose(h_1)
        h_2 = tf.transpose(h_2)

        m = tf.shape(h_1)[1]  # number of samples in the batch

        # center each view across the batch
        centered_h_1 = h_1 - tf.cast(tf.divide(1, m), tf.float32) * tf.matmul(h_1, tf.ones(shape=(m, m)))
        centered_h_2 = h_2 - tf.cast(tf.divide(1, m), tf.float32) * tf.matmul(h_2, tf.ones(shape=(m, m)))

        # regularized covariance and cross-covariance estimates
        sigma_hat_12 = tf.cast(tf.divide(1, m - 1), tf.float32) * tf.matmul(centered_h_1, tf.transpose(centered_h_2))
        sigma_hat_11 = tf.cast(tf.divide(1, m - 1), tf.float32) * tf.matmul(centered_h_1, tf.transpose(centered_h_1)) + regularization_constant_1 * tf.eye(num_rows=o1)
        sigma_hat_22 = tf.cast(tf.divide(1, m - 1), tf.float32) * tf.matmul(centered_h_2, tf.transpose(centered_h_2)) + regularization_constant_2 * tf.eye(num_rows=o2)

        # eigendecompose the two (symmetric) covariance estimates
        w_1, v_1 = tf.self_adjoint_eig(sigma_hat_11)
        w_2, v_2 = tf.self_adjoint_eig(sigma_hat_22)

        # keep only eigenvalues above epsilon (and the matching eigenvectors)
        # so the inverse square roots below stay numerically stable
        idx_pos_entries_1 = tf.where(tf.equal(tf.greater(w_1, epsilon), True))
        idx_pos_entries_1 = tf.reshape(idx_pos_entries_1, [-1, tf.shape(idx_pos_entries_1)[0]])[0]
        w_1 = tf.gather(w_1, idx_pos_entries_1)
        v_1 = tf.gather(v_1, idx_pos_entries_1)

        idx_pos_entries_2 = tf.where(tf.equal(tf.greater(w_2, epsilon), True))
        idx_pos_entries_2 = tf.reshape(idx_pos_entries_2, [-1, tf.shape(idx_pos_entries_2)[0]])[0]
        w_2 = tf.gather(w_2, idx_pos_entries_2)
        v_2 = tf.gather(v_2, idx_pos_entries_2)

        # sigma_hat_ii^{-1/2} = V * diag(1 / sqrt(w)) * V^T
        sigma_hat_rootinvert_11 = tf.matmul(tf.matmul(v_1, tf.diag(tf.divide(1, tf.sqrt(w_1)))), tf.transpose(v_1))
        sigma_hat_rootinvert_22 = tf.matmul(tf.matmul(v_2, tf.diag(tf.divide(1, tf.sqrt(w_2)))), tf.transpose(v_2))

        # T = sigma_hat_11^{-1/2} * sigma_hat_12 * sigma_hat_22^{-1/2}
        t_matrix = tf.matmul(tf.matmul(sigma_hat_rootinvert_11, sigma_hat_12), sigma_hat_rootinvert_22)

        if k_singular_values == representation_size:    # use all singular values
            correlation = tf.sqrt(tf.trace(tf.matmul(tf.transpose(t_matrix), t_matrix)))
        else:
            w, v = tf.self_adjoint_eig(K.dot(K.transpose(t_matrix), t_matrix))
            non_critical_indexes = tf.where(tf.equal(tf.greater(w, epsilon), True))
            non_critical_indexes = tf.reshape(non_critical_indexes, [-1, tf.shape(non_critical_indexes)[0]])[0]
            w = tf.gather(w, non_critical_indexes)
            # sort the remaining eigenvalues in descending order, then keep the top ones
            w = tf.gather(w, tf.nn.top_k(w, k=tf.shape(w)[0]).indices)
            correlation = tf.reduce_sum(tf.sqrt(w[0:representation_size]))

        return -correlation  # maximize the correlation by minimizing its negative
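keras_loss reads k_singular_values and representation_size from its enclosing scope, so it is presumably defined inside a wrapper that binds them. A minimal sketch of such a wrapper and of how the loss could be attached to a model (the factory name and the hyperparameter values are illustrative assumptions, not from the source):

    def make_cca_loss(representation_size, k_singular_values):
        # Hypothetical closure binding the two free variables used inside
        # keras_loss; the body of keras_loss is exactly the function above.
        def keras_loss(y_true, y_pred):
            ...  # body as in the example above
        return keras_loss

    # Illustrative usage: the model's output concatenates the two views, so
    # y_pred[:, :o1] is view 1 and y_pred[:, o1:] is view 2.
    # model.compile(optimizer='rmsprop', loss=make_cca_loss(10, 10))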
Example 6
    def step(self, x, states):
        '''
        step is called by the parent RNN class; it defines the computation applied
        to each timestep as it propagates through the network.
        states[4] holds the attention_vec projection produced by get_constants.
        '''
        h_tm1 = states[0]  # previous hidden state
        c_tm1 = states[1]  # previous cell state
        B_U = states[2]    # recurrent dropout masks
        B_W = states[3]    # input dropout masks

        if self.consume_less == 'cpu':
            x_i = x[:, :self.output_dim]
            x_f = x[:, self.output_dim:2 * self.output_dim]
            x_c = x[:, 2 * self.output_dim:3 * self.output_dim]
            x_o = x[:, 3 * self.output_dim:]
        else:
            x_i = K.dot(x * B_W[0], self.W_i) + self.b_i
            x_f = K.dot(x * B_W[1], self.W_f) + self.b_f
            x_c = K.dot(x * B_W[2], self.W_c) + self.b_c
            x_o = K.dot(x * B_W[3], self.W_o) + self.b_o

        i = self.inner_activation(x_i + K.dot(h_tm1 * B_U[0], self.U_i))  # input gate
        f = self.inner_activation(x_f + K.dot(h_tm1 * B_U[1], self.U_f))  # forget gate
        c = f * c_tm1 + i * self.activation(x_c +
                                            K.dot(h_tm1 * B_U[2], self.U_c))  # new cell state
        o = self.inner_activation(x_o + K.dot(h_tm1 * B_U[3], self.U_o))  # output gate

        h = o * self.activation(c)

        attention = states[4]
        m = self.attn_inner_activation(
            K.dot(K.dot(x_i, self.W_i.T), self.U_a) + attention + self.b_a)
        # Intuitively it makes more sense to use a sigmoid (was getting some NaN problems
        # which I think might have been caused by the exponential function -> gradients blow up)
        s = self.attn_activation(K.dot(m, self.U_s) + self.b_s)

        if self.single_attention_param:
            h = h * K.repeat_elements(s, self.output_dim, axis=1)
        else:
            h = h * s
        return h, [h, c]