def get_constants(self, x):
    '''
    get_constants is called by the parent LSTM class. It pre-computes the components
    that live outside the step function, so they do not have to be recomputed for
    every input in the sequence.
    '''
    constants = super(AttentionLSTM, self).get_constants(x)
    constants.append(K.dot(self.attention_vec, self.U_m) + self.b_m)
    return constants
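# Hedged sketch (an assumption, not the original source): the snippets above and below
# reference U_m, b_m, U_a, b_a, U_s and b_s without showing where they are created.
# In a Keras 1.x custom recurrent layer they would typically be added in build(),
# roughly as follows; attention_dim (the width of attention_vec) and the use of
# inner_init / K.zeros are assumptions.
def build(self, input_shape):
    super(AttentionLSTM, self).build(input_shape)
    attention_dim = K.int_shape(self.attention_vec)[-1]

    # weights of the attention MLP applied inside step()
    self.U_a = self.inner_init((self.output_dim, self.output_dim), name='{}_U_a'.format(self.name))
    self.b_a = K.zeros((self.output_dim,), name='{}_b_a'.format(self.name))

    # projection of the external attention vector, used in get_constants()
    self.U_m = self.inner_init((attention_dim, self.output_dim), name='{}_U_m'.format(self.name))
    self.b_m = K.zeros((self.output_dim,), name='{}_b_m'.format(self.name))

    # attention score head: a single scalar per timestep, or one value per unit
    if self.single_attention_param:
        self.U_s = self.inner_init((self.output_dim, 1), name='{}_U_s'.format(self.name))
        self.b_s = K.zeros((1,), name='{}_b_s'.format(self.name))
    else:
        self.U_s = self.inner_init((self.output_dim, self.output_dim), name='{}_U_s'.format(self.name))
        self.b_s = K.zeros((self.output_dim,), name='{}_b_s'.format(self.name))

    self.trainable_weights += [self.U_a, self.U_m, self.U_s, self.b_a, self.b_m, self.b_s]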
def call(self, inputs, mask=None):
    # w_c = K.repeat(self.W_c, self.input_num)
    # w_m = K.repeat(self.W_m, self.input_num)
    x = inputs[0]
    mem_vector = inputs[1]
    c = K.dot(x, self.W_c) + self.b_c                # context vector
    m = K.dot(x, self.W_m) + self.b_m                # memory vector
    mem_vec = K.repeat(mem_vector, self.input_num)   # tile the question vector along the time axis
    m = K.sum(m * mem_vec, axis=2, keepdims=False)   # inner product with the question
    s = K.softmax(m)                                 # softmax over the timesteps
    s = K.reshape(s, (-1, self.input_num, 1))
    ctx = self.activation(c * s)
    return ctx
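# Hedged wiring sketch for the call() above. The class name MemoryAttention, the
# constructor arguments, and the sizes input_num / embed_dim are assumptions; the layer
# simply receives [x, mem_vector] as a list, exactly as call() unpacks them.
from keras.layers import Input
from keras.models import Model

input_num, embed_dim = 30, 128

story = Input(shape=(input_num, embed_dim))    # sequence to attend over: (batch, input_num, embed_dim)
question = Input(shape=(embed_dim,))           # memory / question vector: (batch, embed_dim)

ctx = MemoryAttention(input_num=input_num, output_dim=embed_dim)([story, question])  # hypothetical constructor
model = Model(input=[story, question], output=ctx)                                   # Keras 1.x Model signature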
def get_constants(self, inputs):
    '''
    get_constants is called by the parent LSTM class. It pre-computes the components
    that live outside the step function, so they do not have to be recomputed for
    every input in the sequence. This variant takes the attention vector as a second
    input instead of a layer attribute.
    '''
    x = inputs[0]
    attention_vec = inputs[1]
    constants = super(AttentionLSTM, self).get_constants(x)
    constants.append(K.dot(attention_vec, self.U_m) + self.b_m)
    return constants
def step(self, x, states):
    '''
    step is called by the parent RNN class; it defines the computation applied to
    each input of the sequence.
    states[4] holds the projection of attention_vec appended in get_constants.
    '''
    h, [h, c] = super(AttentionLSTM, self).step(x, states)
    attention = states[4]

    m = self.attn_inner_activation(K.dot(h, self.U_a) * attention + self.b_a)
    # Intuitively it makes more sense to use a sigmoid (was getting some NaN problems
    # which I think might have been caused by the exponential function -> gradients blow up)
    s = self.attn_activation(K.dot(m, self.U_s) + self.b_s)

    if self.single_attention_param:
        h = h * K.repeat_elements(s, self.output_dim, axis=1)
    else:
        h = h * s

    return h, [h, c]
def keras_loss(y_true, y_pred):
    # CCA-style loss: y_pred is assumed to be the concatenation of two view
    # representations; the loss is the negative total canonical correlation between
    # them. k_singular_values and representation_size are expected to be defined at
    # module scope.
    regularization_constant_1 = regularization_constant_2 = 1e-4
    epsilon = 1e-12

    o1 = o2 = int(y_pred.shape[1] // 2)

    # split the two views and put features on the rows, samples on the columns
    h_1 = y_pred[:, 0:o1]
    h_2 = y_pred[:, o1:o1 + o2]
    h_1 = tf.transpose(h_1)
    h_2 = tf.transpose(h_2)

    m = tf.shape(h_1)[1]

    # center each view over the batch
    centered_h_1 = h_1 - tf.cast(tf.divide(1, m), tf.float32) * tf.matmul(h_1, tf.ones(shape=(m, m)))
    centered_h_2 = h_2 - tf.cast(tf.divide(1, m), tf.float32) * tf.matmul(h_2, tf.ones(shape=(m, m)))

    # cross- and (regularized) within-view covariance estimates
    sigma_hat_12 = tf.cast(tf.divide(1, m - 1), tf.float32) * tf.matmul(centered_h_1, tf.transpose(centered_h_2))
    sigma_hat_11 = tf.cast(tf.divide(1, m - 1), tf.float32) * tf.matmul(centered_h_1, tf.transpose(centered_h_1)) \
        + regularization_constant_1 * tf.eye(num_rows=o1)
    sigma_hat_22 = tf.cast(tf.divide(1, m - 1), tf.float32) * tf.matmul(centered_h_2, tf.transpose(centered_h_2)) \
        + regularization_constant_2 * tf.eye(num_rows=o2)

    w_1, v_1 = tf.self_adjoint_eig(sigma_hat_11)
    w_2, v_2 = tf.self_adjoint_eig(sigma_hat_22)

    # keep only eigenvalues safely above zero before taking inverse square roots
    idx_pos_entries_1 = tf.where(tf.equal(tf.greater(w_1, epsilon), True))
    idx_pos_entries_1 = tf.reshape(idx_pos_entries_1, [-1, tf.shape(idx_pos_entries_1)[0]])[0]
    w_1 = tf.gather(w_1, idx_pos_entries_1)
    v_1 = tf.gather(v_1, idx_pos_entries_1)

    idx_pos_entries_2 = tf.where(tf.equal(tf.greater(w_2, epsilon), True))
    idx_pos_entries_2 = tf.reshape(idx_pos_entries_2, [-1, tf.shape(idx_pos_entries_2)[0]])[0]
    w_2 = tf.gather(w_2, idx_pos_entries_2)
    v_2 = tf.gather(v_2, idx_pos_entries_2)

    # inverse matrix square roots of the within-view covariances
    sigma_hat_rootinvert_11 = tf.matmul(tf.matmul(v_1, tf.diag(tf.divide(1, tf.sqrt(w_1)))), tf.transpose(v_1))
    sigma_hat_rootinvert_22 = tf.matmul(tf.matmul(v_2, tf.diag(tf.divide(1, tf.sqrt(w_2)))), tf.transpose(v_2))

    t_matrix = tf.matmul(tf.matmul(sigma_hat_rootinvert_11, sigma_hat_12), sigma_hat_rootinvert_22)

    if k_singular_values == representation_size:
        # use all singular values of T: total correlation is the Frobenius norm
        correlation = tf.sqrt(tf.trace(tf.matmul(tf.transpose(t_matrix), t_matrix)))
    else:
        w, v = tf.self_adjoint_eig(K.dot(K.transpose(t_matrix), t_matrix))
        non_critical_indexes = tf.where(tf.equal(tf.greater(w, epsilon), True))
        non_critical_indexes = tf.reshape(non_critical_indexes, [-1, tf.shape(non_critical_indexes)[0]])[0]
        w = tf.gather(w, non_critical_indexes)
        # sort the eigenvalues in descending order so the largest correlations are kept
        w = tf.gather(w, tf.nn.top_k(w, k=tf.shape(w)[0]).indices)
        correlation = tf.reduce_sum(tf.sqrt(w[0:representation_size]))

    return -correlation
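# Hedged usage sketch for keras_loss above: y_pred is the concatenation of the two view
# representations along the feature axis, and y_true is ignored, so a dummy target of
# matching shape is passed to fit(). The network sizes, optimizer, and the module-level
# representation_size / k_singular_values values are assumptions (Keras 1.x style API).
import numpy as np
import tensorflow as tf
from keras import backend as K
from keras.layers import Input, Dense, merge
from keras.models import Model

representation_size = k_singular_values = 10     # assumed module-level hyper-parameters

view_1 = Input(shape=(100,))
view_2 = Input(shape=(100,))
h_1 = Dense(representation_size, activation='linear')(Dense(256, activation='relu')(view_1))
h_2 = Dense(representation_size, activation='linear')(Dense(256, activation='relu')(view_2))

joint = merge([h_1, h_2], mode='concat')         # y_pred = [h_1 | h_2]
model = Model(input=[view_1, view_2], output=joint)
model.compile(optimizer='rmsprop', loss=keras_loss)

# dummy targets: the loss only looks at y_pred
X1, X2 = np.random.rand(512, 100), np.random.rand(512, 100)
model.fit([X1, X2], np.zeros((512, 2 * representation_size)), batch_size=256, nb_epoch=10)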
def step(self, x, states):
    '''
    step is called by the parent RNN class; it defines the computation applied to
    each input of the sequence.
    states[4] holds the projection of attention_vec appended in get_constants.
    '''
    h_tm1 = states[0]
    c_tm1 = states[1]
    B_U = states[2]
    B_W = states[3]

    if self.consume_less == 'cpu':
        x_i = x[:, :self.output_dim]
        x_f = x[:, self.output_dim:2 * self.output_dim]
        x_c = x[:, 2 * self.output_dim:3 * self.output_dim]
        x_o = x[:, 3 * self.output_dim:]
    else:
        x_i = K.dot(x * B_W[0], self.W_i) + self.b_i
        x_f = K.dot(x * B_W[1], self.W_f) + self.b_f
        x_c = K.dot(x * B_W[2], self.W_c) + self.b_c
        x_o = K.dot(x * B_W[3], self.W_o) + self.b_o

    # standard LSTM gate computations
    i = self.inner_activation(x_i + K.dot(h_tm1 * B_U[0], self.U_i))
    f = self.inner_activation(x_f + K.dot(h_tm1 * B_U[1], self.U_f))
    c = f * c_tm1 + i * self.activation(x_c + K.dot(h_tm1 * B_U[2], self.U_c))
    o = self.inner_activation(x_o + K.dot(h_tm1 * B_U[3], self.U_o))
    h = o * self.activation(c)

    attention = states[4]
    # combine the input-gate pre-activation (mapped back through W_i) with the
    # pre-computed attention projection from get_constants; K.transpose is the
    # backend-agnostic equivalent of the original Theano-style self.W_i.T
    m = self.attn_inner_activation(
        K.dot(K.dot(x_i, K.transpose(self.W_i)), self.U_a) + attention + self.b_a)
    # Intuitively it makes more sense to use a sigmoid (was getting some NaN problems
    # which I think might have been caused by the exponential function -> gradients blow up)
    s = self.attn_activation(K.dot(m, self.U_s) + self.b_s)

    if self.single_attention_param:
        h = h * K.repeat_elements(s, self.output_dim, axis=1)
    else:
        h = h * s

    return h, [h, c]
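# Hedged end-to-end sketch of how AttentionLSTM might be dropped into a model. The
# constructor signature (an attention_vec keyword and single_attention_param flag),
# the question encoder, and all sizes are assumptions, not the original source.
from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Model

maxlen, vocab_size, embed_dim, hidden_dim = 40, 20000, 128, 128

story = Input(shape=(maxlen,), dtype='int32')
question = Input(shape=(maxlen,), dtype='int32')

embed = Embedding(vocab_size, embed_dim)
question_vec = LSTM(hidden_dim)(embed(question))              # fixed-size question encoding

# the question encoding is handed to the layer and surfaces in step() as states[4]
answer = AttentionLSTM(hidden_dim, attention_vec=question_vec,
                       single_attention_param=False)(embed(story))
out = Dense(1, activation='sigmoid')(answer)

model = Model(input=[story, question], output=out)            # Keras 1.x Model signature
model.compile(optimizer='adam', loss='binary_crossentropy')   # task-specific choice, assumed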