def step(self, x, states):
    '''
    step() is called by the parent RNN class; it defines the computation
    applied to each input as it propagates through the network.
    states[4] holds the output of the attention layer computed from attention_vec.
    (K below is the Keras backend, i.e. `from keras import backend as K`.)
    '''
    # Let the parent LSTM compute the ordinary recurrent step first.
    h, [h, c] = super(AttentionLSTM, self).step(x, states)
    attention = states[4]

    m = self.attn_inner_activation(K.dot(h, self.U_a) * attention + self.b_a)
    # Intuitively it makes more sense to use a sigmoid (was getting some NaN problems
    # which I think might have been caused by the exponential function -> gradients blow up)
    s = self.attn_activation(K.dot(m, self.U_s) + self.b_s)

    if self.single_attention_param:
        h = h * K.repeat_elements(s, self.output_dim, axis=1)
    else:
        h = h * s

    return h, [h, c]
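In this variant the attention acts as a multiplicative gate on the LSTM output: h is projected by U_a, modulated elementwise by the attention term from states[4], squashed by attn_inner_activation, projected again by U_s, and the resulting s rescales h. A minimal NumPy sketch of that gating (the shapes and the tanh/sigmoid choices are assumptions for illustration; the layer's actual activations are whatever attn_inner_activation / attn_activation were configured to be):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

batch, output_dim = 4, 8                          # hypothetical sizes
h = np.random.randn(batch, output_dim)            # LSTM output for this timestep
attention = np.random.randn(batch, output_dim)    # stand-in for states[4]

U_a = np.random.randn(output_dim, output_dim)
b_a = np.zeros(output_dim)
U_s = np.random.randn(output_dim, output_dim)
b_s = np.zeros(output_dim)

m = np.tanh(h @ U_a * attention + b_a)            # attn_inner_activation ~ tanh (assumed)
s = sigmoid(m @ U_s + b_s)                        # attn_activation ~ sigmoid (per the comment above)
h_gated = h * s                                   # elementwise gate on the LSTM output
print(h_gated.shape)                              # (4, 8)

With single_attention_param=True, U_s would instead map to a single column and the scalar gate would be repeated across output_dim, which is what K.repeat_elements does in the layer code.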
def step(self, x, states):
    '''
    step() is called by the parent RNN class; it defines the computation
    applied to each input as it propagates through the network.
    states[4] holds the output of the attention layer computed from attention_vec.
    '''
    h_tm1 = states[0]   # previous hidden state
    c_tm1 = states[1]   # previous cell state
    B_U = states[2]     # dropout masks for the recurrent weights
    B_W = states[3]     # dropout masks for the input weights

    if self.consume_less == 'cpu':
        # In 'cpu' mode the input projections were precomputed, so just slice them.
        x_i = x[:, :self.output_dim]
        x_f = x[:, self.output_dim: 2 * self.output_dim]
        x_c = x[:, 2 * self.output_dim: 3 * self.output_dim]
        x_o = x[:, 3 * self.output_dim:]
    else:
        x_i = K.dot(x * B_W[0], self.W_i) + self.b_i
        x_f = K.dot(x * B_W[1], self.W_f) + self.b_f
        x_c = K.dot(x * B_W[2], self.W_c) + self.b_c
        x_o = K.dot(x * B_W[3], self.W_o) + self.b_o

    # Standard LSTM gates.
    i = self.inner_activation(x_i + K.dot(h_tm1 * B_U[0], self.U_i))
    f = self.inner_activation(x_f + K.dot(h_tm1 * B_U[1], self.U_f))
    c = f * c_tm1 + i * self.activation(x_c + K.dot(h_tm1 * B_U[2], self.U_c))
    o = self.inner_activation(x_o + K.dot(h_tm1 * B_U[3], self.U_o))
    h = o * self.activation(c)

    attention = states[4]
    m = self.attn_inner_activation(
        K.dot(K.dot(x_i, self.W_i.T), self.U_a) + attention + self.b_a)
    # Intuitively it makes more sense to use a sigmoid (was getting some NaN problems
    # which I think might have been caused by the exponential function -> gradients blow up)
    s = self.attn_activation(K.dot(m, self.U_s) + self.b_s)

    if self.single_attention_param:
        h = h * K.repeat_elements(s, self.output_dim, axis=1)
    else:
        h = h * s

    return h, [h, c]
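Both variants read the attention term from states[4]. In a Keras 1.x-style Recurrent subclass, any extra tensors returned by get_constants() are appended to the state list that K.rnn hands to step() at every timestep, so a term appended after the two dropout-mask lists ends up at index 4. A hedged sketch of what such an override could look like (attention_vec, U_m and b_m are hypothetical names, not taken from the snippet above):

def get_constants(self, x):
    # The parent LSTM returns [B_U, B_W] (recurrent/input dropout masks),
    # which step() sees as states[2] and states[3].
    constants = super(AttentionLSTM, self).get_constants(x)
    # Hypothetical: project the externally supplied attention vector once,
    # outside the recurrence; step() then reads it as states[4].
    attention_term = K.dot(self.attention_vec, self.U_m) + self.b_m
    constants.append(attention_term)
    return constants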