class LstmLayer(object):
    def __init__(self, input_width, state_width, learning_rate):
        self.input_width = input_width
        self.state_width = state_width
        self.learning_rate = learning_rate
        # Activation function for the gates
        self.gate_activator = SigmoidActivator()
        # Activation function for the output
        self.output_activator = TanhActivator()
        # Current time step, initialized to t0
        self.times = 0
        # Cell state vectors c at each time step
        self.c_list = self.init_state_vec()
        # Output vectors h at each time step
        self.h_list = self.init_state_vec()
        # Forget gate f at each time step
        self.f_list = self.init_state_vec()
        # Input gate i at each time step
        self.i_list = self.init_state_vec()
        # Output gate o at each time step
        self.o_list = self.init_state_vec()
        # Candidate state c~ at each time step
        self.ct_list = self.init_state_vec()
        # Forget gate weight matrices Wfh, Wfx and bias bf
        self.Wfh, self.Wfx, self.bf = self.init_weight_mat()
        # Input gate weight matrices Wih, Wix and bias bi
        self.Wih, self.Wix, self.bi = self.init_weight_mat()
        # Output gate weight matrices Woh, Wox and bias bo
        self.Woh, self.Wox, self.bo = self.init_weight_mat()
        # Cell state weight matrices Wch, Wcx and bias bc
        self.Wch, self.Wcx, self.bc = self.init_weight_mat()

    def init_state_vec(self):
        '''
        Initialize a list that stores the state vectors
        '''
        state_vec_list = []
        state_vec_list.append(np.zeros((self.state_width, 1)))
        return state_vec_list

    def init_weight_mat(self):
        '''
        Initialize one pair of weight matrices and a bias vector
        '''
        Wh = np.random.uniform(-1e-4, 1e-4,
                               (self.state_width, self.state_width))
        Wx = np.random.uniform(-1e-4, 1e-4,
                               (self.state_width, self.input_width))
        b = np.zeros((self.state_width, 1))
        return Wh, Wx, b

    def forward(self, x):
        '''
        Forward pass, following Eq. 1 through Eq. 6
        '''
        self.times += 1
        # Forget gate
        fg = self.calc_gate(x, self.Wfx, self.Wfh,
                            self.bf, self.gate_activator)
        self.f_list.append(fg)
        # Input gate
        ig = self.calc_gate(x, self.Wix, self.Wih,
                            self.bi, self.gate_activator)
        self.i_list.append(ig)
        # Output gate
        og = self.calc_gate(x, self.Wox, self.Woh,
                            self.bo, self.gate_activator)
        self.o_list.append(og)
        # Candidate state
        ct = self.calc_gate(x, self.Wcx, self.Wch,
                            self.bc, self.output_activator)
        self.ct_list.append(ct)
        # Cell state
        c = fg * self.c_list[self.times - 1] + ig * ct
        self.c_list.append(c)
        # Output
        h = og * self.output_activator.forward(c)
        self.h_list.append(h)

    def calc_gate(self, x, Wx, Wh, b, activator):
        '''
        Compute one gate
        '''
        h = self.h_list[self.times - 1]  # previous LSTM output
        net = np.dot(Wh, h) + np.dot(Wx, x) + b
        gate = activator.forward(net)
        return gate

    def backward(self, x, delta_h, activator):
        '''
        LSTM training algorithm
        '''
        self.calc_delta(delta_h, activator)
        self.calc_gradient(x)

    def update(self):
        '''
        Update the weights by gradient descent
        '''
        self.Wfh -= self.learning_rate * self.Wfh_grad
        self.Wfx -= self.learning_rate * self.Wfx_grad
        self.bf -= self.learning_rate * self.bf_grad
        self.Wih -= self.learning_rate * self.Wih_grad
        self.Wix -= self.learning_rate * self.Wix_grad
        self.bi -= self.learning_rate * self.bi_grad
        self.Woh -= self.learning_rate * self.Woh_grad
        self.Wox -= self.learning_rate * self.Wox_grad
        self.bo -= self.learning_rate * self.bo_grad
        self.Wch -= self.learning_rate * self.Wch_grad
        self.Wcx -= self.learning_rate * self.Wcx_grad
        self.bc -= self.learning_rate * self.bc_grad

    def calc_delta(self, delta_h, activator):
        # Initialize the error terms at every time step
        self.delta_h_list = self.init_delta()   # output errors
        self.delta_o_list = self.init_delta()   # output gate errors
        self.delta_i_list = self.init_delta()   # input gate errors
        self.delta_f_list = self.init_delta()   # forget gate errors
        self.delta_ct_list = self.init_delta()  # candidate state errors
        # Save the error passed down from the layer above at the current time
        self.delta_h_list[-1] = delta_h
        # Compute the error term at every time step, going backwards
        for k in range(self.times, 0, -1):
            self.calc_delta_k(k)

    def init_delta(self):
        '''
        Initialize the error terms
        '''
        delta_list = []
        for i in range(self.times + 1):
            delta_list.append(np.zeros((self.state_width, 1)))
        return delta_list

    def calc_delta_k(self, k):
        '''
        Given delta_h at time k, compute delta_f, delta_i, delta_o
        and delta_ct at time k, and delta_h at time k-1
        '''
        # Values saved by the forward pass at time k
        ig = self.i_list[k]
        og = self.o_list[k]
        fg = self.f_list[k]
        ct = self.ct_list[k]
        c = self.c_list[k]
        c_prev = self.c_list[k - 1]
        tanh_c = self.output_activator.forward(c)
        delta_k = self.delta_h_list[k]
        # delta_o, per Eq. 9
        delta_o = (delta_k * tanh_c *
                   self.gate_activator.backward(og))
        delta_f = (delta_k * og * (1 - tanh_c * tanh_c) *
                   c_prev * self.gate_activator.backward(fg))
        delta_i = (delta_k * og * (1 - tanh_c * tanh_c) *
                   ct * self.gate_activator.backward(ig))
        delta_ct = (delta_k * og * (1 - tanh_c * tanh_c) *
                    ig * self.output_activator.backward(ct))
        delta_h_prev = (np.dot(delta_o.transpose(), self.Woh) +
                        np.dot(delta_i.transpose(), self.Wih) +
                        np.dot(delta_f.transpose(), self.Wfh) +
                        np.dot(delta_ct.transpose(), self.Wch)
                        ).transpose()
        # Save all of the delta values
        self.delta_h_list[k - 1] = delta_h_prev
        self.delta_f_list[k] = delta_f
        self.delta_i_list[k] = delta_i
        self.delta_o_list[k] = delta_o
        self.delta_ct_list[k] = delta_ct

    def calc_gradient(self, x):
        # Initialize the forget gate weight and bias gradients
        self.Wfh_grad, self.Wfx_grad, self.bf_grad = \
            self.init_weight_gradient_mat()
        # Initialize the input gate weight and bias gradients
        self.Wih_grad, self.Wix_grad, self.bi_grad = \
            self.init_weight_gradient_mat()
        # Initialize the output gate weight and bias gradients
        self.Woh_grad, self.Wox_grad, self.bo_grad = \
            self.init_weight_gradient_mat()
        # Initialize the cell state weight and bias gradients
        self.Wch_grad, self.Wcx_grad, self.bc_grad = \
            self.init_weight_gradient_mat()
        # Gradients with respect to the previous output h
        for t in range(self.times, 0, -1):
            # Gradients at each time step
            (Wfh_grad, bf_grad,
             Wih_grad, bi_grad,
             Woh_grad, bo_grad,
             Wch_grad, bc_grad) = self.calc_gradient_t(t)
            # The actual gradient is the sum over all time steps
            self.Wfh_grad += Wfh_grad
            self.bf_grad += bf_grad
            self.Wih_grad += Wih_grad
            self.bi_grad += bi_grad
            self.Woh_grad += Woh_grad
            self.bo_grad += bo_grad
            self.Wch_grad += Wch_grad
            self.bc_grad += bc_grad
        # Gradients with respect to the current input x
        xt = x.transpose()
        self.Wfx_grad = np.dot(self.delta_f_list[-1], xt)
        self.Wix_grad = np.dot(self.delta_i_list[-1], xt)
        self.Wox_grad = np.dot(self.delta_o_list[-1], xt)
        self.Wcx_grad = np.dot(self.delta_ct_list[-1], xt)

    def init_weight_gradient_mat(self):
        '''
        Initialize the weight gradient matrices
        '''
        Wh_grad = np.zeros((self.state_width, self.state_width))
        Wx_grad = np.zeros((self.state_width, self.input_width))
        b_grad = np.zeros((self.state_width, 1))
        return Wh_grad, Wx_grad, b_grad

    def calc_gradient_t(self, t):
        '''
        Compute the weight gradients at time step t
        '''
        h_prev = self.h_list[t - 1].transpose()
        Wfh_grad = np.dot(self.delta_f_list[t], h_prev)
        bf_grad = self.delta_f_list[t]
        Wih_grad = np.dot(self.delta_i_list[t], h_prev)
        bi_grad = self.delta_i_list[t]
        Woh_grad = np.dot(self.delta_o_list[t], h_prev)
        bo_grad = self.delta_o_list[t]
        Wch_grad = np.dot(self.delta_ct_list[t], h_prev)
        bc_grad = self.delta_ct_list[t]
        return Wfh_grad, bf_grad, Wih_grad, bi_grad, \
            Woh_grad, bo_grad, Wch_grad, bc_grad

    def reset_state(self):
        # Reset the current time step to t0
        self.times = 0
        # Cell state vectors c at each time step
        self.c_list = self.init_state_vec()
        # Output vectors h at each time step
        self.h_list = self.init_state_vec()
        # Forget gate f at each time step
        self.f_list = self.init_state_vec()
        # Input gate i at each time step
        self.i_list = self.init_state_vec()
        # Output gate o at each time step
        self.o_list = self.init_state_vec()
        # Candidate state c~ at each time step
        self.ct_list = self.init_state_vec()
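Neither SigmoidActivator nor TanhActivator is defined in this section, and the class assumes numpy is imported as np. The delta formulas above call backward with the output of the activation (for example gate_activator.backward(og) stands for og * (1 - og)), so a minimal sketch consistent with that usage looks like this:

import numpy as np

class SigmoidActivator(object):
    def forward(self, weighted_input):
        return 1.0 / (1.0 + np.exp(-weighted_input))

    def backward(self, output):
        # derivative expressed in terms of the forward output
        return output * (1 - output)

class TanhActivator(object):
    def forward(self, weighted_input):
        return 2.0 / (1.0 + np.exp(-2 * weighted_input)) - 1.0

    def backward(self, output):
        # tanh'(x) = 1 - tanh(x)^2, again in terms of the output
        return 1 - output * output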
class LstmLayer(object):
    '''
    Compared with an RNN (which has only a hidden state), the hidden layer adds a cell state.
    How is the long-term cell state controlled? Through three switches, each implemented as
    a gate (an input-to-output activation):

    The first switch controls whether the long-term state c keeps being remembered.
    Forget gate: decides how much of the previous cell state Ct-1 is carried into the current Ct.
      1. Forget gate: Ft = sigmoid(Wf*[Ht-1, Xt] + Bf)

    The second switch controls whether the immediate state is written into the long-term state c.
    Input gate: decides how much of the current input Xt is stored into the cell state Ct.
      1. Input gate: It = sigmoid(Wi*[Ht-1, Xt] + Bi)
      2. Candidate cell state: Ct' = tanh(Wc*[Ht-1, Xt] + Bc)
      3. Current cell state (current memory Ct' fused with long-term memory Ct-1):
         Ct = Ft o Ct-1 + It o Ct'   (o denotes the element-wise product)

    The third switch controls whether the long-term state c is emitted as the current LSTM output.
    Output gate: controls how much of the cell state Ct goes into the current output value Ht.
      1. Output gate: Ot = sigmoid(Wo*[Ht-1, Xt] + Bo)
      2. Final LSTM output: Ht = Ot o tanh(Ct)
    '''
    def __init__(self, input_width, state_width, learning_rate):
        self.input_width = input_width
        self.state_width = state_width
        self.learning_rate = learning_rate
        # Activation function for the gates
        self.gate_activator = SigmoidActivator()
        # Activation function for the output
        self.output_activator = TanhActivator()
        # Current time step, initialized to t0
        self.times = 0
        # Cell state vectors c at each time step
        self.c_list = self.init_state_vec()
        # Output vectors h at each time step
        self.h_list = self.init_state_vec()
        # Forget gate f at each time step
        self.f_list = self.init_state_vec()
        # Input gate i at each time step
        self.i_list = self.init_state_vec()
        # Output gate o at each time step
        self.o_list = self.init_state_vec()
        # Candidate state c~ at each time step
        self.ct_list = self.init_state_vec()
        # Forget gate weight matrices Wfh, Wfx and bias bf
        self.Wfh, self.Wfx, self.bf = self.init_weight_mat()
        # Input gate weight matrices Wih, Wix and bias bi
        self.Wih, self.Wix, self.bi = self.init_weight_mat()
        # Output gate weight matrices Woh, Wox and bias bo
        self.Woh, self.Wox, self.bo = self.init_weight_mat()
        # Cell state weight matrices Wch, Wcx and bias bc
        self.Wch, self.Wcx, self.bc = self.init_weight_mat()

    def init_state_vec(self):
        '''
        Initialize a list that stores the state vectors
        '''
        state_vec_list = []
        state_vec_list.append(np.zeros((self.state_width, 1)))
        return state_vec_list

    def init_weight_mat(self):
        '''
        Initialize one pair of weight matrices and a bias vector
        '''
        Wh = np.random.uniform(-1e-4, 1e-4,
                               (self.state_width, self.state_width))
        Wx = np.random.uniform(-1e-4, 1e-4,
                               (self.state_width, self.input_width))
        b = np.zeros((self.state_width, 1))
        return Wh, Wx, b

    def forward(self, x):
        '''
        Forward pass
        '''
        self.times += 1
        # Forget gate
        fg = self.calc_gate(x, self.Wfx, self.Wfh,
                            self.bf, self.gate_activator)
        self.f_list.append(fg)
        # Input gate
        ig = self.calc_gate(x, self.Wix, self.Wih,
                            self.bi, self.gate_activator)
        self.i_list.append(ig)
        # Output gate
        og = self.calc_gate(x, self.Wox, self.Woh,
                            self.bo, self.gate_activator)
        self.o_list.append(og)
        # Candidate state
        ct = self.calc_gate(x, self.Wcx, self.Wch,
                            self.bc, self.output_activator)
        self.ct_list.append(ct)
        # Cell state
        c = fg * self.c_list[self.times - 1] + ig * ct
        self.c_list.append(c)
        # Output
        h = og * self.output_activator.forward(c)
        self.h_list.append(h)

    def calc_gate(self, x, Wx, Wh, b, activator):
        '''
        Compute one gate
        '''
        h = self.h_list[self.times - 1]  # previous LSTM output
        net = np.dot(Wh, h) + np.dot(Wx, x) + b
        gate = activator.forward(net)
        return gate

    def backward(self, x, delta_h, activator):
        '''
        LSTM training algorithm
        '''
        self.calc_delta(delta_h, activator)
        self.calc_gradient(x)

    def calc_delta(self, delta_h, activator):
        # Initialize the error terms at every time step
        self.delta_h_list = self.init_delta()   # output errors
        self.delta_o_list = self.init_delta()   # output gate errors
        self.delta_i_list = self.init_delta()   # input gate errors
        self.delta_f_list = self.init_delta()   # forget gate errors
        self.delta_ct_list = self.init_delta()  # candidate state errors
        # Save the error passed down from the layer above at the current time
        self.delta_h_list[-1] = delta_h
        # Compute the error term at every time step, going backwards
        for k in range(self.times, 0, -1):
            self.calc_delta_k(k)

    def calc_delta_k(self, k):
        '''
        Backpropagation of the error through time: given delta_h at time k,
        compute delta_f, delta_i, delta_o and delta_ct at time k, and
        delta_h at time k-1
        '''
        ig = self.i_list[k]
        og = self.o_list[k]
        fg = self.f_list[k]
        ct = self.ct_list[k]
        c = self.c_list[k]
        c_prev = self.c_list[k - 1]
        tanh_c = self.output_activator.forward(c)
        delta_k = self.delta_h_list[k]
        delta_o = (delta_k * tanh_c *
                   self.gate_activator.backward(og))
        delta_f = (delta_k * og * (1 - tanh_c * tanh_c) *
                   c_prev * self.gate_activator.backward(fg))
        delta_i = (delta_k * og * (1 - tanh_c * tanh_c) *
                   ct * self.gate_activator.backward(ig))
        delta_ct = (delta_k * og * (1 - tanh_c * tanh_c) *
                    ig * self.output_activator.backward(ct))
        delta_h_prev = (np.dot(delta_o.transpose(), self.Woh) +
                        np.dot(delta_i.transpose(), self.Wih) +
                        np.dot(delta_f.transpose(), self.Wfh) +
                        np.dot(delta_ct.transpose(), self.Wch)
                        ).transpose()
        # Save all of the delta values
        self.delta_h_list[k - 1] = delta_h_prev
        self.delta_f_list[k] = delta_f
        self.delta_i_list[k] = delta_i
        self.delta_o_list[k] = delta_o
        self.delta_ct_list[k] = delta_ct

    def init_delta(self):
        '''
        Initialize the error terms
        '''
        delta_list = []
        for i in range(self.times + 1):
            delta_list.append(np.zeros((self.state_width, 1)))
        return delta_list

    def calc_gradient(self, x):
        # Initialize the forget gate weight and bias gradients
        self.Wfh_grad, self.Wfx_grad, self.bf_grad = \
            self.init_weight_gradient_mat()
        # Initialize the input gate weight and bias gradients
        self.Wih_grad, self.Wix_grad, self.bi_grad = \
            self.init_weight_gradient_mat()
        # Initialize the output gate weight and bias gradients
        self.Woh_grad, self.Wox_grad, self.bo_grad = \
            self.init_weight_gradient_mat()
        # Initialize the cell state weight and bias gradients
        self.Wch_grad, self.Wcx_grad, self.bc_grad = \
            self.init_weight_gradient_mat()
        # Gradients with respect to the previous output h
        for t in range(self.times, 0, -1):
            # Gradients at each time step
            (Wfh_grad, bf_grad,
             Wih_grad, bi_grad,
             Woh_grad, bo_grad,
             Wch_grad, bc_grad) = self.calc_gradient_t(t)
            # The actual gradient is the sum over all time steps
            self.Wfh_grad += Wfh_grad
            self.bf_grad += bf_grad
            self.Wih_grad += Wih_grad
            self.bi_grad += bi_grad
            self.Woh_grad += Woh_grad
            self.bo_grad += bo_grad
            self.Wch_grad += Wch_grad
            self.bc_grad += bc_grad
        # Gradients with respect to the current input x
        xt = x.transpose()
        self.Wfx_grad = np.dot(self.delta_f_list[-1], xt)
        self.Wix_grad = np.dot(self.delta_i_list[-1], xt)
        self.Wox_grad = np.dot(self.delta_o_list[-1], xt)
        self.Wcx_grad = np.dot(self.delta_ct_list[-1], xt)

    def init_weight_gradient_mat(self):
        '''
        Initialize the weight gradient matrices
        '''
        Wh_grad = np.zeros((self.state_width, self.state_width))
        Wx_grad = np.zeros((self.state_width, self.input_width))
        b_grad = np.zeros((self.state_width, 1))
        return Wh_grad, Wx_grad, b_grad

    def calc_gradient_t(self, t):
        '''
        Compute the weight gradients at time step t
        '''
        h_prev = self.h_list[t - 1].transpose()
        Wfh_grad = np.dot(self.delta_f_list[t], h_prev)
        bf_grad = self.delta_f_list[t]
        Wih_grad = np.dot(self.delta_i_list[t], h_prev)
        bi_grad = self.delta_i_list[t]
        Woh_grad = np.dot(self.delta_o_list[t], h_prev)
        bo_grad = self.delta_o_list[t]
        Wch_grad = np.dot(self.delta_ct_list[t], h_prev)
        bc_grad = self.delta_ct_list[t]
        return Wfh_grad, bf_grad, Wih_grad, bi_grad, \
            Woh_grad, bo_grad, Wch_grad, bc_grad
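A short driver makes the intended call sequence for the implementations above concrete: run forward once per time step, then hand the error of the last step to backward. The widths, learning rate and random inputs below are illustrative only and are not part of the original code:

np.random.seed(0)
lstm = LstmLayer(input_width=3, state_width=2, learning_rate=1e-3)

xs = [np.random.random((3, 1)) for _ in range(2)]   # a two-step input sequence
for x in xs:
    lstm.forward(x)                                 # h_list / c_list grow by one entry per step

delta_h = np.ones((2, 1))                           # pretend error from the layer above
lstm.backward(xs[-1], delta_h, lstm.output_activator)
print(lstm.Wfh_grad)                                # gradient accumulated over all time steps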
class LstmLayer(object):
    def __init__(self, input_width, state_width, learning_rate):
        self.input_width = input_width
        self.state_width = state_width
        self.learning_rate = learning_rate
        self.gate_activator = SigmoidActivator()
        self.output_activator = TanhActivator()
        self.times = 0
        self.c_list = self.init_state_vec()
        self.h_list = self.init_state_vec()
        self.f_list = self.init_state_vec()
        self.i_list = self.init_state_vec()
        self.o_list = self.init_state_vec()
        self.ct_list = self.init_state_vec()
        self.Wfh, self.Wfx, self.bf = self.init_weight_mat()
        self.Wih, self.Wix, self.bi = self.init_weight_mat()
        self.Woh, self.Wox, self.bo = self.init_weight_mat()
        self.Wch, self.Wcx, self.bc = self.init_weight_mat()

    def init_state_vec(self):
        state_vec_list = []
        state_vec_list.append(np.zeros((self.state_width, 1)))
        return state_vec_list

    def init_weight_mat(self):
        Wh = np.random.uniform(-1e-4, 1e-4,
                               (self.state_width, self.state_width))
        Wx = np.random.uniform(-1e-4, 1e-4,
                               (self.state_width, self.input_width))
        b = np.zeros((self.state_width, 1))
        return Wh, Wx, b

    def forward(self, x):
        self.times += 1
        fg = self.calc_gate(x, self.Wfx, self.Wfh,
                            self.bf, self.gate_activator)
        self.f_list.append(fg)
        ig = self.calc_gate(x, self.Wix, self.Wih,
                            self.bi, self.gate_activator)
        self.i_list.append(ig)
        og = self.calc_gate(x, self.Wox, self.Woh,
                            self.bo, self.gate_activator)
        self.o_list.append(og)
        ct = self.calc_gate(x, self.Wcx, self.Wch,
                            self.bc, self.output_activator)
        self.ct_list.append(ct)
        c = fg * self.c_list[self.times - 1] + ig * ct
        self.c_list.append(c)
        h = og * self.output_activator.forward(c)
        self.h_list.append(h)

    def calc_gate(self, x, Wx, Wh, b, activator):
        h = self.h_list[self.times - 1]
        net = np.dot(Wh, h) + np.dot(Wx, x) + b
        gate = activator.forward(net)
        return gate

    def backward(self, x, delta_h, activator):
        self.calc_delta(delta_h, activator)
        self.calc_gradient(x)

    def update(self):
        self.Wfh -= self.learning_rate * self.Wfh_grad
        self.Wfx -= self.learning_rate * self.Wfx_grad
        self.bf -= self.learning_rate * self.bf_grad
        self.Wih -= self.learning_rate * self.Wih_grad
        self.Wix -= self.learning_rate * self.Wix_grad
        self.bi -= self.learning_rate * self.bi_grad
        self.Woh -= self.learning_rate * self.Woh_grad
        self.Wox -= self.learning_rate * self.Wox_grad
        self.bo -= self.learning_rate * self.bo_grad
        self.Wch -= self.learning_rate * self.Wch_grad
        self.Wcx -= self.learning_rate * self.Wcx_grad
        self.bc -= self.learning_rate * self.bc_grad

    def calc_delta(self, delta_h, activator):
        self.delta_h_list = self.init_delta()
        self.delta_o_list = self.init_delta()
        self.delta_i_list = self.init_delta()
        self.delta_f_list = self.init_delta()
        self.delta_ct_list = self.init_delta()
        self.delta_h_list[-1] = delta_h
        for k in range(self.times, 0, -1):
            self.calc_delta_k(k)

    def init_delta(self):
        delta_list = []
        for i in range(self.times + 1):
            delta_list.append(np.zeros((self.state_width, 1)))
        return delta_list

    def calc_delta_k(self, k):
        ig = self.i_list[k]
        og = self.o_list[k]
        fg = self.f_list[k]
        ct = self.ct_list[k]
        c = self.c_list[k]
        c_prev = self.c_list[k - 1]
        tanh_c = self.output_activator.forward(c)
        delta_k = self.delta_h_list[k]
        delta_o = (delta_k * tanh_c *
                   self.gate_activator.backward(og))
        delta_f = (delta_k * og * (1 - tanh_c * tanh_c) *
                   c_prev * self.gate_activator.backward(fg))
        delta_i = (delta_k * og * (1 - tanh_c * tanh_c) *
                   ct * self.gate_activator.backward(ig))
        delta_ct = (delta_k * og * (1 - tanh_c * tanh_c) *
                    ig * self.output_activator.backward(ct))
        delta_h_prev = (np.dot(delta_o.transpose(), self.Woh) +
                        np.dot(delta_i.transpose(), self.Wih) +
                        np.dot(delta_f.transpose(), self.Wfh) +
                        np.dot(delta_ct.transpose(), self.Wch)
                        ).transpose()
        self.delta_h_list[k - 1] = delta_h_prev
        self.delta_f_list[k] = delta_f
        self.delta_i_list[k] = delta_i
        self.delta_o_list[k] = delta_o
        self.delta_ct_list[k] = delta_ct

    def calc_gradient(self, x):
        self.Wfh_grad, self.Wfx_grad, self.bf_grad = \
            self.init_weight_gradient_mat()
        self.Wih_grad, self.Wix_grad, self.bi_grad = \
            self.init_weight_gradient_mat()
        self.Woh_grad, self.Wox_grad, self.bo_grad = \
            self.init_weight_gradient_mat()
        self.Wch_grad, self.Wcx_grad, self.bc_grad = \
            self.init_weight_gradient_mat()
        for t in range(self.times, 0, -1):
            (Wfh_grad, bf_grad,
             Wih_grad, bi_grad,
             Woh_grad, bo_grad,
             Wch_grad, bc_grad) = self.calc_gradient_t(t)
            self.Wfh_grad += Wfh_grad
            self.bf_grad += bf_grad
            self.Wih_grad += Wih_grad
            self.bi_grad += bi_grad
            self.Woh_grad += Woh_grad
            self.bo_grad += bo_grad
            self.Wch_grad += Wch_grad
            self.bc_grad += bc_grad
        xt = x.transpose()
        self.Wfx_grad = np.dot(self.delta_f_list[-1], xt)
        self.Wix_grad = np.dot(self.delta_i_list[-1], xt)
        self.Wox_grad = np.dot(self.delta_o_list[-1], xt)
        self.Wcx_grad = np.dot(self.delta_ct_list[-1], xt)

    def init_weight_gradient_mat(self):
        Wh_grad = np.zeros((self.state_width, self.state_width))
        Wx_grad = np.zeros((self.state_width, self.input_width))
        b_grad = np.zeros((self.state_width, 1))
        return Wh_grad, Wx_grad, b_grad

    def calc_gradient_t(self, t):
        h_prev = self.h_list[t - 1].transpose()
        Wfh_grad = np.dot(self.delta_f_list[t], h_prev)
        bf_grad = self.delta_f_list[t]
        Wih_grad = np.dot(self.delta_i_list[t], h_prev)
        bi_grad = self.delta_i_list[t]
        Woh_grad = np.dot(self.delta_o_list[t], h_prev)
        bo_grad = self.delta_o_list[t]
        Wch_grad = np.dot(self.delta_ct_list[t], h_prev)
        bc_grad = self.delta_ct_list[t]
        return Wfh_grad, bf_grad, Wih_grad, bi_grad, \
            Woh_grad, bo_grad, Wch_grad, bc_grad

    def reset_state(self):
        self.times = 0
        self.c_list = self.init_state_vec()
        self.h_list = self.init_state_vec()
        self.f_list = self.init_state_vec()
        self.i_list = self.init_state_vec()
        self.o_list = self.init_state_vec()
        self.ct_list = self.init_state_vec()

    def __str__(self):
        result = 'Wfh:\n%s\nWfx:\n%s\nbf:\n%s\n' % (self.Wfh, self.Wfx, self.bf)
        result += 'Wih:\n%s\nWix:\n%s\nbi:\n%s\n' % (self.Wih, self.Wix, self.bi)
        result += 'Woh:\n%s\nWox:\n%s\nbo:\n%s\n' % (self.Woh, self.Wox, self.bo)
        result += 'Wch:\n%s\nWcx:\n%s\nbc:\n%s\n' % (self.Wch, self.Wcx, self.bc)
        result += 'Wfh_grad:\n%s\nWfx_grad:\n%s\nbf_grad:\n%s\n' % (
            self.Wfh_grad, self.Wfx_grad, self.bf_grad)
        result += 'Wih_grad:\n%s\nWix_grad:\n%s\nbi_grad:\n%s\n' % (
            self.Wih_grad, self.Wix_grad, self.bi_grad)
        result += 'Woh_grad:\n%s\nWox_grad:\n%s\nbo_grad:\n%s\n' % (
            self.Woh_grad, self.Wox_grad, self.bo_grad)
        result += 'Wch_grad:\n%s\nWcx_grad:\n%s\nbc_grad:\n%s\n' % (
            self.Wch_grad, self.Wcx_grad, self.bc_grad)
        return result
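This variant adds a __str__ method so every weight matrix and its most recent gradient can be dumped for inspection. Note that the gradient attributes only exist after backward has been called; a tiny, purely illustrative check:

layer = LstmLayer(3, 2, 1e-3)        # arbitrary widths and learning rate
x = np.ones((3, 1))
layer.forward(x)
layer.backward(x, np.ones((2, 1)), layer.output_activator)
print(layer)                         # weights followed by their gradients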
class LstmLayer(object):
    def __init__(self, input_width, state_width, learning_rate):
        self.input_width = input_width
        self.state_width = state_width
        self.learning_rate = learning_rate
        # Activation function for the gates
        self.gate_activator = SigmoidActivator()
        # Activation function for the output
        self.output_activator = TanhActivator()
        # Current time step, initialized to t0
        self.times = 0
        # Cell state vectors c at each time step
        self.c_list = self.init_state_vec()
        # Output vectors h at each time step
        self.h_list = self.init_state_vec()
        # Forget gate f at each time step
        self.f_list = self.init_state_vec()
        # Input gate i at each time step
        self.i_list = self.init_state_vec()
        # Output gate o at each time step
        self.o_list = self.init_state_vec()
        # Candidate state c~ at each time step
        self.ct_list = self.init_state_vec()
        # Forget gate weight matrices Wfh, Wfx and bias bf
        self.Wfh, self.Wfx, self.bf = self.init_weight_mat()
        # Input gate weight matrices Wih, Wix and bias bi
        self.Wih, self.Wix, self.bi = self.init_weight_mat()
        # Output gate weight matrices Woh, Wox and bias bo
        self.Woh, self.Wox, self.bo = self.init_weight_mat()
        # Cell state weight matrices Wch, Wcx and bias bc
        self.Wch, self.Wcx, self.bc = self.init_weight_mat()

    ## Initialize the lists that store the intermediate state vectors
    def init_state_vec(self):
        '''
        Initialize a list that stores the state vectors
        '''
        state_vec_list = []
        state_vec_list.append(np.zeros((self.state_width, 1)))
        return state_vec_list

    ## Initialize the weight matrices
    def init_weight_mat(self):
        '''
        Initialize one pair of weight matrices and a bias vector
        '''
        Wh = np.random.uniform(-1e-4, 1e-4,
                               (self.state_width, self.state_width))
        Wx = np.random.uniform(-1e-4, 1e-4,
                               (self.state_width, self.input_width))
        b = np.zeros((self.state_width, 1))
        return Wh, Wx, b

    ## forward implements the LSTM forward pass
    def forward(self, x):
        '''
        Forward pass, following Eq. 1 through Eq. 6
        '''
        self.times += 1
        # Forget gate
        fg = self.calc_gate(x, self.Wfx, self.Wfh,
                            self.bf, self.gate_activator)
        self.f_list.append(fg)
        # Input gate
        ig = self.calc_gate(x, self.Wix, self.Wih,
                            self.bi, self.gate_activator)
        self.i_list.append(ig)
        # Output gate
        og = self.calc_gate(x, self.Wox, self.Woh,
                            self.bo, self.gate_activator)
        self.o_list.append(og)
        # Candidate state
        ct = self.calc_gate(x, self.Wcx, self.Wch,
                            self.bc, self.output_activator)
        self.ct_list.append(ct)
        # Cell state: Ct = Ft * Ct-1 + It * c~t
        c = fg * self.c_list[self.times - 1] + ig * ct
        self.c_list.append(c)
        # Output: Ht = Ot * tanh(Ct)
        h = og * self.output_activator.forward(c)
        self.h_list.append(h)

    ## Every gate is computed with the same algorithm, and the gates and c~t differ only
    ## in the activation function, so the shared logic is factored into calc_gate, which
    ## removes a lot of duplicated code.
    def calc_gate(self, x, Wx, Wh, b, activator):
        '''
        Compute one gate
        '''
        h = self.h_list[self.times - 1]  # previous LSTM output
        net = np.dot(Wh, h) + np.dot(Wx, x) + b
        gate = activator.forward(net)
        return gate

    ## backward implements the LSTM backpropagation algorithm. Note that the internal state
    ## used by backward is only initialized once backward is called. One benefit of this lazy
    ## initialization is that an LSTM used only for inference never allocates these variables,
    ## which saves a lot of memory.
    def backward(self, x, delta_h, activator):
        '''
        LSTM training algorithm, in two parts:
        STEP 1: compute the error terms
        STEP 2: compute the gradients
        '''
        self.calc_delta(delta_h, activator)
        self.calc_gradient(x)

    ## Gradient descent weight update
    def update(self):
        '''
        Update the weights by gradient descent
        '''
        self.Wfh -= self.learning_rate * self.Wfh_grad
        self.Wfx -= self.learning_rate * self.Wfx_grad
        self.bf -= self.learning_rate * self.bf_grad
        self.Wih -= self.learning_rate * self.Wih_grad
        self.Wix -= self.learning_rate * self.Wix_grad
        self.bi -= self.learning_rate * self.bi_grad
        self.Woh -= self.learning_rate * self.Woh_grad
        self.Wox -= self.learning_rate * self.Wox_grad
        self.bo -= self.learning_rate * self.bo_grad
        self.Wch -= self.learning_rate * self.Wch_grad
        self.Wcx -= self.learning_rate * self.Wcx_grad
        self.bc -= self.learning_rate * self.bc_grad

    ## STEP 1: compute the error terms
    def calc_delta(self, delta_h, activator):
        # Initialize the error terms at every time step
        self.delta_h_list = self.init_delta()   # output errors
        self.delta_o_list = self.init_delta()   # output gate errors
        self.delta_i_list = self.init_delta()   # input gate errors
        self.delta_f_list = self.init_delta()   # forget gate errors
        self.delta_ct_list = self.init_delta()  # candidate state errors
        # Save the error passed down from the layer above at the current time step:
        # reassign the last entry to delta_h
        self.delta_h_list[-1] = delta_h
        # Compute the error term at every time step, going backwards
        for k in range(self.times, 0, -1):  # k runs over [times, times-1, ..., 1]
            self.calc_delta_k(k)

    ## Initialize the error-term lists
    def init_delta(self):
        '''
        Initialize the error terms to all zeros
        '''
        delta_list = []
        for i in range(self.times + 1):
            delta_list.append(np.zeros((self.state_width, 1)))
        return delta_list

    ## Compute the error terms at time step k
    def calc_delta_k(self, k):
        '''
        Given delta_h at time k, compute delta_f, delta_i, delta_o and delta_ct
        at time k, and delta_h at time k-1
        '''
        # Values saved by the forward pass at time k
        ig = self.i_list[k]
        og = self.o_list[k]
        fg = self.f_list[k]
        ct = self.ct_list[k]
        c = self.c_list[k]
        c_prev = self.c_list[k - 1]
        tanh_c = self.output_activator.forward(c)
        delta_k = self.delta_h_list[k]
        # delta_o, delta_f, delta_i, delta_ct per Eq. 9 through Eq. 12
        delta_o = (delta_k * tanh_c *
                   self.gate_activator.backward(og))
        delta_f = (delta_k * og * (1 - tanh_c * tanh_c) *
                   c_prev * self.gate_activator.backward(fg))
        delta_i = (delta_k * og * (1 - tanh_c * tanh_c) *
                   ct * self.gate_activator.backward(ig))
        delta_ct = (delta_k * og * (1 - tanh_c * tanh_c) *
                    ig * self.output_activator.backward(ct))
        # delta_h[k-1] per Eq. 8
        delta_h_prev = (np.dot(delta_o.transpose(), self.Woh) +
                        np.dot(delta_i.transpose(), self.Wih) +
                        np.dot(delta_f.transpose(), self.Wfh) +
                        np.dot(delta_ct.transpose(), self.Wch)
                        ).transpose()
        # Save all of the delta values
        self.delta_h_list[k - 1] = delta_h_prev
        self.delta_f_list[k] = delta_f
        self.delta_i_list[k] = delta_i
        self.delta_o_list[k] = delta_o
        self.delta_ct_list[k] = delta_ct

    ## STEP 2: compute the gradients
    def calc_gradient(self, x):
        # Initialize the forget gate weight and bias gradients
        self.Wfh_grad, self.Wfx_grad, self.bf_grad = \
            self.init_weight_gradient_mat()
        # Initialize the input gate weight and bias gradients
        self.Wih_grad, self.Wix_grad, self.bi_grad = \
            self.init_weight_gradient_mat()
        # Initialize the output gate weight and bias gradients
        self.Woh_grad, self.Wox_grad, self.bo_grad = \
            self.init_weight_gradient_mat()
        # Initialize the cell state weight and bias gradients
        self.Wch_grad, self.Wcx_grad, self.bc_grad = \
            self.init_weight_gradient_mat()
        # Gradients with respect to the previous output h
        for t in range(self.times, 0, -1):
            # Gradients at each time step
            (Wfh_grad, bf_grad,
             Wih_grad, bi_grad,
             Woh_grad, bo_grad,
             Wch_grad, bc_grad) = self.calc_gradient_t(t)
            # The actual gradient is the sum over all time steps
            self.Wfh_grad += Wfh_grad
            self.bf_grad += bf_grad
            self.Wih_grad += Wih_grad
            self.bi_grad += bi_grad
            self.Woh_grad += Woh_grad
            self.bo_grad += bo_grad
            self.Wch_grad += Wch_grad
            self.bc_grad += bc_grad
        # Gradients with respect to the current input x
        xt = x.transpose()
        self.Wfx_grad = np.dot(self.delta_f_list[-1], xt)
        self.Wix_grad = np.dot(self.delta_i_list[-1], xt)
        self.Wox_grad = np.dot(self.delta_o_list[-1], xt)
        self.Wcx_grad = np.dot(self.delta_ct_list[-1], xt)

    ## Initialize the weight gradient matrices
    def init_weight_gradient_mat(self):
        '''
        Initialize the weight gradient matrices
        '''
        Wh_grad = np.zeros((self.state_width, self.state_width))
        Wx_grad = np.zeros((self.state_width, self.input_width))
        b_grad = np.zeros((self.state_width, 1))
        return Wh_grad, Wx_grad, b_grad

    ## Compute the weight gradients at each time step t
    def calc_gradient_t(self, t):
        '''
        Compute the weight gradients at time step t
        '''
        h_prev = self.h_list[t - 1].transpose()
        Wfh_grad = np.dot(self.delta_f_list[t], h_prev)
        bf_grad = self.delta_f_list[t]
        Wih_grad = np.dot(self.delta_i_list[t], h_prev)
        bi_grad = self.delta_i_list[t]
        Woh_grad = np.dot(self.delta_o_list[t], h_prev)
        bo_grad = self.delta_o_list[t]
        Wch_grad = np.dot(self.delta_ct_list[t], h_prev)
        bc_grad = self.delta_ct_list[t]
        return Wfh_grad, bf_grad, Wih_grad, bi_grad, \
            Woh_grad, bo_grad, Wch_grad, bc_grad

    ## As with RecurrentLayer, to support gradient checking we need to be able to
    ## reset the internal state:
    def reset_state(self):
        # Reset the current time step to t0
        self.times = 0
        # Cell state vectors c at each time step
        self.c_list = self.init_state_vec()
        # Output vectors h at each time step
        self.h_list = self.init_state_vec()
        # Forget gate f at each time step
        self.f_list = self.init_state_vec()
        # Input gate i at each time step
        self.i_list = self.init_state_vec()
        # Output gate o at each time step
        self.o_list = self.init_state_vec()
        # Candidate state c~ at each time step
        self.ct_list = self.init_state_vec()
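The comment above ties reset_state to gradient checking: to probe a weight numerically, the whole sequence has to be replayed from t0 after each perturbation. Below is a sketch of such a check for Wfh, assuming a simple sum-of-outputs error function; the error function, the widths and the random inputs are illustrative only:

def gradient_check():
    # Error E = sum of the last output h, so dE/dh is a vector of ones.
    error_function = lambda o: o.sum()

    lstm = LstmLayer(3, 2, 1e-3)
    xs = [np.random.random((3, 1)), np.random.random((3, 1))]
    for x in xs:
        lstm.forward(x)
    sensitivity = np.ones((2, 1), dtype=np.float64)
    lstm.backward(xs[-1], sensitivity, lstm.output_activator)

    epsilon = 1e-4
    for i in range(lstm.Wfh.shape[0]):
        for j in range(lstm.Wfh.shape[1]):
            # Perturb one weight, replay the sequence, and measure the error
            lstm.Wfh[i, j] += epsilon
            lstm.reset_state()
            for x in xs:
                lstm.forward(x)
            err1 = error_function(lstm.h_list[-1])
            lstm.Wfh[i, j] -= 2 * epsilon
            lstm.reset_state()
            for x in xs:
                lstm.forward(x)
            err2 = error_function(lstm.h_list[-1])
            expect_grad = (err1 - err2) / (2 * epsilon)
            lstm.Wfh[i, j] += epsilon            # restore the weight
            print('Wfh(%d,%d): expected %.4e, actual %.4e' % (
                i, j, expect_grad, lstm.Wfh_grad[i, j]))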
class LstmLayer():
    def __init__(self, input_width, state_width, output_width,
                 learning_rate, penaltyL2, momentum):
        self.input_width = input_width
        self.state_width = state_width
        self.output_width = output_width
        self.learning_rate = learning_rate
        self.penaltyL2 = penaltyL2
        self.momentum = momentum
        # Activation function for the gates
        self.gate_activator = SigmoidActivator()
        # Activation function for the output
        self.output_activator = TanhActivator()
        self.class_activator = SoftmaxActivator()
        # Forget gate weights Wfh, Wfx, bias bf, plus their momentum buffers
        self.Wfh, self.Wfx, self.bf, self.vWfh, self.vWfx, self.vbf = \
            self.init_weight_mat(0)
        # Input gate weights Wih, Wix, bias bi
        self.Wih, self.Wix, self.bi, self.vWih, self.vWix, self.vbi = \
            self.init_weight_mat(0)
        # Output gate weights Woh, Wox, bias bo
        self.Woh, self.Wox, self.bo, self.vWoh, self.vWox, self.vbo = \
            self.init_weight_mat(0)
        # Cell state weights Wch, Wcx, bias bc
        self.Wch, self.Wcx, self.bc, self.vWch, self.vWcx, self.vbc = \
            self.init_weight_mat(0)
        # Output layer weights Wy and bias by
        self.Wy, self.by, self.vWy, self.vby = self.init_weight_mat(1)

    def init_weight_mat(self, i):
        '''
        Initialize weight matrices (i < 1: gate weights, otherwise the output layer)
        '''
        if i < 1:
            Wh = np.mat(np.random.uniform(-0.5, 0.5,
                        (self.state_width, self.state_width))) / self.state_width
            vWh = np.mat(np.zeros(Wh.shape))
            Wx = np.mat(np.random.uniform(-0.5, 0.5,
                        (self.state_width, self.input_width))) / self.input_width
            vWx = np.mat(np.zeros(Wx.shape))
            b = np.mat(np.random.uniform(-0.5, 0.5,
                       (self.state_width, 1))) / self.state_width
            vb = np.mat(np.zeros(b.shape))
            return Wh, Wx, b, vWh, vWx, vb
        else:
            Wy = np.mat(np.random.uniform(-0.5, 0.5,
                        (self.output_width, self.state_width))) / self.output_width
            vWy = np.mat(np.zeros(Wy.shape))
            by = np.mat(np.random.uniform(-0.5, 0.5,
                        (self.output_width, 1))) / self.output_width
            vby = np.mat(np.zeros(by.shape))
            return Wy, by, vWy, vby

    def forward(self, x):
        '''
        Forward pass following Eq. 1 through Eq. 6, over a whole sequence x
        (one row per time step)
        '''
        self.x = x
        self.reset_state()
        n = x.shape[0]
        for i in range(n):
            # Forget gate
            fg = self.calc_gate(x[i], self.Wfx, self.Wfh,
                                self.bf, self.gate_activator, i)
            self.f_list[i] = fg
            # Input gate
            ig = self.calc_gate(x[i], self.Wix, self.Wih,
                                self.bi, self.gate_activator, i)
            self.i_list[i] = ig
            # Output gate
            og = self.calc_gate(x[i], self.Wox, self.Woh,
                                self.bo, self.gate_activator, i)
            self.o_list[i] = og
            # Candidate state
            ct = self.calc_gate(x[i], self.Wcx, self.Wch,
                                self.bc, self.output_activator, i)
            self.ct_list[i] = ct
            # Cell state
            if i == 0:
                c = np.multiply(ig, ct)
            else:
                c = np.multiply(fg, self.c_list[i - 1]) + np.multiply(ig, ct)
            self.c_list[i] = c
            # Output
            h = np.multiply(og, self.output_activator.forward(c))
            self.h_list[i] = h
            y = self.class_activator.forward(self.Wy * h.T + self.by)
            self.y_list[i] = y.T

    def reset_state(self):
        # Cell state vectors c at each time step
        self.c_list = np.mat(np.zeros((self.x.shape[0], self.state_width)))
        # Output vectors h at each time step
        self.h_list = np.mat(np.zeros((self.x.shape[0], self.state_width)))
        # Forget gate f at each time step
        self.f_list = np.mat(np.zeros((self.x.shape[0], self.state_width)))
        # Input gate i at each time step
        self.i_list = np.mat(np.zeros((self.x.shape[0], self.state_width)))
        # Output gate o at each time step
        self.o_list = np.mat(np.zeros((self.x.shape[0], self.state_width)))
        # Candidate state c~ at each time step
        self.ct_list = np.mat(np.zeros((self.x.shape[0], self.state_width)))
        self.y_list = np.mat(np.zeros((self.x.shape[0], self.output_width)))

    def calc_gate(self, x, Wx, Wh, b, activator, i):
        '''
        Compute one gate at time step i
        '''
        if i == 0:
            h = np.mat(np.zeros((1, self.state_width)))
        else:
            h = self.h_list[i - 1]  # previous LSTM output
        net = (Wh * h.T + Wx * x.T + b).T
        gate = activator.forward(net)
        return gate

    def backward(self, e):
        '''
        LSTM training step: error terms, gradients, then a weight update
        '''
        self.e = e
        self.calc_delta()
        self.calc_gradient()
        self.update()

    def update(self):
        '''
        Update the weights by gradient descent with momentum and an L2 penalty
        (the first column of the input weights is excluded from the penalty)
        '''
        self.vWfh = self.momentum * self.vWfh - self.learning_rate * \
            (self.Wfh_grad + self.penaltyL2 * self.Wfh)
        self.vWfx = self.momentum * self.vWfx - self.learning_rate * \
            (self.Wfx_grad + self.penaltyL2 *
             np.concatenate((np.mat(np.zeros((self.Wfx.shape[0], 1))),
                             self.Wfx[:, 1:]), axis=1))
        self.vbf = self.momentum * self.vbf - self.learning_rate * self.bf_grad
        self.vWih = self.momentum * self.vWih - self.learning_rate * \
            (self.Wih_grad + self.penaltyL2 * self.Wih)
        self.vWix = self.momentum * self.vWix - self.learning_rate * \
            (self.Wix_grad + self.penaltyL2 *
             np.concatenate((np.mat(np.zeros((self.Wix.shape[0], 1))),
                             self.Wix[:, 1:]), axis=1))
        self.vbi = self.momentum * self.vbi - self.learning_rate * self.bi_grad
        self.vWoh = self.momentum * self.vWoh - self.learning_rate * \
            (self.Woh_grad + self.penaltyL2 * self.Woh)
        self.vWox = self.momentum * self.vWox - self.learning_rate * \
            (self.Wox_grad + self.penaltyL2 *
             np.concatenate((np.mat(np.zeros((self.Wox.shape[0], 1))),
                             self.Wox[:, 1:]), axis=1))
        self.vbo = self.momentum * self.vbo - self.learning_rate * self.bo_grad
        self.vWch = self.momentum * self.vWch - self.learning_rate * \
            (self.Wch_grad + self.penaltyL2 * self.Wch)
        self.vWcx = self.momentum * self.vWcx - self.learning_rate * \
            (self.Wcx_grad + self.penaltyL2 *
             np.concatenate((np.mat(np.zeros((self.Wcx.shape[0], 1))),
                             self.Wcx[:, 1:]), axis=1))
        self.vbc = self.momentum * self.vbc - self.learning_rate * self.bc_grad
        self.vWy = self.momentum * self.vWy - self.learning_rate * \
            (self.Wy_grad + self.penaltyL2 * self.Wy)
        self.vby = self.momentum * self.vby - self.learning_rate * self.by_grad
        self.Wfh += self.vWfh
        self.Wfx += self.vWfx
        self.bf += self.vbf
        self.Wih += self.vWih
        self.Wix += self.vWix
        self.bi += self.vbi
        self.Woh += self.vWoh
        self.Wox += self.vWox
        self.bo += self.vbo
        self.Wch += self.vWch
        self.Wcx += self.vWcx
        self.bc += self.vbc
        self.Wy += self.vWy
        self.by += self.vby

    def calc_delta(self):
        # Initialize the error terms at every time step
        self.delta_h_list = self.init_delta()   # output errors
        self.delta_o_list = self.init_delta()   # output gate errors
        self.delta_i_list = self.init_delta()   # input gate errors
        self.delta_f_list = self.init_delta()   # forget gate errors
        self.delta_ct_list = self.init_delta()  # candidate state errors
        self.delta_c_list = self.init_delta()   # cell state errors
        self.delta_h_list[-1] = self.e[-1] * self.Wy
        a = self.output_activator.backward(
            self.output_activator.forward(self.c_list[-1]))
        # delta_c at the last step: dE/dh (.) o (.) tanh'(c)
        self.delta_c_list[-1] = np.multiply(
            np.multiply(self.delta_h_list[-1], self.o_list[-1]), a)
        m = self.e.shape[0]
        for k in range(m - 1, 0, -1):
            self.calc_delta_k(k)

    def init_delta(self):
        '''
        Initialize the error terms
        '''
        delta_list = np.mat(np.zeros((self.e.shape[0], self.state_width)))
        return delta_list

    def calc_delta_k(self, k):
        '''
        Given delta_h at time k, compute delta_f, delta_i, delta_o and
        delta_ct at time k, and delta_h at time k-1
        '''
        # Values saved by the forward pass at time k
        ig = self.i_list[k]
        og = self.o_list[k]
        fg = self.f_list[k]
        ct = self.ct_list[k]
        c = self.c_list[k]
        c_prev = self.c_list[k - 1]
        tan_c = self.output_activator.forward(c)
        delta_h = self.delta_h_list[k]
        delta_c = self.delta_c_list[k]
        delta_y = self.e[k - 1]
        # delta_o, per Eq. 9
        gate_o = np.multiply(tan_c, self.gate_activator.backward(og))
        delta_o = np.multiply(delta_h, gate_o)
        gate_f = np.multiply(c_prev, self.gate_activator.backward(fg))
        delta_f = np.multiply(delta_c, gate_f)
        gate_i = np.multiply(ct, self.gate_activator.backward(ig))
        delta_i = np.multiply(delta_c, gate_i)
        gate_c = np.multiply(ig, self.output_activator.backward(ct))
        delta_ct = np.multiply(delta_c, gate_c)
        delc = np.multiply(og, self.output_activator.backward(tan_c))
        delta_h_prev = np.multiply(
            delta_h,
            (gate_o * self.Woh +
             np.multiply(gate_i, delc) * self.Wih +
             np.multiply(gate_f, delc) * self.Wfh +
             np.multiply(gate_c, delc) * self.Wch)) + delta_y * self.Wy
        delc1 = np.multiply(self.o_list[k - 1],
                            self.output_activator.backward(
                                self.output_activator.forward(c_prev)))
        delta_c_prev = np.multiply(delta_c, fg) + np.multiply(delta_h_prev, delc1)
        # Save all of the delta values
        self.delta_h_list[k - 1] = delta_h_prev
        self.delta_c_list[k - 1] = delta_c_prev
        self.delta_f_list[k] = delta_f
        self.delta_i_list[k] = delta_i
        self.delta_o_list[k] = delta_o
        self.delta_ct_list[k] = delta_ct

    def calc_gradient(self):
        # Initialize the forget gate weight and bias gradients
        Wfh_grad, Wfx_grad, bf_grad = self.init_weight_gradient_mat(0)
        # Initialize the input gate weight and bias gradients
        Wih_grad, Wix_grad, bi_grad = self.init_weight_gradient_mat(0)
        # Initialize the output gate weight and bias gradients
        Woh_grad, Wox_grad, bo_grad = self.init_weight_gradient_mat(0)
        # Initialize the cell state weight and bias gradients
        Wch_grad, Wcx_grad, bc_grad = self.init_weight_gradient_mat(0)
        Wy_grad, by_grad = self.init_weight_gradient_mat(1)
        m = self.e.shape[0]
        # Accumulate the gradients over the time steps
        for t in range(m - 1, 0, -1):
            h = self.h_list[t]
            h_prev = self.h_list[t - 1]
            x = self.x[t]
            Wfh_grad += self.delta_f_list[t].T * h_prev
            Wfx_grad += self.delta_f_list[t].T * x
            bf_grad += self.delta_f_list[t].T
            Wih_grad += self.delta_i_list[t].T * h_prev
            Wix_grad += self.delta_i_list[t].T * x
            bi_grad += self.delta_i_list[t].T
            Woh_grad += self.delta_o_list[t].T * h_prev
            Wox_grad += self.delta_o_list[t].T * x
            bo_grad += self.delta_o_list[t].T
            Wch_grad += self.delta_ct_list[t].T * h_prev
            Wcx_grad += self.delta_ct_list[t].T * x
            bc_grad += self.delta_ct_list[t].T
            Wy_grad += self.e[t].T * h
            by_grad += self.e[t].T
        self.Wfh_grad = Wfh_grad / (m - 1)
        self.Wfx_grad = Wfx_grad / m
        self.bf_grad = bf_grad / m
        self.Wih_grad = Wih_grad / (m - 1)
        self.Wix_grad = Wix_grad / m
        self.bi_grad = bi_grad / m
        self.Woh_grad = Woh_grad / (m - 1)
        self.Wox_grad = Wox_grad / m
        self.bo_grad = bo_grad / m
        self.Wch_grad = Wch_grad / (m - 1)
        self.Wcx_grad = Wcx_grad / m
        self.bc_grad = bc_grad / m
        self.Wy_grad = Wy_grad / m
        self.by_grad = by_grad / m

    def init_weight_gradient_mat(self, i):
        '''
        Initialize the weight gradient matrices
        '''
        if i < 1:
            Wh_grad = np.mat(np.zeros((self.state_width, self.state_width)))
            Wx_grad = np.mat(np.zeros((self.state_width, self.input_width)))
            b_grad = np.mat(np.zeros((self.state_width, 1)))
            return Wh_grad, Wx_grad, b_grad
        else:
            Wy_grad = np.mat(np.zeros((self.output_width, self.state_width)))
            by_grad = np.mat(np.zeros((self.output_width, 1)))
            return Wy_grad, by_grad
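This last variant also references a SoftmaxActivator for the classification output, which is not defined in this section either. Its backward is never called, because the caller passes the already-computed output error e straight into backward, so a minimal sketch consistent with how it is used only needs the forward direction:

class SoftmaxActivator(object):
    def forward(self, weighted_input):
        # Subtract the max for numerical stability before exponentiating
        exps = np.exp(weighted_input - np.max(weighted_input))
        return exps / exps.sum()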