def rnn_step_backward(self, dnext_h, cache):
    """
    Backward pass for a single timestep of a vanilla RNN.

    Inputs:
    - dnext_h: Gradient of loss with respect to next hidden state, of shape (N, H)
    - cache: Cache object from the forward pass

    Returns a tuple of:
    - dx: Gradients of input data, of shape (N, D)
    - dprev_h: Gradients of previous hidden state, of shape (N, H)
    - dWx: Gradients of input-to-hidden weights, of shape (D, H)
    - dWh: Gradients of hidden-to-hidden weights, of shape (H, H)
    - db: Gradients of bias vector, of shape (H,)
    """
    ##############################################################################
    # Backward pass for a single step of a vanilla RNN. The local derivative of  #
    # tanh is computed from the cached output value (dtanh = 1 - next_h**2).     #
    ##############################################################################
    x, prev_h, Wx, Wh, dtanh = cache

    dz = dnext_h * dtanh
    dx = Tools.matmul(dz, Wx.T)
    dprev_h = Tools.matmul(dz, Wh.T)
    dWx = Tools.matmul(x.T, dz)
    dWh = Tools.matmul(prev_h.T, dz)
    db = np.sum(dz, axis=0)

    return dx, dprev_h, dWx, dWh, db
def loss(y, y_, n):
    # Softmax + cross-entropy loss, accuracy, and the gradient w.r.t. the logits.
    y_argmax = np.argmax(y, axis=1)
    softmax_y = Tools.softmax(y)
    acc = np.mean(y_argmax == y_)

    # loss
    correct_logprobs = Tools.crossEntropy(softmax_y, y_)
    data_loss = np.sum(correct_logprobs) / n

    # delta: gradient of the mean cross-entropy loss w.r.t. the logits
    softmax_y[range(n), y_] -= 1
    delta = softmax_y / n

    return data_loss, delta, acc
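# A standalone NumPy sketch (independent of the Tools helpers above) of the gradient
# trick used in loss(): subtracting 1 from the true-class softmax probability and
# dividing by n gives the gradient of the mean cross-entropy loss w.r.t. the logits.
# It assumes Tools.crossEntropy returns per-sample negative log-probabilities of the
# true class; the repo's exact definition may differ.
import numpy as np

def softmax(v):
    e = np.exp(v - np.max(v, axis=1, keepdims=True))
    return e / np.sum(e, axis=1, keepdims=True)

def ce_loss(y, labels, n):
    p = softmax(y)
    data_loss = -np.sum(np.log(p[np.arange(n), labels])) / n
    p[np.arange(n), labels] -= 1              # same in-place trick as above
    return data_loss, p / n

rng = np.random.RandomState(0)
n, k = 4, 5
y = rng.randn(n, k)
labels = rng.randint(0, k, size=n)
_, delta = ce_loss(y, labels, n)

# central-difference check of one entry of delta
eps, idx = 1e-6, (2, 3)
yp, ym = y.copy(), y.copy()
yp[idx] += eps
ym[idx] -= eps
num = (ce_loss(yp, labels, n)[0] - ce_loss(ym, labels, n)[0]) / (2 * eps)
print(abs(num - delta[idx]))                  # expect ~1e-10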
def lstm_step_backward(self, dnext_h, dnext_c, cache):
    """
    Backward pass for a single timestep of an LSTM.

    Inputs:
    - dnext_h: Gradients of next hidden state, of shape (N, H)
    - dnext_c: Gradients of next cell state, of shape (N, H)
    - cache: Values from the forward pass

    Returns a tuple of:
    - dx: Gradient of input data, of shape (N, D)
    - dprev_h: Gradient of previous hidden state, of shape (N, H)
    - dprev_c: Gradient of previous cell state, of shape (N, H)
    - dWx: Gradient of input-to-hidden weights, of shape (D, 4H)
    - dWh: Gradient of hidden-to-hidden weights, of shape (H, 4H)
    - db: Gradient of biases, of shape (4H,)
    """
    #############################################################################
    # Backward pass for a single timestep of an LSTM. For sigmoid and tanh the  #
    # local derivatives are computed from the output values of the nonlinearity.#
    #############################################################################
    x, prev_h, prev_c, Wx, Wh, i, f, o, g, next_c = cache

    dnext_c = dnext_c + o * (1 - np.tanh(next_c)**2) * dnext_h  # next_h = o*np.tanh(next_c)
    di = dnext_c * g                 # next_c = f*prev_c + i*g
    df = dnext_c * prev_c            # next_c = f*prev_c + i*g
    do = dnext_h * np.tanh(next_c)   # next_h = o*np.tanh(next_c)
    dg = dnext_c * i                 # next_c = f*prev_c + i*g
    dprev_c = f * dnext_c            # next_c = f*prev_c + i*g

    # Gradients of the four gate pre-activations, stacked as (N, 4H) in i, f, o, g order.
    dz = np.hstack((i * (1 - i) * di,
                    f * (1 - f) * df,
                    o * (1 - o) * do,
                    (1 - g**2) * dg))

    dx = Tools.matmul(dz, Wx.T)
    dprev_h = Tools.matmul(dz, Wh.T)
    dWx = Tools.matmul(x.T, dz)
    dWh = Tools.matmul(prev_h.T, dz)
    db = np.sum(dz, axis=0)

    return dx, dprev_h, dprev_c, dWx, dWh, db
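# A standalone NumPy sketch (not using the Tools helpers) that checks one piece of the
# chain rule used above: with zero upstream dnext_c, the gradient w.r.t. prev_c is
# f * o * (1 - tanh(next_c)**2) * dnext_h. The small lstm_step() here is a local stand-in
# for the forward step elsewhere in this file, used only for the central-difference check.
import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

def lstm_step(x, prev_h, prev_c, Wx, Wh, b):
    H = prev_h.shape[1]
    z = x @ Wx + prev_h @ Wh + b
    i, f, o = sigmoid(z[:, :H]), sigmoid(z[:, H:2*H]), sigmoid(z[:, 2*H:3*H])
    g = np.tanh(z[:, 3*H:])
    next_c = f * prev_c + i * g
    next_h = o * np.tanh(next_c)
    return next_h, next_c, (i, f, o, g, next_c)

rng = np.random.RandomState(1)
N, D, H = 2, 4, 3
x, prev_h, prev_c = rng.randn(N, D), rng.randn(N, H), rng.randn(N, H)
Wx, Wh, b = rng.randn(D, 4*H), rng.randn(H, 4*H), rng.randn(4*H)
dnext_h = rng.randn(N, H)                      # upstream gradient; dnext_c taken as zero

# analytic gradient w.r.t. prev_c, following the chain used above
next_h, next_c, gates = lstm_step(x, prev_h, prev_c, Wx, Wh, b)
f, o = gates[1], gates[2]
dc = o * (1 - np.tanh(next_c)**2) * dnext_h
dprev_c = f * dc

# numerical gradient of sum(next_h * dnext_h) w.r.t. prev_c
eps, dprev_c_num = 1e-6, np.zeros_like(prev_c)
for idx in np.ndindex(*prev_c.shape):
    cp, cm = prev_c.copy(), prev_c.copy()
    cp[idx] += eps
    cm[idx] -= eps
    hp, _, _ = lstm_step(x, prev_h, cp, Wx, Wh, b)
    hm, _, _ = lstm_step(x, prev_h, cm, Wx, Wh, b)
    dprev_c_num[idx] = (np.sum(hp * dnext_h) - np.sum(hm * dnext_h)) / (2 * eps)

print(np.max(np.abs(dprev_c - dprev_c_num)))   # expect ~1e-9 or smaller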
def bpDelta(self):
    # Back-propagate the delta to the previous layer; if the forward pass flattened
    # its input, restore the delta to the original input shape.
    deltaPrevReshaped = Tools.matmul(self.deltaOri, self.w.T)
    self.deltaPrev = deltaPrevReshaped if self.needReshape is False else deltaPrevReshaped.reshape(self.shapeOfOriIn)
    return self.deltaPrev
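# A minimal standalone sketch of what bpDelta() does when the layer sits on top of a
# 4-D feature map: the delta is computed in the flattened (N, D) space and then reshaped
# back to the shape recorded by the forward pass. Plain numpy; the shapes here are
# illustrative assumptions, not values from the repo.
import numpy as np

rng = np.random.RandomState(0)
N, C, Hh, Ww, M = 2, 3, 4, 4, 5
x4d = rng.randn(N, C, Hh, Ww)                # e.g. output of a conv/pool layer
w = rng.randn(C * Hh * Ww, M)

x2d = x4d.reshape(x4d.shape[0], -1)          # what fp() stores as inputReshaped
delta_ori = rng.randn(N, M)                  # delta arriving at this layer
delta_prev = (delta_ori @ w.T).reshape(x4d.shape)
print(delta_prev.shape)                      # (2, 3, 4, 4) -- matches shapeOfOriIn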
def bpWeights(self, input, lrt):
    # Gradients of the fully connected layer's weights and bias.
    dw = Tools.matmul(input.T, self.deltaOri)
    db = np.sum(self.deltaOri, axis=0, keepdims=True).reshape(self.b.shape)
    weight = (self.w, self.b)
    dweight = (dw, db)
    # The tuples are passed by reference, so the optimizer updates self.w / self.b in place.
    self.optimizerObj.getUpdWeights(weight, dweight, lrt)
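# A standalone NumPy check (not the repo's optimizer path) of the weight-gradient
# formulas used in bpWeights(): for out = x @ W + b and upstream gradient delta,
# dW = x.T @ delta and db = sum(delta, axis=0). Verified by a central difference.
import numpy as np

rng = np.random.RandomState(0)
N, D, M = 3, 4, 2
x, W, b = rng.randn(N, D), rng.randn(D, M), rng.randn(1, M)
delta = rng.randn(N, M)                      # upstream gradient w.r.t. x @ W + b

dW = x.T @ delta
db = np.sum(delta, axis=0, keepdims=True)

# numerical check of one weight entry, using loss = sum((x @ W + b) * delta)
eps, idx = 1e-6, (2, 1)
Wp, Wm = W.copy(), W.copy()
Wp[idx] += eps
Wm[idx] -= eps
num = (np.sum((x @ Wp + b) * delta) - np.sum((x @ Wm + b) * delta)) / (2 * eps)
print(abs(num - dW[idx]))                    # expect ~1e-9 or smaller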
def lstm_step_forward(self, x, prev_h, prev_c, Wx, Wh, b):
    """
    Forward pass for a single timestep of an LSTM.

    The input data has dimension D, the hidden state has dimension H, and we use
    a minibatch size of N. A numerically stable sigmoid is available as Tools.sigmoid.

    Inputs:
    - x: Input data, of shape (N, D)
    - prev_h: Previous hidden state, of shape (N, H)
    - prev_c: Previous cell state, of shape (N, H)
    - Wx: Input-to-hidden weights, of shape (D, 4H)
    - Wh: Hidden-to-hidden weights, of shape (H, 4H)
    - b: Biases, of shape (4H,)

    Returns a tuple of:
    - next_h: Next hidden state, of shape (N, H)
    - next_c: Next cell state, of shape (N, H)
    - cache: Tuple of values needed for the backward pass.
    """
    #############################################################################
    # At the first layer x is (N, T, D); in the layers above it becomes xh of   #
    # shape (N, T, H). Likewise Wx is (D, H) at the first layer and Wxh of      #
    # shape (H, H) above it.                                                    #
    #############################################################################
    H = prev_h.shape[1]

    # Gate pre-activations z, of shape (N, 4H)
    z = Tools.matmul(x, Wx) + Tools.matmul(prev_h, Wh) + b

    # Gates, each of shape (N, H)
    i = Tools.sigmoid(z[:, :H])
    f = Tools.sigmoid(z[:, H:2 * H])
    o = Tools.sigmoid(z[:, 2 * H:3 * H])
    g = np.tanh(z[:, 3 * H:])

    next_c = f * prev_c + i * g
    next_h = o * np.tanh(next_c)
    cache = (x, prev_h, prev_c, Wx, Wh, i, f, o, g, next_c)

    return next_h, next_c, cache
def conv_efficient(self, x, w, b, output_size, vec_idx_key, strides=1):
    batches = x.shape[0]
    depth_i = x.shape[1]
    filter_size = w.shape[2]
    depth_o = w.shape[0]

    if x.ndim == 4:
        # Raw input layout: pad if necessary, then vectorize (im2col).
        input_size = x.shape[2]
        p = int(((output_size - 1) * strides + filter_size - input_size) / 2)  # padding size
        # logger.debug("padding begin..")
        if p > 0:  # padding required
            x_pad = Tools.padding(x, p, self.dataType)
        else:
            x_pad = x

        st = time.time()
        logger.debug("vecting begin..")
        # Pick whichever of the three vectorization variants is faster on your hardware.
        x_col = self.vectorize4conv_batches(x_pad, filter_size, output_size, strides)
        # x_col = spd.vectorize4conv_batches(x_pad, filter_size, output_size, strides)
        # x_col = vec_by_idx(x_pad, filter_size, filter_size, vec_idx_key, 0, strides)
        logger.debug("vecting end.. %f s" % (time.time() - st))
    else:
        # Input is already in x_col (im2col) layout.
        x_col = x

    w_row = w.reshape(depth_o, x_col.shape[1])
    conv = np.zeros((batches, depth_o, (output_size * output_size)), dtype=self.dataType)

    st1 = time.time()
    logger.debug("matmul begin..")
    # Loop over the batch instead of broadcasting, which is faster here.
    for batch in range(batches):
        conv[batch] = Tools.matmul(w_row, x_col[batch]) + b
    logger.debug("matmul end.. %f s" % (time.time() - st1))

    conv_return = conv.reshape(batches, depth_o, output_size, output_size)
    return conv_return
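# A standalone NumPy sketch of the im2col idea behind conv_efficient(): unfold each
# receptive field into a column so the whole convolution becomes one matrix multiply,
# then compare against a naive loop. The column layout below is one plausible choice
# and may not match vectorize4conv_batches exactly; stride 1 and no padding are assumed.
import numpy as np

def im2col_single(x, k, out_size, stride=1):
    # x: (C, H, W) -> columns of shape (C*k*k, out_size*out_size)
    C = x.shape[0]
    cols = np.zeros((C * k * k, out_size * out_size), dtype=x.dtype)
    col = 0
    for i in range(out_size):
        for j in range(out_size):
            patch = x[:, i * stride:i * stride + k, j * stride:j * stride + k]
            cols[:, col] = patch.reshape(-1)
            col += 1
    return cols

rng = np.random.RandomState(0)
C, H, K, F = 3, 6, 3, 4
O = H - K + 1                               # output size, stride 1, no padding
x = rng.randn(C, H, H)
w = rng.randn(F, C, K, K)
b = rng.randn(F, 1)

# im2col path: one matmul per image, as in conv_efficient above
x_col = im2col_single(x, K, O)
out_fast = (w.reshape(F, -1) @ x_col + b).reshape(F, O, O)

# naive reference
out_ref = np.zeros((F, O, O))
for f in range(F):
    for i in range(O):
        for j in range(O):
            out_ref[f, i, j] = np.sum(w[f] * x[:, i:i + K, j:j + K]) + b[f, 0]

print(np.allclose(out_fast, out_ref))       # expect True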
def conv4dw(self, x, w, output_size, b=0, strides=1, x_v=False):
    batches = x.shape[0]
    depth_i = x.shape[1]
    filter_size = w.shape[2]  # filter size, which matches the conv layer's error-matrix size
    x_per_filter = filter_size * filter_size
    depth_o = w.shape[1]

    if x_v is False:
        # Raw input layout: pad if necessary, then vectorize (im2col).
        input_size = x.shape[2]
        p = int(((output_size - 1) * strides + filter_size - input_size) / 2)  # padding size
        if p > 0:  # padding required
            x_pad = Tools.padding(x, p, self.dataType)
        else:
            x_pad = x
        logger.debug("vec4dw begin..")
        x_col = self.vectorize4convdw_batches(x_pad, filter_size, output_size, strides)
        logger.debug("vec4dw end..")
    else:
        # Input is already in x_col (im2col) layout.
        x_col = x

    w_row = w.reshape(batches, depth_o, x_per_filter)
    conv = np.zeros((batches, depth_i, depth_o, (output_size * output_size)),
                    dtype=self.dataType)

    logger.debug("conv4dw matmul begin..")
    for batch in range(batches):
        for col in range(depth_i):
            conv[batch, col] = Tools.matmul(w_row[batch], x_col[batch, col])
    conv_sum = np.sum(conv, axis=0)

    # Transpose rather than reshaping directly, to avoid misaligning the axes.
    conv = conv_sum.transpose(1, 0, 2).reshape(depth_o, depth_i, output_size, output_size)
    logger.debug("conv4dw matmul end..")

    return conv, x_col
def fp(self, input):
    # Flatten the input to 2-D if this layer requires it, remembering the original
    # shape so the backward pass can restore it.
    self.shapeOfOriIn = input.shape
    self.inputReshaped = input if self.needReshape is False else input.reshape(input.shape[0], -1)
    self.out = self.activator.activate(Tools.matmul(self.inputReshaped, self.w) + self.b)
    return self.out
def rnn_step_forward(self, x, prev_h, Wx, Wh, b):
    """
    Run the forward pass for a single timestep of a vanilla RNN that uses a tanh
    activation function.

    The input data has dimension D, the hidden state has dimension H, and we use
    a minibatch size of N.

    Inputs:
    - x: Input data for this timestep, of shape (N, D)
    - prev_h: Hidden state from previous timestep, of shape (N, H)
    - Wx: Weight matrix for input-to-hidden connections, of shape (D, H)
    - Wh: Weight matrix for hidden-to-hidden connections, of shape (H, H)
    - b: Biases, of shape (H,)

    Returns a tuple of:
    - next_h: Next hidden state, of shape (N, H)
    - cache: Tuple of values needed for the backward pass.
    """
    ##############################################################################
    # Single forward step for the vanilla RNN. The next hidden state and the     #
    # values needed for the backward pass are stored in next_h and cache.        #
    ##############################################################################
    z = Tools.matmul(x, Wx) + Tools.matmul(prev_h, Wh) + b
    next_h = np.tanh(z)
    dtanh = 1. - next_h * next_h   # local derivative of tanh, cached for the backward pass
    cache = (x, prev_h, Wx, Wh, dtanh)

    return next_h, cache
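# A standalone NumPy gradient check (not using the Tools helpers) of the tanh-RNN step:
# with loss = sum(next_h * dnext_h), the analytic dWx = x.T @ (dnext_h * (1 - next_h**2))
# from the cached dtanh term should match a central-difference estimate.
import numpy as np

def rnn_step(x, prev_h, Wx, Wh, b):
    return np.tanh(x @ Wx + prev_h @ Wh + b)

rng = np.random.RandomState(0)
N, D, H = 2, 4, 3
x, prev_h = rng.randn(N, D), rng.randn(N, H)
Wx, Wh, b = rng.randn(D, H), rng.randn(H, H), rng.randn(H)
dnext_h = rng.randn(N, H)

# analytic gradient via the cached 1 - tanh^2 term, mirroring rnn_step_backward
next_h = rnn_step(x, prev_h, Wx, Wh, b)
dz = dnext_h * (1.0 - next_h * next_h)
dWx = x.T @ dz

# numerical gradient of sum(next_h * dnext_h) w.r.t. Wx
eps, dWx_num = 1e-6, np.zeros_like(Wx)
for idx in np.ndindex(*Wx.shape):
    Wp, Wm = Wx.copy(), Wx.copy()
    Wp[idx] += eps
    Wm[idx] -= eps
    dWx_num[idx] = (np.sum(rnn_step(x, prev_h, Wp, Wh, b) * dnext_h)
                    - np.sum(rnn_step(x, prev_h, Wm, Wh, b) * dnext_h)) / (2 * eps)

print(np.max(np.abs(dWx - dWx_num)))   # expect ~1e-9 or smaller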
def gru_step_forward(self, x, prev_h, Wzx, Wzh, bz, Wax, War, ba):
    """
    Forward pass for a single timestep of a GRU.

    The input data has dimension D, the hidden state has dimension H, and we use
    a minibatch size of N. A numerically stable sigmoid is available as Tools.sigmoid.

    Inputs:
    - x: Input data, of shape (N, D)
    - prev_h: Previous hidden state, of shape (N, H)
    - Wzx: Input-to-gate weights (reset and update gates), of shape (D, 2H)
    - Wzh: Hidden-to-gate weights (reset and update gates), of shape (H, 2H)
    - bz: Gate biases, of shape (2H,)
    - Wax: Input-to-candidate weights, of shape (D, H)
    - War: Hidden-to-candidate weights, of shape (H, H)
    - ba: Candidate biases, of shape (H,)

    Returns a tuple of:
    - next_h: Next hidden state, of shape (N, H)
    - cache: Tuple of values needed for the backward pass.
    """
    #############################################################################
    # At the first layer x is (N, T, D); in the layers above it becomes xh of   #
    # shape (N, T, H). Likewise Wx is (D, H) at the first layer and Wxh of      #
    # shape (H, H) above it.                                                    #
    #############################################################################
    H = prev_h.shape[1]

    # Gate pre-activations z_hat, of shape (N, 2H)
    z_hat = Tools.matmul(x, Wzx) + Tools.matmul(prev_h, Wzh) + bz

    # Reset gate r and update gate z, each of shape (N, H)
    r = Tools.sigmoid(z_hat[:, :H])
    z = Tools.sigmoid(z_hat[:, H:2 * H])

    # Candidate pre-activation a and the new hidden state
    a = Tools.matmul(x, Wax) + Tools.matmul(r * prev_h, War) + ba
    next_h = prev_h * (1. - z) + z * np.tanh(a)

    cache = (x, prev_h, Wzx, Wzh, Wax, War, z_hat, r, z, a)

    return next_h, cache
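# A standalone NumPy sketch (not using the Tools helpers) of the GRU step above,
# mainly to make the weight shapes concrete: the reset and update gates share one
# (N, 2H) pre-activation, while the candidate has its own (D, H) / (H, H) weights.
import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

rng = np.random.RandomState(0)
N, D, H = 2, 5, 3
x, prev_h = rng.randn(N, D), rng.randn(N, H)
Wzx, Wzh, bz = rng.randn(D, 2 * H), rng.randn(H, 2 * H), rng.randn(2 * H)
Wax, War, ba = rng.randn(D, H), rng.randn(H, H), rng.randn(H)

z_hat = x @ Wzx + prev_h @ Wzh + bz            # (N, 2H): reset | update pre-activations
r = sigmoid(z_hat[:, :H])                      # reset gate
z = sigmoid(z_hat[:, H:])                      # update gate
a = x @ Wax + (r * prev_h) @ War + ba          # candidate pre-activation
next_h = prev_h * (1.0 - z) + z * np.tanh(a)
print(next_h.shape)                            # (2, 3)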
def loss(y, y_, n):
    # Mean squared error loss and its gradient w.r.t. the prediction y.
    correct_logprobs = Tools.mse(y, y_)
    data_loss = np.sum(correct_logprobs) / n
    delta = (y - y_) / n
    return data_loss, delta, None
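# A standalone NumPy sketch of the MSE gradient returned above. It assumes Tools.mse
# computes 0.5 * (y - y_)**2 per element (an assumption; the repo's factor may differ),
# in which case delta = (y - y_) / n is exactly the gradient of data_loss w.r.t. y,
# verified here by a central difference.
import numpy as np

def mse_loss(y, y_, n):
    data_loss = np.sum(0.5 * (y - y_)**2) / n   # assumed 1/2 factor
    delta = (y - y_) / n
    return data_loss, delta

rng = np.random.RandomState(0)
n, k = 4, 3
y, y_ = rng.randn(n, k), rng.randn(n, k)
_, delta = mse_loss(y, y_, n)

eps, idx = 1e-6, (1, 2)
yp, ym = y.copy(), y.copy()
yp[idx] += eps
ym[idx] -= eps
num = (mse_loss(yp, y_, n)[0] - mse_loss(ym, y_, n)[0]) / (2 * eps)
print(abs(num - delta[idx]))                    # expect ~1e-10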
def bpDelta(self):
    self.deltaPrev = Tools.matmul(self.deltaOri, self.w.T)
    return self.deltaPrev
def fp(self, input):
    self.out = self.activator.activate(Tools.matmul(input, self.w) + self.b)
    return self.out
def gru_step_backward(self, dnext_h, cache):
    """
    Backward pass for a single timestep of a GRU.

    Inputs:
    - dnext_h: Gradients of next hidden state, of shape (N, H)
    - cache: Values from the forward pass

    Returns a tuple of:
    - dx: Gradient of input data, of shape (N, D)
    - dprev_h: Gradient of previous hidden state, of shape (N, H)
    - dWzx: Gradient of input-to-gate weights, of shape (D, 2H)
    - dWzh: Gradient of hidden-to-gate weights, of shape (H, 2H)
    - dbz: Gradient of gate biases, of shape (2H,)
    - dWax: Gradient of input-to-candidate weights, of shape (D, H)
    - dWar: Gradient of hidden-to-candidate weights, of shape (H, H)
    - dba: Gradient of candidate biases, of shape (H,)
    """
    #############################################################################
    # Backward pass for a single timestep of a GRU. For sigmoid and tanh the    #
    # local derivatives are computed from the output values of the nonlinearity.#
    #############################################################################
    x, prev_h, Wzx, Wzh, Wax, War, z_hat, r, z, a = cache

    tanha = np.tanh(a)
    dh = dnext_h

    # next_h = prev_h*(1-z) + z*tanh(a)
    da = dh * z * (1. - tanha * tanha)
    dh_prev_1 = dh * (1. - z)
    dz = dh * (tanha - prev_h)
    dz_hat_2 = dz * (z * (1. - z))          # update-gate pre-activation

    # a = x@Wax + (r*prev_h)@War + ba
    dhat_a = Tools.matmul(da, War.T)        # gradient w.r.t. r*prev_h
    dr = dhat_a * prev_h
    dx_1 = Tools.matmul(da, Wax.T)
    dh_prev_2 = dhat_a * r
    dz_hat_1 = dr * (r * (1. - r))          # reset-gate pre-activation

    # z_hat = x@Wzx + prev_h@Wzh + bz, with the reset gate in the first H columns
    dz_hat = np.hstack((dz_hat_1, dz_hat_2))
    dx_2 = Tools.matmul(dz_hat, Wzx.T)
    dh_prev_34 = Tools.matmul(dz_hat, Wzh.T)

    dprev_h = dh_prev_1 + dh_prev_2 + dh_prev_34
    dx = dx_1 + dx_2

    dWax = Tools.matmul(x.T, da)
    dWar = Tools.matmul((r * prev_h).T, da)
    dba = np.sum(da, axis=0)
    dWzx = Tools.matmul(x.T, dz_hat)
    dWzh = Tools.matmul(prev_h.T, dz_hat)
    dbz = np.sum(dz_hat, axis=0)

    return dx, dprev_h, dWzx, dWzh, dbz, dWax, dWar, dba
def fp(self, input):
    out_tmp = self.inference(input)
    self.out, self.dropoutMask = Tools.dropout4rnn(out_tmp, self.dropoutRRate)
    return self.out