def forward(self, Y_train, Y_hat):
    target = Y_train.reshape(Y_hat.shape)
    m = target.shape[0]
    p.clip(Y_hat, self.epsilon, 1.0 - self.epsilon, out=Y_hat)
    cost = -target * p.log(Y_hat) - (1 - target) * p.log(1 - Y_hat)
    J = p.sum(cost, axis=0, keepdims=True) / m
    return p.squeeze(J)
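
# A minimal numeric check of what the binary cross-entropy forward pass above
# computes, assuming `p` is the NumPy module and `self.epsilon` is a small
# constant. The toy arrays and the epsilon value below are hypothetical.
import numpy as np

y_true = np.array([[1.0], [0.0], [1.0]])   # shape (m, 1)
y_pred = np.array([[0.9], [0.2], [0.6]])   # predicted probabilities
eps = 1e-11
y_pred = np.clip(y_pred, eps, 1.0 - eps)   # avoid log(0)
cost = -y_true * np.log(y_pred) - (1 - y_true) * np.log(1 - y_pred)
print(cost.sum(axis=0) / y_true.shape[0])  # mean BCE over the batch, ~0.2798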
def lstm_step_backward(self, da_next, dc_next, cache):
    # cache layout: gate activations (i, f, o, g), z_t = tanh(c_next), and the step inputs
    z_i, z_f, z_o, z_g, z_t, c_prev, a_prev, x = cache
    dz_o = z_t * da_next
    dc_t = z_o * (1 - z_t * z_t) * da_next + dc_next   # gradient w.r.t. the cell state
    dz_f = c_prev * dc_t
    dz_i = z_g * dc_t
    dc_prev = z_f * dc_t
    dz_g = z_i * dc_t
    # back through the gate non-linearities: sigmoid' = s * (1 - s), tanh' = 1 - t^2
    da_i = (1 - z_i) * z_i * dz_i
    da_f = (1 - z_f) * z_f * dz_f
    da_o = (1 - z_o) * z_o * dz_o
    da_g = (1 - z_g * z_g) * dz_g
    da = p.hstack((da_i, da_f, da_o, da_g))
    dWx = x.T.dot(da)
    dWa = a_prev.T.dot(da)
    db = p.sum(da, axis=0)
    dx = da.dot(self.parameters['Wx'].T)
    da_prev = da.dot(self.parameters['Wa'].T)
    return dx, da_prev, dc_prev, dWx, dWa, db
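
# The backward step above matches an LSTM forward step that caches the four gate
# activations, z_t = tanh(c_next), and the raw step inputs. Below is a minimal
# sketch of such a forward step, under the assumption that Wx and Wa stack the
# gate weights column-wise in the order [i, f, o, g]; names are hypothetical.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step_forward_sketch(x, a_prev, c_prev, Wx, Wa, b):
    H = a_prev.shape[1]
    z = x.dot(Wx) + a_prev.dot(Wa) + b      # pre-activations for all four gates
    z_i = sigmoid(z[:, 0 * H:1 * H])        # input gate
    z_f = sigmoid(z[:, 1 * H:2 * H])        # forget gate
    z_o = sigmoid(z[:, 2 * H:3 * H])        # output gate
    z_g = np.tanh(z[:, 3 * H:4 * H])        # candidate cell update
    c_next = z_f * c_prev + z_i * z_g
    z_t = np.tanh(c_next)
    a_next = z_o * z_t
    cache = (z_i, z_f, z_o, z_g, z_t, c_prev, a_prev, x)
    return a_next, c_next, cache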
def rnn_step_backward(self, da_next, cache):
    x, a_prev, a_next = cache
    da = da_next * (1 - a_next * a_next)   # back through tanh: tanh' = 1 - tanh^2
    dx = da.dot(self.parameters['Wxa'].T)
    da_prev = da.dot(self.parameters['Waa'].T)
    dWx = x.T.dot(da)
    dWh = a_prev.T.dot(da)
    db = p.sum(da, axis=0)
    return dx, da_prev, dWx, dWh, db
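
# A minimal sketch of the matching vanilla-RNN forward step, under the
# assumption that Wxa maps the input, Waa maps the previous hidden state, and
# the cache stores exactly (x, a_prev, a_next). Names are hypothetical.
import numpy as np

def rnn_step_forward_sketch(x, a_prev, Wxa, Waa, b):
    a_next = np.tanh(x.dot(Wxa) + a_prev.dot(Waa) + b)
    cache = (x, a_prev, a_next)
    return a_next, cache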
def twoDims_batchnormal_backward(self, pre_grad):
    xhat, xmu, ivar, sqrtvar = self.caches
    del self.caches
    m, nx = pre_grad.shape
    self.gradients['beta'] = p.sum(pre_grad, axis=0)
    dgammax = pre_grad
    self.gradients['gamma'] = p.sum(xhat * dgammax, axis=0)
    # unwind the normalization step by step: xhat -> (xmu, ivar) -> var -> mean -> x
    dxhat = self.parameters['gamma'] * dgammax
    divar = p.sum(xmu * dxhat, axis=0)
    dxmu1 = dxhat * ivar
    dsqrtvar = -1. / (sqrtvar ** 2) * divar
    dvar = 0.5 * ivar * dsqrtvar
    dsq = p.divide(1., m) * p.ones_like(pre_grad) * dvar
    dxmu2 = 2 * dsq * xmu
    dx1 = dxmu1 + dxmu2
    dmu = -1 * p.sum(dx1, axis=0)
    dx2 = p.divide(1., m) * p.ones_like(pre_grad) * dmu
    dx = dx1 + dx2
    return dx
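
# The cache (xhat, xmu, ivar, sqrtvar) unpacked above comes from the 2-D
# batch-norm forward pass. Below is a minimal NumPy sketch of a forward step
# that produces exactly that cache; the eps value and function name are assumptions.
import numpy as np

def batchnorm_2d_forward_sketch(x, gamma, beta, eps=1e-8):
    mu = x.mean(axis=0)                 # per-feature batch mean
    xmu = x - mu
    var = (xmu ** 2).mean(axis=0)       # per-feature batch variance
    sqrtvar = np.sqrt(var + eps)
    ivar = 1.0 / sqrtvar
    xhat = xmu * ivar                   # normalized input
    out = gamma * xhat + beta
    caches = (xhat, xmu, ivar, sqrtvar)
    return out, caches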
def forward(self, targets, outputs):
    p.clip(outputs, self.epsilon, 1.0 - self.epsilon, out=outputs)
    if targets.ndim == 1 and outputs.ndim == 2:
        # integer class labels against an (N, D) probability matrix
        N, T = outputs.shape
        loss = -p.sum(p.log(outputs[p.arange(N), targets])) / N
        return loss
    N, T = targets.shape[:2]   # works for both (N, T) and (N, T, 1) targets
    if outputs.ndim == 3:
        N, T, D = outputs.shape
        outputs = outputs.reshape(N * T, D)
    if targets.ndim == 3:
        N, T, D = targets.shape
        targets = targets.reshape(N * T, D)
    loss = -p.sum(p.log(outputs[p.arange(N * T), targets.reshape(N * T, )])) / N
    return loss
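
# A minimal numeric sketch of the sequence case handled above: softmax outputs
# of shape (N, T, D) scored against integer targets of shape (N, T), with NumPy
# standing in for `p`. The toy data below is hypothetical.
import numpy as np

N, T, D = 2, 3, 4
rng = np.random.default_rng(0)
logits = rng.normal(size=(N, T, D))
outputs = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)  # softmax
targets = rng.integers(0, D, size=(N, T))

flat = outputs.reshape(N * T, D)
# note: divided by N (summed over time steps), matching the method above
loss = -np.log(flat[np.arange(N * T), targets.reshape(N * T)]).sum() / N
print(loss)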
def backward(self, dout):
    dZ = ac_get_grad(dout, self.Z, self.activation)   # back through the activation
    if self.batch_normal:
        dZ = self.batch_normal.backward(dZ)
    N = dZ.shape[0]
    self.gradients['b'] = 1. / N * p.sum(dZ, axis=(0, 2, 3))
    num_filters, _, filter_height, filter_width = self.parameters['W'].shape
    # fold (N, F, H_out, W_out) into (F, N*H_out*W_out) to reuse the im2col buffer
    dout_reshaped = dZ.transpose(1, 2, 3, 0).reshape(num_filters, -1)
    self.gradients['W'] = 1. / N * dout_reshaped.dot(self.X_col.T).reshape(self.parameters['W'].shape)
    dx_cols = self.parameters['W'].reshape(num_filters, -1).T.dot(dout_reshaped)
    if isinstance(dZ, numpy.ndarray):
        dx = col2im_indices_cpu(dx_cols, self.x.shape, filter_height, filter_width, self.padding, self.stride)
    else:
        dx = col2im_indices_gpu(dx_cols, self.x.shape, filter_height, filter_width, self.padding, self.stride)
    del self.x, self.X_col
    return dx
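
# The helpers col2im_indices_cpu / col2im_indices_gpu are not shown in this
# section. A commonly used NumPy formulation (in the style of the CS231n
# reference code) is sketched below as an assumption about how dx_cols of shape
# (C*HH*WW, N*H_out*W_out) is scattered back into an image tensor; the GPU
# variant would be the same with CuPy arrays. Function names are hypothetical.
import numpy as np

def get_im2col_indices_sketch(x_shape, field_height, field_width, padding=1, stride=1):
    # index arrays mapping every column entry back to its (channel, row, col) position
    N, C, H, W = x_shape
    out_height = (H + 2 * padding - field_height) // stride + 1
    out_width = (W + 2 * padding - field_width) // stride + 1
    i0 = np.tile(np.repeat(np.arange(field_height), field_width), C)
    i1 = stride * np.repeat(np.arange(out_height), out_width)
    j0 = np.tile(np.arange(field_width), field_height * C)
    j1 = stride * np.tile(np.arange(out_width), out_height)
    i = i0.reshape(-1, 1) + i1.reshape(1, -1)
    j = j0.reshape(-1, 1) + j1.reshape(1, -1)
    k = np.repeat(np.arange(C), field_height * field_width).reshape(-1, 1)
    return k, i, j

def col2im_indices_sketch(cols, x_shape, field_height, field_width, padding=1, stride=1):
    # scatter-add the column gradients back into a padded image, then strip the padding
    N, C, H, W = x_shape
    x_padded = np.zeros((N, C, H + 2 * padding, W + 2 * padding), dtype=cols.dtype)
    k, i, j = get_im2col_indices_sketch(x_shape, field_height, field_width, padding, stride)
    cols_reshaped = cols.reshape(C * field_height * field_width, -1, N).transpose(2, 0, 1)
    np.add.at(x_padded, (slice(None), k, i, j), cols_reshaped)
    if padding == 0:
        return x_padded
    return x_padded[:, :, padding:-padding, padding:-padding]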
def backward(self, dout):
    if self.keep_prob < 1. and self.mode == 'train':
        dout = dout * self.drop_mask / self.keep_prob   # back through inverted dropout
    dout = ac_get_grad(dout, self.Z, self.activation)
    if self.batch_normal:
        dout = self.batch_normal.backward(dout)
    if self.x.ndim == 3:
        # time-distributed input (N, T, D): collapse batch and time before the matmul
        N, T, D = self.x.shape
        dx = dout.reshape(N * T, self.unit_number).dot(self.parameters['W'].T).reshape(N, T, D)
        self.gradients['W'] = 1. / N * dout.reshape(N * T, self.unit_number).T.dot(self.x.reshape(N * T, D)).T
        self.gradients['b'] = 1. / N * dout.sum(axis=(0, 1))
    else:
        N, D = self.x.shape
        dx = p.dot(dout, self.parameters['W'].T)
        self.gradients['W'] = 1. / N * p.dot(self.x.T, dout)
        self.gradients['b'] = 1. / N * p.sum(dout, axis=0)
    if self.flatten:
        dx = dx.reshape(self.x_shape)   # restore the original shape of the input tensor
    return dx
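
# A minimal sketch of the matching dense forward step for a 3-D (N, T, D) input,
# under the assumption that W has shape (D, units), consistent with the weight
# gradient computed above. Function and variable names are hypothetical.
import numpy as np

def dense_forward_3d_sketch(x, W, b):
    N, T, D = x.shape
    units = W.shape[1]
    z = x.reshape(N * T, D).dot(W) + b   # collapse batch and time, apply the affine map
    return z.reshape(N, T, units)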