def backward(self, y_pred, y_train, d_next, cache):
    (X, hf, hi, ho, hc,
     hf_cache, hf_sigm_cache, hi_cache, hi_sigm_cache,
     ho_cache, ho_sigm_cache, hc_cache, hc_tanh_cache,
     c_old, c, c_tanh_cache, y_cache) = cache

    dh_next, dc_next = d_next

    # Softmax gradient
    dy = loss_fun.dcross_entropy(y_pred, y_train)

    # Hidden to output gradient
    dh, dWy, dby = l.fc_backward(dy, y_cache)
    dh += dh_next

    # Output gate
    dho = c * dh
    dho = l.sigmoid_backward(dho, ho_sigm_cache)

    # Cell state gradient
    dc = ho * dh
    dc = l.tanh_backward(dc, c_tanh_cache)
    dc = dc + dc_next

    # Forget gate
    dhf = c_old * dc
    dhf = l.sigmoid_backward(dhf, hf_sigm_cache)

    # Input gate
    dhi = hc * dc
    dhi = l.sigmoid_backward(dhi, hi_sigm_cache)

    # Cell candidate
    dhc = hi * dc
    dhc = l.tanh_backward(dhc, hc_tanh_cache)

    # Gate gradients w.r.t. their fully connected layers
    dXo, dWo, dbo = l.fc_backward(dho, ho_cache)
    dXc, dWc, dbc = l.fc_backward(dhc, hc_cache)
    dXi, dWi, dbi = l.fc_backward(dhi, hi_cache)
    dXf, dWf, dbf = l.fc_backward(dhf, hf_cache)

    # Gradient w.r.t. the concatenated input; the first H columns belong to h_old
    dX = dXo + dXc + dXi + dXf
    dh_next = dX[:, :self.H]

    # Cell state flows back through the forget gate
    dc_next = hf * dc

    grad = dict(Wf=dWf, Wi=dWi, Wc=dWc, Wo=dWo, Wy=dWy,
                bf=dbf, bi=dbi, bc=dbc, bo=dbo, by=dby)

    return grad, (dh_next, dc_next)
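# Hedged sketch (not part of the model above): the elementwise gradients that
# l.sigmoid_backward / l.tanh_backward are assumed to compute when the gate
# gradients dho, dhf, dhi, dhc, and dc are pushed through the nonlinearities.
# The function names, `dout`, and `pre_act` are illustrative only.
import numpy as np

def sigmoid_backward_sketch(dout, pre_act):
    # Forward: s = sigmoid(pre_act); local gradient is s * (1 - s)
    s = 1. / (1. + np.exp(-pre_act))
    return dout * s * (1. - s)

def tanh_backward_sketch(dout, pre_act):
    # Forward: t = tanh(pre_act); local gradient is 1 - t**2
    t = np.tanh(pre_act)
    return dout * (1. - t ** 2)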
def backward(self, y_pred, y_train, dh_next, cache):
    (X, X_prime, h_old,
     hz, hz_cache, hz_sigm_cache,
     hr, hr_cache, hr_sigm_cache,
     hh, hh_cache, hh_tanh_cache,
     h, y_cache) = cache

    # Softmax gradient
    dy = loss_fun.dcross_entropy(y_pred, y_train)

    # Hidden to output gradient
    dh, dWy, dby = l.fc_backward(dy, y_cache)
    dh += dh_next

    # h = (1 - hz) * h_old + hz * hh, so dh splits into three paths
    dhh = hz * dh
    dh_old1 = (1. - hz) * dh
    dhz = hh * dh - h_old * dh

    # Candidate hidden state
    dhh = l.tanh_backward(dhh, hh_tanh_cache)
    dX_prime, dWh, dbh = l.fc_backward(dhh, hh_cache)

    # The candidate sees the reset-gated hidden state, so split out that part
    dh_prime = dX_prime[:, :self.H]
    dh_old2 = hr * dh_prime

    # Reset gate
    dhr = h_old * dh_prime
    dhr = l.sigmoid_backward(dhr, hr_sigm_cache)
    dXr, dWr, dbr = l.fc_backward(dhr, hr_cache)

    # Update gate
    dhz = l.sigmoid_backward(dhz, hz_sigm_cache)
    dXz, dWz, dbz = l.fc_backward(dhz, hz_cache)

    # Gradient w.r.t. the concatenated input; the first H columns belong to h_old
    dX = dXr + dXz
    dh_old3 = dX[:, :self.H]

    # Accumulate all paths into the previous hidden state
    dh_next = dh_old1 + dh_old2 + dh_old3

    grad = dict(Wz=dWz, Wr=dWr, Wh=dWh, Wy=dWy,
                bz=dbz, br=dbr, bh=dbh, by=dby)

    return grad, dh_next
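# Hedged sketch: one way a per-step backward like the one above could be driven
# through time (truncated BPTT), threading dh_next from the last step to the
# first and summing the per-step gradients. `model`, `preds`, `targets`, and
# `caches` are hypothetical names standing in for a forward pass run beforehand.
import numpy as np

def bptt_sketch(model, preds, targets, caches, hidden_size):
    grads = None
    dh_next = np.zeros((1, hidden_size))  # nothing flows in from beyond the last step

    for t in reversed(range(len(preds))):
        step_grad, dh_next = model.backward(preds[t], targets[t], dh_next, caches[t])
        if grads is None:
            grads = {k: v.copy() for k, v in step_grad.items()}
        else:
            for k in grads:
                grads[k] += step_grad[k]  # gradients are summed over time steps

    return grads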
def backward(self, y_pred, y_train, cache):
    X, h1_cache, h3_cache, score_cache, hpool_cache, hpool, nl_cache1, nl_cache3 = cache

    # Output layer
    grad_y = self.dloss_funs[self.loss](y_pred, y_train)

    # FC-7
    dh3, dW3, db3 = l.fc_backward(grad_y, score_cache)
    dh3 = self.backward_nonlin(dh3, nl_cache3)

    dh2, dW2, db2 = l.fc_backward(dh3, h3_cache)
    dh2 = dh2.ravel().reshape(hpool.shape)

    # Pool-1
    dpool = l.maxpool_backward(dh2, hpool_cache)

    # Conv-1
    dh1 = self.backward_nonlin(dpool, nl_cache1)
    dX, dW1, db1 = l.conv_backward(dh1, h1_cache)

    grad = dict(W1=dW1, W2=dW2, W3=dW3, b1=db1, b2=db2, b3=db3)

    return grad
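# Hedged sketch: one way the grad dict returned above could be consumed by a
# plain SGD step. `model` is assumed to be a dict of NumPy arrays keyed the same
# way as `grad` (W1, W2, W3, b1, b2, b3); `alpha` is an illustrative learning rate.
def sgd_step_sketch(model, grad, alpha=1e-3):
    for key in grad:
        model[key] -= alpha * grad[key]
    return model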
def backward(self, y_pred, y_train, cache):
    X, h1_cache, h2_cache, score_cache, nl_cache1, nl_cache2, u1, u2, bn1_cache, bn2_cache = cache

    # Output layer
    grad_y = self.dloss_funs[self.loss](y_pred, y_train)

    # Third layer
    dh2, dW3, db3 = l.fc_backward(grad_y, score_cache)
    dW3 += reg.dl2_reg(self.model['W3'], self.lam)
    dh2 = self.backward_nonlin(dh2, nl_cache2)
    dh2 = l.dropout_backward(dh2, u2)
    dh2, dgamma2, dbeta2 = l.bn_backward(dh2, bn2_cache)

    # Second layer
    dh1, dW2, db2 = l.fc_backward(dh2, h2_cache)
    dW2 += reg.dl2_reg(self.model['W2'], self.lam)
    dh1 = self.backward_nonlin(dh1, nl_cache1)
    dh1 = l.dropout_backward(dh1, u1)
    dh1, dgamma1, dbeta1 = l.bn_backward(dh1, bn1_cache)

    # First layer
    _, dW1, db1 = l.fc_backward(dh1, h1_cache)
    dW1 += reg.dl2_reg(self.model['W1'], self.lam)

    grad = dict(W1=dW1, W2=dW2, W3=dW3, b1=db1, b2=db2, b3=db3,
                gamma1=dgamma1, gamma2=dgamma2, beta1=dbeta1, beta2=dbeta2)

    return grad
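# Hedged sketch: the L2 regularization gradient that reg.dl2_reg is assumed to
# add to each weight gradient above. For a penalty of (lam / 2) * ||W||^2 the
# gradient w.r.t. W is lam * W. The function name here is illustrative only.
def dl2_reg_sketch(W, lam):
    return lam * W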
def backward(self, y_pred, y_train, dh_next, cache):
    X, Whh, h, hprev, y, h_cache, y_cache = cache

    # Softmax gradient
    dy = loss_fun.dcross_entropy(y_pred, y_train)

    # Hidden to output gradient
    dh, dWhy, dby = l.fc_backward(dy, y_cache)
    dh += dh_next
    dby = dby.reshape((1, -1))

    # tanh
    dh = l.tanh_backward(dh, h_cache)

    # Hidden gradient
    dbh = dh
    dWhh = hprev.T @ dh
    dWxh = X.T @ dh
    dh_next = dh @ Whh.T

    grad = dict(Wxh=dWxh, Whh=dWhh, Why=dWhy, bh=dbh, by=dby)

    return grad, dh_next
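# Hedged sketch: elementwise gradient clipping that is commonly applied to the
# grad dict of a vanilla RNN like the one above to keep BPTT from exploding.
# The clip limit is an illustrative choice, not taken from the code above.
import numpy as np

def clip_grads_sketch(grad, limit=5.0):
    return {k: np.clip(v, -limit, limit) for k, v in grad.items()}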