def backward(self, y_pred, y_train, dh_next, cache):
    """One-step backward pass for a GRU cell.

    Returns (grad, dh_next): a dict of parameter gradients and the
    gradient flowing back to the previous hidden state.
    """
    (X, X_prime, h_prev, z_gate, z_fc_cache, z_sigm_cache,
     r_gate, r_fc_cache, r_sigm_cache, h_cand, cand_fc_cache,
     cand_tanh_cache, h, y_cache) = cache

    # Output layer: cross-entropy gradient, then hidden->output FC backward.
    dscore = loss_fun.dcross_entropy(y_pred, y_train)
    dh, dWy, dby = l.fc_backward(dscore, y_cache)
    dh += dh_next  # fold in the gradient arriving from the next time step

    # Forward was h = (1 - z) * h_prev + z * h_cand; split dh across the gate.
    dcand = z_gate * dh
    dh_prev_direct = (1. - z_gate) * dh
    dz = h_cand * dh - h_prev * dh

    # Candidate branch: tanh over an FC whose input starts with r * h_prev.
    dcand = l.tanh_backward(dcand, cand_tanh_cache)
    dX_prime, dWh, dbh = l.fc_backward(dcand, cand_fc_cache)
    dh_gated = dX_prime[:, :self.H]  # slice belonging to r * h_prev

    # Reset-gate branch.
    dh_prev_via_r = r_gate * dh_gated
    dr = h_prev * dh_gated
    dr = l.sigmoid_backward(dr, r_sigm_cache)
    dXr, dWr, dbr = l.fc_backward(dr, r_fc_cache)

    # Update-gate branch.
    dz = l.sigmoid_backward(dz, z_sigm_cache)
    dXz, dWz, dbz = l.fc_backward(dz, z_fc_cache)

    # Both gates consumed [h_prev, x]; take the h_prev slice of the summed grads.
    dh_prev_via_gates = (dXr + dXz)[:, :self.H]

    dh_next = dh_prev_direct + dh_prev_via_r + dh_prev_via_gates

    grad = dict(
        Wz=dWz, Wr=dWr, Wh=dWh, Wy=dWy,
        bz=dbz, br=dbr, bh=dbh, by=dby,
    )
    return grad, dh_next
def backward(self, y_pred, y_train, d_next, cache):
    """One-step backward pass for an LSTM cell.

    Returns (grad, (dh_next, dc_next)): a dict of parameter gradients
    plus the gradients for the previous hidden and cell states.
    """
    (X, f_gate, i_gate, o_gate, c_bar,
     f_fc_cache, f_sigm_cache, i_fc_cache, i_sigm_cache,
     o_fc_cache, o_sigm_cache, cbar_fc_cache, cbar_tanh_cache,
     c_prev, c, c_tanh_cache, y_cache) = cache
    dh_next, dc_next = d_next

    # Output layer: cross-entropy gradient, then hidden->output FC backward.
    dscore = loss_fun.dcross_entropy(y_pred, y_train)
    dh, dWy, dby = l.fc_backward(dscore, y_cache)
    dh += dh_next  # fold in the gradient arriving from the next time step

    # Branch dh into the output gate and the cell-state path.
    do = c * dh
    do = l.sigmoid_backward(do, o_sigm_cache)
    dc = o_gate * dh
    dc = l.tanh_backward(dc, c_tanh_cache)
    dc = dc + dc_next  # add the cell gradient from the next time step

    # Forward cell update was c = f * c_prev + i * c_bar.
    df = c_prev * dc
    df = l.sigmoid_backward(df, f_sigm_cache)
    di = c_bar * dc
    di = l.sigmoid_backward(di, i_sigm_cache)
    dcbar = i_gate * dc
    dcbar = l.tanh_backward(dcbar, cbar_tanh_cache)

    # Back through each gate's fully-connected layer.
    dXo, dWo, dbo = l.fc_backward(do, o_fc_cache)
    dXc, dWc, dbc = l.fc_backward(dcbar, cbar_fc_cache)
    dXi, dWi, dbi = l.fc_backward(di, i_fc_cache)
    dXf, dWf, dbf = l.fc_backward(df, f_fc_cache)

    # All gates consumed [h_prev, x]; take the h_prev slice of the summed grads.
    dX = dXo + dXc + dXi + dXf
    dh_next = dX[:, :self.H]
    dc_next = f_gate * dc

    grad = dict(
        Wf=dWf, Wi=dWi, Wc=dWc, Wo=dWo, Wy=dWy,
        bf=dbf, bi=dbi, bc=dbc, bo=dbo, by=dby,
    )
    return grad, (dh_next, dc_next)
def backward(self, y_pred, y_train, dh_next, cache):
    """One-step backward pass for a vanilla (tanh) RNN cell.

    Returns (grad, dh_next): a dict of parameter gradients and the
    gradient flowing back to the previous hidden state.
    """
    X, Whh, h, h_prev, y, h_cache, y_cache = cache

    # Cross-entropy (softmax) gradient at the output.
    dscore = loss_fun.dcross_entropy(y_pred, y_train)

    # Hidden-to-output layer.
    dh, dWhy, dby = l.fc_backward(dscore, y_cache)
    dh += dh_next  # fold in the gradient arriving from the next time step
    dby = dby.reshape((1, -1))  # keep the output-bias gradient as a row vector

    # Back through the tanh nonlinearity.
    dh = l.tanh_backward(dh, h_cache)

    # Recurrent and input projections share the post-tanh gradient.
    dbh = dh
    dWhh = h_prev.T @ dh
    dWxh = X.T @ dh
    dh_prev = dh @ Whh.T

    grad = dict(Wxh=dWxh, Whh=dWhh, Why=dWhy, bh=dbh, by=dby)
    return grad, dh_prev