def _backward_propagation(self, seq, y_):
    """
    Parameters
    ----------
    seq : list
        Variable length sequence of elements in the vocabulary.
        This is needed both for its length and for its input
        representations.
    y_ : list
        The label vector.

    Returns
    -------
    tuple
        The matrices of derivatives (d_W_hy, d_b, d_W_hh, d_W_xh).

    """
    # Output errors:
    y_err = self.y
    y_err[np.argmax(y_)] -= 1
    h_err = y_err.dot(self.W_hy.T) * d_tanh(self.h[-1])
    d_W_hy = np.outer(self.h[-1], y_err)
    d_b = y_err
    # For accumulating the gradients through time:
    d_W_hh = np.zeros(self.W_hh.shape)
    d_W_xh = np.zeros(self.W_xh.shape)
    # Back-prop through time; the +1 is because the 0th
    # hidden state is the all-0s initial state.
    num_steps = len(seq) + 1
    for t in reversed(range(1, num_steps)):
        d_W_hh += np.outer(self.h[t], h_err)
        word_rep = self.get_word_rep(seq[t - 1])
        d_W_xh += np.outer(word_rep, h_err)
        h_err = h_err.dot(self.W_hh.T) * d_tanh(self.h[t])
    return (d_W_hy, d_b, d_W_hh, d_W_xh)
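The helper d_tanh is used throughout these backward passes but is not defined in this section. A minimal sketch, assuming the convention suggested by the calls above, where it is applied to values that are already tanh activations (the hidden states):

def d_tanh(z):
    # Derivative of tanh written in terms of the tanh output:
    # if a = tanh(x), then d/dx tanh(x) = 1 - a ** 2.
    # Assumed definition; works elementwise on floats or np.arrays.
    return 1.0 - z ** 2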
def _backward_propagation(self, X, Y):
    """Backward pass for a network with one tanh hidden layer and a
    sigmoid output unit, reading the values cached during the forward
    pass and storing the gradients in `self.grads`."""
    m = X.shape[1]
    W1 = self.params['W1']
    W2 = self.params['W2']
    Z1 = self.caches['Z1']
    A1 = self.caches['A1']
    Z2 = self.caches['Z2']
    A2 = self.caches['A2']
    # dZ2 = dA2 * dA2/dZ2. With a sigmoid output and cross-entropy loss
    # this simplifies to the shortcut A2 - Y:
    # A2 - Y = ( -Y/A2 + (1-Y)/(1-A2) ) * ( A2*(1-A2) )
    shortcut = True
    if shortcut:
        dZ2 = A2 - Y
    else:
        dA2 = -np.divide(Y, A2) + np.divide(1 - Y, 1 - A2)  # -(Y/A2) + (1-Y)/(1-A2)
        dZ2 = np.multiply(dA2, d_sigmoid(Z2))  # dA2 * g'(Z2)
    dW2 = np.dot(dZ2, A1.T) / float(m)
    db2 = np.sum(dZ2, axis=1, keepdims=True) / float(m)
    # dZ1 = np.dot(W2.T, dZ2) * (1 - np.power(A1, 2))
    dZ1 = np.dot(W2.T, dZ2) * d_tanh(Z1)  # dZ1 = dA1 * dA1/dZ1 (= dA1 * g'(Z1))
    dW1 = np.dot(dZ1, X.T) / float(m)
    db1 = np.sum(dZ1, axis=1, keepdims=True) / float(m)
    self.grads['dW2'] = dW2
    self.grads['db2'] = db2
    self.grads['dW1'] = dW1
    self.grads['db1'] = db1
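For orientation, a hedged sketch of the forward pass this backward pass assumes, reconstructed from the cache names and shapes used above; the method name, the bias parameters `b1`/`b2`, and the explicit sigmoid are assumptions not shown in this section:

def _forward_propagation(self, X):
    # Hypothetical sketch consistent with the caches read above:
    # one tanh hidden layer, a sigmoid output unit, and examples
    # stored as the columns of X.
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']
    Z1 = np.dot(W1, X) + b1           # hidden-layer pre-activations
    A1 = np.tanh(Z1)                  # hidden activations
    Z2 = np.dot(W2, A1) + b2          # output pre-activation
    A2 = 1.0 / (1.0 + np.exp(-Z2))    # sigmoid output probabilities
    self.caches = {'Z1': Z1, 'A1': A1, 'Z2': Z2, 'A2': A2}
    return A2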
def backward_propagation(self, vectree, predictions, ex, labels):
    root = self._get_vector_tree_root(vectree)
    # Output errors:
    y_err = predictions
    y_err[np.argmax(labels)] -= 1
    d_W_hy = np.outer(root, y_err)
    d_b_y = y_err
    # Internal error accumulation:
    d_W = np.zeros_like(self.W)
    d_b = np.zeros_like(self.b)
    h_err = y_err.dot(self.W_hy.T) * d_tanh(root)
    d_W, d_b = self._tree_backprop(vectree, h_err, d_W, d_b)
    return d_W_hy, d_b_y, d_W, d_b
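The helper `_get_vector_tree_root` is not shown in this section. A plausible sketch, assuming a leaf of the vector tree is a bare embedding and an internal node stores its combined representation as its last element:

def _get_vector_tree_root(self, vectree):
    # Assumed layout of the vector tree (compare the leaf test in
    # _tree_backprop below): leaves are np.ndarray embeddings, and
    # internal nodes are sequences whose final element is the node's
    # combined representation.
    if isinstance(vectree, np.ndarray):
        return vectree
    return vectree[-1]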
def backward_propagation(self, h, predictions, seq, labels):
    """
    Parameters
    ----------
    h : np.array, shape (m, self.hidden_dim)
        Matrix of hidden states. `m` is the length of the current
        example plus 1 (examples are allowed to vary in length);
        the 0th row is the all-0s initial state.
    predictions : np.array, dimension `len(self.classes)`
        Vector of predictions.
    seq : list of lists
        The original example.
    labels : np.array, dimension `len(self.classes)`
        One-hot vector giving the true label.

    Returns
    -------
    tuple
        The matrices of derivatives (d_W_hy, d_b, d_W_hh, d_W_xh).

    """
    # Output errors:
    y_err = predictions
    y_err[np.argmax(labels)] -= 1
    h_err = y_err.dot(self.W_hy.T) * d_tanh(h[-1])
    d_W_hy = np.outer(h[-1], y_err)
    d_b = y_err
    # For accumulating the gradients through time:
    d_W_hh = np.zeros(self.W_hh.shape)
    d_W_xh = np.zeros(self.W_xh.shape)
    # Back-prop through time; the +1 is because the 0th
    # hidden state is the all-0s initial state.
    num_steps = len(seq) + 1
    for t in reversed(range(1, num_steps)):
        d_W_hh += np.outer(h[t], h_err)
        word_rep = self.get_word_rep(seq[t - 1])
        d_W_xh += np.outer(word_rep, h_err)
        h_err = h_err.dot(self.W_hh.T) * d_tanh(h[t])
    return (d_W_hy, d_b, d_W_hh, d_W_xh)
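A hedged sketch of how the returned gradients might be consumed in a single training step; the forward-pass method name, the learning-rate attribute `eta`, the bias attribute `b`, and the plain SGD update are assumptions, not taken from this section:

def _sgd_step(self, seq, labels):
    # Hypothetical per-example update (the names forward_propagation,
    # eta, and b are assumed):
    h, predictions = self.forward_propagation(seq)
    d_W_hy, d_b, d_W_hh, d_W_xh = self.backward_propagation(
        h, predictions, seq, labels)
    self.W_hy -= self.eta * d_W_hy
    self.b -= self.eta * d_b
    self.W_hh -= self.eta * d_W_hh
    self.W_xh -= self.eta * d_W_xh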
def _tree_backprop(self, deep_tree, h_err, d_W, d_b):
    # This is the leaf-node condition for vector trees:
    if isinstance(deep_tree, np.ndarray):
        return d_W, d_b
    else:
        # Bias gradient:
        d_b += h_err
        # Get the left and right representations:
        left_subtree, right_subtree = deep_tree[0], deep_tree[1]
        left_rep = self._get_vector_tree_root(left_subtree)
        right_rep = self._get_vector_tree_root(right_subtree)
        # Combine them and update d_W:
        combined = np.concatenate((left_rep, right_rep))
        d_W += np.outer(combined, h_err)
        # Get the gradients for both child nodes:
        h_err = h_err.dot(self.W.T) * d_tanh(combined)
        # Split the gradients between the children and continue
        # backpropagation down each subtree:
        l_err = h_err[:self.embed_dim]
        r_err = h_err[self.embed_dim:]
        d_W, d_b = self._tree_backprop(left_subtree, l_err, d_W, d_b)
        d_W, d_b = self._tree_backprop(right_subtree, r_err, d_W, d_b)
        return d_W, d_b
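For context, a hedged sketch of the forward interpretation that would produce the vector tree this routine walks; the method name, the leaf test on `str`, and the embedding-lookup helper are assumptions, and the combination rule (tanh applied to W acting on the concatenated children, plus b) is inferred from the gradient computations above:

def _interpret(self, tree):
    # Hypothetical forward pass over a binary tree of vocabulary items.
    # Leaves become embedding lookups (np.ndarray); an internal node is
    # represented as [interpreted_left, interpreted_right, combined_rep],
    # so _get_vector_tree_root can read off the last element.
    if isinstance(tree, str):
        return self.get_word_rep(tree)  # assumed embedding-lookup helper
    left = self._interpret(tree[0])
    right = self._interpret(tree[1])
    left_rep = self._get_vector_tree_root(left)
    right_rep = self._get_vector_tree_root(right)
    combined = np.concatenate((left_rep, right_rep))
    root = np.tanh(combined.dot(self.W) + self.b)
    return [left, right, root]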
def test_d_tanh(arg, expected):
    assert np.array_equal(utils.d_tanh(arg), expected)
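This test is presumably driven by pytest parametrization. A hedged example of how it might be set up, assuming `utils.d_tanh` implements the derivative of tanh in terms of the tanh output, i.e. 1 - z**2; the test cases below are illustrative, not taken from the source:

import numpy as np
import pytest

import utils


@pytest.mark.parametrize("arg, expected", [
    # Illustrative cases under the assumed definition d_tanh(z) = 1 - z**2:
    [np.array([0.0]), np.array([1.0])],
    [np.array([1.0, -1.0]), np.array([0.0, 0.0])],
])
def test_d_tanh(arg, expected):
    assert np.array_equal(utils.d_tanh(arg), expected)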