def forwardProp(self, node, correct=[], guess=[]):
    cost = total = 0.0
    # this is exactly the same setup as forwardProp in rnn.py
    if node.isLeaf == True:
        node.fprop = True
        node.hActs1 = self.L[:, node.word]
        node.hActs2 = self.ReLU(self.W2.dot(node.hActs1) + self.b2)
        node.probs = softmax(self.Ws.dot(node.hActs2 * self.mask) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
        return cost, 1
    c1, t1 = self.forwardProp(node.left, correct, guess)
    c2, t2 = self.forwardProp(node.right, correct, guess)
    if node.left.fprop and node.right.fprop:
        node.fprop = True
        h = np.hstack([node.left.hActs1, node.right.hActs1])
        node.hActs1 = self.ReLU(self.W1.dot(h) + self.b1)
        node.hActs2 = self.ReLU(self.W2.dot(node.hActs1) + self.b2)
        node.probs = softmax(self.Ws.dot(node.hActs2 * self.mask) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
    cost += c1
    cost += c2
    total += t1
    total += t2
    return cost, total + 1

def forwardProp(self, node, correct=[], guess=[]):
    cost = total = 0.0
    # this is exactly the same setup as forwardProp in rnn.py
    if node.isLeaf == True:
        node.fprop = True
        node.hActs1 = self.L[:, node.word]
        node.hActs2 = self.ReLU(self.W2.dot(node.hActs1) + self.b2)
        node.probs = softmax(self.Ws.dot(node.hActs2) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
        return cost, 1
    c1, t1 = self.forwardProp(node.left, correct, guess)
    c2, t2 = self.forwardProp(node.right, correct, guess)
    if node.left.fprop and node.right.fprop:
        node.fprop = True
        h = np.hstack([node.left.hActs1, node.right.hActs1])
        node.hActs1 = self.ReLU(self.W1.dot(h) + self.b1)
        node.hActs2 = self.ReLU(self.W2.dot(node.hActs1) + self.b2)
        node.probs = softmax(self.Ws.dot(node.hActs2) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
    cost += c1
    cost += c2
    total += t1
    total += t2
    return cost, total + 1

def backprop(self, xs, ys, hs, y_hat):
    ns = len(xs)
    h_final = hs[ns - 1]
    delta = self.params.weights * (y_hat - ys)
    self.grads.b2 += delta
    ht = h_final.reshape(len(h_final), 1)
    delta = delta.reshape(len(ys), 1)
    self.grads.U += delta.dot(ht.T)
    # H and L
    t = ns - 1  # last t
    current = self.params.U.T.dot(delta) * ht * (1 - ht)  # the common part
    prev_ht = hs[t - 1].reshape(len(hs[t - 1]), 1)
    self.grads.H += current.dot(prev_ht.T)
    self.grads.b1 += current.reshape((len(current),))
    xt = make_onehot(xs[t], self.vdim).reshape(self.vdim, 1)
    self.sgrads.L[xs[t]] = xt.dot(current.T)[xs[t]]
    for i in range(1, self.bptt):
        if t < i:  # so that h[-2] doesn't return anything
            continue
        ht_i = hs[t - i].reshape(len(hs[t - i]), 1)
        prev_ht_i = hs[t - i - 1].reshape(len(hs[t - i - 1]), 1)
        current = self.params.H.T.dot(current) * ht_i * (1 - ht_i)
        self.grads.H += current.dot(prev_ht_i.T)
        self.grads.b1 += current.reshape((len(current),))
        prev_xt = make_onehot(xs[t - i], self.vdim).reshape(self.vdim, 1)
        self.sgrads.L[xs[t - i]] = prev_xt.dot(current.T)[xs[t - i]]

def backprop(self, xs, ys, hs, y_hat):
    ns = len(xs)
    h_final = hs[ns - 1]
    delta = (y_hat - ys)
    self.grads.b2 += delta
    ht = h_final.reshape(len(h_final), 1)
    delta = delta.reshape(len(ys), 1)
    self.grads.U += delta.dot(ht.T)
    # H and L
    t = ns - 1  # last t
    current = self.params.U.T.dot(delta) * ht * (1 - ht)  # the common part
    prev_ht = hs[t - 1].reshape(len(hs[t - 1]), 1)
    self.grads.H += current.dot(prev_ht.T)
    self.grads.b1 += current.reshape((len(current),))
    xt = make_onehot(xs[t], self.vdim).reshape(self.vdim, 1)
    self.sgrads.L[xs[t]] = xt.dot(current.T)[xs[t]]
    for i in range(1, self.bptt):
        if t < i:  # so that h[-2] doesn't return anything
            continue
        ht_i = hs[t - i].reshape(len(hs[t - i]), 1)
        prev_ht_i = hs[t - i - 1].reshape(len(hs[t - i - 1]), 1)
        current = self.params.H.T.dot(current) * ht_i * (1 - ht_i)
        self.grads.H += current.dot(prev_ht_i.T)
        self.grads.b1 += current.reshape((len(current),))
        prev_xt = make_onehot(xs[t - i], self.vdim).reshape(self.vdim, 1)
        self.sgrads.L[xs[t - i]] = prev_xt.dot(current.T)[xs[t - i]]

def forwardProp(self, node, correct, guess):
    cost = total = 0.0
    if node.isLeaf == True:
        node.fprop = True
        node.hActs1 = self.L[:, node.word]
        node.probs = softmax(self.Ws.dot(node.hActs1) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
        return cost, 1
    c1, t1 = self.forwardProp(node.left, correct, guess)
    c2, t2 = self.forwardProp(node.right, correct, guess)
    if node.left.fprop and node.right.fprop:
        node.fprop = True
        h = np.hstack([node.left.hActs1, node.right.hActs1])
        tmp = np.zeros(len(node.left.hActs1))
        for i in range(len(tmp)):
            tmp[i] = h.dot(self.V[i]).dot(h)
        node.hActs1 = self.ReLU(self.W.dot(h) + self.b + tmp)
        node.probs = softmax(self.Ws.dot(node.hActs1) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
    cost += c1
    cost += c2
    total += t1
    total += t2
    return cost, total + 1
def backprop(self, xs, ys, hs_f, hs_b, y_hat):
    inverted_xs = list(reversed(xs))
    ns = len(xs)
    ht_f = hs_f[ns - 1].reshape(len(hs_f[ns - 1]), 1)
    ht_b = hs_b[ns - 1].reshape(len(hs_b[ns - 1]), 1)
    delta = (y_hat - ys)
    self.grads.b2 += delta
    delta = delta.reshape(len(ys), 1)
    self.grads.U += delta.dot(hstack([ht_f, ht_b]).reshape((1, 2 * len(ht_f))))
    # H and L
    t = ns - 1  # last t
    current_f = self.params.U.T.dot(delta)[:self.hdim] * ht_f * (1 - ht_f)
    current_b = self.params.U.T.dot(delta)[self.hdim:] * ht_b * (1 - ht_b)  # the common part
    # update initial Hs
    prev_ht_f = hs_f[t - 1].reshape(len(hs_f[t - 1]), 1)
    self.grads.H_f += current_f.dot(prev_ht_f.T)
    self.grads.b1_f += current_f.reshape((len(current_f),))
    prev_ht_b = hs_b[t - 1].reshape(len(hs_b[t - 1]), 1)
    self.grads.H_b += current_b.dot(prev_ht_b.T)
    self.grads.b1_b += current_b.reshape((len(current_b),))
    # update initial L
    xt = make_onehot(xs[t], self.vdim).reshape(self.vdim, 1)
    self.sgrads.L[xs[t]] = xt.dot(current_f.T)[xs[t]]
    inv_xt = make_onehot(inverted_xs[t], self.vdim).reshape(self.vdim, 1)
    self.sgrads.L[inverted_xs[t]] = inv_xt.dot(current_b.T)[inverted_xs[t]]
    # update the rest
    for i in range(1, self.bptt):
        if t < i:  # so that h[-2] doesn't return anything
            continue
        ht_f_i = hs_f[t - i].reshape(len(hs_f[t - i]), 1)
        prev_ht_f_i = hs_f[t - i - 1].reshape(len(hs_f[t - i - 1]), 1)
        current_f = self.params.H_f.T.dot(current_f) * ht_f_i * (1 - ht_f_i)
        self.grads.H_f += current_f.dot(prev_ht_f_i.T)
        self.grads.b1_f += current_f.reshape((len(current_f),))
        ht_b_i = hs_b[t - i].reshape(len(hs_b[t - i]), 1)
        prev_ht_b_i = hs_b[t - i - 1].reshape(len(hs_b[t - i - 1]), 1)
        current_b = self.params.H_b.T.dot(current_b) * ht_b_i * (1 - ht_b_i)
        self.grads.H_b += current_b.dot(prev_ht_b_i.T)
        self.grads.b1_b += current_b.reshape((len(current_b),))
        prev_xt = make_onehot(xs[t - i], self.vdim).reshape(self.vdim, 1)
        self.sgrads.L[xs[t - i]] = prev_xt.dot(current_f.T)[xs[t - i]]
        prev_inv_xt = make_onehot(inverted_xs[t - i], self.vdim).reshape(self.vdim, 1)
        self.sgrads.L[inverted_xs[t - i]] = prev_inv_xt.dot(current_b.T)[inverted_xs[t - i]]

def backprop(self, xs, ys, hs_f, hs_b, y_hat):
    inverted_xs = list(reversed(xs))
    ns = len(xs)
    ht_f = hs_f[ns - 1].reshape(len(hs_f[ns - 1]), 1)
    ht_b = hs_b[ns - 1].reshape(len(hs_b[ns - 1]), 1)
    delta = self.params.weights * (y_hat - ys)
    self.grads.b2 += delta
    delta = delta.reshape(len(ys), 1)
    self.grads.U += delta.dot(hstack([ht_f, ht_b]).reshape((1, 2 * len(ht_f))))
    # H and L
    t = ns - 1  # last t
    current_f = self.params.U.T.dot(delta)[:self.hdim] * ht_f * (1 - ht_f)
    current_b = self.params.U.T.dot(delta)[self.hdim:] * ht_b * (1 - ht_b)  # the common part
    # update initial Hs
    prev_ht_f = hs_f[t - 1].reshape(len(hs_f[t - 1]), 1)
    self.grads.H_f += current_f.dot(prev_ht_f.T)
    self.grads.b1_f += current_f.reshape((len(current_f),))
    prev_ht_b = hs_b[t - 1].reshape(len(hs_b[t - 1]), 1)
    self.grads.H_b += current_b.dot(prev_ht_b.T)
    self.grads.b1_b += current_b.reshape((len(current_b),))
    # update initial L
    xt = make_onehot(xs[t], self.vdim).reshape(self.vdim, 1)
    self.sgrads.L[xs[t]] = xt.dot(current_f.T)[xs[t]]
    inv_xt = make_onehot(inverted_xs[t], self.vdim).reshape(self.vdim, 1)
    self.sgrads.L[inverted_xs[t]] = inv_xt.dot(current_b.T)[inverted_xs[t]]
    # update the rest
    for i in range(1, self.bptt):
        if t < i:  # so that h[-2] doesn't return anything
            continue
        ht_f_i = hs_f[t - i].reshape(len(hs_f[t - i]), 1)
        prev_ht_f_i = hs_f[t - i - 1].reshape(len(hs_f[t - i - 1]), 1)
        current_f = self.params.H_f.T.dot(current_f) * ht_f_i * (1 - ht_f_i)
        self.grads.H_f += current_f.dot(prev_ht_f_i.T)
        self.grads.b1_f += current_f.reshape((len(current_f),))
        ht_b_i = hs_b[t - i].reshape(len(hs_b[t - i]), 1)
        prev_ht_b_i = hs_b[t - i - 1].reshape(len(hs_b[t - i - 1]), 1)
        current_b = self.params.H_b.T.dot(current_b) * ht_b_i * (1 - ht_b_i)
        self.grads.H_b += current_b.dot(prev_ht_b_i.T)
        self.grads.b1_b += current_b.reshape((len(current_b),))
        prev_xt = make_onehot(xs[t - i], self.vdim).reshape(self.vdim, 1)
        self.sgrads.L[xs[t - i]] = prev_xt.dot(current_f.T)[xs[t - i]]
        prev_inv_xt = make_onehot(inverted_xs[t - i], self.vdim).reshape(self.vdim, 1)
        self.sgrads.L[inverted_xs[t - i]] = prev_inv_xt.dot(current_b.T)[inverted_xs[t - i]]
def b_prop(self, ys):
    #L = self.params['L']
    Wh = self.params['Wh']
    #Wx = self.params['Wx']
    U = self.params['U']
    b1 = self.params['b1']
    b2 = self.params['b2']
    N = len(ys)
    delta_above = np.zeros(self.hdim)
    for t in xrange(N - 1, -1, -1):
        delta_3 = self.yhats[:, t] - make_onehot(ys[t], self.outdim)
        self.grads['U'] += np.outer(delta_3, self.hs[:, t])
        self.grads['b2'] += delta_3
        dh = np.dot(np.transpose(U), delta_3) + delta_above
        delta_2 = dh * (self.hs[:, t] > 0)
        self.grads['b1'] += delta_2
        self.grads['Wh'] += np.outer(delta_2, self.hs[:, t - 1])
        #self.grads['Wx'] += np.outer(delta_2, L[:,xs[t]])
        #self.grads['L'][:,xs[t]] += np.dot(np.transpose(Wx), delta_2)
        delta_below = np.dot(np.transpose(Wh), delta_2)
        delta_above = delta_below
    return delta_below

def compute_loss(self, windows, labels):
    """
    Compute the loss for a given dataset.
    windows = same as for predict_proba
    labels = list of class labels, for each row of windows
    """
    #### YOUR CODE HERE ####
    print "windows shape ", windows.shape
    x = self.sparams.L[windows[:, 0]]
    for i in range(len(windows[0]) - 1):
        x = np.concatenate((x, self.sparams.L[windows[:, i + 1]]), axis=1)
    z = self.params.W.dot(x.T) + self.params.b1.reshape((self.params.b1.shape[0], 1))
    h = tanh(z)
    p = softmax(self.params.U.dot(h) + self.params.b2.reshape((self.params.b2.shape[0], 1)))
    labelArray = np.zeros((len(labels), self.params.b2.shape[0]))
    for i in range(len(labels)):
        labelArray[i] = make_onehot(labels[i], self.params.b2.shape[0])
    batch = len(labels)
    p = p * labelArray.T
    p = np.sum(p, axis=0)
    J = np.sum(-np.log(p))
    Jreg = batch * (self.lreg / 2.0) * (np.sum(self.params.W ** 2) + np.sum(self.params.U ** 2))
    J += Jreg
    #### END YOUR CODE ####
    return J

def backProp(self, node, error=None):
    # Clear nodes
    node.fprop = False
    ################
    # TODO: Implement the recursive backProp function
    #  - you should update self.dWs, self.dbs, self.dW, self.db, and self.dL[node.word] accordingly
    #  - node: your current node in the parse tree
    #  - error: error that has been passed down from a previous iteration
    ################
    errorCur = node.probs - make_onehot(node.label, len(self.bs))
    self.dWs += np.outer(errorCur, node.hActs1)
    self.dbs += errorCur
    errorCur = errorCur.dot(self.Ws)
    if error is not None:
        errorCur += error
    if node.isLeaf == True:
        self.dL[node.word] += errorCur
        return
    errorCur = errorCur * self.df(node.hActs1)
    self.dW += np.outer(errorCur, np.hstack([node.left.hActs1, node.right.hActs1]))
    self.db += errorCur
    errorDown = errorCur.dot(self.W)
    self.backProp(node.left, errorDown[:self.wvecDim])
    self.backProp(node.right, errorDown[self.wvecDim:])

def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss for an input sequence xs
    and output sequence (labels) ys.

    You should run the RNN forward, compute cross-entropy loss at each
    timestep, and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    ns = len(xs)
    hs = np.zeros((ns + 1, self.hdim))
    for i in range(ns):
        hs[i + 1] = sigmoid(self.params.H.dot(hs[i]) + self.params.W.dot(self.sparams.L[xs[i]]))
        p = softmax(self.params.U.dot(hs[i + 1]))
        p = p * make_onehot(ys[i], self.vdim)
        J += -np.log(np.sum(p))
    #### END YOUR CODE ####
    Jreg = 0.5 * self.lreg * (np.sum(self.params.H ** 2) + np.sum(self.params.W ** 2) + np.sum(self.params.U ** 2))
    return J + Jreg
def compute_loss(self, windows, labels):
    """
    Compute the loss for a given dataset.
    windows = same as for predict_proba
    labels = list of class labels, for each row of windows
    """
    #### YOUR CODE HERE ####
    if not hasattr(windows[0], "__iter__"):
        windows = [windows]
        labels = [labels]
    N = len(windows)
    # x = self.sparams.L[windows]
    # x = x.reshape((N, x.shape[-2]*x.shape[-1]))
    # z = x.dot(self.params.W.T) + self.params.b1
    # h = tanh(z)
    # z2 = h.dot(self.params.U.T) + self.params.b2
    # p = softmax(z2)
    # J -= sum(log(p[0][labels]))
    # J += (self.lreg / 2.0) * (sum(self.params.W**2.0) + sum(self.params.U**2.0))
    J = 0
    for n in xrange(N):
        x = self.sparams.L[windows[n]]
        x = reshape(x, x.shape[0] * x.shape[1])
        h = tanh(self.params.W.dot(x) + self.params.b1)
        y_hat = softmax(self.params.U.dot(h) + self.params.b2)
        y = make_onehot(labels[n], len(y_hat))
        J -= sum(y * log(y_hat))
    J += (self.lreg / 2.0) * (sum(self.params.W**2.0) + sum(self.params.U**2.0))
    #### END YOUR CODE ####
    return J
def backProp(self, node, error=None):
    # Clear nodes
    node.fprop = False
    # this is exactly the same setup as backProp in rnn.py
    errorCur = node.probs - make_onehot(node.label, len(self.bs))
    self.dWs += np.outer(errorCur, node.hActs2)
    self.dbs += errorCur
    errorCur = errorCur.dot(self.Ws) * self.df(node.hActs2)
    self.dW2 += np.outer(errorCur, node.hActs1)
    self.db2 += errorCur
    errorCur = errorCur.dot(self.W2)
    if error is not None:
        errorCur += error
    if node.isLeaf == True:
        self.dL[node.word] += errorCur
        return
    errorCur = errorCur * self.df(node.hActs1)
    tmp1 = np.ones(self.W1.shape).dot(np.diag(np.hstack([node.left.hActs1, node.right.hActs1])))
    self.dW1 += np.diag(errorCur).dot(tmp1)
    self.db1 += errorCur
    errorCur = errorCur.dot(self.W1)
    self.backProp(node.left, errorCur[:self.wvecDim])
    self.backProp(node.right, errorCur[self.wvecDim:])

def _acc_grads(self, x, label):
    """
    Accumulate gradients from a training example.
    """
    #import ipdb; ipdb.set_trace()
    ##
    # Forward Pass
    zs, hs = self.forward_pass(x)
    y_hat = hs[-1]
    y = make_onehot(label, self.outputsize)
    delta = y_hat - y
    cur_h = len(hs) - 2  # current h vector index
    cur_z = len(zs) - 2  # current z vector index
    # Backpropagation
    #import ipdb; ipdb.set_trace()
    for i in range(len(self.dims) - 1, 0, -1):
        self._add_grads('b', i, delta)
        curw = self._get_param('W', i)
        gradw = np.outer(delta, hs[cur_h])
        gradw_reg = self.lreg * curw
        self._add_grads('W', i, gradw + gradw_reg)
        if cur_z >= 0:
            delta = np.dot(curw.T, delta) * self.act_grad(zs[cur_z])
        cur_h -= 1
        cur_z -= 1

def backProp(self, node, error=None):
    # Clear nodes
    node.fprop = False
    # this is exactly the same setup as backProp in rnn.py
    errorCur = node.probs - make_onehot(node.label, len(self.bs))
    self.dWs += np.outer(errorCur, node.hActs2 * self.mask)
    self.dbs += errorCur
    errorCur = errorCur.dot(self.Ws) * self.df(node.hActs2) * self.mask
    self.dW2 += np.outer(errorCur, node.hActs1)
    self.db2 += errorCur
    errorCur = errorCur.dot(self.W2)
    if error is not None:
        errorCur += error
    if node.isLeaf == True:
        self.dL[node.word] += errorCur
        return
    errorCur = errorCur * self.df(node.hActs1)
    tmp1 = np.ones(self.W1.shape).dot(np.diag(np.hstack([node.left.hActs1, node.right.hActs1])))
    self.dW1 += np.diag(errorCur).dot(tmp1)
    self.db1 += errorCur
    errorCur = errorCur.dot(self.W1)
    self.backProp(node.left, errorCur[:self.wvecDim])
    self.backProp(node.right, errorCur[self.wvecDim:])

def backProp(self, node, error=None):
    # Clear nodes
    node.fprop = False
    errorCur = node.probs - make_onehot(node.label, len(self.bs))
    self.dWs += np.outer(errorCur, node.hActs1)
    self.dbs += errorCur
    errorCur = errorCur.dot(self.Ws)
    if error is not None:
        errorCur += error
    if node.isLeaf == True:
        self.dL[node.word] += errorCur
        return
    errorCur = errorCur * self.df(node.hActs1)
    LR = np.hstack([node.left.hActs1, node.right.hActs1])
    self.dW += np.outer(errorCur, LR)
    self.db += errorCur
    S = np.zeros(len(LR))
    for i in range(len(self.V)):
        self.dV[i] += errorCur[i] * np.outer(LR, LR)
        S += (self.V[i] + self.V[i].T).dot(LR) * errorCur[i]
    errorDown = errorCur.dot(self.W) + S
    self.backProp(node.left, errorDown[:self.wvecDim])
    self.backProp(node.right, errorDown[self.wvecDim:])
def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads, in order
    for gradient_check and training to work.
    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    xf = []
    for idx in window:
        xf.extend(self.sparams.L[idx])  # extract representation
    tanhX = tanh(self.params.W.dot(xf) + self.params.b1)
    softmaxP = softmax(self.params.U.dot(tanhX) + self.params.b2)
    y = make_onehot(label, len(softmaxP))
    delta2 = softmaxP - y
    self.grads.U += outer(delta2, tanhX) + self.lreg * self.params.U
    self.grads.b2 += delta2
    delta1 = self.params.U.T.dot(delta2) * (1. - tanhX * tanhX)
    self.grads.W += outer(delta1, xf) + self.lreg * self.params.W
    self.grads.b1 += delta1

def _acc_grads(self, x, label):
    """
    Accumulate gradients from a training example.
    """
    #import ipdb; ipdb.set_trace()
    ##
    # Forward propagation
    z1 = self.params.W.dot(x) + self.params.b1
    h1 = tanh(z1)
    z2 = np.dot(self.params.U, h1) + self.params.b2
    h2 = tanh(z2)
    z3 = np.dot(self.params.G, h2) + self.params.b3
    y_hat = softmax(z3)
    y = make_onehot(label, self.outputsize)
    d3 = y_hat - y
    self.grads.b3 += d3
    self.grads.G += np.outer(d3, h2) + self.lreg * self.params.G
    d2 = np.dot(self.params.G.T, d3) * tanhd(z2)
    self.grads.b2 += d2
    self.grads.U += np.outer(d2, h1) + self.lreg * self.params.U
    d1 = np.dot(self.params.U.T, d2) * tanhd(z1)
    self.grads.W += np.outer(d1, x) + self.lreg * self.params.W
    self.grads.b1 += d1

def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads, in order
    for gradient_check and training to work.
    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    x = hstack(self.sparams.L[window, :])
    h = tanh(2 * (self.params.W.dot(x) + self.params.b1))
    p = softmax(self.params.U.dot(h) + self.params.b2)
    ##
    y = make_onehot(label, 5)
    delta = p - y
    # Backpropagation
    self.grads.U += outer(delta, h) + self.lreg * self.params.U
    self.grads.b2 += delta
    gradh = dot(self.params.U.T, delta) * (1 - h**2)
    self.grads.W += outer(gradh, x) + self.lreg * self.params.W
    self.grads.b1 += gradh
    dL = self.params.W.T.dot(gradh).reshape(self.window_size, self.word_vec_size)
    for i in xrange(self.window_size):
        self.sgrads.L[window[i], :] = dL[i]
def _acc_grads(self, xs, ys):
    """
    Accumulate gradients, given a pair of training sequences:
    xs = [<indices>] # input words
    ys = [<indices>] # output words (to predict)

    Your code should update self.grads and self.sgrads,
    in order for gradient_check and training to work.
    So, for example:
    self.grads.H += (your gradient dJ/dH)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # update row

    Per the handout, you should:
    - make predictions by running forward in time through the entire input sequence
    - for *each* output word in ys, compute the gradients with respect to
      the cross-entropy loss for that output word
    - run backpropagation-through-time for self.bptt timesteps, storing grads
      in self.grads (for H) and self.sgrads (for L,U)

    You'll want to store your predictions \hat{y}(t) and the hidden layer
    values h(t) as you run forward, so that you can access them during
    backpropagation. At time 0, you should initialize the hidden layer to
    be a vector of zeros.
    """
    # Expect xs as list of indices
    ns = len(xs)  #3
    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    hs = zeros((ns + 1, self.hdim))
    # predicted probas
    ps = zeros((ns, self.vdim))
    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    # for each time step
    for t in xrange(ns):
        hs[t] = sigmoid(dot(self.params.H, hs[t - 1]) + self.sparams.L[xs[t]])
        ps[t] = softmax(dot(self.params.U, hs[t]))
    ##
    # Backward propagation through time
    for j in xrange(ns):
        y = make_onehot(ys[j], self.vdim)
        y_hat_minus_y = ps[j] - y
        self.grads.U += outer(y_hat_minus_y, hs[j])
        delta = dot(self.params.U.T, y_hat_minus_y) * hs[j] * (1.0 - hs[j])
        # start at j and go back self.bptt times
        # (total self.bptt + 1 elements, including current one)
        for t in xrange(j, j - self.bptt - 1, -1):
            if t - 1 >= -1:
                self.grads.H += outer(delta, hs[t - 1])  # See from above.. hs[-1] is list of zeros.
                self.sgrads.L[xs[t]] = delta
                delta = dot(self.params.H.T, delta) * hs[t - 1] * (1.0 - hs[t - 1])
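# The following is an illustrative, self-contained NumPy sketch (not part of the original
# assignment code) of the same truncated-BPTT accumulation that _acc_grads above performs,
# written with plain arrays in place of the class's params/grads/sgrads containers; the
# function name, argument names, and shapes here are assumptions made for this sketch.
import numpy as np

def _softmax(v):
    e = np.exp(v - v.max())
    return e / e.sum()

def _sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

def bptt_grads(xs, ys, L, H, U, bptt=3):
    ns, hdim, vdim = len(xs), H.shape[0], U.shape[0]
    hs = np.zeros((ns + 1, hdim))   # hs[-1] stays zero: the initial hidden state
    ps = np.zeros((ns, vdim))
    for t in range(ns):             # forward pass: h(t) = sigmoid(H h(t-1) + L[x(t)])
        hs[t] = _sigmoid(H.dot(hs[t - 1]) + L[xs[t]])
        ps[t] = _softmax(U.dot(hs[t]))
    dH, dU, dL = np.zeros_like(H), np.zeros_like(U), np.zeros_like(L)
    for j in range(ns):             # cross-entropy gradient at each output position
        d3 = ps[j].copy()
        d3[ys[j]] -= 1.0            # yhat - y
        dU += np.outer(d3, hs[j])
        delta = U.T.dot(d3) * hs[j] * (1.0 - hs[j])
        for t in range(j, max(j - bptt - 1, -1), -1):   # truncated backprop through time
            dH += np.outer(delta, hs[t - 1])
            dL[xs[t]] += delta
            delta = H.T.dot(delta) * hs[t - 1] * (1.0 - hs[t - 1])
    return dH, dU, dL

# Example usage with hypothetical sizes (vdim=10, hdim=5):
#   L, H, U = [0.1 * np.random.randn(*s) for s in [(10, 5), (5, 5), (10, 5)]]
#   dH, dU, dL = bptt_grads([1, 4, 2], [4, 2, 0], L, H, U, bptt=2)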
def predict(self, node, correct=[], guess=[]):
    cost = total = 0.0
    # this is exactly the same setup as forwardProp in rnn.py
    if node.isLeaf == True:
        node.fprop = True
        node.hActs1 = self.L[:, node.word]
        #node.hActs2 = self.ReLU(self.W2.dot(node.hActs1)+self.b2)
        tmp = node.hActs1 * self.dropoutP
        tmpMaxout = np.zeros((self.maxoutK, self.middleDim))
        for i in range(self.maxoutK):
            tmpMaxout[i] = self.W2[i].dot(tmp) + self.b2[i]
        (node.hActs2, node.idx) = self.maxout(tmpMaxout)
        node.probs = softmax((self.Ws * self.dropoutP).dot(node.hActs1) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
        return cost, 1
    c1, t1 = self.forwardProp(node.left, correct, guess)
    c2, t2 = self.forwardProp(node.right, correct, guess)
    if node.left.fprop and node.right.fprop:
        node.fprop = True
        h = np.hstack([node.left.hActs1, node.right.hActs1])
        node.hActs1 = self.ReLU(self.W1.dot(h) + self.b1)
        #node.hActs2 = self.ReLU(self.W2.dot(node.hActs1)+self.b2)
        tmp = node.hActs1 * self.dropoutP
        tmpMaxout = np.zeros((self.maxoutK, self.middleDim))
        for i in range(self.maxoutK):
            tmpMaxout[i] = self.W2[i].dot(tmp) + self.b2[i]
        (node.hActs2, node.idx) = self.maxout(tmpMaxout)
        node.probs = softmax((self.Ws * self.dropoutP).dot(node.hActs2) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
    cost += c1
    cost += c2
    total += t1
    total += t2
    return cost, total + 1

def forwardProp(self, node, correct=[], guess=[]):
    cost = total = 0.0
    # this is exactly the same setup as forwardProp in rnn.py
    if node.isLeaf == True:
        node.fprop = True
        node.hActs1 = self.L[:, node.word]
        #node.hActs2 = self.ReLU(self.W2.dot(node.hActs1)+self.b2)
        tmp = node.hActs1 * self.mask1
        tmpMaxout = np.zeros((self.maxoutK, self.middleDim))
        for i in range(self.maxoutK):
            tmpMaxout[i] = self.W2[i].dot(tmp) + self.b2[i]
        (node.hActs2, node.idx) = self.maxout(tmpMaxout)
        node.probs = softmax(self.Ws.dot(node.hActs2 * self.mask) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
        return cost, 1
    c1, t1 = self.forwardProp(node.left, correct, guess)
    c2, t2 = self.forwardProp(node.right, correct, guess)
    if node.left.fprop and node.right.fprop:
        node.fprop = True
        h = np.hstack([node.left.hActs1, node.right.hActs1])
        node.hActs1 = self.ReLU(self.W1.dot(h) + self.b1)
        #node.hActs2 = self.ReLU(self.W2.dot(node.hActs1)+self.b2)
        tmp = node.hActs1 * self.mask1
        tmpMaxout = np.zeros((self.maxoutK, self.middleDim))
        for i in range(self.maxoutK):
            tmpMaxout[i] = self.W2[i].dot(tmp) + self.b2[i]
        (node.hActs2, node.idx) = self.maxout(tmpMaxout)
        node.probs = softmax(self.Ws.dot(node.hActs2 * self.mask) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
    cost += c1
    cost += c2
    total += t1
    total += t2
    return cost, total + 1

def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads, in order
    for gradient_check and training to work.
    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    words = [self.sparams.L[window[0]], self.sparams.L[window[1]], self.sparams.L[window[2]]]
    x = reshape(words, self.sparams.L.shape[1] * 3)  # 3n row vector
    z2 = self.params.W.dot(x) + self.params.b1
    a2 = tanh(z2)
    z3 = self.params.U.dot(a2) + self.params.b2
    a3 = softmax(z3)
    ##
    # Backpropagation
    y = make_onehot(label, len(a3))
    delta3 = a3 - y
    dJdU = outer(delta3, a2)
    dJdb2 = delta3
    delta2 = multiply((1 - square(a2)), self.params.U.T.dot(delta3))
    dJdW = outer(delta2, x)
    dJdb1 = delta2
    # Regularization
    regdJdW = self.lreg * self.params.W
    regdJdU = self.lreg * self.params.U
    self.grads.U += (dJdU + regdJdU)
    self.grads.b2 += dJdb2
    self.grads.W += (dJdW + regdJdW)
    self.grads.b1 += dJdb1
    dJdL = self.params.W.T.dot(delta2)
    dJDL_shaped = reshape(dJdL, (3, self.sparams.L.shape[1]))
    self.sgrads.L[window[0]] = dJDL_shaped[0]
    self.sgrads.L[window[1]] = dJDL_shaped[1]
    self.sgrads.L[window[2]] = dJDL_shaped[2]
def _acc_grads(self, xs, ys):
    # Expect xs as list of indices
    ns = len(xs)
    #print 'size of window ', ns
    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    hs = zeros((ns + 1, self.hdim))
    # predicted probas
    ps = zeros((ns, self.vdim))
    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    #print self.params.U.shape
    #print self.params.H.shape
    #print self.sparams.L.shape
    for t in range(0, ns):
        hs[t] = sigmoid(dot(hs[t - 1], self.params.H) + self.sparams.L[xs[t], :])
        ps[t] = softmax(dot(self.params.U, hs[t]))
    temp_ys = []
    for i in range(0, ns):
        temp_ys.append(make_onehot(ys[i], self.vdim))
    temp_ys = matrix(temp_ys)
    delta2 = ps - temp_ys
    ##
    # Backward propagation through time
    for t in range(0, ns)[::-1]:
        self.grads.U += outer(delta2[t], hs[t])
        delta1 = multiply(dot(delta2[t], self.params.U), hs[t] * (1 - hs[t]))
        for step in range(max(0, t - self.bptt), t + 1)[::-1]:
            self.grads.H += dot(delta1, hs[step - 1])
            self.sgrads.L[xs[step]] = delta1
            delta1 = multiply(dot(delta1, self.params.H), hs[step - 1] * (1 - hs[step - 1]))
def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads, in order
    for gradient_check and training to work.
    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    L = self.sparams.L
    U = self.params.U
    W = self.params.W
    b1 = self.params.b1
    b2 = self.params.b2
    windowSize = self.windowSize
    wordVecLen = self.wordVecLen
    lambda_ = self.lreg
    alpha = self.alpha
    ##
    # Forward propagation
    x = hstack(L[window, :])
    z1 = W.dot(x) + b1
    h = tanh(z1)
    z2 = U.dot(h) + b2
    y_hat = softmax(z2)
    ##
    # Backpropagation
    target = make_onehot(label, len(y_hat))
    delta = y_hat - target
    #self.grads.U += delta.dot(h.T) + lambda_ * U  # the outer function is very useful here
    self.grads.U += outer(delta, h) + lambda_ * U
    self.grads.b2 += delta
    grad_h = U.T.dot(delta) * (1 - h ** 2)
    self.grads.W += outer(grad_h, x) + lambda_ * W
    self.grads.b1 += grad_h
    sgrad_L = W.T.dot(grad_h)
    sgrad_L = sgrad_L.reshape(windowSize, wordVecLen)
    for i in xrange(windowSize):
        self.sgrads.L[window[i], :] = sgrad_L[i, :]
def forwardProp(self, node, correct=[], guess=[]):
    cost = total = 0.0
    # cost should be a running number and total is the total examples we have seen
    # used in accuracy reporting later
    ################
    # TODO: Implement the recursive forwardProp function
    #  - you should update node.probs, node.hActs1, node.fprop, and cost
    #  - node: your current node in the parse tree
    #  - correct: this is a running list of truth labels
    #  - guess: this is a running list of guess that our model makes
    #    (we will use both correct and guess to make our confusion matrix)
    ################
    if node.isLeaf == True:
        node.fprop = True
        node.hActs1 = self.L[:, node.word]
        node.probs = softmax(self.Ws.dot(node.hActs1) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
        return cost, 1
    c1, t1 = self.forwardProp(node.left, correct, guess)
    c2, t2 = self.forwardProp(node.right, correct, guess)
    if node.left.fprop and node.right.fprop:
        node.fprop = True
        h = np.hstack([node.left.hActs1, node.right.hActs1])
        node.hActs1 = self.ReLU(self.W.dot(h) + self.b)
        node.probs = softmax(self.Ws.dot(node.hActs1) + self.bs)
        p = node.probs * make_onehot(node.label, len(self.bs))
        cost = -np.log(np.sum(p))
        correct.append(node.label)
        guess.append(np.argmax(node.probs))
    cost += c1
    cost += c2
    total += t1
    total += t2
    return cost, total + 1
def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads, in order
    for gradient_check and training to work.
    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    (H, X) = self.params.W.shape   # (100, 150)
    (Dy, H) = self.params.U.shape  # (5, 100)
    ##
    # Forward propagation
    x = hstack(self.sparams.L[window])  # (150,) --> (X,)
    a = dot(self.params.W, x) + self.params.b1  # (H,)
    h = tanh(a)  # (H,)
    y_hat = softmax(dot(self.params.U, h) + self.params.b2)  # (Dy,)
    y = make_onehot(label, len(y_hat))
    delta = y_hat - y
    ##
    # Backpropagation
    # dJ/db2
    self.grads.b2 += delta
    # dJ/dU
    self.grads.U += outer(delta, h) + self.lreg * self.params.U
    # dJ/dW, dJ/db1
    # d_tanh(a) is (H,)
    #x1 = dot(self.params.U.T, delta.reshape((Dy, 1))).reshape((H,)) * d_tanh(a)
    x1 = dot(self.params.U.T, delta) * d_tanh(a)
    self.grads.W += outer(x1, x) + self.lreg * self.params.W
    self.grads.b1 += x1
    dL_updates = dot(self.params.W.T, x1.reshape((H, 1)))
    for pt in xrange(self.windowsize):
        f = dL_updates[pt * self.word_vec_size:(pt + 1) * self.word_vec_size]
        self.sgrads.L[window[pt]] = f.reshape((self.word_vec_size,))
def _acc_grads(self, xs, ys, d):
    # Expect xs as list of indices
    ns = len(xs)
    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    hs = zeros((ns + 1, self.hdim))
    # predicted probas
    ps = zeros((ns, self.vdim))
    zs = zeros((ns + 1, self.hdim))
    ##
    # Forward propagation
    d_vec = self.sparams.D[d]
    for t in xrange(ns):
        x_t = xs[t]
        zs[t] = self.params.H.dot(hs[t - 1]) + self.sparams.L[x_t] + d_vec
        hs[t] = sigmoid(zs[t])
        ps[t] = softmax(self.params.U.dot(hs[t]) + self.params.G.dot(d_vec.T).reshape(self.vdim,))
    ##
    # Backward propagation through time
    d_grad = zeros_like(self.sparams.D[0])
    for t in reversed(xrange(ns)):
        delta = zeros((ns, self.hdim))
        p_t = ps[t]
        eps_t = p_t - make_onehot(ys[t], len(p_t))
        self.grads.U += outer(eps_t, hs[t])
        self.grads.G += outer(eps_t, d_vec)
        d_grad += self.params.G.T.dot(eps_t)
        sig_prime_t = sigmoid(zs[t]) * (1. - sigmoid(zs[t]))
        delta[t] = sig_prime_t * self.params.U.T.dot(eps_t)
        self.sgrads.L[xs[t]] = delta[t].copy()
        d_grad += delta[t].copy()
        self.grads.H += outer(delta[t], hs[t - 1])
        for i in xrange(1, self.bptt):
            j = t - i
            if j < 0:
                continue
            sig_prime_j = sigmoid(zs[j]) * (1. - sigmoid(zs[j]))
            delta[j] = sig_prime_j * self.params.H.T.dot(delta[j + 1])
            self.sgrads.L[xs[j]] = delta[j].copy()
            d_grad += delta[j].copy()
            self.grads.H += outer(delta[j], hs[j - 1])
    self.sgrads.D[d] = d_grad.copy()
def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads, in order
    for gradient_check and training to work.
    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    a1 = hstack(self.sparams.L[window, :])
    z2 = self.params.W.dot(a1) + self.params.b1
    a2 = tanh(z2)  # h
    z3 = self.params.U.dot(a2) + self.params.b2
    y_hat = softmax(z3)
    y = make_onehot(label, len(y_hat))
    delta3 = y_hat - y
    ##
    # Backpropagation
    # dJ/dU
    self.grads.U += outer(delta3, a2) + self.lreg * self.params.U
    # dJ/db2
    self.grads.b2 += delta3
    delta2 = self.params.U.T.dot(delta3) * d_tanh(z2)
    # dJ/dW @TODO: check
    self.grads.W += outer(delta2, a1) + self.lreg * self.params.W
    # dJ/db1
    self.grads.b1 += delta2
    # dJ/dL
    dL = self.params.W.T.dot(delta2).reshape(self.windowsize, -1)
    for idx in xrange(self.windowsize):
        self.sgrads.L[window[idx], :] = dL[idx]

def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads, in order
    for gradient_check and training to work.
    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    #x_ = concatenate([self.wv[window[0],:], self.wv[window[1],:], self.wv[window[2],:]])
    x_ = hstack(self.sparams.L[window, :])
    lam = self.lreg
    alpha = self.alpha
    W = self.params.W
    b1 = self.params.b1
    U = self.params.U
    b2 = self.params.b2
    ##
    # Forward propagation
    z1 = W.dot(x_) + b1
    h = tanh(z1)
    z2 = U.dot(h) + b2
    y = softmax(z2)
    ##
    # Backpropagation
    target = make_onehot(label, len(y))
    dscore = y - target
    self.grads.U += outer(dscore, h) + lam * U
    self.grads.b2 += dscore
    dhidden = U.T.dot(dscore) * (1 - h ** 2)
    self.grads.W += outer(dhidden, x_) + lam * W
    self.grads.b1 += dhidden
    dx_ = dot(W.T, dhidden)
    dx_ = dx_.reshape(self.windowSize, self.wordVecLen)
    for i in xrange(self.windowSize):
        self.sgrads.L[window[i], :] = dx_[i, :]

def _acc_grads(self, x, label):
    """
    Accumulate gradients from a training example.
    """
    ##
    # Forward propagation
    #import ipdb; ipdb.set_trace()
    p = softmax(self.params.W.dot(x) + self.params.b)
    ##
    # Compute gradients w.r.t cross-entropy loss
    y = make_onehot(label, len(p))
    delta = p - y
    # dJ/dW, dJ/db1
    self.grads.W += np.outer(delta, x) + self.lreg * self.params.W
    self.grads.b += delta
def generate_sequence(self, init, end, maxlen=100):
    """
    Generate a sequence from the language model, by running the RNN
    forward and selecting, at each timestep, a random word from the
    emitted probability distribution.

    The MultinomialSampler class (in nn.math) may be helpful here
    for sampling a word. Use as:

        y = multinomial_sample(p)

    to sample an index y from the vector of probabilities p.

    Arguments:
        init = index of start word (word_to_num['<s>'])
        end = index of end word (word_to_num['</s>'])
        maxlen = maximum length to generate

    Returns:
        ys = sequence of indices
        J = total cross-entropy loss of generated sequence
    """
    J = 0  # total loss
    ys = [init]  # emitted sequence
    ns = maxlen
    hs = np.zeros((ns + 1, self.hdim))
    #### YOUR CODE HERE ####
    for i in range(ns):
        hs[i + 1] = sigmoid(self.params.H.dot(hs[i]) + self.params.W.dot(self.sparams.L[ys[i]]))
        p = self.hierarchicalU.getDistribution(hs[i + 1])
        y = multinomial_sample(p)
        ys.append(y)
        if y == end:
            break
        p = p * make_onehot(y, self.vdim)
        J += -np.log(np.sum(p))
    ##
    # x only sums over the nodes whose gradients were updated
    x = self.hierarchicalU.getSumSquareU(self.hierarchicalU.root)
    Jreg = 0.5 * self.lreg * (np.sum(self.params.H ** 2) + np.sum(self.params.W ** 2) + x)
    #### YOUR CODE HERE ####
    return ys, J + Jreg
def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads, in order
    for gradient_check and training to work.
    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    x = hstack([self.sparams.L[idx] for idx in window])  # extract representation, (150,)
    a = self.params.W.dot(x) + self.params.b1  # (100,150)*(150,)+(100,) => (100,)
    h = tanh(a)  # (100,)
    p = softmax(self.params.U.dot(h) + self.params.b2)  # (5,100)*(100,)+(5,) => (5,)
    # Compute gradients w.r.t cross-entropy loss
    y = make_onehot(label, len(p))
    delta = p - y  # (5,)
    ##
    # Backpropagation
    # dJ/dh
    dh = self.params.U.T.dot(delta)  # (100,5)*(5,) => (100,)
    # dJ/da
    da = dh * (1 - tanh(a)**2)  # (100,)
    L_updatevalue = self.params.W.T.dot(da)  # (150,100)*(100,) => (150,)
    # dJ/dU, dJ/db2
    self.grads.U += outer(delta, h) + self.lreg * self.params.U  # (5,100)
    self.grads.b2 += delta  # (5,)
    # dJ/dW, dJ/db1
    self.grads.W += outer(da, x) + self.lreg * self.params.W  # (100,)x(150,)+(100,150) => (100,150)
    self.grads.b1 += da  # (100,)
    # dJ/dL, sparse update: use sgrads
    dL = self.params.W.T.dot(da).reshape(self.window_size, self.word_vec_size)  # (150,) => (3,50)
    for i in xrange(self.window_size):
        self.sgrads.L[window[i], :] = dL[i]
def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads, in order
    for gradient_check and training to work.
    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    window_vecs = []
    for w in window:
        window_vecs.append(self.sparams.L[w])
    x = append([], window_vecs)
    z2 = dot(self.params.W1, x)
    a2 = tanh(z2 + self.params.b2)
    z3 = dot(self.params.W2, a2)
    y_hat = softmax(z3 + self.params.b3)
    #print 'acc_grads: ' + str(y_hat)
    ##
    # Backpropagation
    d3 = y_hat - make_onehot(label, y_hat.shape[0])  # label is 'y'
    self.grads.b3 += d3
    self.grads.W2 += outer(d3, a2) + self.lreg * self.params.W2
    d2 = multiply(tanh_derivative(a2), dot(self.params.W2.T, d3))
    self.grads.W1 += outer(d2, x) + self.lreg * self.params.W1
    self.grads.b2 += d2
    x_grads = dot(self.params['W1'].T, d2)
    for i in range(0, len(window)):
        self.sgrads.L[window[i]] = x_grads[i * self.n:(i + 1) * self.n]

def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads, in order
    for gradient_check and training to work.
    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    x = self.sparams.L[window]
    x = reshape(x, (-1,))
    W = self.params.W
    U = self.params.U
    b1 = self.params.b1
    b2 = self.params.b2
    lreg = self.lreg
    h = tanh(W.dot(x) + b1)
    p = softmax(U.dot(h) + b2)
    y = make_onehot(label, len(p))
    delta2 = p - y
    delta1 = multiply(transpose(U).dot(delta2), 1 - h * h)
    ##
    # Backpropagation
    self.grads.b2 += delta2
    self.grads.U += outer(delta2, h) + lreg * U
    self.grads.b1 += delta1
    self.grads.W += outer(delta1, x) + lreg * W
    C = window.shape[0]
    gradx = reshape(transpose(W).dot(delta1), (C, -1))
    # for i in range(C):
    #     self.sgrads.L[window[i]] = gradx[i]
    self.sgrads.L[window] = gradx

def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads, in order
    for gradient_check and training to work.
    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    # Forward propagation
    #words = [self.sparams.L[window[0]], self.sparams.L[window[1]], self.sparams.L[window[2]]]
    #x = reshape(words, self.sparams.L.shape[1] * 3)  # 3n row vector
    x = self.sparams.L[window, :].flatten()
    h = tanh(self.params.W.dot(x) + self.params.b1)  # 100*1
    yhat = softmax(self.params.U.dot(h) + self.params.b2)  # 5*1
    # Compute gradients w.r.t cross-entropy loss
    # Backpropagation
    y = make_onehot(label, len(yhat))
    delta = yhat - y
    # dJ/dU, dJ/db2
    self.grads.U += (outer(delta, h) + self.lreg * self.params.U)
    self.grads.b2 += delta
    # dJ/dW, dJ/db1
    delta2 = multiply((1 - square(h)), self.params.U.T.dot(delta))
    self.grads.W += (outer(delta2, x) + self.lreg * self.params.W)
    self.grads.b1 += delta2
    # dJ/dL, sparse grad update
    dJ_dL = self.params.W.T.dot(delta2).reshape(len(window), self.sparams.L.shape[1])
    #for i, w in enumerate(window):
    #    self.sgrads.L[w] = dJ_dL[i]
    for k in range(len(window)):
        self.sgrads.L[window[k]] = dJ_dL[k]
def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads, in order
    for gradient_check and training to work.
    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    words = np.array([self.params.L[x] for x in window])
    x = np.reshape(words, -1)
    layer1 = np.tanh(self.params.W.dot(x) + self.params.b1)
    probs = softmax(self.params.U.dot(layer1) + self.params.b2)
    ##
    # Backpropagation
    y = make_onehot(label, len(probs))
    dx = probs - y
    dU = np.outer(dx, layer1)
    # tanh' is applied to the hidden activations (layer1)
    delta2 = np.multiply((1 - np.square(layer1)), self.params.U.T.dot(dx))
    dW = np.outer(delta2, x)
    db1 = delta2
    dL = self.params.W.T.dot(delta2)
    dL = np.reshape(dL, (3, self.params.L.shape[1]))
    dW += self.lreg * self.params.W
    dU += self.lreg * self.params.U
    self.grads.U += dU
    self.grads.W += dW
    self.grads.b2 += dx
    self.grads.b1 += delta2
    self.sgrads.L[window[0]] = dL[0]
    self.sgrads.L[window[1]] = dL[1]
    self.sgrads.L[window[2]] = dL[2]
def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads, in order
    for gradient_check and training to work.
    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    N = len(window)
    x = self.sparams.L[window]
    d = x.shape[1]
    x = x.reshape((x.shape[0] * x.shape[1]))
    z = self.params.W.dot(x) + self.params.b1
    h = tanh(z)
    y_hat = softmax(self.params.U.dot(h) + self.params.b2)
    ##
    # Backpropagation
    y = make_onehot(label, len(y_hat))
    delta = y_hat - y
    # dJ/dU, dJ/db2, dJ/dW, dJ/db1, dJ/dL
    self.grads.U += outer(delta, h) + self.lreg * self.params.U
    self.grads.b2 += delta
    tanh_prime_z = 1 - tanh(z)**2
    self.grads.W += outer(tanh_prime_z * delta.dot(self.params.U), x) + self.lreg * self.params.W
    self.grads.b1 += tanh_prime_z * delta.dot(self.params.U)
    temp = (tanh_prime_z * delta.dot(self.params.U)).dot(self.params.W)
    for n in xrange(N):
        self.sgrads.L[window[n]] = temp[n * d:(n + 1) * d]
def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads, in order
    for gradient_check and training to work.
    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    x = hstack(self.sparams.L[window])
    dd = len(x) / 3
    a = self.params.W.dot(x) + self.params.b1
    h = tanh(a)
    scores = self.params.U.dot(h) + self.params.b2
    p = softmax(scores)
    ##
    # Backpropagation
    y = make_onehot(label, len(p))
    delta = p - y
    self.grads.U += outer(delta, h) + self.lreg * self.params.U  # 5*100
    self.grads.b2 += delta
    dh = self.params.U.T.dot(delta)  # 100
    da = dh * (1 - tanh(a)**2)
    self.grads.W += outer(da, x) + self.lreg * self.params.W  # 100*150
    self.grads.b1 += da  # accumulate, like the other gradients
    dx = self.params.W.T.dot(da)
    dx__ = reshape(dx, (3, -1))
    self.sgrads.L[window[0]] = dx__[0]
    self.sgrads.L[window[1]] = dx__[1]
    self.sgrads.L[window[2]] = dx__[2]
def _acc_grads(self, idx, label):
    """
    Accumulate gradients from a training example.
    """
    ##
    # Forward propagation
    x = self.sparams.L[idx]  # extract representation
    p = softmax(self.params.W.dot(x) + self.params.b)
    ##
    # Compute gradients w.r.t cross-entropy loss
    y = make_onehot(label, len(p))
    delta = p - y
    # dJ/dW, dJ/db1
    self.grads.W += outer(delta, x) + self.lreg * self.params.W
    self.grads.b += delta
    # dJ/dL, sparse update: use sgrads
    # this stores an update to the row L[idx]
    self.sgrads.L[idx] = self.params.W.T.dot(delta)
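# Illustrative check (not from the original codebase) that the softmax-regression gradients
# accumulated above -- dJ/dW = (p - y) x^T + lreg*W, dJ/db = p - y, dJ/dL[idx] = W^T (p - y) --
# agree with a finite-difference estimate; the standalone helper names, sizes, and values
# below are assumptions made for this sketch.
import numpy as np

def _softmax(v):
    e = np.exp(v - v.max())
    return e / e.sum()

def _loss(W, b, x, label, lreg):
    p = _softmax(W.dot(x) + b)
    return -np.log(p[label]) + 0.5 * lreg * np.sum(W ** 2)

rng = np.random.RandomState(0)
W, b, x = rng.randn(5, 50), rng.randn(5), rng.randn(50)
label, lreg, eps = 2, 1e-3, 1e-6
delta = _softmax(W.dot(x) + b)
delta[label] -= 1.0                                # p - y, as in _acc_grads above
dW = np.outer(delta, x) + lreg * W                 # analytic gradient
E = np.zeros_like(W)
E[1, 3] = eps                                      # perturb a single entry of W
numeric = (_loss(W + E, b, x, label, lreg) - _loss(W - E, b, x, label, lreg)) / (2 * eps)
print(abs(numeric - dW[1, 3]))                     # should be on the order of 1e-9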
def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads, in order
    for gradient_check and training to work.
    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    # x: dim0 x 1
    # W: dim1 x dim0
    # b1: dim1 x 1
    # U: dim2 x dim1
    # b2: dim2 x 1
    x, z1, h, z2, y = self._forward(window)
    label_vector = make_onehot(label, len(y))
    ##
    # Backpropagation
    dz2 = y - label_vector  # dim2 x 1
    self.grads.U += outer(dz2, h)
    self.grads.U += self.lreg * self.params.U
    self.grads.b2 += dz2
    dz1 = self.params.U.T.dot(dz2) * (1 - h * h)  # dim1 x 1
    self.grads.W += outer(dz1, x)  # dim1 x dim0
    self.grads.W += self.lreg * self.params.W
    self.grads.b1 += dz1
    dx = self.params.W.T.dot(dz1)
    for i, w in enumerate(window):
        self.sgrads.L[w] = dx[i * self.D:(i + 1) * self.D]

def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads, in order
    for gradient_check and training to work.
    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    x = concatenate([self.sparams.L[w] for w in window])
    z1 = self.params.W.dot(x) + self.params.b1
    h = 2 * sigmoid(2 * z1) - 1
    z2 = self.params.U.dot(h) + self.params.b2
    p = softmax(z2)
    y = make_onehot(label, len(p))
    ##
    # Backpropagation
    # compute the gradients w.r.t cross-entropy loss
    delta1 = p - y
    # dJ/dU, dJ/db2
    self.grads.U += outer(delta1, h) + self.lreg * self.params.U
    self.grads.b2 += delta1
    # dJ/dW, dJ/db1
    delta2 = self.params.U.T.dot(delta1) * (1 - h**2)
    self.grads.W += outer(delta2, x) + self.lreg * self.params.W
    self.grads.b1 += delta2
    # dJ/dLi
    for i, w_chunk in enumerate(split(self.params.W, len(window), axis=1)):
        self.sgrads.L[window[i]] = w_chunk.T.dot(delta2)

def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads, in order
    for gradient_check and training to work.
    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    # build input context
    x = self.build_input_context(window)
    # first hidden layer
    z1 = self.params.W.dot(x) + self.params.b1
    a1 = tanh(z1)
    # second hidden layer
    z2 = self.params.U.dot(a1) + self.params.b2
    a2 = softmax(z2)
    ##
    # Backpropagation
    # second hidden layer
    delta2 = a2 - make_onehot(label, self.nclass)
    self.grads.b2 += delta2
    self.grads.U += outer(delta2, a1) + self.lreg * self.params.U
    # first hidden layer
    delta1 = (1.0 - a1**2) * self.params.U.T.dot(delta2)
    self.grads.b1 += delta1
    self.grads.W += outer(delta1, x) + self.lreg * self.params.W
    for j, idx in enumerate(window):
        start = j * self.n
        stop = (j + 1) * self.n
        self.sgrads.L[idx] = self.params.W[:, start:stop].T.dot(delta1)

def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
    window = [x_{i-1} x_{i} x_{i+1}] # three ints
    label = {0,1,2,3,4} # single int, gives class

    Your code should update self.grads and self.sgrads, in order
    for gradient_check and training to work.
    So, for example:
    self.grads.U += (your gradient dJ/dU)
    self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    x = hstack(self.sparams.L[window, :])
    z1 = self.params.W.dot(x) + self.params.b1
    h = tanh(z1)
    z2 = self.params.U.dot(h) + self.params.b2
    y_pre = softmax(z2)
    y_act = make_onehot(label, len(y_pre))
    ##
    # Backpropagation
    delta = y_pre - y_act
    self.grads.U += outer(delta, h) + self.lreg * self.params.U
    self.grads.b2 += delta
    grad_h = self.params.U.T.dot(delta)
    self.grads.W += outer(grad_h * (1 - h ** 2), x) + self.lreg * self.params.W
    self.grads.b1 += grad_h * (1 - h ** 2)
    sgrad_L = self.params.W.T.dot(grad_h * (1 - h ** 2))
    sgrad_L = sgrad_L.reshape(self.windowSize, self.wordVecLen)
    # self.sgrads.L[window,:] = sgrad_L
    for i in xrange(self.windowSize):
        self.sgrads.L[window[i], :] = sgrad_L[i, :]
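# Reference note (a standard derivation stated here for orientation, not quoted from any
# handout): each window-classifier _acc_grads above computes, for
#   x = [L[w1]; L[w2]; L[w3]],  h = tanh(W x + b1),  yhat = softmax(U h + b2),
#   J = -log yhat[label] + (lreg / 2) * (||W||^2 + ||U||^2),
# the gradients
#   delta2 = yhat - y
#   dJ/dU  = outer(delta2, h) + lreg * U          dJ/db2 = delta2
#   delta1 = (U.T delta2) * (1 - h**2)
#   dJ/dW  = outer(delta1, x) + lreg * W          dJ/db1 = delta1
#   dJ/dL[wi] = the i-th word-vector slice of W.T delta1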