def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss for an input sequence xs and
    output sequence (labels) ys.

    You should run the RNN forward, compute cross-entropy loss at each
    timestep, and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    ns = len(xs)
    hs = np.zeros((ns+1, self.hdim))
    for i in range(ns):
        hs[i+1] = sigmoid(self.params.H.dot(hs[i]) + self.params.W.dot(self.sparams.L[xs[i]]))
        nodeCur = self.word2node[ys[i]]
        while nodeCur.parent is not None:
            t = 1
            if nodeCur.isLeft == False:
                t = -1
            nodeCur = nodeCur.parent
            J += -np.log(sigmoid(t * nodeCur.hActs.dot(hs[i+1])))
    #### END YOUR CODE ####
    x = self.hierarchicalU.getSumSquareU(self.hierarchicalU.root)
    Jreg = 0.5 * self.lreg * (np.sum(self.params.H**2) + np.sum(self.params.W**2) + x)
    return J + Jreg
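# Note (added for reference): the while-loop above implements a hierarchical
# (tree) softmax. Writing u_n for a node's hActs vector, t_n = +1 when the
# path to the target word goes through a left child and -1 otherwise, and
# h_{i+1} for the hidden state, the unregularized loss it accumulates is
#     J = -sum_i sum_{n in path(y_i)} log sigmoid(t_n * u_n . h_{i+1}),
# with Jreg = 0.5 * lreg * (||H||_F^2 + ||W||_F^2 + sum_n ||u_n||^2) added at the end.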
def compute_seq_ppl(self, xs, ys):
    #### YOUR CODE HERE ####
    J = 0
    ns = len(xs)
    hs = zeros((ns+1, self.hdim))
    cs = zeros((ns, self.cdim))
    # predicted probas
    ps = zeros((ns, self.Udim))

    L = self.sparams.L
    Lc = self.Lcluster
    cfreq = self.cfreq
    cwords = self.cwords
    direct_size = self.hsize
    U = self.params.U
    H = self.params.H
    C = zeros((self.cdim, self.hdim))
    if self.isCompression is True:
        C = self.params.C

    ##
    # Forward propagation
    for i in xrange(ns):
        hs[i+1] = sigmoid(H.dot(hs[i]) + L[xs[i]])
        #hs[i+1] = 2.0/(1 + exp(-2.0*(H.dot(hs[i]) + L[xs[i]]))) - 1
        #without maximum entropy optimization

        # class-based (two-level) softmax: score the target word's cluster
        # and the words inside that cluster
        word_cluster = Lc[ys[i]]
        st_word = cwords[word_cluster, 0]
        ed_word = st_word + cfreq[word_cluster]
        part_cluster = zeros((self.class_size,))
        part_word = zeros((ed_word - st_word,))

        # maximum-entropy (direct-connection) features
        if self.isME is True:
            if direct_size > 0 and xs[i] != -1:
                part_cluster += self.params.cluster_direct[xs[i]]
                indexs = cwords[word_cluster, 0:int(cfreq[word_cluster])]
                if xs[i] < direct_size:
                    part_word += self.params.word_direct[xs[i], indexs]

        if self.isCompression is True:
            cs[i] = sigmoid(C.dot(hs[i+1]))
            part_cluster += U[self.vdim:].dot(cs[i])
            part_word += U[st_word:ed_word].dot(cs[i])
            ps[i, self.vdim:] = softmax(part_cluster)
            ps[i, st_word:ed_word] = softmax(part_word)
        else:
            part_cluster += U[self.vdim:].dot(hs[i+1])
            part_word += U[st_word:ed_word].dot(hs[i+1])
            ps[i, self.vdim:] = softmax(part_cluster)
            ps[i, st_word:ed_word] = softmax(part_word)
            #ps[i, self.vdim:] = softmax(U[self.vdim:,:].dot(hs[i+1]))
            #ps[i, st_word:ed_word] = softmax(U[st_word:ed_word,:].dot(hs[i+1]))

        #print maximum(ps[i, ys[st_word:ed_word]]), ps[i,ys[i]], maximum(ps[i, self.vdim:]), ps[i, self.vdim+word_cluster]
        J -= log(ps[i, ys[i]] * ps[i, self.vdim + word_cluster])
    return J
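# Note (added for reference): the final line of the loop is the class-factored
# softmax cross-entropy. With c(y_i) the target word's cluster, the per-timestep
# term is
#     -log p(y_i | h_{i+1}) = -log p(c(y_i) | h_{i+1}) - log p(y_i | c(y_i), h_{i+1}),
# where the cluster distribution lives in ps[i, self.vdim:] and the
# within-cluster distribution in ps[i, st_word:ed_word].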
def _acc_grads(self, xs, ys):
    #### YOUR CODE HERE ####
    # Expect xs as list of indices
    ns = len(xs)
    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    hs = np.zeros((ns+1, self.hdim))
    # predicted probas
    ps = np.zeros((ns+1, self.vdim))

    ##
    # Forward propagation
    zs = np.zeros((ns+1, self.hdim))
    for i in range(ns):
        zs[i+1] = self.params.H.dot(hs[i]) + self.params.W.dot(self.sparams.L[xs[i]])
        hs[i+1] = sigmoid(zs[i+1])

    ##
    # Backward propagation through time
    sgradsTmp = np.zeros((self.vdim, self.hdim))
    grad0 = np.zeros((ns+1, self.hdim))  # (y-t)*U
    for i in range(ns):
        nodeCur = self.word2node[ys[i]]
        while nodeCur.parent is not None:
            t = 1
            if nodeCur.isLeft == False:
                t = 0
            nodeCur = nodeCur.parent
            if nodeCur.grad is None:
                nodeCur.grad = (sigmoid(nodeCur.hActs.dot(hs[i+1])) - t) * hs[i+1]
            else:
                nodeCur.grad = nodeCur.grad + (sigmoid(nodeCur.hActs.dot(hs[i+1])) - t) * hs[i+1]
            grad0[i+1] = grad0[i+1] + (sigmoid(nodeCur.hActs.dot(hs[i+1])) - t) * nodeCur.hActs

        vectorCurrent = grad0[i+1] * sigmoidGrad(zs[i+1])
        for j in range(min(i+1, self.bptt+1)):
            xh1 = np.ones((self.hdim, self.hdim)).dot(np.diag(hs[i-j]))
            self.grads.H += np.diag(vectorCurrent).dot(xh1)
            x1 = np.ones((self.hdim, self.hdim)).dot(np.diag(self.sparams.L[xs[i-j]]))
            self.grads.W += np.diag(vectorCurrent).dot(x1)
            sgradsTmp[xs[i-j]] += vectorCurrent.dot(self.params.W)
            vectorCurrent = vectorCurrent.dot(self.params.H)
            vectorCurrent = vectorCurrent * sigmoidGrad(zs[i-j])

    self.hierarchicalU.regularizedGrad(self.hierarchicalU.root, self.lreg)
    self.grads.H += self.lreg * self.params.H
    self.grads.W += self.lreg * self.params.W
    for i in range(len(sgradsTmp)):
        self.sgrads.L[i] = sgradsTmp[i, :]
def _acc_grads(self, xs, ys, d):
    # Expect xs as list of indices
    ns = len(xs)
    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    hs = zeros((ns+1, self.hdim))
    # predicted probas
    ps = zeros((ns, self.vdim))
    zs = zeros((ns+1, self.hdim))

    ##
    # Forward propagation
    d_vec = self.sparams.D[d]
    for t in xrange(ns):
        x_t = xs[t]
        zs[t] = self.params.H.dot(hs[t-1]) + self.sparams.L[x_t] + d_vec
        hs[t] = sigmoid(zs[t])
        ps[t] = softmax(self.params.U.dot(hs[t]) + self.params.G.dot(d_vec.T).reshape(self.vdim,))

    ##
    # Backward propagation through time
    d_grad = zeros_like(self.sparams.D[0])
    for t in reversed(xrange(ns)):
        delta = zeros((ns, self.hdim))
        p_t = ps[t]
        eps_t = p_t - make_onehot(ys[t], len(p_t))
        self.grads.U += outer(eps_t, hs[t])
        self.grads.G += outer(eps_t, d_vec)
        d_grad += self.params.G.T.dot(eps_t)
        sig_prime_t = sigmoid(zs[t]) * (1. - sigmoid(zs[t]))
        delta[t] = sig_prime_t * self.params.U.T.dot(eps_t)
        self.sgrads.L[xs[t]] = delta[t].copy()
        d_grad += delta[t].copy()
        self.grads.H += outer(delta[t], hs[t-1])
        for i in xrange(1, self.bptt):
            j = t - i
            if j < 0:
                continue
            sig_prime_j = sigmoid(zs[j]) * (1. - sigmoid(zs[j]))
            delta[j] = sig_prime_j * self.params.H.T.dot(delta[j+1])
            self.sgrads.L[xs[j]] = delta[j].copy()
            d_grad += delta[j].copy()
            self.grads.H += outer(delta[j], hs[j-1])
    self.sgrads.D[d] = d_grad.copy()
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss for an input sequence xs and
    output sequence (labels) ys.

    You should run the RNN forward, compute cross-entropy loss at each
    timestep, and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    ns = len(xs)
    hs = zeros((ns+1, self.hdim))
    ps = zeros((ns, self.vdim))
    for i in xrange(ns):
        hs[i] = sigmoid(self.params.H.dot(hs[i-1]) + self.sparams.L[xs[i]])
        ps[i] = softmax(self.params.U.dot(hs[i]))
        J -= log(ps[i][ys[i]])
    #### END YOUR CODE ####
    return J
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss for an input sequence xs and
    output sequence (labels) ys.

    You should run the RNN forward, compute cross-entropy loss at each
    timestep, and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    ns = len(xs)
    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    hs = zeros((ns+1, self.hdim))
    # predicted probas
    ps = zeros((ns, self.vdim))
    for t in xrange(ns):
        hs[t] = sigmoid(self.params.H.dot(hs[t-1]) + self.sparams.L[xs[t]])
        ps[t] = softmax(self.params.U.dot(hs[t]))
        J -= log(ps[t, ys[t]])
    #### END YOUR CODE ####
    return J
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss for an input sequence xs and
    output sequence (labels) ys.

    You should run the RNN forward, compute cross-entropy loss at each
    timestep, and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    ns = len(xs)
    self.xs = xs
    self.ys = ys
    hs = zeros((ns+1, self.hdim))
    self.hs1 = hs
    # for each time step
    for t in xrange(ns):
        hs[t] = sigmoid(dot(self.params.H, hs[t-1]) + self.sparams.L[xs[t]])
        y_hat = softmax(dot(self.params.U, hs[t]))
        J -= log(y_hat[ys[t]])
    #### END YOUR CODE ####
    return J
def _acc_grads(self, xs, ys):
    """
    Accumulate gradients, given a pair of training sequences:
        xs = [<indices>]  # input words
        ys = [<indices>]  # output words (to predict)

    Your code should update self.grads and self.sgrads, in order for
    gradient_check and training to work. So, for example:
        self.grads.H += (your gradient dJ/dH)
        self.sgrads.L[i] = (gradient dJ/dL[i])  # update row

    Per the handout, you should:
        - make predictions by running forward in time through the
          entire input sequence
        - for *each* output word in ys, compute the gradients with
          respect to the cross-entropy loss for that output word
        - run backpropagation-through-time for self.bptt timesteps,
          storing grads in self.grads (for H, U) and self.sgrads (for L)

    You'll want to store your predictions \hat{y}(t) and the hidden layer
    values h(t) as you run forward, so that you can access them during
    backpropagation. At time 0, you should initialize the hidden layer
    to be a vector of zeros.
    """
    # Expect xs as list of indices
    ns = len(xs)
    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    hs = zeros((ns+1, self.hdim))
    # predicted probs
    ps = zeros((ns, self.vdim))

    #### YOUR CODE HERE ####
    # forward propagation
    for t in xrange(ns):
        hs[t] = sigmoid(dot(self.params.H, hs[t-1]) + self.sparams.L[xs[t]])
        ps[t] = softmax(dot(self.sparams.U, hs[t]))

    # backpropagation through time
    for i in xrange(ns):
        d2i = ps[i]
        d2i[ys[i]] -= 1
        d1 = dot(self.sparams.U.T, d2i) * hs[i] * (1 - hs[i])
        # U is kept as a sparse parameter in this variant, so its gradient
        # is pushed through sgrads rather than grads
        self.sgrads.U = dot(d2i.reshape((-1, 1)), hs[i].reshape((1, -1)))
        for t in xrange(i, i - self.bptt - 1, -1):
            if t >= 0:
                # the farthest reference will thus be hs[-1]
                self.sgrads.L[xs[t]] = d1
                self.grads.H += dot(d1.reshape((-1, 1)), hs[t-1].reshape((1, -1)))
                # accumulate deltas for the next step back in time
                d1 = dot(self.params.H.T, d1) * hs[t-1] * (1 - hs[t-1])
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss for an input sequence xs and
    output sequence (labels) ys.

    You should run the RNN forward, compute cross-entropy loss at each
    timestep, and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    ns = len(xs)
    hs = zeros((ns+1, self.hdim))
    # predicted probas
    ps = zeros((ns, self.vdim))

    L = self.sparams.L
    U = self.params.U
    H = self.params.H

    ##
    # Forward propagation
    for i in xrange(ns):
        hs[i+1] = sigmoid(H.dot(hs[i]) + L[xs[i]])
        #hs[i+1] = 2.0/(1.0 + exp(-2.0*(H.dot(hs[i]) + L[xs[i]]))) - 1.0
        ps[i] = softmax(U.dot(hs[i+1]))
        J -= log(ps[i][ys[i]])
    #### END YOUR CODE ####
    return J
def predict_proba(self, windows):
    """
    Predict class probabilities. Should return a matrix P of probabilities,
    with each row corresponding to a row of X.

    windows = array (n x windowsize), each row is a window of indices
    """
    # handle singleton input by making sure we have
    # a list-of-lists
    if not hasattr(windows[0], "__iter__"):
        windows = [windows]

    #### YOUR CODE HERE ####
    # construct input matrix
    x = vstack([concatenate([self.sparams.L[idx] for idx in window])
                for window in windows])
    z1 = self.params.W.dot(x.T) + self.params.b1[:, newaxis]
    h1 = 2 * sigmoid(2 * z1) - 1
    z2 = self.params.U.dot(h1) + self.params.b2[:, newaxis]
    P = softmax(z2.T)
    #### END YOUR CODE ####

    return P  # rows are output for each input
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss for an input sequence xs and
    output sequence (labels) ys.

    You should run the RNN forward, compute cross-entropy loss at each
    timestep, and return the sum of the point losses.
    """
    ns = len(xs)
    hs = zeros((ns+1, self.hdim))
    ps = zeros((ns, self.vdim))
    for i in range(ns):
        z1 = self.params.H.dot(hs[i-1]) + self.sparams.L[xs[i]]
        hs[i] = sigmoid(z1)
        z2 = self.params.U.dot(hs[i])
        ps[i] = softmax(z2)
    J = sum(-log(ps[range(len(ps)), ys]))
    return J
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss for an input sequence xs and
    output sequence (labels) ys.

    You should run the RNN forward, compute cross-entropy loss at each
    timestep, and return the sum of the point losses.
    """
    ns = len(xs)
    h_ant = zeros((1, self.hdim))
    J = 0
    #### YOUR CODE HERE ####
    for step in xrange(ns):
        a1 = self.params.H.dot(h_ant.T).T + self.sparams.L[xs[step]]
        h = sigmoid(a1)
        a2 = self.params.U.dot(h.T).T
        y_hat = softmax(a2)
        h_ant = h
        J -= log(y_hat[:, ys[step]])
    #### END YOUR CODE ####
    return J
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss for an input sequence xs and
    output sequence (labels) ys.

    You should run the RNN forward, compute cross-entropy loss at each
    timestep, and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    ns = len(xs)
    h_prev = zeros(self.hdim)
    for t in xrange(ns):
        h_t = sigmoid(dot(self.params.H, h_prev) + self.sparams.L[xs[t]])
        if t == ns - 1:
            # only the final timestep is scored; ys is a single label here
            yhat_t = softmax(dot(self.params.U, h_t))
            J = -log(yhat_t[ys])
        h_prev = h_t
    J += .5 * self.lamb * (sum(self.params.H**2) + sum(self.params.U**2))
    #### END YOUR CODE ####
    return J
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss for an input sequence xs and
    output sequence (labels) ys.

    You should run the RNN forward, compute cross-entropy loss at each
    timestep, and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    ns = len(xs)
    hs = zeros((ns+1, self.hdim))
    ps = zeros((ns, self.vdim))
    # Forward propagation
    for t in xrange(ns):
        hs[t] = sigmoid(self.params.H.dot(hs[t-1]) + self.sparams.L[xs[t]])  # (Dh,Dh)*(Dh,) + (Dh,)
        ps[t] = softmax(self.params.U.dot(hs[t]))  # (V,Dh)*(Dh,)
        J += -log(ps[t][ys[t]])
    #### END YOUR CODE ####
    return J
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss for an input sequence xs and
    output sequence (labels) ys.

    You should run the RNN forward, compute cross-entropy loss at each
    timestep, and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    # hs[-1] = initial hidden state (zeros)
    n_aspect = N_ASPECTS
    sent_dim = SENT_DIM
    ns = len(xs)
    hs = zeros((ns+1, self.hdim))
    for t in range(ns):
        hs[t] = sigmoid(self.params.H.dot(hs[t-1]) + self.sparams.L[xs[t]])
    # only the final hidden state is scored: one softmax per aspect
    h_final = hs[ns-1]
    z = self.params.U.dot(h_final)
    y_hat = []
    for i in range(n_aspect):
        current = z[sent_dim*i:sent_dim*(i+1)]
        y_hat.extend(softmax(current))
    J = -sum(ys.reshape(len(ys), 1) * log(array(y_hat).reshape(len(y_hat), 1)))
    #### END YOUR CODE ####
    return J
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss for an input sequence xs and
    output sequence (labels) ys.

    You should run the RNN forward, compute cross-entropy loss at each
    timestep, and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    # Expect xs as list of indices
    ns = len(xs)
    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    hs = zeros((ns+1, self.hdim))
    # _for memory purposes_, we do not compute the loss in one fell swoop
    # forward propagation
    for t in xrange(ns):
        hs[t] = sigmoid(dot(self.params.H, hs[t-1]) + self.sparams.L[xs[t]])
        p = softmax(dot(self.sparams.U, hs[t]))
        J -= sum(log(p[ys[t]]))
    #### END YOUR CODE ####
    return J
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss for an input sequence xs and
    output sequence (labels) ys.

    You should run the RNN forward, compute cross-entropy loss at each
    timestep, and return the sum of the point losses.
    """
    ns = len(xs)
    #### YOUR CODE HERE ####
    # forward propagation
    hs = zeros((ns+1, self.hdim))
    ps = zeros((ns, self.vdim))  # predicted probas
    for t in range(ns):
        hs[t] = sigmoid(dot(self.params.H, hs[t-1]) + self.sparams.L[xs[t], :])
        ps[t] = softmax(dot(self.params.U, hs[t]))
    J = -sum(log(ps[arange(ns), ys]))
    #### END YOUR CODE ####
    return J
def _acc_grads(self, xs, ys):
    """
    Accumulate gradients, given a pair of training sequences:
        xs = [<indices>]  # input words
        ys = [<indices>]  # output words (to predict)

    Your code should update self.grads and self.sgrads, in order for
    gradient_check and training to work. So, for example:
        self.grads.H += (your gradient dJ/dH)
        self.sgrads.L[i] = (gradient dJ/dL[i])  # update row

    Per the handout, you should:
        - make predictions by running forward in time through the
          entire input sequence
        - for *each* output word in ys, compute the gradients with
          respect to the cross-entropy loss for that output word
        - run backpropagation-through-time for self.bptt timesteps,
          storing grads in self.grads (for H) and self.sgrads (for L,U)

    You'll want to store your predictions \hat{y}(t) and the hidden layer
    values h(t) as you run forward, so that you can access them during
    backpropagation. At time 0, you should initialize the hidden layer
    to be a vector of zeros.
    """
    # Expect xs as list of indices
    ns = len(xs)
    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    hs = zeros((ns+1, self.hdim))
    # predicted probas
    ps = zeros((ns, self.vdim))

    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    # for each time step
    for t in xrange(ns):
        hs[t] = sigmoid(dot(self.params.H, hs[t-1]) + self.sparams.L[xs[t]])
        ps[t] = softmax(dot(self.params.U, hs[t]))

    ##
    # Backward propagation through time
    for j in xrange(ns):
        y = make_onehot(ys[j], self.vdim)
        y_hat_minus_y = ps[j] - y
        self.grads.U += outer(y_hat_minus_y, hs[j])
        delta = dot(self.params.U.T, y_hat_minus_y) * hs[j] * (1.0 - hs[j])
        # start at j and go back self.bptt times
        # (total self.bptt + 1 elements, including current one)
        for t in xrange(j, j - self.bptt - 1, -1):
            if t - 1 >= -1:
                # see above: hs[-1] is the zero vector
                self.grads.H += outer(delta, hs[t-1])
                self.sgrads.L[xs[t]] = delta
                delta = dot(self.params.H.T, delta) * hs[t-1] * (1.0 - hs[t-1])
def generate_sequence(self, init, end, maxlen=100):
    """
    Generate a sequence from the language model, by running the RNN
    forward and selecting, at each timestep, a random word from the
    emitted probability distribution.

    The MultinomialSampler class (in nn.math) may be helpful here for
    sampling a word; use y = multinomial_sample(p) to sample an index y
    from the vector of probabilities p.

    Arguments:
        init = list of indices of start words (word_to_num['<s>'])
        end = index of end word (word_to_num['</s>'])
        maxlen = maximum length to generate
    Returns:
        ys = sequence of indices
        J = total cross-entropy loss of generated sequence
    """
    J = 0  # total loss
    ys = init  # emitted sequence

    #### YOUR CODE HERE ####
    # prime the hidden state on all but the last prefix word;
    # the loop below consumes ys[-1] as its first input
    h = np.zeros(self.hdim)
    for x in ys[:-1]:
        z = self.params.H.dot(h) + self.sparams.L[x]
        h = sigmoid(z)

    while ys[-1] != end and len(ys) < maxlen:
        x = ys[-1]
        z = self.params.H.dot(h) + self.sparams.L[x]
        h = sigmoid(z)
        y_hat = softmax(self.params.U.dot(h))
        y = multinomial_sample(y_hat)
        J -= np.log(y_hat[y])
        ys.append(y)
    #### YOUR CODE HERE ####

    return ys, J
def generate_sequence(self, init, end, maxlen=100):
    """
    Generate a sequence from the language model, by running the RNN
    forward and selecting, at each timestep, a random word from the
    emitted probability distribution.

    The MultinomialSampler class (in nn.math) may be helpful here for
    sampling a word; use y = multinomial_sample(p) to sample an index y
    from the vector of probabilities p.

    Arguments:
        init = index of start word (word_to_num['<s>'])
        end = index of end word (word_to_num['</s>'])
        maxlen = maximum length to generate
    Returns:
        ys = sequence of indices
        J = total cross-entropy loss of generated sequence
    """
    J = 0  # total loss
    ys = [init]  # emitted sequence

    #### YOUR CODE HERE ####
    ns = maxlen
    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    hs = zeros((ns+1, self.hdim))
    # predicted probas
    ps = zeros((ns, self.vdim))

    H = self.params.H
    U = self.params.U
    L = self.sparams.L
    bptt = self.bptt

    ##
    # Forward propagation
    for t in xrange(ns):
        hs[t+1] = sigmoid(H.dot(hs[t]) + L[ys[t]])
        ps[t] = softmax(U.dot(hs[t+1]))
        ys = ys + [multinomial_sample(ps[t])]
        # score the word that was just sampled
        J -= log(ps[t][ys[t+1]])
        if ys[t+1] == end:
            break
        if t == ns - 1:
            ys = ys + [end]
    #### YOUR CODE HERE ####

    return ys, J
def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
        window = [x_{i-1} x_{i} x_{i+1}]  # three ints
        label = {0,1,2,3,4}  # single int, gives class

    Your code should update self.grads and self.sgrads, in order for
    gradient_check and training to work. So, for example:
        self.grads.U += (your gradient dJ/dU)
        self.sgrads.L[i] = (gradient dJ/dL[i])  # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    onehot_vecs = expand_dims(self.sparams.L[window, :].flatten(), axis=0)

    ##
    # Forward propagation
    a1 = self.params.W.dot(onehot_vecs.T).T + self.params.b1
    s = sigmoid(2.0 * a1)
    h = 2.0 * s - 1.0  # tanh via the sigmoid identity
    a2 = self.params.U.dot(h.T).T + self.params.b2
    y_hat = softmax(a2)

    ##
    # Backpropagation
    t = zeros(y_hat.shape)
    t[:, label] = 1
    delta_out = y_hat - t
    self.grads.U += h.T.dot(delta_out).T + self.lreg * self.params.U
    self.grads.b2 += delta_out.flatten()
    delta_hidden = delta_out.dot(self.params.U) * 4.0 * sigmoid_grad(s)
    self.grads.W += delta_hidden.T.dot(onehot_vecs) + self.lreg * self.params.W
    self.grads.b1 += delta_hidden.flatten()

    # gradient w.r.t. the concatenated input vectors; the slices below
    # assume 50-dimensional word vectors
    grad_xs = delta_hidden.dot(self.params.W).T
    self.sgrads.L[window[0]] = grad_xs[range(0, 50)].flatten()
    self.sgrads.L[window[1]] = grad_xs[range(50, 100)].flatten()
    self.sgrads.L[window[2]] = grad_xs[range(100, 150)].flatten()
def generate_sequence(self, init, end, maxlen=100):
    """
    Generate a sequence from the language model, by running the RNN
    forward and selecting, at each timestep, a random word from the
    emitted probability distribution.

    The MultinomialSampler class (in nn.math) may be helpful here for
    sampling a word; use y = multinomial_sample(p) to sample an index y
    from the vector of probabilities p.

    Arguments:
        init = index of start word (word_to_num['<s>'])
        end = index of end word (word_to_num['</s>'])
        maxlen = maximum length to generate
    Returns:
        ys = sequence of indices
        J = total cross-entropy loss of generated sequence
    """
    J = 0  # total loss
    ys = [init]  # emitted sequence

    #### YOUR CODE HERE ####
    h = sigmoid(self.sparams.L[init])
    for t in range(maxlen):
        h = sigmoid(self.params.H.dot(h) + self.sparams.L[ys[-1]])
        pred = softmax(self.params.U.dot(h))
        y = multinomial_sample(pred)
        ys.append(y)
        J += -1 * log(pred[y])
        if y == end:
            break
    #### YOUR CODE HERE ####

    return ys, J
def forward_propagation(self, xs):
    n_aspect = N_ASPECTS
    sent_dim = SENT_DIM
    ns = len(xs)
    hs_f = zeros((ns+1, self.hdim))
    hs_b = zeros((ns+1, self.hdim))
    for t in range(ns):
        hs_f[t] = sigmoid(self.params.H_f.dot(hs_f[t-1]) + self.sparams.L[xs[t]] + self.params.b1_f)
    h_f_final = hs_f[ns-1]
    inverted_xs = list(reversed(xs))
    for t in range(ns):
        hs_b[t] = sigmoid(self.params.H_b.dot(hs_b[t-1]) + self.sparams.L[inverted_xs[t]] + self.params.b1_b)
    h_b_final = hs_b[ns-1]
    z = self.params.U.dot(hstack([h_f_final, h_b_final])) + self.params.b2
    y_hat = []
    for i in range(n_aspect):
        current = z[sent_dim*i:sent_dim*(i+1)]
        y_hat.extend(softmax(current))
    return hs_f, hs_b, y_hat
def generate_missing_word(self, before, after, wv=None, nres=5):
    Ps = []
    missings = []
    # left-to-right RNN over the context before the gap
    lh = np.zeros(self.hdim)
    for x in before:
        vec = wv[x] if x >= self.vdim else self.sparams.LL[x]
        z = self.params.LH.dot(lh) + vec
        lh = sigmoid(z)
    # right-to-left RNN over the context after the gap
    rh = np.zeros(self.hdim)
    for x in reversed(after):
        vec = wv[x] if x >= self.vdim else self.sparams.RL[x]
        z = self.params.RH.dot(rh) + vec
        rh = sigmoid(z)
    y_hat = softmax(self.params.U.dot(np.concatenate((lh, rh))))
    # take the nres highest-probability candidates
    for i in xrange(nres):
        high_idx = np.argmax(y_hat)
        missings.append(high_idx)
        Ps.append(y_hat[high_idx])
        y_hat[high_idx] = 0.0
    return missings
def compute_seq_loss(self, sentence, ys, wv=None):
    """
    Compute the total cross-entropy loss for an input sequence xs and
    output sequence (labels) ys.

    You should run the RNN forward, compute cross-entropy loss at each
    timestep, and return the sum of the point losses.

    Note: ys is not used here.
    """
    J = 0
    #### YOUR CODE HERE ####
    ns = len(sentence)
    lhs = np.zeros((ns, self.hdim))
    rhs = np.zeros((ns, self.hdim))
    # left-to-right hidden states
    for i in xrange(ns-1):
        x = sentence[i]
        h = lhs[i]
        vec = wv[x] if x >= self.vdim else self.sparams.LL[x]
        lhs[i+1] = sigmoid(self.params.LH.dot(h) + vec)
    # right-to-left hidden states
    for i in reversed(xrange(1, ns)):
        x = sentence[i]
        h = rhs[i]
        vec = wv[x] if x >= self.vdim else self.sparams.RL[x]
        rhs[i-1] = sigmoid(self.params.RH.dot(h) + vec)
    # score each interior word from its left and right contexts
    for i in xrange(1, ns-1):
        y = sentence[i]
        if y >= self.vdim:
            y = 3  # UUUNKKK
        y_hat = softmax(self.params.U.dot(np.concatenate((lhs[i], rhs[i]))))
        J -= np.log(y_hat[y])
    #### END YOUR CODE ####
    return J
def forward_propagation(self, xs):
    n_aspect = N_ASPECTS
    sent_dim = SENT_DIM
    ns = len(xs)
    hs = zeros((ns+1, self.hdim))
    for t in range(ns):
        hs[t] = sigmoid(self.params.H.dot(hs[t-1]) + self.sparams.L[xs[t]] + self.params.b1)
    h_final = hs[ns-1]
    z = self.params.U.dot(h_final) + self.params.b2
    y_hat = []
    for i in range(n_aspect):
        current = z[sent_dim*i:sent_dim*(i+1)]
        y_hat.extend(softmax(current))
    return hs, y_hat
def generate_sequence(self, init, end, maxlen=100):
    """
    Generate a sequence from the language model, by running the RNN
    forward and selecting, at each timestep, a random word from the
    emitted probability distribution.

    The MultinomialSampler class (in nn.math) may be helpful here for
    sampling a word; use y = multinomial_sample(p) to sample an index y
    from the vector of probabilities p.

    Arguments:
        init = index of start word (word_to_num['<s>'])
        end = index of end word (word_to_num['</s>'])
        maxlen = maximum length to generate
    Returns:
        ys = sequence of indices
        J = total cross-entropy loss of generated sequence
    """
    J = 0  # total loss
    ys = [init]  # emitted sequence

    #### YOUR CODE HERE ####
    ns = len(ys)
    t = 0
    nextIdx = init
    hs = zeros((maxlen+1, self.hdim))
    ps = zeros((maxlen, self.vdim))
    while ns <= maxlen and nextIdx != end:
        hs[t] = sigmoid(self.params.H.dot(hs[t-1]) + self.sparams.L[ys[t]])
        ps[t] = softmax(self.params.U.dot(hs[t]))
        nextIdx = multinomial_sample(ps[t])
        ys.append(nextIdx)
        # score the word that was just sampled
        J -= log(ps[t, ys[t+1]])
        ns = len(ys)
        t += 1
    #### YOUR CODE HERE ####

    return ys, J
def generate_sequence(self, init, end, maxlen=100):
    """
    Generate a sequence from the language model, by running the RNN
    forward and selecting, at each timestep, a random word from the
    emitted probability distribution.

    The MultinomialSampler class (in nn.math) may be helpful here for
    sampling a word; use y = multinomial_sample(p) to sample an index y
    from the vector of probabilities p.

    Arguments:
        init = index of start word (word_to_num['<s>'])
        end = index of end word (word_to_num['</s>'])
        maxlen = maximum length to generate
    Returns:
        ys = sequence of indices
        J = total cross-entropy loss of generated sequence
    """
    J = 0  # total loss
    ys = [init]  # emitted sequence

    #### YOUR CODE HERE ####
    ps = zeros((maxlen, self.vdim))
    hs = zeros((maxlen+1, self.hdim))  # +1 so hs[i+1] is valid at i = maxlen-1
    H = self.params.H
    L = self.sparams.L
    U = self.params.U
    start = init
    for i in xrange(maxlen):
        hs[i+1] = sigmoid(H.dot(hs[i]) + L[start])
        ps[i] = softmax(U.dot(hs[i+1]))
        start = multinomial_sample(ps[i])
        J -= log(ps[i][start])
        ys.append(start)
        if start == end:
            break
    #### YOUR CODE HERE ####

    return ys, J
def generate_sequence(self, init, end, maxlen=100):
    """
    Generate a sequence from the language model, by running the RNN
    forward and selecting, at each timestep, a random word from the
    emitted probability distribution.

    The MultinomialSampler class (in nn.math) may be helpful here for
    sampling a word; use y = multinomial_sample(p) to sample an index y
    from the vector of probabilities p.

    Arguments:
        init = index of start word (word_to_num['<s>'])
        end = index of end word (word_to_num['</s>'])
        maxlen = maximum length to generate
    Returns:
        ys = sequence of indices
        J = total cross-entropy loss of generated sequence
    """
    J = 0  # total loss
    ys = [init]  # emitted sequence

    #### YOUR CODE HERE ####
    h_ant = zeros((1, self.hdim))
    for step in xrange(maxlen):
        a1 = self.params.H.dot(h_ant.T).T + self.sparams.L[ys[step]]
        h = sigmoid(a1)
        a2 = self.params.U.dot(h.T).T
        y_hat = softmax(a2)
        h_ant = h
        ys.append(multinomial_sample(y_hat))
        # score the word that was just sampled, and stop on the end token
        J -= log(y_hat[:, ys[step+1]])
        if ys[-1] == end:
            break
    if ys[-1] != end:
        ys.append(end)
    #### YOUR CODE HERE ####

    return ys, J
def generate_sequence(self, init, end, maxlen=100):
    """
    Generate a sequence from the language model, by running the RNN
    forward and selecting, at each timestep, a random word from the
    emitted probability distribution.

    The MultinomialSampler class (in nn.math) may be helpful here for
    sampling a word; use y = multinomial_sample(p) to sample an index y
    from the vector of probabilities p.

    Arguments:
        init = index of start word (word_to_num['<s>'])
        end = index of end word (word_to_num['</s>'])
        maxlen = maximum length to generate
    Returns:
        ys = sequence of indices
        J = total cross-entropy loss of generated sequence
    """
    J = 0  # total loss
    ys = [init]  # emitted sequence
    ns = maxlen
    hs = np.zeros((ns+1, self.hdim))

    #### YOUR CODE HERE ####
    for i in range(ns):
        hs[i+1] = sigmoid(self.params.H.dot(hs[i]) + self.params.W.dot(self.sparams.L[ys[i]]))
        p = self.hierarchicalU.getDistribution(hs[i+1])
        y = multinomial_sample(p)
        ys.append(y)
        if y == end:
            break
        p = p * make_onehot(y, self.vdim)
        J += -np.log(np.sum(p))

    ##
    # x only includes nodes whose gradients are updated
    x = self.hierarchicalU.getSumSquareU(self.hierarchicalU.root)
    Jreg = 0.5 * self.lreg * (np.sum(self.params.H**2) + np.sum(self.params.W**2) + x)
    #### YOUR CODE HERE ####

    return ys, J + Jreg
def compute_seq_loss(self, xs, ys):
    J = 0
    #### YOUR CODE HERE ####
    ns = len(xs)
    hs = zeros((ns+1, self.hdim))
    ps = zeros((ns, self.vdim))
    # Forward propagation
    for t in range(ns):
        hs[t] = sigmoid(dot(hs[t-1], self.params.H) + self.sparams.L[xs[t], :])
        ps[t] = softmax(dot(self.params.U, hs[t]))
        J += -log(ps[t][ys[t]])
    #### END YOUR CODE ####
    return J
def forward_propagation(self, xs):
    n_aspect = N_ASPECTS
    sent_dim = SENT_DIM
    ns = len(xs)
    hs = zeros((ns+1, self.hdim))
    for t in range(ns):
        hs[t] = sigmoid(self.params.H.dot(hs[t-1]) + self.sparams.L[xs[t]] + self.params.b1)
    h_final = hs[ns-1]
    z = self.params.U.dot(h_final) + self.params.b2
    y_hat = []
    for i in range(n_aspect):
        current = z[sent_dim*i:sent_dim*(i+1)]
        y_hat.extend(softmax(current))
    return hs, y_hat
def generate_sequence(self, d, init, end, maxlen=100):
    """
    Generate a sequence from the language model, by running the RNN
    forward and selecting, at each timestep, a random word from the
    emitted probability distribution.

    The MultinomialSampler class (in nn.math) may be helpful here for
    sampling a word; use y = multinomial_sample(p) to sample an index y
    from the vector of probabilities p.

    Arguments:
        d = index into self.sparams.D (context vector added at every timestep)
        init = index of start word (word_to_num['<s>'])
        end = index of end word (word_to_num['</s>'])
        maxlen = maximum length to generate
    Returns:
        ys = sequence of indices
        J = total cross-entropy loss of generated sequence
    """
    J = 0  # total loss
    ys = [init]  # emitted sequence
    hs = zeros((maxlen+1, self.hdim))
    curr = init
    t = 0
    d_vec = self.sparams.D[d]
    while curr != end and len(ys) < maxlen:
        x_t = curr
        zs_t = self.params.H.dot(hs[t-1]) + self.sparams.L[x_t] + d_vec
        hs[t] = sigmoid(zs_t)
        ps_t = softmax(self.params.U.dot(hs[t]) + self.params.G.dot(d_vec))
        y = multinomial_sample(ps_t)
        ys.append(y)
        curr = y
        J += -1 * log(ps_t[y])
        t += 1
    return ys, J
def generate_sequence(self, init, end, maxlen=100):
    """
    Generate a sequence from the language model, by running the RNN
    forward and selecting, at each timestep, a random word from the
    emitted probability distribution.

    The MultinomialSampler class (in nn.math) may be helpful here for
    sampling a word; use y = multinomial_sample(p) to sample an index y
    from the vector of probabilities p.

    Arguments:
        init = index of start word (word_to_num['<s>'])
        end = index of end word (word_to_num['</s>'])
        maxlen = maximum length to generate
    Returns:
        ys = sequence of indices
        J = total cross-entropy loss of generated sequence
    """
    J = 0  # total loss
    ys = [init]  # emitted sequence

    #### YOUR CODE HERE ####
    h = zeros(self.hdim)
    t = 1
    while t < maxlen:
        # shape Dh
        h = sigmoid(self.params.H.dot(h) + self.sparams.L[ys[t-1]])
        # shape V
        p = softmax(self.params.U.dot(h))
        ys += [multinomial_sample(p)]
        J += -log(p[ys[t]])
        if ys[t] == end:
            break
        t += 1
    #### YOUR CODE HERE ####

    return ys, J
def generate_sequence(self, init, end, maxlen=100):
    """
    Generate a sequence from the language model, by running the RNN
    forward and selecting, at each timestep, a random word from the
    emitted probability distribution.

    The MultinomialSampler class (in nn.math) may be helpful here for
    sampling a word; use y = multinomial_sample(p) to sample an index y
    from the vector of probabilities p.

    Arguments:
        init = index of start word (word_to_num['<s>'])
        end = index of end word (word_to_num['</s>'])
        maxlen = maximum length to generate
    Returns:
        ys = sequence of indices
        J = total cross-entropy loss of generated sequence
    """
    J = 0  # total loss
    ys = [init]  # emitted sequence
    hs = zeros((maxlen+1, self.hdim))
    for w in range(maxlen):
        z1 = self.params.H.dot(hs[w-1]) + self.sparams.L[ys[w]]
        hs[w] = sigmoid(z1)
        z2 = self.params.U.dot(hs[w])
        ps = softmax(z2)
        y = multinomial_sample(ps)
        ys.append(y)
        J += -log(ps[y])
        if y == end:
            break
    return ys, J
def generate_sequence(self, init, end, maxlen=100):
    """
    Generate a sequence from the language model, by running the RNN
    forward and selecting, at each timestep, a random word from the
    emitted probability distribution.

    The MultinomialSampler class (in nn.math) may be helpful here for
    sampling a word; use y = multinomial_sample(p) to sample an index y
    from the vector of probabilities p.

    Arguments:
        init = index of start word (word_to_num['<s>'])
        end = index of end word (word_to_num['</s>'])
        maxlen = maximum length to generate
    Returns:
        ys = sequence of indices
        J = total cross-entropy loss of generated sequence
    """
    J = 0  # total loss
    ys = [init]  # emitted sequence

    #### YOUR CODE HERE ####
    hs = zeros((maxlen+1, self.hdim))
    for t in xrange(maxlen):
        hs[t] = sigmoid(dot(self.params.H, hs[t-1]) + self.sparams.L[ys[t]])
        y_hat = softmax(dot(self.params.U, hs[t]))
        y_index = multinomial_sample(y_hat)
        ys.append(y_index)
        J -= log(y_hat[y_index])
        if y_index == end:
            break
    #### YOUR CODE HERE ####

    return ys, J
def predict(self, xs):
    n_aspect = N_ASPECTS
    sent_dim = SENT_DIM
    #### YOUR CODE HERE ####
    # hs[-1] = initial hidden state (zeros)
    ns = len(xs)
    hs = zeros((ns+1, self.hdim))
    for t in range(ns):
        hs[t] = sigmoid(self.params.H.dot(hs[t-1, :]) + self.sparams.L[xs[t]])
    h_final = hs[ns-1]
    z = self.params.U.dot(h_final)
    y_hat = []
    for i in range(n_aspect):
        current = z[sent_dim*i:sent_dim*(i+1)]
        y_hat.extend(softmax(current))
    return y_hat
def generate_sequence(self, init, end, maxlen=100):
    """
    Generate a sequence from the language model, by running the RNN
    forward and selecting, at each timestep, a random word from the
    emitted probability distribution.

    The MultinomialSampler class (in nn.math) may be helpful here for
    sampling a word; use y = multinomial_sample(p) to sample an index y
    from the vector of probabilities p.

    Arguments:
        init = index of start word (word_to_num['<s>'])
        end = index of end word (word_to_num['</s>'])
        maxlen = maximum length to generate
    Returns:
        ys = sequence of indices
        J = total cross-entropy loss of generated sequence
    """
    J = 0  # total loss
    ys = [init]  # emitted sequence

    #### YOUR CODE HERE ####
    t = 0
    hs = zeros(self.hdim)
    while True:
        if (len(ys) > maxlen) or (ys[-1] == end):
            break
        hs = sigmoid(self.params.H.dot(hs) + self.sparams.L[ys[t], :])
        ps = softmax(self.params.U.dot(hs))
        y = multinomial_sample(ps)
        J += -log(ps[y])
        ys.append(y)
        t += 1
    #### YOUR CODE HERE ####

    return ys, J
def _acc_grads(self, window, label):
    """
    Accumulate gradients, given a training point (window, label) of the format
        window = [x_{i-1} x_{i} x_{i+1}]  # three ints
        label = {0,1,2,3,4}  # single int, gives class

    Your code should update self.grads and self.sgrads, in order for
    gradient_check and training to work. So, for example:
        self.grads.U += (your gradient dJ/dU)
        self.sgrads.L[i] = (gradient dJ/dL[i])  # this adds an update for that index
    """
    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    x = concatenate([self.sparams.L[w] for w in window])
    z1 = self.params.W.dot(x) + self.params.b1
    h = 2 * sigmoid(2 * z1) - 1
    z2 = self.params.U.dot(h) + self.params.b2
    p = softmax(z2)
    y = make_onehot(label, len(p))

    ##
    # Backpropagation
    # compute the gradients w.r.t cross-entropy loss
    delta1 = p - y
    # dJ/dU, dJ/db2
    self.grads.U += outer(delta1, h) + self.lreg * self.params.U
    self.grads.b2 += delta1
    # dJ/dW, dJ/db1
    delta2 = self.params.U.T.dot(delta1) * (1 - h**2)
    self.grads.W += outer(delta2, x) + self.lreg * self.params.W
    self.grads.b1 += delta2
    # dJ/dL_i
    for i, w_chunk in enumerate(split(self.params.W, len(window), axis=1)):
        self.sgrads.L[window[i]] = w_chunk.T.dot(delta2)
def backward(self, m, lambd=0.1):
    self.dA = np.dot(self.next_layer.W.T, self.next_layer.dZ)
    if self.activation == 'relu':
        self.dZ = np.multiply(self.dA, np.int64(self.A > 0))
    elif self.activation == 'sigmoid':
        s = sigmoid(self.Z)
        self.dZ = self.dA * s * (1 - s)
    elif self.activation == 'linear':
        self.dZ = self.dA
    elif self.activation == 'tanh':
        # reuse self.dA computed above; tanh'(z) = 1 - A^2
        self.dZ = self.dA * (1 - np.power(self.A, 2))
    # L2-regularized gradients, averaged over the batch of m examples
    self.dW = (1.0 / m) * np.dot(self.dZ, self.prev_layer.A.T) + (lambd / m) * self.W
    self.db = (1.0 / m) * np.sum(self.dZ, axis=1, keepdims=True)
    self.prev_layer.backward(m)
def compute_seq_loss(self, xs, ys, d):
    """
    Compute the total cross-entropy loss for an input sequence xs and
    output sequence (labels) ys.

    You should run the RNN forward, compute cross-entropy loss at each
    timestep, and return the sum of the point losses.
    """
    J = 0
    ns = len(xs)
    hs = zeros((ns+1, self.hdim))
    d_vec = self.sparams.D[d]
    for t in xrange(ns):
        x_t = xs[t]
        zs_t = self.params.H.dot(hs[t-1]) + self.sparams.L[x_t] + d_vec
        hs[t] = sigmoid(zs_t)
        ps_t = softmax(self.params.U.dot(hs[t]) + self.params.G.dot(d_vec.T).reshape(self.vdim,))
        J += -1 * log(ps_t[ys[t]])
    return J
def _acc_grads_batch(self, X, Y):
    """
    Accumulate gradients from training examples:
    X matrix, Y vector of targets

    TODO: the hidden-layer average activation is computed in a separate
    pass; this can be rewritten to be twice as fast.
    """
    # First compute the average hidden activation over the batch
    ro_hat = np.zeros_like(self.params.b1)
    for i in range(len(Y)):
        x = X[i]
        _, h = self.forward_pass(x)
        ro_hat += h
    ro_hat /= float(len(Y))

    ##
    # Forward propagation
    for i in range(len(Y)):
        x = X[i]
        y = Y[i]
        z1 = self.params.W.dot(x) + self.params.b1
        h = sigmoid(z1)
        z2 = np.dot(self.params.U, h) + self.params.b2
        y_hat = z2  # linear output layer

        d2 = (y_hat - y)
        #d2 *= (1./len(y))
        self.grads.b2 += d2
        self.grads.U += np.outer(d2, h) + self.lreg * self.params.U

        # incorporate KL-divergence (sparsity) gradient into d1
        kl_grad = self.beta * (-self.ro / ro_hat + (1. - self.ro) / (1 - ro_hat))
        d1 = (np.dot(self.params.U.T, d2) + kl_grad) * sigmoid_grad(z1)
        self.grads.W += np.outer(d1, x) + self.lreg * self.params.W
        self.grads.b1 += d1
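# Note (added for reference): the kl_grad term above is the derivative, with
# respect to each entry of ro_hat, of the usual sparse-autoencoder sparsity
# penalty. A minimal sketch of that penalty is below; the function name is
# illustrative and not part of the original class, and it assumes scalar
# ro/beta and a vector ro_hat, as in the code above.
import numpy as np

def kl_sparsity_penalty(ro, ro_hat, beta):
    # beta * sum_j KL(ro || ro_hat_j); its gradient w.r.t. ro_hat_j is
    # beta * (-ro/ro_hat_j + (1 - ro)/(1 - ro_hat_j)), matching kl_grad above
    return beta * np.sum(ro * np.log(ro / ro_hat)
                         + (1 - ro) * np.log((1 - ro) / (1 - ro_hat)))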
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss for an input sequence xs and
    output sequence (labels) ys.

    You should run the RNN forward, compute cross-entropy loss at each
    timestep, and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    ns = len(xs)
    hs = np.zeros((ns+1, self.hdim))
    for i in range(ns):
        hs[i+1] = sigmoid(self.params.H.dot(hs[i]) + self.sparams.L[xs[i]])
        p = softmax(self.params.U.dot(hs[i+1]))
        p = p * make_onehot(ys[i], self.vdim)
        J += -np.log(np.sum(p))
    #### END YOUR CODE ####
    return J
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss for an input sequence xs and
    output sequence (labels) ys.

    You should run the RNN forward, compute cross-entropy loss at each
    timestep, and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    ns = len(xs)
    hs = zeros((ns+1, self.hdim))
    ps = zeros((ns, self.vdim))
    for t in range(ns):
        hs[t] = sigmoid(self.params.H.dot(hs[t-1]) + self.sparams.L[xs[t], :])
        ps[t] = softmax(self.params.U.dot(hs[t]))
        J += -log(ps[t][ys[t]])
    #### END YOUR CODE ####
    return J
def compute_seq_loss(self, xs, ys):
    """
    Compute the total cross-entropy loss for an input sequence xs and
    output sequence (labels) ys.

    You should run the RNN forward, compute cross-entropy loss at each
    timestep, and return the sum of the point losses.
    """
    J = 0
    #### YOUR CODE HERE ####
    h = zeros(self.hdim)
    for t in range(len(xs)):
        h = sigmoid(self.params.H.dot(h) + self.sparams.L[xs[t]])
        pred = softmax(self.params.U.dot(h))
        J += -1 * log(pred[ys[t]])
    #### END YOUR CODE ####
    return J
def predict(self, xs):
    # predicts yhat based on xs
    # Expect xs as list of indices
    ns = len(xs)
    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    hs = zeros((ns+1, self.hdim))
    yhat = None

    ##
    # Forward propagation
    # hs[-1] is kindly all 0s (always)
    for t in xrange(ns):
        theta_t = dot(self.params.H, hs[t-1]) + self.sparams.L[xs[t]]
        hs[t] = sigmoid(theta_t)
        if t == ns - 1:
            yhat = softmax(dot(self.params.U, hs[t]))
    return yhat
def forward(self, X):
    """Forward propagation

    Parameter:
    X: A numpy array of size (n, m)
       n: the number of features (excluding the bias term)"""
    self.Z = np.dot(self.W, X)
    if self.use_bias:
        self.Z = self.Z + self.b
    if self.activation == 'sigmoid':
        self.A = sigmoid(self.Z)
    elif self.activation == 'relu':
        self.A = relu(self.Z)
    elif self.activation == 'softmax':
        self.A = softmax(self.Z)
    elif self.activation == 'tanh':
        self.A = tanh(self.Z)
    else:
        self.A = linear(self.Z)
    if self.next_layer is not None:
        # pass the activations (not the pre-activations) to the next layer,
        # matching the use of prev_layer.A in backward()
        return self.next_layer.forward(self.A)
    return self.A
def _acc_grads(self, xs, ys):
    """
    Accumulate gradients, given a pair of training sequences:
        xs = [<indices>]  # input words
        ys = [<indices>]  # output words (to predict)

    Your code should update self.grads and self.sgrads, in order for
    gradient_check and training to work. So, for example:
        self.grads.H += (your gradient dJ/dH)
        self.sgrads.L[i] = (gradient dJ/dL[i])  # update row

    Per the handout, you should:
        - make predictions by running forward in time through the
          entire input sequence
        - for *each* output word in ys, compute the gradients with
          respect to the cross-entropy loss for that output word
        - run backpropagation-through-time for self.bptt timesteps,
          storing grads in self.grads (for H, U) and self.sgrads (for L)

    You'll want to store your predictions \hat{y}(t) and the hidden layer
    values h(t) as you run forward, so that you can access them during
    backpropagation. At time 0, you should initialize the hidden layer
    to be a vector of zeros.
    """
    # Expect xs as list of indices
    ns = len(xs)
    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    hs = zeros((ns+1, self.hdim))
    # predicted probas
    ps = zeros((ns, self.vdim))

    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    for t in xrange(ns):
        hs[t] = sigmoid(dot(self.params.H, hs[t-1]) + self.sparams.L[xs[t]])
        ps[t] = softmax(dot(self.params.U, hs[t]))

    ##
    # Backward propagation through time
    for j in xrange(ns):
        y = make_onehot(ys[j], self.vdim)
        y_hat_minus_y = ps[j] - y
        self.grads.U += outer(y_hat_minus_y, hs[j])
        delta = dot(self.params.U.T, y_hat_minus_y) * hs[j] * (1.0 - hs[j])
        # start at j and go back self.bptt times
        # (total self.bptt + 1 elements, including current one)
        for t in xrange(j, j - self.bptt - 1, -1):
            if t - 1 >= -1:
                self.grads.H += outer(delta, hs[t-1])
                self.sgrads.L[xs[t]] = delta
                delta = dot(self.params.H.T, delta) * hs[t-1] * (1.0 - hs[t-1])
def _acc_grads(self, xs, ys):
    """
    Accumulate gradients, given a pair of training sequences:
        xs = [<indices>]  # input words
        ys = [<indices>]  # output words (to predict)

    Your code should update self.grads and self.sgrads, in order for
    gradient_check and training to work. So, for example:
        self.grads.H += (your gradient dJ/dH)
        self.sgrads.L[i] = (gradient dJ/dL[i])  # update row

    Per the handout, you should:
        - make predictions by running forward in time through the
          entire input sequence
        - for *each* output word in ys, compute the gradients with
          respect to the cross-entropy loss for that output word
        - run backpropagation-through-time for self.bptt timesteps,
          storing grads in self.grads (for H, U) and self.sgrads (for L)

    You'll want to store your predictions \hat{y}(t) and the hidden layer
    values h(t) as you run forward, so that you can access them during
    backpropagation. At time 0, you should initialize the hidden layer to
    be a vector of zeros.
    """
    # Expect xs as list of indices
    ns = len(xs)
    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    hs = zeros((ns+1, self.hdim))
    # predicted probas
    ps = zeros((ns, self.vdim))

    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    for i in xrange(ns):
        hs[i] = sigmoid(self.params.H.dot(hs[i-1]) + self.sparams.L[xs[i]])
        ps[i] = softmax(self.params.U.dot(hs[i]))

    ##
    # Backward propagation through time
    for i in xrange(ns):
        delta_pre = ps[i]
        delta_pre[ys[i]] -= 1
        self.grads.U += outer(delta_pre, hs[i])
        delta = self.params.U.T.dot(delta_pre) * hs[i] * (1 - hs[i])
        j = i
        while j >= 0 and j >= (i - self.bptt):
            self.grads.H += outer(delta, hs[j-1])
            self.sgrads.L[xs[j]] = delta
            delta = self.params.H.T.dot(delta) * hs[j-1] * (1 - hs[j-1])
            j -= 1
def _acc_grads(self, xs, ys):
    """
    Accumulate gradients, given a pair of training sequences:
        xs = [<indices>]  # input words
        ys = [<indices>]  # output words (to predict)

    Your code should update self.grads and self.sgrads, in order for
    gradient_check and training to work. So, for example:
        self.grads.H += (your gradient dJ/dH)
        self.sgrads.L[i] = (gradient dJ/dL[i])  # update row

    Per the handout, you should:
        - make predictions by running forward in time through the
          entire input sequence
        - for *each* output word in ys, compute the gradients with
          respect to the cross-entropy loss for that output word
        - run backpropagation-through-time for self.bptt timesteps,
          storing grads in self.grads (for H, U) and self.sgrads (for L)

    You'll want to store your predictions \hat{y}(t) and the hidden layer
    values h(t) as you run forward, so that you can access them during
    backpropagation. At time 0, you should initialize the hidden layer to
    be a vector of zeros.
    """
    # Expect xs as list of indices
    ns = len(xs)
    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    hs = zeros((ns+1, self.hdim))
    # predicted probas
    ps = zeros((ns, self.vdim))

    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    for step in xrange(ns):
        a1 = self.params.H.dot(hs[step-1].T).T + self.sparams.L[xs[step]]
        a1 = expand_dims(a1, axis=0)
        h = sigmoid(a1)
        a2 = self.params.U.dot(h.T).T
        y_hat = softmax(a2)
        hs[step] = h.flatten()
        ps[step] = y_hat

    ##
    # Backward propagation through time
    for step in xrange(ns - 1, -1, -1):
        t = zeros(ps[step].shape)
        t[ys[step]] = 1
        delta_out = ps[step] - t
        self.grads.U += outer(hs[step], delta_out).T
        delta_hidden = delta_out.dot(self.params.U) * sigmoid_grad(hs[step])
        for step_bp in xrange(step, step - self.bptt - 1, -1):
            if step_bp < 0:
                break
            self.grads.H += outer(delta_hidden, hs[step_bp-1])
            self.sgrads.L[xs[step_bp]] = delta_hidden
            delta_hidden = delta_hidden.dot(self.params.H) * sigmoid_grad(hs[step_bp-1])
def _acc_grads(self, X, y, just_probs=False):
    """
    Accumulate gradients, given a training example:
        X = input word vectors (N x WvecDim matrix)
        y = document classification (as an integer)

    Your code should update self.grads and self.sgrads, in order for
    gradient_check and training to work. So, for example:
        self.grads.H += (your gradient dJ/dH)
        self.sgrads.L[i] = (gradient dJ/dL[i])  # update row
    """
    # Expect X as a matrix of word vectors
    # ith row is a word embedding for the ith word
    ns = X.shape[0]

    #### YOUR CODE HERE ####
    ##############
    # FORWARD PROP
    # X.shape = (ns, Dw)

    #### A1
    # A1.shape = (ns, Dh)
    A1 = sigmoid((self.params.W11.dot(X.T)).T + self.params.b11)
    assert A1.shape == (ns, self.hdim)
    # if dropout, zero out a random subset of A1
    if self.drop_p > 0.:
        A1[random.rand(*A1.shape) <= self.drop_p] = 0.

    #### A2
    # A2.shape = (ns, Dh)
    A2 = sigmoid((self.params.W12.dot(A1.T)).T + self.params.b12)
    assert A2.shape == (ns, self.hdim)
    # if dropout, zero out a random subset of A2
    if self.drop_p > 0.:
        A2[random.rand(*A2.shape) <= self.drop_p] = 0.

    #### MAX POOLING
    # Max each node of A over time (max of each column over all rows);
    # use argmax so the indices can be reused in backprop
    mx = argmax(A2, 0)
    # Max pooling vector: selects the max element of each column of A2
    # h1.shape == (Dh,)
    h1 = A2[mx, list(range(len(mx)))]
    assert h1.shape == (self.hdim,)

    #### HIDDEN POOLED LAYER
    h2 = sigmoid(self.params.W21.dot(h1) + self.params.b21)
    assert h2.shape == (self.hdim,)

    # prediction probabilities
    ps = softmax(self.params.Ws.dot(h2) + self.params.bs)
    if just_probs:
        return ps

    #############
    # BACK PROP
    y = array(y).astype(int)

    #### SOFTMAX LAYER
    err_o = ps
    err_o[y] += -1
    self.grads.Ws += outer(err_o, h2)
    self.grads.bs += err_o
    err_h2 = self.params.Ws.T.dot(err_o) * h2 * (1 - h2)
    assert err_h2.shape == (self.hdim,)

    #### HIDDEN POOLED LAYER
    self.grads.W21 += outer(err_h2, h1)
    self.grads.b21 += err_h2
    err_h_max = self.params.W21.T.dot(err_h2) * h1 * (1 - h1)
    assert err_h_max.shape == (self.hdim,)

    #### HIDDEN UNPOOLED LAYER 2
    # the inputs to the hidden unpooled layer, for the examples that
    # went into the argmax instance of each node
    A1_max = A1[mx, :]
    assert A1_max.shape == (self.hdim, self.hdim)
    # multiply the same value into each row (columnwise multiplication), e.g.
    #   zeros((10,5)) * reshape(range(10), (10,))[:, newaxis]
    self.grads.W12 += A1_max * err_h_max[:, newaxis]
    self.grads.b12 += err_h_max

    # output for argmax node
    a1_mx = A1[mx, list(range(len(mx)))]
    err_A2 = zeros((ns, self.hdim))
    err_A2[mx, list(range(len(mx)))] = err_h_max
    assert err_A2.shape == (ns, self.hdim)
    err_a1_max = self.params.W12.T.dot(err_A2.T).T * A1 * (1 - A1)
    assert err_a1_max.shape == (ns, self.hdim)

    #### HIDDEN UNPOOLED LAYER 1
    self.grads.W11 += err_a1_max.T.dot(X)
    self.grads.b11 += sum(err_a1_max, axis=0)

    #### REGULARIZATION
    self.grads.W11 += self.rho * self.params.W11
    self.grads.W12 += self.rho * self.params.W12
    self.grads.W21 += self.rho * self.params.W21
    self.grads.Ws += self.rho * self.params.Ws
def forward_pass(self, x):
    "Compute the final and the hidden layer"
    z1 = self.params.W.dot(x) + self.params.b1
    h = sigmoid(z1)
    z2 = np.dot(self.params.U, h) + self.params.b2
    return (z2, h)
def _acc_grads(self, xs, ys):
    """
    Accumulate gradients, given a pair of training sequences:
        xs = [<indices>]  # input words
        ys = [<indices>]  # output words (to predict)

    Your code should update self.grads and self.sgrads, in order for
    gradient_check and training to work. So, for example:
        self.grads.H += (your gradient dJ/dH)
        self.sgrads.L[i] = (gradient dJ/dL[i])  # update row

    Per the handout, you should:
        - make predictions by running forward in time through the
          entire input sequence
        - for *each* output word in ys, compute the gradients with
          respect to the cross-entropy loss for that output word
        - run backpropagation-through-time for self.bptt timesteps,
          storing grads in self.grads (for H) and self.sgrads (for L,U)

    You'll want to store your predictions \hat{y}(t) and the hidden layer
    values h(t) as you run forward, so that you can access them during
    backpropagation. At time 0, you should initialize the hidden layer
    to be a vector of zeros.
    """
    # Expect xs as list of indices
    ns = len(xs)
    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    hs = zeros((ns+1, self.hdim))
    # predicted probas
    #ps = zeros((ns, self.vdim))
    yhat = None

    #### YOUR CODE HERE ####
    ##
    # Forward propagation
    # hs[-1] is kindly all 0s (always)
    for t in xrange(ns):
        theta_t = dot(self.params.H, hs[t-1]) + self.sparams.L[xs[t]]
        hs[t] = sigmoid(theta_t)
        if t == ns - 1:
            yhat = softmax(dot(self.params.U, hs[t]))

    ##
    # Backward propagation through time
    def get_delta_i(delta_next, t):
        ddht = dot(transpose(self.params.H), delta_next)
        return ddht * hs[t] * (1 - hs[t])

    #for t in xrange(ns-1, -1, -1):
    t = ns - 1
    dJ_dUht = yhat
    dJ_dUht[ys] -= 1  # (-y + yhat)
    self.grads.U += outer(dJ_dUht, hs[t])
    dJ_dht = dot(transpose(self.params.U), dJ_dUht)
    # h(t) = sig(theta)
    dJ_dThetat = dJ_dht * (hs[t]) * (1 - hs[t])
    delta_t = dJ_dThetat

    # BPTT
    delta_next = None
    i = t
    while i >= max(t - self.bptt + 1, 0):
        # note that bptt=1 means we only run it on regular t
        delta_i = get_delta_i(delta_next, i) if i != t else delta_t
        self.sgrads.L[xs[i]] = delta_i
        self.grads.H += outer(delta_i, hs[i-1])
        delta_next = delta_i
        i -= 1

    # regularization
    self.grads.H += self.lamb * self.params.H
    self.grads.U += self.lamb * self.params.U
def tanh(x): return 2 * sigmoid(2 * x) - 1.0
def _acc_grads(self, xs, ys):
    """
    Accumulate gradients, given a pair of training sequences:
        xs = [<indices>]  # input words
        ys = [<indices>]  # output words (to predict)

    Your code should update self.grads and self.sgrads, in order for
    gradient_check and training to work. So, for example:
        self.grads.H += (your gradient dJ/dH)
        self.sgrads.L[i] = (gradient dJ/dL[i])  # update row, matrix L: |V| * dim(h)

    Per the handout, you should:
        - make predictions by running forward in time through the
          entire input sequence
        - for *each* output word in ys, compute the gradients with
          respect to the cross-entropy loss for that output word
        - run backpropagation-through-time for self.bptt timesteps,
          storing grads in self.grads (for H, U) and self.sgrads (for L)

    You'll want to store your predictions \hat{y}(t) and the hidden layer
    values h(t) as you run forward, so that you can access them during
    backpropagation. At time 0, you should initialize the hidden layer
    to be a vector of zeros.
    """
    # Expect xs as list of indices
    ns = len(xs)
    # make matrix here of corresponding h(t)
    # hs[-1] = initial hidden state (zeros)
    # compact h(t)s into a matrix
    hs = zeros((ns+1, self.hdim))
    # predicted probas
    ps = zeros((ns, self.vdim))

    # Forward propagation
    for i in range(ns):
        z1 = self.params.H.dot(hs[i-1]) + self.sparams.L[xs[i]]
        hs[i] = sigmoid(z1)
        z2 = self.params.U.dot(hs[i])
        ps[i] = softmax(z2)

    # Matrix: each row is a prob distribution for predicting a certain word.
    ps_copy = ps.copy()
    ps_copy[arange(len(ys)), ys] -= 1.
    yhat_y = ps_copy

    mean_grads_H = zeros_like(self.params.H)
    mean_grads_U = zeros_like(self.params.U)

    # Backward propagation through time
    for t in reversed(range(ns)):  # start from the latest step, e.g. 4, 3, 2, 1, 0
        mean_grads_U += outer(yhat_y[t], hs[t])
        delta = (self.params.U.T.dot(yhat_y[t])) * (hs[t] * (1 - hs[t]))
        for s in range(min(t, self.bptt) + 1):
            #for s in range(max(0, t-self.bptt), t+1):
            mean_grads_H += outer(delta, hs[t-s-1])
            self.sgrads.L[xs[t-s]] = delta
            delta = self.params.H.T.dot(delta) * (hs[t-s-1] * (1 - hs[t-s-1]))

    self.grads.H += mean_grads_H
    self.grads.U += mean_grads_U
def d_tanh(x): return 4 * sigmoid(2.0 * x) * (1.0 - sigmoid(2.0 * x))
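# Note (added for reference): most snippets above assume a handful of small
# numeric helpers (sigmoid, sigmoid_grad, softmax, make_onehot,
# multinomial_sample), typically imported from the course's nn.math module.
# The definitions below are minimal, illustrative stand-ins for the 1-D case,
# not the originals. Be aware that the snippets are not consistent about
# whether sigmoid_grad receives the pre-activation or the sigmoid output;
# this sketch expects the output f = sigmoid(x).
import numpy as np

def sigmoid(x):
    # logistic function
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_grad(f):
    # derivative of sigmoid expressed in terms of its output f = sigmoid(x)
    return f * (1.0 - f)

def softmax(x):
    # shift by the max for numerical stability (1-D input)
    e = np.exp(x - np.max(x))
    return e / np.sum(e)

def make_onehot(i, n):
    # length-n one-hot vector with a 1 at index i
    v = np.zeros(n)
    v[i] = 1.0
    return v

def multinomial_sample(p):
    # sample an index from the probability vector p
    return int(np.argmax(np.random.multinomial(1, p)))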