Exemple #1
0
    def forwardProp(self, node, correct=None, guess=None):
        """Forward-propagate through the subtree rooted at ``node``.

        Same recursive setup as forwardProp in rnn.py, with a second hidden
        layer (``hActs2``) that is multiplied element-wise by ``self.mask``
        before the softmax.

        Side effects: sets ``fprop``, ``hActs1``, ``hActs2`` and ``probs`` on
        each processed node and appends that node's label / arg-max
        prediction to ``correct`` / ``guess``.

        Args:
            node: tree node exposing ``isLeaf``, ``label`` and either
                ``word`` (leaf) or ``left``/``right`` (internal).
            correct: list collecting gold labels; a new list when omitted.
            guess: list collecting predicted labels; a new list when omitted.

        Returns:
            ``(cost, total)``: summed cross-entropy cost over the subtree and
            the number of nodes visited.
        """
        # The previous ``[]`` defaults were created once at definition time
        # and silently shared by every call that omitted the arguments.
        if correct is None:
            correct = []
        if guess is None:
            guess = []

        cost = total = 0.0
        if node.isLeaf == True:
            node.fprop = True
            # Leaf representation is the word's column of the embedding matrix.
            node.hActs1 = self.L[:, node.word]
            node.hActs2 = self.ReLU(self.W2.dot(node.hActs1) + self.b2)
            node.probs = softmax(
                self.Ws.dot(node.hActs2 * self.mask) + self.bs)
            # Multiplying by the one-hot target keeps only the gold-class
            # probability, so the sum below is that single probability.
            p = node.probs * make_onehot(node.label, len(self.bs))
            cost = -np.log(np.sum(p))
            correct.append(node.label)
            guess.append(np.argmax(node.probs))
            return cost, 1

        c1, t1 = self.forwardProp(node.left, correct, guess)
        c2, t2 = self.forwardProp(node.right, correct, guess)
        # The node itself is only scored when both children were propagated.
        if node.left.fprop and node.right.fprop:
            node.fprop = True
            h = np.hstack([node.left.hActs1, node.right.hActs1])
            node.hActs1 = self.ReLU(self.W1.dot(h) + self.b1)
            node.hActs2 = self.ReLU(self.W2.dot(node.hActs1) + self.b2)
            node.probs = softmax(
                self.Ws.dot(node.hActs2 * self.mask) + self.bs)
            p = node.probs * make_onehot(node.label, len(self.bs))
            cost = -np.log(np.sum(p))
            correct.append(node.label)
            guess.append(np.argmax(node.probs))

        cost += c1
        cost += c2
        total += t1
        total += t2
        return cost, total + 1
Exemple #2
0
 def forwardProp(self, node, correct=None, guess=None):
     """Forward-propagate through the subtree rooted at ``node``.

     Same recursive setup as forwardProp in rnn.py, with a second hidden
     layer (``hActs2``) feeding the softmax.

     Side effects: sets ``fprop``, ``hActs1``, ``hActs2`` and ``probs`` on
     each processed node and appends that node's label / arg-max prediction
     to ``correct`` / ``guess``.

     Args:
         node: tree node exposing ``isLeaf``, ``label`` and either ``word``
             (leaf) or ``left``/``right`` (internal).
         correct: list collecting gold labels; a new list when omitted.
         guess: list collecting predicted labels; a new list when omitted.

     Returns:
         ``(cost, total)``: summed cross-entropy cost over the subtree and
         the number of nodes visited.
     """
     # The previous ``[]`` defaults were created once at definition time and
     # silently shared by every call that omitted the arguments.
     if correct is None:
         correct = []
     if guess is None:
         guess = []

     cost = total = 0.0
     if node.isLeaf == True:
         node.fprop = True
         # Leaf representation is the word's column of the embedding matrix.
         node.hActs1 = self.L[:, node.word]
         node.hActs2 = self.ReLU(self.W2.dot(node.hActs1) + self.b2)
         node.probs = softmax(self.Ws.dot(node.hActs2) + self.bs)
         # The one-hot product keeps only the gold-class probability.
         p = node.probs * make_onehot(node.label, len(self.bs))
         cost = -np.log(np.sum(p))
         correct.append(node.label)
         guess.append(np.argmax(node.probs))
         return cost, 1

     c1, t1 = self.forwardProp(node.left, correct, guess)
     c2, t2 = self.forwardProp(node.right, correct, guess)
     # The node itself is only scored when both children were propagated.
     if node.left.fprop and node.right.fprop:
         node.fprop = True
         h = np.hstack([node.left.hActs1, node.right.hActs1])
         node.hActs1 = self.ReLU(self.W1.dot(h) + self.b1)
         node.hActs2 = self.ReLU(self.W2.dot(node.hActs1) + self.b2)
         node.probs = softmax(self.Ws.dot(node.hActs2) + self.bs)
         p = node.probs * make_onehot(node.label, len(self.bs))
         cost = -np.log(np.sum(p))
         correct.append(node.label)
         guess.append(np.argmax(node.probs))

     cost += c1
     cost += c2
     total += t1
     total += t2
     return cost, total + 1
    def backprop(self, xs, ys, hs, y_hat):
        """Accumulate gradients for the prediction made at the last timestep.

        The output error is scaled by ``self.params.weights``, added to
        ``grads.b2`` / ``grads.U``, and then pushed back through the hidden
        states for fewer than ``self.bptt`` steps, updating ``grads.H``,
        ``grads.b1`` and the ``sgrads.L`` row of every visited input word.

        xs    -- input word indices
        ys    -- target vector compared against y_hat
        hs    -- hidden states indexed by timestep; hs[-1] is read once the
                 recursion reaches timestep 0 (presumably the initial state
                 -- confirm against the forward pass)
        y_hat -- predicted output vector
        """
        def as_column(vec):
            return vec.reshape(len(vec), 1)

        def accumulate(step, err):
            # Gradients contributed by the hidden error at one timestep.
            self.grads.H += err.dot(as_column(hs[step - 1]).T)
            self.grads.b1 += err.reshape((len(err), ))
            word = xs[step]
            onehot = make_onehot(word, self.vdim).reshape(self.vdim, 1)
            self.sgrads.L[word] = onehot.dot(err.T)[word]

        last = len(xs) - 1

        # Output layer.
        out_err = self.params.weights * (y_hat - ys)
        self.grads.b2 += out_err
        h_last = as_column(hs[last])
        out_err = out_err.reshape(len(ys), 1)
        self.grads.U += out_err.dot(h_last.T)

        # Hidden layer at the last timestep; h * (1 - h) is the activation
        # derivative factor.
        err = self.params.U.T.dot(out_err) * h_last * (1 - h_last)
        accumulate(last, err)

        # Earlier timesteps, walking backwards.
        for back in range(1, self.bptt):
            step = last - back
            if step < 0:  # ran past the start of the sequence
                break
            h_step = as_column(hs[step])
            err = self.params.H.T.dot(err) * h_step * (1 - h_step)
            accumulate(step, err)
Exemple #4
0
 def backprop(self, xs, ys, hs, y_hat):
     """Accumulate gradients for the prediction made at the last timestep.

     The output error ``y_hat - ys`` is added to ``grads.b2`` / ``grads.U``
     and then pushed back through the hidden states for fewer than
     ``self.bptt`` steps, updating ``grads.H``, ``grads.b1`` and the
     ``sgrads.L`` row of every visited input word.

     xs    -- input word indices
     ys    -- target vector compared against y_hat
     hs    -- hidden states indexed by timestep; hs[-1] is read once the
              recursion reaches timestep 0 (presumably the initial state --
              confirm against the forward pass)
     y_hat -- predicted output vector
     """
     def as_column(vec):
         return vec.reshape(len(vec), 1)

     def accumulate(step, err):
         # Gradients contributed by the hidden error at one timestep.
         self.grads.H += err.dot(as_column(hs[step - 1]).T)
         self.grads.b1 += err.reshape((len(err),))
         word = xs[step]
         onehot = make_onehot(word, self.vdim).reshape(self.vdim, 1)
         self.sgrads.L[word] = onehot.dot(err.T)[word]

     last = len(xs) - 1

     # Output layer.
     out_err = (y_hat - ys)
     self.grads.b2 += out_err
     h_last = as_column(hs[last])
     out_err = out_err.reshape(len(ys), 1)
     self.grads.U += out_err.dot(h_last.T)

     # Hidden layer at the last timestep; h * (1 - h) is the activation
     # derivative factor.
     err = self.params.U.T.dot(out_err) * h_last * (1 - h_last)
     accumulate(last, err)

     # Earlier timesteps, walking backwards.
     for back in range(1, self.bptt):
         step = last - back
         if step < 0:  # ran past the start of the sequence
             break
         h_step = as_column(hs[step])
         err = self.params.H.T.dot(err) * h_step * (1 - h_step)
         accumulate(step, err)
Exemple #5
0
    def forwardProp(self, node, correct, guess):
        """Forward pass over the subtree rooted at ``node`` (tensor variant).

        Leaves take their vector from ``self.L``; an internal node combines
        its children through ``self.W``, ``self.b`` and one bilinear form
        ``h . V[k] . h`` per output unit. Each processed node gets
        ``fprop``, ``hActs1`` and ``probs`` set, and its label / arg-max
        prediction appended to ``correct`` / ``guess``.

        Returns (cost, total): summed cross-entropy cost of the subtree and
        the number of nodes visited.
        """
        if node.isLeaf == True:
            node.fprop = True
            node.hActs1 = self.L[:, node.word]
            node.probs = softmax(self.Ws.dot(node.hActs1) + self.bs)
            target = make_onehot(node.label, len(self.bs))
            leaf_cost = -np.log(np.sum(node.probs * target))
            correct.append(node.label)
            guess.append(np.argmax(node.probs))
            return leaf_cost, 1

        left_cost, left_count = self.forwardProp(node.left, correct, guess)
        right_cost, right_count = self.forwardProp(node.right, correct, guess)

        node_cost = 0.0
        # Only score this node when both children were propagated.
        if node.left.fprop and node.right.fprop:
            node.fprop = True
            pair = np.hstack([node.left.hActs1, node.right.hActs1])
            bilinear = np.zeros(len(node.left.hActs1))
            for k in range(len(bilinear)):
                bilinear[k] = pair.dot(self.V[k]).dot(pair)
            node.hActs1 = self.ReLU(self.W.dot(pair) + self.b + bilinear)
            node.probs = softmax(self.Ws.dot(node.hActs1) + self.bs)
            target = make_onehot(node.label, len(self.bs))
            node_cost = -np.log(np.sum(node.probs * target))
            correct.append(node.label)
            guess.append(np.argmax(node.probs))

        subtree_cost = node_cost + left_cost + right_cost
        visited = 0.0 + left_count + right_count + 1
        return subtree_cost, visited
Exemple #6
0
 def forwardProp(self, node, correct, guess):
     """Forward pass over the subtree rooted at ``node`` (tensor variant).

     Leaves take their vector from ``self.L``; an internal node combines its
     children through ``self.W``, ``self.b`` and one bilinear form
     ``h . V[k] . h`` per output unit. Each processed node gets ``fprop``,
     ``hActs1`` and ``probs`` set, and its label / arg-max prediction
     appended to ``correct`` / ``guess``.

     Returns (cost, total): summed cross-entropy cost of the subtree and the
     number of nodes visited.
     """
     if node.isLeaf == True:
         node.fprop = True
         node.hActs1 = self.L[:, node.word]
         node.probs = softmax(self.Ws.dot(node.hActs1) + self.bs)
         target = make_onehot(node.label, len(self.bs))
         leaf_cost = -np.log(np.sum(node.probs * target))
         correct.append(node.label)
         guess.append(np.argmax(node.probs))
         return leaf_cost, 1

     left_cost, left_count = self.forwardProp(node.left, correct, guess)
     right_cost, right_count = self.forwardProp(node.right, correct, guess)

     node_cost = 0.0
     # Only score this node when both children were propagated.
     if node.left.fprop and node.right.fprop:
         node.fprop = True
         pair = np.hstack([node.left.hActs1, node.right.hActs1])
         bilinear = np.zeros(len(node.left.hActs1))
         for k in range(len(bilinear)):
             bilinear[k] = pair.dot(self.V[k]).dot(pair)
         node.hActs1 = self.ReLU(self.W.dot(pair) + self.b + bilinear)
         node.probs = softmax(self.Ws.dot(node.hActs1) + self.bs)
         target = make_onehot(node.label, len(self.bs))
         node_cost = -np.log(np.sum(node.probs * target))
         correct.append(node.label)
         guess.append(np.argmax(node.probs))

     subtree_cost = node_cost + left_cost + right_cost
     visited = 0.0 + left_count + right_count + 1
     return subtree_cost, visited
Exemple #7
0
    def backprop(self, xs, ys, hs_f, hs_b, y_hat):
        """Accumulate last-timestep gradients for the bidirectional network.

        The output error ``y_hat - ys`` updates ``grads.b2`` / ``grads.U``
        and is then propagated separately through the forward states
        (``hs_f``, inputs ``xs``) and the backward states (``hs_b``, inputs
        ``xs`` reversed) for fewer than ``self.bptt`` steps, updating
        ``grads.H_f``/``grads.b1_f``, ``grads.H_b``/``grads.b1_b`` and the
        ``sgrads.L`` rows of the visited words.

        xs    -- input word indices
        ys    -- target vector compared against y_hat
        hs_f  -- forward hidden states indexed by timestep
        hs_b  -- backward hidden states indexed by timestep
        y_hat -- predicted output vector
        """
        inverted_xs = list(reversed(xs))
        ns = len(xs)
        ht_f = hs_f[ns - 1].reshape(len(hs_f[ns - 1]), 1)
        ht_b = hs_b[ns - 1].reshape(len(hs_b[ns - 1]), 1)
        delta = (y_hat - ys)
        self.grads.b2 += delta
        delta = delta.reshape(len(ys), 1)
        # NOTE(review): if hstack is numpy's, stacking two column vectors
        # gives an (n, 2) array, so this reshape interleaves forward and
        # backward entries, whereas the split of U.T.dot(delta) below treats
        # the first hdim entries as forward -- confirm against the forward pass.
        self.grads.U += delta.dot(
            hstack([ht_f, ht_b]).reshape((1, 2 * len(ht_f))))

        # H and L
        t = ns - 1  # last t
        # Back-propagate through U once, then split per direction.
        back = self.params.U.T.dot(delta)
        current_f = back[:self.hdim] * ht_f * (1 - ht_f)
        current_b = back[self.hdim:] * ht_b * (1 - ht_b)

        # update initial Hs
        prev_ht_f = hs_f[t - 1].reshape(len(hs_f[t - 1]), 1)
        self.grads.H_f += current_f.dot(prev_ht_f.T)
        self.grads.b1_f += current_f.reshape((len(current_f), ))

        prev_ht_b = hs_b[t - 1].reshape(len(hs_b[t - 1]), 1)
        self.grads.H_b += current_b.dot(prev_ht_b.T)
        self.grads.b1_b += current_b.reshape((len(current_b), ))

        # update initial L
        xt = make_onehot(xs[t], self.vdim).reshape(self.vdim, 1)
        self.sgrads.L[xs[t]] = xt.dot(current_f.T)[xs[t]]
        inv_xt = make_onehot(inverted_xs[t], self.vdim).reshape(self.vdim, 1)
        self.sgrads.L[inverted_xs[t]] = inv_xt.dot(current_b.T)[inverted_xs[t]]

        # update the rest
        for i in range(1, self.bptt):
            if t < i:  # no earlier timestep left; every later i is larger still
                break
            ht_f_i = hs_f[t - i].reshape(len(hs_f[t - i]), 1)
            prev_ht_f_i = hs_f[t - i - 1].reshape(len(hs_f[t - i - 1]), 1)
            current_f = self.params.H_f.T.dot(current_f) * ht_f_i * (1 - ht_f_i)
            self.grads.H_f += current_f.dot(prev_ht_f_i.T)
            # Flatten with the forward error's own length (this used to read
            # len(current_b), which is only right when both sizes match).
            self.grads.b1_f += current_f.reshape((len(current_f), ))

            ht_b_i = hs_b[t - i].reshape(len(hs_b[t - i]), 1)
            prev_ht_b_i = hs_b[t - i - 1].reshape(len(hs_b[t - i - 1]), 1)
            current_b = self.params.H_b.T.dot(current_b) * ht_b_i * (1 - ht_b_i)
            self.grads.H_b += current_b.dot(prev_ht_b_i.T)
            self.grads.b1_b += current_b.reshape((len(current_b), ))

            prev_xt = make_onehot(xs[t - i], self.vdim).reshape(self.vdim, 1)
            self.sgrads.L[xs[t - i]] = prev_xt.dot(current_f.T)[xs[t - i]]
            prev_inv_xt = make_onehot(inverted_xs[t - i],
                                      self.vdim).reshape(self.vdim, 1)
            self.sgrads.L[inverted_xs[t - i]] = prev_inv_xt.dot(
                current_b.T)[inverted_xs[t - i]]
    def backprop(self, xs, ys, hs_f, hs_b, y_hat):
        """Accumulate last-timestep gradients for the bidirectional network.

        The output error, scaled by ``self.params.weights``, updates
        ``grads.b2`` / ``grads.U`` and is then propagated separately through
        the forward states (``hs_f``, inputs ``xs``) and the backward states
        (``hs_b``, inputs ``xs`` reversed) for fewer than ``self.bptt``
        steps, updating ``grads.H_f``/``grads.b1_f``,
        ``grads.H_b``/``grads.b1_b`` and the ``sgrads.L`` rows of the
        visited words.

        xs    -- input word indices
        ys    -- target vector compared against y_hat
        hs_f  -- forward hidden states indexed by timestep
        hs_b  -- backward hidden states indexed by timestep
        y_hat -- predicted output vector
        """
        inverted_xs = list(reversed(xs))
        ns = len(xs)
        ht_f = hs_f[ns-1].reshape(len(hs_f[ns-1]), 1)
        ht_b = hs_b[ns-1].reshape(len(hs_b[ns-1]), 1)
        delta = self.params.weights*(y_hat - ys)
        self.grads.b2 += delta
        delta = delta.reshape(len(ys), 1)
        # NOTE(review): if hstack is numpy's, stacking two column vectors
        # gives an (n, 2) array, so this reshape interleaves forward and
        # backward entries, whereas the split of U.T.dot(delta) below treats
        # the first hdim entries as forward -- confirm against the forward pass.
        self.grads.U += delta.dot(hstack([ht_f, ht_b]).reshape((1, 2*len(ht_f))))

        # H and L
        t = ns-1  # last t
        # Back-propagate through U once, then split per direction.
        back = self.params.U.T.dot(delta)
        current_f = back[:self.hdim] * ht_f * (1-ht_f)
        current_b = back[self.hdim:] * ht_b * (1-ht_b)

        # update initial Hs
        prev_ht_f = hs_f[t-1].reshape(len(hs_f[t-1]), 1)
        self.grads.H_f += current_f.dot(prev_ht_f.T)
        self.grads.b1_f += current_f.reshape((len(current_f),))

        prev_ht_b = hs_b[t-1].reshape(len(hs_b[t-1]), 1)
        self.grads.H_b += current_b.dot(prev_ht_b.T)
        self.grads.b1_b += current_b.reshape((len(current_b),))

        # update initial L
        xt = make_onehot(xs[t], self.vdim).reshape(self.vdim, 1)
        self.sgrads.L[xs[t]] = xt.dot(current_f.T)[xs[t]]
        inv_xt = make_onehot(inverted_xs[t], self.vdim).reshape(self.vdim, 1)
        self.sgrads.L[inverted_xs[t]] = inv_xt.dot(current_b.T)[inverted_xs[t]]

        # update the rest
        for i in range(1, self.bptt):
            if t < i:  # no earlier timestep left; every later i is larger still
                break
            ht_f_i = hs_f[t-i].reshape(len(hs_f[t-i]), 1)
            prev_ht_f_i = hs_f[t-i-1].reshape(len(hs_f[t-i-1]), 1)
            current_f = self.params.H_f.T.dot(current_f)*ht_f_i*(1-ht_f_i)
            self.grads.H_f += current_f.dot(prev_ht_f_i.T)
            # Flatten with the forward error's own length (this used to read
            # len(current_b), which is only right when both sizes match).
            self.grads.b1_f += current_f.reshape((len(current_f),))

            ht_b_i = hs_b[t-i].reshape(len(hs_b[t-i]), 1)
            prev_ht_b_i = hs_b[t-i-1].reshape(len(hs_b[t-i-1]), 1)
            current_b = self.params.H_b.T.dot(current_b)*ht_b_i*(1-ht_b_i)
            self.grads.H_b += current_b.dot(prev_ht_b_i.T)
            self.grads.b1_b += current_b.reshape((len(current_b),))

            prev_xt = make_onehot(xs[t-i], self.vdim).reshape(self.vdim, 1)
            self.sgrads.L[xs[t-i]] = prev_xt.dot(current_f.T)[xs[t-i]]
            prev_inv_xt = make_onehot(inverted_xs[t-i], self.vdim).reshape(self.vdim, 1)
            self.sgrads.L[inverted_xs[t-i]] = prev_inv_xt.dot(current_b.T)[inverted_xs[t-i]]
    def b_prop(self, ys):
        """Back-propagate through time over the stored forward pass.

        Reads ``self.yhats`` and ``self.hs`` (column ``t`` is timestep ``t``)
        and accumulates into ``self.grads['U']``, ``['b2']``, ``['b1']`` and
        ``['Wh']``. Gradients for the input weights and embeddings are not
        accumulated here.

        Args:
            ys: target class indices, one per timestep.

        Returns:
            The error passed below timestep 0 (``Wh^T . delta_2`` of the
            first step), or a zero vector of length ``self.hdim`` when
            ``ys`` is empty (previously an UnboundLocalError).
        """
        Wh = self.params['Wh']
        U = self.params['U']

        # Error arriving from the following timestep; nothing follows the last.
        delta_above = np.zeros(self.hdim)
        for t in reversed(range(len(ys))):
            delta_3 = self.yhats[:, t] - make_onehot(ys[t], self.outdim)
            self.grads['U'] += np.outer(delta_3, self.hs[:, t])
            self.grads['b2'] += delta_3
            dh = np.dot(np.transpose(U), delta_3) + delta_above
            # Gate by where the hidden activation is positive.
            delta_2 = dh * (self.hs[:, t] > 0)
            self.grads['b1'] += delta_2
            # NOTE(review): at t == 0 this reads column -1 of self.hs --
            # presumably the initial hidden state; confirm with the forward pass.
            self.grads['Wh'] += np.outer(delta_2, self.hs[:, t - 1])
            delta_above = np.dot(np.transpose(Wh), delta_2)
        return delta_above
Exemple #10
0
    def compute_loss(self, windows, labels):
        """
        Compute the loss for a given dataset.
        windows = same as for predict_proba
        labels = list of class labels, for each row of windows
        """

        #### YOUR CODE HERE ####

        print "windows shape ", windows.shape 
        x = self.sparams.L[windows[:,0]]
        for i in range(len(windows[0])-1):
            x = np.concatenate((x,self.sparams.L[windows[:,i+1]]),axis=1)

        z = self.params.W.dot(x.T)+self.params.b1.reshape((self.params.b1.shape[0],1))
        h = tanh(z)
        p = softmax(self.params.U.dot(h)+self.params.b2.reshape((self.params.b2.shape[0],1)))
        labelArray = np.zeros((len(labels),self.params.b2.shape[0]))
        for i in range(len(labels)):
            labelArray[i] = make_onehot(labels[i],self.params.b2.shape[0])
        batch = len(labels)
        p = p*labelArray.T
        p = np.sum(p,axis=0)
        J = np.sum(-np.log(p))
        Jreg = batch*(self.lreg/2.0)*(np.sum(self.params.W**2)+np.sum(self.params.U**2))
        J += Jreg                    
        #### END YOUR CODE ####
        return J
Exemple #11
0
    def backProp(self, node, error=None):
        """Recursively back-propagate through the subtree rooted at ``node``.

        Updates self.dWs, self.dbs, self.dW, self.db and self.dL[node.word].

        node  -- current node in the parse tree
        error -- error passed down from the parent, or None at the root
        """
        # Clear the forward-pass marker.
        node.fprop = False

        # Softmax layer.
        out_delta = node.probs - make_onehot(node.label, len(self.bs))
        self.dWs += np.outer(out_delta, node.hActs1)
        self.dbs += out_delta

        # Error at this node's activation, plus whatever the parent sent.
        act_delta = out_delta.dot(self.Ws)
        if error is not None:
            act_delta += error

        # Leaves only update their word vector.
        if node.isLeaf == True:
            self.dL[node.word] += act_delta
            return

        act_delta = act_delta * self.df(node.hActs1)
        children = np.hstack([node.left.hActs1, node.right.hActs1])
        self.dW += np.outer(act_delta, children)
        self.db += act_delta

        # The first wvecDim entries go to the left child, the rest to the right.
        child_delta = act_delta.dot(self.W)
        split = self.wvecDim
        self.backProp(node.left, child_delta[:split])
        self.backProp(node.right, child_delta[split:])
Exemple #12
0
    def compute_seq_loss(self, xs, ys):
        """
        Run the RNN forward over the input sequence xs and return the
        summed cross-entropy loss against the label sequence ys, plus an
        L2 penalty (weighted by self.lreg) on H, W and U.
        """
        steps = len(xs)
        # Row 0 is the all-zero initial state; row t + 1 is the state
        # after consuming xs[t].
        states = np.zeros((steps + 1, self.hdim))

        loss = 0
        for t, word in enumerate(xs):
            recurrent = self.params.H.dot(states[t])
            embedded = self.params.W.dot(self.sparams.L[word])
            states[t + 1] = sigmoid(recurrent + embedded)
            probs = softmax(self.params.U.dot(states[t + 1]))
            # The one-hot product keeps only the probability of ys[t].
            loss += -np.log(np.sum(probs * make_onehot(ys[t], self.vdim)))

        squared = (np.sum(self.params.H**2) + np.sum(self.params.W**2) +
                   np.sum(self.params.U**2))
        return loss + 0.5 * self.lreg * squared
Exemple #13
0
    def compute_loss(self, windows, labels):
        """
        Return the cross-entropy loss summed over the given windows, plus
        an L2 penalty (weighted by self.lreg) on W and U.

        windows = same as for predict_proba
        labels = list of class labels, for each row of windows
        """
        # A single window (first element not iterable) becomes a batch of one.
        if not hasattr(windows[0], "__iter__"):
            windows, labels = [windows], [labels]

        W, b1 = self.params.W, self.params.b1
        U, b2 = self.params.U, self.params.b2

        J = 0
        for n, window in enumerate(windows):
            vecs = self.sparams.L[window]
            # Flatten the window's word vectors into one input vector.
            x = reshape(vecs, vecs.shape[0]*vecs.shape[1])
            h = tanh(W.dot(x) + b1)
            y_hat = softmax(U.dot(h) + b2)
            y = make_onehot(labels[n], len(y_hat))
            J -= sum(y*log(y_hat))

        # The penalty is added once, not once per example.
        J += (self.lreg / 2.0) * (sum(W**2.0) + sum(U**2.0))
        return J
Exemple #14
0
    def backProp(self, node, error=None):
        """Recursively back-propagate through the subtree rooted at ``node``.

        Updates self.dWs, self.dbs, self.dW, self.db and self.dL[node.word].

        node  -- current node in the parse tree
        error -- error passed down from the parent, or None at the root
        """
        # Clear the forward-pass marker.
        node.fprop = False

        # Softmax layer.
        soft_err = node.probs - make_onehot(node.label, len(self.bs))
        self.dWs += np.outer(soft_err, node.hActs1)
        self.dbs += soft_err

        # Error at this node's activation, plus whatever the parent sent.
        node_err = soft_err.dot(self.Ws)
        if error is not None:
            node_err += error

        # Leaves only update their word vector.
        if node.isLeaf == True:
            self.dL[node.word] += node_err
            return

        node_err = node_err * self.df(node.hActs1)
        stacked = np.hstack([node.left.hActs1, node.right.hActs1])
        self.dW += np.outer(node_err, stacked)
        self.db += node_err

        # The first wvecDim entries go to the left child, the rest to the right.
        down = node_err.dot(self.W)
        cut = self.wvecDim
        self.backProp(node.left, down[:cut])
        self.backProp(node.right, down[cut:])
Exemple #15
0
    def b_prop(self, ys):
        """Back-propagate through time over the stored forward pass.

        Reads ``self.yhats`` and ``self.hs`` (column ``t`` is timestep ``t``)
        and accumulates into ``self.grads['U']``, ``['b2']``, ``['b1']`` and
        ``['Wh']``. Gradients for the input weights and embeddings are not
        accumulated here.

        Args:
            ys: target class indices, one per timestep.

        Returns:
            The error passed below timestep 0 (``Wh^T . delta_2`` of the
            first step), or a zero vector of length ``self.hdim`` when
            ``ys`` is empty (previously an UnboundLocalError).
        """
        Wh = self.params['Wh']
        U = self.params['U']

        # Error arriving from the following timestep; nothing follows the last.
        delta_above = np.zeros(self.hdim)
        for t in reversed(range(len(ys))):
            delta_3 = self.yhats[:, t] - make_onehot(ys[t], self.outdim)
            self.grads['U'] += np.outer(delta_3, self.hs[:, t])
            self.grads['b2'] += delta_3
            dh = np.dot(np.transpose(U), delta_3) + delta_above
            # Gate by where the hidden activation is positive.
            delta_2 = dh * (self.hs[:, t] > 0)
            self.grads['b1'] += delta_2
            # NOTE(review): at t == 0 this reads column -1 of self.hs --
            # presumably the initial hidden state; confirm with the forward pass.
            self.grads['Wh'] += np.outer(delta_2, self.hs[:, t - 1])
            delta_above = np.dot(np.transpose(Wh), delta_2)
        return delta_above
Exemple #16
0
    def compute_loss(self, windows, labels):
        """
        Return the cross-entropy loss summed over the given windows, plus
        an L2 penalty (weighted by self.lreg) on W and U.

        windows = same as for predict_proba
        labels = list of class labels, for each row of windows
        """
        # A single window (first element not iterable) becomes a batch of one.
        if not hasattr(windows[0], "__iter__"):
            windows, labels = [windows], [labels]

        W, b1 = self.params.W, self.params.b1
        U, b2 = self.params.U, self.params.b2

        J = 0
        for n, window in enumerate(windows):
            vecs = self.sparams.L[window]
            # Flatten the window's word vectors into one input vector.
            x = reshape(vecs, vecs.shape[0] * vecs.shape[1])
            h = tanh(W.dot(x) + b1)
            y_hat = softmax(U.dot(h) + b2)
            y = make_onehot(labels[n], len(y_hat))
            J -= sum(y * log(y_hat))

        # The penalty is added once, not once per example.
        J += (self.lreg / 2.0) * (sum(W**2.0) + sum(U**2.0))
        return J
Exemple #17
0
    def backProp(self, node, error=None):
        """Recursively back-propagate through the subtree rooted at ``node``.

        Same setup as backProp in rnn.py, with the extra hidden layer
        (W2, b2) between the node representation and the softmax.
        Updates self.dWs, self.dbs, self.dW2, self.db2, self.dW1, self.db1
        and self.dL[node.word].

        node  -- current node in the parse tree
        error -- error passed down from the parent, or None at the root
        """
        # Clear nodes
        node.fprop = False

        # Softmax layer.
        errorCur = node.probs - make_onehot(node.label, len(self.bs))
        self.dWs += np.outer(errorCur, node.hActs2)
        self.dbs += errorCur

        # Second hidden layer.
        errorCur = errorCur.dot(self.Ws)*self.df(node.hActs2)
        self.dW2 += np.outer(errorCur, node.hActs1)
        self.db2 += errorCur

        # Error at the node representation, plus whatever the parent sent.
        errorCur = errorCur.dot(self.W2)
        if error is not None:
            errorCur += error
        if node.isLeaf == True:
            self.dL[node.word] += errorCur
            return

        errorCur = errorCur*self.df(node.hActs1)
        # Plain outer product with the concatenated child activations; the
        # former ones(...).dot(diag(...)) / diag(...).dot(...) construction
        # produced the same matrix through two dense matrix products.
        children = np.hstack([node.left.hActs1, node.right.hActs1])
        self.dW1 += np.outer(errorCur, children)
        self.db1 += errorCur

        # The first wvecDim entries go to the left child, the rest to the right.
        errorCur = errorCur.dot(self.W1)
        self.backProp(node.left, errorCur[:self.wvecDim])
        self.backProp(node.right, errorCur[self.wvecDim:])
Exemple #18
0
    def _acc_grads(self, x, label):
        """
        Accumulate gradients from a training example.

        Runs self.forward_pass(x), treats the last entry of hs as the
        prediction, and walks the layers from the top down, adding bias
        and (L2-regularized) weight gradients via self._add_grads.
        """
        zs, hs = self.forward_pass(x)

        # Output error: prediction minus one-hot target.
        delta = hs[-1] - make_onehot(label, self.outputsize)

        top_layer = len(self.dims) - 1
        for depth, layer in enumerate(range(top_layer, 0, -1)):
            # Activation feeding this layer and pre-activation below it.
            h_idx = len(hs) - 2 - depth
            z_idx = len(zs) - 2 - depth

            self._add_grads('b', layer, delta)
            weights = self._get_param('W', layer)
            self._add_grads('W', layer,
                            np.outer(delta, hs[h_idx]) + self.lreg * weights)

            # Push the error one layer down while a pre-activation remains.
            if z_idx >= 0:
                delta = np.dot(weights.T, delta) * self.act_grad(zs[z_idx])
Exemple #19
0
    def backProp(self, node, error=None):
        """Recursively back-propagate through the subtree rooted at ``node``.

        Same setup as backProp in rnn.py, with the extra hidden layer
        (W2, b2) whose activations are multiplied by ``self.mask`` before
        the softmax. Updates self.dWs, self.dbs, self.dW2, self.db2,
        self.dW1, self.db1 and self.dL[node.word].

        node  -- current node in the parse tree
        error -- error passed down from the parent, or None at the root
        """
        # Clear nodes
        node.fprop = False

        # Softmax layer (it saw the masked second-layer activations).
        errorCur = node.probs - make_onehot(node.label, len(self.bs))
        self.dWs += np.outer(errorCur, node.hActs2 * self.mask)
        self.dbs += errorCur

        # Second hidden layer; the mask gates the error as well.
        errorCur = errorCur.dot(self.Ws) * self.df(node.hActs2) * self.mask
        self.dW2 += np.outer(errorCur, node.hActs1)
        self.db2 += errorCur

        # Error at the node representation, plus whatever the parent sent.
        errorCur = errorCur.dot(self.W2)
        if error is not None:
            errorCur += error
        if node.isLeaf == True:
            self.dL[node.word] += errorCur
            return

        errorCur = errorCur * self.df(node.hActs1)
        # Plain outer product with the concatenated child activations; the
        # former ones(...).dot(diag(...)) / diag(...).dot(...) construction
        # produced the same matrix through two dense matrix products.
        children = np.hstack([node.left.hActs1, node.right.hActs1])
        self.dW1 += np.outer(errorCur, children)
        self.db1 += errorCur

        # The first wvecDim entries go to the left child, the rest to the right.
        errorCur = errorCur.dot(self.W1)
        self.backProp(node.left, errorCur[:self.wvecDim])
        self.backProp(node.right, errorCur[self.wvecDim:])
Exemple #20
0
    def backProp(self, node, error=None):
        """Recursively back-propagate through the subtree (tensor variant).

        Updates self.dWs, self.dbs, self.dW, self.db, every slice of
        self.dV and self.dL[node.word].

        node  -- current node in the parse tree
        error -- error passed down from the parent, or None at the root
        """
        # Clear the forward-pass marker.
        node.fprop = False

        # Softmax layer.
        soft_err = node.probs - make_onehot(node.label, len(self.bs))
        self.dWs += np.outer(soft_err, node.hActs1)
        self.dbs += soft_err

        # Error at this node's activation, plus whatever the parent sent.
        node_err = soft_err.dot(self.Ws)
        if error is not None:
            node_err += error

        if node.isLeaf == True:
            self.dL[node.word] += node_err
            return

        node_err = node_err * self.df(node.hActs1)
        pair = np.hstack([node.left.hActs1, node.right.hActs1])
        self.dW += np.outer(node_err, pair)
        self.db += node_err

        # Tensor slices: each dV[k] gets the child outer product scaled by
        # that unit's error; tensor_down collects the tensor's share of the
        # error sent to the children.
        pair_outer = np.outer(pair, pair)
        tensor_down = np.zeros(len(pair))
        for k in range(len(self.V)):
            self.dV[k] += node_err[k] * pair_outer
            tensor_down += (self.V[k] + self.V[k].T).dot(pair) * node_err[k]

        # The first wvecDim entries go to the left child, the rest to the right.
        down = node_err.dot(self.W) + tensor_down
        cut = self.wvecDim
        self.backProp(node.left, down[:cut])
        self.backProp(node.right, down[cut:])
Exemple #21
0
    def backProp(self, node, error=None):
        """Recursively back-propagate through the subtree (tensor variant).

        Updates self.dWs, self.dbs, self.dW, self.db, every slice of
        self.dV and self.dL[node.word].

        node  -- current node in the parse tree
        error -- error passed down from the parent, or None at the root
        """
        # Clear the forward-pass marker.
        node.fprop = False

        # Softmax layer.
        out_delta = node.probs - make_onehot(node.label, len(self.bs))
        self.dWs += np.outer(out_delta, node.hActs1)
        self.dbs += out_delta

        # Error at this node's activation, plus whatever the parent sent.
        act_delta = out_delta.dot(self.Ws)
        if error is not None:
            act_delta += error

        if node.isLeaf == True:
            self.dL[node.word] += act_delta
            return

        act_delta = act_delta * self.df(node.hActs1)
        children = np.hstack([node.left.hActs1, node.right.hActs1])
        self.dW += np.outer(act_delta, children)
        self.db += act_delta

        # Tensor slices: each dV[k] gets the child outer product scaled by
        # that unit's error; from_tensor collects the tensor's share of the
        # error sent to the children.
        children_outer = np.outer(children, children)
        from_tensor = np.zeros(len(children))
        for k in range(len(self.V)):
            self.dV[k] += act_delta[k] * children_outer
            from_tensor += (self.V[k] + self.V[k].T).dot(children) * act_delta[k]

        # The first wvecDim entries go to the left child, the rest to the right.
        child_delta = act_delta.dot(self.W) + from_tensor
        split = self.wvecDim
        self.backProp(node.left, child_delta[:split])
        self.backProp(node.right, child_delta[split:])
Exemple #22
0
    def _acc_grads(self, window, label):
        """
        Accumulate gradients, given a training point
        (window, label) of the format

        window = [x_{i-1} x_{i} x_{i+1}] # three ints
        label = {0,1,2,3,4} # single int, gives class

        Updates self.grads (U, b2, W, b1; the weight gradients include the
        L2 term self.lreg * weights) and self.sgrads.L for every word
        index in the window, in order for gradient_check and training
        to work.

        So, for example:
        self.grads.U += (your gradient dJ/dU)
        self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
        """
        # Forward pass: concatenate the window's word vectors into one input.
        xf = []
        for idx in window:
            xf.extend(self.sparams.L[idx])  # extract representation
        tanhX = tanh(self.params.W.dot(xf) + self.params.b1)
        softmaxP = softmax(self.params.U.dot(tanhX) + self.params.b2)

        # Output layer.
        y = make_onehot(label, len(softmaxP))
        delta2 = softmaxP - y
        self.grads.U += outer(delta2, tanhX) + self.lreg * self.params.U
        self.grads.b2 += delta2

        # Hidden layer; 1 - tanh^2 is the tanh derivative.
        delta1 = self.params.U.T.dot(delta2)*(1. - tanhX*tanhX)
        self.grads.W += outer(delta1, xf) + self.lreg * self.params.W
        self.grads.b1 += delta1

        # Word vectors: the input gradient W^T . delta1 splits into one
        # equal-length slice per window position. This step was missing, so
        # the embeddings never received a gradient despite the contract above.
        dxf = self.params.W.T.dot(delta1).reshape(len(window), -1)
        for pos, idx in enumerate(window):
            self.sgrads.L[idx] = dxf[pos]
Exemple #23
0
    def _acc_grads(self, x, label):
        """
        Accumulate gradients from a training example.

        Network: x -> tanh(W, b1) -> tanh(U, b2) -> softmax(G, b3).
        Each weight gradient includes the L2 term self.lreg * weights.
        """
        P = self.params
        G = self.grads

        # Forward propagation
        pre1 = P.W.dot(x) + P.b1
        act1 = tanh(pre1)
        pre2 = np.dot(P.U, act1) + P.b2
        act2 = tanh(pre2)
        probs = softmax(np.dot(P.G, act2) + P.b3)

        # Output layer: prediction minus one-hot target.
        err3 = probs - make_onehot(label, self.outputsize)
        G.b3 += err3
        G.G += np.outer(err3, act2) + self.lreg * P.G

        # Second hidden layer.
        err2 = np.dot(P.G.T, err3) * tanhd(pre2)
        G.b2 += err2
        G.U += np.outer(err2, act1) + self.lreg * P.U

        # First hidden layer.
        err1 = np.dot(P.U.T, err2) * tanhd(pre1)
        G.W += np.outer(err1, x) + self.lreg * P.W
        G.b1 += err1
Exemple #24
0
    def _acc_grads(self, window, label):
        """
        Accumulate gradients, given a training point
        (window, label) of the format

        window = [x_{i-1} x_{i} x_{i+1}] # three ints
        label = {0,1,2,3,4} # single int, gives class

        Updates self.grads (U, b2, W, b1; the weight gradients include the
        L2 term self.lreg * weights) and self.sgrads.L for every window
        position, in order for gradient_check and training to work.

        So, for example:
        self.grads.U += (your gradient dJ/dU)
        self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
        """
        ##
        # Forward propagation
        x = hstack(self.sparams.L[window, :])
        # NOTE(review): the argument is doubled here, but the backward pass
        # below uses (1 - h**2) without a matching factor of 2. That is only
        # consistent if the tanh helper in scope is not the plain hyperbolic
        # tangent -- confirm against its definition.
        h = tanh(2*(self.params.W.dot(x)+self.params.b1))
        p = softmax(self.params.U.dot(h)+self.params.b2)
        ##
        # The class count follows the softmax output instead of a
        # hard-coded 5.
        y = make_onehot(label, len(p))
        delta = p - y
        # Backpropagation
        self.grads.U += outer(delta, h) + self.lreg * self.params.U
        self.grads.b2 += delta
        gradh = dot(self.params.U.T, delta) * (1-h**2)
        self.grads.W += outer(gradh, x) + self.lreg * self.params.W
        self.grads.b1 += gradh

        # Input gradient, one row per window position.
        dL = self.params.W.T.dot(gradh).reshape(self.window_size, self.word_vec_size)
        for i in xrange(self.window_size):
            self.sgrads.L[window[i], :] = dL[i]
Exemple #25
0
    def _acc_grads(self, xs, ys):
        """
        Accumulate gradients, given a pair of training sequences:
        xs = [<indices>] # input words
        ys = [<indices>] # output words (to predict)

        The method runs forward in time through the whole input sequence,
        storing the hidden states h(t) and the predictions y_hat(t), with
        the hidden layer at time 0 initialised to zeros. Then, for *each*
        output word in ys, it takes the cross-entropy gradient for that
        word and back-propagates it through time for at most
        self.bptt + 1 timesteps, adding to self.grads.U and self.grads.H
        and assigning the visited rows of self.sgrads.L.
        """
        ns = len(xs)
        # One extra row: hs[-1] is never written and stays zero, so it
        # serves as the initial hidden state.
        hs = zeros((ns + 1, self.hdim))
        # Predicted probabilities, one row per timestep.
        ps = zeros((ns, self.vdim))

        H = self.params.H
        U = self.params.U

        ##
        # Forward propagation
        for t in xrange(ns):
            hs[t] = sigmoid(dot(H, hs[t - 1]) + self.sparams.L[xs[t]])
            ps[t] = softmax(dot(U, hs[t]))

        ##
        # Backward propagation through time
        for j in xrange(ns):
            out_err = ps[j] - make_onehot(ys[j], self.vdim)
            self.grads.U += outer(out_err, hs[j])
            delta = dot(U.T, out_err) * hs[j] * (1.0 - hs[j])

            # Walk back from j over at most self.bptt + 1 timesteps.
            for t in xrange(j, j - self.bptt - 1, -1):
                if t < 0:  # ran past the start of the sequence
                    break
                prev = hs[t - 1]  # hs[-1] (zeros) when t == 0
                self.grads.H += outer(delta, prev)
                self.sgrads.L[xs[t]] = delta
                delta = dot(H.T, delta) * prev * (1.0 - prev)
Exemple #26
0
    def predict(self, node, correct=None, guess=None):
        """
        Test-time forward pass over the subtree rooted at `node`.

        Mirrors forwardProp, but scales activations by self.dropoutP
        instead of applying the masks self.mask1 / self.mask. Sets
        node.fprop, node.hActs1, node.hActs2, node.idx and node.probs
        on the nodes it processes.

        node: current node in the parse tree
        correct: running list of true labels (appended to in place)
        guess: running list of predicted labels (appended to in place)

        Returns (cost, total): the summed cross-entropy cost and the
        number of nodes visited in the subtree.
        """
        # Build fresh lists per call; a mutable default would be shared
        # by every call that omits the argument.
        if correct is None:
            correct = []
        if guess is None:
            guess = []

        cost = total = 0.0
        if node.isLeaf:
            node.fprop = True
            node.hActs1 = self.L[:, node.word]

            scaled = node.hActs1 * self.dropoutP
            pre_max = np.zeros((self.maxoutK, self.middleDim))
            for i in range(self.maxoutK):
                pre_max[i] = self.W2[i].dot(scaled) + self.b2[i]
            (node.hActs2, node.idx) = self.maxout(pre_max)

            # Classify from the maxout output (hActs2), as the internal-node
            # branch and forwardProp do; the original used hActs1 here.
            node.probs = softmax((self.Ws * self.dropoutP).dot(node.hActs2) + self.bs)
            p = node.probs * make_onehot(node.label, len(self.bs))
            cost = -np.log(np.sum(p))
            correct.append(node.label)
            guess.append(np.argmax(node.probs))
            return cost, 1

        # Recurse with predict, not forwardProp, so children also use the
        # test-time scaling rather than the training masks.
        c1, t1 = self.predict(node.left, correct, guess)
        c2, t2 = self.predict(node.right, correct, guess)
        if node.left.fprop and node.right.fprop:
            node.fprop = True
            h = np.hstack([node.left.hActs1, node.right.hActs1])
            node.hActs1 = self.ReLU(self.W1.dot(h) + self.b1)

            scaled = node.hActs1 * self.dropoutP
            pre_max = np.zeros((self.maxoutK, self.middleDim))
            for i in range(self.maxoutK):
                pre_max[i] = self.W2[i].dot(scaled) + self.b2[i]
            (node.hActs2, node.idx) = self.maxout(pre_max)

            node.probs = softmax((self.Ws * self.dropoutP).dot(node.hActs2) + self.bs)
            p = node.probs * make_onehot(node.label, len(self.bs))
            cost = -np.log(np.sum(p))
            correct.append(node.label)
            guess.append(np.argmax(node.probs))

        cost += c1
        cost += c2
        total += t1
        total += t2
        return cost, total + 1
    def forwardProp(self, node, correct=None, guess=None):
        """
        Training-time forward pass over the subtree rooted at `node`.

        Leaves take their hActs1 from a column of self.L; internal nodes
        combine their children's hActs1 through W1/b1 and ReLU. In both
        cases hActs1 is masked by self.mask1, passed through the maxout
        layer (W2/b2) to give hActs2 and idx, and hActs2 masked by
        self.mask feeds the softmax classifier (Ws/bs).

        node: current node in the parse tree
        correct: running list of true labels (appended to in place)
        guess: running list of predicted labels (appended to in place)

        Returns (cost, total): the summed cross-entropy cost and the
        number of nodes visited in the subtree.
        """
        # Build fresh lists per call; a mutable default would be shared
        # by every call that omits the argument.
        if correct is None:
            correct = []
        if guess is None:
            guess = []

        cost = total = 0.0
        if node.isLeaf:
            node.fprop = True
            node.hActs1 = self.L[:, node.word]

            masked = node.hActs1 * self.mask1
            pre_max = np.zeros((self.maxoutK, self.middleDim))
            for i in range(self.maxoutK):
                pre_max[i] = self.W2[i].dot(masked) + self.b2[i]
            (node.hActs2, node.idx) = self.maxout(pre_max)

            node.probs = softmax(self.Ws.dot(node.hActs2 * self.mask) + self.bs)
            p = node.probs * make_onehot(node.label, len(self.bs))
            cost = -np.log(np.sum(p))
            correct.append(node.label)
            guess.append(np.argmax(node.probs))
            return cost, 1

        c1, t1 = self.forwardProp(node.left, correct, guess)
        c2, t2 = self.forwardProp(node.right, correct, guess)
        # This node is only evaluated once both children have been.
        if node.left.fprop and node.right.fprop:
            node.fprop = True
            h = np.hstack([node.left.hActs1, node.right.hActs1])
            node.hActs1 = self.ReLU(self.W1.dot(h) + self.b1)

            masked = node.hActs1 * self.mask1
            pre_max = np.zeros((self.maxoutK, self.middleDim))
            for i in range(self.maxoutK):
                pre_max[i] = self.W2[i].dot(masked) + self.b2[i]
            (node.hActs2, node.idx) = self.maxout(pre_max)

            node.probs = softmax(self.Ws.dot(node.hActs2 * self.mask) + self.bs)
            p = node.probs * make_onehot(node.label, len(self.bs))
            cost = -np.log(np.sum(p))
            correct.append(node.label)
            guess.append(np.argmax(node.probs))

        cost += c1
        cost += c2
        total += t1
        total += t2
        return cost, total + 1
Exemple #28
0
    def _acc_grads(self, window, label):
        """
        Accumulate gradients, given a training point
        (window, label) of the format

        window = [x_{i-1} x_{i} x_{i+1}] # word indices
        label = {0,1,2,3,4} # single int, gives class

        The window length is taken from `window` itself rather than
        being fixed at three, so any window whose concatenated word
        vectors match the width of self.params.W is accepted.

        Updates self.grads (U, b2, W, b1, with L2 terms on U and W) and
        records one update per window position in self.sgrads.L.
        """
        vec_len = self.sparams.L.shape[1]
        n_words = len(window)

        ##
        # Forward propagation
        words = [self.sparams.L[w] for w in window]
        x = reshape(words, n_words * vec_len)  # concatenated word vectors
        z2 = self.params.W.dot(x) + self.params.b1
        a2 = tanh(z2)
        z3 = self.params.U.dot(a2) + self.params.b2
        a3 = softmax(z3)

        ##
        # Backpropagation
        y = make_onehot(label, len(a3))
        delta3 = a3 - y
        dJdU = outer(delta3, a2)
        dJdb2 = delta3
        # (1 - a2^2) is the derivative of tanh expressed through its output.
        delta2 = multiply((1 - square(a2)), self.params.U.T.dot(delta3))
        dJdW = outer(delta2, x)
        dJdb1 = delta2

        # Regularization
        regdJdW = self.lreg * self.params.W
        regdJdU = self.lreg * self.params.U

        self.grads.U += (dJdU + regdJdU)
        self.grads.b2 += dJdb2
        self.grads.W += (dJdW + regdJdW)
        self.grads.b1 += dJdb1

        # Split the input gradient back into one row per window position.
        dJdL = self.params.W.T.dot(delta2)
        dJdL_shaped = reshape(dJdL, (n_words, vec_len))
        for pos, word in enumerate(window):
            self.sgrads.L[word] = dJdL_shaped[pos]
Exemple #29
0
    def _acc_grads(self, xs, ys):
        """
        Accumulate gradients for one training sequence with truncated
        backpropagation through time.

        xs: list of input word indices
        ys: list of target word indices, one per input position

        Forward pass: h(t) = sigmoid(H h(t-1) + L[xs[t]]) and
        p(t) = softmax(U h(t)), with a zero initial hidden state.
        Backward pass: for every position the cross-entropy error is
        propagated through the current step and up to self.bptt
        earlier ones.

        Adds to self.grads.U and self.grads.H and writes row gradients
        to self.sgrads.L.
        """
        # Expect xs as list of indices
        ns = len(xs)

        # hs[-1] is never written, so it stays zero and serves as the
        # initial hidden state.
        hs = zeros((ns + 1, self.hdim))
        # predicted probas
        ps = zeros((ns, self.vdim))

        ##
        # Forward propagation
        # H is applied from the left so that the H.T used in the backward
        # pass is the matching Jacobian (the original applied H from the right).
        for t in range(ns):
            hs[t] = sigmoid(dot(self.params.H, hs[t - 1]) + self.sparams.L[xs[t]])
            ps[t] = softmax(dot(self.params.U, hs[t]))

        ##
        # Backward propagation through time
        for t in reversed(range(ns)):
            err_out = ps[t] - make_onehot(ys[t], self.vdim)
            self.grads.U += outer(err_out, hs[t])
            delta = dot(self.params.U.T, err_out) * hs[t] * (1 - hs[t])
            for step in reversed(range(max(0, t - self.bptt), t + 1)):
                # dJ/dH is an outer product; the original used dot(), which
                # collapses to a single number broadcast over all of H.
                self.grads.H += outer(delta, hs[step - 1])
                self.sgrads.L[xs[step]] = delta
                delta = dot(self.params.H.T, delta) * hs[step - 1] * (1 - hs[step - 1])
    def _acc_grads(self, window, label):
        """
        Accumulate gradients, given a training point
        (window, label) of the format

        window = [x_{i-1} x_{i} x_{i+1}] # three ints
        label = {0,1,2,3,4} # single int, gives class

        Adds the cross-entropy gradients (plus L2 terms on U and W) to
        self.grads for U, b2, W and b1, and records one update per
        window position in self.sgrads.L.
        """
        L = self.sparams.L
        U = self.params.U
        W = self.params.W
        b1 = self.params.b1
        b2 = self.params.b2
        windowSize = self.windowSize
        wordVecLen = self.wordVecLen
        lambda_ = self.lreg

        ##
        # Forward propagation
        x = hstack(L[window, :])  # concatenated word vectors of the window
        z1 = W.dot(x) + b1
        h = tanh(z1)
        z2 = U.dot(h) + b2
        y_hat = softmax(z2)

        ##
        # Backpropagation
        target = make_onehot(label, len(y_hat))
        delta = y_hat - target

        # outer() builds the full gradient matrix from the two 1-D vectors.
        self.grads.U += outer(delta, h) + lambda_ * U
        self.grads.b2 += delta

        # (1 - h^2) is the derivative of tanh expressed through its output.
        grad_h = U.T.dot(delta) * (1 - h ** 2)
        self.grads.W += outer(grad_h, x) + lambda_ * W
        self.grads.b1 += grad_h

        # Split the input gradient into one row per window position.
        sgrad_L = W.T.dot(grad_h)
        sgrad_L = sgrad_L.reshape(windowSize, wordVecLen)

        for i in xrange(windowSize):
            self.sgrads.L[window[i], :] = sgrad_L[i, :]
Exemple #31
0
    def forwardProp(self, node, correct=None, guess=None):
        """
        Recursive forward pass over the subtree rooted at `node`.

        Updates node.probs, node.hActs1 and node.fprop on the nodes it
        processes and accumulates the cross-entropy cost.

        node: current node in the parse tree
        correct: running list of truth labels (appended to in place)
        guess: running list of guesses the model makes (appended to in
            place); together with `correct` it is meant for building
            the confusion matrix

        Returns (cost, total): the running cost and the number of nodes
        visited, used for accuracy reporting.
        """
        # Build fresh lists per call; a mutable default would be shared
        # by every call that omits the argument.
        if correct is None:
            correct = []
        if guess is None:
            guess = []

        cost = total = 0.0

        if node.isLeaf:
            node.fprop = True
            node.hActs1 = self.L[:, node.word]
            node.probs = softmax(self.Ws.dot(node.hActs1) + self.bs)
            p = node.probs * make_onehot(node.label, len(self.bs))
            cost = -np.log(np.sum(p))
            correct.append(node.label)
            guess.append(np.argmax(node.probs))
            return cost, 1

        c1, t1 = self.forwardProp(node.left, correct, guess)
        c2, t2 = self.forwardProp(node.right, correct, guess)
        # This node is only evaluated once both children have been.
        if node.left.fprop and node.right.fprop:
            node.fprop = True
            h = np.hstack([node.left.hActs1, node.right.hActs1])
            node.hActs1 = self.ReLU(self.W.dot(h) + self.b)
            node.probs = softmax(self.Ws.dot(node.hActs1) + self.bs)
            p = node.probs * make_onehot(node.label, len(self.bs))
            cost = -np.log(np.sum(p))
            correct.append(node.label)
            guess.append(np.argmax(node.probs))

        cost += c1
        cost += c2
        total += t1
        total += t2
        return cost, total + 1
Exemple #32
0
    def forwardProp(self, node, correct=None, guess=None):
        """
        Run the recursive forward pass on `node` and its descendants.

        A leaf takes its hidden activation from a column of self.L; an
        internal node applies W/b and ReLU to its children's stacked
        activations. Each processed node gets node.hActs1, node.probs
        and node.fprop set, and its cross-entropy cost is added up.

        node: current node in the parse tree
        correct: running list of truth labels (appended to in place)
        guess: running list of predicted labels (appended to in place)

        Returns (cost, total): accumulated cost and the number of nodes
        visited.
        """
        # None stands in for "no list supplied": a literal [] default is
        # created once and would keep growing across calls.
        correct = [] if correct is None else correct
        guess = [] if guess is None else guess

        cost = total = 0.0

        if node.isLeaf:
            node.fprop = True
            node.hActs1 = self.L[:, node.word]
            node.probs = softmax(self.Ws.dot(node.hActs1) + self.bs)
            p = node.probs * make_onehot(node.label, len(self.bs))
            cost = -np.log(np.sum(p))
            correct.append(node.label)
            guess.append(np.argmax(node.probs))
            return cost, 1

        c1, t1 = self.forwardProp(node.left, correct, guess)
        c2, t2 = self.forwardProp(node.right, correct, guess)
        if node.left.fprop and node.right.fprop:
            node.fprop = True
            h = np.hstack([node.left.hActs1, node.right.hActs1])
            node.hActs1 = self.ReLU(self.W.dot(h) + self.b)
            node.probs = softmax(self.Ws.dot(node.hActs1) + self.bs)
            p = node.probs * make_onehot(node.label, len(self.bs))
            cost = -np.log(np.sum(p))
            correct.append(node.label)
            guess.append(np.argmax(node.probs))

        cost += c1
        cost += c2
        total += t1
        total += t2
        return cost, total + 1
Exemple #33
0
    def _acc_grads(self, window, label):
        """
        Accumulate gradients for a single (window, label) training point.

        window = [x_{i-1} x_{i} x_{i+1}] # three ints
        label = {0,1,2,3,4} # single int, gives class

        Dense gradients (with L2 terms on U and W) are added to
        self.grads; the gradient for each window position is stored in
        self.sgrads.L under that position's word index.
        """
        W = self.params.W
        U = self.params.U
        hidden_dim = U.shape[1]

        ##
        # Forward propagation
        x = hstack(self.sparams.L[window])
        a = dot(W, x) + self.params.b1
        h = tanh(a)
        y_hat = softmax(dot(U, h) + self.params.b2)
        delta = y_hat - make_onehot(label, len(y_hat))

        ##
        # Backpropagation
        self.grads.b2 += delta
        self.grads.U += outer(delta, h) + self.lreg * U

        # Error at the hidden pre-activation.
        grad_a = dot(U.T, delta) * d_tanh(a)
        self.grads.W += outer(grad_a, x) + self.lreg * W
        self.grads.b1 += grad_a

        # Input gradient as a column, then cut into one slice per position.
        grad_x = dot(W.T, grad_a.reshape((hidden_dim, 1)))
        for pt in xrange(self.windowsize):
            start = pt * self.word_vec_size
            stop = (pt + 1) * self.word_vec_size
            self.sgrads.L[window[pt]] = grad_x[start:stop].reshape((self.word_vec_size, ))
Exemple #34
0
    def _acc_grads(self, window, label):
        """
        Accumulate gradients, given a training point
        (window, label) of the format

        window = [x_{i-1} x_{i} x_{i+1}] # word indices
        label = {0,1,2,3,4} # single int, gives class

        The number of window positions is read from `window` instead of
        being hard-coded to three; the concatenated word vectors must
        still match the width of self.params.W.

        Updates self.grads (U, b2, W, b1, with L2 terms on U and W) and
        stores one row gradient per window position in self.sgrads.L.
        """
        n_positions = len(window)
        vec_len = self.sparams.L.shape[1]

        ##
        # Forward propagation
        x = reshape([self.sparams.L[idx] for idx in window],
                    n_positions * vec_len)
        z2 = self.params.W.dot(x) + self.params.b1
        a2 = tanh(z2)
        z3 = self.params.U.dot(a2) + self.params.b2
        a3 = softmax(z3)

        ##
        # Backpropagation
        delta3 = a3 - make_onehot(label, len(a3))
        # (1 - a2^2) is the derivative of tanh expressed through its output.
        delta2 = multiply((1 - square(a2)), self.params.U.T.dot(delta3))

        self.grads.U += (outer(delta3, a2) + self.lreg * self.params.U)
        self.grads.b2 += delta3
        self.grads.W += (outer(delta2, x) + self.lreg * self.params.W)
        self.grads.b1 += delta2

        # One row of the reshaped input gradient per window position.
        dJdL = reshape(self.params.W.T.dot(delta2), (n_positions, vec_len))
        for pos, idx in enumerate(window):
            self.sgrads.L[idx] = dJdL[pos]
Exemple #35
0
    def _acc_grads(self, window, label):
        """
        Add the gradients of one (window, label) example to the
        accumulators.

        window = [x_{i-1} x_{i} x_{i+1}] # three ints
        label = {0,1,2,3,4} # single int, gives class

        self.grads receives the dense gradients for b2, U, W and b1
        (U and W include the self.lreg weight-decay term);
        self.sgrads.L receives one entry per window position.
        """
        params = self.params
        n_hidden = params.U.shape[1]
        vec = self.word_vec_size

        ##
        # Forward propagation
        x = hstack(self.sparams.L[window])
        a = dot(params.W, x) + params.b1
        h = tanh(a)
        y_hat = softmax(dot(params.U, h) + params.b2)

        # Output error: prediction minus one-hot target.
        delta = y_hat - make_onehot(label, len(y_hat))

        ##
        # Backpropagation
        self.grads.b2 += delta
        self.grads.U += outer(delta, h) + self.lreg * params.U

        x1 = dot(params.U.T, delta) * d_tanh(a)
        self.grads.W += outer(x1, x) + self.lreg * params.W
        self.grads.b1 += x1

        # Column vector of input gradients, sliced per window position.
        dL_updates = dot(params.W.T, x1.reshape((n_hidden, 1)))
        for pt in xrange(self.windowsize):
            piece = dL_updates[pt * vec:(pt + 1) * vec]
            self.sgrads.L[window[pt]] = piece.reshape((vec,))
Exemple #36
0
    def _acc_grads(self, xs, ys, d):
        """
        Accumulate gradients for one sequence that carries an extra
        vector self.sparams.D[d].

        xs: list of input word indices
        ys: list of target word indices, one per input position
        d: index into self.sparams.D; that row is added to every hidden
           pre-activation and, through self.params.G, to every output
           score

        Adds to self.grads (U, G, H), writes row gradients to
        self.sgrads.L, and stores the summed gradient for the D row in
        self.sgrads.D[d].
        """
        n_steps = len(xs)

        # hs[-1] is never written, so it stays zero and serves as the
        # initial hidden state.
        hs = zeros((n_steps + 1, self.hdim))
        # Predicted distributions.
        ps = zeros((n_steps, self.vdim))
        # Hidden pre-activations.
        zs = zeros((n_steps + 1, self.hdim))

        ##
        # Forward propagation
        d_vec = self.sparams.D[d]
        for t, word in enumerate(xs):
            zs[t] = self.params.H.dot(hs[t-1]) + self.sparams.L[word] + d_vec
            hs[t] = sigmoid(zs[t])
            ps[t] = softmax(self.params.U.dot(hs[t]) + self.params.G.dot(d_vec.T).reshape(self.vdim,))

        ##
        # Backward propagation through time
        d_grad = zeros_like(self.sparams.D[0])
        for t in reversed(xrange(n_steps)):
            delta = zeros((n_steps, self.hdim))
            out_err = ps[t] - make_onehot(ys[t], len(ps[t]))
            self.grads.U += outer(out_err, hs[t])
            self.grads.G += outer(out_err, d_vec)
            d_grad += self.params.G.T.dot(out_err)

            act = sigmoid(zs[t])
            delta[t] = (act * (1. - act)) * self.params.U.T.dot(out_err)
            self.sgrads.L[xs[t]] = delta[t].copy()
            d_grad += delta[t]
            self.grads.H += outer(delta[t], hs[t-1])

            # Visit steps t-1 down to t-self.bptt+1, stopping at step 0.
            for j in xrange(t - 1, max(t - self.bptt, -1), -1):
                act = sigmoid(zs[j])
                delta[j] = (act * (1. - act)) * self.params.H.T.dot(delta[j+1])
                self.sgrads.L[xs[j]] = delta[j].copy()
                d_grad += delta[j]
                self.grads.H += outer(delta[j], hs[j-1])

        self.sgrads.D[d] = d_grad.copy()
Exemple #37
0
    def _acc_grads(self, window, label):
        """
        Accumulate gradients for one (window, label) training point.

        window = [x_{i-1} x_{i} x_{i+1}] # three ints
        label = {0,1,2,3,4} # single int, gives class

        Dense gradients go to self.grads (U and W include the self.lreg
        term); one row gradient per window position goes to
        self.sgrads.L.
        """
        W, U = self.params.W, self.params.U

        ##
        # Forward propagation
        inputs = hstack(self.sparams.L[window, :])
        pre_hidden = W.dot(inputs) + self.params.b1
        hidden = tanh(pre_hidden)
        y_hat = softmax(U.dot(hidden) + self.params.b2)

        ##
        # Backpropagation
        out_err = y_hat - make_onehot(label, len(y_hat))

        # Output layer: dJ/dU, dJ/db2
        self.grads.U += outer(out_err, hidden) + self.lreg * U
        self.grads.b2 += out_err

        # Hidden layer: dJ/dW, dJ/db1
        hid_err = U.T.dot(out_err) * d_tanh(pre_hidden)
        self.grads.W += outer(hid_err, inputs) + self.lreg * W
        self.grads.b1 += hid_err

        # dJ/dL: one row of the reshaped input gradient per position
        rows = W.T.dot(hid_err).reshape(self.windowsize, -1)
        for idx, row in enumerate(rows):
            self.sgrads.L[window[idx], :] = row
Exemple #38
0
    def _acc_grads(self, window, label):
        """
        Accumulate gradients, given a training point
        (window, label) of the format

        window = [x_{i-1} x_{i} x_{i+1}] # three ints
        label = {0,1,2,3,4} # single int, gives class

        Adds the cross-entropy gradients (plus L2 terms on U and W) to
        self.grads for U, b2, W and b1, and records one update per
        window position in self.sgrads.L.
        """
        lam = self.lreg
        W = self.params.W
        b1 = self.params.b1
        U = self.params.U
        b2 = self.params.b2

        ##
        # Forward propagation
        x_ = hstack(self.sparams.L[window, :])  # concatenated window vectors
        z1 = W.dot(x_) + b1
        h = tanh(z1)
        z2 = U.dot(h) + b2
        y = softmax(z2)

        ##
        # Backpropagation
        target = make_onehot(label, len(y))
        dscore = y - target
        self.grads.U += outer(dscore, h) + lam * U
        self.grads.b2 += dscore

        # (1 - h^2) is the derivative of tanh expressed through its output.
        dhidden = U.T.dot(dscore) * (1 - h ** 2)

        self.grads.W += outer(dhidden, x_) + lam * W
        self.grads.b1 += dhidden

        # Split the input gradient into one row per window position.
        dx_ = dot(W.T, dhidden)
        dx_ = dx_.reshape(self.windowSize, self.wordVecLen)
        for i in xrange(self.windowSize):
            self.sgrads.L[window[i], :] = dx_[i, :]
Exemple #39
0
    def _acc_grads(self, window, label):
        """
        Fold one (window, label) example into the gradient accumulators.

        window = [x_{i-1} x_{i} x_{i+1}] # three ints
        label = {0,1,2,3,4} # single int, gives class

        self.grads is updated for U, b2, W and b1 (U and W with the
        self.lreg term); self.sgrads.L gets one row gradient per window
        position.
        """
        p = self.params

        ##
        # Forward propagation
        a1 = hstack(self.sparams.L[window, :])
        z2 = p.W.dot(a1) + p.b1
        a2 = tanh(z2)
        probs = softmax(p.U.dot(a2) + p.b2)

        # Error signals of the output and hidden layers.
        delta3 = probs - make_onehot(label, len(probs))
        delta2 = p.U.T.dot(delta3) * d_tanh(z2)

        ##
        # Backpropagation
        self.grads.U += outer(delta3, a2) + self.lreg * p.U
        self.grads.b2 += delta3
        self.grads.W += outer(delta2, a1) + self.lreg * p.W
        self.grads.b1 += delta2

        # Sparse update: row idx of the reshaped gradient belongs to window[idx].
        per_word = p.W.T.dot(delta2).reshape(self.windowsize, -1)
        for idx, grad_row in enumerate(per_word):
            self.sgrads.L[window[idx], :] = grad_row
Exemple #40
0
    def _acc_grads(self, x, label):
        """
        Accumulate the gradients of a single training example.

        x: input vector fed to the linear layer W, b
        label: index of the true class

        Adds the cross-entropy gradient (plus self.lreg * W for the
        weights) to self.grads.W and self.grads.b.
        """
        ##
        # Forward propagation
        scores = self.params.W.dot(x) + self.params.b
        probs = softmax(scores)

        ##
        # Gradient w.r.t. the scores: prediction minus one-hot target
        err = probs - make_onehot(label, len(probs))

        self.grads.W += np.outer(err, x) + self.lreg * self.params.W
        self.grads.b += err
    def generate_sequence(self, init, end, maxlen=100):
        """
        Generate a sequence from the language model by running the RNN
        forward and, at each timestep, sampling a word from the emitted
        probability distribution.

        Arguments:
            init = index of start word (word_to_num['<s>'])
            end = index of end word (word_to_num['</s>'])
            maxlen = maximum length to generate

        Returns:
            ys = sequence of indices, starting with init
            J = total cross-entropy loss of the generated words,
                including the final one, plus the L2 penalty over H, W
                and the value returned by hierarchicalU.getSumSquareU
        """
        J = 0  # total loss
        ys = [init]  # emitted sequence

        # Only the previous hidden state is needed, so keep one vector.
        h = np.zeros(self.hdim)
        for _ in range(maxlen):
            h = sigmoid(self.params.H.dot(h) + self.params.W.dot(self.sparams.L[ys[-1]]))
            p = self.hierarchicalU.getDistribution(h)
            y = multinomial_sample(p)
            ys.append(y)
            # Add the loss before the end-token test so the last emitted
            # word counts too (the original broke out first and skipped it).
            J += -np.log(np.sum(p * make_onehot(y, self.vdim)))
            if y == end:
                break

        # NOTE(review): the original comment suggests getSumSquareU only
        # covers nodes whose gradient was updated - confirm.
        x = self.hierarchicalU.getSumSquareU(self.hierarchicalU.root)
        Jreg = 0.5 * self.lreg * (np.sum(self.params.H**2) + np.sum(self.params.W**2) + x)
        return ys, J + Jreg
Exemple #42
0
    def _acc_grads(self, window, label):
        """
        Accumulate gradients, given a training point
        (window, label) of the format

        window = [x_{i-1} x_{i} x_{i+1}] # three ints
        label = {0,1,2,3,4} # single int, gives class

        Adds the cross-entropy gradients (plus L2 terms on U and W) to
        self.grads for U, b2, W and b1, and records one update per
        window position in self.sgrads.L.
        """
        ##
        # Forward propagation
        x = hstack([self.sparams.L[idx] for idx in window])  # concatenated word vectors
        a = self.params.W.dot(x) + self.params.b1
        h = tanh(a)
        p = softmax(self.params.U.dot(h) + self.params.b2)

        # Gradient w.r.t. the scores: prediction minus one-hot target
        y = make_onehot(label, len(p))
        delta = p - y

        ##
        # Backpropagation
        # dJ/dh
        dh = self.params.U.T.dot(delta)
        # dJ/da: reuse h instead of recomputing tanh(a)
        da = dh * (1 - h ** 2)

        # dJ/dU, dJ/db2
        self.grads.U += outer(delta, h) + self.lreg * self.params.U
        self.grads.b2 += delta

        # dJ/dW, dJ/db1
        self.grads.W += outer(da, x) + self.lreg * self.params.W
        self.grads.b1 += da

        # dJ/dL, sparse update: one row of the reshaped input gradient
        # per window position
        dL = self.params.W.T.dot(da).reshape(self.window_size, self.word_vec_size)
        for i in xrange(self.window_size):
            self.sgrads.L[window[i], :] = dL[i]
Exemple #43
0
    def _acc_grads(self, window, label):
        """
        Accumulate gradients, given a training point
        (window, label) of the format

        window = [x_{i-1} x_{i} x_{i+1}] # three ints
        label = {0,1,2,3,4} # single int, gives class

        Adds the cross-entropy gradients (plus L2 terms on W1 and W2) to
        self.grads for b3, W2, W1 and b2, and records one update per
        window position in self.sgrads.L.
        """
        ##
        # Forward propagation
        window_vecs = [self.sparams.L[w] for w in window]

        # append() onto an empty list flattens the word vectors into one array.
        x = append([], window_vecs)
        z2 = dot(self.params.W1, x)
        a2 = tanh(z2 + self.params.b2)
        z3 = dot(self.params.W2, a2)
        y_hat = softmax(z3 + self.params.b3)

        ##
        # Backpropagation
        d3 = y_hat - make_onehot(label, y_hat.shape[0])  # label is 'y'

        self.grads.b3 += d3
        self.grads.W2 += outer(d3, a2) + self.lreg * self.params.W2

        # tanh_derivative is handed the activation a2, not the pre-activation.
        d2 = multiply(tanh_derivative(a2), dot(self.params.W2.T, d3))

        self.grads.W1 += outer(d2, x) + self.lreg * self.params.W1
        self.grads.b2 += d2

        # Attribute access, consistent with every other parameter lookup here.
        x_grads = dot(self.params.W1.T, d2)

        # Slice i of width self.n belongs to the i-th word of the window.
        for i, word in enumerate(window):
            self.sgrads.L[word] = x_grads[i * self.n:(i + 1) * self.n]
Exemple #44
0
    def _acc_grads(self, window, label):
        """
        Accumulate gradients for one (window, label) training point.

        window = [x_{i-1} x_{i} x_{i+1}] # three ints
        label = {0,1,2,3,4} # single int, gives class

        `window` must expose .shape (its first dimension is read as the
        number of positions). Dense gradients go to self.grads (U and W
        with the self.lreg term); the input gradient, reshaped to one
        row per position, is assigned to self.sgrads.L[window] in a
        single indexed write.
        """
        W, U = self.params.W, self.params.U

        ##
        # Forward propagation
        x = reshape(self.sparams.L[window], (-1,))
        h = tanh(W.dot(x) + self.params.b1)
        p = softmax(U.dot(h) + self.params.b2)

        # Error at the output and at the hidden layer.
        out_err = p - make_onehot(label, len(p))
        hid_err = multiply(transpose(U).dot(out_err), 1 - h*h)

        ##
        # Backpropagation
        self.grads.b2 += out_err
        self.grads.U += outer(out_err, h) + self.lreg * U
        self.grads.b1 += hid_err
        self.grads.W += outer(hid_err, x) + self.lreg * W

        n_positions = window.shape[0]
        self.sgrads.L[window] = reshape(transpose(W).dot(hid_err), (n_positions, -1))
Exemple #45
0
    def _acc_grads(self, window, label):
        """
        Accumulate gradients for one (window, label) training point.

        window = [x_{i-1} x_{i} x_{i+1}] # three ints
        label = {0,1,2,3,4} # single int, gives class

        Dense gradients are added to self.grads (U and W include the
        self.lreg term); each window position's row gradient is stored
        in self.sgrads.L under that position's word index.
        """
        W, U = self.params.W, self.params.U

        # Forward propagation
        x = self.sparams.L[window, :].flatten()
        h = tanh(W.dot(x) + self.params.b1)
        yhat = softmax(U.dot(h) + self.params.b2)

        # Gradient w.r.t. the scores: prediction minus one-hot target
        out_err = yhat - make_onehot(label, len(yhat))

        # dJ/dU, dJ/db2
        self.grads.U += (outer(out_err, h) + self.lreg * U)
        self.grads.b2 += out_err

        # dJ/dW, dJ/db1
        hid_err = multiply((1 - square(h)), U.T.dot(out_err))
        self.grads.W += (outer(hid_err, x) + self.lreg * W)
        self.grads.b1 += hid_err

        # dJ/dL, sparse update: pair every word index with its gradient row
        rows = W.T.dot(hid_err).reshape(len(window), self.sparams.L.shape[1])
        for word, row in zip(window, rows):
            self.sgrads.L[word] = row
Exemple #46
0
    def _acc_grads(self, window, label):
        """
        Accumulate gradients for one training point (window, label).

        The word vectors named by `window` are concatenated, passed
        through a tanh hidden layer (W, b1) and a softmax output layer
        (U, b2).  The resulting gradients, plus the self.lreg
        regularization terms for W and U, are added to self.grads; the
        gradient for each word vector is stored in self.sgrads.L.

        window -- sequence of row indices into the word-vector matrix
        label  -- integer index of the correct class
        """
        ##
        # Forward propagation
        # NOTE(review): the vectors are read from self.params.L while the
        # update is written to self.sgrads.L -- confirm L is really
        # reachable through self.params and not only self.sparams.
        words = np.array([self.params.L[x] for x in window])
        x = np.reshape(words, -1)
        layer1 = np.tanh(self.params.W.dot(x) + self.params.b1)
        probs = softmax(self.params.U.dot(layer1) + self.params.b2)

        ##
        # Backpropagation
        y = make_onehot(label, len(probs))
        dx = probs - y
        dU = np.outer(dx, layer1)
        # tanh'(z) = 1 - tanh(z)^2 must use the hidden activation itself.
        # Using dU here (as before) produced a matrix-shaped delta and a
        # wrongly shaped dW.
        delta2 = (1 - np.square(layer1)) * self.params.U.T.dot(dx)
        dW = np.outer(delta2, x)
        # One gradient row per window position, same layout as `words`.
        dL = self.params.W.T.dot(delta2).reshape(words.shape)

        dW += self.lreg * self.params.W
        dU += self.lreg * self.params.U

        self.grads.U += dU
        self.grads.W += dW
        self.grads.b2 += dx
        self.grads.b1 += delta2

        for idx, row in zip(window, dL):
            self.sgrads.L[idx] = row
Exemple #47
0
    def _acc_grads(self, window, label):
        """
        Accumulate the gradients of a single (window, label) example.

        The rows of self.sparams.L selected by `window` are flattened
        into the network input, which goes through a tanh hidden layer
        (W, b1) and a softmax output layer (U, b2).  Gradients for W, b1,
        U and b2 (with the self.lreg regularization terms on W and U) are
        added to self.grads; the gradient for each selected word vector
        is stored in self.sgrads.L.

        window -- sequence of row indices into self.sparams.L
        label  -- integer index of the correct class
        """
        W, U = self.params.W, self.params.U
        vec_len = self.sparams.L.shape[1]
        n_words = len(window)

        # Forward propagation
        x = self.sparams.L[window, :].flatten()
        hidden = tanh(W.dot(x) + self.params.b1)
        yhat = softmax(U.dot(hidden) + self.params.b2)

        # Backpropagation
        # Error at the output layer: prediction minus one-hot target.
        err2 = yhat - make_onehot(label, len(yhat))
        # Error at the hidden layer; (1 - hidden^2) is tanh'.
        err1 = (1 - square(hidden)) * U.T.dot(err2)

        # dJ/dU, dJ/db2
        self.grads.U += outer(err2, hidden) + self.lreg * U
        self.grads.b2 += err2

        # dJ/dW, dJ/db1
        self.grads.W += outer(err1, x) + self.lreg * W
        self.grads.b1 += err1

        # dJ/dL, sparse update: one row per window position.
        rows = W.T.dot(err1).reshape(n_words, vec_len)
        for pos, word in enumerate(window):
            self.sgrads.L[word] = rows[pos]
Exemple #48
0
    def _acc_grads(self, window, label):
        """
        Accumulate gradients for one training point (window, label).

        The rows of self.sparams.L named by `window` are concatenated,
        passed through a tanh hidden layer (W, b1) and a softmax output
        layer (U, b2).  Gradients, including the self.lreg regularization
        terms for W and U, are added to self.grads; the gradient for each
        selected word vector is stored in self.sgrads.L.

        window -- sequence of row indices into self.sparams.L
        label  -- integer index of the correct class
        """
        ##
        # Forward propagation
        x = self.sparams.L[window]
        d = x.shape[1]  # length of one word vector
        x = x.reshape(-1)
        h = tanh(self.params.W.dot(x) + self.params.b1)
        y_hat = softmax(self.params.U.dot(h) + self.params.b2)

        ##
        # Backpropagation
        delta = y_hat - make_onehot(label, len(y_hat))

        # dJ/dU, dJ/db2
        self.grads.U += outer(delta, h) + self.lreg * self.params.U
        self.grads.b2 += delta

        # Hidden-layer error, computed once and reused below;
        # 1 - h**2 is the derivative of tanh at the pre-activation.
        dz = (1 - h ** 2) * delta.dot(self.params.U)

        # dJ/dW, dJ/db1
        self.grads.W += outer(dz, x) + self.lreg * self.params.W
        self.grads.b1 += dz

        # dJ/dL: slice the input gradient into one chunk per word.
        # enumerate() replaces xrange(), which is undefined on Python 3.
        dx = dz.dot(self.params.W)
        for n, word in enumerate(window):
            self.sgrads.L[word] = dx[n * d:(n + 1) * d]
Exemple #49
0
    def _acc_grads(self, window, label):
        """
        Accumulate gradients for one training point (window, label).

        The word vectors named by `window` are concatenated and fed
        through a tanh hidden layer (W, b1) followed by a softmax output
        layer (U, b2).  Gradients for W, b1, U and b2 -- with the
        self.lreg regularization terms on W and U -- are added to
        self.grads, and the gradient for each word vector is stored in
        self.sgrads.L.

        window -- sequence of row indices into the word-vector matrix
        label  -- integer index of the correct class
        """
        ##
        # Forward propagation
        # NOTE(review): word vectors are read via self.params.L but the
        # update goes to self.sgrads.L -- verify that L is exposed on
        # self.params and not only on self.sparams.
        words = np.array([self.params.L[x] for x in window])
        x = np.reshape(words, -1)
        layer1 = np.tanh(self.params.W.dot(x) + self.params.b1)
        probs = softmax(self.params.U.dot(layer1) + self.params.b2)

        ##
        # Backpropagation
        dx = probs - make_onehot(label, len(probs))
        dU = np.outer(dx, layer1)
        # The tanh derivative is 1 - layer1^2.  The earlier code squared
        # dU instead, which gave delta2 the shape of U and made dW
        # impossible to add to W.
        delta2 = (1 - np.square(layer1)) * self.params.U.T.dot(dx)
        dW = np.outer(delta2, x)
        # Reshape the input gradient to one row per window position.
        dL = self.params.W.T.dot(delta2).reshape(words.shape)

        dW += self.lreg * self.params.W
        dU += self.lreg * self.params.U

        self.grads.U += dU
        self.grads.W += dW
        self.grads.b2 += dx
        self.grads.b1 += delta2

        for idx, row in zip(window, dL):
            self.sgrads.L[idx] = row
Exemple #50
0
    def _acc_grads(self, window, label):
        """
        Accumulate gradients for one training point (window, label).

        The rows of self.sparams.L named by `window` are stacked into one
        input vector and passed through a tanh hidden layer (W, b1) and a
        softmax output layer (U, b2).  Gradients, including the self.lreg
        regularization terms for W and U, are added to self.grads; the
        gradient for each selected word vector is stored in
        self.sgrads.L.

        window -- sequence of row indices into self.sparams.L
        label  -- integer index of the correct class
        """
        ##
        # Forward propagation
        x = hstack(self.sparams.L[window])
        h = tanh(self.params.W.dot(x) + self.params.b1)
        p = softmax(self.params.U.dot(h) + self.params.b2)

        ##
        # Backpropagation
        delta = p - make_onehot(label, len(p))
        self.grads.U += outer(delta, h) + self.lreg * self.params.U
        self.grads.b2 += delta

        # Hidden-layer error; 1 - h**2 is the derivative of tanh.
        da = self.params.U.T.dot(delta) * (1 - h ** 2)
        self.grads.W += outer(da, x) + self.lreg * self.params.W
        # Accumulate like every other gradient here; a plain assignment
        # would replace the accumulator instead of adding to it.
        self.grads.b1 += da

        # Split the input gradient into one row per window position
        # (previously hard-coded to exactly three positions).
        dx = reshape(self.params.W.T.dot(da), (len(window), -1))
        for idx, row in zip(window, dx):
            self.sgrads.L[idx] = row
Exemple #51
0
    def _acc_grads(self, idx, label):
        """
        Accumulate gradients from a single training example.

        The representation self.sparams.L[idx] is scored by a linear
        layer (W, b) followed by softmax.  Gradients for W (including the
        self.lreg regularization term) and b are added to self.grads, and
        the gradient for the representation is stored in self.sgrads.L.

        idx   -- row index into self.sparams.L
        label -- integer index of the correct class
        """
        weights = self.params.W

        ##
        # Forward propagation
        features = self.sparams.L[idx]
        probs = softmax(weights.dot(features) + self.params.b)

        ##
        # Backpropagation
        # Error at the softmax input: prediction minus one-hot target.
        err = probs - make_onehot(label, len(probs))

        # dJ/dW, dJ/db
        self.grads.W += outer(err, features) + self.lreg * weights
        self.grads.b += err

        # dJ/dL: sparse update, recorded for row idx only.
        self.sgrads.L[idx] = weights.T.dot(err)
Exemple #52
0
    def _acc_grads(self, idx, label):
        """
        Add the gradients of one training example to the accumulators.

        self.sparams.L[idx] is the input; it is mapped to class
        probabilities by softmax(W x + b).  The gradients for W (with the
        self.lreg regularization term) and b go into self.grads, and the
        gradient for the input row is stored in self.sgrads.L[idx].

        idx   -- row index into self.sparams.L
        label -- integer index of the correct class
        """
        W = self.params.W
        rep = self.sparams.L[idx]  # representation for this example

        # Forward pass
        scores = W.dot(rep) + self.params.b
        p = softmax(scores)

        # Gradient at the softmax input: p minus the one-hot target.
        target = make_onehot(label, len(p))
        diff = p - target

        # Dense parameter gradients.
        self.grads.W += outer(diff, rep) + self.lreg * W
        self.grads.b += diff

        # Sparse gradient for the single row L[idx].
        self.sgrads.L[idx] = W.T.dot(diff)
Exemple #53
0
    def _acc_grads(self, window, label):
        """
        Accumulate gradients for one training point (window, label).

        The rows of self.sparams.L named by `window` are concatenated,
        fed through a tanh hidden layer (W, b1) and a softmax output
        layer (U, b2).  Gradients for W, b1, U and b2 -- with the
        self.lreg regularization terms on W and U -- are added to
        self.grads; the gradient for each selected word vector is stored
        in self.sgrads.L.

        window -- sequence of row indices into self.sparams.L
        label  -- integer index of the correct class
        """
        W, U = self.params.W, self.params.U

        ##
        # Forward propagation
        x = self.sparams.L[window]
        d = x.shape[1]  # length of one word vector
        x = x.reshape(-1)
        h = tanh(W.dot(x) + self.params.b1)
        y_hat = softmax(U.dot(h) + self.params.b2)

        ##
        # Backpropagation
        delta = y_hat - make_onehot(label, len(y_hat))

        # dJ/dU, dJ/db2
        self.grads.U += outer(delta, h) + self.lreg * U
        self.grads.b2 += delta

        # Hidden-layer error (1 - h**2 is tanh'), computed once instead
        # of being rebuilt for each of W, b1 and L.
        hidden_err = (1 - h ** 2) * delta.dot(U)

        # dJ/dW, dJ/db1
        self.grads.W += outer(hidden_err, x) + self.lreg * W
        self.grads.b1 += hidden_err

        # dJ/dL: one slice of the input gradient per window position.
        # enumerate() replaces xrange(), which does not exist on Python 3.
        input_grad = hidden_err.dot(W)
        for n, word in enumerate(window):
            self.sgrads.L[word] = input_grad[n * d:(n + 1) * d]
Exemple #54
0
    def _acc_grads(self, window, label):
        """
        Accumulate the gradients of one (window, label) training point.

        The forward pass is delegated to self._forward(window); from its
        results the gradients for U, b2, W and b1 (with the self.lreg
        regularization terms on U and W) are added to self.grads, and the
        input gradient is split into chunks of self.D entries, one per
        entry of `window`, and stored in self.sgrads.L.

        window -- sequence of row indices for self.sgrads.L
        label  -- integer index of the correct class
        """
        ##
        # Forward propagation
        # NOTE(review): the tuple is read as (input, hidden
        # pre-activation, hidden activation, output pre-activation,
        # output probabilities); confirm against _forward.
        x, _z1, h, _z2, probs = self._forward(window)

        ##
        # Backpropagation
        # Output error: prediction minus one-hot target.
        out_err = probs - make_onehot(label, len(probs))
        self.grads.U += outer(out_err, h)
        self.grads.U += self.lreg * self.params.U
        self.grads.b2 += out_err

        # Hidden error; the (1 - h^2) factor treats h as a tanh output.
        hid_err = self.params.U.T.dot(out_err) * (1 - h ** 2)
        self.grads.W += outer(hid_err, x)
        self.grads.W += self.lreg * self.params.W
        self.grads.b1 += hid_err

        # Input gradient, cut into consecutive chunks of self.D entries.
        in_grad = self.params.W.T.dot(hid_err)
        for pos, word in enumerate(window):
            lo = pos * self.D
            hi = (pos + 1) * self.D
            self.sgrads.L[word] = in_grad[lo:hi]
    def _acc_grads(self, window, label):
        """
        Accumulate the gradients of one (window, label) training point.

        The rows of self.sparams.L named by `window` are concatenated and
        passed through a hidden layer (W, b1) whose activation is
        2 * sigmoid(2 z) - 1, then through a softmax output layer
        (U, b2).  Gradients, including the self.lreg regularization terms
        on U and W, are added to self.grads; the gradient for each word
        vector is stored in self.sgrads.L.

        window -- sequence of row indices into self.sparams.L
        label  -- integer index of the correct class
        """
        W = self.params.W
        U = self.params.U

        ##
        # Forward propagation
        inputs = concatenate([self.sparams.L[w] for w in window])
        pre_hidden = W.dot(inputs) + self.params.b1
        # 2 * sigmoid(2z) - 1 is the sigmoid form of tanh(z).
        hidden = 2 * sigmoid(2 * pre_hidden) - 1
        probs = softmax(U.dot(hidden) + self.params.b2)

        ##
        # Backpropagation
        # Output error: prediction minus one-hot target.
        out_err = probs - make_onehot(label, len(probs))

        # dJ/dU, dJ/db2
        self.grads.U += outer(out_err, hidden) + self.lreg * U
        self.grads.b2 += out_err

        # dJ/dW, dJ/db1
        hid_err = U.T.dot(out_err) * (1 - hidden ** 2)
        self.grads.W += outer(hid_err, inputs) + self.lreg * W
        self.grads.b1 += hid_err

        # dJ/dL_i: W is cut column-wise into one equal block per window
        # position; each block back-propagates the hidden error to the
        # corresponding word vector.
        column_blocks = split(self.params.W, len(window), axis=1)
        for word, block in zip(window, column_blocks):
            self.sgrads.L[word] = block.T.dot(hid_err)
Exemple #56
0
    def _acc_grads(self, window, label):
        """
        Accumulate the gradients of one (window, label) training point.

        The input is built by self.build_input_context(window), passed
        through a tanh layer (W, b1) and a softmax layer (U, b2).
        Gradients, including the self.lreg regularization terms on U and
        W, are added to self.grads.  For each entry of `window` the
        matching block of self.n columns of W is used to compute the
        gradient stored in self.sgrads.L.

        window -- sequence of row indices for self.sgrads.L
        label  -- integer index of the correct class
        """
        W = self.params.W
        U = self.params.U

        ##
        # Forward propagation
        context = self.build_input_context(window)
        hidden = tanh(W.dot(context) + self.params.b1)
        probs = softmax(U.dot(hidden) + self.params.b2)

        ##
        # Backpropagation
        # Softmax layer: prediction minus one-hot target.
        err_out = probs - make_onehot(label, self.nclass)
        self.grads.b2 += err_out
        self.grads.U += outer(err_out, hidden) + self.lreg * U

        # tanh layer: (1 - hidden^2) is the derivative of tanh.
        err_hidden = (1.0 - hidden ** 2) * U.T.dot(err_out)
        self.grads.b1 += err_hidden
        self.grads.W += outer(err_hidden, context) + self.lreg * W

        # Word-vector gradients: position j uses columns
        # [j * n, (j + 1) * n) of W.
        width = self.n
        for pos, word in enumerate(window):
            first = pos * width
            block = self.params.W[:, first:first + width]
            self.sgrads.L[word] = block.T.dot(err_hidden)
Exemple #57
0
    def _acc_grads(self, window, label):
        """
        Accumulate gradients for one training point (window, label).

        The rows of self.sparams.L named by `window` are stacked into one
        input vector and passed through a tanh hidden layer (W, b1) and a
        softmax output layer (U, b2).  Gradients, including the self.lreg
        regularization terms for W and U, are added to self.grads; the
        input gradient is reshaped to (self.windowSize, self.wordVecLen)
        and stored row by row in self.sgrads.L.

        window -- sequence of row indices into self.sparams.L
        label  -- integer index of the correct class
        """
        ##
        # Forward propagation
        x = hstack(self.sparams.L[window, :])
        h = tanh(self.params.W.dot(x) + self.params.b1)
        y_pre = softmax(self.params.U.dot(h) + self.params.b2)
        y_act = make_onehot(label, len(y_pre))

        ##
        # Backpropagation
        delta = y_pre - y_act
        self.grads.U += outer(delta, h) + self.lreg * self.params.U
        self.grads.b2 += delta

        # Hidden-layer error (1 - h**2 is tanh'), computed once and
        # shared by the W, b1 and L gradients.
        hidden_err = self.params.U.T.dot(delta) * (1 - h ** 2)
        self.grads.W += outer(hidden_err, x) + self.lreg * self.params.W
        self.grads.b1 += hidden_err

        sgrad_L = self.params.W.T.dot(hidden_err)
        sgrad_L = sgrad_L.reshape(self.windowSize, self.wordVecLen)
        # range() replaces xrange(), which is undefined on Python 3.
        for i in range(self.windowSize):
            self.sgrads.L[window[i], :] = sgrad_L[i, :]