Example #1
    def _acc_grads(self, window, label):
        """
        Accumulate gradients, given a training point
        (window, label) of the format

        window = [x_{i-1} x_{i} x_{i+1}] # three ints
        label = {0,1,2,3,4} # single int, gives class

        Your code should update self.grads and self.sgrads,
        in order for gradient_check and training to work.

        So, for example:
        self.grads.U += (your gradient dJ/dU)
        self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
        """
        #### YOUR CODE HERE ####

        # Look up the three window words in L and concatenate their embeddings
        # into a single (1 x 3*d) row; despite its name, "onehot_vecs" holds
        # dense word vectors, not one-hot vectors.
        onehot_vecs = expand_dims(self.sparams.L[window, :].flatten(), axis=0)

        #print "onehot_vecs.shape: %s " % (onehot_vecs.shape,)

        ##
        # Forward propagation
        a1 = self.params.W.dot(onehot_vecs.T).T + self.params.b1
        s = sigmoid(2.0 * a1)
        h = 2.0 * s - 1.0
        a2 = self.params.U.dot(h.T).T + self.params.b2
        y_hat = softmax(a2)

        ##
        # Backpropagation
        t = zeros(y_hat.shape)
        t[:, label] = 1

        delta_out = y_hat - t

        self.grads.U += h.T.dot(delta_out).T + self.lreg * self.params.U

        #print "delta_out  shape: %s" % (delta_out.shape,)

        self.grads.b2 += delta_out.flatten()
        #print "self.grads.b2.shape: %s " % (self.grads.b2.shape,)

        delta_hidden = delta_out.dot(self.params.U) * 4.0 * sigmoid_grad(s)

        self.grads.W += delta_hidden.T.dot(onehot_vecs) + self.lreg * self.params.W
        self.grads.b1 += delta_hidden.flatten()

        #print "self.grads.b2.shape: %s " % (self.grads.b1.shape,)

        grad_xs = delta_hidden.dot(self.params.W).T
        #print "grad_xs.shape: %s " % (grad_xs.shape,)

        # grad_xs stacks the three per-word input gradients; with 50-dimensional
        # word vectors the blocks are rows 0:50, 50:100 and 100:150 of grad_xs.
        self.sgrads.L[window[0]] = grad_xs[0:50].flatten()
        self.sgrads.L[window[1]] = grad_xs[50:100].flatten()
        self.sgrads.L[window[2]] = grad_xs[100:150].flatten()
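The examples in this listing rely on names defined elsewhere in their module: NumPy helpers (expand_dims, zeros, outer) and activation utilities sigmoid, sigmoid_grad and softmax, where sigmoid_grad is applied to the *output* of sigmoid. The definitions below are a minimal sketch consistent with how those helpers are used here, not the original implementations:

import numpy as np

def sigmoid(x):
    # elementwise logistic function
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_grad(f):
    # derivative of the sigmoid, expressed in terms of its output f = sigmoid(x)
    return f * (1.0 - f)

def softmax(x):
    # numerically stable softmax over the entries of a vector / single-row matrix
    e = np.exp(x - np.max(x))
    return e / np.sum(e)

The factor 4.0 in delta_hidden above comes from the tanh reparameterization used in the forward pass: with s = sigmoid(2*a1), the hidden activation h = 2*s - 1 equals tanh(a1), and its derivative with respect to a1 is 4*s*(1 - s) = 4*sigmoid_grad(s). A quick numeric check of both identities (plain NumPy, names local to this sketch):

a = np.linspace(-3.0, 3.0, 7)
s = sigmoid(2.0 * a)
h = 2.0 * s - 1.0
assert np.allclose(h, np.tanh(a))                        # 2*sigmoid(2a) - 1 == tanh(a)
assert np.allclose(4.0 * sigmoid_grad(s), 1.0 - h ** 2)  # d tanh(a)/da == 4*s*(1-s)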
Example #2
    def _acc_grads(self, xs, ys):
        """
        Accumulate gradients, given a pair of training sequences:
        xs = [<indices>] # input words
        ys = [<indices>] # output words (to predict)

        Your code should update self.grads and self.sgrads,
        in order for gradient_check and training to work.

        So, for example:
        self.grads.H += (your gradient dJ/dH)
        self.sgrads.L[i] = (gradient dJ/dL[i]) # update row

        Per the handout, you should:
            - make predictions by running forward in time
                through the entire input sequence
            - for *each* output word in ys, compute the
                gradients with respect to the cross-entropy
                loss for that output word
            - run backpropagation-through-time for self.bptt
                timesteps, storing grads in self.grads (for H, U)
                and self.sgrads (for L)

        You'll want to store your predictions \hat{y}(t)
        and the hidden layer values h(t) as you run forward,
        so that you can access them during backpropagation.

        At time 0, you should initialize the hidden layer to
        be a vector of zeros.
        """

        # Expect xs as list of indices
        ns = len(xs)

        # make matrix here of corresponding h(t)
        # hs[-1] = initial hidden state (zeros)
        hs = zeros((ns + 1, self.hdim))
        # predicted probas
        ps = zeros((ns, self.vdim))

        #### YOUR CODE HERE ####
        ##
        # Forward propagation
        for step in xrange(0, ns):
            # print "hs[step-1].shape %s" % (hs[step-1].shape,)
            # print "self.params.H.shape %s" % (self.params.H.shape,)
            # print "self.sparams.L.shape %s" % (self.sparams.L.shape,)
            # print "self.sparams.L[xs[step]].shape %s" % (self.sparams.L[xs[step]].shape,)
            a1 = self.params.H.dot(hs[step - 1].T).T + self.sparams.L[xs[step]]
            a1 = expand_dims(a1, axis=0)
            h = sigmoid(a1)
            a2 = self.params.U.dot(h.T).T
            # print "h.flatten().shape %s" % (h.flatten().shape,)
            # print "a2.shape %s" % (a2.shape,)
            # print "self.params.U.shape %s" % (self.params.U.shape,)
            y_hat = softmax(a2)

            # print "y_hat.shape %s" % (y_hat.shape,)

            hs[step] = h.flatten()
            ps[step] = y_hat

        ##
        # Backward propagation through time
        for step in xrange(ns - 1, -1, -1):
            t = zeros(ps[step].shape)
            t[ys[step]] = 1
            delta_out = ps[step] - t
            self.grads.U += outer(hs[step], delta_out).T

            delta_hidden = delta_out.dot(self.params.U) * sigmoid_grad(hs[step])

            # Truncated backpropagation through time: propagate the error back
            # through at most self.bptt earlier timesteps.
            for step_bp in xrange(step, step - self.bptt - 1, -1):
                if step_bp < 0:
                    break
                self.grads.H += outer(delta_hidden, hs[step_bp - 1])
                self.sgrads.L[xs[step_bp]] = delta_hidden
                delta_hidden = delta_hidden.dot(self.params.H) * sigmoid_grad(hs[step_bp - 1])
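For reference, the loop above accumulates the following per-timestep quantities, restated here in column-vector notation, where e_{y_t} is the one-hot vector for the target word, L_{x_\tau} is the row of L for word x_\tau, and sigmoid_grad(h) = h ⊙ (1 − h):

\[
\delta^{\text{out}}_t = \hat{y}^{(t)} - e_{y_t},
\qquad
\frac{\partial J^{(t)}}{\partial U} \mathrel{{+}{=}} \delta^{\text{out}}_t \, {h^{(t)}}^{\!\top},
\qquad
\delta_t = \bigl(U^\top \delta^{\text{out}}_t\bigr) \odot h^{(t)} \odot \bigl(1 - h^{(t)}\bigr),
\]
and, for \(\tau = t,\, t-1,\, \dots,\, \max(0,\, t - \text{bptt})\):
\[
\frac{\partial J^{(t)}}{\partial H} \mathrel{{+}{=}} \delta_\tau \, {h^{(\tau-1)}}^{\!\top},
\qquad
\frac{\partial J^{(t)}}{\partial L_{x_\tau}} \mathrel{{+}{=}} \delta_\tau,
\qquad
\delta_{\tau-1} = \bigl(H^\top \delta_\tau\bigr) \odot h^{(\tau-1)} \odot \bigl(1 - h^{(\tau-1)}\bigr).
\]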