Example No. 1
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        ns = len(xs)
        hs = np.zeros((ns+1,self.hdim))
        for i in range(ns):
            hs[i+1] = sigmoid(self.params.H.dot(hs[i])+self.params.W.dot(self.sparams.L[xs[i]]))
            nodeCur = self.word2node[ys[i]]
            while nodeCur.parent != None:
                t = 1
                if nodeCur.isLeft == False:
                    t = -1
                nodeCur = nodeCur.parent
                J += -np.log(sigmoid(t*nodeCur.hActs.dot(hs[i+1])))
        #### END YOUR CODE ####
        x = self.hierarchicalU.getSumSquareU(self.hierarchicalU.root)
        Jreg = 0.5*self.lreg*(np.sum(self.params.H**2)+np.sum(self.params.W**2) + x)
        return J + Jreg
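The examples in this listing call small helpers such as sigmoid, sigmoid_grad, softmax, make_onehot and multinomial_sample without defining them (the docstrings below only mention that multinomial_sample lives in nn.math). As a reading aid, here is a minimal NumPy sketch of what these helpers are assumed to compute; these are illustrative stand-ins, not the original nn.math implementations:

import numpy as np

def sigmoid(x):
    # elementwise logistic function
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_grad(s):
    # derivative of the sigmoid, written in terms of its output s = sigmoid(x)
    return s * (1.0 - s)

def softmax(x):
    # numerically stable softmax of a 1-D score vector
    e = np.exp(x - np.max(x))
    return e / e.sum()

def make_onehot(i, n):
    # length-n vector with a 1 at position i
    v = np.zeros(n)
    v[i] = 1.0
    return v

def multinomial_sample(p):
    # draw one index from the probability vector p
    return int(np.searchsorted(np.cumsum(p), np.random.random()))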
Example No. 2
    def compute_seq_ppl(self, xs, ys):
        #### YOUR CODE HERE ####
        J = 0
        ns = len(xs)
        hs = zeros((ns+1, self.hdim))
        cs = zeros((ns, self.cdim))
        # predicted probas
        ps = zeros((ns, self.Udim))

        #### YOUR CODE HERE ####
        L = self.sparams.L
        Lc = self.Lcluster
        cfreq = self.cfreq
        cwords = self.cwords
        direct_size = self.hsize
        U = self.params.U
        H = self.params.H
        C = zeros((self.cdim, self.hdim))
        if self.isCompression is True:
            C = self.params.C
        ##
        # Forward propagation
        for i in xrange(ns):
            hs[i+1] = sigmoid(H.dot(hs[i]) + L[xs[i]])
            #hs[i+1] = 2.0/(1 + exp(-2.0*(H.dot(hs[i]) + L[xs[i]]))) - 1
            #without maximum entropy optimization
            word_cluster = Lc[ys[i]]
            st_word = cwords[word_cluster, 0]
            ed_word = st_word + cfreq[word_cluster]
            
            part_cluster = zeros((self.class_size, ))
            part_word = zeros((ed_word - st_word, ))
            if self.isME is True:
                if direct_size > 0 and xs[i] != -1:
                    part_cluster += self.params.cluster_direct[xs[i]]
                    indexs = cwords[word_cluster, 0:int(cfreq[word_cluster])]
                    
                    if xs[i] < direct_size:
                        part_word += self.params.word_direct[xs[i], indexs]
            
            if self.isCompression is True:
                cs[i] = sigmoid(C.dot(hs[i+1]))
                part_cluster += U[self.vdim:].dot(cs[i])
                part_word += U[st_word:ed_word].dot(cs[i])
                ps[i, self.vdim:] = softmax(part_cluster)
                ps[i, st_word:ed_word] = softmax(part_word)
                
            else:
                part_cluster += U[self.vdim:].dot(hs[i+1])
                part_word += U[st_word:ed_word].dot(hs[i+1])
                
                ps[i, self.vdim:] = softmax(part_cluster)
                ps[i, st_word:ed_word] = softmax(part_word)
                #ps[i, self.vdim:] = softmax(U[self.vdim:,:].dot(hs[i+1]))
                #ps[i, st_word:ed_word] = softmax(U[st_word:ed_word,:].dot(hs[i+1]))
            
            #print maximum(ps[i, ys[st_word:ed_word]]), ps[i,ys[i]], maximum(ps[i, self.vdim:]), ps[i, self.vdim+word_cluster]
            J -= log(ps[i, ys[i]] * ps[i, self.vdim+word_cluster])
        
        return J
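Example No. 2 factorizes the output distribution by word class: the probability of a word is P(class(w) | h) * P(w | class(w), h), which is exactly what the line J -= log(ps[i, ys[i]] * ps[i, self.vdim+word_cluster]) accumulates. A self-contained toy check of that factorization, with invented sizes that are not tied to the model above:

import numpy as np

np.random.seed(0)
hdim, vdim, n_class = 4, 10, 2            # toy sizes
words_per_class = vdim // n_class
U_word = np.random.randn(vdim, hdim)      # per-word output vectors
U_class = np.random.randn(n_class, hdim)  # per-class output vectors
h = np.random.randn(hdim)                 # some hidden state

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

w = 7                                     # target word index
c = w // words_per_class                  # its class
lo, hi = c * words_per_class, (c + 1) * words_per_class

p_class = softmax(U_class.dot(h))         # P(class | h)
p_word = softmax(U_word[lo:hi].dot(h))    # P(word | class, h), softmax only over that class's words
loss = -np.log(p_class[c] * p_word[w - lo])
print(loss)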
Example No. 3
    def _acc_grads(self, xs, ys):
        #### YOUR CODE HERE ####
        # Expect xs as list of indices
        ns = len(xs)
        # make matrix here of corresponding h(t)
        # hs[-1] = initial hidden state (zeros)
        hs = np.zeros((ns+1, self.hdim))
        # predicted probas
        ps = np.zeros((ns+1, self.vdim))

        #### YOUR CODE HERE ####
        ##
        # Forward propagation

        zs = np.zeros((ns+1,self.hdim))
        for i in range(ns):
            zs[i+1] = self.params.H.dot(hs[i]) + self.params.W.dot(self.sparams.L[xs[i]])
            hs[i+1] = sigmoid(zs[i+1])
            
        ##
        # Backward propagation through time
        sgradsTmp = np.zeros((self.vdim,self.hdim)) 
        grad0 = np.zeros((ns+1,self.hdim)) # (y-t)*U 
        for i in range(ns):
            nodeCur = self.word2node[ys[i]]
            while nodeCur.parent != None:
                t = 1
                if nodeCur.isLeft == False:
                    t = 0
                nodeCur = nodeCur.parent
                if nodeCur.grad == None:
                    nodeCur.grad = (sigmoid(nodeCur.hActs.dot(hs[i+1]))-t)*hs[i+1]
                else:
                    nodeCur.grad = nodeCur.grad + (sigmoid(nodeCur.hActs.dot(hs[i+1]))-t)*hs[i+1]
                    
                grad0[i+1] = grad0[i+1] + (sigmoid(nodeCur.hActs.dot(hs[i+1]))-t)*nodeCur.hActs

                    
            vectorCurrent = grad0[i+1]*sigmoidGrad(zs[i+1])
            for j in range(min(i+1,self.bptt+1)):
                xh1 = np.ones((self.hdim, self.hdim)).dot(np.diag(hs[i-j]))
                self.grads.H += np.diag(vectorCurrent).dot(xh1)
                x1 = np.ones((self.hdim, self.hdim)).dot(np.diag(self.sparams.L[xs[i-j]]))
                self.grads.W += np.diag(vectorCurrent).dot(x1)
                sgradsTmp[xs[i-j]] += vectorCurrent.dot(self.params.W)
                
                vectorCurrent = vectorCurrent.dot(self.params.H)
                vectorCurrent = vectorCurrent*sigmoidGrad(zs[i-j])

        
        self.hierarchicalU.regularizedGrad(self.hierarchicalU.root,self.lreg)
        self.grads.H += self.lreg*self.params.H
        self.grads.W += self.lreg*self.params.W
        
        for i in range(len(sgradsTmp)):
            self.sgrads.L[i] = sgradsTmp[i,:]
Example No. 4
    def _acc_grads(self, xs, ys, d):

        # Expect xs as list of indices
        ns = len(xs)

        # make matrix here of corresponding h(t)
        # hs[-1] = initial hidden state (zeros)
        hs = zeros((ns+1, self.hdim))

        # predicted probas
        ps = zeros((ns, self.vdim))

        zs = zeros((ns+1, self.hdim))

        ##
        # Forward propagation
        d_vec = self.sparams.D[d]
        for t in xrange(ns):
            x_t = xs[t]
            zs[t] = self.params.H.dot(hs[t-1]) + self.sparams.L[x_t] + d_vec
            hs[t] = sigmoid(zs[t])
            ps[t] = softmax(self.params.U.dot(hs[t]) + self.params.G.dot(d_vec.T).reshape(self.vdim,))

        ##
        # Backward propagation through time

        d_grad = zeros_like(self.sparams.D[0])
        for t in reversed(xrange(ns)):
            delta = zeros((ns, self.hdim))
            p_t = ps[t]
            eps_t = p_t - make_onehot(ys[t], len(p_t))
            self.grads.U += outer(eps_t, hs[t])
            self.grads.G += outer(eps_t, d_vec)
            d_grad += self.params.G.T.dot(eps_t)
            sig_prime_t = sigmoid(zs[t])*(1.-sigmoid(zs[t]))
            delta[t] = sig_prime_t * self.params.U.T.dot(eps_t)
            self.sgrads.L[xs[t]] = delta[t].copy()
            d_grad += delta[t].copy()
            self.grads.H += outer(delta[t], hs[t-1])
            for i in xrange(1, self.bptt):
                j = t-i
                if j < 0: continue
                sig_prime_j = sigmoid(zs[j])*(1.-sigmoid(zs[j]))
                delta[j] = sig_prime_j * self.params.H.T.dot(delta[j+1])
                self.sgrads.L[xs[j]] = delta[j].copy()
                d_grad += delta[j].copy()
                self.grads.H += outer(delta[j], hs[j-1])

        self.sgrads.D[d] = d_grad.copy()
Example No. 5
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        ns = len(xs)
        hs = zeros((ns+1, self.hdim))
        ps = zeros((ns, self.vdim))

        for i in xrange(ns):
            hs[i] = sigmoid(self.params.H.dot(hs[i-1]) + self.sparams.L[xs[i]])
            ps[i] = softmax(self.params.U.dot(hs[i]))
            J -= log(ps[i][ys[i]])


        #### END YOUR CODE ####
        return J
Example No. 6
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        ns = len(xs)

        # make matrix here of corresponding h(t)
        # hs[-1] = initial hidden state (zeros)
        hs = zeros((ns+1, self.hdim))
        # predicted probas
        ps = zeros((ns, self.vdim))

        for t in xrange(ns):
            hs[t] = sigmoid(self.params.H.dot(hs[t-1]) + self.sparams.L[xs[t]])
            ps[t] = softmax(self.params.U.dot(hs[t]))
            J -= log(ps[t,ys[t]])
        #### END YOUR CODE ####
        return J
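Several of the compute_seq_loss examples (e.g. No. 5-7) implement the same recurrence: h(t) = sigmoid(H.dot(h(t-1)) + L[x_t]), y_hat(t) = softmax(U.dot(h(t))), and J = -sum_t log y_hat(t)[y_t]. A standalone sketch with small random parameters (hypothetical dimensions, not taken from any of the classes above) makes the shapes explicit:

import numpy as np

np.random.seed(0)
hdim, vdim = 5, 8                       # hidden size Dh and vocabulary size V (toy values)
H = np.random.randn(hdim, hdim) * 0.1   # hidden-to-hidden weights
L = np.random.randn(vdim, hdim) * 0.1   # word embeddings, one row per word
U = np.random.randn(vdim, hdim) * 0.1   # hidden-to-output weights

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

xs = [3, 1, 4]                          # input word indices
ys = [1, 4, 2]                          # next-word targets

h = np.zeros(hdim)                      # initial hidden state h(-1) = 0
J = 0.0
for x, y in zip(xs, ys):
    h = sigmoid(H.dot(h) + L[x])        # (Dh,Dh) dot (Dh,) plus (Dh,) -> (Dh,)
    p = softmax(U.dot(h))               # (V,Dh) dot (Dh,) -> (V,)
    J -= np.log(p[y])
print(J)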
Example No. 7
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.
        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        ns = len(xs)
        self.xs = xs
        self.ys=ys
        
        hs = zeros((ns+1, self.hdim))
        self.hs1 = hs
        # for each time step
        for t in xrange(ns):
            hs[t] = sigmoid(dot(self.params.H, hs[t - 1]) + self.sparams.L[xs[t]])
            y_hat = softmax(dot(self.params.U, hs[t]))
            J -= log(y_hat[ys[t]])

        #### END YOUR CODE ####
        return J
Example No. 8
    def _acc_grads(self, xs, ys):
        """
        Accumulate gradients, given a pair of training sequences:
        xs = [<indices>] # input words
        ys = [<indices>] # output words (to predict)

        Your code should update self.grads and self.sgrads,
        in order for gradient_check and training to work.

        So, for example:
        self.grads.H += (your gradient dJ/dH)
        self.sgrads.L[i] = (gradient dJ/dL[i]) # update row

        Per the handout, you should:
            - make predictions by running forward in time
                through the entire input sequence
            - for *each* output word in ys, compute the
                gradients with respect to the cross-entropy
                loss for that output word
            - run backpropagation-through-time for self.bptt
                timesteps, storing grads in self.grads (for H, U)
                and self.sgrads (for L)

        You'll want to store your predictions \hat{y}(t)
        and the hidden layer values h(t) as you run forward,
        so that you can access them during backpropagation.

        At time 0, you should initialize the hidden layer to
        be a vector of zeros.
        """

        # Expect xs as list of indices
        ns = len(xs)

        # make matrix here of corresponding h(t)
        # hs[-1] = initial hidden state (zeros)
        hs = zeros((ns+1, self.hdim))
        # predicted probs
        ps = zeros((ns, self.vdim))

        #### YOUR CODE HERE ####

        # forward propagation
        for t in xrange(ns):
            hs[t] = sigmoid(dot(self.params.H, hs[t-1]) + self.sparams.L[xs[t]])
            ps[t] = softmax(dot(self.sparams.U, hs[t]))

        # backpropagation through time
        for i in xrange(ns):
            d2i = ps[i]
            d2i[ys[i]] -= 1
            d1 = dot(self.sparams.U.T, d2i) * hs[i] * (1 - hs[i])

            self.sgrads.U = dot(d2i.reshape((-1, 1)), hs[i].reshape((1, -1)))

            for t in xrange(i, i - self.bptt - 1, -1):
                if t >= 0:                          # the farthest reference will thus be hs[-1]
                    self.sgrads.L[xs[t]] = d1
                    self.grads.H += dot(d1.reshape((-1, 1)), hs[t-1].reshape((1, -1)))
                    d1 = dot(self.params.H.T, d1) * hs[t-1] * (1 - hs[t-1])     # accumulate punishments/deltas
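A practical way to sanity-check a backpropagation-through-time routine like the one above is a finite-difference comparison on a tiny model. The sketch below checks dJ/dH for a two-step toy RNN; all names and sizes are invented for the check, and full (non-truncated) BPTT is used:

import numpy as np

np.random.seed(1)
hdim, vdim = 3, 5
H = np.random.randn(hdim, hdim) * 0.1
L = np.random.randn(vdim, hdim) * 0.1
U = np.random.randn(vdim, hdim) * 0.1
xs, ys = [2, 0], [0, 3]

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def loss(H):
    h, J = np.zeros(hdim), 0.0
    for x, y in zip(xs, ys):
        h = sigmoid(H.dot(h) + L[x])
        J -= np.log(softmax(U.dot(h))[y])
    return J

def grad_H(H):
    # analytic gradient of the total loss w.r.t. H via full BPTT
    hs = [np.zeros(hdim)]
    for x in xs:
        hs.append(sigmoid(H.dot(hs[-1]) + L[x]))
    g = np.zeros_like(H)
    for t, y in enumerate(ys):
        p = softmax(U.dot(hs[t + 1]))
        p[y] -= 1.0                                  # y_hat - y
        delta = U.T.dot(p) * hs[t + 1] * (1 - hs[t + 1])
        for j in range(t, -1, -1):                   # walk back to the start of the sequence
            g += np.outer(delta, hs[j])              # hs[j] is h(j-1) here
            delta = H.T.dot(delta) * hs[j] * (1 - hs[j])
    return g

eps, num = 1e-6, np.zeros_like(H)
for i in range(hdim):
    for j in range(hdim):
        Hp, Hm = H.copy(), H.copy()
        Hp[i, j] += eps
        Hm[i, j] -= eps
        num[i, j] = (loss(Hp) - loss(Hm)) / (2 * eps)
print(np.max(np.abs(num - grad_H(H))))               # should be on the order of 1e-8 or smaller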
Example No. 9
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        ns = len(xs)
        hs = zeros((ns+1, self.hdim))
        # predicted probas
        ps = zeros((ns, self.vdim))

        #### YOUR CODE HERE ####
        L = self.sparams.L
        U = self.params.U
        H = self.params.H
        
        ##
        # Forward propagation
        for i in xrange(ns):
            hs[i+1] = sigmoid(H.dot(hs[i]) + L[xs[i]])
            #hs[i+1] = 2.0/(1.0 + exp(-2.0*(H.dot(hs[i]) + L[xs[i]]))) - 1.0
            ps[i] = softmax(U.dot(hs[i+1]))
            J -= log(ps[i][ys[i]])
        
        

        #### END YOUR CODE ####
        return J
Example No. 10
    def predict_proba(self, windows):
        """
        Predict class probabilities.

        Should return a matrix P of probabilities,
        with each row corresponding to a row of X.

        windows = array (n x windowsize),
            each row is a window of indices
        """
        # handle singleton input by making sure we have
        # a list-of-lists
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]

        #### YOUR CODE HERE ####
        # construct input matrix
        x = vstack([concatenate([self.sparams.L[idx] for idx in window]) for window in windows])
        z1 = self.params.W.dot(x.T) + self.params.b1[:, newaxis]
        h1 = 2 * sigmoid(2 * z1) - 1
        z2 = self.params.U.dot(h1) + self.params.b2[:, newaxis]
        P = softmax(z2.T)
        #### END YOUR CODE ####

        return P # rows are output for each input
Example No. 11
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """


       
        ns = len(xs)
        hs = zeros((ns+1, self.hdim))
        ps = zeros((ns, self.vdim))
        
        for i in range(ns):
            z1 = self.params.H.dot(hs[i-1]) + self.sparams.L[xs[i]]
            hs[i] = sigmoid(z1)
            z2 = self.params.U.dot(hs[i])
            ps[i] = softmax(z2)        

        J = sum(-log(ps[range(len(ps)), ys]))

        return J
Example No. 12
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        ns = len(xs)

        h_ant = zeros((1, self.hdim))

        J = 0
        #### YOUR CODE HERE ####
        for step in xrange(0,ns):
            # print "hs[step-1].shape %s" % (hs[step-1].shape,)
            # print "self.params.H.shape %s" % (self.params.H.shape,)
            # print "self.sparams.L.shape %s" % (self.sparams.L.shape,)
            # print "self.sparams.L[xs[step]].shape %s" % (self.sparams.L[xs[step]].shape,)
            a1 = self.params.H.dot(h_ant.T).T + self.sparams.L[xs[step]]
            h  = sigmoid( a1 )
            a2 = self.params.U.dot(h.T).T
            # print "h.shape %s" % (h.shape,)
            # print "a2.shape %s" % (a2.shape,)
            # print "self.params.U.shape %s" % (self.params.U.shape,)
            y_hat = softmax( a2 )
            h_ant = h

            J -= log( y_hat[:,ys[step]] )

        #### END YOUR CODE ####
        return J
Example No. 13
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        ns = len(xs)
        h_prev = zeros(self.hdim)
        for t in xrange(ns):
            h_t = sigmoid(dot(self.params.H, h_prev) + self.sparams.L[xs[t]])
            if t == ns - 1:
                yhat_t = softmax(dot(self.params.U, h_t))
                J = -log(yhat_t[ys])

            h_prev = h_t

        J += .5 * self.lamb * (sum(self.params.H**2) + sum(self.params.U**2))

        #### END YOUR CODE ####
        return J
Example No. 14
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        ns = len(xs)
        hs = zeros((ns+1, self.hdim))
        ps = zeros((ns, self.vdim))#(3,10)
        # Forward propagation
        for t in xrange(ns):
            hs[t] = sigmoid(self.params.H.dot(hs[t - 1]) + self.sparams.L[xs[t]])#(Dh,Dh)*(Dh,)+(Dh,)
            ps[t] = softmax(self.params.U.dot(hs[t]))#(V,Dh)*(Dh,)
            J += - log(ps[t][ys[t]])
            #print ps[t]
            #print [ys[t]]
            #J += -ys[t]*log(ps[t])
        #### END YOUR CODE ####
        return J
Example No. 15
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        # hs[-1] = initial hidden state (zeros)
        ns = len(ys)
        hs = zeros((ns+1, self.hdim))

        for t in range(ns):
            hs[t] = sigmoid(self.params.H.dot(hs[t-1]) + self.sparams.L[xs[t]])
            #ps[t] = softmax(self.params.U.dot(hs[t]))
            #J -= log(ps[t][ys[t]])
        h_final = hs[ns-1]
        z = self.params.U.dot(h_final) 
        y_hat = []
        for i in range(n_aspect):
            current = z[sent_dim*i:sent_dim*(i+1)]
            y_hat.extend(softmax(current))
        J = -sum(ys.reshape(len(ys),1)*log(array(y_hat).reshape(len(y_hat),1)))

        #### END YOUR CODE ####
        return J
Example No. 16
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####

        # Expect xs as list of indices
        ns = len(xs)

        # make matrix here of corresponding h(t)
        # hs[-1] = initial hidden state (zeros)
        hs = zeros((ns+1, self.hdim))

        # _for memory purposes_, we do not compute the loss in one fell swoop
        # forward propagation
        for t in xrange(ns):
            hs[t] = sigmoid(dot(self.params.H, hs[t-1]) + self.sparams.L[xs[t]])
            p = softmax(dot(self.sparams.U, hs[t]))
            J -= sum(log(p[ys[t]]))

        #### END YOUR CODE ####
        return J
Example No. 17
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        #J = 0
        ns = len(xs)
        #### YOUR CODE HERE ####
        # forward propagation
        hs = zeros((ns+1, self.hdim))
        ps = zeros((ns, self.vdim)) # predicted probas
        for t in range(0, ns):
            hs[t] = sigmoid(dot(self.params.H, hs[t-1]) + self.sparams.L[xs[t], :])
            ps[t] = softmax(dot(self.params.U, hs[t]))

        J = - sum(log(ps[arange(ns), ys]))

        #### END YOUR CODE ####
        return J
Example No. 18
    def _acc_grads(self, xs, ys):
        """
        Accumulate gradients, given a pair of training sequences:
        xs = [<indices>] # input words
        ys = [<indices>] # output words (to predict)
        Your code should update self.grads and self.sgrads,
        in order for gradient_check and training to work.
        So, for example:
        self.grads.H += (your gradient dJ/dH)
        self.sgrads.L[i] = (gradient dJ/dL[i]) # update row
        Per the handout, you should:
            - make predictions by running forward in time
                through the entire input sequence
            - for *each* output word in ys, compute the
                gradients with respect to the cross-entropy
                loss for that output word
            - run backpropagation-through-time for self.bptt
                timesteps, storing grads in self.grads (for H)
                and self.sgrads (for L,U)
        You'll want to store your predictions \hat{y}(t)
        and the hidden layer values h(t) as you run forward,
        so that you can access them during backpropagation.
        At time 0, you should initialize the hidden layer to
        be a vector of zeros.
        """

        # Expect xs as list of indices
        ns = len(xs) #3
        # make matrix here of corresponding h(t)
        # hs[-1] = initial hidden state (zeros)
        hs = zeros((ns+1, self.hdim))
        # predicted probas
        ps = zeros((ns, self.vdim))

        #### YOUR CODE HERE ####

        ##
        # Forward propagation

        # for each time step
        for t in xrange(ns):
            hs[t] = sigmoid(dot(self.params.H, hs[t - 1]) + self.sparams.L[xs[t]])
            ps[t] = softmax(dot(self.params.U, hs[t]))

        ##
        # Backward propagation through time

        for j in xrange(ns):
            y = make_onehot(ys[j], self.vdim)
            y_hat_minus_y = ps[j] - y
            self.grads.U += outer(y_hat_minus_y, hs[j])
            delta = dot(self.params.U.T, y_hat_minus_y) * hs[j] * (1.0 - hs[j])

            # start at j and go back self.bptt times (total self.bptt + 1 elements, including current one)
            for t in xrange(j, j - self.bptt - 1, -1):
                if t - 1 >= -1:
                    self.grads.H += outer(delta, hs[t - 1]) #See from above.. hs[-1] is list of zeros.
                    self.sgrads.L[xs[t]] = delta
                    delta = dot(self.params.H.T, delta) * hs[t - 1] * (1.0 - hs[t - 1])
Example No. 19
    def generate_sequence(self, init, end, maxlen=100):
        """
        Generate a sequence from the language model,
        by running the RNN forward and selecting,
        at each timestep, a random word from the
        emitted probability distribution.

        The MultinomialSampler class (in nn.math) may be helpful
        here for sampling a word. Use as:

            y = multinomial_sample(p)

        to sample an index y from the vector of probabilities p.


        Arguments:
            init = list of indices of start words (word_to_num['<s>'])
            end = index of end word (word_to_num['</s>'])
            maxlen = maximum length to generate

        Returns:
            ys = sequence of indices
            J = total cross-entropy loss of generated sequence
        """

        J = 0 # total loss
        ys = init # emitted sequence

        #### YOUR CODE HERE ####
        h = np.zeros(self.hdim)
        for x in ys:
            z = self.params.H.dot(h) + self.sparams.L[x]
            h = sigmoid(z)
        while ys[-1] != end and len(ys) < maxlen:  # stop at the end token or at maxlen
            x = ys[-1]
            z = self.params.H.dot(h) + self.sparams.L[x]
            h = sigmoid(z)
            y_hat = softmax(self.params.U.dot(h))
            y = multinomial_sample(y_hat)
            J -= np.log(y_hat[y])
            ys.append(y)


        #### YOUR CODE HERE ####
        return ys, J
Example No. 20
    def generate_sequence(self, init, end, maxlen=100):
        """
        Generate a sequence from the language model,
        by running the RNN forward and selecting,
        at each timestep, a random word from the
        emitted probability distribution.

        The MultinomialSampler class (in nn.math) may be helpful
        here for sampling a word. Use as:

            y = multinomial_sample(p)

        to sample an index y from the vector of probabilities p.


        Arguments:
            init = index of start word (word_to_num['<s>'])
            end = index of end word (word_to_num['</s>'])
            maxlen = maximum length to generate

        Returns:
            ys = sequence of indices
            J = total cross-entropy loss of generated sequence
        """

        J = 0 # total loss
        ys = [init] # emitted sequence
        
        #### YOUR CODE HERE ####
        ns = maxlen

        # make matrix here of corresponding h(t)
        # hs[-1] = initial hidden state (zeros)
        hs = zeros((ns+1, self.hdim))
        # predicted probas
        ps = zeros((ns, self.vdim))

        #### YOUR CODE HERE ####
        H = self.params.H
        U = self.params.U
        L = self.sparams.L
        bptt = self.bptt
        ##
        # Forward propagation
        for t in xrange(ns):
            hs[t + 1] = sigmoid(H.dot(hs[t]) + L[ys[t]])
            ps[t] = softmax(U.dot(hs[t + 1]))
            ys = ys + [multinomial_sample(ps[t])]
            #ys.append(multinomial_sample(ps[t]))
            J -= log(ps[t][ys[t + 1]])  # loss of the word that was just sampled from ps[t]
            if ys[t + 1] == end:
                break
            if t == ns - 1:
                ys = ys + [end]
                       
        #### YOUR CODE HERE ####
        return ys, J
Example No. 21
    def _acc_grads(self, window, label):
        """
        Accumulate gradients, given a training point
        (window, label) of the format

        window = [x_{i-1} x_{i} x_{i+1}] # three ints
        label = {0,1,2,3,4} # single int, gives class

        Your code should update self.grads and self.sgrads,
        in order for gradient_check and training to work.

        So, for example:
        self.grads.U += (your gradient dJ/dU)
        self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
        """
        #### YOUR CODE HERE ####

        onehot_vecs = expand_dims(self.sparams.L[window,:].flatten(),axis=0)

        #print "onehot_vecs.shape: %s " % (onehot_vecs.shape,)

        ##
        # Forward propagation
        a1 = self.params.W.dot(onehot_vecs.T).T + self.params.b1
        s  = sigmoid( 2.0 * a1 )
        h  = 2.0 * s - 1.0
        a2 = self.params.U.dot(h.T).T + self.params.b2
        y_hat = softmax( a2 ) 

        ##
        # Backpropagation
        t = zeros( y_hat.shape )
        t[:,label] = 1

        delta_out = y_hat - t

        self.grads.U  += h.T.dot(delta_out).T + self.lreg * self.params.U

        #print "delta_out  shape: %s" % (delta_out.shape,)

        self.grads.b2 += delta_out.flatten()
        #print "self.grads.b2.shape: %s " % (self.grads.b2.shape,)

        delta_hidden = delta_out.dot(self.params.U) * 4.0 * sigmoid_grad( s )
        
        self.grads.W  += delta_hidden.T.dot(onehot_vecs) + self.lreg * self.params.W
        self.grads.b1 += delta_hidden.flatten()

        #print "self.grads.b2.shape: %s " % (self.grads.b1.shape,)

        grad_xs = delta_hidden.dot(self.params.W).T
        #print "grad_xs.shape: %s " % (grad_xs.shape,)

        self.sgrads.L[window[0]] = grad_xs[range(0,50)].flatten()
        self.sgrads.L[window[1]] = grad_xs[range(50,100)].flatten()
        self.sgrads.L[window[2]] = grad_xs[range(100,150)].flatten()
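Example No. 21 computes the hidden layer as h = 2*sigmoid(2*a1) - 1, which is just tanh(a1), and the factor 4.0 * sigmoid_grad(s) in the backward pass is the matching derivative, since d/dz [2*sigmoid(2z) - 1] = 4*sigmoid(2z)*(1 - sigmoid(2z)) = 1 - tanh(z)**2. A short numeric check of both identities, independent of the class above:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

z = np.linspace(-3, 3, 7)
s = sigmoid(2 * z)
print(np.allclose(2 * s - 1, np.tanh(z)))                 # forward identity
print(np.allclose(4 * s * (1 - s), 1 - np.tanh(z) ** 2))  # derivative identity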
Example No. 22
    def generate_sequence(self, init, end, maxlen=100):
        """
        Generate a sequence from the language model,
        by running the RNN forward and selecting,
        at each timestep, a random word from the
        emitted probability distribution.

        The MultinomialSampler class (in nn.math) may be helpful
        here for sampling a word. Use as:

            y = multinomial_sample(p)

        to sample an index y from the vector of probabilities p.


        Arguments:
            init = index of start word (word_to_num['<s>'])
            end = index of end word (word_to_num['</s>'])
            maxlen = maximum length to generate

        Returns:
            ys = sequence of indices
            J = total cross-entropy loss of generated sequence
        """

        J = 0 # total loss
        ys = [init] # emitted sequence

        #### YOUR CODE HERE ####
        h = zeros(self.hdim)  # start from a zero hidden state, per the convention used elsewhere
        for t in range(maxlen):
            h = sigmoid(self.params.H.dot(h) + self.sparams.L[ys[-1]]) 
            pred = softmax(self.params.U.dot(h))
            y = multinomial_sample(pred)
            ys.append(y)

            J += -1*log(pred[y])
            if y == end:
                break

        #### YOUR CODE HERE ####
        return ys, J
Example No. 23
    def forward_propagation(self,xs):
        n_aspect = N_ASPECTS
        sent_dim = SENT_DIM
        ns = len(xs)
        hs_f = zeros((ns+1, self.hdim))
        hs_b = zeros((ns+1, self.hdim))
        for t in range(ns):
            hs_f[t] = sigmoid(self.params.H_f.dot(hs_f[t-1]) + self.sparams.L[xs[t]] + self.params.b1_f)
        h_f_final = hs_f[ns-1]
        inverted_xs = list(reversed(xs))
        for t in range(ns):
            hs_b[t] = sigmoid(self.params.H_b.dot(hs_b[t-1]) + self.sparams.L[inverted_xs[t]] + self.params.b1_b)
        h_b_final = hs_b[ns-1]

        z = self.params.U.dot(hstack([h_f_final,h_b_final])) + self.params.b2
        y_hat = []
        for i in range(n_aspect):
            current = z[sent_dim*i:sent_dim*(i+1)]
            y_hat.extend(softmax(current))
        return hs_f,hs_b,y_hat
Example No. 24
    def generate_missing_word(self, before, after, wv=None, nres=5):
        Ps = []
        missings = []

        lh = np.zeros(self.hdim)
        for x in before:
            vec = wv[x] if x >= self.vdim else self.sparams.LL[x]
            z = self.params.LH.dot(lh) + vec
            lh = sigmoid(z)
        rh = np.zeros(self.hdim)
        for x in reversed(after):
            vec = wv[x] if x >= self.vdim else self.sparams.RL[x]
            z = self.params.RH.dot(rh) + vec
            rh = sigmoid(z)
        y_hat = softmax(self.params.U.dot(np.concatenate((lh, rh))))
        for i in xrange(nres):
            high_idx = np.argmax(y_hat)
            missings.append(high_idx)
            Ps.append(y_hat[high_idx])
            y_hat[high_idx] = 0.0
        return missings
Example No. 25
    def compute_seq_loss(self, sentence, ys, wv=None):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        """
        ys are not used
        """

        J = 0
        #### YOUR CODE HERE ####
        ns = len(sentence)
        lhs = np.zeros((ns, self.hdim))
        rhs = np.zeros((ns, self.hdim))
        for i in xrange(ns-1):
            x = sentence[i]
            h = lhs[i]
            vec = wv[x] if x >= self.vdim else self.sparams.LL[x]
            lhs[i+1] = sigmoid(self.params.LH.dot(h) + vec)
        for i in reversed(xrange(1, ns)):
            x = sentence[i]
            h = rhs[i]
            vec = wv[x] if x >= self.vdim else self.sparams.RL[x]
            rhs[i-1] = sigmoid(self.params.RH.dot(h) + vec)
        for i in xrange(1, ns-1):
            y = sentence[i]
            if y >= self.vdim:
                y = 3 # UUUNKKK
            y_hat = softmax(self.params.U.dot(np.concatenate((lhs[i], rhs[i]))))
            J -= np.log(y_hat[y])

        #### END YOUR CODE ####
        return J
Example No. 26
    def forward_propagation(self,xs):
        n_aspect = N_ASPECTS
        sent_dim = SENT_DIM
        ns = len(xs)
        hs = zeros((ns+1, self.hdim))
        for t in range(ns):
            hs[t] = sigmoid(self.params.H.dot(hs[t-1]) + self.sparams.L[xs[t]] + self.params.b1)
        h_final = hs[ns-1]
        z = self.params.U.dot(h_final) + self.params.b2
        y_hat = []
        for i in range(n_aspect):
            current = z[sent_dim*i:sent_dim*(i+1)]
            y_hat.extend(softmax(current))

        return hs,y_hat
Example No. 27
    def generate_sequence(self, init, end, maxlen=100):
        """
        Generate a sequence from the language model,
        by running the RNN forward and selecting,
        at each timestep, a random word from the
        emitted probability distribution.

        The MultinomialSampler class (in nn.math) may be helpful
        here for sampling a word. Use as:

            y = multinomial_sample(p)

        to sample an index y from the vector of probabilities p.


        Arguments:
            init = index of start word (word_to_num['<s>'])
            end = index of end word (word_to_num['</s>'])
            maxlen = maximum length to generate

        Returns:
            ys = sequence of indices
            J = total cross-entropy loss of generated sequence
        """

        J = 0 # total loss
        ys = [init] # emitted sequence

        #### YOUR CODE HERE ####
        ns = len(ys)
        t = 0
        nextIdx = init

        hs = zeros((maxlen+1, self.hdim))
        ps = zeros((maxlen, self.vdim))

        while ns <= maxlen and nextIdx != end:
            hs[t] = sigmoid(self.params.H.dot(hs[t-1]) + self.sparams.L[ys[t]])
            ps[t] = softmax(self.params.U.dot(hs[t]))

            J -= log(ps[t,ys[t]])

            nextIdx = multinomial_sample(ps[t])
            ys.append(nextIdx)
            ns = len(ys)
            t += 1
        #### YOUR CODE HERE ####
        return ys, J
Example No. 28
    def generate_sequence(self, init, end, maxlen=100):
        """
        Generate a sequence from the language model,
        by running the RNN forward and selecting,
        at each timestep, a random word from the
        emitted probability distribution.

        The MultinomialSampler class (in nn.math) may be helpful
        here for sampling a word. Use as:

            y = multinomial_sample(p)

        to sample an index y from the vector of probabilities p.


        Arguments:
            init = index of start word (word_to_num['<s>'])
            end = index of end word (word_to_num['</s>'])
            maxlen = maximum length to generate

        Returns:
            ys = sequence of indices
            J = total cross-entropy loss of generated sequence
        """

        J = 0 # total loss
        ys = [init] # emitted sequence

        #### YOUR CODE HERE ####
        ps = zeros((maxlen, self.vdim))
        hs = zeros((maxlen+1, self.hdim))
        H = self.params.H
        L = self.sparams.L
        U = self.params.U
        
        start = init
        for i in xrange(maxlen):
            hs[i+1] = sigmoid(H.dot(hs[i]) + L[start])
            ps[i] = softmax(U.dot(hs[i+1]))
            start = multinomial_sample(ps[i])
            J -= log(ps[i][start])
            ys.append(start)
            
            if start == end:
                break

        #### YOUR CODE HERE ####
        return ys, J
Example No. 29
    def generate_sequence(self, init, end, maxlen=100):
        """
        Generate a sequence from the language model,
        by running the RNN forward and selecting,
        at each timestep, a random word from the
        emitted probability distribution.

        The MultinomialSampler class (in nn.math) may be helpful
        here for sampling a word. Use as:

            y = multinomial_sample(p)

        to sample an index y from the vector of probabilities p.


        Arguments:
            init = index of start word (word_to_num['<s>'])
            end = index of end word (word_to_num['</s>'])
            maxlen = maximum length to generate

        Returns:
            ys = sequence of indices
            J = total cross-entropy loss of generated sequence
        """

        J = 0 # total loss
        ys = [init] # emitted sequence

        #### YOUR CODE HERE ####
        h_ant = zeros((1, self.hdim))

        for step in xrange(maxlen):
            a1 = self.params.H.dot(h_ant.T).T + self.sparams.L[ys[step]]
            h  = sigmoid( a1 )
            a2 = self.params.U.dot(h.T).T
            # print "h.shape %s" % (h.shape,)
            # print "a2.shape %s" % (a2.shape,)
            # print "self.params.U.shape %s" % (self.params.U.shape,)
            y_hat = softmax( a2 )
            h_ant = h
            ys.append( multinomial_sample(y_hat) )
            J -= log( y_hat[:,ys[step]] )


        ys.append(end)

        #### YOUR CODE HERE ####
        return ys, J
Example No. 30
    def generate_sequence(self, init, end, maxlen=100):
        """
        Generate a sequence from the language model,
        by running the RNN forward and selecting,
        at each timestep, a random word from the
        emitted probability distribution.

        The MultinomialSampler class (in nn.math) may be helpful
        here for sampling a word. Use as:

            y = multinomial_sample(p)

        to sample an index y from the vector of probabilities p.


        Arguments:
            init = index of start word (word_to_num['<s>'])
            end = index of end word (word_to_num['</s>'])
            maxlen = maximum length to generate

        Returns:
            ys = sequence of indices
            J = total cross-entropy loss of generated sequence
        """
        J = 0 # total loss
        ys = [init] # emitted sequence

        ns = maxlen
        hs = np.zeros((ns+1,self.hdim))
        #### YOUR CODE HERE ####
        for i in range(ns):
            hs[i+1] = sigmoid(self.params.H.dot(hs[i])+self.params.W.dot(self.sparams.L[ys[i]]))            
            p = self.hierarchicalU.getDistribution(hs[i+1])
            y = multinomial_sample(p)
            ys.append(y)
            if y == end:
                break
            p = p*make_onehot(y,self.vdim)
            J += -np.log(np.sum(p))


        ##
        # x only sums over the nodes whose gradients are updated
        x = self.hierarchicalU.getSumSquareU(self.hierarchicalU.root)
        Jreg = 0.5*self.lreg*(np.sum(self.params.H**2)+np.sum(self.params.W**2)+ x)
        #### YOUR CODE HERE ####
        return ys, J+Jreg
Example No. 31
    def generate_sequence(self, init, end, maxlen=100):
        """
        Generate a sequence from the language model,
        by running the RNN forward and selecting,
        at each timestep, a random word from the
        emitted probability distribution.

        The MultinomialSampler class (in nn.math) may be helpful
        here for sampling a word. Use as:

            y = multinomial_sample(p)

        to sample an index y from the vector of probabilities p.


        Arguments:
            init = index of start word (word_to_num['<s>'])
            end = index of end word (word_to_num['</s>'])
            maxlen = maximum length to generate

        Returns:
            ys = sequence of indices
            J = total cross-entropy loss of generated sequence
        """

        J = 0  # total loss
        ys = [init]  # emitted sequence

        #### YOUR CODE HERE ####
        h_ant = zeros((1, self.hdim))

        for step in xrange(maxlen):
            a1 = self.params.H.dot(h_ant.T).T + self.sparams.L[ys[step]]
            h = sigmoid(a1)
            a2 = self.params.U.dot(h.T).T
            # print "h.shape %s" % (h.shape,)
            # print "a2.shape %s" % (a2.shape,)
            # print "self.params.U.shape %s" % (self.params.U.shape,)
            y_hat = softmax(a2)
            h_ant = h
            ys.append(multinomial_sample(y_hat))
            J -= log(y_hat[:, ys[step]])

        ys.append(end)

        #### YOUR CODE HERE ####
        return ys, J
Example No. 32
    def compute_seq_loss(self, xs, ys):

        J = 0
        #### YOUR CODE HERE ####
        
        ns = len(xs)
        hs = zeros((ns+1, self.hdim))
        ps = zeros((ns, self.vdim))

        # Forward propagation
        for t in range(0,ns):
            hs[t] = sigmoid(dot(hs[t-1],self.params.H)+self.sparams.L[xs[t],:])
            ps[t] = softmax(dot(self.params.U,hs[t]))
            J += -log(ps[t][ys[t]])
        
        #### END YOUR CODE ####
        return J
Example No. 33
    def forward_propagation(self, xs):
        n_aspect = N_ASPECTS
        sent_dim = SENT_DIM
        ns = len(xs)
        hs = zeros((ns + 1, self.hdim))
        for t in range(ns):
            hs[t] = sigmoid(
                self.params.H.dot(hs[t - 1]) + self.sparams.L[xs[t]] +
                self.params.b1)
        h_final = hs[ns - 1]
        z = self.params.U.dot(h_final) + self.params.b2
        y_hat = []
        for i in range(n_aspect):
            current = z[sent_dim * i:sent_dim * (i + 1)]
            y_hat.extend(softmax(current))

        return hs, y_hat
Example No. 34
    def generate_sequence(self, d, init, end, maxlen=100):
        """
        Generate a sequence from the language model,
        by running the RNN forward and selecting,
        at each timestep, a random word from the
        emitted probability distribution.

        The MultinomialSampler class (in nn.math) may be helpful
        here for sampling a word. Use as:

            y = multinomial_sample(p)

        to sample an index y from the vector of probabilities p.


        Arguments:
            init = index of start word (word_to_num['<s>'])
            end = index of end word (word_to_num['</s>'])
            maxlen = maximum length to generate

        Returns:
            ys = sequence of indices
            J = total cross-entropy loss of generated sequence
        """

        J = 0 # total loss
        ys = [init] # emitted sequence

        hs = zeros((maxlen+1, self.hdim))

        curr = init
        t = 0
        d_vec = self.sparams.D[d]
        while curr != end and len(ys) < maxlen:
            x_t = curr
            zs_t = self.params.H.dot(hs[t-1]) + self.sparams.L[x_t] + d_vec
            hs[t] = sigmoid(zs_t)
            ps_t = softmax(self.params.U.dot(hs[t]) + self.params.G.dot(d_vec))
            y = multinomial_sample(ps_t)
            ys.append(y)
            curr = y
            J += -1*log(ps_t[y])
            t += 1

        return ys, J
Example No. 35
    def generate_sequence(self, init, end, maxlen=100):
        """
        Generate a sequence from the language model,
        by running the RNN forward and selecting,
        at each timestep, a random word from the
        emitted probability distribution.

        The MultinomialSampler class (in nn.math) may be helpful
        here for sampling a word. Use as:

            y = multinomial_sample(p)

        to sample an index y from the vector of probabilities p.


        Arguments:
            init = index of start word (word_to_num['<s>'])
            end = index of end word (word_to_num['</s>'])
            maxlen = maximum length to generate

        Returns:
            ys = sequence of indices
            J = total cross-entropy loss of generated sequence
        """

        J = 0 # total loss
        ys = [init] # emitted sequence

        #### YOUR CODE HERE ####
        h = zeros(self.hdim)
        t = 1
        while t < maxlen:
            # shape Dh
            h = sigmoid(self.params.H.dot(h) + self.sparams.L[ys[t-1]])
            # shape V
            p = softmax(self.params.U.dot(h))
            ys += [multinomial_sample(p)]
            J += -log(p[ys[t]])
            if ys[t] == end:
                break
            t += 1


        #### YOUR CODE HERE ####
        return ys, J
Example No. 36
    def generate_sequence(self, init, end, maxlen=100):
        """
        Generate a sequence from the language model,
        by running the RNN forward and selecting,
        at each timestep, a random word from the
        emitted probability distribution.

        The MultinomialSampler class (in nn.math) may be helpful
        here for sampling a word. Use as:

            y = multinomial_sample(p)

        to sample an index y from the vector of probabilities p.


        Arguments:
            init = index of start word (word_to_num['<s>'])
            end = index of end word (word_to_num['</s>'])
            maxlen = maximum length to generate

        Returns:
            ys = sequence of indices
            J = total cross-entropy loss of generated sequence
        """

        J = 0  # total loss
        ys = [init]  # emitted sequence

        hs = zeros((maxlen + 1, self.hdim))
        ps = zeros((maxlen, self.vdim))

        for w in range(maxlen):
            z1 = self.params.H.dot(hs[w - 1]) + self.sparams.L[ys[w]]
            hs[w] = sigmoid(z1)
            z2 = self.params.U.dot(hs[w])
            ps[w] = softmax(z2)
            y = multinomial_sample(ps[w])
            ys.append(y)
            J += -log(ps[w][y])
            if y == end:
                break

        return ys, J
Example No. 37
    def generate_sequence(self, init, end, maxlen=100):
        """
        Generate a sequence from the language model,
        by running the RNN forward and selecting,
        at each timestep, a random word from the
        emitted probability distribution.

        The MultinomialSampler class (in nn.math) may be helpful
        here for sampling a word. Use as:

            y = multinomial_sample(p)

        to sample an index y from the vector of probabilities p.


        Arguments:
            init = index of start word (word_to_num['<s>'])
            end = index of end word (word_to_num['</s>'])
            maxlen = maximum length to generate

        Returns:
            ys = sequence of indices
            J = total cross-entropy loss of generated sequence
        """

        J = 0 # total loss
        ys = [init] # emitted sequence

        #### YOUR CODE HERE ####

        hs = zeros((maxlen+1, self.hdim))

        for t in xrange(maxlen):
            hs[t] = sigmoid(dot(self.params.H, hs[t - 1]) + self.sparams.L[ys[t]])
            y_hat = softmax(dot(self.params.U, hs[t]))
            y_index = multinomial_sample(y_hat)
            ys.append(y_index)
            J -= log(y_hat[y_index])
            if y_index == end:
                break

        #### YOUR CODE HERE ####
        return ys, J
Example No. 38
    def predict(self, xs):
        n_aspect = N_ASPECTS
        sent_dim = SENT_DIM
        #### YOUR CODE HERE ####
        # hs[-1] = initial hidden state (zeros)
        ns = len(xs)
        hs = zeros((ns + 1, self.hdim))

        for t in range(ns):
            hs[t] = sigmoid(
                self.params.H.dot(hs[t - 1, :]) + self.sparams.L[xs[t]])

        h_final = hs[ns - 1]
        z = self.params.U.dot(h_final)
        y_hat = []
        for i in range(n_aspect):
            current = z[sent_dim * i:sent_dim * (i + 1)]
            y_hat.extend(softmax(current))
        return y_hat
Example No. 39
    def generate_sequence(self, init, end, maxlen=100):
        """
        Generate a sequence from the language model,
        by running the RNN forward and selecting,
        at each timestep, a random word from the
        emitted probability distribution.

        The MultinomialSampler class (in nn.math) may be helpful
        here for sampling a word. Use as:

            y = multinomial_sample(p)

        to sample an index y from the vector of probabilities p.


        Arguments:
            init = index of start word (word_to_num['<s>'])
            end = index of end word (word_to_num['</s>'])
            maxlen = maximum length to generate

        Returns:
            ys = sequence of indices
            J = total cross-entropy loss of generated sequence
        """

        J = 0  # total loss
        ys = [init]  # emitted sequence

        #### YOUR CODE HERE ####
        t = 0
        hs = zeros(self.hdim)
        while True:
            if (len(ys) > maxlen) or (ys[-1] == end):
                break
            hs = sigmoid(self.params.H.dot(hs) + self.sparams.L[ys[t], :])
            ps = softmax(self.params.U.dot(hs))
            y = multinomial_sample(ps)
            J += -log(ps[y])
            ys.append(y)
            t += 1
        #### YOUR CODE HERE ####
        return ys, J
Example No. 40
    def _acc_grads(self, window, label):
        """
        Accumulate gradients, given a training point
        (window, label) of the format

        window = [x_{i-1} x_{i} x_{i+1}] # three ints
        label = {0,1,2,3,4} # single int, gives class

        Your code should update self.grads and self.sgrads,
        in order for gradient_check and training to work.

        So, for example:
        self.grads.U += (your gradient dJ/dU)
        self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
        """
        #### YOUR CODE HERE ####

        ##
        # Forward propagation
        x = concatenate([self.sparams.L[w] for w in window])
        z1 = self.params.W.dot(x) + self.params.b1
        h = 2 * sigmoid(2 * z1) - 1
        z2 = self.params.U.dot(h) + self.params.b2
        p = softmax(z2)
        y = make_onehot(label, len(p))
        ##
        # Backpropagation
        # compute the gradients w.r.t cross-entropy loss
        delta1 = p - y
        # dJ/dU, dJ/db2
        self.grads.U += outer(delta1, h) + self.lreg * self.params.U
        self.grads.b2 += delta1

        # dJ/dW, dJ/db1
        delta2 = self.params.U.T.dot(delta1) * (1 - h**2)
        self.grads.W += outer(delta2, x) + self.lreg * self.params.W
        self.grads.b1 += delta2

        # dj/dLi
        for i, w_chunk in enumerate(split(self.params.W, len(window), axis=1)):
            self.sgrads.L[window[i]] = w_chunk.T.dot(delta2)
Example No. 41
    def backward(self, m, lambd=0.1):
        self.dA = np.dot(self.next_layer.W.T, self.next_layer.dZ)

        if self.activation == 'relu':
            self.dZ = np.multiply(self.dA, np.int64(self.A > 0))

        elif self.activation == 'sigmoid':
            s = sigmoid(self.Z)
            self.dZ = self.dA * s * (1 - s)

        elif self.activation == 'linear':
            self.dZ = self.dA

        elif self.activation == 'tanh':
            self.dZ = self.dA * (1 - np.power(self.A, 2))

        self.dW = (1 / m) * np.dot(self.dZ,
                                   self.prev_layer.A.T) + (lambd / m) * self.W
        self.db = (1 / m) * np.sum(self.dZ, axis=1, keepdims=True)

        self.prev_layer.backward(m)
Example No. 42
    def compute_seq_loss(self, xs, ys, d):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        ns = len(xs)
        hs = zeros((ns+1, self.hdim))
        d_vec = self.sparams.D[d]
        for t in xrange(ns):
            x_t = xs[t]
            zs_t = self.params.H.dot(hs[t-1]) + self.sparams.L[x_t] + d_vec
            hs[t] = sigmoid(zs_t)
            ps_t = softmax(self.params.U.dot(hs[t]) + self.params.G.dot(d_vec.T).reshape(self.vdim,))
            J += -1*log(ps_t[ys[t]])
        return J
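Examples No. 4, 34 and 42 condition the plain RNN language model on a document vector d_vec = D[d]: the recurrence becomes z_t = H.dot(h(t-1)) + L[x_t] + d_vec and the output distribution becomes softmax(U.dot(h(t)) + G.dot(d_vec)). A shape-only sketch with toy random parameters (all names and sizes invented for illustration; the document vector is assumed to have the same size as the hidden state, as the addition in the recurrence requires):

import numpy as np

np.random.seed(2)
hdim, vdim = 4, 6
H = np.random.randn(hdim, hdim) * 0.1
L = np.random.randn(vdim, hdim) * 0.1
U = np.random.randn(vdim, hdim) * 0.1
G = np.random.randn(vdim, hdim) * 0.1   # projects the document vector into output scores
d_vec = np.random.randn(hdim) * 0.1     # document vector

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

xs, ys = [1, 3], [3, 0]
h, J = np.zeros(hdim), 0.0
for x, y in zip(xs, ys):
    h = sigmoid(H.dot(h) + L[x] + d_vec)   # document vector enters every timestep's input
    p = softmax(U.dot(h) + G.dot(d_vec))   # and biases the output scores
    J -= np.log(p[y])
print(J)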
Example No. 43
    def _acc_grads_batch(self, X, Y):
        """
        Accumulate gradients from a batch of training examples:
        X = matrix of inputs, Y = vector of targets.
        TODO: the hidden-layer average activation is computed in a separate
        pass; this can be rewritten to be twice as fast.
        """
        # First compute the average activation over the examples
        ro_hat = np.zeros_like(self.params.b1)
        for i in range(len(Y)):
            x = X[i]
            _, h = self.forward_pass(x)
            ro_hat += h
        ro_hat /= float(len(Y))


        ##
        # Forward propagation
        for i in range(len(Y)):
            x = X[i]
            y = Y[i]
            z1 = self.params.W.dot(x) + self.params.b1
            h = sigmoid(z1)
            z2 = np.dot(self.params.U, h) + self.params.b2
            y_hat = z2
            
            d2 = (y_hat - y) 
            #d2 *= (1./len(y))
            self.grads.b2 += d2
            self.grads.U += np.outer(d2, h) + self.lreg * self.params.U
            
            # incorporate kld gradient into d1
            kl_grad = self.beta * (- self.ro / ro_hat + 
                    (1. - self.ro) / (1 - ro_hat))
            d1 = (np.dot(self.params.U.T, d2) + kl_grad) * sigmoid_grad(z1)
            
            self.grads.W += np.outer(d1, x) + self.lreg * self.params.W
            self.grads.b1 += d1
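The kl_grad term above is the derivative of a sparsity penalty of the form beta * sum_j KL(ro || ro_hat_j), where KL(ro || r) = ro*log(ro/r) + (1-ro)*log((1-ro)/(1-r)); its derivative with respect to r is -ro/r + (1-ro)/(1-r), which is the expression used in the code. A small standalone finite-difference check of that derivative, with made-up numbers:

import numpy as np

ro = 0.05                                  # target average activation
r = np.array([0.02, 0.1, 0.5])             # example values of ro_hat

def kl(ro, r):
    return ro * np.log(ro / r) + (1 - ro) * np.log((1 - ro) / (1 - r))

analytic = -ro / r + (1 - ro) / (1 - r)
eps = 1e-6
numeric = (kl(ro, r + eps) - kl(ro, r - eps)) / (2 * eps)
print(np.max(np.abs(analytic - numeric)))  # should be tiny (roughly 1e-9 or below)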
Example No. 44
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        ns = len(xs)
        hs = np.zeros((ns + 1, self.hdim))
        for i in range(ns):
            hs[i +
               1] = sigmoid(self.params.H.dot(hs[i]) + self.sparams.L[xs[i]])
            p = softmax(self.params.U.dot(hs[i + 1]))
            p = p * make_onehot(ys[i], self.vdim)
            J += -np.log(np.sum(p))
        #### END YOUR CODE ####
        return J
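Example No. 44 (and Example No. 30) pick out the target probability by masking with make_onehot and summing, i.e. -log(sum(p * make_onehot(y, V))); this is the same value as the -log(p[y]) used in the other examples, just written through a mask. A one-off check with a toy probability vector:

import numpy as np

p = np.array([0.1, 0.2, 0.3, 0.4])        # some probability vector
y = 2
onehot = np.zeros_like(p)
onehot[y] = 1.0
print(np.isclose(-np.log(np.sum(p * onehot)), -np.log(p[y])))   # True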
Example No. 45
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        ns = len(xs)
        hs = zeros((ns + 1, self.hdim))
        ps = zeros((ns, self.vdim))
        for t in range(0, ns):
            hs[t] = sigmoid(
                self.params.H.dot(hs[t - 1]) + self.sparams.L[xs[t], :])
            ps[t] = softmax(self.params.U.dot(hs[t]))
            J += -log(ps[t][ys[t]])
        #### END YOUR CODE ####
        return J
Example No. 46
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        h = zeros(self.hdim)
        for t in range(len(xs)):
            h = sigmoid(self.params.H.dot(h) + self.sparams.L[xs[t]])
            pred = softmax(self.params.U.dot(h))
            J += -1 * log(pred[ys[t]])

        #pdb.set_trace()

        #### END YOUR CODE ####
        return J
Example No. 47
    def predict(self, xs):
        # predicts yhat based on xs

        # Expect xs as list of indices
        ns = len(xs)

        # make matrix here of corresponding h(t)
        # hs[-1] = initial hidden state (zeros)
        hs = zeros((ns+1, self.hdim))
        # predicted probas
        #ps = zeros((ns, self.vdim))
        yhat = None

        ##
        # Forward propagation
        # hs[-1] is conveniently all 0s (always)
        for t in xrange(ns):
            theta_t = dot(self.params.H, hs[t-1]) + self.sparams.L[xs[t]]
            hs[t] = sigmoid(theta_t)
            if t == ns - 1:
                yhat = softmax(dot(self.params.U, hs[t]))

        return yhat
Example No. 48
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        ns = len(xs)
        hs = zeros((ns + 1, self.hdim))
        ps = zeros((ns, self.vdim))

        for i in range(ns):
            z1 = self.params.H.dot(hs[i - 1]) + self.sparams.L[xs[i]]
            hs[i] = sigmoid(z1)
            z2 = self.params.U.dot(hs[i])
            ps[i] = softmax(z2)

        J = sum(-log(ps[range(len(ps)), ys]))

        return J
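
The loss line above uses NumPy fancy indexing to pick out, for every timestep, the probability assigned to the correct word. A small standalone demonstration of the same indexing pattern with made-up numbers:

import numpy as np

ps = np.array([[0.7, 0.2, 0.1],
               [0.1, 0.6, 0.3]])     # per-timestep predicted distributions
ys = [0, 1]                          # correct word index at each timestep
picked = ps[range(len(ps)), ys]      # array([0.7, 0.6])
loss = np.sum(-np.log(picked))       # summed cross-entropy, as in compute_seq_loss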
Example No. 49
    def forward(self, X):
        """Forward propagation
        
        Parameter:
        X: A numpy array of size (n, m)
            n: the number of features (excluding the bias term)
            m: the number of examples"""
        self.Z = np.dot(self.W, X)
        if self.use_bias:
            self.Z = self.Z + self.b

        if self.activation == 'sigmoid':
            self.A = sigmoid(self.Z)
        elif self.activation == 'relu':
            self.A = relu(self.Z)
        elif self.activation == 'softmax':
            self.A = softmax(self.Z)
        elif self.activation == 'tanh':
            self.A = tanh(self.Z)
        else:
            self.A = linear(self.Z)

        if self.next_layer is not None:
            # pass the activation (not the pre-activation) on to the next layer
            return self.next_layer.forward(self.A)
        return self.A
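
For contrast, a minimal two-layer chain in plain NumPy showing why the next layer must receive the activation A rather than the pre-activation Z (shapes and values are illustrative only):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

rng = np.random.default_rng(0)
X = rng.standard_normal((4, 5))   # 4 features, 5 examples
W1 = rng.standard_normal((3, 4))
W2 = rng.standard_normal((2, 3))

Z1 = W1 @ X
A1 = sigmoid(Z1)                  # layer-1 activation
Z2 = W2 @ A1                      # layer 2 consumes A1, not Z1
A2 = sigmoid(Z2)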
Example No. 50
    def _acc_grads(self, xs, ys):
        """
        Accumulate gradients, given a pair of training sequences:
        xs = [<indices>] # input words
        ys = [<indices>] # output words (to predict)

        Your code should update self.grads and self.sgrads,
        in order for gradient_check and training to work.

        So, for example:
        self.grads.H += (your gradient dJ/dH)
        self.sgrads.L[i] = (gradient dJ/dL[i]) # update row

        Per the handout, you should:
            - make predictions by running forward in time
                through the entire input sequence
            - for *each* output word in ys, compute the
                gradients with respect to the cross-entropy
                loss for that output word
            - run backpropagation-through-time for self.bptt
                timesteps, storing grads in self.grads (for H, U)
                and self.sgrads (for L)

        You'll want to store your predictions \hat{y}(t)
        and the hidden layer values h(t) as you run forward,
        so that you can access them during backpropagation.

        At time 0, you should initialize the hidden layer to
        be a vector of zeros.
        """

        # Expect xs as list of indices
        ns = len(xs)

        # make matrix here of corresponding h(t)
        # hs[-1] = initial hidden state (zeros)
        hs = zeros((ns+1, self.hdim))
        # predicted probas
        ps = zeros((ns, self.vdim))

        #### YOUR CODE HERE ####

        ##
        # Forward propagation

        for t in xrange(ns):
            hs[t] = sigmoid(dot(self.params.H, hs[t - 1]) + self.sparams.L[xs[t]])
            ps[t] = softmax(dot(self.params.U, hs[t]))

        ##
        # Backward propagation through time

        for j in xrange(ns):
            y = make_onehot(ys[j], self.vdim)
            y_hat_minus_y = ps[j] - y
            self.grads.U += outer(y_hat_minus_y, hs[j])
            delta = dot(self.params.U.T, y_hat_minus_y) * hs[j] * (1.0 - hs[j])

            # start at j and go back self.bptt times (total self.bptt + 1 elements, including current one)
            for t in xrange(j, j - self.bptt - 1, -1):
                if t - 1 >= -1:
                    self.grads.H += outer(delta, hs[t - 1])
                    self.sgrads.L[xs[t]] = delta
                    delta = dot(self.params.H.T, delta) * hs[t - 1] * (1.0 - hs[t - 1])
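
A quick way to sanity-check a _acc_grads implementation like this one is a centered finite-difference comparison against compute_seq_loss. The sketch below is illustrative only: it assumes a model instance rnn whose grads.H and params.H behave like plain NumPy arrays, and it relies on the fact that this particular example adds no regularization term to the gradients:

import numpy as np

def check_dJ_dH(rnn, xs, ys, i=0, j=0, eps=1e-4):
    # analytic gradient for a single entry of H
    rnn.grads.H[:] = 0.0
    rnn._acc_grads(xs, ys)
    analytic = rnn.grads.H[i, j]

    # centered finite-difference estimate of the same entry
    old = rnn.params.H[i, j]
    rnn.params.H[i, j] = old + eps
    J_plus = rnn.compute_seq_loss(xs, ys)
    rnn.params.H[i, j] = old - eps
    J_minus = rnn.compute_seq_loss(xs, ys)
    rnn.params.H[i, j] = old

    numeric = (J_plus - J_minus) / (2.0 * eps)
    return abs(analytic - numeric)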
Example No. 51
    def _acc_grads(self, xs, ys):
        """
        Accumulate gradients, given a pair of training sequences:
        xs = [<indices>] # input words
        ys = [<indices>] # output words (to predict)

        Your code should update self.grads and self.sgrads,
        in order for gradient_check and training to work.

        So, for example:
        self.grads.H += (your gradient dJ/dH)
        self.sgrads.L[i] = (gradient dJ/dL[i]) # update row

        Per the handout, you should:
            - make predictions by running forward in time
                through the entire input sequence
            - for *each* output word in ys, compute the
                gradients with respect to the cross-entropy
                loss for that output word
            - run backpropagation-through-time for self.bptt
                timesteps, storing grads in self.grads (for H, U)
                and self.sgrads (for L)

        You'll want to store your predictions \hat{y}(t)
        and the hidden layer values h(t) as you run forward,
        so that you can access them during backpropagation.

        At time 0, you should initialize the hidden layer to
        be a vector of zeros.
        """

        # Expect xs as list of indices
        ns = len(xs)

        # make matrix here of corresponding h(t)
        # hs[-1] = initial hidden state (zeros)
        hs = zeros((ns + 1, self.hdim))
        # predicted probas
        ps = zeros((ns, self.vdim))

        #### YOUR CODE HERE ####
        for i in xrange(ns):
            hs[i] = sigmoid(
                self.params.H.dot(hs[i - 1]) + self.sparams.L[xs[i]])
            ps[i] = softmax(self.params.U.dot(hs[i]))
        ##
        # Backward propagation through time

        for i in xrange(ns):
            delta_pre = ps[i]
            delta_pre[ys[i]] -= 1
            self.grads.U += outer(delta_pre, hs[i])

            delta = self.params.U.T.dot(delta_pre) * hs[i] * (1 - hs[i])

            # run BPTT for at most self.bptt steps back from timestep i
            j = i
            while j >= 0 and j >= (i - self.bptt):
                self.grads.H += outer(delta, hs[j - 1])
                self.sgrads.L[xs[j]] = delta
                delta = self.params.H.T.dot(delta) * hs[j - 1] * (1 - hs[j - 1])
                j -= 1
Example No. 52
    def _acc_grads(self, xs, ys):
        """
        Accumulate gradients, given a pair of training sequences:
        xs = [<indices>] # input words
        ys = [<indices>] # output words (to predict)

        Your code should update self.grads and self.sgrads,
        in order for gradient_check and training to work.

        So, for example:
        self.grads.H += (your gradient dJ/dH)
        self.sgrads.L[i] = (gradient dJ/dL[i]) # update row

        Per the handout, you should:
            - make predictions by running forward in time
                through the entire input sequence
            - for *each* output word in ys, compute the
                gradients with respect to the cross-entropy
                loss for that output word
            - run backpropagation-through-time for self.bptt
                timesteps, storing grads in self.grads (for H, U)
                and self.sgrads (for L)

        You'll want to store your predictions \hat{y}(t)
        and the hidden layer values h(t) as you run forward,
        so that you can access them during backpropagation.

        At time 0, you should initialize the hidden layer to
        be a vector of zeros.
        """

        # Expect xs as list of indices
        ns = len(xs)

        # make matrix here of corresponding h(t)
        # hs[-1] = initial hidden state (zeros)
        hs = zeros((ns + 1, self.hdim))
        # predicted probas
        ps = zeros((ns, self.vdim))

        #### YOUR CODE HERE ####
        ##
        # Forward propagation
        for step in xrange(0, ns):
            hs[step] = sigmoid(self.params.H.dot(hs[step - 1]) + self.sparams.L[xs[step]])
            ps[step] = softmax(self.params.U.dot(hs[step]))

        ##
        # Backward propagation through time
        for step in xrange(ns - 1, -1, -1):
            # one-hot target for this timestep
            t = zeros(ps[step].shape)
            t[ys[step]] = 1
            delta_out = ps[step] - t
            self.grads.U += outer(delta_out, hs[step])

            delta_hidden = delta_out.dot(self.params.U) * sigmoid_grad(hs[step])

            for step_bp in xrange(step, step - self.bptt - 1, -1):
                if step_bp < 0:
                    break
                self.grads.H += outer(delta_hidden, hs[step_bp - 1])
                self.sgrads.L[xs[step_bp]] = delta_hidden
                delta_hidden = delta_hidden.dot(self.params.H) * sigmoid_grad(hs[step_bp - 1])
Example No. 53
    def _acc_grads(self, X, y, just_probs=False):
        """
        Accumulate gradients, given a pair of training sequences:
        X = input word vectors (N x WvecDim matrix)
        y = document classification (as an integer)

        Your code should update self.grads and self.sgrads,
        in order for gradient_check and training to work.

        So, for example:
        self.grads.H += (your gradient dJ/dH)
        self.sgrads.L[i] = (gradient dJ/dL[i]) # update row

        Per the handout, you should:
            - make predictions by running forward in time
                through the entire input sequence
            - for *each* output word in ys, compute the
                gradients with respect to the cross-entropy
                loss for that output word
            - run backpropagation-through-time for self.bptt
                timesteps, storing grads in self.grads (for H, U)
                and self.sgrads (for L)

        You'll want to store your predictions \hat{y}(t)
        and the hidden layer values h(t) as you run forward,
        so that you can access them during backpropagation.

        At time 0, you should initialize the hidden layer to
        be a vector of zeros.
        """

        # Expect X as a matrix of word vectors
        # ith row is a word embedding for the ith word
        ns = X.shape[0]

        #### YOUR CODE HERE ####

        ##############
        # FORWARD PROP

        # X.shape = (ns, Dw)

        #### A1
        # A1.shape = (ns, Dh)
        A1 = sigmoid((self.params.W11.dot(X.T)).T + self.params.b11)

        assert A1.shape == (ns, self.hdim)

        # if dropout set A1
        if self.drop_p > 0.:
            A1[random.rand(*A1.shape) <= self.drop_p] = 0.

        #### A2
        # A2.shape = (ns, Dh)
        A2 = sigmoid((self.params.W12.dot(A1.T)).T + self.params.b12)

        assert A2.shape == (ns, self.hdim)

        # if dropout set A2
        if self.drop_p > 0.:
            A2[random.rand(*A2.shape) <= self.drop_p] = 0.

        #### MAX POOLING
        # Max each node of A over time (max of each column over all rows)
        # use argmax for use in backprop
        mx = argmax(A2,0)

        # Max pooling vector
        # this will select max elements of A:
        # h.shape == (Dh,)
        h1 = A2[mx,list(range(len(mx)))]
        assert h1.shape == (self.hdim,)

        #### HIDDEN POOLED LAYER
        h2 = sigmoid(self.params.W21.dot(h1) + self.params.b21)
        assert h2.shape == (self.hdim,)

        # prediction probabilities
        ps = softmax(self.params.Ws.dot(h2) + self.params.bs)

        if just_probs: return ps

        #############
        # BACK PROP

        y = array(y).astype(int)

        #### SOFTMAX LAYER
        err_o = ps
        err_o[y] += -1

        self.grads.Ws += outer(err_o, h2)
        self.grads.bs += err_o

        err_h2 = self.params.Ws.T.dot(err_o) * h2 * (1-h2)
        assert err_h2.shape == (self.hdim,)

        #### HIDDEN POOLED LAYER

        self.grads.W21 += outer(err_h2, h1)
        self.grads.b21 += err_h2

        err_h_max = self.params.W21.T.dot(err_h2) * h1 * (1-h1)
        assert err_h_max.shape == (self.hdim,)

        #### HIDDEN UNPOOLED LAYER 2
        # the inputs to hidden unpooled layers
        # for the examples that went in to the argmax instance of each node
        A1_max = A1[mx,:]
        assert A1_max.shape == (self.hdim, self.hdim)


        # How to multiply by the same thing in each row (columnwise multiplication)
        # zeros((10,5)) * reshape(range(10), (10,))[:,newaxis]
        self.grads.W12 += A1_max * err_h_max[:,newaxis]
        self.grads.b12 += err_h_max

        err_A2 = zeros((ns,self.hdim))
        err_A2[mx,list(range(len(mx)))] = err_h_max
        assert err_A2.shape == (ns, self.hdim)

        err_a1_max = self.params.W12.T.dot(err_A2.T).T * A1*(1-A1)
        assert err_a1_max.shape == (ns,self.hdim)

        #### HIDDEN UNPOOLED LAYER 1

        self.grads.W11 += err_a1_max.T.dot(X)
        self.grads.b11 += sum(err_a1_max,axis=0)

        #### REGULARIZATION
        self.grads.W11 += self.rho * self.params.W11
        self.grads.W12 += self.rho * self.params.W12
        self.grads.W21 += self.rho * self.params.W21
        self.grads.Ws += self.rho * self.params.Ws
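
The max-pooling backprop above routes the incoming error only to the timestep that produced the maximum for each hidden unit. A small standalone illustration of that routing with made-up activations:

import numpy as np

A2 = np.array([[0.2, 0.9],
               [0.8, 0.1],
               [0.5, 0.4]])                  # (ns=3 timesteps, hdim=2 units)
mx = np.argmax(A2, 0)                        # winning timestep per unit -> [1, 0]
h1 = A2[mx, range(A2.shape[1])]              # pooled vector -> [0.8, 0.9]

err_h_max = np.array([0.3, -0.2])            # error arriving at the pooled layer
err_A2 = np.zeros_like(A2)
err_A2[mx, range(A2.shape[1])] = err_h_max   # only the argmax positions receive gradient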
Example No. 54
    def forward_pass(self, x):
        """Compute the final output scores and the hidden layer."""
        z1 = self.params.W.dot(x) + self.params.b1
        h = sigmoid(z1)
        z2 = np.dot(self.params.U, h) + self.params.b2
        return (z2, h)
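
forward_pass returns the raw output scores z2 (pre-softmax) together with the hidden activation h, so a caller would typically turn z2 into probabilities and a cross-entropy loss. A self-contained sketch with made-up scores (the true-label index y is illustrative):

import numpy as np

z2 = np.array([1.5, -0.3, 0.2])   # stand-in for the scores returned by forward_pass
y = 0                             # illustrative true class index
p = np.exp(z2 - np.max(z2))
p /= p.sum()                      # softmax over the scores
loss = -np.log(p[y])              # cross-entropy for the true class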
Example No. 55
    def _acc_grads(self, xs, ys):
        """
        Accumulate gradients, given a pair of training sequences:
        xs = [<indices>] # input words
        ys = [<indices>] # output words (to predict)

        Your code should update self.grads and self.sgrads,
        in order for gradient_check and training to work.

        So, for example:
        self.grads.H += (your gradient dJ/dH)
        self.sgrads.L[i] = (gradient dJ/dL[i]) # update row

        Per the handout, you should:
            - make predictions by running forward in time
                through the entire input sequence
            - for *each* output word in ys, compute the
                gradients with respect to the cross-entropy
                loss for that output word
            - run backpropagation-through-time for self.bptt
                timesteps, storing grads in self.grads (for H)
                and self.sgrads (for L,U)

        You'll want to store your predictions \hat{y}(t)
        and the hidden layer values h(t) as you run forward,
        so that you can access them during backpropagation.

        At time 0, you should initialize the hidden layer to
        be a vector of zeros.
        """

        # Expect xs as list of indices
        ns = len(xs)

        # make matrix here of corresponding h(t)
        # hs[-1] = initial hidden state (zeros)
        hs = zeros((ns+1, self.hdim))
        # only the final softmax prediction is needed
        yhat = None

        #### YOUR CODE HERE ####

        ##
        # Forward propagation
        # hs[-1] is the zero initial state
        for t in xrange(ns):
            theta_t = dot(self.params.H, hs[t-1]) + self.sparams.L[xs[t]]
            hs[t] = sigmoid(theta_t)
            if t == ns - 1:
                yhat = softmax(dot(self.params.U, hs[t]))

        ##
        # Backward propagation through time
        def get_delta_i(delta_next, t):
            ddht = dot(transpose(self.params.H), delta_next)
            return ddht * hs[t] * (1- hs[t])

        # only the final timestep produces an output here
        t = ns - 1

        dJ_dUht = yhat
        dJ_dUht[ys] -= 1 # (-y + yhat)

        self.grads.U += outer(dJ_dUht, hs[t])
        dJ_dht = dot(transpose(self.params.U), dJ_dUht) # h(t) = sig(theta)
        dJ_dThetat = dJ_dht * (hs[t]) * (1 - hs[t])
        delta_t = dJ_dThetat
            
        # BPTT
        delta_next = None
        i = t
        while i >= max(t - self.bptt + 1, 0):
            # note that bptt=1 means we only run it on regular t
            delta_i = get_delta_i(delta_next, i) if i != t else delta_t
            self.sgrads.L[xs[i]] = delta_i
            self.grads.H += outer(delta_i, hs[i-1])

            delta_next = delta_i
            i -= 1


        # regularization
        self.grads.H += self.lamb * self.params.H
        self.grads.U += self.lamb * self.params.U
Example No. 56
def tanh(x):
    return 2 * sigmoid(2 * x) - 1.0
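
This uses the identity tanh(x) = 2*sigmoid(2x) - 1; a quick numerical check against NumPy's own tanh:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

x = np.linspace(-5.0, 5.0, 101)
assert np.allclose(2 * sigmoid(2 * x) - 1.0, np.tanh(x))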
Example No. 57
    def _acc_grads(self, xs, ys):
        """
        Accumulate gradients, given a pair of training sequences:
        xs = [<indices>] # input words
        ys = [<indices>] # output words (to predict)

        Your code should update self.grads and self.sgrads,
        in order for gradient_check and training to work.

        So, for example:
        self.grads.H += (your gradient dJ/dH)
        self.sgrads.L[i] = (gradient dJ/dL[i]) # update row, matrix L: |V| * dim(h)

        Per the handout, you should:
            - make predictions by running forward in time
                through the entire input sequence
            - for *each* output word in ys, compute the
                gradients with respect to the cross-entropy
                loss for that output word
            - run backpropagation-through-time for self.bptt
                timesteps, storing grads in self.grads (for H, U)
                and self.sgrads (for L)

        You'll want to store your predictions \hat{y}(t)
        and the hidden layer values h(t) as you run forward,
        so that you can access them during backpropagation.

        At time 0, you should initialize the hidden layer to
        be a vector of zeros.
        """

        # Expect xs as list of indices
        ns = len(xs)

        # make matrix here of corresponding h(t)
        # hs[-1] = initial hidden state (zeros) # compact h(t)s to a matrix
        hs = zeros((ns + 1, self.hdim))
        # predicted probas
        ps = zeros((ns, self.vdim))

        # Forward propagation

        for i in range(ns):
            z1 = self.params.H.dot(hs[i - 1]) + self.sparams.L[xs[i]]
            hs[i] = sigmoid(z1)
            z2 = self.params.U.dot(hs[i])
            ps[i] = softmax(z2)

        # yhat - y for every timestep: subtract 1 at each target word's index
        yhat_y = ps.copy()
        yhat_y[arange(len(ys)), ys] -= 1.

        acc_grads_H = zeros_like(self.params.H)
        acc_grads_U = zeros_like(self.params.U)

        # Backward propagation through time
        for t in reversed(range(ns)):
            # start from the latest step, e.g. 4, 3, 2, 1, 0
            acc_grads_U += outer(yhat_y[t], hs[t])
            delta = self.params.U.T.dot(yhat_y[t]) * (hs[t] * (1 - hs[t]))

            # backpropagate through at most self.bptt earlier timesteps
            for s in range(min(t, self.bptt) + 1):
                acc_grads_H += outer(delta, hs[t - s - 1])
                self.sgrads.L[xs[t - s]] = delta
                delta = self.params.H.T.dot(delta) * (hs[t - s - 1] * (1 - hs[t - s - 1]))

        self.grads.H += acc_grads_H
        self.grads.U += acc_grads_U
Example No. 58
def d_tanh(x):
    return 4 * sigmoid(2.0 * x) * (1.0 - sigmoid(2.0 * x))
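
Likewise, 4*sigmoid(2x)*(1 - sigmoid(2x)) is the same function as the familiar derivative 1 - tanh(x)**2, which a short check confirms:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

x = np.linspace(-5.0, 5.0, 101)
d = 4 * sigmoid(2.0 * x) * (1.0 - sigmoid(2.0 * x))
assert np.allclose(d, 1.0 - np.tanh(x) ** 2)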