Example #1
    def compute_seq_ppl(self, xs, ys):
        #### YOUR CODE HERE ####
        J = 0
        ns = len(xs)
        hs = zeros((ns+1, self.hdim))
        cs = zeros((ns, self.cdim))
        # predicted probas
        ps = zeros((ns, self.Udim))

        #### YOUR CODE HERE ####
        L = self.sparams.L
        Lc = self.Lcluster
        cfreq = self.cfreq
        cwords = self.cwords
        direct_size = self.hsize
        U = self.params.U
        H = self.params.H
        C = zeros((self.cdim, self.hdim))
        if self.isCompression is True:
            C = self.params.C
        ##
        # Forward propagation
        for i in xrange(ns):
            hs[i+1] = sigmoid(H.dot(hs[i]) + L[xs[i]])
            #hs[i+1] = 2.0/(1 + exp(-2.0*(H.dot(hs[i]) + L[xs[i]]))) - 1
            #without maximum entropy optimization
            word_cluster = Lc[ys[i]]
            st_word = cwords[word_cluster, 0]
            ed_word = st_word + cfreq[word_cluster]
            
            part_cluster = zeros((self.class_size, ))
            part_word = zeros((ed_word - st_word, ))
            if self.isME is True:
                if direct_size > 0 and xs[i] != -1:
                    part_cluster += self.params.cluster_direct[xs[i]]
                    indexs = cwords[word_cluster, 0:int(cfreq[word_cluster])]
                    
                    if xs[i] < direct_size:
                        part_word += self.params.word_direct[xs[i], indexs]
            
            if self.isCompression is True:
                cs[i] = sigmoid(C.dot(hs[i+1]))
                part_cluster += U[self.vdim:].dot(cs[i])
                part_word += U[st_word:ed_word].dot(cs[i])
                ps[i, self.vdim:] = softmax(part_cluster)
                ps[i, st_word:ed_word] = softmax(part_word)
                
            else:
                part_cluster += U[self.vdim:].dot(hs[i+1])
                part_word += U[st_word:ed_word].dot(hs[i+1])
                
                ps[i, self.vdim:] = softmax(part_cluster)
                ps[i, st_word:ed_word] = softmax(part_word)
                #ps[i, self.vdim:] = softmax(U[self.vdim:,:].dot(hs[i+1]))
                #ps[i, st_word:ed_word] = softmax(U[st_word:ed_word,:].dot(hs[i+1]))
            
            #print maximum(ps[i, ys[st_word:ed_word]]), ps[i,ys[i]], maximum(ps[i, self.vdim:]), ps[i, self.vdim+word_cluster]
            J -= log(ps[i, ys[i]] * ps[i, self.vdim+word_cluster])
        
        return J
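
Note: Example #1 implements a class-based (hierarchical) output layer, so the per-step loss multiplies the within-cluster word probability ps[i, ys[i]] by the cluster probability ps[i, self.vdim + word_cluster]. The snippets in these examples also rely on a few NumPy helpers (sigmoid, softmax, make_onehot) defined elsewhere in the original assignment code; a minimal sketch of plausible definitions, assuming the standard formulations:

import numpy as np

def sigmoid(x):
    # elementwise logistic function
    return 1.0 / (1.0 + np.exp(-x))

def softmax(x):
    # numerically stable softmax over the last axis
    e = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e / np.sum(e, axis=-1, keepdims=True)

def make_onehot(i, n):
    # length-n one-hot vector with a 1 at position i
    v = np.zeros(n)
    v[i] = 1.0
    return v
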
Example #2
    def forwardProp(self, node, correct, guess):
        cost = total = 0.0
        if node.isLeaf == True:
            node.fprop = True
            node.hActs1 = self.L[:, node.word]
            node.probs = softmax(self.Ws.dot(node.hActs1) + self.bs)
            p = node.probs * make_onehot(node.label, len(self.bs))
            cost = -np.log(np.sum(p))
            correct.append(node.label)
            guess.append(np.argmax(node.probs))
            return cost, 1

        c1, t1 = self.forwardProp(node.left, correct, guess)
        c2, t2 = self.forwardProp(node.right, correct, guess)
        if node.left.fprop and node.right.fprop:
            node.fprop = True
            h = np.hstack([node.left.hActs1, node.right.hActs1])
            tmp = np.zeros(len(node.left.hActs1))
            for i in range(len(tmp)):
                tmp[i] = h.dot(self.V[i]).dot(h)
            node.hActs1 = self.ReLU(self.W.dot(h) + self.b + tmp)
            node.probs = softmax(self.Ws.dot(node.hActs1) + self.bs)
            p = node.probs * make_onehot(node.label, len(self.bs))
            cost = -np.log(np.sum(p))
            correct.append(node.label)
            guess.append(np.argmax(node.probs))

        cost += c1
        cost += c2
        total += t1
        total += t2
        return cost, total + 1
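
The per-slice loop above, tmp[i] = h.dot(self.V[i]).dot(h), is the bilinear tensor term of the recursive neural tensor network; it can be collapsed into a single contraction. A small self-contained sketch (the dimension d and the random arrays are only illustrative):

import numpy as np

d = 4                                    # hidden dimension (illustrative)
h = np.random.randn(2 * d)               # stacked child activations, as in hstack above
V = np.random.randn(d, 2 * d, 2 * d)     # one bilinear slice per hidden unit

# loop form, matching the example above
tmp_loop = np.array([h.dot(V[i]).dot(h) for i in range(d)])
# equivalent single contraction: tmp[i] = sum_{j,k} h[j] * V[i, j, k] * h[k]
tmp_vec = np.einsum('j,ijk,k->i', h, V, h)
assert np.allclose(tmp_loop, tmp_vec)
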
Example #3
    def forwardProp(self, node, correct=[], guess=[]):
        cost = total = 0.0
        # this is exactly the same setup as forwardProp in rnn.py
        if node.isLeaf == True:
            node.fprop = True
            node.hActs1 = self.L[:, node.word]
            node.hActs2 = self.ReLU(self.W2.dot(node.hActs1) + self.b2)
            node.probs = softmax(
                self.Ws.dot(node.hActs2 * self.mask) + self.bs)
            p = node.probs * make_onehot(node.label, len(self.bs))
            cost = -np.log(np.sum(p))
            correct.append(node.label)
            guess.append(np.argmax(node.probs))
            return cost, 1

        c1, t1 = self.forwardProp(node.left, correct, guess)
        c2, t2 = self.forwardProp(node.right, correct, guess)
        if node.left.fprop and node.right.fprop:
            node.fprop = True
            h = np.hstack([node.left.hActs1, node.right.hActs1])
            node.hActs1 = self.ReLU(self.W1.dot(h) + self.b1)
            node.hActs2 = self.ReLU(self.W2.dot(node.hActs1) + self.b2)
            node.probs = softmax(
                self.Ws.dot(node.hActs2 * self.mask) + self.bs)
            p = node.probs * make_onehot(node.label, len(self.bs))
            cost = -np.log(np.sum(p))
            correct.append(node.label)
            guess.append(np.argmax(node.probs))

        cost += c1
        cost += c2
        total += t1
        total += t2
        return cost, total + 1
Example #4
 def forwardProp(self,node, correct=[], guess=[]):
     cost  =  total = 0.0
     # this is exactly the same setup as forwardProp in rnn.py
     if node.isLeaf == True:
         node.fprop = True
         node.hActs1 = self.L[:,node.word]
         node.hActs2 = self.ReLU(self.W2.dot(node.hActs1)+self.b2)
         node.probs = softmax(self.Ws.dot(node.hActs2)+self.bs)
         p = node.probs*make_onehot(node.label,len(self.bs))
         cost = -np.log(np.sum(p))
         correct.append(node.label)
         guess.append(np.argmax(node.probs))
         return cost, 1
     
     c1,t1 = self.forwardProp(node.left,correct,guess)
     c2,t2 = self.forwardProp(node.right,correct,guess)
     if node.left.fprop and node.right.fprop:
         node.fprop = True
         h = np.hstack([node.left.hActs1, node.right.hActs1])
         node.hActs1 = self.ReLU(self.W1.dot(h) + self.b1)
         node.hActs2 = self.ReLU(self.W2.dot(node.hActs1) + self.b2)
         node.probs = softmax(self.Ws.dot(node.hActs2)+self.bs)
         p = node.probs*make_onehot(node.label,len(self.bs))
         cost = -np.log(np.sum(p))
         correct.append(node.label)
         guess.append(np.argmax(node.probs))
         
     cost += c1
     cost += c2
     total += t1
     total += t2
     return cost, total + 1
Example #5
 def forwardProp(self,node,correct, guess):
     cost = total = 0.0
     if node.isLeaf == True:
         node.fprop = True
         node.hActs1 = self.L[:, node.word]
         node.probs = softmax(self.Ws.dot(node.hActs1)+self.bs)
         p = node.probs*make_onehot(node.label, len(self.bs))
         cost = -np.log(np.sum(p))
         correct.append(node.label)
         guess.append(np.argmax(node.probs))
         return cost, 1
         
     c1,t1 = self.forwardProp(node.left,correct,guess)
     c2,t2 = self.forwardProp(node.right,correct,guess)
     if node.left.fprop and node.right.fprop:
         node.fprop = True
         h = np.hstack([node.left.hActs1, node.right.hActs1])
         tmp = np.zeros(len(node.left.hActs1))
         for i in range(len(tmp)):
             tmp[i] = h.dot(self.V[i]).dot(h)
         node.hActs1 = self.ReLU(self.W.dot(h) + self.b + tmp)
         node.probs = softmax(self.Ws.dot(node.hActs1)+self.bs)
         p = node.probs*make_onehot(node.label,len(self.bs))
         cost = -np.log(np.sum(p))
         correct.append(node.label)
         guess.append(np.argmax(node.probs))
         
     cost += c1
     cost += c2
     total += t1
     total += t2
     return cost, total + 1
Example #6
    def f_prop(self, ys, h_in):
        """Given a series of xs and a series of ys, returns hidden vector at
        end, and also the cost"""
        N = len(ys) # total num timesteps
        #L = self.params['L']
        Wh = self.params['Wh']
        #Wx = self.params['Wx']
        U = self.params['U']
        b1 = self.params['b1']
        b2 = self.params['b2']
        
        self.yhats = np.zeros([self.outdim, N])
        self.hs = np.zeros([self.hdim, N+1])
        # np.random.seed(2234)
        # self.hs[:,-1] = np.random.normal(0,.1,(self.hdim))
        self.hs[:,-1] = h_in

        cost = 0
        
        for t in xrange(N):
            h_prev = self.hs[:,t-1]
            z_1 = np.dot(Wh, h_prev) + b1 #+ np.dot(Wx, Lx)
            h1 = np.maximum(z_1, 0)
            self.hs[:,t] = h1
            yhat = softmax(np.dot(U, h1) + b2)
            self.yhats[:,t] = yhat
            cost += -np.log(yhat[ys[t]])

        return cost
Example #7
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        # Expect xs as list of indices
        ns = len(xs)

        # make matrix here of corresponding h(t)
        # hs[-1] = initial hidden state (zeros)
        # shape T x Dh
        hs = zeros((ns+1, self.hdim))
        # predicted probas, shape T x V
        ps = zeros((ns, self.vdim))

        for t in range(ns):
            # shape Dh
            hs[t] = sigmoid(self.params.H.dot(hs[t-1]) + self.sparams.L[xs[t]])
            # shape T x V
            ps[t] = softmax(self.params.U.dot(hs[t]))
            J += -log(ps[t,ys[t]])


        #### END YOUR CODE ####
        return J
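
Given the summed cross-entropy J returned here (natural log, as in compute_seq_ppl from Example #1), per-word perplexity is obtained by exponentiating the average per-timestep loss. A minimal sketch:

import numpy as np

def sequence_perplexity(J, ns):
    # J: summed cross-entropy over the sequence (natural log); ns: number of timesteps
    return np.exp(J / ns)
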
Example #8
    def predict_proba(self, windows):
        """
        Predict class probabilities.

        Should return a matrix P of probabilities,
        with each row corresponding to a row of X.

        windows = array (n x windowsize),
            each row is a window of indices
        """
        # handle singleton input by making sure we have
        # a list-of-lists
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]

        #### YOUR CODE HERE ####
        # TODO: Vectorize this

        P = zeros((len(windows), self.params.b2.shape[0]))
        for idx in range(0, len(windows)):
            # Forward propagation
            window = array(windows[idx])
            words = [
                self.sparams.L[window[0]], self.sparams.L[window[1]],
                self.sparams.L[window[2]]
            ]
            x = reshape(words, self.sparams.L.shape[1] * 3)  # 3n row vector
            z2 = self.params.W.dot(x) + self.params.b1
            a2 = tanh(z2)
            z3 = self.params.U.dot(a2) + self.params.b2
            a3 = softmax(z3)
            P[idx, :] = a3

        return P  # rows are output for each input
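
The TODO above (vectorize the per-window loop) can be addressed by embedding all windows at once. A sketch under the assumption that L is the (vocab x d) embedding matrix, W, U the layer weights and b1, b2 the biases:

import numpy as np

def predict_proba_vectorized(L, W, b1, U, b2, windows):
    windows = np.asarray(windows)              # (n, windowsize) integer array of word indices
    X = L[windows].reshape(len(windows), -1)   # (n, windowsize*d) concatenated embeddings
    H = np.tanh(X.dot(W.T) + b1)               # (n, hidden)
    Z = H.dot(U.T) + b2                        # (n, classes)
    Z -= Z.max(axis=1, keepdims=True)          # stabilize before exponentiating
    P = np.exp(Z)
    return P / P.sum(axis=1, keepdims=True)    # row-wise softmax
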
Example #9
    def compute_loss(self, windows, labels):
        """
        Compute the loss for a given dataset.
        windows = same as for predict_proba
        labels = list of class labels, for each row of windows
        """

        #### YOUR CODE HERE ####

        print "windows shape ", windows.shape 
        x = self.sparams.L[windows[:,0]]
        for i in range(len(windows[0])-1):
            x = np.concatenate((x,self.sparams.L[windows[:,i+1]]),axis=1)

        z = self.params.W.dot(x.T)+self.params.b1.reshape((self.params.b1.shape[0],1))
        h = tanh(z)
        p = softmax(self.params.U.dot(h)+self.params.b2.reshape((self.params.b2.shape[0],1)))
        labelArray = np.zeros((len(labels),self.params.b2.shape[0]))
        for i in range(len(labels)):
            labelArray[i] = make_onehot(labels[i],self.params.b2.shape[0])
        batch = len(labels)
        p = p*labelArray.T
        p = np.sum(p,axis=0)
        J = np.sum(-np.log(p))
        Jreg = batch*(self.lreg/2.0)*(np.sum(self.params.W**2)+np.sum(self.params.U**2))
        J += Jreg                    
        #### END YOUR CODE ####
        return J
Example #10
    def predict_proba(self, windows):
        """
        Predict class probabilities.

        Should return a matrix P of probabilities,
        with each row corresponding to a row of X.

        windows = array (n x windowsize),
            each row is a window of indices
        """
        # handle singleton input by making sure we have
        # a list-of-lists
        
        #hasattr(	object, name)
        #The arguments are an object and a string. The result is True if the string is the name of one of the object's
        #attributes, False if not. (This is implemented by calling getattr(object, name) and seeing whether it raises an
        #exception or not.)
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]

        #### YOUR CODE HERE ####
        P = []
        for window in windows:
            x = hstack(self.sparams.L[window])
            h = tanh(self.params.W.dot(x) + self.params.b1)
            p = softmax(self.params.U.dot(h) + self.params.b2)
            P.append(p)

        #### END YOUR CODE ####

        return P # rows are output for each input
Example #11
    def predict_proba(self, windows):
        """
        Predict class probabilities.

        Should return a matrix P of probabilities,
        with each row corresponding to a row of X.

        windows = array (n x windowsize),
            each row is a window of indices
        """
        # handle singleton input by making sure we have
        # a list-of-lists
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]

        #### YOUR CODE HERE ####
        # x - (W) -> a - (tanh) -> h - (U) -> z - (softmax) -> p
        P = []
        for window in windows: # Is it possible to use fully-vectorized method instead of for loop?
            x = hstack(self.sparams.L[window, :]) # the same as above 
            h = tanh(self.params.W.dot(x) + self.params.b1)
            p = softmax(self.params.U.dot(h) + self.params.b2)
            P.append(p)

        #### END YOUR CODE ####

        return array(P) # rows are output for each input
Example #12
    def f_prop(self, ys, h_in):
        """Given a series of xs and a series of ys, returns hidden vector at
        end, and also the cost"""
        N = len(ys)  # total num timesteps
        #L = self.params['L']
        Wh = self.params['Wh']
        #Wx = self.params['Wx']
        U = self.params['U']
        b1 = self.params['b1']
        b2 = self.params['b2']

        self.yhats = np.zeros([self.outdim, N])
        self.hs = np.zeros([self.hdim, N + 1])
        # np.random.seed(2234)
        # self.hs[:,-1] = np.random.normal(0,.1,(self.hdim))
        self.hs[:, -1] = h_in

        cost = 0

        for t in xrange(N):
            h_prev = self.hs[:, t - 1]
            z_1 = np.dot(Wh, h_prev) + b1  #+ np.dot(Wx, Lx)
            h1 = np.maximum(z_1, 0)
            self.hs[:, t] = h1
            yhat = softmax(np.dot(U, h1) + b2)
            self.yhats[:, t] = yhat
            cost += -np.log(yhat[ys[t]])

        return cost
Example #13
    def predict_proba(self, windows):
        """
        Predict class probabilities.

        Should return a matrix P of probabilities,
        with each row corresponding to a row of X.

        windows = array (n x windowsize),
            each row is a window of indices
        """
        # handle singleton input by making sure we have
        # a list-of-lists
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]

        #### YOUR CODE HERE ####
        N = len(windows)
        P = zeros((N, self.params.b2.shape[0]))

        for n in xrange(N):
            x = self.sparams.L[windows[n]]
            x = x.reshape((x.shape[0]*x.shape[1]))
            z = self.params.W.dot(x) + self.params.b1
            h = tanh(z)
            P[n,:] = softmax(self.params.U.dot(h) + self.params.b2)

        #### END YOUR CODE ####

        return P # rows are output for each input
Example #14
    def _acc_grads(self, x, label):
        """
        Accumulate gradients from a training example.
        """
        #import ipdb; ipdb.set_trace()
        ##
        # Forward propagation
        z1 = self.params.W.dot(x) + self.params.b1
        h1 = tanh(z1)
        z2 = np.dot(self.params.U, h1) + self.params.b2
        h2 = tanh(z2)
        z3 = np.dot(self.params.G, h2) + self.params.b3
        y_hat = softmax(z3)

        y = make_onehot(label, self.outputsize)
        d3 = y_hat - y
        self.grads.b3 += d3
        self.grads.G += np.outer(d3, h2) + self.lreg * self.params.G

        d2 = np.dot(self.params.G.T, d3) * tanhd(z2)

        self.grads.b2 += d2
        self.grads.U += np.outer(d2, h1) + self.lreg * self.params.U
        d1 = np.dot(self.params.U.T, d2) * tanhd(z1)

        self.grads.W += np.outer(d1, x) + self.lreg * self.params.W
        self.grads.b1 += d1
Example #15
    def predict_proba(self, windows):
        """
        Predict class probabilities.

        Should return a matrix P of probabilities,
        with each row corresponding to a row of X.

        windows = array (n x windowsize),
            each row is a window of indices
        """
        # handle singleton input by making sure we have
        # a list-of-lists
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]

        #### YOUR CODE HERE ####

        P = zeros((len(windows), self.params.U.shape[0]))

        for i, window in enumerate(windows):
            a1 = hstack(self.sparams.L[window, :])
            a2 = tanh(self.params.W.dot(a1) + self.params.b1)  # h
            y_hat = softmax(self.params.U.dot(a2) + self.params.b2)
            P[i, :] = y_hat

        #### END YOUR CODE ####

        return P  # rows are output for each input
Example #16
    def predict_proba(self, windows):
        """
        Predict class probabilities.

        Should return a matrix P of probabilities,
        with each row corresponding to a row of X.

        windows = array (n x windowsize),
            each row is a window of indices
        """
        # handle singleton input by making sure we have
        # a list-of-lists
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]

        #### YOUR CODE HERE ####

        #### END YOUR CODE ####
        P = zeros((len(windows), self.params.b2.shape[0]))  # (|V|, 5)
        for i in range(P.shape[0]):
            # Forward propagation
            x = array([self.sparams.L[w] for w in windows[i]]).reshape(self.sparams.L.shape[1] * len(windows[0]))
            a1 = x  # 3n = 150 input vector
            z1 = dot(self.params.W, a1) + self.params.b1  # 100 vector
            a2 = tanh(z1)  # 100 vector
            z2 = dot(self.params.U, a2) + self.params.b2  # 5 vector
            a3 = softmax(z2)  # 5 vector
            P[i, :] = a3

        return P  # rows are output for each input
Example #17
    def predict_proba(self, windows):
        """
        Predict class probabilities.

        Should return a matrix P of probabilities,
        with each row corresponding to a row of X.

        windows = array (n x windowsize),
            each row is a window of indices
        """
        # handle singleton input by making sure we have
        # a list-of-lists
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]

        #### YOUR CODE HERE ####
        # construct input matrix
        x = vstack([concatenate([self.sparams.L[idx] for idx in window]) for window in windows])
        z1 = self.params.W.dot(x.T) + self.params.b1[:, newaxis]
        h1 = 2 * sigmoid(2 * z1) - 1
        z2 = self.params.U.dot(h1) + self.params.b2[:, newaxis]
        P = softmax(z2.T)
        #### END YOUR CODE ####

        return P # rows are output for each input
Example #18
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        ns = len(xs)
        hs = np.zeros((ns + 1, self.hdim))
        for i in range(ns):
            hs[i + 1] = sigmoid(
                self.params.H.dot(hs[i]) +
                self.params.W.dot(self.sparams.L[xs[i]]))
            p = softmax(self.params.U.dot(hs[i + 1]))
            p = p * make_onehot(ys[i], self.vdim)
            J += -np.log(np.sum(p))
        #### END YOUR CODE ####

        Jreg = 0.5 * self.lreg * (np.sum(self.params.H**2) + np.sum(
            self.params.W**2) + np.sum(self.params.U**2))
        return J + Jreg
Example #19
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.
        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        ns = len(xs)
        self.xs = xs
        self.ys = ys

        hs = zeros((ns + 1, self.hdim))
        self.hs1 = hs
        # for each time step
        for t in xrange(ns):
            hs[t] = sigmoid(
                dot(self.params.H, hs[t - 1]) + self.sparams.L[xs[t]])
            y_hat = softmax(dot(self.params.U, hs[t]))
            J -= log(y_hat[ys[t]])

        #### END YOUR CODE ####
        return J
Example #20
def ModelOutputCenterWord(clf,word_to_num,num_to_word,num_to_tag,windowsize):
    d = len(clf.sparams.L[0])
    partW = clf.params.W[:,(windowsize/2)*d:(windowsize/2+1)*d]
    z = clf.sparams.L.dot(partW.T) + clf.params.b1 # z -> (N,h)
    h = clf.tanh(z)
    p = softmax(h.dot(clf.params.U.T)+clf.params.b2)  #p ->(N,C)
    
    outputLayer = collections.defaultdict(list)
    for i in range(len(p)):
        for j in range(len(p[i])):
            outputLayer[j].append((p[i][j],i))

    topN = 10
    topscores = np.zeros((len(clf.params.b2),topN))
    topwords = np.zeros((len(clf.params.b2),topN))
    for i in range(len(outputLayer)):
        a = sorted(outputLayer[i],numericalCmp)
        for j in range(topN):
            topscores[i][j] = a[j][0]
            topwords[i][j] = a[j][1]

    print "topscores -->"
    print topscores
    for i in range(1,5):
        print "Output Neuron %d: %s" % (i,num_to_tag[i])
        words = []
        for j in topwords[i]:
            words.append(num_to_word[j])
        print_scores(topscores[i],words)
Example #21
    def compute_loss(self, windows, labels):
        """
        Compute the loss for a given dataset.
        windows = same as for predict_proba
        labels = list of class labels, for each row of windows
        """

        #### YOUR CODE HERE ####
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]
            labels = [labels]

        N = len(windows)

        # x = self.sparams.L[windows]
        # x = x.reshape((N,x.shape[-2]*x.shape[-1]))
        # z = x.dot(self.params.W.T) + self.params.b1
        # h = tanh(z)
        # z2 = h.dot(self.params.U.T) + self.params.b2
        # p = softmax(z2)
        # J -= sum(log(p[0][labels])
        # J += (self.lreg / 2.0) * (sum(self.params.W**2.0) + sum(self.params.U**2.0))

        J = 0
        for n in xrange(N):
            x = self.sparams.L[windows[n]]
            x = reshape(x, x.shape[0]*x.shape[1])
            h = tanh(self.params.W.dot(x) + self.params.b1)
            y_hat = softmax(self.params.U.dot(h) + self.params.b2)
            y = make_onehot(labels[n], len(y_hat))
            J -= sum(y*log(y_hat))
        J += (self.lreg / 2.0) * (sum(self.params.W**2.0) + sum(self.params.U**2.0))
        #### END YOUR CODE ####
        return J
Example #22
    def _acc_grads(self, window, label):
        """
        Accumulate gradients, given a training point
        (window, label) of the format

        window = [x_{i-1} x_{i} x_{i+1}] # three ints
        label = {0,1,2,3,4} # single int, gives class

        Your code should update self.grads and self.sgrads,
        in order for gradient_check and training to work.

        So, for example:
        self.grads.U += (your gradient dJ/dU)
        self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
        """
        xf = []
        for idx in window:
            xf.extend( self.sparams.L[idx]) # extract representation
        tanhX = tanh(self.params.W.dot(xf) + self.params.b1)
        softmaxP = softmax(self.params.U.dot(tanhX) + self.params.b2)
        y = make_onehot(label, len(softmaxP))
        delta2 = softmaxP -y
        self.grads.U += outer(delta2, tanhX) + self.lreg * self.params.U
        self.grads.b2 += delta2
        delta1 = self.params.U.T.dot(delta2)*(1. - tanhX*tanhX)
        self.grads.W += outer(delta1, xf) + self.lreg * self.params.W
        self.grads.b1 += delta1
Example #23
    def predict_proba(self, windows):
        """
        Predict class probabilities.

        Should return a matrix P of probabilities,
        with each row corresponding to a row of X.

        windows = array (n x windowsize),
            each row is a window of indices
        """
        # handle singleton input by making sure we have
        # a list-of-lists
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]

        P = []
        for window in windows:
            # extract representation: concatenate window of words into a numpy column vector
            x = hstack(self.sparams.L[window, :])
            # just two layers, so simple
            h = tanh(self.params.W.dot(x) + self.params.b1)
            p = softmax(self.params.U.dot(h) + self.params.b2)
            P.append(p)

        return array(P) # rows are output for each input
Example #24
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        ns = len(xs)

        h_ant = zeros((1, self.hdim))

        J = 0
        #### YOUR CODE HERE ####
        for step in xrange(0,ns):
            # print "hs[step-1].shape %s" % (hs[step-1].shape,)
            # print "self.params.H.shape %s" % (self.params.H.shape,)
            # print "self.sparams.L.shape %s" % (self.sparams.L.shape,)
            # print "self.sparams.L[xs[step]].shape %s" % (self.sparams.L[xs[step]].shape,)
            a1 = self.params.H.dot(h_ant.T).T + self.sparams.L[xs[step]]
            h  = sigmoid( a1 )
            a2 = self.params.U.dot(h.T).T
            # print "h.shape %s" % (h.shape,)
            # print "a2.shape %s" % (a2.shape,)
            # print "self.params.U.shape %s" % (self.params.U.shape,)
            y_hat = softmax( a2 )
            h_ant = h

            J -= log( y_hat[:,ys[step]] )

        #### END YOUR CODE ####
        return J
Example #25
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        #J = 0
        ns = len(xs)
        #### YOUR CODE HERE ####
        # forward propagation
        hs = zeros((ns+1, self.hdim))
        ps = zeros((ns, self.vdim)) # predicted probas
        for t in range(0, ns):
            hs[t] = sigmoid(dot(self.params.H, hs[t-1]) + self.sparams.L[xs[t], :])
            ps[t] = softmax(dot(self.params.U, hs[t]))

        J = - sum(log(ps[arange(ns), ys]))

        #### END YOUR CODE ####
        return J
Example #26
    def predict_seq_proba(self, X):
        #### YOUR CODE HERE ####
        # Expect xs as list of indices
        #ns = len(xs)
        ns = X.shape[0]

        # X.shape = (ns, Dw)
        #X = self.L[xs,:]

        Z = (self.params.W1.dot(X.T)).T + self.params.b1
        # A.shape = (ns, Dh)
        A = sigmoid(Z)
        assert A.shape == (ns, self.hdim)

        if self.drop_p > 0.:
            A = A * (1 - self.drop_p)

        # Max each node of A over time (max of each column over all rows)
        # use argmax for use in backprop
        mx = argmax(A, 0)

        # Max pooling vector
        # this will select max elements of A:
        # h.shape == (Dh,)
        h = A[mx, list(range(len(mx)))]
        assert h.shape == (self.hdim, )

        # prediction probabilities
        ps = softmax(self.params.Ws.dot(h) + self.params.bs)

        return (ps)
Example #27
 def compute_loss(self, windows, labels):
     """
     Compute the loss for a given dataset.
     windows = same as for predict_proba
     labels = list of class labels, for each row of windows
     """
    
     #### YOUR CODE HERE ####
     L = self.sparams.L
     U = self.params.U
     W = self.params.W
     b1 = self.params.b1
     b2 = self.params.b2
     lambda_ = self.lreg
     J = 0
     
     labels_tem = None
     if not hasattr(windows[0], "__iter__"):
         windows = [windows]
         labels_tem = [labels]
     else:
         labels_tem = labels
     
     for i in xrange(len(windows)):
         x = hstack(L[windows[i], :])
         h = tanh(W.dot(x) + b1)
         y_hat = softmax(U.dot(h) + b2)
         J -= log(y_hat[labels_tem[i]])
     J += (lambda_ / 2.0) * (sum(W ** 2.0) + sum(U ** 2.0))
     #### END YOUR CODE ####
     return J
Example #28
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        ns = len(xs)
        hs = zeros((ns+1, self.hdim))
        # predicted probas
        ps = zeros((ns, self.vdim))

        #### YOUR CODE HERE ####
        L = self.sparams.L
        U = self.params.U
        H = self.params.H
        
        ##
        # Forward propagation
        for i in xrange(ns):
            hs[i+1] = sigmoid(H.dot(hs[i]) + L[xs[i]])
            #hs[i+1] = 2.0/(1.0 + exp(-2.0*(H.dot(hs[i]) + L[xs[i]]))) - 1.0
            ps[i] = softmax(U.dot(hs[i+1]))
            J -= log(ps[i][ys[i]])
        
        

        #### END YOUR CODE ####
        return J
Example #29
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        ns = len(xs)
        hs = zeros((ns+1, self.hdim))
        ps = zeros((ns, self.vdim))

        for i in xrange(ns):
            hs[i] = sigmoid(self.params.H.dot(hs[i-1]) + self.sparams.L[xs[i]])
            ps[i] = softmax(self.params.U.dot(hs[i]))
            J -= log(ps[i][ys[i]])


        #### END YOUR CODE ####
        return J
Example #30
    def predict_proba(self, windows):
        """
        Predict class probabilities.

        Should return a matrix P of probabilities,
        with each row corresponding to a row of X.

        windows = array (n x windowsize),
            each row is a window of indices
        """
        # handle singleton input by making sure we have
        # a list-of-lists
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]
        n = len(windows)
        P = zeros((len(windows),self.params.b2.shape[0]))
        #### YOUR CODE HERE ####
        for idx in xrange(n):
            window = windows[idx]
            x =  hstack(self.sparams.L[window])
            h = tanh(self.params.W.dot(x) + self.params.b1)
            scores = self.params.U.dot(h) + self.params.b2
            P[idx,:]= softmax(scores)
        #### END YOUR CODE ####

        return P # rows are output for each input
Example #31
    def predict_proba(self, windows):
        """
        Predict class probabilities.

        Should return a matrix P of probabilities,
        with each row corresponding to a row of X.

        windows = array (n x windowsize),
            each row is a window of indices
        """
        # handle singleton input by making sure we have
        # a list-of-lists
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]

        #### YOUR CODE HERE ####
        x = self.sparams.L[windows[:,0]]
        for i in range(len(windows[0])-1):
            x = np.concatenate((x,self.sparams.L[windows[:,i+1]]),axis=1)
            
        z = self.params.W.dot(x.T)+self.params.b1.reshape((self.params.b1.shape[0],1))
        h = self.tanh(z)
        p = softmax(self.params.U.dot(h)+self.params.b2.reshape((self.params.b2.shape[0],1)))
        #### END YOUR CODE ####
        return p # rows are output for each input
Example #32
    def predict_proba(self, windows):
        """
        Predict class probabilities.

        Should return a matrix P of probabilities,
        with each row corresponding to a row of X.

        windows = array (n x windowsize),
            each row is a window of indices
        """
        # handle singleton input by making sure we have
        # a list-of-lists
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]
        
        P = empty((len(windows), self.nclass))
        #### YOUR CODE HERE ####
        for i, row in enumerate(windows):

            x = self.sparams.L[row, :].flatten()
            #words = [self.sparams.L[row[0]], self.sparams.L[row[1]], self.sparams.L[row[2]]]
            #x = reshape(words, self.sparams.L.shape[1] *3) # 3n row vector
            h = tanh(self.params.W.dot(x) + self.params.b1)
        
            p = softmax(self.params.U.dot(h) + self.params.b2)
            P[i, :] = p
        #### END YOUR CODE ####

        return P # rows are output for each input
Example #33
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        ns = len(xs)
        h_prev = zeros(self.hdim)
        for t in xrange(ns):
            h_t = sigmoid(dot(self.params.H, h_prev) + self.sparams.L[xs[t]])
            if t == ns - 1:
                yhat_t = softmax(dot(self.params.U, h_t))
                J = -log(yhat_t[ys])

            h_prev = h_t

        J += .5 * self.lamb * (sum(self.params.H**2) + sum(self.params.U**2))

        #### END YOUR CODE ####
        return J
Example #34
    def _acc_grads(self, xs, ys):
        """
        Accumulate gradients, given a pair of training sequences:
        xs = [<indices>] # input words
        ys = [<indices>] # output words (to predict)

        Your code should update self.grads and self.sgrads,
        in order for gradient_check and training to work.

        So, for example:
        self.grads.H += (your gradient dJ/dH)
        self.sgrads.L[i] = (gradient dJ/dL[i]) # update row

        Per the handout, you should:
            - make predictions by running forward in time
                through the entire input sequence
            - for *each* output word in ys, compute the
                gradients with respect to the cross-entropy
                loss for that output word
            - run backpropagation-through-time for self.bptt
                timesteps, storing grads in self.grads (for H, U)
                and self.sgrads (for L)

        You'll want to store your predictions \hat{y}(t)
        and the hidden layer values h(t) as you run forward,
        so that you can access them during backpropagation.

        At time 0, you should initialize the hidden layer to
        be a vector of zeros.
        """

        # Expect xs as list of indices
        ns = len(xs)

        # make matrix here of corresponding h(t)
        # hs[-1] = initial hidden state (zeros)
        hs = zeros((ns+1, self.hdim))
        # predicted probs
        ps = zeros((ns, self.vdim))

        #### YOUR CODE HERE ####

        # forward propagation
        for t in xrange(ns):
            hs[t] = sigmoid(dot(self.params.H, hs[t-1]) + self.sparams.L[xs[t]])
            ps[t] = softmax(dot(self.sparams.U, hs[t]))

        # backpropagation through time
        for i in xrange(ns):
            d2i = ps[i]
            d2i[ys[i]] -= 1
            d1 = dot(self.sparams.U.T, d2i) * hs[i] * (1 - hs[i])

            self.sgrads.U = dot(d2i.reshape((-1, 1)), hs[i].reshape((1, -1)))

            for t in xrange(i, i - self.bptt - 1, -1):
                if t >= 0:                          # the farthest reference will thus be hs[-1]
                    self.sgrads.L[xs[t]] = d1
                    self.grads.H += dot(d1.reshape((-1, 1)), hs[t-1].reshape((1, -1)))
                    d1 = dot(self.params.H.T, d1) * hs[t-1] * (1 - hs[t-1])     # accumulate punishments/deltas
Example #35
    def predict_proba(self, windows):
        """
        Predict class probabilities.

        Should return a matrix P of probabilities,
        with each row corresponding to a row of X.

        windows = array (n x windowsize),
            each row is a window of indices
        """
        # handle singleton input by making sure we have
        # a list-of-lists
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]

        P = empty((len(windows), self.nclass))
        #### YOUR CODE HERE ####
        for i, row in enumerate(windows):

            x = self.sparams.L[row, :].flatten()
            #words = [self.sparams.L[row[0]], self.sparams.L[row[1]], self.sparams.L[row[2]]]
            #x = reshape(words, self.sparams.L.shape[1] *3) # 3n row vector
            h = tanh(self.params.W.dot(x) + self.params.b1)

            p = softmax(self.params.U.dot(h) + self.params.b2)
            P[i, :] = p
        #### END YOUR CODE ####

        return P  # rows are output for each input
Example #36
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        ns = len(xs)
        hs = zeros((ns + 1, self.hdim))
        ps = zeros((ns, self.vdim))

        for i in xrange(ns):
            hs[i] = sigmoid(
                self.params.H.dot(hs[i - 1]) + self.sparams.L[xs[i]])
            ps[i] = softmax(self.params.U.dot(hs[i]))
            J -= log(ps[i][ys[i]])

        #### END YOUR CODE ####
        return J
Example #37
    def predict_proba(self, windows):
        """
        Predict class probabilities.

        Should return a matrix P of probabilities,
        with each row corresponding to a row of X.

        windows = array (n x windowsize),
            each row is a window of indices
        """
        # handle singleton input by making sure we have
        # a list-of-lists
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]

        #### YOUR CODE HERE ####
        P = []
        for window in windows:
            x = hstack(self.sparams.L[window])
            h = tanh(self.params.W.dot(x) + self.params.b1)
            p = softmax(self.params.U.dot(h) + self.params.b2)
            P.append(p)


        #### END YOUR CODE ####

        return P # rows are output for each input
Example #38
    def compute_loss(self, windows, labels):
        """
        Compute the loss for a given dataset.
        windows = same as for predict_proba
        labels = list of class labels, for each row of windows
        """
        
        labels_list = None
        #### YOUR CODE HERE ####
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]
            labels_list = [labels]
        else:
            labels_list = labels

        J = 0
        for i in xrange(len(windows)):
            x = hstack(self.sparams.L[windows[i], :]) # extract representation
            h = tanh(self.params.W.dot(x) + self.params.b1)
            p = softmax(self.params.U.dot(h) + self.params.b2)
            J += - log(p[labels_list[i]])
        Jreg = (self.lreg / 2.0) * (sum(self.params.W**2.0) + sum(self.params.U**2.0))
        #### END YOUR CODE ####
        
        return J + Jreg
Example #39
    def forward_pass(self, x):
        ''' Forward pass.
        Argument: x, the input vector
        Returns: (zs, hs), the hidden and output activations (hs)
        and the inputs to the activation function (zs)

        example:
            input dims [100, 30, 20, 5]
            output: hs = [x, h1, h2, h3]
                    zs = [z1, z2, z3]
        '''

        hs = [x]
        zs = []
        h = x
        for i in range(1, len(self.dims)):
            W = self._get_param('W', i)
            b = self._get_param('b', i)
            z = W.dot(h) + b
            zs.append(z)
            # now activation function, if it's a last layer we use softmax
            # else tanh
            if i == len(self.dims) - 1:
                # last layer
                h = softmax(z)
            else:
                h = self.act(z)
            hs.append(h)
        return (zs, hs)
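
A self-contained sketch of the same layered forward pass, with the parameters given as an explicit list of (W, b) pairs instead of self._get_param, tanh hidden activations, and a softmax output layer (all names here are illustrative, not the original class API):

import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def forward_pass(x, weights, act=np.tanh):
    # weights: list of (W, b) pairs, one per layer; returns (zs, hs) as above
    hs, zs, h = [x], [], x
    for i, (W, b) in enumerate(weights):
        z = W.dot(h) + b
        zs.append(z)
        h = softmax(z) if i == len(weights) - 1 else act(z)
        hs.append(h)
    return zs, hs

# dims [100, 30, 20, 5] -> weight shapes (30,100), (20,30), (5,20)
dims = [100, 30, 20, 5]
weights = [(np.zeros((dims[i+1], dims[i])), np.zeros(dims[i+1])) for i in range(len(dims)-1)]
zs, hs = forward_pass(np.zeros(dims[0]), weights)
assert len(hs) == len(dims) and len(zs) == len(dims) - 1
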
Example #40
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        # hs[-1] = initial hidden state (zeros)
        ns = len(ys)
        hs = zeros((ns+1, self.hdim))

        for t in range(ns):
            hs[t] = sigmoid(self.params.H.dot(hs[t-1]) + self.sparams.L[xs[t]])
            #ps[t] = softmax(self.params.U.dot(hs[t]))
            #J -= log(ps[t][ys[t]])
        h_final = hs[ns-1]
        z = self.params.U.dot(h_final) 
        y_hat = []
        for i in range(n_aspect):
            current = z[sent_dim*i:sent_dim*(i+1)]
            y_hat.extend(softmax(current))
        J = -sum(ys.reshape(len(ys),1)*log(array(y_hat).reshape(len(y_hat),1)))

        #### END YOUR CODE ####
        return J
Example #41
    def predict_proba(self, windows):
        """
        Predict class probabilities.

        Should return a matrix P of probabilities,
        with each row corresponding to a row of X.

        windows = array (n x windowsize),
            each row is a window of indices
        """
        # handle singleton input by making sure we have
        # a list-of-lists
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]

        #### YOUR CODE HERE ####
        #print 'windows.shape',windows[0]
        P=[]
        for window in windows:
            x = hstack([self.sparams.L[idx] for idx in window]) # extract representation,(150,) matrix
            #x=reshape(x,(x.shape[0]*x.shape[1]))
            #print self.params.W.shape,' ',x.shape,' ',self.params.b1.shape
            a = self.params.W.dot(x) + self.params.b1  # (100,150)*(150,)+(100,) => (100,)
            h = tanh(a)  # (100,)
            p = softmax(self.params.U.dot(h) + self.params.b2)  # (5,100)*(100,)+(5,) => (5,)
            P.append(p)
        #### END YOUR CODE ####


        return P # rows are output for each input
Example #42
    def _acc_grads(self, window, label):
        """
        Accumulate gradients, given a training point
        (window, label) of the format

        window = [x_{i-1} x_{i} x_{i+1}] # three ints
        label = {0,1,2,3,4} # single int, gives class

        Your code should update self.grads and self.sgrads,
        in order for gradient_check and training to work.

        So, for example:
        self.grads.U += (your gradient dJ/dU)
        self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
        """
        #### YOUR CODE HERE ####
        ##
        # Forward propagation
        x = hstack(self.sparams.L[window, :])
        h = tanh(2*(self.params.W.dot(x)+self.params.b1))
        p = softmax(self.params.U.dot(h)+self.params.b2)
        ##
        y = make_onehot(label, 5)
        delta = p - y
        # Backpropagation
        self.grads.U += outer(delta, h) + self.lreg * self.params.U
        self.grads.b2 += delta
        gradh = dot(self.params.U.T,delta) * (1-h**2)
        self.grads.W += outer(gradh, x) + self.lreg * self.params.W
        self.grads.b1 += gradh

        dL = self.params.W.T.dot(gradh).reshape(self.window_size, self.word_vec_size)
        for i in xrange(self.window_size):
            self.sgrads.L[window[i], :] = dL[i]
Example #43
    def compute_loss(self, windows, labels):
        """
        Compute the loss for a given dataset.
        windows = same as for predict_proba
        labels = list of class labels, for each row of windows
        """

        #### YOUR CODE HERE ####

        labels_lst = None

        # handle singleton input by making sure we have
        # a list-of-lists
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]
            labels_lst = [labels]
        else:
            labels_lst = labels

        J = 0.0

        for window, label in zip(windows, labels_lst):
            x = hstack(self.sparams.L[window])  # (150,) --> (X,)
            h = tanh(dot(self.params.W, x) + self.params.b1)  # (H,)
            y_hat = softmax(dot(self.params.U, h) + self.params.b2)  # (Dy,)
            J -= log(y_hat[label])

        J += (self.lreg / 2.0) * (sum(self.params.W**2.0) +
                                  sum(self.params.U**2.0))

        #### END YOUR CODE ####
        return J
Example #44
    def predict_proba(self, windows):
        """
        Predict class probabilities.

        Should return a matrix P of probabilities,
        with each row corresponding to a row of X.

        windows = array (n x windowsize),
            each row is a window of indices
        """
        # handle singleton input by making sure we have
        # a list-of-lists
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]

        #### YOUR CODE HERE ####

        print windows

        N = len(windows)
        windowsize = len(windows[0])
        (Dy, H) = self.params.U.shape

        P = zeros((N, Dy))

        for i, window in enumerate(windows):
            x = hstack(self.sparams.L[window])  # (150,) --> (X,)
            h = tanh(dot(self.params.W, x) + self.params.b1)  # (H,)
            y_hat = softmax(dot(self.params.U, h) + self.params.b2)  # (Dy,)
            P[i, :] = y_hat

        #### END YOUR CODE ####

        return P  # rows are output for each input
Example #45
 def predict_proba(self, idx):
     """
     Predict class probabilities.
     """
     x = self.sparams.L[idx]
     p = softmax(self.params.W.dot(x) + self.params.b)
     return p
Example #46
    def predict_proba(self, windows):
        """
        Predict class probabilities.

        Should return a matrix P of probabilities,
        with each row corresponding to a row of X.

        windows = array (n x windowsize),
            each row is a window of indices
        """
        # handle singleton input by making sure we have
        # a list-of-lists
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]

        #### YOUR CODE HERE ####
        N = len(windows)
        P = zeros((N, self.params.b2.shape[0]))

        for n in xrange(N):
            x = self.sparams.L[windows[n]]
            x = x.reshape((x.shape[0] * x.shape[1]))
            z = self.params.W.dot(x) + self.params.b1
            h = tanh(z)
            P[n, :] = softmax(self.params.U.dot(h) + self.params.b2)

        #### END YOUR CODE ####

        return P  # rows are output for each input
Example #47
 def predict_proba(self, idx):
     """
     Predict class probabilities.
     """
     x = self.sparams.L[idx]
     p = softmax(self.params.W.dot(x) + self.params.b)
     return p
Example #48
    def compute_loss(self, windows, labels):
        """
        Compute the loss for a given dataset.
        windows = same as for predict_proba
        labels = list of class labels, for each row of windows
        """

        #### YOUR CODE HERE ####
        if not hasattr(windows[0], "__iter__"):
            windows = [windows]
            labels = [labels]

        N = len(windows)

        # x = self.sparams.L[windows]
        # x = x.reshape((N,x.shape[-2]*x.shape[-1]))
        # z = x.dot(self.params.W.T) + self.params.b1
        # h = tanh(z)
        # z2 = h.dot(self.params.U.T) + self.params.b2
        # p = softmax(z2)
        # J -= sum(log(p[0][labels])
        # J += (self.lreg / 2.0) * (sum(self.params.W**2.0) + sum(self.params.U**2.0))

        J = 0
        for n in xrange(N):
            x = self.sparams.L[windows[n]]
            x = reshape(x, x.shape[0] * x.shape[1])
            h = tanh(self.params.W.dot(x) + self.params.b1)
            y_hat = softmax(self.params.U.dot(h) + self.params.b2)
            y = make_onehot(labels[n], len(y_hat))
            J -= sum(y * log(y_hat))
        J += (self.lreg / 2.0) * (sum(self.params.W**2.0) +
                                  sum(self.params.U**2.0))
        #### END YOUR CODE ####
        return J
Example #49
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        ns = len(xs)

        h_ant = zeros((1, self.hdim))

        J = 0
        #### YOUR CODE HERE ####
        for step in xrange(0, ns):
            # print "hs[step-1].shape %s" % (hs[step-1].shape,)
            # print "self.params.H.shape %s" % (self.params.H.shape,)
            # print "self.sparams.L.shape %s" % (self.sparams.L.shape,)
            # print "self.sparams.L[xs[step]].shape %s" % (self.sparams.L[xs[step]].shape,)
            a1 = self.params.H.dot(h_ant.T).T + self.sparams.L[xs[step]]
            h = sigmoid(a1)
            a2 = self.params.U.dot(h.T).T
            # print "h.shape %s" % (h.shape,)
            # print "a2.shape %s" % (a2.shape,)
            # print "self.params.U.shape %s" % (self.params.U.shape,)
            y_hat = softmax(a2)
            h_ant = h

            J -= log(y_hat[:, ys[step]])

        #### END YOUR CODE ####
        return J
Example #50
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####

        # Expect xs as list of indices
        ns = len(xs)

        # make matrix here of corresponding h(t)
        # hs[-1] = initial hidden state (zeros)
        hs = zeros((ns+1, self.hdim))

        # _for memory purposes_, we do not compute the loss in one fell swoop
        # forward propagation
        for t in xrange(ns):
            hs[t] = sigmoid(dot(self.params.H, hs[t-1]) + self.sparams.L[xs[t]])
            p = softmax(dot(self.sparams.U, hs[t]))
            J -= sum(log(p[ys[t]]))

        #### END YOUR CODE ####
        return J
Example #51
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.

        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """


       
        ns = len(xs)
        hs = zeros((ns+1, self.hdim))
        ps = zeros((ns, self.vdim))
        
        for i in range(ns):
            z1 = self.params.H.dot(hs[i-1]) + self.sparams.L[xs[i]]
            hs[i] = sigmoid(z1)
            z2 = self.params.U.dot(hs[i])
            ps[i] = softmax(z2)        

        J = sum(-log(ps[range(len(ps)), ys]))

        return J
Example #52
    def compute_seq_loss(self, xs, ys):
        """
        Compute the total cross-entropy loss
        for an input sequence xs and output
        sequence (labels) ys.
        You should run the RNN forward,
        compute cross-entropy loss at each timestep,
        and return the sum of the point losses.
        """

        J = 0
        #### YOUR CODE HERE ####
        ns = len(xs)
        self.xs = xs
        self.ys=ys
        
        hs = zeros((ns+1, self.hdim))
        self.hs1 = hs
        # for each time step
        for t in xrange(ns):
            hs[t] = sigmoid(dot(self.params.H, hs[t - 1]) + self.sparams.L[xs[t]])
            y_hat = softmax(dot(self.params.U, hs[t]))
            J -= log(y_hat[ys[t]])

        #### END YOUR CODE ####
        return J
Example #53
    def _acc_grads(self, window, label):
        """
        Accumulate gradients, given a training point
        (window, label) of the format

        window = [x_{i-1} x_{i} x_{i+1}] # three ints
        label = {0,1,2,3,4} # single int, gives class

        Your code should update self.grads and self.sgrads,
        in order for gradient_check and training to work.

        So, for example:
        self.grads.U += (your gradient dJ/dU)
        self.sgrads.L[i] = (gradient dJ/dL[i]) # this adds an update for that index
        
        """
        W, b1, U, b2 = self.params.W, self.params.b1, self.params.U, self.params.b2
        H, input_size = W.shape
        C, H = U.shape
        
        # Convert window indices to input
        X = self.sparams.L[window].reshape(input_size, 1)

        # Forward Pass (predictions)
        z = np.dot(W, X) + b1.reshape(H, 1)
        hidden = np.tanh(z)
        
        # Tanh
        scores = np.dot(U, hidden) + b2.reshape(C, 1)
        probs = softmax(scores)
        y_hat = probs[label]

        # Cross Entropy Loss
        loss = -np.log(y_hat)

        # Backpropagate!
        dscores = probs
        dscores[label] -= 1
        
        self.grads.b2 += dscores.reshape(C)
        self.grads.U += np.dot(dscores, hidden.T)
        
        dhidden = np.dot(U.T, dscores)
        dz = (1 - hidden**2) * dhidden # tanh derivative
        
        self.grads.b1 += dz.reshape(H)
        self.grads.W += np.dot(dz, X.T)

        # Push input vectors around
        dX = np.dot(W.T, dz).reshape(self.windowsize, self.D)
        self.sgrads.L[window] = dX
        
        # Regularization
        loss += 0.5 * self.lreg*(np.sum(W**2) + np.sum(b1**2) + np.sum(U**2) + np.sum(b2**2))
        
        self.grads.W  += self.lreg*W
        self.grads.b1 += self.lreg*b1
        self.grads.U  += self.lreg*U
        self.grads.b2 += self.lreg*b2
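
The docstrings above require the accumulated self.grads / self.sgrads to be compatible with a gradient_check routine. A generic central-difference check for a single parameter array, as a self-contained sketch (loss_fn, the epsilon, and the usage names are illustrative assumptions, not the course's own checker):

import numpy as np

def numerical_gradient(loss_fn, param, eps=1e-5):
    # central finite differences of a scalar loss with respect to param;
    # param is perturbed in place and restored after each evaluation
    grad = np.zeros_like(param)
    it = np.nditer(param, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        orig = param[idx]
        param[idx] = orig + eps
        plus = loss_fn()
        param[idx] = orig - eps
        minus = loss_fn()
        param[idx] = orig
        grad[idx] = (plus - minus) / (2 * eps)
        it.iternext()
    return grad

# usage sketch (names are hypothetical):
#   num_dW = numerical_gradient(lambda: clf.compute_loss(window, label), clf.params.W)
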
Example #54
 def forward_pass(self, x):
     z1 = self.params.W.dot(x) + self.params.b1
     h1 = tanh(z1)
     z2 = np.dot(self.params.U, h1) + self.params.b2
     h2 = tanh(z2)
     z3 = np.dot(self.params.G, h2) + self.params.b3
     y_hat = softmax(z3)
     return y_hat
Example #55
    def predict(self, node, correct=[], guess=[]):
        cost  =  total = 0.0
        # this is exactly the same setup as forwardProp in rnn.py
        if node.isLeaf == True:
            node.fprop = True
            node.hActs1 = self.L[:,node.word]
            #node.hActs2 = self.ReLU(self.W2.dot(node.hActs1)+self.b2)

            tmp = node.hActs1*self.dropoutP
            tmpMaxout = np.zeros((self.maxoutK, self.middleDim))
            for i in range(self.maxoutK):
                tmpMaxout[i] = self.W2[i].dot(tmp)+self.b2[i]
            (node.hActs2, node.idx) = self.maxout(tmpMaxout)
            
            # use the maxout output hActs2 here, matching the non-leaf branch below
            node.probs = softmax((self.Ws*self.dropoutP).dot(node.hActs2)+self.bs)
            p = node.probs*make_onehot(node.label,len(self.bs))
            cost = -np.log(np.sum(p))
            correct.append(node.label)
            guess.append(np.argmax(node.probs))
            return cost, 1
        
        c1,t1 = self.forwardProp(node.left,correct,guess)
        c2,t2 = self.forwardProp(node.right,correct,guess)
        if node.left.fprop and node.right.fprop:
            node.fprop = True
            h = np.hstack([node.left.hActs1, node.right.hActs1])
            node.hActs1 = self.ReLU(self.W1.dot(h) + self.b1)
            #node.hActs2 = self.ReLU(self.W2.dot(node.hActs1)+self.b2)
            tmp = node.hActs1*self.dropoutP
            tmpMaxout = np.zeros((self.maxoutK,self.middleDim))
            for i in range(self.maxoutK):
                tmpMaxout[i] = self.W2[i].dot(tmp)+self.b2[i]
            (node.hActs2, node.idx) = self.maxout(tmpMaxout)
            
            node.probs = softmax((self.Ws*self.dropoutP).dot(node.hActs2)+self.bs)
            p = node.probs*make_onehot(node.label,len(self.bs))
            cost = -np.log(np.sum(p))
            correct.append(node.label)
            guess.append(np.argmax(node.probs))
            
        cost += c1
        cost += c2
        total += t1
        total += t2
        return cost, total + 1
Example #56
 def compute_loss(self, x, label):
     """
     Compute the cost function for a single example.
     """
     #import ipdb; ipdb.set_trace()
     ##
     # Forward propagation
     p = softmax(self.params.W.dot(x) + self.params.b)
     J = -1 * np.log(p[label])  # cross-entropy loss
     Jreg = (self.lreg / 2.0) * np.sum(self.params.W**2.0)
     return J + Jreg
Example #57
 def compute_loss(self, idx, label):
     """
     Compute the cost function for a single example.
     """
     ##
     # Forward propagation
     x = self.sparams.L[idx]
     p = softmax(self.params.W.dot(x) + self.params.b)
     J = -1*log(p[label]) # cross-entropy loss
     Jreg = (self.lreg / 2.0) * sum(self.params.W**2.0)
     return J + Jreg