Example #1
    def initParams(self):
        # crude way of random initialization (random seed) for parameters
        import time
        self.seed = int(time.time()) % 100000
        # for tt in range(self.seed): gp.rand()

        sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
        scales = [
            gp.sqrt(6) / gp.sqrt(n + m) for n, m in zip(sizes[:-1], sizes[1:])
        ]
        self.stack = [[gp.rand(m, n) * 2 * s - s,
                       gp.zeros((m, 1))]
                      for n, m, s in zip(sizes[:-1], sizes[1:], scales)]
        self.hActs = [gp.empty((s, self.mbSize)) for s in sizes]

        if self.train:
            self.deltas = [gp.empty((s, self.mbSize)) for s in sizes[1:]]
            self.grad = [[gp.empty(w.shape),
                          gp.empty(b.shape)] for w, b in self.stack]
            for tt in range(self.seed):
                gp.rand()

            self.stack = [[
                ws[0] + .01 * gp.randn(ws[0].shape),
                ws[1] + .01 * gp.randn(ws[1].shape)
            ] for ws in self.stack]
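
A note on the seeding trick above: gnumpy (gp) exposes no explicit seeding call in this code, so the author derives a count from the wall clock and discards that many gp.rand() draws to push the generator into a run-dependent state. A minimal sketch of the same idea, using NumPy as a stand-in for gp:

    import time
    import numpy as np

    burn = int(time.time()) % 100000   # wall-clock-derived number of throwaway draws
    for _ in range(burn):
        np.random.rand()               # discard the value; only the generator state matters
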
Example #2
File: nnet.py Project: awni/awni_ml
    def initParams(self):
        sizes = [self.inputDim]+self.layerSizes+[self.outputDim]
        scales = [gp.sqrt(6)/gp.sqrt(n+m) for n,m in zip(sizes[:-1],sizes[1:])]
        self.stack = [[gp.rand(m,n)*2*s-s,gp.zeros((m,1))] \
                            for n,m,s in zip(sizes[:-1],sizes[1:],scales)]
        self.hActs = [gp.empty((s,self.mbSize)) for s in sizes]

        if self.train:
            self.deltas = [gp.empty((s,self.mbSize)) for s in sizes[1:]]
            self.grad = [[gp.empty(w.shape),gp.empty(b.shape)] for w,b in self.stack]
Example #3
 def diff(A,axis,out):
     if axis==0:
         if out is None:
             out = gp.empty((A.shape[0]-1,A.shape[1]),dtype=A.dtype)
         A._base_shaped(1).diff_cols(target=out._base_shaped(1))
         return out
     else:
         if out is None:
             out = gp.empty((A.shape[0],A.shape[1]-1),dtype=A.dtype)
         A._base_shaped(1).diff_rows(target=out._base_shaped(1))
         return out
Example #4
 def diff(A, axis, out):
     if axis == 0:
         if out is None:
             out = gp.empty((A.shape[0] - 1, A.shape[1]), dtype=A.dtype)
         A._base_shaped(1).diff_cols(target=out._base_shaped(1))
         return out
     else:
         if out is None:
             out = gp.empty((A.shape[0], A.shape[1] - 1), dtype=A.dtype)
         A._base_shaped(1).diff_rows(target=out._base_shaped(1))
         return out
Example #5
    def initParams(self):
	"""
	Initialize parameters using 6/sqrt(fanin+fanout)
	"""
        sizes = [self.inputDim]+self.layerSizes+[self.outputDim]
        scales = [gp.sqrt(6)/gp.sqrt(n+m) for n,m in zip(sizes[:-1],sizes[1:])]
        self.stack = [[gp.rand(m,n)*2*s-s,gp.zeros((m,1))] \
                            for n,m,s in zip(sizes[:-1],sizes[1:],scales)]
        self.hActs = [gp.empty((s,1)) for s in sizes]

        if self.train:
            self.deltas = [gp.empty((s,1)) for s in sizes[1:]]
            self.grad = [[gp.empty(w.shape),gp.empty(b.shape)] for w,b in self.stack]
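
The docstring above describes the classic uniform fan-in/fan-out initialization: each weight matrix is drawn uniformly from [-s, s] with s = sqrt(6 / (fan_in + fan_out)), and biases start at zero. A minimal NumPy sketch of the same scaling (the layer sizes are made up for illustration; gp.rand and gp.sqrt behave analogously):

    import numpy as np

    sizes = [784, 256, 128, 10]    # [inputDim] + layerSizes + [outputDim], for example
    scales = [np.sqrt(6.0 / (n + m)) for n, m in zip(sizes[:-1], sizes[1:])]
    stack = [[np.random.rand(m, n) * 2 * s - s,   # W uniform in [-s, s], shape (fan_out, fan_in)
              np.zeros((m, 1))]                   # zero bias column
             for n, m, s in zip(sizes[:-1], sizes[1:], scales)]
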
Example #6
File: rnn.py Project: yujiali/pynn
    def forward_prop(self, X=None, T=10, h_init=None, **kwargs):
        """
        Options:
        - X may be None when there is no input; in that case T must be specified.
        - If X is not None, T is ignored; the sequence length is taken from
          X.shape[0].
        - An optional h_init can be passed in to feed the first hidden state
          activation.
        """
        if X is not None and self.has_input:
            X = gnp.as_garray(X)
            self.X = X

            T = X.shape[0]

            self.A = X.dot(self.W_ih) + self.b
        else:
            self.X = None
            self.A = self.b.tile((T,1))

        self.H = gnp.empty((T, self.out_dim))

        if h_init is not None:
            self.h_init = gnp.as_garray(h_init)
            self.A[0] += self.h_init.reshape(1,-1).dot(self.W_hh)
        else:
            self.h_init = None

        self.H[0] = self.nonlin.forward_prop(self.A[0])

        for t in range(1, T):
            self.A[t] += self.H[t-1].reshape(1,-1).dot(self.W_hh)
            self.H[t] = self.nonlin.forward_prop(self.A[t])

        return self.H
Example #7
File: rnn.py Project: yujiali/pynn
    def forward_prop(self, X=None, T=10, h_init=None, **kwargs):
        """
        Options:
        - X may be None when there is no input; in that case T must be specified.
        - If X is not None, T is ignored; the sequence length is taken from
          X.shape[0].
        - An optional h_init can be passed in to feed the first hidden state
          activation.
        """
        if X is not None and self.has_input:
            X = gnp.as_garray(X)
            self.X = X

            T = X.shape[0]

            self.A = X.dot(self.W_ih) + self.b
        else:
            self.X = None
            self.A = self.b.tile((T, 1))

        self.H = gnp.empty((T, self.out_dim))

        if h_init is not None:
            self.h_init = gnp.as_garray(h_init)
            self.A[0] += self.h_init.reshape(1, -1).dot(self.W_hh)
        else:
            self.h_init = None

        self.H[0] = self.nonlin.forward_prop(self.A[0])

        for t in range(1, T):
            self.A[t] += self.H[t - 1].reshape(1, -1).dot(self.W_hh)
            self.H[t] = self.nonlin.forward_prop(self.A[t])

        return self.H
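
The recurrence implemented above is A = X.dot(W_ih) + b followed by H[t] = f(A[t] + H[t-1].dot(W_hh)). A self-contained NumPy sketch of the same computation, with tanh standing in for self.nonlin and made-up parameter shapes (an illustration, not the pynn API):

    import numpy as np

    def rnn_forward(X, W_ih, W_hh, b, h_init=None):
        T = X.shape[0]
        A = X.dot(W_ih) + b                    # per-step pre-activations from the input
        H = np.empty((T, W_hh.shape[0]))
        if h_init is not None:
            A[0] += h_init.reshape(1, -1).dot(W_hh).ravel()
        H[0] = np.tanh(A[0])
        for t in range(1, T):
            A[t] += H[t - 1].reshape(1, -1).dot(W_hh).ravel()
            H[t] = np.tanh(A[t])
        return H

    H = rnn_forward(np.random.randn(10, 4), np.random.randn(4, 8),
                    np.random.randn(8, 8), np.zeros(8))
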
Example #8
 def divide(A,B,out):
     if out is None:
         out = gp.empty(A.shape)
     if np.isscalar(B):         A._base_shaped(1).divide(B,target=out._base_shaped(1))
     elif A.shape == B.shape:   A._base_shaped(1).divide(B._base_shaped(1),target=out._base_shaped(1))
     else: raise NotImplementedError("broadcasted division not implemented by cudamat")
     return out
Example #9
 def dot_tn(A, B, out):
     if out is None:
         out = gp.empty((A.shape[1], B.shape[1]), dtype=A.dtype)
     cudamat.dot(B._base_as_2d(),
                 A._base_as_2d().T,
                 target=out._base_as_2d())
     return out
Example #10
 def dot_nt(A,B,out):
     # Using B._base_as_2d().T does not work; cudamat returns dimensionality error
     B._base.mat.is_trans = not B._base.mat.is_trans
     if out is None:
         out = gp.empty((A.shape[1],B.shape[1]))
     cudamat.dot(B._base_as_2d(),A._base_as_2d(),target=out._base_as_2d())
     B._base.mat.is_trans = not B._base.mat.is_trans
     return out
Example #11
 def maximum(A,B,out):
     if out is None:
         out = gp.empty(A.shape)
     if np.isscalar(A) and not np.isscalar(B):
         A,B = B,A
     if np.isscalar(B): A._base_shaped(1).maximum(B,target=out._base_shaped(1))
     else:              A._base_shaped(1).maximum(B._base_shaped(1),target=out._base_shaped(1))
     return out
Example #12
 def dot_nt(A, B, out):
     # Using B._base_as_2d().T does not work; cudamat returns dimensionality error
     B._base.mat.is_trans = not B._base.mat.is_trans
     if out is None:
         out = gp.empty((A.shape[1], B.shape[1]), dtype=A.dtype)
     cudamat.dot(B._base_as_2d(), A._base_as_2d(), target=out._base_as_2d())
     B._base.mat.is_trans = not B._base.mat.is_trans
     return out
Example #13
    def initParams(self):
        # crude way of random initialization (random seed) for parameters
        import time
        self.seed = int(time.time()) % 100000
        # for tt in range(self.seed): gp.rand()
        
        sizes = [self.inputDim]+self.layerSizes+[self.outputDim]
        scales = [gp.sqrt(6)/gp.sqrt(n+m) for n,m in zip(sizes[:-1],sizes[1:])]
        self.stack = [[gp.rand(m,n)*2*s-s,gp.zeros((m,1))] \
                            for n,m,s in zip(sizes[:-1],sizes[1:],scales)]
        self.hActs = [gp.empty((s,self.mbSize)) for s in sizes]

        if self.train:
            self.deltas = [gp.empty((s,self.mbSize)) for s in sizes[1:]]
            self.grad = [[gp.empty(w.shape),gp.empty(b.shape)] for w,b in self.stack]
            for tt in range(self.seed): gp.rand()

            self.stack = [[ws[0]+.01 * gp.randn(ws[0].shape),ws[1]+.01 * gp.randn(ws[1].shape)] 
                        for ws in self.stack]
Example #14
    def __init__(self, memCache, batchsize, dim, capacity):
        self.memCache = memCache
        self.batchsize = batchsize

        #read maxrows per call to MemCache, which is aligned to batchsize
        self.maxrows = calcRowsForAlign(capacity, batchsize, dim)

        self.data = gp.empty((self.maxrows, dim))
        self.index = 0
        self.size = 0
Example #15
    def __init__(self, memCache, batchsize, dim, capacity):
        self.memCache=memCache
        self.batchsize=batchsize

        #read maxrows per call to MemCache, which is aligned to batchsize
        self.maxrows=calcRowsForAlign(capacity, batchsize, dim)

        self.data=gp.empty((self.maxrows, dim))
        self.index=0
        self.size=0
Example #16
    def initParams(self):
	"""
	Initialize parameters using 6/sqrt(fanin+fanout)
	"""
        sizes = [self.inputDim]+self.layerSizes+[self.outputDim]
        scales = [gp.sqrt(6)/gp.sqrt(n+m) for n,m in zip(sizes[:-1],sizes[1:])]
        self.stack = [[gp.rand(m,n)*2*s-s,gp.zeros((m,1))] \
                            for n,m,s in zip(sizes[:-1],sizes[1:],scales)]
        if self.temporalLayer > 0:
            rs = sizes[self.temporalLayer]
            s = gp.sqrt(6)/ rs
            # temporal layer stored at end of stack
            self.stack.append([gp.rand(rs,rs) * 2 * s - s, gp.zeros((2,1))])
        
        if self.train:
            #TODO why store all deltas?
            #self.deltas = [gp.empty((s,self.mbSize)) for s in sizes[1:]]
            #NOTE if a temporal layer is used it's already added to stack so will have a grad
            self.grad = [[gp.empty(w.shape),gp.empty(b.shape)] for w,b in self.stack]
Example #17
 def maximum(A, B, out):
     if out is None:
         out = gp.empty(A.shape, dtype=A.dtype)
     if np.isscalar(A) and not np.isscalar(B):
         A, B = B, A
     if np.isscalar(B):
         A._base_shaped(1).maximum(B, target=out._base_shaped(1))
     else:
         A._base_shaped(1).maximum(B._base_shaped(1),
                                   target=out._base_shaped(1))
     return out
Example #18
 def divide(A, B, out):
     if out is None:
         out = gp.empty(A.shape, dtype=A.dtype)
     if np.isscalar(B):
         A._base_shaped(1).divide(B, target=out._base_shaped(1))
     elif A.shape == B.shape:
         A._base_shaped(1).divide(B._base_shaped(1),
                                  target=out._base_shaped(1))
     else:
         raise NotImplementedError(
             "broadcasted division not implemented by cudamat")
     return out
Example #19
 def _multiply(A,B,out):
     if out is None:
         out = gp.empty(A.shape)
     if np.isscalar(B): 
         A._base_shaped(1).mult(B,target=out._base_shaped(1))
     elif (B.ndim == 1 or B.shape[0] == 1) and B.size == A.shape[1]:
         A._base_shaped(1).mult_by_col(B._base_shaped(1),target=out._base_shaped(1))
     elif (B.ndim == 1 or B.shape[1] == 1) and B.size == A.shape[0]:
         A._base_shaped(1).mult_by_row(B._base_shaped(1),target=out._base_shaped(1))
     else:
         A._base_shaped(1).mult(B._base_shaped(1),target=out._base_shaped(1))
     return out
Example #20
 def min(A,axis,out):
     if A.ndim == 2: 
         if out is None:
             out = gp.empty((A.shape[0],1) if axis == 1 else (1,A.shape[1]),dtype=A.dtype)
         A._base_shaped(1).min(1-axis,target=out._base_shaped(1))
         return out
     else:
         r = gp.min(A,axis)  # gnumpy has optimized min over 1D vectors, so use it
         if out is not None:
             assert(out.size == 1)
             out[:] = r[:]
         return r
Example #21
 def _add(A,B,out):
     if out is None:
         out = gp.empty(A.shape)
     if np.isscalar(B): 
         A._base_shaped(1).add(B,target=out._base_shaped(1))
     elif (B.ndim == 1 or B.shape[0] == 1) and B.size == A.shape[1]:
         A._base_shaped(1).add_col_vec(B._base_shaped(1),target=out._base_shaped(1))
     elif (B.ndim == 1 or B.shape[1] == 1) and B.size == A.shape[0]:
         A._base_shaped(1).add_row_vec(B._base_shaped(1),target=out._base_shaped(1))
     else:
         A._base_shaped(1).add(B._base_shaped(1),target=out._base_shaped(1))
     return out
Example #22
 def sum(A,axis,out):
     if A.ndim == 2: 
         if out is None:
             out = gp.empty((A.shape[0],1) if axis == 1 else (1,A.shape[1]))
         cudamat.sum(A._base_shaped(1),1-axis,target=out._base_shaped(1))
         return out
     else:
         r = gp.sum(A,axis)  # gnumpy has optimized sum over 1D vectors, so use it
         if out is not None:
             assert(out.size == 1)
             out[:] = r[:]
         return r
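
The min and sum wrappers above reduce a 2-D array along one axis into a caller-supplied (or freshly allocated) buffer of the complementary shape, and fall back to gnumpy's own reductions for 1-D inputs. A NumPy sketch of the intended semantics (sum_into is a hypothetical helper for illustration, not part of gnumpy or cudamat):

    import numpy as np

    def sum_into(A, axis, out=None):
        if out is None:
            out = np.empty((A.shape[0], 1) if axis == 1 else (1, A.shape[1]), dtype=A.dtype)
        np.sum(A, axis=axis, keepdims=True, out=out)   # write the reduction into the buffer
        return out

    A = np.arange(6, dtype=np.float64).reshape(2, 3)
    print(sum_into(A, 0))   # [[3. 5. 7.]]
    print(sum_into(A, 1))   # [[3.] [12.]]
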
Example #23
    def initParams(self):
        """
	Initialize parameters using 6/sqrt(fanin+fanout)
	"""
        sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
        scales = [
            gp.sqrt(6) / gp.sqrt(n + m) for n, m in zip(sizes[:-1], sizes[1:])
        ]
        self.stack = [[gp.rand(m, n) * 2 * s - s,
                       gp.zeros((m, 1))]
                      for n, m, s in zip(sizes[:-1], sizes[1:], scales)]
        if self.temporalLayer > 0:
            rs = sizes[self.temporalLayer]
            s = gp.sqrt(6) / rs
            # temporal layer stored at end of stack
            self.stack.append([gp.rand(rs, rs) * 2 * s - s, gp.zeros((2, 1))])

        if self.train:
            #TODO why store all deltas?
            #self.deltas = [gp.empty((s,self.mbSize)) for s in sizes[1:]]
            #NOTE if a temporal layer is used it's already added to stack so will have a grad
            self.grad = [[gp.empty(w.shape),
                          gp.empty(b.shape)] for w, b in self.stack]
Example #24
 def subtract(A,B,out):
     if out is None:
         out = gp.empty(A.shape,dtype=A.dtype)
     if np.isscalar(B):
         A._base_shaped(1).subtract(B,target=out._base_shaped(1))
     elif B.shape == A.shape:
         A._base_shaped(1).subtract(B._base_shaped(1),target=out._base_shaped(1))
     elif (B.ndim == 1 or B.shape[0] == 1) and (A.ndim == 1 or B.size == A.shape[1]):
         A._base_shaped(1).subtract_col_vec(B._base_shaped(1),target=out._base_shaped(1))
     elif (B.ndim == 1 or B.shape[1] == 1) and B.size == A.shape[0]:
         A._base_shaped(1).subtract_row_vec(B._base_shaped(1),target=out._base_shaped(1))
     else:
         raise Exception("unhandled case")
     return out
Example #25
 def _multiply(A,B,out):
     if out is None:
         out = gp.empty(A.shape,dtype=A.dtype)
     if np.isscalar(B): 
         A._base_shaped(1).mult(B,target=out._base_shaped(1))
     elif B.shape == A.shape:
         A._base_shaped(1).mult(B._base_shaped(1),target=out._base_shaped(1))
     elif (B.ndim == 1 or B.shape[0] == 1) and B.size == A.shape[1]:
         A._base_shaped(1).mult_by_col(B._base_shaped(1),target=out._base_shaped(1))
     elif (B.ndim == 1 or B.shape[1] == 1) and B.size == A.shape[0]:
         A._base_shaped(1).mult_by_row(B._base_shaped(1),target=out._base_shaped(1))
     else:
         raise Exception("unhandled case")
     return out
Example #26
 def min(A, axis, out):
     if A.ndim == 2:
         if out is None:
             out = gp.empty((A.shape[0], 1) if axis == 1 else
                            (1, A.shape[1]),
                            dtype=A.dtype)
         A._base_shaped(1).min(1 - axis, target=out._base_shaped(1))
         return out
     else:
         r = gp.min(A, axis)  # gnumpy has optimized min over 1D vectors, so use it
         if out is not None:
             assert (out.size == 1)
             out[:] = r[:]
         return r
Example #27
 def _multiply(A, B, out):
     if out is None:
         out = gp.empty(A.shape, dtype=A.dtype)
     if np.isscalar(B):
         A._base_shaped(1).mult(B, target=out._base_shaped(1))
     elif B.shape == A.shape:
         A._base_shaped(1).mult(B._base_shaped(1),
                                target=out._base_shaped(1))
     elif (B.ndim == 1 or B.shape[0] == 1) and B.size == A.shape[1]:
         A._base_shaped(1).mult_by_col(B._base_shaped(1),
                                       target=out._base_shaped(1))
     elif (B.ndim == 1 or B.shape[1] == 1) and B.size == A.shape[0]:
         A._base_shaped(1).mult_by_row(B._base_shaped(1),
                                       target=out._base_shaped(1))
     else:
         raise Exception("unhandled case")
     return out
Example #28
 def subtract(A, B, out):
     if out is None:
         out = gp.empty(A.shape, dtype=A.dtype)
     if np.isscalar(B):
         A._base_shaped(1).subtract(B, target=out._base_shaped(1))
     elif B.shape == A.shape:
         A._base_shaped(1).subtract(B._base_shaped(1),
                                    target=out._base_shaped(1))
     elif (B.ndim == 1 or B.shape[0] == 1) and (A.ndim == 1
                                                or B.size == A.shape[1]):
         A._base_shaped(1).subtract_col_vec(B._base_shaped(1),
                                            target=out._base_shaped(1))
     elif (B.ndim == 1 or B.shape[1] == 1) and B.size == A.shape[0]:
         A._base_shaped(1).subtract_row_vec(B._base_shaped(1),
                                            target=out._base_shaped(1))
     else:
         raise Exception("unhandled case")
     return out
Example #29
File: rnn.py Project: yujiali/pynn
    def backward_prop(self, grad=None, grad_end=None):
        if grad is not None:
            T = grad.shape[0]
            assert T == self.H.shape[0]

            dH = grad.copy()
        else:
            T = self.H.shape[0]
            dH = gnp.zeros((T, self.H.shape[1]))

        if grad_end is not None:
            dH[-1] += gnp.as_garray(grad_end).ravel()

        dA = gnp.empty((dH.shape[0], dH.shape[1]))

        for t in range(1,T)[::-1]:
            dA[t] = self.nonlin.backward_prop(self.A[t], self.H[t]) * dH[t]
            dH[t-1] += self.W_hh.dot(dA[t].reshape(-1,1)).ravel()
        dA[0] = self.nonlin.backward_prop(self.A[0], self.H[0]) * dH[0]

        self.dW_hh += self.H[:-1].T.dot(dA[1:])

        if self.h_init is not None:
            self.dW_hh += self.h_init.reshape(-1,1).dot(dA[0].reshape(1,-1))

        self.db += dA.sum(axis=0)

        if self.X is not None:
            dX = dA.dot(self.W_ih.T)
            self.dW_ih += self.X.T.dot(dA)
        else:
            dX = None

        if self.h_init is not None:
            self.dh_init = self.W_hh.dot(dA[0].reshape(-1,1)).ravel()

        return dX
Example #30
File: rnn.py Project: yujiali/pynn
    def backward_prop(self, grad=None, grad_end=None):
        if grad is not None:
            T = grad.shape[0]
            assert T == self.H.shape[0]

            dH = grad.copy()
        else:
            T = self.H.shape[0]
            dH = gnp.zeros((T, self.H.shape[1]))

        if grad_end is not None:
            dH[-1] += gnp.as_garray(grad_end).ravel()

        dA = gnp.empty((dH.shape[0], dH.shape[1]))

        for t in range(1, T)[::-1]:
            dA[t] = self.nonlin.backward_prop(self.A[t], self.H[t]) * dH[t]
            dH[t - 1] += self.W_hh.dot(dA[t].reshape(-1, 1)).ravel()
        dA[0] = self.nonlin.backward_prop(self.A[0], self.H[0]) * dH[0]

        self.dW_hh += self.H[:-1].T.dot(dA[1:])

        if self.h_init is not None:
            self.dW_hh += self.h_init.reshape(-1, 1).dot(dA[0].reshape(1, -1))

        self.db += dA.sum(axis=0)

        if self.X is not None:
            dX = dA.dot(self.W_ih.T)
            self.dW_ih += self.X.T.dot(dA)
        else:
            dX = None

        if self.h_init is not None:
            self.dh_init = self.W_hh.dot(dA[0].reshape(-1, 1)).ravel()

        return dX
Example #31
 def subtract_nt(A, B, out):
     if out is None:
         out = gp.empty(A.shape, dtype=A.dtype)
     A._base_shaped(1).subtract_transpose(B._base_shaped(1),
                                          target=out._base_shaped(1))
     return out
Example #32
 def reciprocal(A, out):
     if out is None:
         out = gp.empty(A.shape, dtype=A.dtype)
     A._base_as_row().reciprocal(out._base_as_row())
     return out
Example #33
 def subtract_nt(A,B,out):
     if out is None:
         out = gp.empty(A.shape,dtype=A.dtype)
     A._base_shaped(1).subtract_transpose(B._base_shaped(1),target=out._base_shaped(1))
     return out
Example #34
 def empty(shape, dtype):
     return gp.empty(shape, dtype=dtype)
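
As with numpy.empty, gp.empty returns an uninitialized buffer, so it is only safe when every element is written before it is read; that is how the hActs/deltas/grad buffers and the out arguments in the wrappers above are used. A small NumPy sketch of the preallocate-and-reuse pattern:

    import numpy as np

    out = np.empty((128, 64))          # contents are garbage until overwritten
    for _ in range(3):
        # each call fills every element of the buffer in place, so it is never read uninitialized
        np.multiply(np.random.rand(128, 64), 2.0, out=out)
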
Example #35
 def _unary(func, A, out):
     if out is None:
         out = gp.empty(A.shape, dtype=A.dtype)
     func(A._base_as_row(), target=out._base_as_row())
     return out
Example #36
 def dot_tn(A,B,out):
     if out is None:
         out = gp.empty((A.shape[1],B.shape[1]))
     cudamat.dot(B._base_as_2d(),A._base_as_2d().T,target=out._base_as_2d())
     return out
Example #37
    def empty(shape):
        return gp.empty(shape)

    @staticmethod
Example #38
    def costAndGrad(self, data, labels, key=None):
        """
        Forward prop entire utterance
        Call CTC cost function
        Compute gradient

        data is a 2-D matrix where each column is a single time frame.
        The number of input frames changes across iterations.

        labels is a vector of symbol ids; its length is not fixed and does not
        depend on the number of time frames.
        """

        ## forward prop
        T = data.shape[1]
        sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
        stackMax = len(self.stack) - 1
        if self.temporalLayer > 0:
            stackMax -= 1

        self.hActs = [gp.empty((s, T)) for s in sizes]
        self.hActs[0] = data
        #for t in range(T):
        i = 1
        for l in range(stackMax + 1):
            w, b = self.stack[l]

            self.hActs[i] = w.dot(self.hActs[i - 1]) + b
            # loop over time for recurrent layer
            if (self.temporalLayer - 1) == l:
                for t in range(T):
                    if t > 0:
                        self.hActs[i][:, t] += self.stack[-1][0].dot(
                            self.hActs[i][:, t - 1])
                    # nonlinearity
                    if i <= stackMax:
                        self.hActs[i][:, t] = self.activation(self.hActs[i][:, t])
            # hidden layer activation function for batch forward prop
            elif i <= stackMax:
                self.hActs[i] = self.activation(self.hActs[i])

            #    w_t,b_t = self.stack[-1][0]
            #    self.hActs[i][:,t] += self.stack[-1][0].dot(self.hActs[i][:,t-1])
            i += 1

        # convert final layer to probs after all time iteration complete
        probs = self.hActs[-1] - gp.max(self.hActs[-1], axis=0)
        probs = gp.as_numpy_array(probs)
        probs = np.exp(probs)
        probs = probs / np.sum(probs, axis=0)

        ## pass probs and label string to ctc loss
        # TODO how much does passing to different function cost us?
        cost, delta_output, skip = ctc.ctc_loss(probs,
                                                labels.squeeze(),
                                                blank=0)

        # Store probabilities and error signal for a given key
        if key is not None and key in self.hist:
            self.hist[key].append((probs, delta_output))

        if not self.train:
            return cost, None

        delta_output = gp.garray(delta_output)
        ## back prop through time
        # zero gradients
        self.grad = [[gp.zeros(w.shape), gp.zeros(b.shape)]
                     for w, b in self.stack]
        if self.temporalLayer > 0:
            delta_t = np.zeros(self.layerSizes[self.temporalLayer - 1])
        for t in reversed(range(T)):
            # get delta from loss function
            delta = delta_output[:, t].T

            # compute gradient for output layer
            #print self.hActs[-2].shape, delta.shape, self.stack[stackMax][0].shape
            #print delta.reshape(-1,1).shape, self.hActs[-2][:,t].reshape(-1,1).shape
            # TODO can we get rid of some of these annoying reshape -1 1?
            self.grad[stackMax][0] += delta.reshape(-1, 1).dot(
                self.hActs[-2][:, t].reshape(-1, 1).T)
            self.grad[stackMax][1] += delta.reshape(-1, 1)

            # push delta through output layer
            delta = self.stack[stackMax][0].T.dot(delta)

            # iterate over lower layers
            i = len(self.layerSizes) - 1
            while i >= 0:
                # add the temporal delta if this is the recurrent layer
                if (self.temporalLayer - 1) == i:
                    #print delta.shape, delta_t.shape
                    delta += delta_t
                # push delta through activation function for this layer
                #print i, stackMax, delta.shape, self.hActs[i+1][:,t].shape
                delta = delta * self.activation(self.hActs[i + 1][:, t], True)
                #embed()
                # compute the gradient
                #print i, delta.shape, self.hActs[i][:,t].T.reshape(1,-1).shape, self.grad[i][0].shape
                self.grad[i][0] += delta.reshape(-1, 1).dot(
                    self.hActs[i][:, t].T.reshape(1, -1))
                self.grad[i][1] += delta.reshape(-1, 1)

                # add the temporal delta if this is the recurrent layer
                if (self.temporalLayer - 1) == i and t > 0:
                    self.grad[-1][0] += delta.reshape(-1, 1).dot(
                        self.hActs[i + 1][:, t - 1].T.reshape(1, -1))
                    # push delta through temporal connections
                    delta_t = self.stack[-1][0].T.dot(delta)

                    # HACK no bias for temporal layer. Give it a gradient of 0
                    self.grad[-1][1] = np.zeros((2, 1))

                # push the delta downward
                w, b = self.stack[i]
                delta = w.T.dot(delta)
                i -= 1
        #print self.grad
        return cost, self.grad, skip
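
The probability step in costAndGrad above is a column-wise, numerically stable softmax: subtract each column's maximum before exponentiating, then normalize each column to sum to one. A standalone NumPy sketch of just that step (the shapes are made up for illustration):

    import numpy as np

    def column_softmax(scores):
        shifted = scores - scores.max(axis=0, keepdims=True)   # guard exp against overflow
        e = np.exp(shifted)
        return e / e.sum(axis=0, keepdims=True)

    probs = column_softmax(np.random.randn(30, 50))   # 30 output symbols, 50 time frames
    assert np.allclose(probs.sum(axis=0), 1.0)
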
Example #39
 def dot(A,B,out):
     if out is None:
         out = gp.empty((A.shape[0],B.shape[1]),dtype=A.dtype)
     cudamat.dot(B._base_as_2d(),A._base_as_2d(),target=out._base_as_2d())
     return out
Example #40
 def reciprocal(A,out):
     if out is None:
         out = gp.empty(A.shape)
     A._base_as_row().reciprocal(out._base_as_row())
     return out
Example #41
 def transpose(A, out):
     if out is None:
         out = gp.empty((A.shape[1], A.shape[0]), dtype=A.dtype)
     A._base_shaped(1).transpose(out._base_shaped(1))
     return out
Example #42
 def square(A,out):
     if out is None:
         out = gp.empty(A.shape)
     cudamat.square(A._base_as_row(),target=out._base_as_row())
     return out
Example #43
 def square(A, out):
     if out is None:
         out = gp.empty(A.shape, dtype=A.dtype)
     cudamat.square(A._base_as_row(), target=out._base_as_row())
     return out
Example #44
 def _unary(func,A,out):
     if out is None:
         out = gp.empty(A.shape)
     func(A._base_as_row(),target=out._base_as_row())
     return out
Example #45
    def costAndGrad(self,data,labels,key=None):
        """
        Forward prop entire utterance
        Call CTC cost function
        Compute gradient

        data is a 2-D matrix where each column is a single time frame.
        The number of input frames changes across iterations.

        labels is a vector of symbol ids; its length is not fixed and does not
        depend on the number of time frames.
        """

        ## forward prop
        T = data.shape[1]
        sizes = [self.inputDim]+self.layerSizes+[self.outputDim]
        stackMax = len(self.stack)-1
        if self.temporalLayer > 0:
            stackMax -= 1

        self.hActs = [gp.empty((s,T)) for s in sizes]
        self.hActs[0] = data
        #for t in range(T):
        i = 1
        for l in range(stackMax+1):
            w,b = self.stack[l]

            self.hActs[i] = w.dot(self.hActs[i-1]) + b
            # loop over time for recurrent layer
            if (self.temporalLayer-1) == l:
                for t in range(T):
                    if t > 0:
                        self.hActs[i][:,t] += self.stack[-1][0].dot(self.hActs[i][:,t-1])
                    # nonlinearity 
                    if i <= stackMax:
                        self.hActs[i][:,t] = self.activation(self.hActs[i][:,t])
            # hidden layer activation function for batch forward prop
            elif i <= stackMax:
                self.hActs[i] = self.activation(self.hActs[i])

            #    w_t,b_t = self.stack[-1][0]
            #    self.hActs[i][:,t] += self.stack[-1][0].dot(self.hActs[i][:,t-1])
            i += 1

        # convert final layer to probs after all time iteration complete
        probs = self.hActs[-1]-gp.max(self.hActs[-1],axis=0)
        probs = gp.as_numpy_array(probs)
        probs = np.exp(probs)
        probs = probs/np.sum(probs,axis=0)

        ## pass probs and label string to ctc loss
        # TODO how much does passing to different function cost us? 
        cost, delta_output, skip = ctc.ctc_loss(probs, labels.squeeze(), blank=0)

        # Store probabilities and error signal for a given key
        if key is not None and key in self.hist:
            self.hist[key].append((probs,delta_output))

        if not self.train:
            return cost,None

        delta_output =  gp.garray(delta_output)
        ## back prop through time
        # zero gradients
        self.grad = [[gp.zeros(w.shape),gp.zeros(b.shape)] for w,b in self.stack]
        if self.temporalLayer > 0:
            delta_t = np.zeros(self.layerSizes[self.temporalLayer-1])
        for t in reversed(range(T)):
            # get delta from loss function
            delta = delta_output[:,t].T

            # compute gradient for output layer
            #print self.hActs[-2].shape, delta.shape, self.stack[stackMax][0].shape
            #print delta.reshape(-1,1).shape, self.hActs[-2][:,t].reshape(-1,1).shape
            # TODO can we get rid of some of these annoying reshape -1 1?
            self.grad[stackMax][0] +=  delta.reshape(-1,1).dot(self.hActs[-2][:,t].reshape(-1,1).T)
            self.grad[stackMax][1] +=  delta.reshape(-1, 1)

            # push delta through output layer
            delta = self.stack[stackMax][0].T.dot(delta)
            
            # iterate over lower layers
            i = len(self.layerSizes)-1
            while i >= 0:
                # add the temporal delta if this is the recurrent layer
                if (self.temporalLayer-1) == i:
                    #print delta.shape, delta_t.shape
                    delta += delta_t
                # push delta through activation function for this layer
                #print i, stackMax, delta.shape, self.hActs[i+1][:,t].shape
                delta = delta * self.activation(self.hActs[i+1][:,t], True)
                #embed()
                # compute the gradient
                #print i, delta.shape, self.hActs[i][:,t].T.reshape(1,-1).shape, self.grad[i][0].shape
                self.grad[i][0] += delta.reshape(-1,1).dot(self.hActs[i][:,t].T.reshape(1,-1))
                self.grad[i][1] += delta.reshape(-1,1)

                # add the temporal delta if this is the recurrent layer
                if (self.temporalLayer-1) == i and t > 0:
                    self.grad[-1][0] += delta.reshape(-1,1).dot(self.hActs[i+1][:,t-1].T.reshape(1,-1))
                    # push delta through temporal connections
                    delta_t = self.stack[-1][0].T.dot(delta)

                    # HACK no bias for temporal layer. Give it a gradient of 0
                    self.grad[-1][1] = np.zeros((2,1))

                # push the delta downward
                w,b = self.stack[i]
                delta = w.T.dot(delta)
                i -= 1
        #print self.grad
        return cost,self.grad, skip
Example #46
 def transpose(A,out):
     if out is None:
         out = gp.empty((A.shape[1],A.shape[0]),dtype=A.dtype)
     A._base_shaped(1).transpose(out._base_shaped(1))
     return out