def initParams(self):
    # crude way of random initialization (random seed) for parameters
    import time
    self.seed = int(time.time()) % 100000
    # for tt in range(self.seed): gp.rand()
    sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
    scales = [gp.sqrt(6) / gp.sqrt(n + m) for n, m in zip(sizes[:-1], sizes[1:])]
    self.stack = [[gp.rand(m, n) * 2 * s - s, gp.zeros((m, 1))]
                  for n, m, s in zip(sizes[:-1], sizes[1:], scales)]
    self.hActs = [gp.empty((s, self.mbSize)) for s in sizes]
    if self.train:
        self.deltas = [gp.empty((s, self.mbSize)) for s in sizes[1:]]
        self.grad = [[gp.empty(w.shape), gp.empty(b.shape)] for w, b in self.stack]
    for tt in range(self.seed):
        gp.rand()
    self.stack = [[ws[0] + .01 * gp.randn(ws[0].shape),
                   ws[1] + .01 * gp.randn(ws[1].shape)] for ws in self.stack]
def initParams(self):
    sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
    scales = [gp.sqrt(6) / gp.sqrt(n + m) for n, m in zip(sizes[:-1], sizes[1:])]
    self.stack = [[gp.rand(m, n) * 2 * s - s, gp.zeros((m, 1))]
                  for n, m, s in zip(sizes[:-1], sizes[1:], scales)]
    self.hActs = [gp.empty((s, self.mbSize)) for s in sizes]
    if self.train:
        self.deltas = [gp.empty((s, self.mbSize)) for s in sizes[1:]]
        self.grad = [[gp.empty(w.shape), gp.empty(b.shape)] for w, b in self.stack]
def diff(A, axis, out):
    if axis == 0:
        if out is None:
            out = gp.empty((A.shape[0] - 1, A.shape[1]), dtype=A.dtype)
        A._base_shaped(1).diff_cols(target=out._base_shaped(1))
        return out
    else:
        if out is None:
            out = gp.empty((A.shape[0], A.shape[1] - 1), dtype=A.dtype)
        A._base_shaped(1).diff_rows(target=out._base_shaped(1))
        return out
def initParams(self):
    """
    Initialize parameters using 6/sqrt(fanin+fanout)
    """
    sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
    scales = [gp.sqrt(6) / gp.sqrt(n + m) for n, m in zip(sizes[:-1], sizes[1:])]
    self.stack = [[gp.rand(m, n) * 2 * s - s, gp.zeros((m, 1))]
                  for n, m, s in zip(sizes[:-1], sizes[1:], scales)]
    self.hActs = [gp.empty((s, 1)) for s in sizes]
    if self.train:
        self.deltas = [gp.empty((s, 1)) for s in sizes[1:]]
        self.grad = [[gp.empty(w.shape), gp.empty(b.shape)] for w, b in self.stack]
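# A minimal standalone sketch (numpy only, not this class's API) of the rule the
# docstring above describes: each weight matrix is drawn uniformly from [-s, s]
# with s = sqrt(6) / sqrt(fan_in + fan_out), and biases start at zero.  The layer
# sizes in the example are made-up illustration values.
import numpy as np

def init_stack(sizes):
    stack = []
    for fan_in, fan_out in zip(sizes[:-1], sizes[1:]):
        s = np.sqrt(6.0) / np.sqrt(fan_in + fan_out)
        W = np.random.rand(fan_out, fan_in) * 2 * s - s   # uniform in [-s, s]
        b = np.zeros((fan_out, 1))
        stack.append([W, b])
    return stack

# example: 10-dim input, one hidden layer of 32 units, 5 outputs
example_stack = init_stack([10, 32, 5])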
def forward_prop(self, X=None, T=10, h_init=None, **kwargs):
    """
    options:
    - X can be None, when there's no input, then T must be specified
    - if X is not None, T will not be used
    - an extra h_init can be given to the forward prop to feed into the
      first hidden state activation.
    """
    if X is not None and self.has_input:
        X = gnp.as_garray(X)
        self.X = X
        T = X.shape[0]
        self.A = X.dot(self.W_ih) + self.b
    else:
        self.X = None
        self.A = self.b.tile((T, 1))
    self.H = gnp.empty((T, self.out_dim))
    if h_init is not None:
        self.h_init = gnp.as_garray(h_init)
        self.A[0] += self.h_init.reshape(1, -1).dot(self.W_hh)
    else:
        self.h_init = None
    self.H[0] = self.nonlin.forward_prop(self.A[0])
    for t in range(1, T):
        self.A[t] += self.H[t - 1].reshape(1, -1).dot(self.W_hh)
        self.H[t] = self.nonlin.forward_prop(self.A[t])
    return self.H
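# A small numpy-only sketch (not this class's API) of the recurrence computed
# above: pre-activations A = X.W_ih + b, each A[t] also receives H[t-1].W_hh
# before the nonlinearity.  tanh stands in for self.nonlin here as an assumption.
import numpy as np

def rnn_forward(X, W_ih, W_hh, b, h_init=None):
    T = X.shape[0]
    A = X.dot(W_ih) + b                      # (T, hidden) pre-activations
    H = np.empty_like(A)
    prev = h_init if h_init is not None else np.zeros(W_hh.shape[0])
    for t in range(T):
        A[t] += prev.dot(W_hh)               # recurrent contribution
        H[t] = np.tanh(A[t])
        prev = H[t]
    return A, H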
def divide(A, B, out):
    if out is None:
        out = gp.empty(A.shape)
    if np.isscalar(B):
        A._base_shaped(1).divide(B, target=out._base_shaped(1))
    elif A.shape == B.shape:
        A._base_shaped(1).divide(B._base_shaped(1), target=out._base_shaped(1))
    else:
        raise NotImplementedError("broadcasted division not implemented by cudamat")
    return out
def dot_tn(A, B, out):
    if out is None:
        out = gp.empty((A.shape[1], B.shape[1]), dtype=A.dtype)
    cudamat.dot(B._base_as_2d(), A._base_as_2d().T, target=out._base_as_2d())
    return out
def dot_nt(A, B, out):
    # Using B._base_as_2d().T does not work; cudamat returns dimensionality error
    B._base.mat.is_trans = not B._base.mat.is_trans
    if out is None:
        out = gp.empty((A.shape[1], B.shape[1]))
    cudamat.dot(B._base_as_2d(), A._base_as_2d(), target=out._base_as_2d())
    B._base.mat.is_trans = not B._base.mat.is_trans
    return out
def maximum(A, B, out):
    # if one argument is a scalar, make sure it is B so A.shape is valid below
    if np.isscalar(A) and not np.isscalar(B):
        A, B = B, A
    if out is None:
        out = gp.empty(A.shape)
    if np.isscalar(B):
        A._base_shaped(1).maximum(B, target=out._base_shaped(1))
    else:
        A._base_shaped(1).maximum(B._base_shaped(1), target=out._base_shaped(1))
    return out
def dot_nt(A, B, out):
    # Using B._base_as_2d().T does not work; cudamat returns dimensionality error
    B._base.mat.is_trans = not B._base.mat.is_trans
    if out is None:
        out = gp.empty((A.shape[1], B.shape[1]), dtype=A.dtype)
    cudamat.dot(B._base_as_2d(), A._base_as_2d(), target=out._base_as_2d())
    B._base.mat.is_trans = not B._base.mat.is_trans
    return out
def __init__(self, memCache, batchsize, dim, capacity):
    self.memCache = memCache
    self.batchsize = batchsize
    # read maxrows per call to MemCache, which is aligned to batchsize
    self.maxrows = calcRowsForAlign(capacity, batchsize, dim)
    self.data = gp.empty((self.maxrows, dim))
    self.index = 0
    self.size = 0
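# calcRowsForAlign is not defined in this snippet.  A plausible reading of the
# comment above ("read maxrows per call ... aligned to batchsize") is that it
# returns the largest batch-aligned row count whose buffer fits in a capacity of
# `capacity` elements of width `dim`.  This is a guessed sketch, not the
# project's actual helper:
def calcRowsForAlign_guess(capacity, batchsize, dim):
    rows = capacity // dim                              # rows that fit in the element budget
    return max(batchsize, (rows // batchsize) * batchsize)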
def initParams(self):
    """
    Initialize parameters using 6/sqrt(fanin+fanout)
    """
    sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
    scales = [gp.sqrt(6) / gp.sqrt(n + m) for n, m in zip(sizes[:-1], sizes[1:])]
    self.stack = [[gp.rand(m, n) * 2 * s - s, gp.zeros((m, 1))]
                  for n, m, s in zip(sizes[:-1], sizes[1:], scales)]
    if self.temporalLayer > 0:
        rs = sizes[self.temporalLayer]
        s = gp.sqrt(6) / rs
        # temporal layer stored at end of stack
        self.stack.append([gp.rand(rs, rs) * 2 * s - s, gp.zeros((2, 1))])
    if self.train:
        # TODO why store all deltas?
        # self.deltas = [gp.empty((s,self.mbSize)) for s in sizes[1:]]
        # NOTE if a temporal layer is used it's already added to stack so will have a grad
        self.grad = [[gp.empty(w.shape), gp.empty(b.shape)] for w, b in self.stack]
def maximum(A, B, out):
    # if one argument is a scalar, make sure it is B so A.shape/A.dtype are valid below
    if np.isscalar(A) and not np.isscalar(B):
        A, B = B, A
    if out is None:
        out = gp.empty(A.shape, dtype=A.dtype)
    if np.isscalar(B):
        A._base_shaped(1).maximum(B, target=out._base_shaped(1))
    else:
        A._base_shaped(1).maximum(B._base_shaped(1), target=out._base_shaped(1))
    return out
def divide(A, B, out):
    if out is None:
        out = gp.empty(A.shape, dtype=A.dtype)
    if np.isscalar(B):
        A._base_shaped(1).divide(B, target=out._base_shaped(1))
    elif A.shape == B.shape:
        A._base_shaped(1).divide(B._base_shaped(1), target=out._base_shaped(1))
    else:
        raise NotImplementedError("broadcasted division not implemented by cudamat")
    return out
def _multiply(A, B, out):
    if out is None:
        out = gp.empty(A.shape)
    if np.isscalar(B):
        A._base_shaped(1).mult(B, target=out._base_shaped(1))
    elif (B.ndim == 1 or B.shape[0] == 1) and B.size == A.shape[1]:
        A._base_shaped(1).mult_by_col(B._base_shaped(1), target=out._base_shaped(1))
    elif (B.ndim == 1 or B.shape[1] == 1) and B.size == A.shape[0]:
        A._base_shaped(1).mult_by_row(B._base_shaped(1), target=out._base_shaped(1))
    else:
        A._base_shaped(1).mult(B._base_shaped(1), target=out._base_shaped(1))
    return out
def min(A, axis, out):
    if A.ndim == 2:
        if out is None:
            out = gp.empty((A.shape[0], 1) if axis == 1 else (1, A.shape[1]), dtype=A.dtype)
        A._base_shaped(1).min(1 - axis, target=out._base_shaped(1))
        return out
    else:
        r = gp.min(A, axis)  # gnumpy has optimized min over 1D vectors, so use it
        if out is not None:
            assert out.size == 1
            out[:] = r[:]
        return r
def _add(A, B, out):
    if out is None:
        out = gp.empty(A.shape)
    if np.isscalar(B):
        A._base_shaped(1).add(B, target=out._base_shaped(1))
    elif (B.ndim == 1 or B.shape[0] == 1) and B.size == A.shape[1]:
        A._base_shaped(1).add_col_vec(B._base_shaped(1), target=out._base_shaped(1))
    elif (B.ndim == 1 or B.shape[1] == 1) and B.size == A.shape[0]:
        A._base_shaped(1).add_row_vec(B._base_shaped(1), target=out._base_shaped(1))
    else:
        A._base_shaped(1).add(B._base_shaped(1), target=out._base_shaped(1))
    return out
def sum(A, axis, out):
    if A.ndim == 2:
        if out is None:
            out = gp.empty((A.shape[0], 1) if axis == 1 else (1, A.shape[1]))
        cudamat.sum(A._base_shaped(1), 1 - axis, target=out._base_shaped(1))
        return out
    else:
        r = gp.sum(A, axis)  # gnumpy has optimized sum over 1D vectors, so use it
        if out is not None:
            assert out.size == 1
            out[:] = r[:]
        return r
def subtract(A, B, out):
    if out is None:
        out = gp.empty(A.shape, dtype=A.dtype)
    if np.isscalar(B):
        A._base_shaped(1).subtract(B, target=out._base_shaped(1))
    elif B.shape == A.shape:
        A._base_shaped(1).subtract(B._base_shaped(1), target=out._base_shaped(1))
    elif (B.ndim == 1 or B.shape[0] == 1) and (A.ndim == 1 or B.size == A.shape[1]):
        A._base_shaped(1).subtract_col_vec(B._base_shaped(1), target=out._base_shaped(1))
    elif (B.ndim == 1 or B.shape[1] == 1) and B.size == A.shape[0]:
        A._base_shaped(1).subtract_row_vec(B._base_shaped(1), target=out._base_shaped(1))
    else:
        raise Exception("unhandled case")
    return out
def _multiply(A, B, out):
    if out is None:
        out = gp.empty(A.shape, dtype=A.dtype)
    if np.isscalar(B):
        A._base_shaped(1).mult(B, target=out._base_shaped(1))
    elif B.shape == A.shape:
        A._base_shaped(1).mult(B._base_shaped(1), target=out._base_shaped(1))
    elif (B.ndim == 1 or B.shape[0] == 1) and B.size == A.shape[1]:
        A._base_shaped(1).mult_by_col(B._base_shaped(1), target=out._base_shaped(1))
    elif (B.ndim == 1 or B.shape[1] == 1) and B.size == A.shape[0]:
        A._base_shaped(1).mult_by_row(B._base_shaped(1), target=out._base_shaped(1))
    else:
        raise Exception("unhandled case")
    return out
def backward_prop(self, grad=None, grad_end=None):
    if grad is not None:
        T = grad.shape[0]
        assert T == self.H.shape[0]
        dH = grad.copy()
    else:
        T = self.H.shape[0]
        dH = gnp.zeros((T, self.H.shape[1]))
    if grad_end is not None:
        dH[-1] += gnp.as_garray(grad_end).ravel()
    dA = gnp.empty((dH.shape[0], dH.shape[1]))
    for t in range(1, T)[::-1]:
        dA[t] = self.nonlin.backward_prop(self.A[t], self.H[t]) * dH[t]
        dH[t - 1] += self.W_hh.dot(dA[t].reshape(-1, 1)).ravel()
    dA[0] = self.nonlin.backward_prop(self.A[0], self.H[0]) * dH[0]
    self.dW_hh += self.H[:-1].T.dot(dA[1:])
    if self.h_init is not None:
        self.dW_hh += self.h_init.reshape(-1, 1).dot(dA[0].reshape(1, -1))
    self.db += dA.sum(axis=0)
    if self.X is not None:
        dX = dA.dot(self.W_ih.T)
        self.dW_ih += self.X.T.dot(dA)
    else:
        dX = None
    if self.h_init is not None:
        self.dh_init = self.W_hh.dot(dA[0].reshape(-1, 1)).ravel()
    return dX
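# A companion numpy sketch of the backward pass through time implemented above,
# matching the rnn_forward sketch given earlier.  A tanh nonlinearity is assumed,
# so the local derivative is 1 - H**2; dH is dLoss/dH supplied by the layer above.
import numpy as np

def rnn_backward(X, W_ih, W_hh, H, dH, h_init=None):
    T = H.shape[0]
    dH = dH.copy()
    dA = np.empty_like(H)
    for t in range(T - 1, 0, -1):
        dA[t] = (1.0 - H[t] ** 2) * dH[t]
        dH[t - 1] += W_hh.dot(dA[t])          # gradient flowing to the previous step
    dA[0] = (1.0 - H[0] ** 2) * dH[0]
    dW_hh = H[:-1].T.dot(dA[1:])
    if h_init is not None:
        dW_hh += np.outer(h_init, dA[0])
    dW_ih = X.T.dot(dA)
    db = dA.sum(axis=0)
    dX = dA.dot(W_ih.T)
    return dX, dW_ih, dW_hh, db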
def subtract_nt(A, B, out):
    if out is None:
        out = gp.empty(A.shape, dtype=A.dtype)
    A._base_shaped(1).subtract_transpose(B._base_shaped(1), target=out._base_shaped(1))
    return out
def reciprocal(A, out):
    if out is None:
        out = gp.empty(A.shape, dtype=A.dtype)
    A._base_as_row().reciprocal(out._base_as_row())
    return out
def empty(shape, dtype):
    return gp.empty(shape, dtype=dtype)
def _unary(func, A, out):
    if out is None:
        out = gp.empty(A.shape, dtype=A.dtype)
    func(A._base_as_row(), target=out._base_as_row())
    return out
def dot_tn(A, B, out):
    if out is None:
        out = gp.empty((A.shape[1], B.shape[1]))
    cudamat.dot(B._base_as_2d(), A._base_as_2d().T, target=out._base_as_2d())
    return out
def empty(shape):
    return gp.empty(shape)
def costAndGrad(self, data, labels, key=None):
    """
    Forward prop entire utterance
    Call CTC cost function
    Compute gradient

    data is a 2-D matrix where each column is a single time frame
    Number of input frames changes across iterations

    labels is a vector of symbol ids, length unknown and does not
    depend on the number of time frames
    """
    ## forward prop
    T = data.shape[1]
    sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
    stackMax = len(self.stack) - 1
    if self.temporalLayer > 0:
        stackMax -= 1

    self.hActs = [gp.empty((s, T)) for s in sizes]
    self.hActs[0] = data
    # for t in range(T):
    i = 1
    for l in range(stackMax + 1):
        w, b = self.stack[l]
        self.hActs[i] = w.dot(self.hActs[i - 1]) + b
        # loop over time for recurrent layer
        if (self.temporalLayer - 1) == l:
            for t in range(T):
                if t > 0:
                    self.hActs[i][:, t] += self.stack[-1][0].dot(self.hActs[i][:, t - 1])
                # nonlinearity
                if i <= stackMax:
                    self.hActs[i][:, t] = self.activation(self.hActs[i][:, t])
        # hidden layer activation function for batch forward prop
        elif i <= stackMax:
            self.hActs[i] = self.activation(self.hActs[i])
        # w_t,b_t = self.stack[-1][0]
        # self.hActs[i][:,t] += self.stack[-1][0].dot(self.hActs[i][:,t-1])
        i += 1

    # convert final layer to probs after all time iteration complete
    probs = self.hActs[-1] - gp.max(self.hActs[-1], axis=0)
    probs = gp.as_numpy_array(probs)
    probs = np.exp(probs)
    probs = probs / np.sum(probs, axis=0)

    ## pass probs and label string to ctc loss
    # TODO how much does passing to different function cost us?
    cost, delta_output, skip = ctc.ctc_loss(probs, labels.squeeze(), blank=0)

    # Store probabilities and error signal for a given key
    if key is not None and key in self.hist:
        self.hist[key].append((probs, delta_output))

    if not self.train:
        return cost, None

    delta_output = gp.garray(delta_output)

    ## back prop through time
    # zero gradients
    self.grad = [[gp.zeros(w.shape), gp.zeros(b.shape)] for w, b in self.stack]
    if self.temporalLayer > 0:
        delta_t = np.zeros(self.layerSizes[self.temporalLayer - 1])
    for t in reversed(range(T)):
        # get delta from loss function
        delta = delta_output[:, t].T

        # compute gradient for output layer
        # print self.hActs[-2].shape, delta.shape, self.stack[stackMax][0].shape
        # print delta.reshape(-1,1).shape, self.hActs[-2][:,t].reshape(-1,1).shape
        # TODO can we get rid of some of these annoying reshape -1 1?
        self.grad[stackMax][0] += delta.reshape(-1, 1).dot(self.hActs[-2][:, t].reshape(-1, 1).T)
        self.grad[stackMax][1] += delta.reshape(-1, 1)

        # push delta through output layer
        delta = self.stack[stackMax][0].T.dot(delta)

        # iterate over lower layers
        i = len(self.layerSizes) - 1
        while i >= 0:
            # add the temporal delta if this is the recurrent layer
            if (self.temporalLayer - 1) == i:
                # print delta.shape, delta_t.shape
                delta += delta_t
            # push delta through activation function for this layer
            # print i, stackMax, delta.shape, self.hActs[i+1][:,t].shape
            delta = delta * self.activation(self.hActs[i + 1][:, t], True)
            # embed()

            # compute the gradient
            # print i, delta.shape, self.hActs[i][:,t].T.reshape(1,-1).shape, self.grad[i][0].shape
            self.grad[i][0] += delta.reshape(-1, 1).dot(self.hActs[i][:, t].T.reshape(1, -1))
            self.grad[i][1] += delta.reshape(-1, 1)

            # add the temporal delta if this is the recurrent layer
            if (self.temporalLayer - 1) == i and t > 0:
                self.grad[-1][0] += delta.reshape(-1, 1).dot(self.hActs[i + 1][:, t - 1].T.reshape(1, -1))
                # push delta through temporal connections
                delta_t = self.stack[-1][0].T.dot(delta)
                # HACK no bias for temporal layer. Give it a gradient of 0
                self.grad[-1][1] = np.zeros((2, 1))

            # push the delta downward
            w, b = self.stack[i]
            delta = w.T.dot(delta)
            i -= 1
    # print self.grad
    return cost, self.grad, skip
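# The forward pass above turns the final layer's activations into per-frame
# probabilities with a max-subtracted (numerically stable) softmax over each
# column before handing them to the CTC loss.  A standalone numpy sketch of
# just that normalization step:
import numpy as np

def column_softmax(acts):
    # acts: (numClasses, T) matrix of output activations, one column per frame
    shifted = acts - acts.max(axis=0)      # subtract each column's max for stability
    e = np.exp(shifted)
    return e / e.sum(axis=0)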
def dot(A, B, out):
    if out is None:
        out = gp.empty((A.shape[0], B.shape[1]), dtype=A.dtype)
    cudamat.dot(B._base_as_2d(), A._base_as_2d(), target=out._base_as_2d())
    return out
def reciprocal(A, out):
    if out is None:
        out = gp.empty(A.shape)
    A._base_as_row().reciprocal(out._base_as_row())
    return out
def transpose(A, out):
    if out is None:
        out = gp.empty((A.shape[1], A.shape[0]), dtype=A.dtype)
    A._base_shaped(1).transpose(out._base_shaped(1))
    return out
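# The wrappers in this section share one convention: each takes a preallocated
# `out` garray and writes the result into it, falling back to allocating a fresh
# buffer when `out` is None.  A hedged usage sketch (shapes are illustrative, and
# it assumes gnumpy is importable as gp, as the wrappers themselves do):
import gnumpy as gp

A = gp.rand(128, 64)
scratch = gp.empty((64, 128))        # reuse this buffer across iterations
At = transpose(A, scratch)           # writes A.T into `scratch` and returns it
At2 = transpose(A, None)             # allocates a new output instead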
def square(A, out):
    if out is None:
        out = gp.empty(A.shape)
    cudamat.square(A._base_as_row(), target=out._base_as_row())
    return out
def square(A, out):
    if out is None:
        out = gp.empty(A.shape, dtype=A.dtype)
    cudamat.square(A._base_as_row(), target=out._base_as_row())
    return out
def _unary(func, A, out):
    if out is None:
        out = gp.empty(A.shape)
    func(A._base_as_row(), target=out._base_as_row())
    return out