def costAndGrad(self,data,labels):
    """Forward/backward pass of a plain feed-forward net under a CTC loss.

    Runs forward prop on the GPU (cudamat), applies a numerically stable
    softmax to the final layer, computes the CTC cost and output deltas on
    the host, then backprops through every layer, writing weight/bias
    gradients into ``self.grad``.

    data   : 2-D array; columns are processed as a batch of size
             ``data.shape[1]`` (frames -- assumed one column per time
             frame, consistent with ctc_loss; TODO confirm with caller).
    labels : target symbol ids passed straight to ``ctc.ctc_loss``
             (blank symbol id is 0).

    Returns (cost, self.grad, skip) where ``skip`` is the bad-utterance
    flag from ``ctc.ctc_loss``.
    """
    batchSize = data.shape[1]
    # Re-slice the preallocated device buffers to this utterance's width.
    self.setViews(batchSize)

    # forward prop
    self.hActs[0].assign(cm.CUDAMatrix(data))
    i = 1
    for w,b in self.stack:
        cm.dot(w,self.hActs[i-1],self.hActs[i])
        self.hActs[i].add_col_vec(b)
        # Apply the nonlinearity to hidden layers only (not the output layer).
        if i <= len(self.layerSizes):
            # hard relu
            self.hActs[i].maximum(0.0)
        i += 1

    # Subtract max activation (per column) so exp() below cannot overflow.
    self.hActs[-1].max(axis=0,target=self.rowVec)
    self.hActs[-1].add_row_mult(self.rowVec,-1.0,target=self.probs)

    # Softmax: exponentiate, then multiply each column by the reciprocal
    # of its sum.
    cm.exp(self.probs)
    self.probs.sum(axis=0,target=self.rowVec)
    cm.pow(self.rowVec,-1.0,target=self.rowVec)
    self.probs.mult_by_row(self.rowVec)

    # CTC runs on the CPU: pull probabilities back to host memory.
    self.probs.copy_to_host()
    cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64), labels,blank=0)
    # NOTE(review): deltas are copied to the device even when skip is set
    # below -- harmless but wasted work on skipped utterances.
    self.deltasC.assign(cm.CUDAMatrix(deltas))

    if skip:
        return cost,self.grad,skip

    # back prop
    nl = len(self.layerSizes)
    i = nl
    # Ping-pong between two preallocated delta buffers to avoid allocation.
    deltasIn,deltasOut = self.deltasC,self.deltasOut
    for w,b in reversed(self.stack):
        # compute gradient
        cm.dot(deltasIn,self.hActs[i].T,target=self.grad[i][0])
        deltasIn.sum(axis=1,target=self.grad[i][1])

        # compute next layer deltas
        if i > 0:
            # sign() of a ReLU output is its 0/1 derivative mask
            # (hActs[i] was clamped to >= 0 in the forward pass).
            self.hActs[i].sign(target=self.tmpGrad)
            cm.dot(w.T,deltasIn,target=deltasOut)
            deltasOut.mult(self.tmpGrad)

        if i == nl:
            # After the top layer, stop using deltasC as a buffer and
            # bring the second scratch buffer into the rotation.
            deltasIn = self.deltasIn

        deltasIn,deltasOut = deltasOut,deltasIn
        i -= 1

    return cost,self.grad,skip
def costAndGrad(self,data,labels=None, sentence=None):
    """Forward/backward pass of a bidirectional-recurrent net + CTC (cudamat).

    The last two entries of ``self.stack`` hold the forward (wtf) and
    backward (wtb) recurrent weight matrices for layer
    ``self.temporalLayer``; the rest are ordinary feed-forward layers.
    Optionally adds an L2 penalty (``self.reg``) to the cost and gradients.

    data     : 2-D array, one column per time frame (T = data.shape[1]).
    labels   : target symbol ids for ctc.ctc_loss (required when
               self.train is True).
    sentence : unused here -- presumably kept for interface compatibility
               with callers; TODO confirm.

    Returns (cost, self.grad, skip) in training mode, or the host
    probability matrix in evaluation mode.
    """
    T = data.shape[1]
    self.setViews(T)

    if self.temporalLayer > 0:
        # Split off the recurrent weights; gradients mirror the split.
        stack = self.stack[:-2]
        wtf,_ = self.stack[-2]
        wtb,_ = self.stack[-1]
        if self.train:
            grad = self.grad[:-2]
            dwtf,_ = self.grad[-2]
            dwtb,_ = self.grad[-1]
    else:
        stack = self.stack
        if self.train:
            grad = self.grad

    # forward prop
    #TODO copy to device here
    self.hActs[0].assign(cm.CUDAMatrix(data))

    i = 1
    for w,b in stack:
        cm.dot(w,self.hActs[i-1],self.hActs[i])
        self.hActs[i].add_col_vec(b)
        # forward prop through time
        if i == self.temporalLayer:
            self.hActsFor.assign(self.hActs[i])
            self.hActsBack.assign(self.hActs[i])
            # minmax presumably clamps the given column into
            # [0, maxAct] (clipped ReLU) -- custom cudamat kernel,
            # confirm against the extension source.
            self.hActsFor.minmax(0.0,self.maxAct,col=0)
            self.hActsBack.minmax(0.0,self.maxAct,col=T-1)
            for t in xrange(1,T):
                # Forward direction accumulates left-to-right,
                # backward direction right-to-left (beta=1.0 adds the
                # recurrent term onto the pre-activation).
                cm.mvdot_col_slice(wtf,self.hActsFor,t-1,self.hActsFor,t,beta=1.0)
                self.hActsFor.minmax(0.0,self.maxAct,col=t)
                cm.mvdot_col_slice(wtb,self.hActsBack,T-t,self.hActsBack,T-t-1,beta=1.0)
                self.hActsBack.minmax(0.0,self.maxAct,col=T-t-1)
            # Bidirectional output is the sum of both directions.
            self.hActsFor.add(self.hActsBack,target=self.hActs[i])

        if i <= self.numLayers and i != self.temporalLayer:
            # hard relu
            self.hActs[i].maximum(0.0)
        i += 1

    # Subtract max activation (numerical stability for the softmax).
    self.hActs[-1].max(axis=0,target=self.rowVec)
    self.hActs[-1].add_row_mult(self.rowVec,-1.0,target=self.probs)

    # Softmax
    cm.exp(self.probs)
    self.probs.sum(axis=0,target=self.rowVec)
    cm.pow(self.rowVec,-1.0,target=self.rowVec)
    self.probs.mult_by_row(self.rowVec)

    self.probs.copy_to_host()
    if not self.train:
        probs = self.probs.numpy_array
        return probs

    cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64), labels,blank=0)

    # L2 regularization on all weight matrices (temporal ones included).
    if self.reg > 0:
        self.regcost = 0.0
        for w, b in self.stack:
            rc = (self.reg / 2.0) * (w.euclid_norm() ** 2)
            self.regcost += rc
            cost = cost + rc

    if skip:
        return cost,self.grad,skip

    self.deltasC.assign(cm.CUDAMatrix(deltas))

    # back prop
    i = self.numLayers
    # Ping-pong between preallocated delta buffers.
    deltasIn,deltasOut = self.deltasC,self.deltasOut
    for w,b in reversed(stack):
        # compute gradient
        # gradient for w
        cm.dot(deltasIn,self.hActs[i].T,target=grad[i][0])
        if self.reg > 0:
            grad[i][0].add_mult(w, alpha=self.reg)
        # gradient for b
        deltasIn.sum(axis=1,target=grad[i][1])

        # compute next layer deltas
        if i > 0:
            cm.dot(w.T,deltasIn,target=deltasOut)

        # backprop through time
        if i == self.temporalLayer:
            # within() presumably builds the 0/1 derivative mask of the
            # clipped ReLU (1 where 0 < act < maxAct) -- confirm against
            # the cudamat extension.
            self.hActsFor.within(0.0,self.maxAct,target=self.tmpGradFor)
            self.hActsBack.within(0.0,self.maxAct,target=self.tmpGradBack)
            self.deltasFor.assign(deltasOut)
            self.deltasBack.assign(deltasOut)
            self.deltasFor.mult_slice(T-1,self.tmpGradFor,T-1)
            self.deltasBack.mult_slice(0,self.tmpGradBack,0)
            for t in xrange(1,T):
                # Add in temporal delta
                cm.mvdot_col_slice(wtf.T,self.deltasFor,T-t, self.deltasFor,T-t-1,beta=1.0)
                cm.mvdot_col_slice(wtb.T,self.deltasBack,t-1, self.deltasBack,t,beta=1.0)
                # Push through activation fn
                self.deltasFor.mult_slice(T-t-1,self.tmpGradFor,T-t-1)
                self.deltasBack.mult_slice(t,self.tmpGradBack,t)

            # Accumulate temporal gradient
            cm.dot(self.deltasFor.get_col_slice(1,T), self.hActsFor.get_col_slice(0,T-1).T,target=dwtf)
            cm.dot(self.deltasBack.get_col_slice(0,T-1), self.hActsBack.get_col_slice(1,T).T,target=dwtb)

            # Accumulate next layer deltas
            self.deltasFor.add(self.deltasBack,target=deltasOut)

        if i > 0 and i != self.temporalLayer:
            # ReLU derivative mask for ordinary layers.
            self.hActs[i].sign(target=self.tmpGrad)
            deltasOut.mult(self.tmpGrad)

        if i == self.numLayers:
            # Retire deltasC from the rotation after the top layer.
            deltasIn = self.deltasIn

        deltasIn,deltasOut = deltasOut,deltasIn
        i -= 1

    # Regularization gradient for the temporal weights (the feed-forward
    # weights got theirs inside the loop above).
    if self.reg > 0:
        if self.temporalLayer > 0:
            dwtf.add_mult(wtf, alpha=self.reg)
            dwtb.add_mult(wtb, alpha=self.reg)

    return cost,self.grad,skip
def costAndGrad(self, data, labels):
    """CPU (numpy) forward/backward pass of a bidirectional-recurrent net + CTC.

    Mirror of the cudamat implementation: the last two stack entries are
    the forward (wtf) and backward (wtb) recurrent weights for layer
    ``self.temporalLayer``. The recurrent nonlinearity here is a plain
    hard ReLU (values <= 0 zeroed), not the clipped version.

    data   : 2-D array, one column per time frame (T = data.shape[1]).
    labels : target symbol ids for ctc.ctc_loss (blank id 0).

    Returns (cost, self.grad, skip).
    """
    T = data.shape[1]

    # forward prop
    self.hActs[0] = data
    if self.temporalLayer > 0:
        stack = self.stack[:-2]
        wtf, _ = self.stack[-2]
        wtb, _ = self.stack[-1]
        grad = self.grad[:-2]
        dwtf, _ = self.grad[-2]
        dwtb, _ = self.grad[-1]
    else:
        stack = self.stack
        grad = self.grad

    i = 1
    for w, b in stack:
        self.hActs[i] = np.dot(w, self.hActs[i - 1])
        self.hActs[i] += b
        # forward prop through time
        if i == self.temporalLayer:
            # Keep the pre-activations; each direction adds its own
            # recurrent term on top of them.
            preActs = np.array(self.hActs[i])
            actsForward = np.empty(preActs.shape)
            actsForward[:, 0] = preActs[:, 0]
            actsForward[preActs[:, 0] <= 0, 0] = 0.0
            actsBackward = np.empty(preActs.shape)
            actsBackward[:, -1] = preActs[:, -1]
            actsBackward[preActs[:, -1] <= 0, -1] = 0.0
            for t in xrange(1, T):
                actsForward[:, t] = np.dot(
                    wtf, actsForward[:, t - 1]) + preActs[:, t]
                actsBackward[:, -t - 1] = np.dot(
                    wtb, actsBackward[:, -t]) + preActs[:, -t - 1]
                # Hard ReLU applied in place after both updates.
                actsForward[actsForward[:, t] <= 0, t] = 0.0
                actsBackward[actsBackward[:, -t - 1] <= 0, -t - 1] = 0.0
            # Bidirectional output is the sum of both directions.
            self.hActs[i][:] = actsForward + actsBackward

        if i <= self.numLayers and i != self.temporalLayer:
            # hard relu
            self.hActs[i][self.hActs[i] < 0.0] = 0.0
        i += 1

    # Subtract max activation (per column, for softmax stability).
    probs = self.hActs[-1] - self.hActs[-1].max(axis=0)[None, :]

    # Softmax
    probs = np.exp(probs)
    probs /= probs.sum(axis=0)[None, :]

    # ctc_loss appears to want column-major storage, hence asfortranarray.
    cost, deltasC, skip = ctc.ctc_loss(np.asfortranarray(probs), labels, blank=0)
    if skip:
        return cost, self.grad, skip

    # back prop
    i = self.numLayers
    # Buffers are rebuilt by np.dot each layer; the None swap below just
    # reproduces the device version's ping-pong bookkeeping.
    self.deltasOut = None
    self.deltasIn = None
    deltasIn, deltasOut = deltasC, self.deltasOut
    for w, b in reversed(stack):
        # compute gradient
        grad[i][0] = np.dot(deltasIn, self.hActs[i].T)
        grad[i][1] = deltasIn.sum(axis=1)[:, None]

        # compute next layer deltas
        if i > 0:
            deltasOut = np.dot(w.T, deltasIn)

        # backprop through time
        if i == self.temporalLayer:
            # sign() of the clamped (>= 0) activations is the 0/1 ReLU
            # derivative mask.
            tmpGradF = np.sign(actsForward)
            tmpGradB = np.sign(actsBackward)
            deltasForward = np.array(deltasOut)
            deltasForward[:, -1] *= tmpGradF[:, -1]
            deltasBackward = np.array(deltasOut)
            deltasBackward[:, 0] *= tmpGradB[:, 0]
            for t in xrange(1, T):
                # Each direction's delta picks up the recurrent term from
                # its already-processed neighbor, then goes through the
                # activation mask.
                deltasForward[:, -t - 1] = tmpGradF[:, -t - 1] * (
                    deltasForward[:, -t - 1] + np.dot(wtf.T, deltasForward[:, -t]))
                deltasBackward[:, t] = tmpGradB[:, t] * (
                    deltasBackward[:, t] + np.dot(wtb.T, deltasBackward[:, t - 1]))

            # Compute temporal gradient
            dwtb[:] = np.dot(deltasBackward[:, :-1], actsBackward[:, 1:].T)
            dwtf[:] = np.dot(deltasForward[:, 1:], actsForward[:, :-1].T)

            deltasOut = deltasForward + deltasBackward

        if i > 0 and i != self.temporalLayer:
            tmpGrad = np.sign(self.hActs[i])
            deltasOut *= tmpGrad

        if i == self.numLayers:
            deltasIn = self.deltasIn

        deltasIn, deltasOut = deltasOut, deltasIn
        i -= 1

    return cost, self.grad, skip
def costAndGrad(self,data,labels=None):
    """Forward/backward pass of a unidirectional-recurrent net + CTC (cudamat).

    The last stack entry is the recurrent weight matrix ``wt`` for layer
    ``self.temporalLayer``; the recurrent nonlinearity is a clipped ReLU
    in [0, self.maxAct].

    data   : 2-D array, one column per time frame (T = data.shape[1]).
    labels : target symbol ids (required when self.train is True).

    Returns (cost, self.grad, skip) in training mode; in evaluation mode
    returns the best-path CTC decode instead.
    """
    T = data.shape[1]
    self.setViews(T)
    if self.temporalLayer > 0:
        stack = self.stack[:-1]
        wt,_ = self.stack[-1]
        if self.train:
            grad = self.grad[:-1]
            dwt,_ = self.grad[-1]
    else:
        stack = self.stack
        if self.train:
            grad = self.grad

    # forward prop
    self.hActs[0].assign(cm.CUDAMatrix(data))

    i = 1
    for w,b in stack:
        cm.dot(w,self.hActs[i-1],self.hActs[i])
        self.hActs[i].add_col_vec(b)
        # forward prop through time
        if i == self.temporalLayer:
            for t in xrange(1,T):
                # Clamp column t-1 into [0, maxAct], then add the
                # recurrent contribution onto column t (beta=1.0).
                self.hActs[i].minmax(0.0,self.maxAct,col=t-1)
                cm.mvdot_col_slice(wt,self.hActs[i],t-1,self.hActs[i],t,beta=1.0)
            # Last column never got clamped inside the loop.
            self.hActs[i].minmax(0.0,self.maxAct,col=T-1)
        if i <= self.numLayers and i != self.temporalLayer:
            # hard relu
            self.hActs[i].maximum(0.0)
        i += 1

    # Subtract max activation (softmax numerical stability).
    self.hActs[-1].max(axis=0,target=self.rowVec)
    self.hActs[-1].add_row_mult(self.rowVec,-1.0,target=self.probs)

    # Softmax
    cm.exp(self.probs)
    self.probs.sum(axis=0,target=self.rowVec)
    cm.pow(self.rowVec,-1.0,target=self.rowVec)
    self.probs.mult_by_row(self.rowVec)

    self.probs.copy_to_host()
    if not self.train:
        return ctc.decode_best_path(self.probs.numpy_array.astype(np.float64))

    cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64), labels,blank=0)
    if skip:
        return cost,self.grad,skip
    self.deltasC.assign(cm.CUDAMatrix(deltas))

    # back prop
    i = self.numLayers
    deltasIn,deltasOut = self.deltasC,self.deltasOut
    for w,b in reversed(stack):
        # compute gradient
        cm.dot(deltasIn,self.hActs[i].T,target=grad[i][0])
        deltasIn.sum(axis=1,target=grad[i][1])

        # compute next layer deltas
        if i > 0:
            cm.dot(w.T,deltasIn,target=deltasOut)

        # backprop through time
        if i == self.temporalLayer:
            # within() presumably yields the 0/1 derivative mask of the
            # clipped ReLU -- confirm against the cudamat extension.
            self.hActs[i].within(0.0,self.maxAct,target=self.tmpGrad)
            self.deltaTemp.assign(0.0)
            for t in xrange(T-1,0,-1):
                # Add in temporal delta
                cm.mvdot_col_slice(wt.T,self.deltaTemp,t,deltasOut,t,beta=1.0)
                # Push through activation fn
                deltasOut.mult_slice(t,self.tmpGrad,t)
                # Stage the processed delta for the next (earlier) step.
                self.deltaTemp.set_single_col(t-1,deltasOut,t)
            # Accumulate temporal gradient
            cm.dot(self.deltaTemp,self.hActs[i].T, target=dwt)
            # Finish column 0, which the loop above never reaches.
            cm.mvdot_col_slice(wt.T,self.deltaTemp,0,deltasOut,0,beta=1.0)
            deltasOut.mult_slice(0,self.tmpGrad,0)

        if i > 0 and i != self.temporalLayer:
            # ReLU derivative mask for ordinary layers.
            self.hActs[i].sign(target=self.tmpGrad)
            deltasOut.mult(self.tmpGrad)

        if i == self.numLayers:
            # Retire deltasC from the buffer rotation after the top layer.
            deltasIn = self.deltasIn
        deltasIn,deltasOut = deltasOut,deltasIn
        i -= 1

    return cost,self.grad,skip
def costAndGrad(self, data, labels=None):
    """Forward/backward pass of a unidirectional-recurrent net + CTC (cudamat).

    Formatted duplicate of the variant above: the last stack entry is the
    recurrent weight matrix ``wt`` for layer ``self.temporalLayer``, and
    the recurrent nonlinearity is a clipped ReLU in [0, self.maxAct].

    data   : 2-D array, one column per time frame (T = data.shape[1]).
    labels : target symbol ids (required when self.train is True).

    Returns (cost, self.grad, skip) in training mode; in evaluation mode
    returns the best-path CTC decode instead.
    """
    T = data.shape[1]
    self.setViews(T)
    if self.temporalLayer > 0:
        stack = self.stack[:-1]
        wt, _ = self.stack[-1]
        if self.train:
            grad = self.grad[:-1]
            dwt, _ = self.grad[-1]
    else:
        stack = self.stack
        if self.train:
            grad = self.grad

    # forward prop
    self.hActs[0].assign(cm.CUDAMatrix(data))

    i = 1
    for w, b in stack:
        cm.dot(w, self.hActs[i - 1], self.hActs[i])
        self.hActs[i].add_col_vec(b)
        # forward prop through time
        if i == self.temporalLayer:
            for t in xrange(1, T):
                # Clamp column t-1 into [0, maxAct], then add the
                # recurrent contribution onto column t (beta=1.0).
                self.hActs[i].minmax(0.0, self.maxAct, col=t - 1)
                cm.mvdot_col_slice(wt, self.hActs[i], t - 1,
                                   self.hActs[i], t, beta=1.0)
            # Last column never got clamped inside the loop.
            self.hActs[i].minmax(0.0, self.maxAct, col=T - 1)
        if i <= self.numLayers and i != self.temporalLayer:
            # hard relu
            self.hActs[i].maximum(0.0)
        i += 1

    # Subtract max activation (softmax numerical stability).
    self.hActs[-1].max(axis=0, target=self.rowVec)
    self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

    # Softmax
    cm.exp(self.probs)
    self.probs.sum(axis=0, target=self.rowVec)
    cm.pow(self.rowVec, -1.0, target=self.rowVec)
    self.probs.mult_by_row(self.rowVec)

    self.probs.copy_to_host()
    if not self.train:
        return ctc.decode_best_path(
            self.probs.numpy_array.astype(np.float64))

    cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(
        np.float64), labels, blank=0)
    if skip:
        return cost, self.grad, skip
    self.deltasC.assign(cm.CUDAMatrix(deltas))

    # back prop
    i = self.numLayers
    deltasIn, deltasOut = self.deltasC, self.deltasOut
    for w, b in reversed(stack):
        # compute gradient
        cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0])
        deltasIn.sum(axis=1, target=grad[i][1])

        # compute next layer deltas
        if i > 0:
            cm.dot(w.T, deltasIn, target=deltasOut)

        # backprop through time
        if i == self.temporalLayer:
            # within() presumably yields the 0/1 derivative mask of the
            # clipped ReLU -- confirm against the cudamat extension.
            self.hActs[i].within(0.0, self.maxAct, target=self.tmpGrad)
            self.deltaTemp.assign(0.0)
            for t in xrange(T - 1, 0, -1):
                # Add in temporal delta
                cm.mvdot_col_slice(wt.T, self.deltaTemp, t, deltasOut,
                                   t, beta=1.0)
                # Push through activation fn
                deltasOut.mult_slice(t, self.tmpGrad, t)
                # Stage the processed delta for the next (earlier) step.
                self.deltaTemp.set_single_col(t - 1, deltasOut, t)
            # Accumulate temporal gradient
            cm.dot(self.deltaTemp, self.hActs[i].T, target=dwt)
            # Finish column 0, which the loop above never reaches.
            cm.mvdot_col_slice(wt.T, self.deltaTemp, 0, deltasOut, 0,
                               beta=1.0)
            deltasOut.mult_slice(0, self.tmpGrad, 0)

        if i > 0 and i != self.temporalLayer:
            # ReLU derivative mask for ordinary layers.
            self.hActs[i].sign(target=self.tmpGrad)
            deltasOut.mult(self.tmpGrad)

        if i == self.numLayers:
            # Retire deltasC from the buffer rotation after the top layer.
            deltasIn = self.deltasIn
        deltasIn, deltasOut = deltasOut, deltasIn
        i -= 1

    return cost, self.grad, skip
def costAndGrad(self,data,labels=None,key=None): """ Forward prop entire utterance Call CTC cost function Compute gradient data is a 2-D matrix where each column is a single time frame Number of input frames changes across iterations labels is a vector of symbol ids, length unknown and does not depend on the number of time frames """ ## forward prop # this is the same as minibatch forward prop # since we pre-compute context window features for each time self.hActs[0] = data i = 1 for w,b in self.stack: self.hActs[i] = w.dot(self.hActs[i-1])+b if i <= len(self.layerSizes): self.hActs[i] = self.activation(self.hActs[i]) i += 1 probs = self.hActs[-1]-gp.max(self.hActs[-1],axis=0) probs = gp.as_numpy_array(probs) probs = np.exp(probs) probs = probs/np.sum(probs,axis=0) # probs[probs<1e-12] = 1e-12 # TODO have to clamp? ## pass probs and label string to ctc loss # TODO how much does passing to different function cost us? if not self.train: return ctc.decode_best_path(probs, ref=labels, blank=0) #return ctc.decode_bp_bigrams(probs, blank=0, B=None) cost, self.deltas[-1], skip = ctc.ctc_loss(probs, labels, blank=0) # Bad utterance ? if skip: return cost,self.grad,skip # Store probabilities and error signal for a given key #if key is not None and key in self.hist: # self.hist[key].append((probs,self.deltas[-1])) self.deltas[-1] = gp.garray(self.deltas[-1]) # back prop i = len(self.layerSizes)-1 for w,b in reversed(self.stack[1:]): grad = self.activation(self.hActs[i+1], True) self.deltas[i] = w.T.dot(self.deltas[i+1])*grad i -= 1 # compute gradients # NOTE we do not divide by utterance length. # Will need to scale up weight norm penalty accordingly for i in range(len(self.grad)): self.grad[i][0] = self.deltas[i].dot(self.hActs[i].T) self.grad[i][1] = gp.sum(self.deltas[i],axis=1).reshape(-1,1) return cost,self.grad,skip
def costAndGrad(self, data, labels=None, sentence=None):
    """Forward/backward pass of a bidirectional-recurrent net + CTC (cudamat).

    Formatted duplicate of the earlier bidirectional variant: the last
    two stack entries are the forward (wtf) and backward (wtb) recurrent
    weights for layer ``self.temporalLayer``; optional L2 penalty via
    ``self.reg``.

    data     : 2-D array, one column per time frame (T = data.shape[1]).
    labels   : target symbol ids (required when self.train is True).
    sentence : unused here -- presumably kept for interface
               compatibility; TODO confirm.

    Returns (cost, self.grad, skip) in training mode, or the host
    probability matrix in evaluation mode.
    """
    T = data.shape[1]
    self.setViews(T)

    if self.temporalLayer > 0:
        # Split off the recurrent weights; gradients mirror the split.
        stack = self.stack[:-2]
        wtf, _ = self.stack[-2]
        wtb, _ = self.stack[-1]
        if self.train:
            grad = self.grad[:-2]
            dwtf, _ = self.grad[-2]
            dwtb, _ = self.grad[-1]
    else:
        stack = self.stack
        if self.train:
            grad = self.grad

    # forward prop
    #TODO copy to device here
    self.hActs[0].assign(cm.CUDAMatrix(data))

    i = 1
    for w, b in stack:
        cm.dot(w, self.hActs[i - 1], self.hActs[i])
        self.hActs[i].add_col_vec(b)
        # forward prop through time
        if i == self.temporalLayer:
            self.hActsFor.assign(self.hActs[i])
            self.hActsBack.assign(self.hActs[i])
            # minmax presumably clamps the column into [0, maxAct]
            # (clipped ReLU) -- confirm against the cudamat extension.
            self.hActsFor.minmax(0.0, self.maxAct, col=0)
            self.hActsBack.minmax(0.0, self.maxAct, col=T - 1)
            for t in xrange(1, T):
                # Forward direction runs left-to-right, backward
                # right-to-left; beta=1.0 adds onto the pre-activation.
                cm.mvdot_col_slice(wtf, self.hActsFor, t - 1,
                                   self.hActsFor, t, beta=1.0)
                self.hActsFor.minmax(0.0, self.maxAct, col=t)
                cm.mvdot_col_slice(wtb, self.hActsBack, T - t,
                                   self.hActsBack, T - t - 1, beta=1.0)
                self.hActsBack.minmax(0.0, self.maxAct, col=T - t - 1)
            # Bidirectional output is the sum of both directions.
            self.hActsFor.add(self.hActsBack, target=self.hActs[i])

        if i <= self.numLayers and i != self.temporalLayer:
            # hard relu
            self.hActs[i].maximum(0.0)
        i += 1

    # Subtract max activation (softmax numerical stability).
    self.hActs[-1].max(axis=0, target=self.rowVec)
    self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

    # Softmax
    cm.exp(self.probs)
    self.probs.sum(axis=0, target=self.rowVec)
    cm.pow(self.rowVec, -1.0, target=self.rowVec)
    self.probs.mult_by_row(self.rowVec)

    self.probs.copy_to_host()
    if not self.train:
        probs = self.probs.numpy_array
        return probs

    cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(
        np.float64), labels, blank=0)

    # L2 regularization on all weight matrices (temporal ones included).
    if self.reg > 0:
        self.regcost = 0.0
        for w, b in self.stack:
            rc = (self.reg / 2.0) * (w.euclid_norm()**2)
            self.regcost += rc
            cost = cost + rc

    if skip:
        return cost, self.grad, skip

    self.deltasC.assign(cm.CUDAMatrix(deltas))

    # back prop
    i = self.numLayers
    # Ping-pong between preallocated delta buffers.
    deltasIn, deltasOut = self.deltasC, self.deltasOut
    for w, b in reversed(stack):
        # compute gradient
        # gradient for w
        cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0])
        if self.reg > 0:
            grad[i][0].add_mult(w, alpha=self.reg)
        # gradient for b
        deltasIn.sum(axis=1, target=grad[i][1])

        # compute next layer deltas
        if i > 0:
            cm.dot(w.T, deltasIn, target=deltasOut)

        # backprop through time
        if i == self.temporalLayer:
            # within() presumably builds the 0/1 derivative mask of the
            # clipped ReLU -- confirm against the cudamat extension.
            self.hActsFor.within(0.0, self.maxAct, target=self.tmpGradFor)
            self.hActsBack.within(0.0, self.maxAct,
                                  target=self.tmpGradBack)
            self.deltasFor.assign(deltasOut)
            self.deltasBack.assign(deltasOut)
            self.deltasFor.mult_slice(T - 1, self.tmpGradFor, T - 1)
            self.deltasBack.mult_slice(0, self.tmpGradBack, 0)
            for t in xrange(1, T):
                # Add in temporal delta
                cm.mvdot_col_slice(wtf.T, self.deltasFor, T - t,
                                   self.deltasFor, T - t - 1, beta=1.0)
                cm.mvdot_col_slice(wtb.T, self.deltasBack, t - 1,
                                   self.deltasBack, t, beta=1.0)
                # Push through activation fn
                self.deltasFor.mult_slice(T - t - 1, self.tmpGradFor,
                                          T - t - 1)
                self.deltasBack.mult_slice(t, self.tmpGradBack, t)

            # Accumulate temporal gradient
            cm.dot(self.deltasFor.get_col_slice(1, T),
                   self.hActsFor.get_col_slice(0, T - 1).T, target=dwtf)
            cm.dot(self.deltasBack.get_col_slice(0, T - 1),
                   self.hActsBack.get_col_slice(1, T).T, target=dwtb)

            # Accumulate next layer deltas
            self.deltasFor.add(self.deltasBack, target=deltasOut)

        if i > 0 and i != self.temporalLayer:
            # ReLU derivative mask for ordinary layers.
            self.hActs[i].sign(target=self.tmpGrad)
            deltasOut.mult(self.tmpGrad)

        if i == self.numLayers:
            # Retire deltasC from the rotation after the top layer.
            deltasIn = self.deltasIn

        deltasIn, deltasOut = deltasOut, deltasIn
        i -= 1

    # Regularization gradient for the temporal weights.
    if self.reg > 0:
        if self.temporalLayer > 0:
            dwtf.add_mult(wtf, alpha=self.reg)
            dwtb.add_mult(wtb, alpha=self.reg)

    return cost, self.grad, skip
def costAndGrad(self,data,labels):
    """CPU (numpy) forward/backward pass of a bidirectional-recurrent net + CTC.

    Duplicate of the earlier numpy variant: the last two stack entries
    are the forward (wtf) and backward (wtb) recurrent weights for layer
    ``self.temporalLayer``; the recurrent nonlinearity is a hard ReLU.

    data   : 2-D array, one column per time frame (T = data.shape[1]).
    labels : target symbol ids for ctc.ctc_loss (blank id 0).

    Returns (cost, self.grad, skip).
    """
    T = data.shape[1]

    # forward prop
    self.hActs[0] = data
    if self.temporalLayer > 0:
        stack = self.stack[:-2]
        wtf,_ = self.stack[-2]
        wtb,_ = self.stack[-1]
        grad = self.grad[:-2]
        dwtf,_ = self.grad[-2]
        dwtb,_ = self.grad[-1]
    else:
        stack = self.stack
        grad = self.grad

    i = 1
    for w,b in stack:
        self.hActs[i] = np.dot(w,self.hActs[i-1])
        self.hActs[i] += b
        # forward prop through time
        if i == self.temporalLayer:
            # Keep pre-activations; each direction adds its recurrent
            # term on top of them.
            preActs = np.array(self.hActs[i])
            actsForward = np.empty(preActs.shape)
            actsForward[:,0] = preActs[:,0]
            actsForward[preActs[:,0]<=0,0] = 0.0
            actsBackward = np.empty(preActs.shape)
            actsBackward[:,-1] = preActs[:,-1]
            actsBackward[preActs[:,-1]<=0,-1] = 0.0
            for t in xrange(1,T):
                actsForward[:,t] = np.dot(wtf,actsForward[:,t-1]) + preActs[:,t]
                actsBackward[:,-t-1] = np.dot(wtb,actsBackward[:,-t]) + preActs[:,-t-1]
                # Hard ReLU applied in place after both updates.
                actsForward[actsForward[:,t]<=0,t] = 0.0
                actsBackward[actsBackward[:,-t-1]<=0,-t-1] = 0.0
            # Bidirectional output is the sum of both directions.
            self.hActs[i][:] = actsForward + actsBackward

        if i <= self.numLayers and i != self.temporalLayer:
            # hard relu
            self.hActs[i][self.hActs[i]<0.0] = 0.0
        i += 1

    # Subtract max activation (per column, for softmax stability).
    probs = self.hActs[-1] - self.hActs[-1].max(axis=0)[None,:]

    # Softmax
    probs = np.exp(probs)
    probs /= probs.sum(axis=0)[None,:]

    # ctc_loss appears to want column-major storage, hence asfortranarray.
    cost, deltasC, skip = ctc.ctc_loss(np.asfortranarray(probs),labels,blank=0)
    if skip:
        return cost,self.grad,skip

    # back prop
    i = self.numLayers
    # Buffers are rebuilt by np.dot each layer; the None swap mirrors the
    # device version's ping-pong bookkeeping.
    self.deltasOut = None
    self.deltasIn = None
    deltasIn,deltasOut = deltasC,self.deltasOut
    for w,b in reversed(stack):
        # compute gradient
        grad[i][0] = np.dot(deltasIn,self.hActs[i].T)
        grad[i][1] = deltasIn.sum(axis=1)[:,None]

        # compute next layer deltas
        if i > 0:
            deltasOut = np.dot(w.T,deltasIn)

        # backprop through time
        if i == self.temporalLayer:
            # sign() of the clamped (>= 0) activations is the 0/1 ReLU
            # derivative mask.
            tmpGradF = np.sign(actsForward)
            tmpGradB = np.sign(actsBackward)
            deltasForward = np.array(deltasOut)
            deltasForward[:,-1] *= tmpGradF[:,-1]
            deltasBackward = np.array(deltasOut)
            deltasBackward[:,0] *= tmpGradB[:,0]
            for t in xrange(1,T):
                # Each direction's delta picks up the recurrent term from
                # its already-processed neighbor, then the mask.
                deltasForward[:,-t-1] = tmpGradF[:,-t-1]*(deltasForward[:,-t-1]+np.dot(wtf.T,deltasForward[:,-t]))
                deltasBackward[:,t] = tmpGradB[:,t]*(deltasBackward[:,t]+np.dot(wtb.T,deltasBackward[:,t-1]))

            # Compute temporal gradient
            dwtb[:] = np.dot(deltasBackward[:,:-1],actsBackward[:,1:].T)
            dwtf[:] = np.dot(deltasForward[:,1:],actsForward[:,:-1].T)

            deltasOut = deltasForward + deltasBackward

        if i > 0 and i != self.temporalLayer:
            tmpGrad = np.sign(self.hActs[i])
            deltasOut *= tmpGrad

        if i == self.numLayers:
            deltasIn = self.deltasIn

        deltasIn,deltasOut = deltasOut,deltasIn
        i -= 1

    return cost,self.grad,skip
def costAndGrad(self, data, labels, key=None):
    """
    Forward prop entire utterance
    Call CTC cost function
    Compute gradient

    data is a 2-D matrix where each column is a single time frame
    Number of input frames changes across iterations

    labels is a vector of symbol ids, length unknown and does not
    depend on the number of time frames

    key, when present in self.hist, causes (probs, deltas) for this
    utterance to be appended to the history.

    Returns (cost, self.grad, skip) in training mode, or (cost, None)
    otherwise. NOTE(review): the two return shapes differ in arity --
    confirm callers handle both.
    """
    ## forward prop
    T = data.shape[1]
    sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
    # stackMax indexes the output layer; the temporal weights (if any)
    # live at self.stack[-1] and are excluded from the layer loop.
    stackMax = len(self.stack) - 1
    if self.temporalLayer > 0:
        stackMax -= 1

    # Activations are reallocated per utterance since T changes.
    self.hActs = [gp.empty((s, T)) for s in sizes]
    self.hActs[0] = data
    #for t in range(T):
    i = 1
    for l in range(stackMax + 1):
        w, b = self.stack[l]
        self.hActs[i] = w.dot(self.hActs[i - 1]) + b
        # loop over time for recurrent layer
        if (self.temporalLayer - 1) == l:
            for t in range(T):
                if t > 0:
                    self.hActs[i][:, t] += self.stack[-1][0].dot(
                        self.hActs[i][:, t - 1])
                # nonlinearity
                if i <= stackMax:
                    self.hActs[i][:, t] = self.activation(self.hActs[i][:, t])
        # hidden layer activation function for batch forward prop
        elif i <= stackMax:
            self.hActs[i] = self.activation(self.hActs[i])
        # w_t,b_t = self.stack[-1][0]
        # self.hActs[i][:,t] += self.stack[-1][0].dot(self.hActs[i][:,t-1])
        i += 1

    # convert final layer to probs after all time iteration complete
    # (stable softmax: subtract the per-column max first).
    probs = self.hActs[-1] - gp.max(self.hActs[-1], axis=0)
    probs = gp.as_numpy_array(probs)
    probs = np.exp(probs)
    probs = probs / np.sum(probs, axis=0)

    ## pass probs and label string to ctc loss
    # TODO how much does passing to different function cost us?
    cost, delta_output, skip = ctc.ctc_loss(probs, labels.squeeze(), blank=0)

    # Store probabilities and error signal for a given key
    if key is not None and key in self.hist:
        self.hist[key].append((probs, delta_output))

    if not self.train:
        return cost, None

    delta_output = gp.garray(delta_output)

    ## back prop through time
    # zero gradients
    self.grad = [[gp.zeros(w.shape), gp.zeros(b.shape)] for w, b in self.stack]
    if self.temporalLayer > 0:
        delta_t = np.zeros(self.layerSizes[self.temporalLayer - 1])
    # Per-frame BPTT: walk time backwards, pushing each frame's delta
    # down through the stack and carrying delta_t across time.
    for t in reversed(range(T)):
        # get delta from loss function
        delta = delta_output[:, t].T

        # compute gradient for output layer
        #print self.hActs[-2].shape, delta.shape, self.stack[stackMax][0].shape
        #print delta.reshape(-1,1).shape, self.hActs[-2][:,t].reshape(-1,1).shape
        # TODO can we get rid of some of these annoying reshape -1 1?
        self.grad[stackMax][0] += delta.reshape(-1, 1).dot(
            self.hActs[-2][:, t].reshape(-1, 1).T)
        self.grad[stackMax][1] += delta.reshape(-1, 1)

        # push delta through output layer
        delta = self.stack[stackMax][0].T.dot(delta)

        # iterate over lower layers
        i = len(self.layerSizes) - 1
        while i >= 0:
            # add the temporal delta if this is the recurrent layer
            if (self.temporalLayer - 1) == i:
                #print delta.shape, delta_t.shape
                delta += delta_t
            # push delta through activation function for this layer
            # (activation(..., True) returns the derivative).
            #print i, stackMax, delta.shape, self.hActs[i+1][:,t].shape
            delta = delta * self.activation(self.hActs[i + 1][:, t], True)
            #embed()
            # compute the gradient
            #print i, delta.shape, self.hActs[i][:,t].T.reshape(1,-1).shape, self.grad[i][0].shape
            self.grad[i][0] += delta.reshape(-1, 1).dot(
                self.hActs[i][:, t].T.reshape(1, -1))
            self.grad[i][1] += delta.reshape(-1, 1)

            # add the temporal delta if this is the recurrent layer
            if (self.temporalLayer - 1) == i and t > 0:
                self.grad[-1][0] += delta.reshape(-1, 1).dot(
                    self.hActs[i + 1][:, t - 1].T.reshape(1, -1))
                # push delta through temporal connections
                delta_t = self.stack[-1][0].T.dot(delta)
                # HACK no bias for temporal layer. Give it a gradient of 0
                # NOTE(review): hard-coded (2,1) shape -- presumably
                # matches the temporal bias placeholder; confirm it
                # equals self.stack[-1][1].shape.
                self.grad[-1][1] = np.zeros((2, 1))
            # push the delta downward
            w, b = self.stack[i]
            delta = w.T.dot(delta)
            i -= 1
    #print self.grad
    return cost, self.grad, skip
def costAndGrad(self,data,labels,key=None):
    """
    Forward prop entire utterance
    Call CTC cost function
    Compute gradient

    data is a 2-D matrix where each column is a single time frame
    Number of input frames changes across iterations

    labels is a vector of symbol ids, length unknown and does not
    depend on the number of time frames

    key, when present in self.hist, causes (probs, deltas) for this
    utterance to be appended to the history.

    Returns (cost, self.grad, skip) in training mode, or (cost, None)
    otherwise. NOTE(review): the two return shapes differ in arity --
    confirm callers handle both.
    """
    ## forward prop
    T = data.shape[1]
    sizes = [self.inputDim]+self.layerSizes+[self.outputDim]
    # stackMax indexes the output layer; the temporal weights (if any)
    # live at self.stack[-1] and are excluded from the layer loop.
    stackMax = len(self.stack)-1
    if self.temporalLayer > 0:
        stackMax -= 1

    # Activations are reallocated per utterance since T changes.
    self.hActs = [gp.empty((s,T)) for s in sizes]
    self.hActs[0] = data
    #for t in range(T):
    i = 1
    for l in range(stackMax+1):
        w,b = self.stack[l]
        self.hActs[i] = w.dot(self.hActs[i-1]) + b
        # loop over time for recurrent layer
        if (self.temporalLayer-1) == l:
            for t in range(T):
                if t > 0:
                    self.hActs[i][:,t] += self.stack[-1][0].dot(self.hActs[i][:,t-1])
                # nonlinearity
                if i <= stackMax:
                    self.hActs[i][:,t] = self.activation(self.hActs[i][:,t])
        # hidden layer activation function for batch forward prop
        elif i <= stackMax:
            self.hActs[i] = self.activation(self.hActs[i])
        # w_t,b_t = self.stack[-1][0]
        # self.hActs[i][:,t] += self.stack[-1][0].dot(self.hActs[i][:,t-1])
        i += 1

    # convert final layer to probs after all time iteration complete
    # (stable softmax: subtract the per-column max first).
    probs = self.hActs[-1]-gp.max(self.hActs[-1],axis=0)
    probs = gp.as_numpy_array(probs)
    probs = np.exp(probs)
    probs = probs/np.sum(probs,axis=0)

    ## pass probs and label string to ctc loss
    # TODO how much does passing to different function cost us?
    cost, delta_output, skip = ctc.ctc_loss(probs, labels.squeeze(), blank=0)

    # Store probabilities and error signal for a given key
    if key is not None and key in self.hist:
        self.hist[key].append((probs,delta_output))

    if not self.train:
        return cost,None

    delta_output = gp.garray(delta_output)

    ## back prop through time
    # zero gradients
    self.grad = [[gp.zeros(w.shape),gp.zeros(b.shape)] for w,b in self.stack]
    if self.temporalLayer > 0:
        delta_t = np.zeros(self.layerSizes[self.temporalLayer-1])
    # Per-frame BPTT: walk time backwards, pushing each frame's delta
    # down through the stack and carrying delta_t across time.
    for t in reversed(range(T)):
        # get delta from loss function
        delta = delta_output[:,t].T

        # compute gradient for output layer
        #print self.hActs[-2].shape, delta.shape, self.stack[stackMax][0].shape
        #print delta.reshape(-1,1).shape, self.hActs[-2][:,t].reshape(-1,1).shape
        # TODO can we get rid of some of these annoying reshape -1 1?
        self.grad[stackMax][0] += delta.reshape(-1,1).dot(self.hActs[-2][:,t].reshape(-1,1).T)
        self.grad[stackMax][1] += delta.reshape(-1, 1)

        # push delta through output layer
        delta = self.stack[stackMax][0].T.dot(delta)

        # iterate over lower layers
        i = len(self.layerSizes)-1
        while i >= 0:
            # add the temporal delta if this is the recurrent layer
            if (self.temporalLayer-1) == i:
                #print delta.shape, delta_t.shape
                delta += delta_t
            # push delta through activation function for this layer
            # (activation(..., True) returns the derivative).
            #print i, stackMax, delta.shape, self.hActs[i+1][:,t].shape
            delta = delta * self.activation(self.hActs[i+1][:,t], True)
            #embed()
            # compute the gradient
            #print i, delta.shape, self.hActs[i][:,t].T.reshape(1,-1).shape, self.grad[i][0].shape
            self.grad[i][0] += delta.reshape(-1,1).dot(self.hActs[i][:,t].T.reshape(1,-1))
            self.grad[i][1] += delta.reshape(-1,1)

            # add the temporal delta if this is the recurrent layer
            if (self.temporalLayer-1) == i and t > 0:
                self.grad[-1][0] += delta.reshape(-1,1).dot(self.hActs[i+1][:,t-1].T.reshape(1,-1))
                # push delta through temporal connections
                delta_t = self.stack[-1][0].T.dot(delta)
                # HACK no bias for temporal layer. Give it a gradient of 0
                # NOTE(review): hard-coded (2,1) shape -- presumably
                # matches the temporal bias placeholder; confirm it
                # equals self.stack[-1][1].shape.
                self.grad[-1][1] = np.zeros((2,1))
            # push the delta downward
            w,b = self.stack[i]
            delta = w.T.dot(delta)
            i -= 1
    #print self.grad
    return cost,self.grad, skip