def costAndGrad(self, data, labels=None, sentence=None):

    T = data.shape[1]
    self.setViews(T)

    if self.temporalLayer > 0:
        stack = self.stack[:-2]
        wtf, _ = self.stack[-2]
        wtb, _ = self.stack[-1]
        if self.train:
            grad = self.grad[:-2]
            dwtf, _ = self.grad[-2]
            dwtb, _ = self.grad[-1]
    else:
        stack = self.stack
        if self.train:
            grad = self.grad

    # forward prop
    # TODO copy to device here
    self.hActs[0].assign(cm.CUDAMatrix(data))

    i = 1
    for w, b in stack:
        cm.dot(w, self.hActs[i - 1], self.hActs[i])
        self.hActs[i].add_col_vec(b)

        # forward prop through time
        if i == self.temporalLayer:
            self.hActsFor.assign(self.hActs[i])
            self.hActsBack.assign(self.hActs[i])
            self.hActsFor.minmax(0.0, self.maxAct, col=0)
            self.hActsBack.minmax(0.0, self.maxAct, col=T - 1)
            for t in xrange(1, T):
                cm.mvdot_col_slice(wtf, self.hActsFor, t - 1,
                                   self.hActsFor, t, beta=1.0)
                self.hActsFor.minmax(0.0, self.maxAct, col=t)
                cm.mvdot_col_slice(wtb, self.hActsBack, T - t,
                                   self.hActsBack, T - t - 1, beta=1.0)
                self.hActsBack.minmax(0.0, self.maxAct, col=T - t - 1)
            self.hActsFor.add(self.hActsBack, target=self.hActs[i])

        if i <= self.numLayers and i != self.temporalLayer:
            # hard relu
            self.hActs[i].maximum(0.0)
        i += 1

    # Subtract max activation
    self.hActs[-1].max(axis=0, target=self.rowVec)
    self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

    # Softmax
    cm.exp(self.probs)
    self.probs.sum(axis=0, target=self.rowVec)
    cm.pow(self.rowVec, -1.0, target=self.rowVec)
    self.probs.mult_by_row(self.rowVec)

    self.probs.copy_to_host()
    if not self.train:
        probs = self.probs.numpy_array
        return probs

    cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
                                      labels, blank=0)

    if self.reg > 0:
        self.regcost = 0.0
        for w, b in self.stack:
            rc = (self.reg / 2.0) * (w.euclid_norm() ** 2)
            self.regcost += rc
            cost = cost + rc

    if skip:
        return cost, self.grad, skip

    self.deltasC.assign(cm.CUDAMatrix(deltas))

    # back prop
    i = self.numLayers
    deltasIn, deltasOut = self.deltasC, self.deltasOut
    for w, b in reversed(stack):
        # compute gradient
        # gradient for w
        cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0])
        if self.reg > 0:
            grad[i][0].add_mult(w, alpha=self.reg)
        # gradient for b
        deltasIn.sum(axis=1, target=grad[i][1])

        # compute next layer deltas
        if i > 0:
            cm.dot(w.T, deltasIn, target=deltasOut)

        # backprop through time
        if i == self.temporalLayer:
            self.hActsFor.within(0.0, self.maxAct, target=self.tmpGradFor)
            self.hActsBack.within(0.0, self.maxAct, target=self.tmpGradBack)
            self.deltasFor.assign(deltasOut)
            self.deltasBack.assign(deltasOut)
            self.deltasFor.mult_slice(T - 1, self.tmpGradFor, T - 1)
            self.deltasBack.mult_slice(0, self.tmpGradBack, 0)
            for t in xrange(1, T):
                # Add in temporal delta
                cm.mvdot_col_slice(wtf.T, self.deltasFor, T - t,
                                   self.deltasFor, T - t - 1, beta=1.0)
                cm.mvdot_col_slice(wtb.T, self.deltasBack, t - 1,
                                   self.deltasBack, t, beta=1.0)
                # Push through activation fn
                self.deltasFor.mult_slice(T - t - 1, self.tmpGradFor, T - t - 1)
                self.deltasBack.mult_slice(t, self.tmpGradBack, t)

            # Accumulate temporal gradient
            cm.dot(self.deltasFor.get_col_slice(1, T),
                   self.hActsFor.get_col_slice(0, T - 1).T, target=dwtf)
            cm.dot(self.deltasBack.get_col_slice(0, T - 1),
                   self.hActsBack.get_col_slice(1, T).T, target=dwtb)

            # Accumulate next layer deltas
            self.deltasFor.add(self.deltasBack, target=deltasOut)

        if i > 0 and i != self.temporalLayer:
            self.hActs[i].sign(target=self.tmpGrad)
            deltasOut.mult(self.tmpGrad)

        if i == self.numLayers:
            deltasIn = self.deltasIn
        deltasIn, deltasOut = deltasOut, deltasIn
        i -= 1

    if self.reg > 0:
        if self.temporalLayer > 0:
            dwtf.add_mult(wtf, alpha=self.reg)
            dwtb.add_mult(wtb, alpha=self.reg)

    return cost, self.grad, skip
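# A minimal NumPy sketch (not repository code) of the recurrence the cudamat
# calls above are assumed to implement: mvdot_col_slice with beta=1.0 adds
# W.dot(h[:, src]) into column dst, and minmax clips a single column to
# [0, maxAct] (a clipped ReLU). The function and argument names below are
# illustrative only.
import numpy as np

def bidirectional_recurrence_sketch(acts, wtf, wtb, max_act):
    """acts: (hidden, T) pre-activations entering the temporal layer."""
    h_for = acts.copy()
    h_back = acts.copy()
    T = acts.shape[1]
    h_for[:, 0] = np.clip(h_for[:, 0], 0.0, max_act)
    h_back[:, T - 1] = np.clip(h_back[:, T - 1], 0.0, max_act)
    for t in xrange(1, T):
        # forward direction: column t-1 feeds column t
        h_for[:, t] = np.clip(h_for[:, t] + wtf.dot(h_for[:, t - 1]),
                              0.0, max_act)
        # backward direction: column T-t feeds column T-t-1
        h_back[:, T - t - 1] = np.clip(h_back[:, T - t - 1] + wtb.dot(h_back[:, T - t]),
                                       0.0, max_act)
    # the temporal layer's output is the sum of both directions
    return h_for + h_back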
def costAndGrad(self, data, labels=None):

    T = data.shape[1]
    self.setViews(T)

    if self.temporalLayer > 0:
        stack = self.stack[:-1]
        wt, _ = self.stack[-1]
        if self.train:
            grad = self.grad[:-1]
            dwt, _ = self.grad[-1]
    else:
        stack = self.stack
        if self.train:
            grad = self.grad

    # forward prop
    self.hActs[0].assign(cm.CUDAMatrix(data))

    i = 1
    for w, b in stack:
        cm.dot(w, self.hActs[i - 1], self.hActs[i])
        self.hActs[i].add_col_vec(b)

        # forward prop through time
        if i == self.temporalLayer:
            for t in xrange(1, T):
                self.hActs[i].minmax(0.0, self.maxAct, col=t - 1)
                cm.mvdot_col_slice(wt, self.hActs[i], t - 1,
                                   self.hActs[i], t, beta=1.0)
            self.hActs[i].minmax(0.0, self.maxAct, col=T - 1)

        if i <= self.numLayers and i != self.temporalLayer:
            # hard relu
            self.hActs[i].maximum(0.0)
        i += 1

    # Subtract max activation
    self.hActs[-1].max(axis=0, target=self.rowVec)
    self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

    # Softmax
    cm.exp(self.probs)
    self.probs.sum(axis=0, target=self.rowVec)
    cm.pow(self.rowVec, -1.0, target=self.rowVec)
    self.probs.mult_by_row(self.rowVec)

    self.probs.copy_to_host()
    if not self.train:
        return ctc.decode_best_path(self.probs.numpy_array.astype(np.float64))

    cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
                                      labels, blank=0)
    if skip:
        return cost, self.grad, skip

    self.deltasC.assign(cm.CUDAMatrix(deltas))

    # back prop
    i = self.numLayers
    deltasIn, deltasOut = self.deltasC, self.deltasOut
    for w, b in reversed(stack):
        # compute gradient
        cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0])
        deltasIn.sum(axis=1, target=grad[i][1])

        # compute next layer deltas
        if i > 0:
            cm.dot(w.T, deltasIn, target=deltasOut)

        # backprop through time
        if i == self.temporalLayer:
            self.hActs[i].within(0.0, self.maxAct, target=self.tmpGrad)
            self.deltaTemp.assign(0.0)
            for t in xrange(T - 1, 0, -1):
                # Add in temporal delta
                cm.mvdot_col_slice(wt.T, self.deltaTemp, t, deltasOut, t, beta=1.0)
                # Push through activation fn
                deltasOut.mult_slice(t, self.tmpGrad, t)
                self.deltaTemp.set_single_col(t - 1, deltasOut, t)

            # Accumulate temporal gradient
            cm.dot(self.deltaTemp, self.hActs[i].T, target=dwt)

            cm.mvdot_col_slice(wt.T, self.deltaTemp, 0, deltasOut, 0, beta=1.0)
            deltasOut.mult_slice(0, self.tmpGrad, 0)

        if i > 0 and i != self.temporalLayer:
            self.hActs[i].sign(target=self.tmpGrad)
            deltasOut.mult(self.tmpGrad)

        if i == self.numLayers:
            deltasIn = self.deltasIn
        deltasIn, deltasOut = deltasOut, deltasIn
        i -= 1

    return cost, self.grad, skip
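# A finite-difference gradient check (a sketch under assumptions, not part of
# the classes above): it assumes `net` is one of these networks with
# train=True, that net.stack holds cudamat (weight, bias) pairs mirrored
# entry-for-entry by the grad structure returned from costAndGrad, and that
# the eps and num_checks values are purely illustrative.
import numpy as np
import cudamat as cm

def grad_check_sketch(net, data, labels, eps=1e-4, num_checks=5):
    _, grad, _ = net.costAndGrad(data, labels)
    # copy the analytic weight gradients to host before perturbing anything,
    # since later costAndGrad calls overwrite grad in place on the GPU
    dw_host = [g[0].asarray() for g in grad]
    for layer, (w, b) in enumerate(net.stack):
        w_host = w.asarray()
        for _ in xrange(num_checks):
            idx = np.unravel_index(np.random.randint(w_host.size), w_host.shape)
            old = w_host[idx]
            # central difference on a single weight
            w_host[idx] = old + eps
            w.assign(cm.CUDAMatrix(w_host))
            cost_plus, _, _ = net.costAndGrad(data, labels)
            w_host[idx] = old - eps
            w.assign(cm.CUDAMatrix(w_host))
            cost_minus, _, _ = net.costAndGrad(data, labels)
            # restore the original weight
            w_host[idx] = old
            w.assign(cm.CUDAMatrix(w_host))
            num_grad = (cost_plus - cost_minus) / (2.0 * eps)
            print layer, idx, num_grad, dw_host[layer][idx]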