Example #1
    def costAndGrad(self,data,labels=None, sentence=None):
        
        T = data.shape[1]
        self.setViews(T)

        if self.temporalLayer > 0:
            stack = self.stack[:-2]
            wtf,_ = self.stack[-2]
            wtb,_ = self.stack[-1]
            if self.train:
                grad = self.grad[:-2]
                dwtf,_ = self.grad[-2]
                dwtb,_ = self.grad[-1]
        else:
            stack = self.stack
            if self.train:
                grad = self.grad
        
        # forward prop #TODO copy to device here 
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w,b in stack:
            cm.dot(w,self.hActs[i-1],self.hActs[i])
            self.hActs[i].add_col_vec(b)
            # forward prop through time
            if i == self.temporalLayer:
                self.hActsFor.assign(self.hActs[i])
                self.hActsBack.assign(self.hActs[i])
                self.hActsFor.minmax(0.0,self.maxAct,col=0)
                self.hActsBack.minmax(0.0,self.maxAct,col=T-1)
                for t in xrange(1,T):
                    cm.mvdot_col_slice(wtf,self.hActsFor,t-1,self.hActsFor,t,beta=1.0)
                    self.hActsFor.minmax(0.0,self.maxAct,col=t)
                    cm.mvdot_col_slice(wtb,self.hActsBack,T-t,self.hActsBack,T-t-1,beta=1.0)
                    self.hActsBack.minmax(0.0,self.maxAct,col=T-t-1)
                self.hActsFor.add(self.hActsBack,target=self.hActs[i])

            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        self.hActs[-1].max(axis=0,target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec,-1.0,target=self.probs)

        # Softmax
        cm.exp(self.probs)
        self.probs.sum(axis=0,target=self.rowVec)
        cm.pow(self.rowVec,-1.0,target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        self.probs.copy_to_host()
        if not self.train:
            probs = self.probs.numpy_array
            return probs

        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
                labels,blank=0)

        if self.reg > 0:
            self.regcost = 0.0
            for w, b in self.stack:
                rc = (self.reg / 2.0) * (w.euclid_norm() ** 2)
                self.regcost += rc
                cost = cost + rc

        if skip:
            return cost,self.grad,skip

        self.deltasC.assign(cm.CUDAMatrix(deltas))

        # back prop
        i = self.numLayers 
        deltasIn,deltasOut = self.deltasC,self.deltasOut
        for w,b in reversed(stack):
            # compute gradient
            # gradient for w
            cm.dot(deltasIn,self.hActs[i].T,target=grad[i][0])
            if self.reg > 0:
                grad[i][0].add_mult(w, alpha=self.reg)
            # gradient for b
            deltasIn.sum(axis=1,target=grad[i][1])

            # compute next layer deltas
            if i > 0:
                cm.dot(w.T,deltasIn,target=deltasOut)

            # backprop through time
            if i == self.temporalLayer:
                self.hActsFor.within(0.0,self.maxAct,target=self.tmpGradFor)
                self.hActsBack.within(0.0,self.maxAct,target=self.tmpGradBack)
                self.deltasFor.assign(deltasOut)
                self.deltasBack.assign(deltasOut)
                self.deltasFor.mult_slice(T-1,self.tmpGradFor,T-1)
                self.deltasBack.mult_slice(0,self.tmpGradBack,0)

                for t in xrange(1,T):
                    # Add in temporal delta
                    cm.mvdot_col_slice(wtf.T,self.deltasFor,T-t,
                                       self.deltasFor,T-t-1,beta=1.0)
                    cm.mvdot_col_slice(wtb.T,self.deltasBack,t-1,
                                       self.deltasBack,t,beta=1.0)

                    # Push through activation fn
                    self.deltasFor.mult_slice(T-t-1,self.tmpGradFor,T-t-1)
                    self.deltasBack.mult_slice(t,self.tmpGradBack,t)

                # Accumulate temporal gradient
                cm.dot(self.deltasFor.get_col_slice(1,T),
                        self.hActsFor.get_col_slice(0,T-1).T,target=dwtf)
                cm.dot(self.deltasBack.get_col_slice(0,T-1),
                        self.hActsBack.get_col_slice(1,T).T,target=dwtb)

                # Accumulate next layer deltas
                self.deltasFor.add(self.deltasBack,target=deltasOut)

            if i > 0 and i != self.temporalLayer:
                self.hActs[i].sign(target=self.tmpGrad)
                deltasOut.mult(self.tmpGrad)

            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn,deltasOut = deltasOut,deltasIn
            i -= 1
        if self.reg > 0:
            if self.temporalLayer > 0:
                dwtf.add_mult(wtf, alpha=self.reg)
                dwtb.add_mult(wtb, alpha=self.reg)

        return cost,self.grad,skip
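A NumPy sketch of the bidirectional recurrence in Example #1's forward pass, assuming cm.mvdot_col_slice(W, A, j, B, k, beta=1.0) computes B[:, k] = W.dot(A[:, j]) + beta * B[:, k] and minmax(0.0, maxAct, col=t) clips that column into [0, maxAct]. The helper names below are illustrative, not part of the original class.

import numpy as np

def clipped_relu(x, max_act):
    # Element-wise clip into [0, max_act], mirroring minmax(0.0, maxAct).
    return np.minimum(np.maximum(x, 0.0), max_act)

def bidirectional_forward(pre_act, wtf, wtb, max_act):
    # pre_act: (H, T) pre-activations w.dot(h) + b at the temporal layer.
    # wtf, wtb: (H, H) forward / backward recurrent weight matrices.
    H, T = pre_act.shape
    h_for = pre_act.copy()
    h_back = pre_act.copy()
    h_for[:, 0] = clipped_relu(h_for[:, 0], max_act)
    h_back[:, T - 1] = clipped_relu(h_back[:, T - 1], max_act)
    for t in range(1, T):
        # Forward direction: column t accumulates wtf.dot(h_for[:, t-1]), then clips.
        h_for[:, t] = clipped_relu(h_for[:, t] + wtf.dot(h_for[:, t - 1]), max_act)
        # Backward direction: column T-t-1 accumulates wtb.dot(h_back[:, T-t]), then clips.
        h_back[:, T - t - 1] = clipped_relu(
            h_back[:, T - t - 1] + wtb.dot(h_back[:, T - t]), max_act)
    # The two directions are summed into hActs[i] before the next layer.
    return h_for + h_back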
Example #2
    def costAndGrad(self,data,labels=None):
        
        T = data.shape[1]
        self.setViews(T)

        if self.temporalLayer > 0:
            stack = self.stack[:-1]
            wt,_ = self.stack[-1]
            if self.train:
                grad = self.grad[:-1]
                dwt,_ = self.grad[-1]
        else:
            stack = self.stack
            if self.train:
                grad = self.grad
        
        # forward prop 
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w,b in stack:
            cm.dot(w,self.hActs[i-1],self.hActs[i])
            self.hActs[i].add_col_vec(b)

            # forward prop through time
            if i == self.temporalLayer:
                for t in xrange(1,T):
                    self.hActs[i].minmax(0.0,self.maxAct,col=t-1)
                    cm.mvdot_col_slice(wt,self.hActs[i],t-1,self.hActs[i],t,beta=1.0)
                self.hActs[i].minmax(0.0,self.maxAct,col=T-1)

            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        self.hActs[-1].max(axis=0,target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec,-1.0,target=self.probs)

        # Softmax
        cm.exp(self.probs)
        self.probs.sum(axis=0,target=self.rowVec)
        cm.pow(self.rowVec,-1.0,target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        self.probs.copy_to_host()
        if not self.train:
            return ctc.decode_best_path(self.probs.numpy_array.astype(np.float64))

        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
                labels,blank=0)

        if skip:
            return cost,self.grad,skip

        self.deltasC.assign(cm.CUDAMatrix(deltas))

        # back prop
        i = self.numLayers 
        deltasIn,deltasOut = self.deltasC,self.deltasOut
        for w,b in reversed(stack):
            # compute gradient
            cm.dot(deltasIn,self.hActs[i].T,target=grad[i][0])
            deltasIn.sum(axis=1,target=grad[i][1])

            # compute next layer deltas
            if i > 0:
                cm.dot(w.T,deltasIn,target=deltasOut)

            # backprop through time
            if i == self.temporalLayer:
                self.hActs[i].within(0.0,self.maxAct,target=self.tmpGrad)
                self.deltaTemp.assign(0.0)
                for t in xrange(T-1,0,-1):
                    # Add in temporal delta
                    cm.mvdot_col_slice(wt.T,self.deltaTemp,t,deltasOut,t,beta=1.0)

                    # Push through activation fn
                    deltasOut.mult_slice(t,self.tmpGrad,t) 
                    self.deltaTemp.set_single_col(t-1,deltasOut,t)

                # Accumulate temporal gradient
                cm.dot(self.deltaTemp,self.hActs[i].T,
                        target=dwt)

                cm.mvdot_col_slice(wt.T,self.deltaTemp,0,deltasOut,0,beta=1.0)
                deltasOut.mult_slice(0,self.tmpGrad,0)

            if i > 0 and i != self.temporalLayer:
                self.hActs[i].sign(target=self.tmpGrad)
                deltasOut.mult(self.tmpGrad)

            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn,deltasOut = deltasOut,deltasIn
            i -= 1

        return cost,self.grad,skip
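The softmax block shared by all four examples is the usual numerically stable column-wise softmax: subtract each frame's max activation, exponentiate, and normalize over classes. A minimal NumPy equivalent with hypothetical names:

import numpy as np

def column_softmax(acts):
    # acts: (num_classes, T) output-layer activations; softmax over classes per frame.
    shifted = acts - acts.max(axis=0, keepdims=True)  # subtract per-column max for stability
    e = np.exp(shifted)
    return e / e.sum(axis=0, keepdims=True)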
Example #3
    def costAndGrad(self, data, labels=None):

        T = data.shape[1]
        self.setViews(T)

        if self.temporalLayer > 0:
            stack = self.stack[:-1]
            wt, _ = self.stack[-1]
            if self.train:
                grad = self.grad[:-1]
                dwt, _ = self.grad[-1]
        else:
            stack = self.stack
            if self.train:
                grad = self.grad

        # forward prop
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w, b in stack:
            cm.dot(w, self.hActs[i - 1], self.hActs[i])
            self.hActs[i].add_col_vec(b)

            # forward prop through time
            if i == self.temporalLayer:
                for t in xrange(1, T):
                    self.hActs[i].minmax(0.0, self.maxAct, col=t - 1)
                    cm.mvdot_col_slice(wt,
                                       self.hActs[i],
                                       t - 1,
                                       self.hActs[i],
                                       t,
                                       beta=1.0)
                self.hActs[i].minmax(0.0, self.maxAct, col=T - 1)

            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        self.hActs[-1].max(axis=0, target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

        # Softmax
        cm.exp(self.probs)
        self.probs.sum(axis=0, target=self.rowVec)
        cm.pow(self.rowVec, -1.0, target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        self.probs.copy_to_host()
        if not self.train:
            return ctc.decode_best_path(
                self.probs.numpy_array.astype(np.float64))

        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(
            np.float64),
                                          labels,
                                          blank=0)

        if skip:
            return cost, self.grad, skip

        self.deltasC.assign(cm.CUDAMatrix(deltas))

        # back prop
        i = self.numLayers
        deltasIn, deltasOut = self.deltasC, self.deltasOut
        for w, b in reversed(stack):
            # compute gradient
            cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0])
            deltasIn.sum(axis=1, target=grad[i][1])

            # compute next layer deltas
            if i > 0:
                cm.dot(w.T, deltasIn, target=deltasOut)

            # backprop through time
            if i == self.temporalLayer:
                self.hActs[i].within(0.0, self.maxAct, target=self.tmpGrad)
                self.deltaTemp.assign(0.0)
                for t in xrange(T - 1, 0, -1):
                    # Add in temporal delta
                    cm.mvdot_col_slice(wt.T,
                                       self.deltaTemp,
                                       t,
                                       deltasOut,
                                       t,
                                       beta=1.0)

                    # Push through activation fn
                    deltasOut.mult_slice(t, self.tmpGrad, t)
                    self.deltaTemp.set_single_col(t - 1, deltasOut, t)

                # Accumulate temporal gradient
                cm.dot(self.deltaTemp, self.hActs[i].T, target=dwt)

                cm.mvdot_col_slice(wt.T,
                                   self.deltaTemp,
                                   0,
                                   deltasOut,
                                   0,
                                   beta=1.0)
                deltasOut.mult_slice(0, self.tmpGrad, 0)

            if i > 0 and i != self.temporalLayer:
                self.hActs[i].sign(target=self.tmpGrad)
                deltasOut.mult(self.tmpGrad)

            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn, deltasOut = deltasOut, deltasIn
            i -= 1

        return cost, self.grad, skip
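In the backward pass of Example #3 (and #2), within(0.0, self.maxAct) and sign() build the activation-derivative masks that the deltas are multiplied by slice-wise. A short sketch of what those masks amount to, assuming within returns a 0/1 indicator of values strictly inside the interval; the helper below is illustrative only.

import numpy as np

def activation_masks(h_temporal, h_hidden, max_act):
    # Clipped-ReLU derivative for the temporal layer: the gradient passes only
    # where 0 < h < max_act (what within(0.0, maxAct) appears to compute).
    temporal_mask = ((h_temporal > 0.0) & (h_temporal < max_act)).astype(h_temporal.dtype)
    # Hard-ReLU derivative for the other layers: activations are already >= 0
    # after the forward pass, so sign() yields the 0/1 mask directly.
    hidden_mask = np.sign(h_hidden)
    return temporal_mask, hidden_mask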
Example #4
    def costAndGrad(self, data, labels=None, sentence=None):

        T = data.shape[1]
        self.setViews(T)

        if self.temporalLayer > 0:
            stack = self.stack[:-2]
            wtf, _ = self.stack[-2]
            wtb, _ = self.stack[-1]
            if self.train:
                grad = self.grad[:-2]
                dwtf, _ = self.grad[-2]
                dwtb, _ = self.grad[-1]
        else:
            stack = self.stack
            if self.train:
                grad = self.grad

        # forward prop #TODO copy to device here
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w, b in stack:
            cm.dot(w, self.hActs[i - 1], self.hActs[i])
            self.hActs[i].add_col_vec(b)
            # forward prop through time
            if i == self.temporalLayer:
                self.hActsFor.assign(self.hActs[i])
                self.hActsBack.assign(self.hActs[i])
                self.hActsFor.minmax(0.0, self.maxAct, col=0)
                self.hActsBack.minmax(0.0, self.maxAct, col=T - 1)
                for t in xrange(1, T):
                    cm.mvdot_col_slice(wtf,
                                       self.hActsFor,
                                       t - 1,
                                       self.hActsFor,
                                       t,
                                       beta=1.0)
                    self.hActsFor.minmax(0.0, self.maxAct, col=t)
                    cm.mvdot_col_slice(wtb,
                                       self.hActsBack,
                                       T - t,
                                       self.hActsBack,
                                       T - t - 1,
                                       beta=1.0)
                    self.hActsBack.minmax(0.0, self.maxAct, col=T - t - 1)
                self.hActsFor.add(self.hActsBack, target=self.hActs[i])

            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        self.hActs[-1].max(axis=0, target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

        # Softmax
        cm.exp(self.probs)
        self.probs.sum(axis=0, target=self.rowVec)
        cm.pow(self.rowVec, -1.0, target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        self.probs.copy_to_host()
        if not self.train:
            probs = self.probs.numpy_array
            return probs

        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(
            np.float64),
                                          labels,
                                          blank=0)

        if self.reg > 0:
            self.regcost = 0.0
            for w, b in self.stack:
                rc = (self.reg / 2.0) * (w.euclid_norm()**2)
                self.regcost += rc
                cost = cost + rc

        if skip:
            return cost, self.grad, skip

        self.deltasC.assign(cm.CUDAMatrix(deltas))

        # back prop
        i = self.numLayers
        deltasIn, deltasOut = self.deltasC, self.deltasOut
        for w, b in reversed(stack):
            # compute gradient
            # gradient for w
            cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0])
            if self.reg > 0:
                grad[i][0].add_mult(w, alpha=self.reg)
            # gradient for b
            deltasIn.sum(axis=1, target=grad[i][1])

            # compute next layer deltas
            if i > 0:
                cm.dot(w.T, deltasIn, target=deltasOut)

            # backprop through time
            if i == self.temporalLayer:
                self.hActsFor.within(0.0, self.maxAct, target=self.tmpGradFor)
                self.hActsBack.within(0.0,
                                      self.maxAct,
                                      target=self.tmpGradBack)
                self.deltasFor.assign(deltasOut)
                self.deltasBack.assign(deltasOut)
                self.deltasFor.mult_slice(T - 1, self.tmpGradFor, T - 1)
                self.deltasBack.mult_slice(0, self.tmpGradBack, 0)

                for t in xrange(1, T):
                    # Add in temporal delta
                    cm.mvdot_col_slice(wtf.T,
                                       self.deltasFor,
                                       T - t,
                                       self.deltasFor,
                                       T - t - 1,
                                       beta=1.0)
                    cm.mvdot_col_slice(wtb.T,
                                       self.deltasBack,
                                       t - 1,
                                       self.deltasBack,
                                       t,
                                       beta=1.0)

                    # Push through activation fn
                    self.deltasFor.mult_slice(T - t - 1, self.tmpGradFor,
                                              T - t - 1)
                    self.deltasBack.mult_slice(t, self.tmpGradBack, t)

                # Accumulate temporal gradient
                cm.dot(self.deltasFor.get_col_slice(1, T),
                       self.hActsFor.get_col_slice(0, T - 1).T,
                       target=dwtf)
                cm.dot(self.deltasBack.get_col_slice(0, T - 1),
                       self.hActsBack.get_col_slice(1, T).T,
                       target=dwtb)

                # Accumulate next layer deltas
                self.deltasFor.add(self.deltasBack, target=deltasOut)

            if i > 0 and i != self.temporalLayer:
                self.hActs[i].sign(target=self.tmpGrad)
                deltasOut.mult(self.tmpGrad)

            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn, deltasOut = deltasOut, deltasIn
            i -= 1
        if self.reg > 0:
            if self.temporalLayer > 0:
                dwtf.add_mult(wtf, alpha=self.reg)
                dwtb.add_mult(wtb, alpha=self.reg)

        return cost, self.grad, skip
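Example #4 (like Example #1) also folds an L2 penalty into the objective: (reg / 2) * ||W||^2 is added to the cost via euclid_norm()**2 and reg * W is added to each weight gradient via add_mult, with biases left unregularized. A compact NumPy sketch of that bookkeeping, using hypothetical argument names:

import numpy as np

def add_l2_regularization(cost, weights, grads, reg):
    # Add (reg / 2) * ||W||_F^2 to the cost and reg * W to each weight gradient.
    for W, dW in zip(weights, grads):
        cost += 0.5 * reg * np.sum(W ** 2)
        dW += reg * W
    return cost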