Example #1
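These snippets are methods excerpted from larger model classes, so they depend on module-level imports the listing does not show. A minimal set consistent with the code below (the ctc module is project-local, so its exact name is an assumption):

import numpy as np       # host-side array math
import cudamat as cm     # GPU examples: cm.dot, cm.CUDAMatrix, cm.exp, ...
import gnumpy as gp      # gnumpy examples: gp.dot, gp.garray, gp.max, ...
import ctc               # project-local module providing ctc_loss and decode_best_path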
    def costAndGrad(self,data,labels):
        
        batchSize = data.shape[1]
        self.setViews(batchSize)
        
        # forward prop
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w,b in self.stack:
            cm.dot(w,self.hActs[i-1],self.hActs[i])
            self.hActs[i].add_col_vec(b)
            if i <= len(self.layerSizes):
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        self.hActs[-1].max(axis=0,target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec,-1.0,target=self.probs)

        # Softmax
        cm.exp(self.probs)
        self.probs.sum(axis=0,target=self.rowVec)
        cm.pow(self.rowVec,-1.0,target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        self.probs.copy_to_host()
        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
                labels,blank=0)
        self.deltasC.assign(cm.CUDAMatrix(deltas))

        if skip:
            return cost,self.grad,skip

        # back prop
        nl = len(self.layerSizes)
        i = nl 
        deltasIn,deltasOut = self.deltasC,self.deltasOut
        for w,b in reversed(self.stack):
            # compute gradient
            cm.dot(deltasIn,self.hActs[i].T,target=self.grad[i][0])
            deltasIn.sum(axis=1,target=self.grad[i][1])

            # compute next layer deltas
            if i > 0:
                self.hActs[i].sign(target=self.tmpGrad)
                cm.dot(w.T,deltasIn,target=deltasOut)
                deltasOut.mult(self.tmpGrad)

            if i == nl:
                deltasIn = self.deltasIn

            deltasIn,deltasOut = deltasOut,deltasIn
            i -= 1

        return cost,self.grad,skip
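The "Subtract max activation" and "Softmax" steps above implement a numerically stable column-wise softmax on the GPU. A NumPy sketch of the same computation, for reference (the function name is illustrative):

import numpy as np

def stable_softmax_columns(acts):
    """Column-wise softmax with max subtraction, mirroring the GPU code
    above. acts is (numClasses, T); each column is one time frame.
    Subtracting the per-column max before exp() prevents overflow and
    leaves the result unchanged."""
    shifted = acts - acts.max(axis=0, keepdims=True)
    e = np.exp(shifted)
    return e / e.sum(axis=0, keepdims=True)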
Example #2
    def costAndGrad(self,data,labels=None, sentence=None):
        
        T = data.shape[1]
        self.setViews(T)

        if self.temporalLayer > 0:
            stack = self.stack[:-2]
            wtf,_ = self.stack[-2]
            wtb,_ = self.stack[-1]
            if self.train:
                grad = self.grad[:-2]
                dwtf,_ = self.grad[-2]
                dwtb,_ = self.grad[-1]
        else:
            stack = self.stack
            if self.train:
                grad = self.grad
        
        # forward prop #TODO copy to device here 
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w,b in stack:
            cm.dot(w,self.hActs[i-1],self.hActs[i])
            self.hActs[i].add_col_vec(b)
            # forward prop through time
            if i == self.temporalLayer:
                self.hActsFor.assign(self.hActs[i])
                self.hActsBack.assign(self.hActs[i])
                self.hActsFor.minmax(0.0,self.maxAct,col=0)
                self.hActsBack.minmax(0.0,self.maxAct,col=T-1)
                for t in xrange(1,T):
                    cm.mvdot_col_slice(wtf,self.hActsFor,t-1,self.hActsFor,t,beta=1.0)
                    self.hActsFor.minmax(0.0,self.maxAct,col=t)
                    cm.mvdot_col_slice(wtb,self.hActsBack,T-t,self.hActsBack,T-t-1,beta=1.0)
                    self.hActsBack.minmax(0.0,self.maxAct,col=T-t-1)
                self.hActsFor.add(self.hActsBack,target=self.hActs[i])

            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        self.hActs[-1].max(axis=0,target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec,-1.0,target=self.probs)

        # Softmax
        cm.exp(self.probs)
        self.probs.sum(axis=0,target=self.rowVec)
        cm.pow(self.rowVec,-1.0,target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        self.probs.copy_to_host()
        if not self.train:
            probs = self.probs.numpy_array
            return probs

        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
                labels,blank=0)

        if self.reg > 0:
            self.regcost = 0.0
            for w, b in self.stack:
                rc = (self.reg / 2.0) * (w.euclid_norm() ** 2)
                self.regcost += rc
                cost = cost + rc

        if skip:
            return cost,self.grad,skip

        self.deltasC.assign(cm.CUDAMatrix(deltas))

        # back prop
        i = self.numLayers 
        deltasIn,deltasOut = self.deltasC,self.deltasOut
        for w,b in reversed(stack):
            # compute gradient
            # gradient for w
            cm.dot(deltasIn,self.hActs[i].T,target=grad[i][0])
            if self.reg > 0:
                grad[i][0].add_mult(w, alpha=self.reg)
            # gradient for b
            deltasIn.sum(axis=1,target=grad[i][1])

            # compute next layer deltas
            if i > 0:
                cm.dot(w.T,deltasIn,target=deltasOut)

            # backprop through time
            if i == self.temporalLayer:
                self.hActsFor.within(0.0,self.maxAct,target=self.tmpGradFor)
                self.hActsBack.within(0.0,self.maxAct,target=self.tmpGradBack)
                self.deltasFor.assign(deltasOut)
                self.deltasBack.assign(deltasOut)
                self.deltasFor.mult_slice(T-1,self.tmpGradFor,T-1)
                self.deltasBack.mult_slice(0,self.tmpGradBack,0)

                for t in xrange(1,T):
                    # Add in temporal delta
                    cm.mvdot_col_slice(wtf.T,self.deltasFor,T-t,
                                       self.deltasFor,T-t-1,beta=1.0)
                    cm.mvdot_col_slice(wtb.T,self.deltasBack,t-1,
                                       self.deltasBack,t,beta=1.0)

                    # Push through activation fn
                    self.deltasFor.mult_slice(T-t-1,self.tmpGradFor,T-t-1)
                    self.deltasBack.mult_slice(t,self.tmpGradBack,t)

                # Accumulate temporal gradient
                cm.dot(self.deltasFor.get_col_slice(1,T),
                        self.hActsFor.get_col_slice(0,T-1).T,target=dwtf)
                cm.dot(self.deltasBack.get_col_slice(0,T-1),
                        self.hActsBack.get_col_slice(1,T).T,target=dwtb)

                # Accumulate next layer deltas
                self.deltasFor.add(self.deltasBack,target=deltasOut)

            if i > 0 and i != self.temporalLayer:
                self.hActs[i].sign(target=self.tmpGrad)
                deltasOut.mult(self.tmpGrad)

            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn,deltasOut = deltasOut,deltasIn
            i -= 1
        if self.reg > 0:
            if self.temporalLayer > 0:
                dwtf.add_mult(wtf, alpha=self.reg)
                dwtb.add_mult(wtb, alpha=self.reg)

        return cost,self.grad,skip
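Example #2 folds an L2 penalty into both the cost ((reg/2) * ||W||^2 per weight matrix) and the weight gradients (+ reg * W, via add_mult). A minimal NumPy sketch of that bookkeeping, assuming parallel lists of host arrays:

import numpy as np

def add_l2_penalty(cost, weights, grads, reg):
    """Add (reg/2)*||W||^2 to the cost and reg*W to each weight
    gradient, matching the self.reg > 0 branches above."""
    for W, dW in zip(weights, grads):
        cost += 0.5 * reg * np.sum(W ** 2)
        dW += reg * W  # in-place, like grad[i][0].add_mult(w, alpha=self.reg)
    return cost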
Example #3
    def costAndGrad(self, data, labels):

        T = data.shape[1]

        # forward prop
        self.hActs[0] = data

        if self.temporalLayer > 0:
            stack = self.stack[:-2]
            wtf, _ = self.stack[-2]
            wtb, _ = self.stack[-1]
            grad = self.grad[:-2]
            dwtf, _ = self.grad[-2]
            dwtb, _ = self.grad[-1]
        else:
            stack = self.stack
            grad = self.grad

        i = 1
        for w, b in stack:
            self.hActs[i] = np.dot(w, self.hActs[i - 1])
            self.hActs[i] += b

            # forward prop through time
            if i == self.temporalLayer:
                preActs = np.array(self.hActs[i])
                actsForward = np.empty(preActs.shape)
                actsForward[:, 0] = preActs[:, 0]
                actsForward[preActs[:, 0] <= 0, 0] = 0.0
                actsBackward = np.empty(preActs.shape)
                actsBackward[:, -1] = preActs[:, -1]
                actsBackward[preActs[:, -1] <= 0, -1] = 0.0
                for t in xrange(1, T):
                    actsForward[:, t] = np.dot(
                        wtf, actsForward[:, t - 1]) + preActs[:, t]
                    actsBackward[:, -t - 1] = np.dot(
                        wtb, actsBackward[:, -t]) + preActs[:, -t - 1]
                    actsForward[actsForward[:, t] <= 0, t] = 0.0
                    actsBackward[actsBackward[:, -t - 1] <= 0, -t - 1] = 0.0
                self.hActs[i][:] = actsForward + actsBackward

            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i][self.hActs[i] < 0.0] = 0.0
            i += 1

        # Subtract max activation
        probs = self.hActs[-1] - self.hActs[-1].max(axis=0)[None, :]

        # Softmax
        probs = np.exp(probs)
        probs /= probs.sum(axis=0)[None, :]
        cost, deltasC, skip = ctc.ctc_loss(np.asfortranarray(probs),
                                           labels,
                                           blank=0)

        if skip:
            return cost, self.grad, skip

        # back prop
        i = self.numLayers
        self.deltasOut = None
        self.deltasIn = None
        deltasIn, deltasOut = deltasC, self.deltasOut
        for w, b in reversed(stack):
            # compute gradient
            grad[i][0] = np.dot(deltasIn, self.hActs[i].T)
            grad[i][1] = deltasIn.sum(axis=1)[:, None]

            # compute next layer deltas
            if i > 0:
                deltasOut = np.dot(w.T, deltasIn)

            # backprop through time
            if i == self.temporalLayer:
                tmpGradF = np.sign(actsForward)
                tmpGradB = np.sign(actsBackward)
                deltasForward = np.array(deltasOut)
                deltasForward[:, -1] *= tmpGradF[:, -1]
                deltasBackward = np.array(deltasOut)
                deltasBackward[:, 0] *= tmpGradB[:, 0]
                for t in xrange(1, T):
                    deltasForward[:, -t - 1] = tmpGradF[:, -t - 1] * (
                        deltasForward[:, -t - 1] +
                        np.dot(wtf.T, deltasForward[:, -t]))
                    deltasBackward[:, t] = tmpGradB[:, t] * (
                        deltasBackward[:, t] +
                        np.dot(wtb.T, deltasBackward[:, t - 1]))

                # Compute temporal gradient
                dwtb[:] = np.dot(deltasBackward[:, :-1], actsBackward[:, 1:].T)
                dwtf[:] = np.dot(deltasForward[:, 1:], actsForward[:, :-1].T)
                deltasOut = deltasForward + deltasBackward

            if i > 0 and i != self.temporalLayer:
                tmpGrad = np.sign(self.hActs[i])
                deltasOut *= tmpGrad

            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn, deltasOut = deltasOut, deltasIn
            i -= 1

        return cost, self.grad, skip
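Note how Examples #1 and #3 gate the deltas with sign / np.sign: this yields the hard-ReLU derivative only because the stored activations were already clamped at zero, so sign() is 1 for active units and 0 elsewhere. A one-line NumPy equivalent under that assumption:

import numpy as np

def relu_backward(delta, post_relu_acts):
    """Gate deltas by the hard-ReLU derivative. post_relu_acts must
    already be clamped at 0, so np.sign() is 1 where the unit fired
    and 0 elsewhere (the same trick as hActs[i].sign(...) above)."""
    return delta * np.sign(post_relu_acts)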
Example #4
    def costAndGrad(self,data,labels=None):
        
        T = data.shape[1]
        self.setViews(T)

        if self.temporalLayer > 0:
            stack = self.stack[:-1]
            wt,_ = self.stack[-1]
            if self.train:
                grad = self.grad[:-1]
                dwt,_ = self.grad[-1]
        else:
            stack = self.stack
            if self.train:
                grad = self.grad
        
        # forward prop 
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w,b in stack:
            cm.dot(w,self.hActs[i-1],self.hActs[i])
            self.hActs[i].add_col_vec(b)

            # forward prop through time
            if i == self.temporalLayer:
                for t in xrange(1,T):
                    self.hActs[i].minmax(0.0,self.maxAct,col=t-1)
                    cm.mvdot_col_slice(wt,self.hActs[i],t-1,self.hActs[i],t,beta=1.0)
                self.hActs[i].minmax(0.0,self.maxAct,col=T-1)

            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        self.hActs[-1].max(axis=0,target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec,-1.0,target=self.probs)

        # Softmax
        cm.exp(self.probs)
        self.probs.sum(axis=0,target=self.rowVec)
        cm.pow(self.rowVec,-1.0,target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        self.probs.copy_to_host()
        if not self.train:
            return ctc.decode_best_path(self.probs.numpy_array.astype(np.float64))

        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
                labels,blank=0)

        if skip:
            return cost,self.grad,skip

        self.deltasC.assign(cm.CUDAMatrix(deltas))

        # back prop
        i = self.numLayers 
        deltasIn,deltasOut = self.deltasC,self.deltasOut
        for w,b in reversed(stack):
            # compute gradient
            cm.dot(deltasIn,self.hActs[i].T,target=grad[i][0])
            deltasIn.sum(axis=1,target=grad[i][1])

            # compute next layer deltas
            if i > 0:
                cm.dot(w.T,deltasIn,target=deltasOut)

            # backprop through time
            if i == self.temporalLayer:
                self.hActs[i].within(0.0,self.maxAct,target=self.tmpGrad)
                self.deltaTemp.assign(0.0)
                for t in xrange(T-1,0,-1):
                    # Add in temporal delta
                    cm.mvdot_col_slice(wt.T,self.deltaTemp,t,deltasOut,t,beta=1.0)

                    # Push through activation fn
                    deltasOut.mult_slice(t,self.tmpGrad,t) 
                    self.deltaTemp.set_single_col(t-1,deltasOut,t)

                # Accumulate temporal gradient
                cm.dot(self.deltaTemp,self.hActs[i].T,
                        target=dwt)

                cm.mvdot_col_slice(wt.T,self.deltaTemp,0,deltasOut,0,beta=1.0)
                deltasOut.mult_slice(0,self.tmpGrad,0)

            if i > 0 and i != self.temporalLayer:
                self.hActs[i].sign(target=self.tmpGrad)
                deltasOut.mult(self.tmpGrad)

            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn,deltasOut = deltasOut,deltasIn
            i -= 1

        return cost,self.grad,skip
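Example #4 uses a clipped ReLU in the recurrent layer: minmax(0.0, maxAct) clamps each column into [0, maxAct], and within(0.0, maxAct) later builds the derivative mask. A NumPy sketch, assuming within() marks values strictly inside the interval:

import numpy as np

def clipped_relu(preacts, max_act):
    """Clipped ReLU and its derivative mask, mirroring the
    minmax/within pair above (boundary handling is an assumption)."""
    acts = np.clip(preacts, 0.0, max_act)
    dmask = ((acts > 0.0) & (acts < max_act)).astype(preacts.dtype)
    return acts, dmask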
Example #5
    def costAndGrad(self, data, labels=None):

        T = data.shape[1]
        self.setViews(T)

        if self.temporalLayer > 0:
            stack = self.stack[:-1]
            wt, _ = self.stack[-1]
            if self.train:
                grad = self.grad[:-1]
                dwt, _ = self.grad[-1]
        else:
            stack = self.stack
            if self.train:
                grad = self.grad

        # forward prop
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w, b in stack:
            cm.dot(w, self.hActs[i - 1], self.hActs[i])
            self.hActs[i].add_col_vec(b)

            # forward prop through time
            if i == self.temporalLayer:
                for t in xrange(1, T):
                    self.hActs[i].minmax(0.0, self.maxAct, col=t - 1)
                    cm.mvdot_col_slice(wt,
                                       self.hActs[i],
                                       t - 1,
                                       self.hActs[i],
                                       t,
                                       beta=1.0)
                self.hActs[i].minmax(0.0, self.maxAct, col=T - 1)

            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        self.hActs[-1].max(axis=0, target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

        # Softmax
        cm.exp(self.probs)
        self.probs.sum(axis=0, target=self.rowVec)
        cm.pow(self.rowVec, -1.0, target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        self.probs.copy_to_host()
        if not self.train:
            return ctc.decode_best_path(
                self.probs.numpy_array.astype(np.float64))

        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(
            np.float64),
                                          labels,
                                          blank=0)

        if skip:
            return cost, self.grad, skip

        self.deltasC.assign(cm.CUDAMatrix(deltas))

        # back prop
        i = self.numLayers
        deltasIn, deltasOut = self.deltasC, self.deltasOut
        for w, b in reversed(stack):
            # compute gradient
            cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0])
            deltasIn.sum(axis=1, target=grad[i][1])

            # compute next layer deltas
            if i > 0:
                cm.dot(w.T, deltasIn, target=deltasOut)

            # backprop through time
            if i == self.temporalLayer:
                self.hActs[i].within(0.0, self.maxAct, target=self.tmpGrad)
                self.deltaTemp.assign(0.0)
                for t in xrange(T - 1, 0, -1):
                    # Add in temporal delta
                    cm.mvdot_col_slice(wt.T,
                                       self.deltaTemp,
                                       t,
                                       deltasOut,
                                       t,
                                       beta=1.0)

                    # Push through activation fn
                    deltasOut.mult_slice(t, self.tmpGrad, t)
                    self.deltaTemp.set_single_col(t - 1, deltasOut, t)

                # Accumulate temporal gradient
                cm.dot(self.deltaTemp, self.hActs[i].T, target=dwt)

                cm.mvdot_col_slice(wt.T,
                                   self.deltaTemp,
                                   0,
                                   deltasOut,
                                   0,
                                   beta=1.0)
                deltasOut.mult_slice(0, self.tmpGrad, 0)

            if i > 0 and i != self.temporalLayer:
                self.hActs[i].sign(target=self.tmpGrad)
                deltasOut.mult(self.tmpGrad)

            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn, deltasOut = deltasOut, deltasIn
            i -= 1

        return cost, self.grad, skip
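The time loop in Examples #4/#5 computes h[:, t] = clip(pre[:, t] + wt . h[:, t-1]) one column at a time via mvdot_col_slice with beta=1.0. The same recurrence on the host (a sketch; names are illustrative):

import numpy as np

def forward_through_time(preacts, wt, max_act):
    """Unidirectional recurrence with a clipped ReLU, as in the GPU
    loop above. preacts is (H, T) with columns as time frames; wt is (H, H)."""
    h = np.array(preacts)  # copy so the pre-activations stay intact
    h[:, 0] = np.clip(h[:, 0], 0.0, max_act)
    for t in range(1, h.shape[1]):
        h[:, t] = np.clip(h[:, t] + np.dot(wt, h[:, t - 1]), 0.0, max_act)
    return h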
Example #6
    def costAndGrad(self,data,labels=None,key=None):
        """
        Forward prop entire utterance
        Call CTC cost function
        Compute gradient

        data is a 2-D matrix where each column is a single time frame
        Number of input frames changes across iterations
        
        labels is a vector of symbol ids; its length is unknown in
        advance and does not depend on the number of time frames
        """

        ## forward prop
        # this is the same as minibatch forward prop 
        # since we pre-compute context window features for each time
        self.hActs[0] = data
        i = 1
        for w,b in self.stack:
            self.hActs[i] = w.dot(self.hActs[i-1])+b
            if i <= len(self.layerSizes):
                self.hActs[i] = self.activation(self.hActs[i])
            i += 1

        probs = self.hActs[-1]-gp.max(self.hActs[-1],axis=0)
        probs = gp.as_numpy_array(probs)
        probs = np.exp(probs)
        probs = probs/np.sum(probs,axis=0)
        # probs[probs<1e-12] = 1e-12 # TODO have to clamp?

        ## pass probs and label string to ctc loss
        # TODO how much does passing to different function cost us?
        if not self.train:
            return ctc.decode_best_path(probs, ref=labels, blank=0)
            #return ctc.decode_bp_bigrams(probs, blank=0, B=None)

        cost, self.deltas[-1], skip = ctc.ctc_loss(probs, labels, blank=0)

        # Bad utterance?
        if skip:
            return cost,self.grad,skip

        # Store probabilities and error signal for a given key
        #if key is not None and key in self.hist:
        #    self.hist[key].append((probs,self.deltas[-1]))

        self.deltas[-1] = gp.garray(self.deltas[-1])

        # back prop
        i = len(self.layerSizes)-1
        for w,b in reversed(self.stack[1:]):
            grad = self.activation(self.hActs[i+1], True)
            self.deltas[i] = w.T.dot(self.deltas[i+1])*grad
            i -= 1

        # compute gradients
        # NOTE we do not divide by utterance length. 
        #    Will need to scale up weight norm penalty accordingly
        for i in range(len(self.grad)):
            self.grad[i][0] = self.deltas[i].dot(self.hActs[i].T)
            self.grad[i][1] = gp.sum(self.deltas[i],axis=1).reshape(-1,1)

        return cost,self.grad,skip
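In eval mode Example #6 returns ctc.decode_best_path(probs, ...). Best-path (greedy) CTC decoding takes the argmax class of each frame, collapses consecutive repeats, and drops blanks; the project's ctc module may differ in detail, but a self-contained sketch of the rule is:

import numpy as np

def decode_best_path(probs, blank=0):
    """Greedy CTC decode: per-frame argmax, collapse repeats,
    strip blanks. probs is the (numClasses, T) matrix built above."""
    best = np.argmax(probs, axis=0)
    seq, prev = [], -1
    for s in best:
        if s != prev and s != blank:
            seq.append(int(s))
        prev = s
    return seq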
Example #7
    def costAndGrad(self, data, labels=None, sentence=None):

        T = data.shape[1]
        self.setViews(T)

        if self.temporalLayer > 0:
            stack = self.stack[:-2]
            wtf, _ = self.stack[-2]
            wtb, _ = self.stack[-1]
            if self.train:
                grad = self.grad[:-2]
                dwtf, _ = self.grad[-2]
                dwtb, _ = self.grad[-1]
        else:
            stack = self.stack
            if self.train:
                grad = self.grad

        # forward prop #TODO copy to device here
        self.hActs[0].assign(cm.CUDAMatrix(data))

        i = 1
        for w, b in stack:
            cm.dot(w, self.hActs[i - 1], self.hActs[i])
            self.hActs[i].add_col_vec(b)
            # forward prop through time
            if i == self.temporalLayer:
                self.hActsFor.assign(self.hActs[i])
                self.hActsBack.assign(self.hActs[i])
                self.hActsFor.minmax(0.0, self.maxAct, col=0)
                self.hActsBack.minmax(0.0, self.maxAct, col=T - 1)
                for t in xrange(1, T):
                    cm.mvdot_col_slice(wtf,
                                       self.hActsFor,
                                       t - 1,
                                       self.hActsFor,
                                       t,
                                       beta=1.0)
                    self.hActsFor.minmax(0.0, self.maxAct, col=t)
                    cm.mvdot_col_slice(wtb,
                                       self.hActsBack,
                                       T - t,
                                       self.hActsBack,
                                       T - t - 1,
                                       beta=1.0)
                    self.hActsBack.minmax(0.0, self.maxAct, col=T - t - 1)
                self.hActsFor.add(self.hActsBack, target=self.hActs[i])

            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i].maximum(0.0)
            i += 1

        # Subtract max activation
        self.hActs[-1].max(axis=0, target=self.rowVec)
        self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

        # Softmax
        cm.exp(self.probs)
        self.probs.sum(axis=0, target=self.rowVec)
        cm.pow(self.rowVec, -1.0, target=self.rowVec)
        self.probs.mult_by_row(self.rowVec)

        self.probs.copy_to_host()
        if not self.train:
            probs = self.probs.numpy_array
            return probs

        cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(
            np.float64),
                                          labels,
                                          blank=0)

        if self.reg > 0:
            self.regcost = 0.0
            for w, b in self.stack:
                rc = (self.reg / 2.0) * (w.euclid_norm()**2)
                self.regcost += rc
                cost = cost + rc

        if skip:
            return cost, self.grad, skip

        self.deltasC.assign(cm.CUDAMatrix(deltas))

        # back prop
        i = self.numLayers
        deltasIn, deltasOut = self.deltasC, self.deltasOut
        for w, b in reversed(stack):
            # compute gradient
            # gradient for w
            cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0])
            if self.reg > 0:
                grad[i][0].add_mult(w, alpha=self.reg)
            # gradient for b
            deltasIn.sum(axis=1, target=grad[i][1])

            # compute next layer deltas
            if i > 0:
                cm.dot(w.T, deltasIn, target=deltasOut)

            # backprop through time
            if i == self.temporalLayer:
                self.hActsFor.within(0.0, self.maxAct, target=self.tmpGradFor)
                self.hActsBack.within(0.0,
                                      self.maxAct,
                                      target=self.tmpGradBack)
                self.deltasFor.assign(deltasOut)
                self.deltasBack.assign(deltasOut)
                self.deltasFor.mult_slice(T - 1, self.tmpGradFor, T - 1)
                self.deltasBack.mult_slice(0, self.tmpGradBack, 0)

                for t in xrange(1, T):
                    # Add in temporal delta
                    cm.mvdot_col_slice(wtf.T,
                                       self.deltasFor,
                                       T - t,
                                       self.deltasFor,
                                       T - t - 1,
                                       beta=1.0)
                    cm.mvdot_col_slice(wtb.T,
                                       self.deltasBack,
                                       t - 1,
                                       self.deltasBack,
                                       t,
                                       beta=1.0)

                    # Push through activation fn
                    self.deltasFor.mult_slice(T - t - 1, self.tmpGradFor,
                                              T - t - 1)
                    self.deltasBack.mult_slice(t, self.tmpGradBack, t)

                # Accumulate temporal gradient
                cm.dot(self.deltasFor.get_col_slice(1, T),
                       self.hActsFor.get_col_slice(0, T - 1).T,
                       target=dwtf)
                cm.dot(self.deltasBack.get_col_slice(0, T - 1),
                       self.hActsBack.get_col_slice(1, T).T,
                       target=dwtb)

                # Accumulate next layer deltas
                self.deltasFor.add(self.deltasBack, target=deltasOut)

            if i > 0 and i != self.temporalLayer:
                self.hActs[i].sign(target=self.tmpGrad)
                deltasOut.mult(self.tmpGrad)

            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn, deltasOut = deltasOut, deltasIn
            i -= 1
        if self.reg > 0:
            if self.temporalLayer > 0:
                dwtf.add_mult(wtf, alpha=self.reg)
                dwtb.add_mult(wtb, alpha=self.reg)

        return cost, self.grad, skip
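Since every training variant returns (cost, grad, skip), the standard sanity check is a centered finite difference against the analytic gradient. A hedged sketch, assuming a network object with the stack/grad interface used here and NumPy parameters (the GPU versions would need host copies first):

import numpy as np

def grad_check(net, data, labels, eps=1e-5, samples=5):
    """Spot-check a few weight entries per layer: the analytic gradient
    should match (f(w+eps) - f(w-eps)) / (2*eps)."""
    _, grad, _ = net.costAndGrad(data, labels)
    # Copy: the grad buffers are reused by later costAndGrad calls.
    analytic = [np.array(dW) for dW, _ in grad]
    rng = np.random.RandomState(0)
    for (W, _), dW in zip(net.stack, analytic):
        for _ in range(samples):
            idx = tuple(rng.randint(d) for d in W.shape)
            old = W[idx]
            W[idx] = old + eps
            c_plus, _, _ = net.costAndGrad(data, labels)
            W[idx] = old - eps
            c_minus, _, _ = net.costAndGrad(data, labels)
            W[idx] = old
            numeric = (c_plus - c_minus) / (2.0 * eps)
            print(idx, abs(numeric - dW[idx]))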
Example #8
    def costAndGrad(self,data,labels):
        
        T = data.shape[1]
        
        # forward prop
        self.hActs[0] = data

        if self.temporalLayer > 0:
            stack = self.stack[:-2]
            wtf,_ = self.stack[-2]
            wtb,_ = self.stack[-1]
            grad = self.grad[:-2]
            dwtf,_ = self.grad[-2]
            dwtb,_ = self.grad[-1]
        else:
            stack = self.stack
            grad = self.grad
 
        i = 1
        for w,b in stack:
            self.hActs[i] = np.dot(w,self.hActs[i-1])
            self.hActs[i] += b

            # forward prop through time
            if i == self.temporalLayer:
                preActs = np.array(self.hActs[i])
                actsForward = np.empty(preActs.shape)
                actsForward[:,0] = preActs[:,0]
                actsForward[preActs[:,0]<=0,0] = 0.0 
                actsBackward = np.empty(preActs.shape)
                actsBackward[:,-1] = preActs[:,-1]
                actsBackward[preActs[:,-1]<=0,-1] = 0.0 
                for t in xrange(1,T):
                    actsForward[:,t] = np.dot(wtf,actsForward[:,t-1]) + preActs[:,t]
                    actsBackward[:,-t-1] = np.dot(wtb,actsBackward[:,-t]) + preActs[:,-t-1]
                    actsForward[actsForward[:,t]<=0,t] = 0.0
                    actsBackward[actsBackward[:,-t-1]<=0,-t-1] = 0.0
                self.hActs[i][:] = actsForward + actsBackward

            if i <= self.numLayers and i != self.temporalLayer:
                # hard relu
                self.hActs[i][self.hActs[i]<0.0] = 0.0
            i += 1

        # Subtract max activation
        probs = self.hActs[-1] - self.hActs[-1].max(axis=0)[None,:]

        # Softmax
        probs = np.exp(probs)
        probs /= probs.sum(axis=0)[None,:]
        cost, deltasC, skip = ctc.ctc_loss(np.asfortranarray(probs),labels,blank=0)

        if skip:
            return cost,self.grad,skip

        # back prop
        i = self.numLayers 
        self.deltasOut = None
        self.deltasIn = None
        deltasIn,deltasOut = deltasC,self.deltasOut
        for w,b in reversed(stack):
            # compute gradient
            grad[i][0] = np.dot(deltasIn,self.hActs[i].T)
            grad[i][1] = deltasIn.sum(axis=1)[:,None]

            # compute next layer deltas
            if i > 0:
                deltasOut = np.dot(w.T,deltasIn)

            # backprop through time
            if i == self.temporalLayer:
                tmpGradF = np.sign(actsForward)
                tmpGradB = np.sign(actsBackward)
                deltasForward = np.array(deltasOut)
                deltasForward[:,-1] *= tmpGradF[:,-1]
                deltasBackward = np.array(deltasOut)
                deltasBackward[:,0] *= tmpGradB[:,0]
                for t in xrange(1,T):
                    deltasForward[:,-t-1] = tmpGradF[:,-t-1]*(deltasForward[:,-t-1]+np.dot(wtf.T,deltasForward[:,-t]))
                    deltasBackward[:,t] = tmpGradB[:,t]*(deltasBackward[:,t]+np.dot(wtb.T,deltasBackward[:,t-1]))

                # Compute temporal gradient
                dwtb[:] = np.dot(deltasBackward[:,:-1],actsBackward[:,1:].T)
                dwtf[:] = np.dot(deltasForward[:,1:],actsForward[:,:-1].T)
                deltasOut = deltasForward + deltasBackward

            if i > 0 and i != self.temporalLayer:
                tmpGrad = np.sign(self.hActs[i])
                deltasOut *= tmpGrad

            if i == self.numLayers:
                deltasIn = self.deltasIn

            deltasIn,deltasOut = deltasOut,deltasIn
            i -= 1

        return cost,self.grad,skip
Example #9
    def costAndGrad(self, data, labels, key=None):
        """
        Forward prop entire utterance
        Call CTC cost function
        Compute gradient

        data is a 2-D matrix where each column is a single time frame
        Number of input frames changes across iterations
        
        labels is a vector of symbol ids; its length is unknown in
        advance and does not depend on the number of time frames
        """

        ## forward prop
        T = data.shape[1]
        sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
        stackMax = len(self.stack) - 1
        if self.temporalLayer > 0:
            stackMax -= 1

        self.hActs = [gp.empty((s, T)) for s in sizes]
        self.hActs[0] = data
        #for t in range(T):
        i = 1
        for l in range(stackMax + 1):
            w, b = self.stack[l]

            self.hActs[i] = w.dot(self.hActs[i - 1]) + b
            # loop over time for recurrent layer
            if (self.temporalLayer - 1) == l:
                for t in range(T):
                    if t > 0:
                        self.hActs[i][:, t] += self.stack[-1][0].dot(
                            self.hActs[i][:, t - 1])
                    # nonlinearity
                    if i <= stackMax:
                        self.hActs[i][:, t] = self.activation(self.hActs[i][:,
                                                                            t])
            # hidden layer activation function for batch forward prop
            elif i <= stackMax:
                self.hActs[i] = self.activation(self.hActs[i])

            #    w_t,b_t = self.stack[-1][0]
            #    self.hActs[i][:,t] += self.stack[-1][0].dot(self.hActs[i][:,t-1])
            i += 1

        # convert final layer to probs after all time iteration complete
        probs = self.hActs[-1] - gp.max(self.hActs[-1], axis=0)
        probs = gp.as_numpy_array(probs)
        probs = np.exp(probs)
        probs = probs / np.sum(probs, axis=0)

        ## pass probs and label string to ctc loss
        # TODO how much does passing to different function cost us?
        cost, delta_output, skip = ctc.ctc_loss(probs,
                                                labels.squeeze(),
                                                blank=0)

        # Store probabilities and error signal for a given key
        if key is not None and key in self.hist:
            self.hist[key].append((probs, delta_output))

        if not self.train:
            return cost, None

        delta_output = gp.garray(delta_output)
        ## back prop through time
        # zero gradients
        self.grad = [[gp.zeros(w.shape), gp.zeros(b.shape)]
                     for w, b in self.stack]
        if self.temporalLayer > 0:
            delta_t = np.zeros(self.layerSizes[self.temporalLayer - 1])
        for t in reversed(range(T)):
            # get delta from loss function
            delta = delta_output[:, t].T

            # compute gradient for output layer
            #print self.hActs[-2].shape, delta.shape, self.stack[stackMax][0].shape
            #print delta.reshape(-1,1).shape, self.hActs[-2][:,t].reshape(-1,1).shape
            # TODO can we get rid of some of these annoying reshape -1 1?
            self.grad[stackMax][0] += delta.reshape(-1, 1).dot(
                self.hActs[-2][:, t].reshape(-1, 1).T)
            self.grad[stackMax][1] += delta.reshape(-1, 1)

            # push delta through output layer
            delta = self.stack[stackMax][0].T.dot(delta)

            # iterate over lower layers
            i = len(self.layerSizes) - 1
            while i >= 0:
                # add the temporal delta if this is the recurrent layer
                if (self.temporalLayer - 1) == i:
                    #print delta.shape, delta_t.shape
                    delta += delta_t
                # push delta through activation function for this layer
                #print i, stackMax, delta.shape, self.hActs[i+1][:,t].shape
                delta = delta * self.activation(self.hActs[i + 1][:, t], True)
                #embed()
                # compute the gradient
                #print i, delta.shape, self.hActs[i][:,t].T.reshape(1,-1).shape, self.grad[i][0].shape
                self.grad[i][0] += delta.reshape(-1, 1).dot(
                    self.hActs[i][:, t].T.reshape(1, -1))
                self.grad[i][1] += delta.reshape(-1, 1)

                # add the temporal delta if this is the recurrent layer
                if (self.temporalLayer - 1) == i and t > 0:
                    self.grad[-1][0] += delta.reshape(-1, 1).dot(
                        self.hActs[i + 1][:, t - 1].T.reshape(1, -1))
                    # push delta through temporal connections
                    delta_t = self.stack[-1][0].T.dot(delta)

                    # HACK no bias for temporal layer. Give it a gradient of 0
                    self.grad[-1][1] = np.zeros((2, 1))

                # push the delta downward
                w, b = self.stack[i]
                delta = w.T.dot(delta)
                i -= 1
        #print self.grad
        return cost, self.grad, skip
Example #10
    def costAndGrad(self,data,labels,key=None):
        """
        Forward prop entire utterance
        Call CTC cost function
        Compute gradient

        data is a 2-D matrix where each column is a single time frame
        Number of input frames changes across iterations
        
        labels is a vector of symbol ids; its length is unknown in
        advance and does not depend on the number of time frames
        """

        ## forward prop
        T = data.shape[1]
        sizes = [self.inputDim]+self.layerSizes+[self.outputDim]
        stackMax = len(self.stack)-1
        if self.temporalLayer > 0:
            stackMax -= 1

        self.hActs = [gp.empty((s,T)) for s in sizes]
        self.hActs[0] = data
        #for t in range(T):
        i = 1
        for l in range(stackMax+1):
            w,b = self.stack[l]

            self.hActs[i] = w.dot(self.hActs[i-1]) + b
            # loop over time for recurrent layer
            if (self.temporalLayer-1) == l:
                for t in range(T):
                    if t > 0:
                        self.hActs[i][:,t] += self.stack[-1][0].dot(self.hActs[i][:,t-1])
                    # nonlinearity 
                    if i <= stackMax:
                        self.hActs[i][:,t] = self.activation(self.hActs[i][:,t])
            # hidden layer activation function for batch forward prop
            elif i <= stackMax:
                self.hActs[i] = self.activation(self.hActs[i])

            #    w_t,b_t = self.stack[-1][0]
            #    self.hActs[i][:,t] += self.stack[-1][0].dot(self.hActs[i][:,t-1])
            i += 1

        # convert final layer to probs after all time iteration complete
        probs = self.hActs[-1]-gp.max(self.hActs[-1],axis=0)
        probs = gp.as_numpy_array(probs)
        probs = np.exp(probs)
        probs = probs/np.sum(probs,axis=0)

        ## pass probs and label string to ctc loss
        # TODO how much does passing to different function cost us? 
        cost, delta_output, skip = ctc.ctc_loss(probs, labels.squeeze(), blank=0)

        # Store probabilities and error signal for a given key
        if key is not None and key in self.hist:
            self.hist[key].append((probs,delta_output))

        if not self.train:
            return cost,None

        delta_output = gp.garray(delta_output)
        ## back prop through time
        # zero gradients
        self.grad = [[gp.zeros(w.shape),gp.zeros(b.shape)] for w,b in self.stack]
        if self.temporalLayer > 0:
            delta_t = np.zeros(self.layerSizes[self.temporalLayer-1])
        for t in reversed(range(T)):
            # get delta from loss function
            delta = delta_output[:,t].T

            # compute gradient for output layer
            #print self.hActs[-2].shape, delta.shape, self.stack[stackMax][0].shape
            #print delta.reshape(-1,1).shape, self.hActs[-2][:,t].reshape(-1,1).shape
            # TODO can we get rid of some of these annoying reshape -1 1?
            self.grad[stackMax][0] +=  delta.reshape(-1,1).dot(self.hActs[-2][:,t].reshape(-1,1).T)
            self.grad[stackMax][1] +=  delta.reshape(-1, 1)

            # push delta through output layer
            delta = self.stack[stackMax][0].T.dot(delta)
            
            # iterate over lower layers
            i = len(self.layerSizes)-1
            while i >= 0:
                # add the temporal delta if this is the recurrent layer
                if (self.temporalLayer-1) == i:
                    #print delta.shape, delta_t.shape
                    delta += delta_t
                # push delta through activation function for this layer
                #print i, stackMax, delta.shape, self.hActs[i+1][:,t].shape
                delta = delta * self.activation(self.hActs[i+1][:,t], True)
                #embed()
                # compute the gradient
                #print i, delta.shape, self.hActs[i][:,t].T.reshape(1,-1).shape, self.grad[i][0].shape
                self.grad[i][0] += delta.reshape(-1,1).dot(self.hActs[i][:,t].T.reshape(1,-1))
                self.grad[i][1] += delta.reshape(-1,1)

                # add the temporal delta if this is the recurrent layer
                if (self.temporalLayer-1) == i and t > 0:
                    self.grad[-1][0] += delta.reshape(-1,1).dot(self.hActs[i+1][:,t-1].T.reshape(1,-1))
                    # push delta through temporal connections
                    delta_t = self.stack[-1][0].T.dot(delta)

                    # HACK no bias for temporal layer. Give it a gradient of 0
                    self.grad[-1][1] = np.zeros((2,1))

                # push the delta downward
                w,b = self.stack[i]
                delta = w.T.dot(delta)
                i -= 1
        #print self.grad
        return cost,self.grad, skip
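A minimal usage sketch (hypothetical driver code): apply the returned gradients as one SGD step. Assumes net.stack and the returned grad are parallel [[W, b], ...] lists of host arrays:

def sgd_step(net, data, labels, alpha=1e-4):
    """One SGD update from the (cost, grad, skip) triple. Skipped
    utterances leave the parameters untouched."""
    cost, grad, skip = net.costAndGrad(data, labels)
    if skip:
        return cost
    for (W, b), (dW, db) in zip(net.stack, grad):
        W -= alpha * dW
        b -= alpha * db
    return cost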