Ejemplo n.º 1
0
def dbn_supervised_predict_exact(ws_vh, ws_v, ws_h, x):
    """
    Compute class probabilities for x with a supervised DBN.

    Implements the exact method of section 6.2 of Hinton, Osindero, Teh 2006:
    every class is clamped in turn on the label units of the top-level RBM and
    the (negated) free energy of the resulting visible vector is evaluated.
    The free energy formula follows http://deeplearning.net/tutorial/rbm.html

    ws_vh: per-layer weight matrices; the last one belongs to the top RBM.
    ws_v:  per-layer visible biases; only the last entry is used here.
    ws_h:  per-layer hidden biases.
    x:     Input data. (NxD matrix)

    Returns an (N x K) array whose rows are softmax-normalised over the K
    classes, where K is the number of top-level visible units minus the size
    of the penultimate layer.
    """
    n_layers = len(ws_vh)
    n_samples = x.shape[0]

    # Deterministic bottom-up pass: propagate activations (no sampling) up to
    # the visible layer of the top-level RBM. Samples are stored column-wise.
    acts = x.T
    for layer in range(n_layers - 1):
        acts = gnp.logistic(gnp.dot(ws_vh[layer].T, acts) + ws_h[layer])

    # Top-level visible units = label units followed by penultimate features.
    n_classes = ws_vh[-1].shape[0] - acts.shape[0]

    labels = gnp.zeros((n_classes, n_samples))
    # Holds -free_energy, one column per hypothesised class.
    neg_free_energy = gnp.zeros((n_samples, n_classes))
    for k in range(n_classes):
        # hypothesise that class k is the correct label for every sample
        labels[k, :] = 1.0

        visible = gnp.concatenate((labels, acts))
        bias_term = gnp.dot(ws_v[-1].T, visible)

        hidden_in = gnp.dot(ws_vh[-1].T, visible) + ws_h[-1]
        hidden_term = gnp.sum(gnp.log(gnp.exp(hidden_in) + 1.0), axis=0)

        neg_free_energy[:, k] = bias_term + hidden_term

        # clear the label units before trying the next class
        labels[:, :] = 0.0

    # Subtract the per-row maximum before exponentiating so the softmax does
    # not lose everything to underflow/overflow.
    shifted = gnp.exp(
        neg_free_energy - gnp.max(neg_free_energy, axis=1)[:, gnp.newaxis]
    )
    return shifted / gnp.sum(shifted, axis=1)[:, gnp.newaxis]
Ejemplo n.º 2
0
 def softmax(self, x):
     """Return the softmax of x taken along axis 1 (each row sums to 1).

     The row maximum is subtracted before exponentiating, which leaves the
     result unchanged mathematically but keeps exp() from overflowing.
     """
     row_peak = gp.max(x, axis=1)[:, gp.newaxis]
     exps = gp.exp(x - row_peak)
     row_total = gp.sum(exps, 1)[:, gp.newaxis]
     return exps / row_total
Ejemplo n.º 3
0
 def softmax(self, x):
     """Row-wise softmax: normalise exp(x) so every row of the result sums to 1.

     Each row is shifted by its own maximum first; the shift cancels in the
     ratio and only serves to keep the exponentials in range.
     """
     shifted = x - gp.max(x, axis=1)[:, gp.newaxis]
     numer = gp.exp(shifted)
     denom = gp.sum(numer, 1)
     return numer / denom[:, gp.newaxis]
Ejemplo n.º 4
0
 def safe_softmax(self, Y):
     """Compute a numerically guarded softmax over axis 1 of Y.

     The largest entry of each row is subtracted before exp() so the biggest
     exponent is exactly zero; the rows of the result sum to 1.
     """
     peak = gp.max(Y, axis=1)[:, gp.newaxis]
     exps = gp.exp(Y - peak)
     return exps / gp.sum(exps, axis=1)[:, gp.newaxis]
Ejemplo n.º 5
0
 def max(A,axis,out):
     """Maximum of A along ``axis``, optionally stored in ``out``.

     A:    array to reduce.
     axis: axis to reduce over.
     out:  destination array, or None to have the result allocated/returned.

     For 2-D input the reduction is delegated to ``_base_shaped(1).max`` and
     the (possibly freshly allocated) ``out`` is returned with the reduced
     axis kept at length 1. For any other rank ``gp.max`` is used; when
     ``out`` is given it must hold exactly one element and receives a copy of
     the result.
     """
     if A.ndim == 2:
         # `is None` rather than `== None`: the latter dispatches to the
         # array's comparison operator instead of testing for a missing arg.
         if out is None:
             out = gp.empty((A.shape[0],1) if axis == 1 else (1,A.shape[1]),dtype=A.dtype)
         # NOTE(review): the 1-axis flip presumably compensates for the
         # layout exposed by _base_shaped -- confirm against that helper.
         A._base_shaped(1).max(1-axis,target=out._base_shaped(1))
         return out

     r = gp.max(A,axis)  # gnumpy has optimized max over 1D vectors, so use it
     if out is not None:
         assert(out.size == 1)
         out[:] = r[:]
     return r
Ejemplo n.º 6
0
 def max(A, axis, out):
     """Reduce A with a maximum along ``axis``; write into ``out`` if given.

     A:    array to reduce.
     axis: axis to reduce over.
     out:  destination array, or None.

     2-D inputs go through ``_base_shaped(1).max`` and return ``out``
     (allocated here when None) with the reduced axis kept at length 1.
     Other ranks fall back to ``gp.max``; a supplied ``out`` must then have
     exactly one element and is filled with the result, which is returned.
     """
     if A.ndim == 2:
         # Identity test: `out == None` would call the array's __eq__.
         if out is None:
             result_shape = (A.shape[0], 1) if axis == 1 else (1, A.shape[1])
             out = gp.empty(result_shape, dtype=A.dtype)
         # NOTE(review): axis is flipped (1 - axis) for the _base_shaped
         # view -- presumably a layout difference; confirm with that helper.
         A._base_shaped(1).max(1 - axis, target=out._base_shaped(1))
         return out

     # gnumpy has optimized max over 1D vectors, so use it
     r = gp.max(A, axis)
     if out is not None:
         assert (out.size == 1)
         out[:] = r[:]
     return r
Ejemplo n.º 7
0
    def costAndGrad(self, data, labels):
        """Forward-propagate a minibatch, compute the softmax cross-entropy
        cost and, when training, the gradients of every layer.

        data:   input activations; examples are stored column-wise (the
                softmax below normalises over axis 0).
        labels: class index of each column, used to build a one-hot matrix.

        Returns ``(cost, None)`` when ``self.train`` is false, otherwise
        ``(cost, self.grad)`` where ``self.grad[i]`` is ``[dW, db]`` of layer i.
        Side effects: overwrites ``self.hActs``, ``self.deltas``, ``self.grad``.
        """
        inv_batch = 1. / self.mbSize
        n_hidden = len(self.layerSizes)

        # forward prop: layers 1..n_hidden get the nonlinearity, the final
        # (output) layer stays linear until the softmax below
        self.hActs[0] = data
        for layer, (w, b) in enumerate(self.stack, 1):
            pre_act = w.dot(self.hActs[layer - 1]) + b
            self.hActs[layer] = (self.activation(pre_act)
                                 if layer <= n_hidden else pre_act)

        # column-wise softmax, shifted by the column maximum for stability
        logits = self.hActs[-1]
        probs = gp.exp(logits - gp.max(logits, axis=0))
        probs = probs / gp.sum(probs, axis=0)
        # raise anything below 1e-8 up to 1e-8 so log() stays finite
        probs += (probs < 1e-8) * (1e-8 - probs)

        # one-hot targets: row = class id, column = example
        onehot = np.zeros(probs.shape)
        onehot[labels, range(self.mbSize)] = 1
        onehot = gp.garray(onehot)
        cost = -inv_batch * gp.sum(onehot * gp.log(probs))

        if not self.train:
            return cost, None

        # back prop: output delta is probs - targets; walk the weight
        # matrices from the top down (the first layer's weights are not
        # needed to form any delta)
        self.deltas[-1] = probs - onehot
        top = n_hidden - 1
        for step, (w, _) in enumerate(reversed(self.stack[1:])):
            i = top - step
            slope = self.activation(self.hActs[i + 1], True)
            self.deltas[i] = w.T.dot(self.deltas[i + 1]) * slope

        # compute gradients, averaged over the minibatch
        for i, slot in enumerate(self.grad):
            slot[0] = inv_batch * self.deltas[i].dot(self.hActs[i].T)
            slot[1] = inv_batch * gp.sum(self.deltas[i],
                                         axis=1).reshape(-1, 1)

        return cost, self.grad
Ejemplo n.º 8
0
    def costAndGrad(self,data,labels):
        """Compute the minibatch cross-entropy cost and layer gradients.

        data:   inputs, one example per column.
        labels: class index for each column of ``data``.

        Returns ``(cost, None)`` if ``self.train`` is false, else
        ``(cost, self.grad)``. ``self.hActs``, ``self.deltas`` and
        ``self.grad`` are updated in place.
        """
        depth = len(self.layerSizes)
        scale = 1./self.mbSize

        # forward prop (the output layer, index > depth, is left linear)
        self.hActs[0] = data
        for i in range(1, len(self.stack)+1):
            w,b = self.stack[i-1]
            self.hActs[i] = w.dot(self.hActs[i-1])+b
            if i <= depth:
                self.hActs[i] = self.activation(self.hActs[i])

        # softmax over axis 0 with the usual max-shift, then floor at 1e-8
        top_act = self.hActs[-1]
        probs = gp.exp(top_act-gp.max(top_act,axis=0))
        probs = probs/gp.sum(probs,axis=0)
        probs += (probs < 1e-8)*(1e-8-probs)

        # dense one-hot target matrix built on the host, then moved to gp
        targets = np.zeros(probs.shape)
        targets[labels,range(self.mbSize)] = 1
        targets = gp.garray(targets)
        cost = -scale*gp.sum(targets*gp.log(probs))

        if not self.train:
            return cost,None

        # back prop: `upper` indexes the delta already known, `upper-1`
        # the one being produced
        self.deltas[-1] = probs-targets
        upper = depth
        for w,_ in reversed(self.stack[1:]):
            slope = self.activation(self.hActs[upper], True)
            self.deltas[upper-1] = w.T.dot(self.deltas[upper])*slope
            upper -= 1

        # compute gradients (mean over the minibatch)
        for i,pair in enumerate(self.grad):
            pair[0] = scale*self.deltas[i].dot(self.hActs[i].T)
            pair[1] = scale*gp.sum(self.deltas[i],axis=1).reshape(-1,1)

        return cost,self.grad
Ejemplo n.º 9
0
    def costAndGrad(self,data,labels,key=None):
        """
        Forward prop an entire utterance, call the CTC cost function and
        back-propagate through time.

        data is a 2-D matrix where each column is a single time frame.
        The number of input frames changes across iterations.

        labels is a vector of symbol ids; its length is unknown and does not
        depend on the number of time frames.

        key, when not None and already present in self.hist, causes the
        (probs, delta_output) pair of this call to be appended to
        self.hist[key].

        Returns (cost, None) when self.train is false, otherwise
        (cost, self.grad, skip).
        NOTE(review): the two return paths have different arity -- callers
        must already cope with that; confirm before unifying.
        """

        ## forward prop
        T = data.shape[1]
        sizes = [self.inputDim]+self.layerSizes+[self.outputDim]
        # When a temporal layer exists the last stack entry holds the
        # recurrent weights, so it is excluded from the feed-forward pass.
        stackMax = len(self.stack)-1
        if self.temporalLayer > 0:
            stackMax -= 1

        self.hActs = [gp.empty((s,T)) for s in sizes]
        self.hActs[0] = data
        i = 1
        for l in range(stackMax+1):
            w,b = self.stack[l]

            self.hActs[i] = w.dot(self.hActs[i-1]) + b
            # loop over time for recurrent layer
            if (self.temporalLayer-1) == l:
                for t in range(T):
                    if t > 0:
                        self.hActs[i][:,t] += self.stack[-1][0].dot(self.hActs[i][:,t-1])
                    # nonlinearity
                    if i <= stackMax:
                        self.hActs[i][:,t] = self.activation(self.hActs[i][:,t])
            # hidden layer activation function for batch forward prop
            elif i <= stackMax:
                self.hActs[i] = self.activation(self.hActs[i])

            i += 1

        # convert final layer to probs after all time iteration complete
        # (softmax over axis 0, shifted by the per-frame maximum; the exp and
        # normalisation run in numpy because the CTC code consumes numpy)
        probs = self.hActs[-1]-gp.max(self.hActs[-1],axis=0)
        probs = gp.as_numpy_array(probs)
        probs = np.exp(probs)
        probs = probs/np.sum(probs,axis=0)

        ## pass probs and label string to ctc loss
        # TODO how much does passing to different function cost us?
        cost, delta_output, skip = ctc.ctc_loss(probs, labels.squeeze(), blank=0)

        # Store probabilities and error signal for a given key
        if key is not None and key in self.hist:
            self.hist[key].append((probs,delta_output))

        if not self.train:
            return cost,None

        delta_output =  gp.garray(delta_output)
        ## back prop through time
        # zero gradients
        self.grad = [[gp.zeros(w.shape),gp.zeros(b.shape)] for w,b in self.stack]
        if self.temporalLayer > 0:
            # error flowing back from frame t+1 into the recurrent layer
            delta_t = np.zeros(self.layerSizes[self.temporalLayer-1])
        for t in reversed(range(T)):
            # get delta from loss function
            delta = delta_output[:,t].T

            # compute gradient for output layer
            # TODO can we get rid of some of these annoying reshape -1 1?
            self.grad[stackMax][0] +=  delta.reshape(-1,1).dot(self.hActs[-2][:,t].reshape(-1,1).T)
            self.grad[stackMax][1] +=  delta.reshape(-1, 1)

            # push delta through output layer
            delta = self.stack[stackMax][0].T.dot(delta)

            # iterate over lower layers
            i = len(self.layerSizes)-1
            while i >= 0:
                # add the temporal delta if this is the recurrent layer
                if (self.temporalLayer-1) == i:
                    delta += delta_t
                # push delta through activation function for this layer
                delta = delta * self.activation(self.hActs[i+1][:,t], True)
                # compute the gradient
                self.grad[i][0] += delta.reshape(-1,1).dot(self.hActs[i][:,t].T.reshape(1,-1))
                self.grad[i][1] += delta.reshape(-1,1)

                # add the temporal delta if this is the recurrent layer
                if (self.temporalLayer-1) == i and t > 0:
                    self.grad[-1][0] += delta.reshape(-1,1).dot(self.hActs[i+1][:,t-1].T.reshape(1,-1))
                    # push delta through temporal connections
                    delta_t = self.stack[-1][0].T.dot(delta)

                    # HACK no bias for temporal layer. Give it a gradient of 0
                    # NOTE(review): shape (2,1) is hard-coded and replaces the
                    # gp.zeros(b.shape) allocated above -- confirm intended.
                    self.grad[-1][1] = np.zeros((2,1))

                # push the delta downward
                w,b = self.stack[i]
                delta = w.T.dot(delta)
                i -= 1
        return cost,self.grad, skip
Ejemplo n.º 10
0
 def max(A, axis):
     """Return the maximum of A reduced along ``axis`` (delegates to gp.max)."""
     reduced = gp.max(A, axis=axis)
     return reduced
Ejemplo n.º 11
0
def activation_softmax(x):
    """Softmax activation over axis 1: every row of the result sums to 1.

    Rows are shifted by their maximum before exp() purely for numerical
    stability; the shift cancels in the normalisation.
    """
    row_max = g.max(x,axis=1)[:,g.newaxis]
    exps = g.exp(x - row_max)
    totals = g.sum(exps,axis=1)[:,g.newaxis]
    return exps / totals
Ejemplo n.º 12
0
 def softmax_old(x):
     """Row-wise softmax computed as exp(x - logsumexp(x)).

     The log-sum-exp of each row is evaluated with the row maximum factored
     out, so the exponentials never see a positive argument.
     """
     row_max = gp.max(x, axis=1)[:, gp.newaxis]
     shifted_sum = gp.sum(gp.exp(x - row_max), axis=1)
     log_norm = row_max + gp.log(shifted_sum)[:, gp.newaxis]
     return gp.exp(x - log_norm)
Ejemplo n.º 13
0
def activation_softmax(x):
    """Return softmax(x) along axis 1, using the max-shift trick for stability."""
    shifted = g.exp(x - g.max(x, axis=1)[:, g.newaxis])
    norm = g.sum(shifted, axis=1)
    return shifted / norm[:, g.newaxis]
Ejemplo n.º 14
0
def printMaxGrad(net,log):
    """Write the largest absolute weight and bias gradient of each layer.

    net: object exposing ``weights``, ``WGrads`` and ``biasGrads`` sequences
         indexed by layer; one line is written per entry of ``net.weights``.
    log: writable file-like object. ``None`` means standard output, matching
         what the former ``print >>log`` statement did.

    Uses ``log.write`` instead of the Python-2-only ``print >>log`` form,
    which on Python 3 parses as a shift expression and raises TypeError.
    """
    if log is None:
        import sys
        log = sys.stdout
    for i in range(len(net.weights)):
        w_max = num.array(gnp.max(abs(net.WGrads[i])))
        b_max = num.array(gnp.max(abs(net.biasGrads[i])))
        log.write("  Maximum Weight and Bias Gradient in layer %d: %f and %f" % (i,w_max,b_max) + "\n")
    log.write("===========" + "\n")
Ejemplo n.º 15
0
    def costAndGrad(self,data,labels=None,key=None):
        """
        Forward prop an entire utterance, call the CTC cost function and
        compute the gradient.

        data is a 2-D matrix where each column is a single time frame.
        The number of input frames changes across iterations.

        labels is a vector of symbol ids; its length is unknown and does not
        depend on the number of time frames.

        key is accepted for interface compatibility and is not used here.

        Returns:
          * when self.train is false: whatever ctc.decode_best_path returns
            for the frame posteriors (no cost/gradient tuple);
          * when the CTC loss flags the utterance via ``skip``:
            (cost, self.grad, skip) with self.grad left as it was;
          * otherwise (cost, self.grad, skip) with freshly computed gradients.
        """

        ## forward prop
        # this is the same as minibatch forward prop
        # since we pre-compute context window features for each time
        self.hActs[0] = data
        i = 1
        for w,b in self.stack:
            self.hActs[i] = w.dot(self.hActs[i-1])+b
            # only the hidden layers get the nonlinearity; the output layer
            # stays linear until the softmax below
            if i <= len(self.layerSizes):
                self.hActs[i] = self.activation(self.hActs[i])
            i += 1

        # softmax over axis 0, shifted by the per-frame maximum; finished in
        # numpy because the CTC routines are handed a numpy array
        probs = self.hActs[-1]-gp.max(self.hActs[-1],axis=0)
        probs = gp.as_numpy_array(probs)
        probs = np.exp(probs)
        probs = probs/np.sum(probs,axis=0)
        # TODO have to clamp? (e.g. floor probs at 1e-12)

        ## pass probs and label string to ctc loss
        # TODO how much does passing to different function cost us?
        if not self.train:
            return ctc.decode_best_path(probs, ref=labels, blank=0)

        cost, self.deltas[-1], skip = ctc.ctc_loss(probs, labels, blank=0)

        # Bad utterance ?
        if skip:
            return cost,self.grad,skip

        self.deltas[-1] = gp.garray(self.deltas[-1])

        # back prop
        i = len(self.layerSizes)-1
        for w,b in reversed(self.stack[1:]):
            grad = self.activation(self.hActs[i+1], True)
            self.deltas[i] = w.T.dot(self.deltas[i+1])*grad
            i -= 1

        # compute gradients
        # NOTE we do not divide by utterance length.
        #    Will need to scale up weight norm penalty accordingly
        for i in range(len(self.grad)):
            self.grad[i][0] = self.deltas[i].dot(self.hActs[i].T)
            self.grad[i][1] = gp.sum(self.deltas[i],axis=1).reshape(-1,1)

        return cost,self.grad,skip
Ejemplo n.º 16
0
 def softmax(A):
     """Return the softmax of A along axis 1 without modifying A.

     Each row is shifted by its maximum before exp() for numerical stability.
     The shift is computed into a new array: the previous ``A -= ...`` form
     overwrote the caller's input with the shifted values.
     """
     shifted = A - gp.max(A, axis=1)[:, gp.newaxis]
     Z = gp.exp(shifted)
     return Z / gp.sum(Z, axis=1)[:, gp.newaxis]
Ejemplo n.º 17
0
            weights_step, bias_vis_step, bias_hid_step = ml.rbm.cd_update(x)

        if epoch >= cfg.use_final_momentum_from_epoch:
            momentum = cfg.final_momentum
        else:
            momentum = cfg.initial_momentum
        
        if False:
            print "weights_step:"
            print weights_step[0:5,0:5]
            print "bias_vis_step:"
            print bias_vis_step[0:5]
            print "bias_hid_step:"
            print bias_hid_step[0:5]

            print "max(weights_step): ", gp.max(weights_step)

            sys.exit(0)         

        weights_update = momentum * weights_m1 + \
            cfg.step_rate * (weights_step - cfg.weight_cost * ml.rbm.weights)
        bias_vis_update = momentum * bias_vis_m1 + cfg.step_rate * bias_vis_step
        bias_hid_update = momentum * bias_hid_m1 + cfg.step_rate * bias_hid_step
    
        ml.rbm.weights += weights_update
        ml.rbm.bias_vis += bias_vis_update
        ml.rbm.bias_hid += bias_hid_update

        weights_m1 = weights_update
        bias_vis_m1 = bias_vis_update
        bias_hid_m1 = bias_hid_update
Ejemplo n.º 18
0
    def costAndGrad(self, data, labels, key=None):
        """
        Forward prop an entire utterance, call the CTC cost function and
        back-propagate through time to compute the gradient.

        data is a 2-D matrix where each column is a single time frame.
        The number of input frames changes across iterations.

        labels is a vector of symbol ids; its length is unknown and does not
        depend on the number of time frames.

        key, when not None and already present in self.hist, causes the
        (probs, delta_output) pair of this call to be appended to
        self.hist[key].

        Returns (cost, None) when self.train is false, otherwise
        (cost, self.grad, skip).
        NOTE(review): the two return paths differ in arity -- confirm callers
        handle both.
        """

        ## forward prop
        T = data.shape[1]
        sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
        # With a temporal layer the last stack entry is used as the recurrent
        # weights (see self.stack[-1][0] below), so it is left out of the
        # feed-forward range.
        stackMax = len(self.stack) - 1
        if self.temporalLayer > 0:
            stackMax -= 1

        self.hActs = [gp.empty((s, T)) for s in sizes]
        self.hActs[0] = data
        #for t in range(T):
        i = 1
        for l in range(stackMax + 1):
            w, b = self.stack[l]

            self.hActs[i] = w.dot(self.hActs[i - 1]) + b
            # loop over time for recurrent layer
            if (self.temporalLayer - 1) == l:
                for t in range(T):
                    # frame t receives the already-activated frame t-1 through
                    # the recurrent weights
                    if t > 0:
                        self.hActs[i][:, t] += self.stack[-1][0].dot(
                            self.hActs[i][:, t - 1])
                    # nonlinearity
                    if i <= stackMax:
                        self.hActs[i][:, t] = self.activation(self.hActs[i][:,
                                                                            t])
            # hidden layer activation function for batch forward prop
            elif i <= stackMax:
                self.hActs[i] = self.activation(self.hActs[i])

            #    w_t,b_t = self.stack[-1][0]
            #    self.hActs[i][:,t] += self.stack[-1][0].dot(self.hActs[i][:,t-1])
            i += 1

        # convert final layer to probs after all time iteration complete
        # (softmax over axis 0, shifted by the per-frame maximum; finished in
        # numpy because probs is handed to the ctc module as a numpy array)
        probs = self.hActs[-1] - gp.max(self.hActs[-1], axis=0)
        probs = gp.as_numpy_array(probs)
        probs = np.exp(probs)
        probs = probs / np.sum(probs, axis=0)

        ## pass probs and label string to ctc loss
        # TODO how much does passing to different function cost us?
        cost, delta_output, skip = ctc.ctc_loss(probs,
                                                labels.squeeze(),
                                                blank=0)

        # Store probabilities and error signal for a given key
        if key is not None and key in self.hist:
            self.hist[key].append((probs, delta_output))

        if not self.train:
            return cost, None

        delta_output = gp.garray(delta_output)
        ## back prop through time
        # zero gradients
        self.grad = [[gp.zeros(w.shape), gp.zeros(b.shape)]
                     for w, b in self.stack]
        if self.temporalLayer > 0:
            # error arriving at the recurrent layer from the following frame;
            # zero for the last frame, which is processed first
            delta_t = np.zeros(self.layerSizes[self.temporalLayer - 1])
        for t in reversed(range(T)):
            # get delta from loss function
            delta = delta_output[:, t].T

            # compute gradient for output layer
            #print self.hActs[-2].shape, delta.shape, self.stack[stackMax][0].shape
            #print delta.reshape(-1,1).shape, self.hActs[-2][:,t].reshape(-1,1).shape
            # TODO can we get rid of some of these annoying reshape -1 1?
            self.grad[stackMax][0] += delta.reshape(-1, 1).dot(
                self.hActs[-2][:, t].reshape(-1, 1).T)
            self.grad[stackMax][1] += delta.reshape(-1, 1)

            # push delta through output layer
            delta = self.stack[stackMax][0].T.dot(delta)

            # iterate over lower layers
            i = len(self.layerSizes) - 1
            while i >= 0:
                # add the temporal delta if this is the recurrent layer
                if (self.temporalLayer - 1) == i:
                    #print delta.shape, delta_t.shape
                    delta += delta_t
                # push delta through activation function for this layer
                #print i, stackMax, delta.shape, self.hActs[i+1][:,t].shape
                delta = delta * self.activation(self.hActs[i + 1][:, t], True)
                #embed()
                # compute the gradient
                #print i, delta.shape, self.hActs[i][:,t].T.reshape(1,-1).shape, self.grad[i][0].shape
                self.grad[i][0] += delta.reshape(-1, 1).dot(
                    self.hActs[i][:, t].T.reshape(1, -1))
                self.grad[i][1] += delta.reshape(-1, 1)

                # add the temporal delta if this is the recurrent layer
                if (self.temporalLayer - 1) == i and t > 0:
                    # recurrent-weight gradient: this frame's delta times the
                    # previous frame's activation of the same layer
                    self.grad[-1][0] += delta.reshape(-1, 1).dot(
                        self.hActs[i + 1][:, t - 1].T.reshape(1, -1))
                    # push delta through temporal connections
                    delta_t = self.stack[-1][0].T.dot(delta)

                    # HACK no bias for temporal layer. Give it a gradient of 0
                    # NOTE(review): shape (2, 1) is hard-coded and replaces the
                    # gp.zeros(b.shape) allocated above -- confirm intended.
                    self.grad[-1][1] = np.zeros((2, 1))

                # push the delta downward
                w, b = self.stack[i]
                delta = w.T.dot(delta)
                i -= 1
        #print self.grad
        return cost, self.grad, skip