def dbn_supervised_predict_exact(ws_vh, ws_v, ws_h, x):
    """
    Predict class probabilities of input x from a supervised DBN.

    Uses the exact method mentioned in section 6.2 of Hinton, Osindero,
    Teh 2006: each label unit of the top-level RBM is clamped in turn and
    the (negative) free energies are normalised into probabilities.
    The free energy formula is taken from
    http://deeplearning.net/tutorial/rbm.html

    ws_vh: sequence of weight matrices, one per RBM; ws_vh[l].T is applied
           to the input of layer l.
    ws_v:  sequence of visible biases; only ws_v[-1] (top-level RBM) is
           read here.
    ws_h:  sequence of hidden biases, one per RBM.
    x:     Input data. (NxD matrix)

    Returns an (N x K) matrix whose rows sum to one, where K is the number
    of visible units of the top-level RBM that are not fed by the
    penultimate layer (the label units).
    """
    n_layers = len(ws_vh)
    N = x.shape[0]

    # make a forward pass to get from input layer to visible layer of top
    # level RBM; the pass is deterministic (activations are propagated,
    # not stochastic samples)
    h_prev = x.T
    for l in range(n_layers - 1):
        ah = gnp.dot(ws_vh[l].T, h_prev) + ws_h[l]
        h_prev = gnp.logistic(ah)

    H = ws_vh[-1].shape[0]  # number of visible units top level RBM
    Hx = h_prev.shape[0]    # number of hidden units in the penultimate layer
    K = H - Hx              # number of supervised inputs to top level RBM

    # for every class, assume it is the correct label and calculate its
    # free energy (we actually accumulate -free_energy)
    y = gnp.zeros((K, N))
    free_energy = gnp.zeros((N, K))
    for k in range(K):
        # set the current assumed class label
        y[k, :] = 1.0

        # visible unit vector: label units stacked on top of the features
        v = gnp.concatenate((y, h_prev))
        e_v = gnp.dot(ws_v[-1].T, v)  # bias energy term

        ah = gnp.dot(ws_vh[-1].T, v) + ws_h[-1]
        # log(1 + exp(ah)) written as max(ah, 0) + log(1 + exp(-|ah|)) so
        # that large pre-activations do not overflow exp() to inf
        softplus = ah * (ah > 0) + gnp.log(1.0 + gnp.exp(-abs(ah)))
        e_h = gnp.sum(softplus, axis=0)

        free_energy[:, k] = e_v + e_h

        # clear the label that was just set, ready for the next class
        y[k, :] = 0.0

    # since these numbers may get pretty extreme, subtract the row maximum
    # before exponentiating; the shifted exponentials are computed once and
    # reused for numerator and denominator
    shifted = free_energy - gnp.max(free_energy, axis=1)[:, gnp.newaxis]
    unnormalised = gnp.exp(shifted)
    pred_y = unnormalised / gnp.sum(unnormalised, axis=1)[:, gnp.newaxis]
    return pred_y
def softmax(self, x):
    """Return the row-wise softmax of the 2-D array ``x``.

    Every row is shifted by its own maximum before exponentiation, so the
    largest exponent evaluated is exp(0).
    """
    row_peak = gp.max(x, axis=1)[:, gp.newaxis]
    exps = gp.exp(x - row_peak)
    row_total = gp.sum(exps, 1)[:, gp.newaxis]
    return exps / row_total
def softmax(self, x):
    """Compute softmax along axis 1 of ``x``.

    The per-row maximum is subtracted first; this leaves the result
    unchanged mathematically but keeps the exponentials bounded by 1.
    """
    shifted = x - gp.max(x, axis=1)[:, gp.newaxis]
    numer = gp.exp(shifted)
    denom = gp.sum(numer, 1)
    return numer / denom[:, gp.newaxis]
def safe_softmax(self, Y):
    """Compute a reasonably (numerically) safe softmax over each row of Y.

    The row maximum is removed before exponentiating, then every row of
    exponentials is divided by its own sum.
    """
    shifted = Y - gp.max(Y, axis=1)[:, gp.newaxis]
    exps = gp.exp(shifted)
    return exps / gp.sum(exps, axis=1)[:, gp.newaxis]
def max(A, axis, out):
    """Maximum of ``A`` along ``axis``, optionally written into ``out``.

    For 2-D ``A`` the reduction is written into ``out`` (allocated here as
    an (n, 1) or (1, m) array when ``out`` is None) and ``out`` is
    returned. Otherwise ``gp.max`` is used and its result returned; if
    ``out`` is supplied it must hold exactly one element and receives a
    copy of the result.

    NOTE(review): ``_base_shaped`` is an internal accessor of the array
    type; its semantics are not visible from this block.
    """
    if A.ndim == 2:
        # identity test: ``out == None`` on an array is an elementwise
        # comparison, not a check for "no output buffer supplied"
        if out is None:
            out_shape = (A.shape[0], 1) if axis == 1 else (1, A.shape[1])
            out = gp.empty(out_shape, dtype=A.dtype)
        A._base_shaped(1).max(1 - axis, target=out._base_shaped(1))
        return out

    r = gp.max(A, axis)  # gnumpy has optimized max over 1D vectors, so use it
    if out is not None:
        assert out.size == 1
        out[:] = r[:]
    return r
def max(A, axis, out):
    """Reduce ``A`` with max along ``axis``.

    2-D input: the result is stored in ``out`` and ``out`` is returned; a
    fresh (rows, 1) buffer for ``axis == 1`` or (1, cols) buffer otherwise
    is allocated when ``out`` is None.
    Other input: the result of ``gp.max(A, axis)`` is returned and, when
    ``out`` is given, also copied into it (``out`` must have size 1).

    NOTE(review): relies on the array type's ``_base_shaped`` accessor,
    whose contract is defined outside this block.
    """
    if A.ndim == 2:
        # ``is None`` rather than ``== None``: equality on array objects is
        # elementwise and does not answer "was a buffer passed?"
        if out is None:
            if axis == 1:
                out = gp.empty((A.shape[0], 1), dtype=A.dtype)
            else:
                out = gp.empty((1, A.shape[1]), dtype=A.dtype)
        A._base_shaped(1).max(1 - axis, target=out._base_shaped(1))
        return out
    else:
        # gnumpy has optimized max over 1D vectors, so use it
        r = gp.max(A, axis)
        if out is not None:
            assert out.size == 1
            out[:] = r[:]
        return r
def costAndGrad(self, data, labels):
    """Forward pass, softmax cross-entropy cost and (optionally) gradients.

    data is stored as the input activation ``self.hActs[0]``; each entry of
    ``self.stack`` is a (weights, bias) pair applied in order, with
    ``self.activation`` used on the first ``len(self.layerSizes)`` layers.
    labels holds one class index per column of the output (indexed together
    with ``range(self.mbSize)``).

    Returns ``(cost, None)`` when ``self.train`` is false, otherwise
    ``(cost, self.grad)`` where every ``self.grad[i]`` is overwritten with
    the [weight, bias] gradients averaged over ``self.mbSize``.
    """
    num_hidden = len(self.layerSizes)

    # forward prop
    self.hActs[0] = data
    for layer, (w, b) in enumerate(self.stack, 1):
        self.hActs[layer] = w.dot(self.hActs[layer - 1]) + b
        if layer <= num_hidden:
            self.hActs[layer] = self.activation(self.hActs[layer])

    # column-wise softmax of the output layer, shifted by the column max
    logits = self.hActs[-1]
    probs = gp.exp(logits - gp.max(logits, axis=0))
    probs = probs / gp.sum(probs, axis=0)
    # raise every probability below 1e-8 up to 1e-8 before taking the log
    probs += (probs < 1e-8) * (1e-8 - probs)

    # one-hot target matrix with the same shape as probs
    one_hot = np.zeros(probs.shape)
    one_hot[labels, range(self.mbSize)] = 1
    one_hot = gp.garray(one_hot)

    inv_batch = 1. / self.mbSize
    cost = -inv_batch * gp.sum(one_hot * gp.log(probs))

    if not self.train:
        return cost, None

    # back prop
    self.deltas[-1] = probs - one_hot
    layer = num_hidden - 1
    for w, b in reversed(self.stack[1:]):
        slope = self.activation(self.hActs[layer + 1], True)
        self.deltas[layer] = w.T.dot(self.deltas[layer + 1]) * slope
        layer -= 1

    # compute gradients
    for idx, pair in enumerate(self.grad):
        layer_delta = self.deltas[idx]
        pair[0] = inv_batch * layer_delta.dot(self.hActs[idx].T)
        pair[1] = inv_batch * gp.sum(layer_delta, axis=1).reshape(-1, 1)

    return cost, self.grad
def costAndGrad(self,data,labels):
    """Compute the cross-entropy cost of a minibatch and its gradients.

    The network in ``self.stack`` (a sequence of (w, b) pairs) is run on
    ``data``; all layers up to index ``len(self.layerSizes)`` go through
    ``self.activation`` and the last layer through a column-wise softmax.
    ``labels`` gives the target class index for each of the
    ``self.mbSize`` columns.

    Returns ``(cost, None)`` if ``self.train`` is false; otherwise
    ``(cost, self.grad)`` with ``self.grad`` refreshed in place.
    """
    depth = len(self.layerSizes)

    # forward prop: hActs[k] holds the activations of layer k
    self.hActs[0] = data
    k = 1
    for w,b in self.stack:
        below = self.hActs[k-1]
        self.hActs[k] = w.dot(below)+b
        if k <= depth:
            self.hActs[k] = self.activation(self.hActs[k])
        k += 1

    # softmax over axis 0 with the column maximum removed first
    top = self.hActs[-1]
    probs = top-gp.max(top,axis=0)
    probs = gp.exp(probs)
    probs = probs/gp.sum(probs,axis=0)
    # floor at 1e-8: entries below the floor are replaced by the floor
    probs += (probs < 1e-8)*(1e-8-probs)

    targets = np.zeros(probs.shape)
    targets[labels,range(self.mbSize)] = 1
    targets = gp.garray(targets)
    cost = -(1./self.mbSize)*gp.sum(targets*gp.log(probs))

    if not self.train:
        return cost,None

    # back prop: output error first, then walk the stack top-down
    self.deltas[-1] = probs-targets
    k = depth-1
    for w,b in reversed(self.stack[1:]):
        deriv = self.activation(self.hActs[k+1], True)
        self.deltas[k] = w.T.dot(self.deltas[k+1])*deriv
        k -= 1

    # compute gradients, averaged over the minibatch
    scale = 1./self.mbSize
    for k in range(len(self.grad)):
        d = self.deltas[k]
        self.grad[k][0] = scale*d.dot(self.hActs[k].T)
        self.grad[k][1] = scale*gp.sum(d,axis=1).reshape(-1,1)

    return cost,self.grad
def costAndGrad(self,data,labels,key=None):
    """
    Forward prop entire utterance
    Call CTC cost function
    Compute gradient

    data is a 2-D matrix where each column is a single time frame
    Number of input frames changes across iterations

    labels is a vector of symbol ids, length unknown and does not
    depend on the number of time frames

    key, when it is not None and is present in self.hist, selects the
    history list that (probs, delta_output) is appended to.

    Returns (cost, None) when self.train is false, otherwise
    (cost, self.grad, skip) where skip is the third value returned by
    ctc.ctc_loss.
    NOTE(review): the two return paths have different tuple lengths.
    """
    ## forward prop
    T = data.shape[1]  # number of time frames (columns of data)
    sizes = [self.inputDim]+self.layerSizes+[self.outputDim]
    # stackMax indexes the output layer in self.stack; when a temporal
    # layer exists the last stack entry is read as the recurrent weights
    # (self.stack[-1][0] below) and is therefore excluded here
    stackMax = len(self.stack)-1
    if self.temporalLayer > 0:
        stackMax -= 1

    self.hActs = [gp.empty((s,T)) for s in sizes]
    self.hActs[0] = data
    i = 1
    for l in range(stackMax+1):
        w,b = self.stack[l]

        self.hActs[i] = w.dot(self.hActs[i-1]) + b
        # loop over time for recurrent layer
        if (self.temporalLayer-1) == l:
            for t in range(T):
                # frame t additionally receives the recurrent projection
                # of this layer's (already activated) frame t-1
                if t > 0:
                    self.hActs[i][:,t] += self.stack[-1][0].dot(self.hActs[i][:,t-1])
                # nonlinearity
                if i <= stackMax:
                    self.hActs[i][:,t] = self.activation(self.hActs[i][:,t])
        # hidden layer activation function for batch forward prop
        elif i <= stackMax:
            self.hActs[i] = self.activation(self.hActs[i])
        i += 1

    # convert final layer to probs after all time iteration complete
    # (column-wise softmax; the exponentiation is done in numpy)
    probs = self.hActs[-1]-gp.max(self.hActs[-1],axis=0)
    probs = gp.as_numpy_array(probs)
    probs = np.exp(probs)
    probs = probs/np.sum(probs,axis=0)

    ## pass probs and label string to ctc loss
    # TODO how much does passing to different function cost us?
    cost, delta_output, skip = ctc.ctc_loss(probs, labels.squeeze(), blank=0)

    # Store probabilities and error signal for a given key
    if key is not None and key in self.hist:
        self.hist[key].append((probs,delta_output))

    if not self.train:
        return cost,None

    delta_output = gp.garray(delta_output)
    ## back prop through time
    # zero gradients
    self.grad = [[gp.zeros(w.shape),gp.zeros(b.shape)] for w,b in self.stack]
    # delta_t carries the error flowing backwards in time through the
    # recurrent layer; it starts at zero for the last frame
    if self.temporalLayer > 0:
        delta_t = np.zeros(self.layerSizes[self.temporalLayer-1])
    for t in reversed(range(T)):
        # get delta from loss function
        delta = delta_output[:,t].T

        # compute gradient for output layer
        # TODO can we get rid of some of these annoying reshape -1 1?
        self.grad[stackMax][0] += delta.reshape(-1,1).dot(self.hActs[-2][:,t].reshape(-1,1).T)
        self.grad[stackMax][1] += delta.reshape(-1, 1)

        # push delta through output layer
        delta = self.stack[stackMax][0].T.dot(delta)

        # iterate over lower layers
        i = len(self.layerSizes)-1
        while i >= 0:
            # add the temporal delta if this is the recurrent layer
            if (self.temporalLayer-1) == i:
                delta += delta_t
            # push delta through activation function for this layer
            delta = delta * self.activation(self.hActs[i+1][:,t], True)
            # compute the gradient
            self.grad[i][0] += delta.reshape(-1,1).dot(self.hActs[i][:,t].T.reshape(1,-1))
            self.grad[i][1] += delta.reshape(-1,1)

            # add the temporal delta if this is the recurrent layer
            if (self.temporalLayer-1) == i and t > 0:
                self.grad[-1][0] += delta.reshape(-1,1).dot(self.hActs[i+1][:,t-1].T.reshape(1,-1))
                # push delta through temporal connections
                delta_t = self.stack[-1][0].T.dot(delta)

                # HACK no bias for temporal layer. Give it a gradient of 0
                # NOTE(review): the (2,1) shape is hard-coded; confirm it
                # matches the placeholder bias stored in self.stack[-1][1]
                self.grad[-1][1] = np.zeros((2,1))

            # push the delta downward
            w,b = self.stack[i]
            delta = w.T.dot(delta)
            i -= 1
    return cost,self.grad, skip
def max(A, axis):
    """Return the maximum of ``A`` along ``axis`` (delegates to ``gp.max``)."""
    reduced = gp.max(A, axis=axis)
    return reduced
def activation_softmax(x):
    """Softmax activation applied independently to every row of ``x``.

    The row maximum is subtracted before exponentiation and each row of
    exponentials is then divided by its sum.
    """
    row_max = g.max(x,axis=1)[:,g.newaxis]
    exps = g.exp(x - row_max)
    row_sum = g.sum(exps,axis=1)[:,g.newaxis]
    return exps / row_sum
def softmax_old(x):
    """Row-wise softmax computed through a log-sum-exp normaliser.

    For each row the log of the sum of exponentials is formed as
    ``max + log(sum(exp(x - max)))`` and the result is ``exp(x - that)``.
    """
    row_max = gp.max(x, axis=1)[:, gp.newaxis]
    shifted_sums = gp.sum(gp.exp(x - row_max), axis=1)
    log_norm = row_max + gp.log(shifted_sums)[:, gp.newaxis]
    return gp.exp(x - log_norm)
def activation_softmax(x):
    """Return softmax(x) taken along axis 1.

    Rows are shifted so their maximum is zero before ``exp`` is applied,
    then normalised to sum to one.
    """
    shifted = x - g.max(x, axis=1)[:, g.newaxis]
    weights = g.exp(shifted)
    totals = g.sum(weights, axis=1)
    return weights / totals[:, g.newaxis]
def printMaxGrad(net,log):
    """Write the largest absolute weight and bias gradient of each layer to log.

    One line is printed per entry of ``net.weights`` using the values in
    ``net.WGrads`` and ``net.biasGrads``, followed by a separator line.
    Uses the Python 2 ``print >>stream`` form, so ``log`` must be a
    writable file-like object.
    """
    line_format = " Maximum Weight and Bias Gradient in layer %d: %f and %f"
    for layer in range(len(net.weights)):
        weight_peak = num.array(gnp.max(abs(net.WGrads[layer])))
        bias_peak = num.array(gnp.max(abs(net.biasGrads[layer])))
        print >>log, line_format % (layer, weight_peak, bias_peak)
    print >>log, "==========="
def costAndGrad(self,data,labels=None,key=None):
    """
    Forward prop entire utterance
    Call CTC cost function
    Compute gradient

    data is a 2-D matrix where each column is a single time frame
    Number of input frames changes across iterations

    labels is a vector of symbol ids, length unknown and does not
    depend on the number of time frames

    key is accepted but not used by this implementation.

    When self.train is false the result of ctc.decode_best_path is
    returned. Otherwise returns (cost, self.grad, skip); if ctc.ctc_loss
    sets skip, self.grad is returned without being recomputed.
    """
    num_hidden = len(self.layerSizes)

    ## forward prop
    # this is the same as minibatch forward prop
    # since we pre-compute context window features for each time
    self.hActs[0] = data
    for layer, (w, b) in enumerate(self.stack, 1):
        self.hActs[layer] = w.dot(self.hActs[layer-1])+b
        if layer <= num_hidden:
            self.hActs[layer] = self.activation(self.hActs[layer])

    # column-wise softmax; the shift by the max happens on the gnumpy
    # array, the exponentiation and normalisation in numpy
    top = self.hActs[-1]
    probs = gp.as_numpy_array(top-gp.max(top,axis=0))
    probs = np.exp(probs)
    probs = probs/np.sum(probs,axis=0)
    # TODO have to clamp small probabilities?

    ## pass probs and label string to ctc loss
    # TODO how much does passing to different function cost us?
    if not self.train:
        return ctc.decode_best_path(probs, ref=labels, blank=0)

    cost, self.deltas[-1], skip = ctc.ctc_loss(probs, labels, blank=0)

    # Bad utterance ?
    if skip:
        return cost,self.grad,skip

    self.deltas[-1] = gp.garray(self.deltas[-1])

    # back prop
    layer = num_hidden-1
    for w,b in reversed(self.stack[1:]):
        slope = self.activation(self.hActs[layer+1], True)
        self.deltas[layer] = w.T.dot(self.deltas[layer+1])*slope
        layer -= 1

    # compute gradients
    # NOTE we do not divide by utterance length.
    #    Will need to scale up weight norm penalty accordingly
    for idx, pair in enumerate(self.grad):
        layer_delta = self.deltas[idx]
        pair[0] = layer_delta.dot(self.hActs[idx].T)
        pair[1] = gp.sum(layer_delta,axis=1).reshape(-1,1)

    return cost,self.grad,skip
def softmax(A):
    """Return the row-wise softmax of the 2-D array ``A``.

    The maximum of every row is subtracted before exponentiation to keep
    the exponentials bounded. The shift is applied to a new array, so the
    caller's ``A`` is left unmodified.
    """
    # ``A - ...`` instead of ``A -= ...``: the in-place form overwrote the
    # argument with its shifted values as a side effect
    shifted = A - gp.max(A, axis=1)[:, gp.newaxis]
    Z = gp.exp(shifted)
    return Z / gp.sum(Z, axis=1)[:, gp.newaxis]
# One parameter update of the RBM held in ml.rbm for the batch x.
# cd_update returns one step per parameter group (weights, visible bias,
# hidden bias) -- presumably contrastive-divergence steps; confirm against
# ml.rbm.cd_update.
weights_step, bias_vis_step, bias_hid_step = ml.rbm.cd_update(x)

# switch from the initial to the final momentum once the configured epoch
# has been reached
if epoch >= cfg.use_final_momentum_from_epoch:
    momentum = cfg.final_momentum
else:
    momentum = cfg.initial_momentum

# disabled debugging dump (Python 2 print statements); never executed
if False:
    print "weights_step:"
    print weights_step[0:5,0:5]
    print "bias_vis_step:"
    print bias_vis_step[0:5]
    print "bias_hid_step:"
    print bias_hid_step[0:5]
    print "max(weights_step): ", gp.max(weights_step)
    sys.exit(0)

# momentum update: previous update scaled by the momentum plus the new
# step scaled by the step rate; only the weights carry a decay term
# (cfg.weight_cost times the current weights)
weights_update = momentum * weights_m1 + \
    cfg.step_rate * (weights_step - cfg.weight_cost * ml.rbm.weights)
bias_vis_update = momentum * bias_vis_m1 + cfg.step_rate * bias_vis_step
bias_hid_update = momentum * bias_hid_m1 + cfg.step_rate * bias_hid_step

# apply the updates to the model parameters in place
ml.rbm.weights += weights_update
ml.rbm.bias_vis += bias_vis_update
ml.rbm.bias_hid += bias_hid_update

# remember this update as the "previous update" (the *_m1 names) for the
# next momentum step
weights_m1 = weights_update
bias_vis_m1 = bias_vis_update
bias_hid_m1 = bias_hid_update
def costAndGrad(self, data, labels, key=None):
    """
    Forward prop entire utterance
    Call CTC cost function
    Compute gradient

    data is a 2-D matrix where each column is a single time frame
    Number of input frames changes across iterations

    labels is a vector of symbol ids, length unknown and does not
    depend on the number of time frames

    key, when it is not None and is present in self.hist, selects the
    history list that (probs, delta_output) is appended to.

    Returns (cost, None) when self.train is false, otherwise
    (cost, self.grad, skip) where skip is the third value returned by
    ctc.ctc_loss.
    NOTE(review): the two return paths have different tuple lengths.
    """
    ## forward prop
    T = data.shape[1]  # number of time frames (columns of data)
    sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
    # stackMax indexes the output layer in self.stack; with a temporal
    # layer the final stack entry is used as the recurrent weights
    # (self.stack[-1][0] below), so it is skipped by the layer loop
    stackMax = len(self.stack) - 1
    if self.temporalLayer > 0:
        stackMax -= 1

    self.hActs = [gp.empty((s, T)) for s in sizes]
    self.hActs[0] = data
    i = 1
    for l in range(stackMax + 1):
        w, b = self.stack[l]

        self.hActs[i] = w.dot(self.hActs[i - 1]) + b
        # loop over time for recurrent layer
        if (self.temporalLayer - 1) == l:
            for t in range(T):
                # frame t also receives the recurrent projection of this
                # layer's (already activated) frame t-1
                if t > 0:
                    self.hActs[i][:, t] += self.stack[-1][0].dot(
                        self.hActs[i][:, t - 1])
                # nonlinearity
                if i <= stackMax:
                    self.hActs[i][:, t] = self.activation(self.hActs[i][:, t])
        # hidden layer activation function for batch forward prop
        elif i <= stackMax:
            self.hActs[i] = self.activation(self.hActs[i])
        i += 1

    # convert final layer to probs after all time iteration complete
    # (column-wise softmax; the exponentiation is done in numpy)
    probs = self.hActs[-1] - gp.max(self.hActs[-1], axis=0)
    probs = gp.as_numpy_array(probs)
    probs = np.exp(probs)
    probs = probs / np.sum(probs, axis=0)

    ## pass probs and label string to ctc loss
    # TODO how much does passing to different function cost us?
    cost, delta_output, skip = ctc.ctc_loss(probs, labels.squeeze(), blank=0)

    # Store probabilities and error signal for a given key
    if key is not None and key in self.hist:
        self.hist[key].append((probs, delta_output))

    if not self.train:
        return cost, None

    delta_output = gp.garray(delta_output)
    ## back prop through time
    # zero gradients
    self.grad = [[gp.zeros(w.shape), gp.zeros(b.shape)]
                 for w, b in self.stack]
    # delta_t carries the error flowing backwards in time through the
    # recurrent layer; it starts at zero for the last frame
    if self.temporalLayer > 0:
        delta_t = np.zeros(self.layerSizes[self.temporalLayer - 1])
    for t in reversed(range(T)):
        # get delta from loss function
        delta = delta_output[:, t].T

        # compute gradient for output layer
        # TODO can we get rid of some of these annoying reshape -1 1?
        self.grad[stackMax][0] += delta.reshape(-1, 1).dot(
            self.hActs[-2][:, t].reshape(-1, 1).T)
        self.grad[stackMax][1] += delta.reshape(-1, 1)

        # push delta through output layer
        delta = self.stack[stackMax][0].T.dot(delta)

        # iterate over lower layers
        i = len(self.layerSizes) - 1
        while i >= 0:
            # add the temporal delta if this is the recurrent layer
            if (self.temporalLayer - 1) == i:
                delta += delta_t
            # push delta through activation function for this layer
            delta = delta * self.activation(self.hActs[i + 1][:, t], True)
            # compute the gradient
            self.grad[i][0] += delta.reshape(-1, 1).dot(
                self.hActs[i][:, t].T.reshape(1, -1))
            self.grad[i][1] += delta.reshape(-1, 1)

            # add the temporal delta if this is the recurrent layer
            if (self.temporalLayer - 1) == i and t > 0:
                self.grad[-1][0] += delta.reshape(-1, 1).dot(
                    self.hActs[i + 1][:, t - 1].T.reshape(1, -1))
                # push delta through temporal connections
                delta_t = self.stack[-1][0].T.dot(delta)

                # HACK no bias for temporal layer. Give it a gradient of 0
                # NOTE(review): the (2, 1) shape is hard-coded; confirm it
                # matches the placeholder bias stored in self.stack[-1][1]
                self.grad[-1][1] = np.zeros((2, 1))

            # push the delta downward
            w, b = self.stack[i]
            delta = w.T.dot(delta)
            i -= 1
    return cost, self.grad, skip