    def backprop(self, dLdA, A, X, M, Ws=[]):
        """Run backprop for the activation gradients in dLdA.

        The lengths (i.e. len()) of the lists of arrays dLdA, A, and M
        should all be self.layer_count. The shapes of dLdA[i] and A[i]
        should be the same for all i. The shape of M[i] should match the
        shape of A[i-1] for i from 1 to (self.layer_count - 1). The shape
        of M[0] should match the shape of X. Weight array list Ws defaults
        to self.layer_weights().
        """
        if (len(Ws) == 0):
            Ws = self.layer_weights()
        dLdWs = []
        dLdX = []
        for i in range((self.layer_count-1),-1,-1):
            if (i == 0):
                # First layer receives X as input
                Xi = M[i] * lnf.bias(X, self.bias_val)
            else:
                # Other layers receive previous layer's activations as input
                Xi = M[i] * lnf.bias(A[i-1], self.bias_val)
            # BP current grads onto current layer's weights and inputs
            Bi = self.layers[i].backprop(dLdA[i], A[i], Xi, Ws[i])
            # Rescale BP-ed input grads to account for dropout mask
            Bi['dLdX'] = M[i] * Bi['dLdX']
            if (i == 0):
                # BP-ed input grads at first layer are grads on X
                dLdX = lnf.unbias(Bi['dLdX'])
            else:
                # BP-ed input grads at other layers should be added to
                # whatever grads were already there (e.g. DEV gradients)
                dLdA[i-1] = dLdA[i-1] + lnf.unbias(Bi['dLdX'])
            # Record the BP-ed gradients on current layer's inbound weights
            dLdWs.append(Bi['dLdW'])
        dLdWs.reverse()
        return {'dLdWs': dLdWs, 'dLdX': dLdX}
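    # A minimal usage sketch for backprop(), under the assumption that
    # 'net' is a constructed LNNet and Xb/Yb are gnumpy minibatch arrays
    # (all names here are hypothetical):
    #
    #   M = net.get_drop_masks(Xb.shape[0], 1, 1)  # droppy masks
    #   A = net.feedforward(Xb, M)                 # per-layer activations
    #   O = net.out_loss(A[-1], Yb)                # output loss/gradients
    #   dLdA = [gp.zeros(Ai.shape) for Ai in A]
    #   dLdA[-1] = O['dL']                         # seed grads at output
    #   B = net.backprop(dLdA, A, Xb, M)
    #   # B['dLdWs'][i]: grads on layer i's weights; B['dLdX']: grads on Xb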
    def check_loss(self, X, Y, Ws=[]):
        """Check loss at output layer for observations X/Y."""
        if (len(Ws) == 0):
            Ws = self.layer_weights()
        obs_count = X.shape[0]
        M = self.get_drop_masks(obs_count, 0, 0)
        A = self.feedforward(X, M, Ws)
        O = self.out_loss(A[-1], Y)
        Yc = lnf.class_cats(Y)
        Yhc = lnf.class_cats(A[-1])
        hits = sum([(int(a) == int(b)) for (a, b) in zip(Yhc, Yc)])
        acc = float(hits) / float(obs_count)
        return {'loss': O['L'], 'acc': acc, 'grad': O['dL']}
    def check_acc(self, X, Y, Ws=[]):
        """Check classification accuracy using inputs X with classes Y.

        Classes should be represented in Y as a +/- 1 indicator matrix.
        """
        if (len(Ws) == 0):
            Ws = self.layer_weights()
        obs_count = X.shape[0]
        Yc = lnf.class_cats(Y)
        M = self.get_drop_masks(obs_count, 0, 0)
        A = self.feedforward(X, M, Ws)
        Yh = A[-1]
        Yhc = lnf.class_cats(Yh)
        hits = sum([(int(a) == int(b)) for (a, b) in zip(Yhc, Yc)])
        return (float(hits) / float(obs_count))
    def feedforward(self, X, M=[], Ws=[]):
        """Feedforward for inputs X with drop masks M and layer weights Ws."""
        if (len(M) == 0):
            # If no masks are given, use drop-free feedforward
            M = self.get_drop_masks(X.shape[0], 0, 0)
        if (len(Ws) == 0):
            # Default to this network's current per-layer weights
            Ws = self.layer_weights()
        A = []
        for i in range(self.layer_count):
            if (i == 0):
                # First layer receives X as input
                Xi = M[i] * lnf.bias(X, self.bias_val)
            else:
                # Other layers receive previous layer's activations as input
                Xi = M[i] * lnf.bias(A[i-1], self.bias_val)
            # Perform feedforward through the i'th network layer
            Ai = self.layers[i].feedforward(Xi, Ws[i])
            A.append(Ai['post'])
        return A
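    # A hedged sketch of drop-free evaluation via feedforward(), assuming
    # 'net' is a trained LNNet and Xte is a gnumpy array of test inputs
    # (hypothetical names):
    #
    #   M = net.get_drop_masks(Xte.shape[0], 0, 0)  # drop-free masks
    #   A = net.feedforward(Xte, M)
    #   Yh = A[-1]                                  # output-layer activations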
    def dev_loss(self, X, Y, M, Ws=[]):
        """Compute DEV-regularized loss for inputs X with target outputs Y.

        This loss function computes a combination of standard output loss
        (e.g. for classification/regression) and Dropout Ensemble Variance
        regularization loss. X should be a list of 'dev_reps' input arrays,
        where 'dev_reps' is the number of times each input will be pushed
        through a droppy network when computing the DEV regularizer. M
        should be a list of lists of per-layer dropout masks, matched to
        the sizes of the input arrays in X. Y should contain the target
        outputs for X[0], whose inputs will be pushed through a drop-free
        network.
        """
        if (len(Ws) == 0):
            Ws = self.layer_weights()
        dev_reps = len(X)
        # Compute activations for observations in X
        A = [self.feedforward(X[i], M[i], Ws) for i in range(dev_reps)]
        # Compute loss and gradient for output-layer activations, for the
        # (should be) drop-free feedforward of X[0]
        O = self.out_loss(A[0][-1], Y)
        # Make a list of activation gradients for each DEV rep
        dLdA = [[gp.zeros(Aj.shape) for Aj in A[0]] \
                for i in range(dev_reps)]
        dLdA[0][-1] = O['dL']
        # Compute DEV regularizer loss and gradients
        Ld = 0.0
        for i in range(self.layer_count):
            dev_type = self.dev_types[i]
            dev_lam = self.dev_lams[i]
            if (dev_lam > 0.0000001):
                Ai = [A[j][i] for j in range(dev_reps)]
                Di = lnf.dev_loss(Ai, dev_type, 0)
                Ld = Ld + (dev_lam * Di['L'])
                for j in range(dev_reps):
                    dLdA[j][i] = dLdA[j][i] + (dev_lam * Di['dLdA'][j])
        # Backpropagate gradients for each DEV rep
        B = {'dLdWs': [gp.zeros(W.shape) for W in Ws]}
        for i in range(dev_reps):
            Bi = self.backprop(dLdA[i], A[i], X[i], M[i], Ws)
            for j in range(self.layer_count):
                B['dLdWs'][j] = B['dLdWs'][j] + Bi['dLdWs'][j]
        # Compute parameter regularization loss and gradients
        R = self.reg_loss(Ws)
        # Combine output loss, DEV loss, and regularization loss
        L = [O['L'], Ld, R['L']]
        # Combine backprop-ed loss gradients and regularization gradients
        dLdWs = [(dWb + dWr) for (dWb, dWr) in zip(B['dLdWs'], R['dLdWs'])]
        return {'L': L, 'dLdWs': dLdWs}
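    # A sketch of a dev_loss() call that mirrors the list structure the
    # docstring describes and that train() constructs below (Xb/Yb are a
    # hypothetical minibatch; rep 0 gets drop-free masks, later reps get
    # droppy ones):
    #
    #   dev_reps = 2
    #   X_list = [Xb for j in range(dev_reps)]
    #   M_list = [net.get_drop_masks(Xb.shape[0], int(j > 0), int(j > 0))
    #             for j in range(dev_reps)]
    #   loss_info = net.dev_loss(X_list, Yb, M_list)
    #   # loss_info['L'] = [output loss, DEV loss, regularization loss]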
    def train(self, X, Y, opts={}):
        """Train this network using observations X/Y and options 'opts'.

        Training uses minibatch SGD with momentum, a warm-up ramp on the
        learning rate, and (optionally) DEV regularization.
        """
        # Fill out opts with defaults, and adjust self if needed
        opts = lnf.check_opts(opts)
        if opts.has_key('lam_l2'):
            self.lam_l2 = opts['lam_l2']
        if opts.has_key('lam_l1'):
            self.lam_l1 = opts['lam_l1']
        if opts.has_key('wt_bnd'):
            self.wt_bnd = opts['wt_bnd']
        # Grab params that control minibatch SGD
        batch_size = opts['batch_size']
        dev_reps = opts['dev_reps']
        rate = opts['start_rate']
        decay = opts['decay_rate']
        momentum = opts['momentum']
        rounds = opts['rounds']
        # Get initial weights, and an initial set of momentus updates
        Ws = self.layer_weights()
        self.set_weights(Ws)
        dLdWs_mom = [gp.zeros(W.shape) for W in Ws]
        # Get arrays for holding training batches and batches for loss
        # checking on the training set
        Xb = gp.zeros((batch_size, X.shape[1]))
        Yb = gp.zeros((batch_size, Y.shape[1]))
        Xv = gp.zeros((min(X.shape[0], 2000), X.shape[1]))
        Yv = gp.zeros((min(Y.shape[0], 2000), Y.shape[1]))
        # Loop-da-loop
        b_start = 0
        for i in range(rounds):
            # Grab a minibatch of training examples
            b_end = b_start + batch_size
            if (b_end >= X.shape[0]):
                b_start = 0
                b_end = b_start + batch_size
            Xb = X[b_start:b_end,:]
            Yb = Y[b_start:b_end,:]
            b_start = b_end
            if (self.do_dev == 1):
                # Make lists of inputs and drop masks for DEV regularization
                Xb_a = [Xb for j in range(dev_reps)]
                Mb_a = [self.get_drop_masks(Xb.shape[0],int(j>0),int(j>0)) \
                        for j in range(dev_reps)]
                # Compute loss and gradients subject to DEV regularization
                loss_info = self.dev_loss(Xb_a, Yb, Mb_a, Ws)
            else:
                # Get dropout masks for the minibatch
                Mb = self.get_drop_masks(Xb.shape[0], 1, 1)
                # Compute SDE loss for the minibatch
                loss_info = self.sde_loss(Xb, Yb, Mb, Ws)
            # Adjust momentus updates and apply to Ws
            gentle_rate = min(1.0, (i / 1000.0)) * rate
            for j in range(self.layer_count):
                dLdWs_mom[j] = (momentum * dLdWs_mom[j]) + \
                        ((1.0 - momentum) * loss_info['dLdWs'][j])
                Ws[j] = Ws[j] - (gentle_rate * dLdWs_mom[j])
            # Update learning rate
            rate = rate * decay
            # Bound L2 norm of weights based on self.wt_bnd
            for j in range(self.layer_count):
                Ws[j] = self.layers[j].bound_weights(Ws[j], self.wt_bnd)
            # Give some feedback, to quell impatience and fidgeting
            if ((i == 0) or (((i + 1) % 200) == 0)):
                self.set_weights(Ws)
                lnf.sample_obs(X, Y, Xv, Yv)
                CL_tr = self.check_loss(Xv, Yv)
                print 'Round {0:6d}:'.format((i + 1))
                print ' Lo: {0:.4f}, Ld: {1:.4f}, Lr: {2:.4f}'.format(\
                        loss_info['L'][0], loss_info['L'][1], loss_info['L'][2])
                if (opts['do_validate'] == 1):
                    # Compute accuracy on validation set
                    lnf.sample_obs(opts['Xv'], opts['Yv'], Xv, Yv)
                    CL_te = self.check_loss(Xv, Yv)
                    print ' Atr: {0:.4f}, Ltr: {1:.4f}, Ate: {2:.4f}, Lte: {3:.4f}'.\
                            format(CL_tr['acc'], CL_tr['loss'], CL_te['acc'], CL_te['loss'])
                else:
                    print ' Atr: {0:.4f}, Ltr: {1:.4f}'.\
                            format(CL_tr['acc'], CL_tr['loss'])
                stdout.flush()
if __name__ == '__main__':
    from time import clock as clock
    obs_dim = 784
    out_dim = 10
    obs_count = 10000
    hidden_size = 250
    layer_sizes = [obs_dim, hidden_size, hidden_size, out_dim]
    # Generate dummy training data
    X = gp.randn((obs_count, obs_dim))
    Y = gp.randn((obs_count, out_dim))
    # Get some training options
    opts = lnf.check_opts()
    opts['rounds'] = 201
    opts['batch_size'] = 100
    opts['dev_reps'] = 2
    # Train a network (on the dummy data)
    LN = LNNet(layer_sizes, lnf.kspr_trans, lnf.loss_lsq)
    LN.do_dev = 1
    LN.dev_lams = [1.0 for i in range(LN.layer_count)]
    # Time training
    t1 = clock()
    LN.train(X, Y, opts)
    t2 = clock()
    print "TIME PER UPDATE: " + str(float(t2 - t1) / float(opts['rounds']))
import numpy as np
import gnumpy as gp
import LNFuncs as lnf
import LNLayer as lnl
import LNNet as lnn
from time import clock as clock

if __name__ == '__main__':
    # Load USPS data, then center and rescale each observation
    X = np.load('usps_X_numpy.npy')
    Y = np.load('usps_Y_numpy.npy')
    X = X - np.reshape(np.mean(X, axis=1), (X.shape[0], 1))
    X = X / np.reshape(np.max(np.abs(X), axis=1), (X.shape[0], 1))
    # Split into random training/test portions (note: lnf.trte_split()
    # returns gp.garrays, i.e. arrays on the GPU)
    [Xtr, Ytr, Xte, Yte] = lnf.trte_split(X, Y, 0.8)
    # Configure network layer sizes
    obs_dim = X.shape[1]
    out_dim = Y.shape[1]
    hidden_size = 200
    layer_sizes = [obs_dim, hidden_size, hidden_size, out_dim]
    # Set some training options
    opts = lnf.check_opts()
    opts['rounds'] = 15000
    opts['batch_size'] = 100
    opts['start_rate'] = 0.1
    opts['momentum'] = 0.9
    opts['decay_rate'] = 0.1**(1.0 / opts['rounds'])
    opts['dev_reps'] = 2
    opts['do_validate'] = 1
    opts['Xv'] = Xte