def train(self, X, Y, opts=None):
    """Train this network on observations X/Y using minibatch SGD.

    X and Y are indexed as 2-D arrays whose rows are observations
    (minibatches are taken as X[a:b, :] / Y[a:b, :]).

    'opts' is a dict of training options; it is passed through
    lnf.check_opts() before use. Keys read here:
      lam_l2, lam_l1, wt_bnd -- if present, copied onto self
      batch_size             -- rows per minibatch
      dev_reps               -- number of input replicas for the DEV loss
      start_rate, decay_rate -- initial learning rate and its per-round
                                multiplicative decay
      momentum               -- mixing weight for the running gradient
      rounds                 -- number of minibatch updates to perform
      do_validate            -- if 1, also report loss/accuracy on a
                                sample of opts['Xv'] / opts['Yv']

    Progress is printed on the first round and every 200th round.
    Nothing is returned; the trained weights are stored on the network
    via self.set_weights().
    """
    # Use a fresh dict per call so that no mutable default is shared
    # between calls (check_opts may fill in / modify what it is given).
    opts = lnf.check_opts({} if opts is None else opts)
    if 'lam_l2' in opts:
        self.lam_l2 = opts['lam_l2']
    if 'lam_l1' in opts:
        self.lam_l1 = opts['lam_l1']
    if 'wt_bnd' in opts:
        self.wt_bnd = opts['wt_bnd']
    # Grab params that control minibatch SGD
    batch_size = opts['batch_size']
    dev_reps = opts['dev_reps']
    rate = opts['start_rate']
    decay = opts['decay_rate']
    momentum = opts['momentum']
    rounds = opts['rounds']
    # Get initial weights, and an initial (all-zero) set of momentum updates
    Ws = self.layer_weights()
    self.set_weights(Ws)
    dLdWs_mom = [gp.zeros(W.shape) for W in Ws]
    # Buffers that lnf.sample_obs() is given as targets when sampling
    # observations for loss checking (at most 2000 rows).
    Xv = gp.zeros((min(X.shape[0], 2000), X.shape[1]))
    Yv = gp.zeros((min(Y.shape[0], 2000), Y.shape[1]))
    b_start = 0
    for i in range(rounds):
        # Grab a minibatch of training examples, wrapping to the start once
        # the next batch would run past the data. The test is '>' rather
        # than '>=' so the final full batch (b_end == row count) is used.
        b_end = b_start + batch_size
        if b_end > X.shape[0]:
            b_start = 0
            b_end = b_start + batch_size
        Xb = X[b_start:b_end, :]
        Yb = Y[b_start:b_end, :]
        b_start = b_end
        if self.do_dev == 1:
            # Make lists of inputs and drop masks for DEV regularization.
            # The first replica gets mask flags (0, 0); all others (1, 1).
            Xb_a = [Xb for _ in range(dev_reps)]
            Mb_a = [self.get_drop_masks(Xb.shape[0], int(j > 0), int(j > 0))
                    for j in range(dev_reps)]
            # Compute loss and gradients subject to DEV regularization
            loss_info = self.dev_loss(Xb_a, Yb, Mb_a, Ws)
        else:
            # Get dropout masks for the minibatch
            Mb = self.get_drop_masks(Xb.shape[0], 1, 1)
            # Compute SDE loss for the minibatch
            loss_info = self.sde_loss(Xb, Yb, Mb, Ws)
        # Adjust momentum updates and apply to Ws. The effective rate ramps
        # linearly from 0 up to 'rate' over the first 1000 rounds.
        gentle_rate = min(1.0, (i / 1000.0)) * rate
        for j in range(self.layer_count):
            dLdWs_mom[j] = (momentum * dLdWs_mom[j]) + \
                ((1.0 - momentum) * loss_info['dLdWs'][j])
            Ws[j] = Ws[j] - (gentle_rate * dLdWs_mom[j])
        # Update learning rate
        rate = rate * decay
        # Bound L2 norm of weights based on self.wt_bnd
        for j in range(self.layer_count):
            Ws[j] = self.layers[j].bound_weights(Ws[j], self.wt_bnd)
        # Give some feedback, to quell impatience and fidgeting
        if (i == 0) or (((i + 1) % 200) == 0):
            self.set_weights(Ws)
            lnf.sample_obs(X, Y, Xv, Yv)
            CL_tr = self.check_loss(Xv, Yv)
            print('Round {0:6d}:'.format(i + 1))
            print(' Lo: {0:.4f}, Ld: {1:.4f}, Lr: {2:.4f}'.format(
                loss_info['L'][0], loss_info['L'][1], loss_info['L'][2]))
            if opts['do_validate'] == 1:
                # Compute accuracy on validation set
                lnf.sample_obs(opts['Xv'], opts['Yv'], Xv, Yv)
                CL_te = self.check_loss(Xv, Yv)
                print(' Atr: {0:.4f}, Ltr: {1:.4f}, Ate: {2:.4f}, Lte: {3:.4f}'.
                      format(CL_tr['acc'], CL_tr['loss'],
                             CL_te['acc'], CL_te['loss']))
            else:
                print(' Atr: {0:.4f}, Ltr: {1:.4f}'.
                      format(CL_tr['acc'], CL_tr['loss']))
            stdout.flush()
    # The loop only pushes Ws into the network on feedback rounds, so store
    # the final weights here; otherwise every update made after the last
    # feedback round would be dropped when 'rounds' is not a multiple of 200.
    self.set_weights(Ws)
if __name__ == '__main__':
    # Smoke test / timing run: train a small network on random data and
    # report the average wall-clock time per SGD update.
    #
    # timeit.default_timer is used instead of time.clock: time.clock reports
    # CPU time on Unix under Python 2 and no longer exists in Python 3.8+.
    from timeit import default_timer as clock
    obs_dim = 784
    out_dim = 10
    obs_count = 10000
    hidden_size = 250
    layer_sizes = [obs_dim, hidden_size, hidden_size, out_dim]
    # Generate dummy training data
    X = gp.randn((obs_count, obs_dim))
    Y = gp.randn((obs_count, out_dim))
    # Get some training options
    opts = lnf.check_opts()
    opts['rounds'] = 201
    opts['batch_size'] = 100
    opts['dev_reps'] = 2
    # Train a network (on BS data)
    LN = LNNet(layer_sizes, lnf.kspr_trans, lnf.loss_lsq)
    LN.do_dev = 1
    LN.dev_lams = [1.0] * LN.layer_count
    # Time training
    t1 = clock()
    LN.train(X, Y, opts)
    t2 = clock()
    print("TIME PER UPDATE: " + str(float(t2 - t1) / float(opts['rounds'])))