# Example #1
    def train(self, X, Y, opts={}):
        """Train this network using observations X/Y and options 'opts'.

        This does SGD.
        """
        # Fill out opts with defaults, and adjust self if needed
        opts = lnf.check_opts(opts)
        if opts.has_key('lam_l2'):
            self.lam_l2 = opts['lam_l2']
        if opts.has_key('lam_l1'):
            self.lam_l1 = opts['lam_l1']
        if opts.has_key('wt_bnd'):
            self.wt_bnd = opts['wt_bnd']
        # Grab params that control minibatch SGD
        batch_size = opts['batch_size']
        dev_reps = opts['dev_reps']
        rate = opts['start_rate']
        decay = opts['decay_rate']
        momentum = opts['momentum']
        rounds = opts['rounds']
        # Get initial weights, and an initial set of momentus updates
        Ws = self.layer_weights()
        self.set_weights(Ws)
        dLdWs_mom = [gp.zeros(W.shape) for W in Ws]
        # Get arrays for holding training batches and batches for loss
        # checking on the training set.
        Xb = gp.zeros((batch_size, X.shape[1]))
        Yb = gp.zeros((batch_size, Y.shape[1]))
        Xv = gp.zeros((min(X.shape[0],2000), X.shape[1]))
        Yv = gp.zeros((min(Y.shape[0],2000), Y.shape[1]))
        # Loop-da-loop
        b_start = 0
        for i in range(rounds):
            # Grab a minibatch of training examples
            b_end = b_start + batch_size
            if (b_end >= X.shape[0]):
                b_start = 0
                b_end = b_start + batch_size
            Xb = X[b_start:b_end,:]
            Yb = Y[b_start:b_end,:]
            b_start = b_end
            if (self.do_dev == 1):
                # Make lists of inputs and drop masks for DEV regularization
                Xb_a = [Xb for j in range(dev_reps)]
                Mb_a = [self.get_drop_masks(Xb.shape[0],int(j>0),int(j>0)) \
                        for j in range(dev_reps)]
                # Compute loss and gradients subject to DEV regularization
                loss_info = self.dev_loss(Xb_a, Yb, Mb_a, Ws)
            else:
                # Get dropout masks for the minibatch
                Mb = self.get_drop_masks(Xb.shape[0], 1, 1)
                # Compute SDE loss for the minibatch
                loss_info = self.sde_loss(Xb, Yb, Mb, Ws)
            # Adjust momentus updates and apply to Ws
            gentle_rate = min(1.0, (i / 1000.0)) * rate
            for j in range(self.layer_count):
                dLdWs_mom[j] = (momentum * dLdWs_mom[j]) + \
                        ((1.0 - momentum) * loss_info['dLdWs'][j])
                Ws[j] = Ws[j] - (gentle_rate * dLdWs_mom[j])
            # Update learning rate
            rate = rate * decay
            # Bound L2 norm of weights based on self.wt_bnd
            for j in range(self.layer_count):
                Ws[j] = self.layers[j].bound_weights(Ws[j], self.wt_bnd)
            # Give some feedback, to quell impatience and fidgeting
            if ((i == 0) or (((i + 1) % 200) == 0)):
                self.set_weights(Ws)
                lnf.sample_obs(X, Y, Xv, Yv)
                CL_tr = self.check_loss(Xv, Yv)
                print 'Round {0:6d}:'.format((i + 1))
                print ' Lo: {0:.4f}, Ld: {1:.4f}, Lr: {2:.4f}'.format(\
                        loss_info['L'][0],loss_info['L'][1],loss_info['L'][2])
                if (opts['do_validate'] == 1):
                    # Compute accuracy on validation set
                    lnf.sample_obs(opts['Xv'], opts['Yv'], Xv, Yv)
                    CL_te = self.check_loss(Xv, Yv)
                    print '    Atr: {0:.4f}, Ltr: {1:.4f}, Ate: {2:.4f}, Lte: {3:.4f}'.\
                            format(CL_tr['acc'], CL_tr['loss'], CL_te['acc'], CL_te['loss'])
                else:
                    print '    Atr: {0:.4f}, Ltr: {1:.4f}'.\
                            format(CL_tr['acc'], CL_tr['loss'])
                #print "  Matrix data types: "
                #print "    dLdWs_mom[0]: " + str(dLdWs_mom[0].dtype)
                #print "    Ws[0]: " + str(Ws[0].dtype)
                stdout.flush()
# Example #2



if __name__ == '__main__':
    from time import clock as clock
    obs_dim = 784
    out_dim = 10
    obs_count = 10000
    hidden_size = 250
    layer_sizes = [obs_dim, hidden_size, hidden_size, out_dim]
    # Generate dummy training data
    X = gp.randn((obs_count, obs_dim))
    Y = gp.randn((obs_count, out_dim))
    # Get some training options
    opts = lnf.check_opts()
    opts['rounds'] = 201
    opts['batch_size'] = 100
    opts['dev_reps'] = 2
    # Train a network (on BS data)
    LN = LNNet(layer_sizes, lnf.kspr_trans, lnf.loss_lsq)
    LN.do_dev = 1
    LN.dev_lams = [1.0 for i in range(LN.layer_count)]
    # Time training
    t1 = clock()
    LN.train(X,Y,opts)
    t2 = clock()
    print "TIME PER UPDATE: " + str(float(t2 - t1) / float(opts['rounds']))