Example #1
def cifar_fast_net(batch_size=128,n_epochs=300,test_frequency=13, learning_rate=0.001):

    rng1 = numpy.random.RandomState(23455)
    rng2 = numpy.random.RandomState(12423)
    rng3 = numpy.random.RandomState(23245)
    rng4 = numpy.random.RandomState(12123)
    rng5 = numpy.random.RandomState(25365)
    rng6 = numpy.random.RandomState(15323)
    train_set_x, train_set_y = load_cifar_data(['data_batch_1','data_batch_2','data_batch_3','data_batch_4'])
    valid_set_x, valid_set_y = load_cifar_data(['data_batch_5'],WHICHSET='valid')
    test_set_x, test_set_y = load_cifar_data(['test_batch'],WHICHSET='test')

    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    index = T.lscalar()

    x = T.matrix('x')
    y = T.ivector('y')

    img_input = x.reshape((batch_size,3,32,32))  # bc01 layout
    img_input = img_input.dimshuffle(1,2,3,0)    # c01b layout expected by the conv layers
####define the layers:
    conv_pool1 = LeNetConvPoolLayer(rng=rng1,input=img_input,
                                    filter_shape=(3,5,5,32),
                                    image_shape=(3,32,32,batch_size),
                                    activation='vshape',
                                    poolsize=(3,3),poolstride=2,pad=2,
                                    convstride=1,initW=0.0001,initB=0,partial_sum=4,
                                    pooling='max',
                                    epsW=0.001,
                                    epsB=0.002,
                                    momW=0.9,
                                    momB=0.9,
                                    wc=0.004
                                    )

    conv_pool2 = LeNetConvPoolLayer(rng=rng2,input=conv_pool1.output,
                                    filter_shape=(32,5,5,32),
                                    image_shape=(32,16,16,batch_size),
                                    activation='vshape',
                                    poolsize=(3,3),poolstride=2,pad=2,
                                    convstride=1,initW=0.01,initB=0,partial_sum=4,
                                    pooling='average',
                                    epsW=0.001,
                                    epsB=0.002,
                                    momW=0.9,
                                    momB=0.9,
                                    wc=0.004)
    conv_pool3 = LeNetConvPoolLayer(rng=rng3,input=conv_pool2.output,
                                    filter_shape=(32,5,5,64),
                                    image_shape=(32,8,8,batch_size),
                                    activation='vshape',
                                    poolsize=(3,3),poolstride=2,pad=2,
                                    convstride=1,initW=0.01,initB=0,partial_sum=4,
                                    pooling='average',
                                    epsW=0.001,
                                    epsB=0.002,
                                    momW=0.9,
                                    momB=0.9,
                                    wc=0.004)

    layer4_input = conv_pool3.output.dimshuffle(3,0,1,2).flatten(2)
    #fc_64 = HiddenLayer(rng=rng4,input=layer4_input,n_in=64*4*4,n_out=64,initW=0.1,initB=0)
    fc_64 = HiddenLayer(rng=rng4,input=layer4_input,n_in=64*4*4,n_out=64,initW=0.1,initB=0,
                        epsW=0.001,
                        epsB=0.002,
                        momW=0.9,
                        momB=0.9,
                        wc=0.03)
    fc_10 = LogisticRegression(input=fc_64.output,rng=rng5,n_in=64,n_out=10,initW=0.1,
                               epsW=0.001,
                                epsB=0.002,
                                momW=0.9,
                                momB=0.9,
                                wc=0.03)
####build the models:
    cost = fc_10.negative_log_likelihood(y)
    test_model = theano.function([index], fc_10.errors(y),
             givens={
                x: test_set_x[index * batch_size: (index + 1) * batch_size],
                y: test_set_y[index * batch_size: (index + 1) * batch_size]})

    validate_model = theano.function([index], fc_10.errors(y),
            givens={
                x: valid_set_x[index * batch_size: (index + 1) * batch_size],
                y: valid_set_y[index * batch_size: (index + 1) * batch_size]})

    Ws = [conv_pool1.W, conv_pool2.W, conv_pool3.W, fc_64.W, fc_10.W]
    pgradWs = [conv_pool1.grad_W, conv_pool2.grad_W, conv_pool3.grad_W, fc_64.grad_W, fc_10.grad_W]

    bs = [conv_pool1.b, conv_pool2.b, conv_pool3.b, fc_64.b, fc_10.b]
    pgradbs = [conv_pool1.grad_b, conv_pool2.grad_b, conv_pool3.grad_b, fc_64.grad_b, fc_10.grad_b]

    momWs = [conv_pool1.momW, conv_pool2.momW, conv_pool3.momW, fc_64.momW, fc_10.momW]
    momBs = [conv_pool1.momB, conv_pool2.momB, conv_pool3.momB, fc_64.momB, fc_10.momB]
    wcs = [conv_pool1.wc, conv_pool2.wc, conv_pool3.wc, fc_64.wc, fc_10.wc]
    epsWs = [conv_pool1.epsW, conv_pool2.epsW, conv_pool3.epsW, fc_64.epsW, fc_10.epsW]
    epsBs = [conv_pool1.epsB, conv_pool2.epsB, conv_pool3.epsB, fc_64.epsB, fc_10.epsB]

    gradWs = T.grad(cost, Ws)
    gradbs = T.grad(cost, bs)
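    # The two loops below build cuda-convnet style momentum SGD updates with
    # weight decay applied to the weights only:
    #   v_W <- momW*v_W - epsW*dCost/dW - epsW*wc*W ;  W <- W + v_W
    #   v_b <- momB*v_b - epsB*dCost/db ;              b <- b + v_b
    # where the previous step v is kept in each layer's grad_W / grad_b variable.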
    updates = []
    for W_i, gradW_i, momW_i, wc_i, epsW_i, pgW_i in zip(Ws,gradWs,momWs,wcs, epsWs,pgradWs):
        grad_i = - epsW_i*gradW_i - wc_i*epsW_i*W_i + momW_i*pgW_i
        updates.append((W_i, W_i+grad_i))
        updates.append((pgW_i,grad_i))

    for b_i, gradb_i, momB_i, epsB_i, pgB_i in zip(bs,gradbs,momBs, epsBs,pgradbs):
        grad_i = - epsB_i*gradb_i + momB_i*pgB_i
        updates.append((b_i, b_i+grad_i))
        updates.append((pgB_i,grad_i))

    train_model = theano.function([index], cost, updates=updates,
          givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #below is the code for reducing the learning rate
        ###########################################

        if epoch == 50:
            epsWs = [k/10.0 for k in epsWs]
            epsBs = [k/10.0 for k in epsBs]
            print 'reduce eps by a factor of 10'
            updates = []
            for W_i, gradW_i, momW_i, wc_i, epsW_i, pgW_i in zip(Ws,gradWs,momWs,wcs, epsWs,pgradWs):
                grad_i = - epsW_i*gradW_i - wc_i*epsW_i*W_i + momW_i*pgW_i
                updates.append((W_i, W_i+grad_i))
                updates.append((pgW_i,grad_i))

            for b_i, gradb_i, momB_i, epsB_i, pgB_i in zip(bs,gradbs,momBs, epsBs,pgradbs):
                grad_i = - epsB_i*gradb_i + momB_i*pgB_i
                updates.append((b_i, b_i+grad_i))
                updates.append((pgB_i,grad_i))
            train_model = theano.function([index], cost, updates=updates,
              givens={
                x: train_set_x[index * batch_size: (index + 1) * batch_size],
                y: train_set_y[index * batch_size: (index + 1) * batch_size]})

        ##############################################
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index + 1, n_train_batches, \
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    conv_pool1.bestW = conv_pool1.W.get_value().copy()
                    conv_pool1.bestB = conv_pool1.b.get_value().copy()
                    conv_pool2.bestW = conv_pool2.W.get_value().copy()
                    conv_pool2.bestB = conv_pool2.b.get_value().copy()
                    conv_pool3.bestW = conv_pool3.W.get_value().copy()
                    conv_pool3.bestB = conv_pool3.b.get_value().copy()
                    fc_64.bestW = fc_64.W.get_value().copy()
                    fc_64.bestB = fc_64.b.get_value().copy()
                    fc_10.bestW = fc_10.W.get_value().copy()
                    fc_10.bestB = fc_10.b.get_value().copy()

                    ##saving current best
                    print 'saving current best params..'
                    current_params = (conv_pool1.bestW,conv_pool1.bestB,conv_pool2.bestW,
                    conv_pool2.bestB,conv_pool3.bestW,conv_pool3.bestB,fc_64.bestW,fc_64.bestB,
                    fc_10.bestW,fc_10.bestB,momWs,momBs,epsWs,epsBs,wcs)
                    outfile = open('current_best_params.pkl','wb')
                    cPickle.dump(current_params,outfile)
                    outfile.close()


                    # test it on the test set
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
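# train_rep: trains a small LeNet-style network on data stored as numbered HDF5
# chunks (rep_train_data_<n>.gzip.h5); each chunk is loaded in turn and swapped
# into the shared training variables before its minibatches are iterated.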
def train_rep(
    learning_rate=0.002,
    L1_reg=0.0002,
    L2_reg=0.005,
    n_epochs=200,
    nkerns=[20, 50],
    batch_size=25,
):

    rng = numpy.random.RandomState(23455)

    train_dir = "../out/h5/"
    valid_dir = "../out/h5/"

    weights_dir = "./weights/"

    print("... load input data")
    filename = train_dir + "rep_train_data_1.gzip.h5"
    datasets = load_initial_data(filename)
    train_set_x, train_set_y, shared_train_set_y = datasets

    filename = valid_dir + "rep_valid_data_1.gzip.h5"
    datasets = load_initial_data(filename)
    valid_set_x, valid_set_y, shared_valid_set_y = datasets

    mydatasets = load_initial_test_data()
    test_set_x, test_set_y, shared_test_set_y, valid_ds = mydatasets

    # compute number of minibatches for training, validation and testing
    n_all_train_batches = 30000
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_all_train_batches //= batch_size
    n_train_batches //= batch_size
    n_valid_batches //= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix("x")  # the data is presented as rasterized images
    y = T.ivector("y")  # the labels are presented as 1D vector of
    # [int] labels

    # image size
    layer0_w = 50
    layer0_h = 50
    layer1_w = (layer0_w - 4) // 2
    layer1_h = (layer0_h - 4) // 2
    layer2_w = (layer1_w - 2) // 2
    layer2_h = (layer1_h - 2) // 2
    layer3_w = (layer2_w - 2) // 2
    layer3_h = (layer2_h - 2) // 2
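    # With the integer divisions above: layer1 is (50-4)//2 = 23,
    # layer2 is (23-2)//2 = 10 and layer3 is (10-2)//2 = 4 pixels per side,
    # so the flattened input to the hidden layer below has 90 * 4 * 4 features.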

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print("... building the model")

    # image sizes
    batchsize = batch_size
    in_channels = 20
    in_width = 50
    in_height = 50
    # filter sizes
    flt_channels = 40
    flt_time = 20
    flt_width = 5
    flt_height = 5

    signals_shape = (batchsize, in_channels, in_height, in_width)
    filters_shape = (flt_channels, in_channels, flt_height, flt_width)

    layer0_input = x.reshape(signals_shape)

    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=signals_shape,
        filter_shape=filters_shape,
        poolsize=(2, 2),
    )

    # TODO: incase of flt_time < in_time the output dimension will be different
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, flt_channels, layer1_w, layer1_h),
        filter_shape=(60, flt_channels, 3, 3),
        poolsize=(2, 2),
    )

    layer2 = LeNetConvPoolLayer(
        rng,
        input=layer1.output,
        image_shape=(batch_size, 60, layer2_w, layer2_h),
        filter_shape=(90, 60, 3, 3),
        poolsize=(2, 2),
    )
    layer3_input = layer2.output.flatten(2)

    layer3 = HiddenLayer(
        rng,
        input=layer3_input,
        n_in=90 * layer3_w * layer3_h,
        n_out=500,
        activation=T.tanh,
    )

    layer4 = LogisticRegression(input=layer3.output, n_in=500, n_out=8)

    classify = theano.function(
        [index],
        outputs=layer4.get_output_labels(y),
        givens={
            x: test_set_x[index * batch_size : (index + 1) * batch_size],
            y: test_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )

    validate_model = theano.function(
        [index],
        layer4.errors(y),
        givens={
            x: valid_set_x[index * batch_size : (index + 1) * batch_size],
            y: valid_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )

    # create a list of all model parameters to be fit by gradient descent
    params = (
        layer4.params + layer3.params + layer2.params + layer1.params + layer0.params
    )

    # symbolic Theano variable that represents the L1 regularization term
    L1 = (
        T.sum(abs(layer4.params[0]))
        + T.sum(abs(layer3.params[0]))
        + T.sum(abs(layer2.params[0]))
        + T.sum(abs(layer1.params[0]))
        + T.sum(abs(layer0.params[0]))
    )
    # symbolic Theano variable that represents the squared L2 term
    L2_sqr = (
        T.sum(layer4.params[0] ** 2)
        + T.sum(layer3.params[0] ** 2)
        + T.sum(layer2.params[0] ** 2)
        + T.sum(layer1.params[0] ** 2)
        + T.sum(layer0.params[0] ** 2)
    )
    # the loss
    cost = layer4.negative_log_likelihood(y) + L1_reg * L1 + L2_reg * L2_sqr
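    # The training objective is the negative log-likelihood plus an L1 penalty
    # and a squared-L2 penalty, both computed over the weight matrices only
    # (params[0] of each layer); the biases are left unregularized.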

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i in zip(params, grads):
        updates.append((param_i, param_i - learning_rate * grad_i))

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size : (index + 1) * batch_size],
            y: train_set_y[index * batch_size : (index + 1) * batch_size],
        },
    )

    ###############
    # TRAIN MODEL #
    ###############
    print("... training")

    start_time = time.clock()

    epoch = 0
    done_looping = False
    cost_ij = 0
    train_files_num = 600
    val_files_num = 100

    startc = time.clock()
    while (epoch < n_epochs) and (not done_looping):
        endc = time.clock()
        print(("epoch %i, took %.2f minutes" % (epoch, (endc - startc) / 60.0)))
        startc = time.clock()
        epoch = epoch + 1
        for nTrainSet in range(1, train_files_num + 1):
            # load next train data
            if nTrainSet % 50 == 0:
                print("training @ nTrainSet =  ", nTrainSet, ", cost = ", cost_ij)
            filename = train_dir + "rep_train_data_" + str(nTrainSet) + ".gzip.h5"
            datasets = load_next_data(filename)
            ns_train_set_x, ns_train_set_y = datasets
            train_set_x.set_value(ns_train_set_x, borrow=True)
            shared_train_set_y.set_value(
                numpy.asarray(ns_train_set_y, dtype=theano.config.floatX), borrow=True
            )
            n_train_batches = train_set_x.get_value(borrow=True).shape[0]
            n_train_batches //= batch_size

            # train
            for minibatch_index in range(n_train_batches):

                # training itself
                # --------------------------------------
                cost_ij = train_model(minibatch_index)
                # -------------------------

        # at the end of each epoch run validation
        this_validation_loss = 0
        for nValSet in range(1, val_files_num + 1):
            filename = valid_dir + "rep_valid_data_" + str(nValSet) + ".gzip.h5"
            datasets = load_next_data(filename)
            ns_valid_set_x, ns_valid_set_y = datasets
            valid_set_x.set_value(ns_valid_set_x, borrow=True)
            shared_valid_set_y.set_value(
                numpy.asarray(ns_valid_set_y, dtype=theano.config.floatX), borrow=True
            )
            n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
            n_valid_batches //= batch_size

            # compute zero-one loss on validation set
            validation_losses = [validate_model(i) for i in range(n_valid_batches)]
            this_validation_loss += numpy.mean(validation_losses)
        this_validation_loss /= val_files_num
        print((
            "epoch %i, minibatch %i/%i, validation error %f %%"
            % (
                epoch,
                minibatch_index + 1,
                n_train_batches,
                this_validation_loss * 100.0,
            )
        ))

        # save snapshots
        print("saving weights state, epoch = ", epoch)
        f = open(weights_dir + "weights_epoch" + str(epoch) + ".save", "wb")
        state_L0 = layer0.__getstate__()
        pickle.dump(state_L0, f, protocol=pickle.HIGHEST_PROTOCOL)
        state_L1 = layer1.__getstate__()
        pickle.dump(state_L1, f, protocol=pickle.HIGHEST_PROTOCOL)
        state_L2 = layer2.__getstate__()
        pickle.dump(state_L2, f, protocol=pickle.HIGHEST_PROTOCOL)
        state_L3 = layer3.__getstate__()
        pickle.dump(state_L3, f, protocol=pickle.HIGHEST_PROTOCOL)
        state_L4 = layer4.__getstate__()
        pickle.dump(state_L4, f, protocol=pickle.HIGHEST_PROTOCOL)
        f.close()

    end_time = time.clock()
    print ("Optimization complete.")
    print((
        "The code for file "
        + os.path.split(__file__)[1]
        + " ran for %.2fm" % ((end_time - start_time) / 60.0)
    ), file=sys.stderr)
def train_cifar(learning_rate_base=1.0,batch_size=128,n_epochs=200,test_frequency=1300, check_point_frequency=5000,show_progress_frequency=100):
    check_point_path = '/home/chensi/mylocal/sichen/data/check_points/'
    parser = optparse.OptionParser()
    parser.add_option("-f",dest="filename", default='None')

    (options, args) = parser.parse_args()

    #defining the rngs
    rng1 = numpy.random.RandomState(23455)
    rng2 = numpy.random.RandomState(12423)
    rng3 = numpy.random.RandomState(23245)
    rng4 = numpy.random.RandomState(12123)
    rng5 = numpy.random.RandomState(25365)


    train_set_x, train_set_y = load_cifar_data(['data_batch_1','data_batch_2','data_batch_3','data_batch_4','data_batch_5'])
    test_set_x, test_set_y = load_cifar_data(['test_batch'],WHICHSET='test')

    n_training_batches = train_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]

    n_training_batches //= batch_size
    n_test_batches //= batch_size

    index = T.lscalar()

    x = T.matrix('x')
    y = T.ivector('y')

    img_input = x.reshape((batch_size,3,32,32)) #bc01
    img_input = img_input.dimshuffle(1,2,3,0) #c01b

    #####################
    #defining the layers#
    #####################

    if options.filename == 'None':
        print 'start new training...'
        print 'building model...'
        conv1_input = img_input
        conv_pool1 = LeNetConvPoolLayer(rng=rng1,input=conv1_input,
                                    filter_shape=(3,5,5,32),
                                    image_shape=(3,32,32,batch_size),
                                    activation='relu',
                                    poolsize=(3,3),poolstride=2,pad=2,
                                    convstride=1,initW=0.0001,initB=0,partial_sum=4,
                                    pooling='max',
                                    epsW=0.001,
                                    epsB=0.002,
                                    momW=0.9,
                                    momB=0.9,
                                    wc=0.004,
                                    name='conv1'
                                    )
        conv_pool2_input = drop_out_layer(rng2,conv_pool1.output,0.5)
        conv_pool2 = LeNetConvPoolLayer(rng=rng2,input=conv_pool2_input,
                                        filter_shape=(32,5,5,32),
                                        image_shape=(32,16,16,batch_size),
                                        activation='relu',
                                        poolsize=(3,3),poolstride=2,pad=2,
                                        convstride=1,initW=0.01,initB=0,partial_sum=4,
                                        pooling='average',
                                        epsW=0.001,
                                        epsB=0.002,
                                        momW=0.9,
                                        momB=0.9,
                                        wc=0.004,
                                        name='conv2')
        conv_pool3_input = drop_out_layer(rng3,conv_pool2.output,0.5)
        conv_pool3 = LeNetConvPoolLayer(rng=rng3,input=conv_pool3_input,
                                        filter_shape=(32,5,5,64),
                                        image_shape=(32,8,8,batch_size),
                                        activation='relu',
                                        poolsize=(3,3),poolstride=2,pad=2,
                                        convstride=1,initW=0.01,initB=0,partial_sum=4,
                                        pooling='average',
                                        epsW=0.001,
                                        epsB=0.002,
                                        momW=0.9,
                                        momB=0.9,
                                        wc=0.004,
                                        name='conv3')

        layer4_input = conv_pool3.output.dimshuffle(3,0,1,2).flatten(2)
        #fc_64 = HiddenLayer(rng=rng4,input=layer4_input,n_in=64*4*4,n_out=64,initW=0.1,initB=0)

        fc_2_input = drop_out_layer(rng1,input=layer4_input,p=0.5)
        fc_2 = LogisticRegression(input=fc_2_input,rng=rng5,n_in=64*4*4,n_out=10,initW=0.01,
                                   epsW=0.001,
                                    epsB=0.002,
                                    momW=0.9,
                                    momB=0.9,
                                    wc=1.0,
                                    name='fc2')
    else:
        print 'resume training %s...' % options.filename

        params_file = open(check_point_path+options.filename,'rb')
        params = cPickle.load(params_file)
        params_file.close()
        layer1_W = theano.shared(params[0],borrow=True)
        layer1_b = theano.shared(params[1],borrow=True)
        layer2_W = theano.shared(params[2],borrow=True)
        layer2_b = theano.shared(params[3],borrow=True)
        layer3_W = theano.shared(params[4],borrow=True)
        layer3_b = theano.shared(params[5],borrow=True)
        fc10_W = theano.shared(params[6],borrow=True)
        fc10_b = theano.shared(params[7],borrow=True)
        print 'building model...'

        conv_pool1 = LeNetConvPoolLayer(rng=rng1,input=img_input,
                                    filter_shape=(3,5,5,32),
                                    image_shape=(3,32,32,batch_size),
                                    poolsize=(3,3),poolstride=2,pad=2,
                                    convstride=1,initW=0.0001,initB=0,partial_sum=4,
                                    activation='relu',
                                    pooling='max',
                                    epsW=0.001,
                                    epsB=0.001,
                                    momW=0.9,
                                    momB=0.9,
                                    wc=0.004,
                                    name='conv1',
                                    W1=layer1_W,
                                    b1=layer1_b
                                    )
        conv_pool2_input = drop_out_layer(rng2,conv_pool1.output,0.5)
        conv_pool2 = LeNetConvPoolLayer(rng=rng2,input=conv_pool2_input,
                                        filter_shape=(32,5,5,32),
                                        image_shape=(32,16,16,batch_size),
                                        poolsize=(3,3),poolstride=2,pad=2,
                                        convstride=1,initW=0.01,initB=0,partial_sum=4,
                                        pooling='average',
                                        activation='relu',
                                        epsW=0.001,
                                        epsB=0.001,
                                        momW=0.9,
                                        momB=0.9,
                                        wc=0.004,
                                        name='conv2',
                                        W1=layer2_W,
                                        b1=layer2_b
                                        )
        conv_pool3_input = drop_out_layer(rng3,conv_pool2.output,0.5)
        conv_pool3 = LeNetConvPoolLayer(rng=rng3,input=conv_pool3_input,
                                        filter_shape=(32,5,5,64),
                                        image_shape=(32,8,8,batch_size),
                                        poolsize=(3,3),poolstride=2,pad=2,
                                        convstride=1,initW=0.01,initB=0,partial_sum=4,
                                        pooling='average',
                                        activation='relu',
                                        epsW=0.001,
                                        epsB=0.001,
                                        momW=0.9,
                                        momB=0.9,
                                        wc=0.004,
                                        name='conv3',
                                        W1=layer3_W,
                                        b1=layer3_b
                                        )

        layer4_input = conv_pool3.output.dimshuffle(3,0,1,2).flatten(2)
        #fc_64 = HiddenLayer(rng=rng4,input=layer4_input,n_in=64*4*4,n_out=64,initW=0.1,initB=0)

        fc_2_input = drop_out_layer(rng1,input=layer4_input,p=0.5)

        fc_2 = LogisticRegression(input=fc_2_input,rng=rng5,n_in=64*4*4,n_out=10,initW=0.01,
                                   epsW=0.001,
                                    epsB=0.001,
                                    momW=0.9,
                                    momB=0.9,
                                    wc=1.0,
                                    W=fc10_W,
                                    b=fc10_b,
                                    name='fc2'
                                    )

    all_layers = [conv_pool1,conv_pool2,conv_pool3,fc_2]
#############################################
############### test model###################
#############################################
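    # The training network above applies dropout (p = 0.5) between layers; the
    # test-time copies below share the trained parameters, with the convolution
    # weights scaled by 0.5 (W1=conv_poolN.W*0.5) to compensate for the units
    # dropped during training.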
    print 'building test model...'
    conv1_input_test = img_input
    conv_pool1_test = LeNetConvPoolLayer(rng=rng1,input=conv1_input_test,
                                filter_shape=(3,5,5,32),
                                image_shape=(3,32,32,batch_size),
                                activation='relu',
                                poolsize=(3,3),poolstride=2,pad=2,
                                convstride=1,initW=0.0001,initB=0,partial_sum=4,
                                pooling='max',
                                W1=conv_pool1.W*0.5,
                                b1=conv_pool1.b,
                                name='conv1'
                                )

    conv_pool2_test = LeNetConvPoolLayer(rng=rng2,input=conv_pool1_test.output,
                                    filter_shape=(32,5,5,32),
                                    image_shape=(32,16,16,batch_size),
                                    activation='relu',
                                    poolsize=(3,3),poolstride=2,pad=2,
                                    convstride=1,initW=0.01,initB=0,partial_sum=4,
                                    pooling='average',
                                    W1=conv_pool2.W*0.5,
                                    b1=conv_pool2.b,
                                    name='conv2')
    conv_pool3_test = LeNetConvPoolLayer(rng=rng3,input=conv_pool2_test.output,
                                    filter_shape=(32,5,5,64),
                                    image_shape=(32,8,8,batch_size),
                                    activation='relu',
                                    poolsize=(3,3),poolstride=2,pad=2,
                                    convstride=1,initW=0.01,initB=0,partial_sum=4,
                                    pooling='average',
                                    W1=conv_pool3.W*0.5,
                                    b1=conv_pool3.b,
                                    name='conv3')

    layer4_input_test = conv_pool3_test.output.dimshuffle(3,0,1,2).flatten(2)


    fc_2_test = LogisticRegression(input=layer4_input_test,rng=rng5,n_in=64*4*4,n_out=10,initW=0.01,
                                W=fc_2.W,
                                b=fc_2.b,
                                name='fc2')


    #cost_test = fc_2_test.negative_log_likelihood(y)



    test_model = theano.function(inputs=[index], outputs=fc_2_test.errors(y),
                                 givens={
                                     x:test_set_x[index*batch_size: (index+1)*batch_size],
                                     y:test_set_y[index*batch_size: (index+1)*batch_size]
                                 })
########train model
    cost = fc_2.negative_log_likelihood(y)
    Ws = []
    pgradWs = []

    bs = []
    pgradbs = []

    momWs = []
    mombs = []

    epsWs = []
    epsbs = []
    wcs = []

    for i in range(len(all_layers)):
        Ws.append(all_layers[i].W)
        pgradWs.append(all_layers[i].grad_W)
        bs.append(all_layers[i].b)
        pgradbs.append(all_layers[i].grad_b)
        momWs.append(all_layers[i].momW)
        mombs.append(all_layers[i].momB)
        epsWs.append(all_layers[i].epsW)
        epsbs.append(all_layers[i].epsB)
        wcs.append(all_layers[i].wc)

    gradWs = T.grad(cost, Ws)
    gradbs = T.grad(cost, bs)
    updates = []
    for W_i, gradW_i, momW_i, wc_i, epsW_i, pgW_i in zip(Ws, gradWs, momWs, wcs, epsWs, pgradWs):
        epsW_i *= learning_rate_base
        grad_i = - epsW_i*gradW_i - wc_i*epsW_i*W_i + momW_i*pgW_i

        updates.append((W_i, W_i+grad_i))
        updates.append((pgW_i, grad_i))

    for b_i, gradb_i, momb_i, epsb_i, pgb_i in zip(bs, gradbs, mombs, epsbs, pgradbs):
        grad_i = - epsb_i*gradb_i + momb_i*pgb_i
        updates.append((b_i, b_i+grad_i))
        updates.append((pgb_i,grad_i))

    train_model = theano.function(inputs=[index],outputs=[cost,fc_2.errors(y)],updates=updates,
                                  givens={
                                      x: train_set_x[index*batch_size:(index+1)*batch_size],
                                      y: train_set_y[index*batch_size:(index+1)*batch_size]
                                  })

    #############
    #train model#
    #############
    print 'training...'


    best_validation_loss = numpy.inf
    best_epoch = 0

    epoch = 0

    pweights = []
    pbias = []
    for i in range(len(all_layers)):
        pweights.append(numpy.mean(numpy.abs(all_layers[i].W.get_value()[0,:])))
        pbias.append(numpy.mean(numpy.abs(all_layers[i].b.get_value())))
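    # pweights / pbias hold, per layer, the mean absolute value of the first
    # weight slice (W[0,:]) and of the biases as of the previous report; the
    # test-time block below prints the current values together with the change
    # since the last report.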
    time_start = time.time()
    start_time = time.time()
    while(epoch<n_epochs):
        epoch = epoch + 1
        for minibatch_index in range(n_training_batches):

            iter = (epoch-1)*n_training_batches + minibatch_index
            train_out = train_model(minibatch_index)
            if iter % show_progress_frequency == 0:
                time_end = time.time()
                print 'epoch: %d, batch_num: %d, cost: %f, training_error: %f, (%f seconds)' % (epoch, minibatch_index, train_out[0], train_out[1], time_end-time_start)
                time_start = time.time()

            if (iter+1) % test_frequency == 0:
                time1 = time.time()
                test_losses = [test_model(i) for i in range(n_test_batches)]
                this_test_loss = numpy.mean(test_losses)
                print '=====================testing output==========================='
                print 'epoch: %d, batch_num: %d, test_error: %f ' % (epoch, minibatch_index, this_test_loss*100.)
                for i in range(len(all_layers)):
                    weights = numpy.mean(numpy.abs(all_layers[i].W.get_value()[0,:]))
                    bias = numpy.mean(numpy.abs(all_layers[i].b.get_value()))

                    print 'Layer: %s, weights[0]:%e [%e]' % (all_layers[i].name, weights*1.00, weights-pweights[i])
                    print 'Layer: %s,bias: %e[%e]' % (all_layers[i].name, bias*1.00, bias-pbias[i])
                    pweights[i] = weights
                    pbias[i] = bias
                if this_test_loss < best_validation_loss:
                    best_epoch = epoch
                    best_validation_loss = this_test_loss
                    best_params = []
                    for i in range(len(all_layers)):
                        best_params.append(all_layers[i].W.get_value().copy())
                        best_params.append(all_layers[i].b.get_value().copy())
                    outfile_name = check_point_path+'current_best_params.pkl'
                    outfile = open(outfile_name,'wb')
                    cPickle.dump(best_params,outfile)
                    outfile.close()
                    print 'saved best params to %s' % outfile_name
                time2 = time.time()
                print '==================================================(%f seconds)' % (time2-time1)
            if (iter+1) % check_point_frequency == 0:
                print '~~~~~~~~~~~~~~~~~~saving check_point~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
                time1 = time.time()
                current_params = []
                for i in range(len(all_layers)):
                    current_params.append(all_layers[i].W.get_value().copy())
                    current_params.append(all_layers[i].b.get_value().copy())
                outfile_name = check_point_path + 'current_params_' + str(time.localtime().tm_mon) + '_' + str(time.localtime().tm_mday) \
                + '_' + str(time.localtime().tm_hour) + '_' + str(time.localtime().tm_min) + '_' + str(time.localtime().tm_sec)+'.pkl'
                outfile = open(outfile_name,'wb')
                cPickle.dump(current_params,outfile)
                outfile.close()
                print 'saved check_point to %s' % outfile_name
                time2 = time.time()
                print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~(%f seconds)' % (time2-time1)

    end_time = time.time()
    print 'Best test score is %f at epoch %d. Total time:%f hour' % (best_validation_loss * 100., best_epoch, (end_time-start_time)/3600.)
Example #4
class deep_sugar(object):
    def __init__(self, numpy_rng, theano_rng=None, y=None, 
                 alpha=0.9, sample_rate=0.1, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10,
                 corruption_levels=[0.1, 0.1],
                 allX=None,allY=None,srng=None):
        self.sigmoid_layers = []
        self.sugar_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)
        self.allXs = []
        if y is None:
            self.y = tensor.ivector(name='y')
        else:
            self.y = y
        assert self.n_layers > 0
        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        self.x = tensor.matrix('x')
        self.y = tensor.ivector('y')
        for i in xrange(self.n_layers):
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output
            if i == 0:
                self.allXs.append(allX)
            else:
                self.allXs.append(tensor.dot(self.allXs[i-1], self.sigmoid_layers[-1].W) + self.sigmoid_layers[-1].b)
            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=tensor.nnet.sigmoid)
            self.sigmoid_layers.append(sigmoid_layer)
            self.params.extend(sigmoid_layer.params)
            sugar_layer = sugar(numpy_rng=numpy_rng,
                                alpha=alpha,
                                sample_rate=sample_rate,
                                x=layer_input,
                                y=self.y,
                                n_visible=input_size,
                                n_hidden=hidden_layers_sizes[i],
                                W=sigmoid_layer.W,
                                bhid=sigmoid_layer.b,
                                allX=self.allXs[i],
                                allY=allY,
                                srng=srng)
            self.sugar_layers.append(sugar_layer)
        self.logLayer = LogisticRegression(
                         input=self.sigmoid_layers[-1].output,
                         n_in=hidden_layers_sizes[-1], n_out=n_outs)
        self.params.extend(self.logLayer.params)
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)
        
    def pretraining_functions(self, train_set_x, train_set_y, batch_size):
        index = tensor.lscalar('index')
        corruption_level = tensor.scalar('corruption')
        learning_rate = tensor.scalar('lr')
        switch = tensor.iscalar('switch')
        n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
        batch_begin = index * batch_size
        batch_end = batch_begin + batch_size
        pretrain_fns = []
        for sugar in self.sugar_layers:
            cost, updates = sugar.get_cost_updates(corruption_level,
                                                learning_rate,
                                                switch)
            fn = function(inputs=[index,
                                         Param(corruption_level, default=0.2),
                                         Param(learning_rate, default=0.1),
                                         Param(switch, default=1)],
                                 outputs=[cost],
                                 updates=updates,
                                 givens={self.x: train_set_x[batch_begin:batch_end],
                                         self.y: train_set_y[batch_begin:batch_end]}, on_unused_input='ignore')
            pretrain_fns.append(fn)
        return pretrain_fns
        
    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y)   = datasets[2]
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches /= batch_size
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_test_batches /= batch_size
        index = tensor.lscalar('index')  
        gparams = tensor.grad(self.finetune_cost, self.params)
        updates = []
        for param, gparam in zip(self.params, gparams):
            updates.append((param, param - gparam * learning_rate))
        train_fn = function(inputs=[index],
              outputs=self.finetune_cost,
              updates=updates,
              givens={
                self.x: train_set_x[index * batch_size:
                                    (index + 1) * batch_size],
                self.y: train_set_y[index * batch_size:
                                    (index + 1) * batch_size]})
        test_score_i = function([index], self.errors,
                 givens={
                   self.x: test_set_x[index * batch_size:
                                      (index + 1) * batch_size],
                   self.y: test_set_y[index * batch_size:
                                      (index + 1) * batch_size]})
        valid_score_i = function([index], self.errors,
              givens={
                 self.x: valid_set_x[index * batch_size:
                                     (index + 1) * batch_size],
                 self.y: valid_set_y[index * batch_size:
                                     (index + 1) * batch_size]})
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]
        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches)]
        return train_fn, valid_score, test_score
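# A minimal usage sketch for deep_sugar (hedged: dataset loading and the
# sugar/HiddenLayer/LogisticRegression classes are assumed to be defined
# elsewhere; all argument values are illustrative only):
#
#   sda = deep_sugar(numpy_rng=numpy.random.RandomState(89677),
#                    n_ins=784, hidden_layers_sizes=[500, 500], n_outs=10,
#                    allX=train_set_x, allY=train_set_y, srng=srng)
#   pretrain_fns = sda.pretraining_functions(train_set_x, train_set_y, batch_size=20)
#   for fn in pretrain_fns:
#       for batch_index in xrange(n_train_batches):
#           fn(batch_index, lr=0.001)
#   train_fn, valid_score, test_score = sda.build_finetune_functions(
#       datasets, batch_size=20, learning_rate=0.1)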
Example #5
class deep_sugar(object):
    def __init__(self,
                 numpy_rng,
                 theano_rng=None,
                 y=None,
                 alpha=0.9,
                 sample_rate=0.1,
                 n_ins=784,
                 hidden_layers_sizes=[500, 500],
                 n_outs=10,
                 corruption_levels=[0.1, 0.1],
                 allX=None,
                 allY=None,
                 srng=None):
        self.sigmoid_layers = []
        self.sugar_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)
        self.allXs = []
        if y is None:
            self.y = tensor.ivector(name='y')
        else:
            self.y = y
        assert self.n_layers > 0
        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))
        self.x = tensor.matrix('x')
        self.y = tensor.ivector('y')
        for i in xrange(self.n_layers):
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output
            if i == 0:
                self.allXs.append(allX)
            else:
                self.allXs.append(
                    tensor.dot(self.allXs[i - 1], self.sigmoid_layers[-1].W) +
                    self.sigmoid_layers[-1].b)
            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=tensor.nnet.sigmoid)
            self.sigmoid_layers.append(sigmoid_layer)
            self.params.extend(sigmoid_layer.params)
            sugar_layer = sugar(numpy_rng=numpy_rng,
                                alpha=alpha,
                                sample_rate=sample_rate,
                                x=layer_input,
                                y=self.y,
                                n_visible=input_size,
                                n_hidden=hidden_layers_sizes[i],
                                W=sigmoid_layer.W,
                                bhid=sigmoid_layer.b,
                                allX=self.allXs[i],
                                allY=allY,
                                srng=srng)
            self.sugar_layers.append(sugar_layer)
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, train_set_x, train_set_y, batch_size):
        index = tensor.lscalar('index')
        corruption_level = tensor.scalar('corruption')
        learning_rate = tensor.scalar('lr')
        switch = tensor.iscalar('switch')
        n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
        batch_begin = index * batch_size
        batch_end = batch_begin + batch_size
        pretrain_fns = []
        for sugar in self.sugar_layers:
            cost, updates = sugar.get_cost_updates(corruption_level,
                                                   learning_rate, switch)
            fn = function(inputs=[
                index,
                Param(corruption_level, default=0.2),
                Param(learning_rate, default=0.1),
                Param(switch, default=1)
            ],
                          outputs=[cost],
                          updates=updates,
                          givens={
                              self.x: train_set_x[batch_begin:batch_end],
                              self.y: train_set_y[batch_begin:batch_end]
                          },
                          on_unused_input='ignore')
            pretrain_fns.append(fn)
        return pretrain_fns

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches /= batch_size
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_test_batches /= batch_size
        index = tensor.lscalar('index')
        gparams = tensor.grad(self.finetune_cost, self.params)
        updates = []
        for param, gparam in zip(self.params, gparams):
            updates.append((param, param - gparam * learning_rate))
        train_fn = function(
            inputs=[index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x:
                train_set_x[index * batch_size:(index + 1) * batch_size],
                self.y:
                train_set_y[index * batch_size:(index + 1) * batch_size]
            })
        test_score_i = function(
            [index],
            self.errors,
            givens={
                self.x:
                test_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: test_set_y[index * batch_size:(index + 1) * batch_size]
            })
        valid_score_i = function(
            [index],
            self.errors,
            givens={
                self.x:
                valid_set_x[index * batch_size:(index + 1) * batch_size],
                self.y:
                valid_set_y[index * batch_size:(index + 1) * batch_size]
            })

        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        return train_fn, valid_score, test_score
def test_conv(learning_rate=0.1, n_epochs=1000, nkerns=[16, 512], kern_shape=[9,7],
            batch_size=200, verbose=False, loadmodel=False):
    """
    learning_rate: step size used for gradient descent

    n_epochs: maximal number of epochs before exiting

    nkerns: number of kernels on each layer

    kern_shape: list of numbers with the dimensions of the kernels

    batch_size: number of examples in a minibatch

    verbose: whether to print an epoch summary

    loadmodel: load parameters from saved .npy files

    """
    
    # Folder for saving and loading parameters
    folder='results'
    # Seed the random generator
    rng = numpy.random.RandomState(1990)

    # Load the dataset
    datasets = load_faceScrub(theano_shared=True)
    
    # Functions for saving and loading parameters
    def save(folder):
        for param in params:
            print (str(param.name))
            numpy.save(os.path.join(folder,
                       param.name + '.npy'), param.get_value())

    def load(folder):
        for param in params:
            param.set_value(numpy.load(os.path.join(folder,
                            param.name + '.npy')))
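    # Note: save() and load() key the .npy files purely on param.name, so every
    # parameter is assumed to carry a unique, stable name; the idx argument
    # passed to each ConvPoolLayer below presumably serves that purpose.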


    # Accessing the train, test and validation sets
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    ###############
    # BUILD MODEL #
    ###############
    print('... building the model')

    # Reshape matrix of rasterized images of shape (batch_size, 1 * 100 * 100)
    # to a 4D tensor, which is expected by theano
    layer0_input = x.reshape((batch_size, 1, 100, 100))
    
    # First convolutional pooling layer
    layer0 = ConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=((batch_size, 1, 100, 100)), 
        filter_shape=((nkerns[0], 1, kern_shape[0], kern_shape[0])),
        poolsize=((2,2)),
        idx=0
    )

    # Second layer
    layer1 = ConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=((batch_size, nkerns[0], 46, 46)),
        filter_shape=((nkerns[1], nkerns[0], kern_shape[1], kern_shape[1])),
        poolsize=((2,2)),
        idx=1
    )
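    # Spatial sizes (assuming valid convolution and 2x2 pooling):
    # layer0: 100x100 -> conv 9x9 -> 92x92 -> pool -> 46x46;
    # layer1: 46x46 -> conv 7x7 -> 40x40 -> pool -> 20x20,
    # which is why the fully connected layer below takes nkerns[1]*20*20 inputs.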
    
    # Flatten input for the fully connected layer
    layer2_input = layer1.output.flatten(2)

    # Fully-connected sigmoidal layer
    layer2 = HiddenLayer(
        rng,
        input=layer2_input,
        n_in=(nkerns[1]*20*20),
        n_out=(500),
        activation=T.tanh
    )   
    

    # Output layer
    layer3 = LogisticRegression(
        input=layer2.output,
        n_in=500,
        n_out=530
    )

    # Cost function
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # Calculate validation error
    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # Parameter list which needs update
    params = layer3.params + layer2.params + layer1.params + layer0.params
    
    # Load the parameters if we want
    if loadmodel:
        load(folder)
        

    # Gradient of costfunction w.r.t. parameters
    grads = T.grad(cost, params)

    # Gradient decent for every parameters
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    # Theano function for calculating the cost and updating the model
    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )


    print('... training')
    train_net(train_model, validate_model, test_model,
        n_train_batches, n_valid_batches, n_test_batches, n_epochs, verbose)
    
    # Save parameters after training
    save(folder)
Example #7
class StackedAutoEncoder(object):
    """Stacked auto-encoder class (SAE)
    Adapted from:
    https://github.com/lisa-lab/DeepLearningTutorials/blob/master/code/SdA.py

    A stacked autoencoder (SAE) model is obtained by stacking several
    AEs. The hidden layer of the AE at layer `i` becomes the input of
    the AE at layer `i+1`. The first layer AE gets as input the input of
    the SAE, and the hidden layer of the last AE represents the output.
    Note that after pretraining, the SAE is dealt with as a normal MLP,
    the AEs are only used to initialize the weights.
    """

    def __init__(
        self,
        numpy_rng,
        train_set_x,
        train_set_y,
        hidden_layers_sizes,
        n_ins=784,
        n_outs=10
    ):
        """ This class is made to support a variable number of layers.

        :type numpy_rng: np.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type train_set_x: theano.shared, float32
        :param train_set_x: training data set, shape (n_samples, n_pixels)

        :type train_set_y: theano.shared, int32
        :param train_set_y: ground truth for the training data, shape (n_samples,)

        :type n_ins: int
        :param n_ins: dimension of the input to the SAE

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
               at least one value
        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """

        self.sigmoid_layers = []
        self.AE_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)
        self.train_set_x = train_set_x
        self.train_set_y = train_set_y

        assert self.n_layers > 0

        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of [int] labels

        for i in xrange(self.n_layers):     # used to be n layers

            # construct the sigmoid layer = encoder stack
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=(n_ins if i == 0 else
                                              hidden_layers_sizes[i-1]),
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            self.params.extend(sigmoid_layer.params)

            # init the AE layer; it shares its weights with the sigmoid layer
            AE_layer = AutoEncoder(
                numpy_rng=numpy_rng,
                input=layer_input,
                n_visible=(n_ins if i == 0 else hidden_layers_sizes[i-1]),
                n_hidden=hidden_layers_sizes[i],
                W=sigmoid_layer.W,
                bhid=sigmoid_layer.b)

            self.AE_layers.append(AE_layer)

        # on top of the layers
        # log layer for fine-tuning
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs
        )
        self.params.extend(self.logLayer.params)
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, batch_size):
        """
        Generates a list of functions to time each AE training.

        :type batch_size: int
        :param batch_size: size of a [mini]batch
        """

        index = T.lscalar('index')  # index to a minibatch

        # beginning of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch given `index`
        batch_end = batch_begin + batch_size

        forward_backward_step = []
        forward_step_fns = []
        i = 0
        for AE in self.AE_layers:

            # get the cost and the updates list
            cost = AE.get_cost_updates()

            params = AE.params
            shared_cost = theano.shared(np.float32(0.0))
            forward_step_fns.append(
                theano.function(
                    [index], [],
                    updates=[(shared_cost, cost)],
                    givens={
                            self.x: self.train_set_x[batch_begin: batch_end],
                            }))
            grads_temp = T.grad(cost, params)

            # This is both forward and backward
            forward_backward_step.append(
                theano.function(
                    [index], grads_temp,
                    givens={
                            self.x: self.train_set_x[batch_begin: batch_end],
                            }))
            i += 1

        return forward_backward_step, forward_step_fns

    def build_finetune_functions(self, batch_size):

        index = T.lscalar('index')  # index to a [mini]batch
        # beginning of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch given `index`
        batch_end = batch_begin + batch_size

        cost = self.finetune_cost
        shared_cost = theano.shared(np.float32(0.0))
        forward_mlp = theano.function(
            [index], [],
            updates=[(shared_cost, cost)],
            givens={
                    self.x: self.train_set_x[batch_begin: batch_end],
                    self.y: self.train_set_y[batch_begin: batch_end],
                    })

        grads_temp = T.grad(cost, self.params)

        # This is both forward and backward
        forward_backward_mlp = theano.function(
            [index], grads_temp,
            givens={
                    self.x: self.train_set_x[batch_begin: batch_end],
                    self.y: self.train_set_y[batch_begin: batch_end],
                    })

        return forward_mlp, forward_backward_mlp
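
# --- Illustrative usage sketch (assumptions noted below) ---------------------
# One plausible way to drive the class above. The toy data below and the layer
# sizes are assumptions made for illustration, and HiddenLayer, AutoEncoder and
# LogisticRegression are assumed to be importable from the same repository;
# only the StackedAutoEncoder calls themselves come from the class above.
import numpy as np
import theano

train_x = theano.shared(np.random.rand(1000, 784).astype(theano.config.floatX))
train_y = theano.shared(np.random.randint(0, 10, 1000).astype('int32'))

sae = StackedAutoEncoder(numpy_rng=np.random.RandomState(42),
                         train_set_x=train_x,
                         train_set_y=train_y,
                         hidden_layers_sizes=[500, 250],
                         n_ins=784,
                         n_outs=10)

# per-AE forward+backward and forward-only step functions (e.g. for timing)
fwd_bwd_fns, fwd_fns = sae.pretraining_functions(batch_size=100)
for f in fwd_bwd_fns:
    grads = f(0)                   # gradients of that AE's cost on minibatch 0

# analogous pair of functions for the fine-tuning (MLP) stage
fine_fwd, fine_fwd_bwd = sae.build_finetune_functions(batch_size=100)
fine_grads = fine_fwd_bwd(0)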
Beispiel #8
0
class StackedAutoEncoder(object):
    """Stacked auto-encoder class (SAE)
    Adapted from:
    https://github.com/lisa-lab/DeepLearningTutorials/blob/master/code/SdA.py

    A stacked autoencoder (SAE) model is obtained by stacking several
    AEs. The hidden layer of the AE at layer `i` becomes the input of
    the AE at layer `i+1`. The first layer AE gets as input the input of
    the SAE, and the hidden layer of the last AE represents the output.
    Note that after pretraining, the SAE is dealt with as a normal MLP,
    the AEs are only used to initialize the weights.
    """
    def __init__(self,
                 numpy_rng,
                 train_set_x,
                 train_set_y,
                 hidden_layers_sizes,
                 n_ins=784,
                 n_outs=10):
        """ This class is made to support a variable number of layers.

        :type numpy_rng: np.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type train_set_x: theano.shared, float32
        :param train_set_x: Training data set, shape (n_samples, n_pixels)

        :type train_set_y: theano.shared, int32
        :param train_set_y: Ground-truth labels for the training data,
               shape (n_samples,)

        :type n_ins: int
        :param n_ins: dimension of the input to the SAE

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
               at least one value
        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """

        self.sigmoid_layers = []
        self.AE_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)
        self.train_set_x = train_set_x
        self.train_set_y = train_set_y

        assert self.n_layers > 0

        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as a 1D vector of [int] labels

        for i in xrange(self.n_layers):

            # construct the sigmoid layer = encoder stack
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(
                rng=numpy_rng,
                input=layer_input,
                n_in=(n_ins if i == 0 else hidden_layers_sizes[i - 1]),
                n_out=hidden_layers_sizes[i],
                activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            self.params.extend(sigmoid_layer.params)

            # init the AE layer; it shares its weights with the sigmoid layer
            AE_layer = AutoEncoder(
                numpy_rng=numpy_rng,
                input=layer_input,
                n_visible=(n_ins if i == 0 else hidden_layers_sizes[i - 1]),
                n_hidden=hidden_layers_sizes[i],
                W=sigmoid_layer.W,
                bhid=sigmoid_layer.b)

            self.AE_layers.append(AE_layer)

        # on top of the layers
        # log layer for fine-tuning
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, batch_size):
        """
        Generates a list of functions to time each AE training.

        :type batch_size: int
        :param batch_size: size of a [mini]batch
        """

        index = T.lscalar('index')  # index to a minibatch

        # beginning of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch given `index`
        batch_end = batch_begin + batch_size

        forward_backward_step = []
        forward_step_fns = []
        i = 0
        for AE in self.AE_layers:

            # get the cost and the updates list
            cost = AE.get_cost_updates()

            params = AE.params
            shared_cost = theano.shared(np.float32(0.0))
            forward_step_fns.append(
                theano.function([index], [],
                                updates=[(shared_cost, cost)],
                                givens={
                                    self.x:
                                    self.train_set_x[batch_begin:batch_end],
                                }))
            grads_temp = T.grad(cost, params)

            # This is both forward and backward
            forward_backward_step.append(
                theano.function([index],
                                grads_temp,
                                givens={
                                    self.x:
                                    self.train_set_x[batch_begin:batch_end],
                                }))
            i += 1

        return forward_backward_step, forward_step_fns

    def build_finetune_functions(self, batch_size):

        index = T.lscalar('index')  # index to a [mini]batch
        # beginning of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch given `index`
        batch_end = batch_begin + batch_size

        cost = self.finetune_cost
        shared_cost = theano.shared(np.float32(0.0))
        forward_mlp = theano.function(
            [index], [],
            updates=[(shared_cost, cost)],
            givens={
                self.x: self.train_set_x[batch_begin:batch_end],
                self.y: self.train_set_y[batch_begin:batch_end],
            })

        grads_temp = T.grad(cost, self.params)

        # This is both forward and backward
        forward_backward_mlp = theano.function(
            [index],
            grads_temp,
            givens={
                self.x: self.train_set_x[batch_begin:batch_end],
                self.y: self.train_set_y[batch_begin:batch_end],
            })

        return forward_mlp, forward_backward_mlp
Beispiel #9
0
def train_nnet(learning_rate=0.1, n_epochs=2,
                    dataset='mnist.pkl.gz',
                    nkerns=[20, 50], batch_size=500):
    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    data = T.matrix('x')
    result = T.matrix('y')  # the target labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    init_input = data.reshape((batch_size, 1, 16, 16))

    # Check for a pickle file holding previously trained weights
    old_weights = [[None, None]] * 4
    try:
        old_weights = pickle.load(open(sys.argv[1], "rb"))
    except FileNotFoundError as e:
        print(e)
    except IndexError:
        pass

    # Reshape matrix of rasterized images of shape (batch_size, 16 * 16)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (16, 16) is the image size used here.

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (16-5+1, 16-5+1) = (12, 12)
    # maxpooling reduces this further to (12/2, 12/2) = (6, 6)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 6, 6)
    layer0 = LeNetConvPoolLayer(
        rng,
        input=init_input,
        image_shape=(batch_size, 1, 16, 16),
        filter_shape=(nkerns[0], 1, 5, 5),
        oldWeights=old_weights[0][0],
        oldBias=old_weights[0][1],
        poolsize=(2, 2)
    )

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (6-3+1, 6-3+1) = (4, 4)
    # maxpooling reduces this further to (4/2, 4/2) = (2, 2)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 2, 2)
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 6, 6),
        filter_shape=(nkerns[1], nkerns[0], 3, 3),
        oldWeights=old_weights[1][0],
        oldBias=old_weights[1][1],
        poolsize=(2, 2)
    )

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. a matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 2 * 2),
    # or (500, 50 * 2 * 2) = (500, 200) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected layer with tanh activation
    layer2 = HiddenLayer(
        rng,
        input=layer2_input,
        n_in=nkerns[1] * 2 * 2,
        n_out=64,
        oldWeights=old_weights[2][0],
        oldBias=old_weights[2][1],
        activation=T.tanh
    )

    # classify the values of the fully-connected layer
    layer3 = LogisticRegression(
        input=layer2.output,
        n_in=64,
        n_out=256,
        oldWeights=old_weights[3][0],
        oldBias=old_weights[3][1],
    )

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(result)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(result),
        givens={
            data: test_set_x[index * batch_size: (index + 1) * batch_size],
            result: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        [index],
        layer3.errors(result),
        givens={
            data: valid_set_x[index * batch_size: (index + 1) * batch_size],
            result: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            data: train_set_x[index * batch_size: (index + 1) * batch_size],
            result: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    done = False
    for epoch in range(1, n_epochs + 1):
        for minibatch_index in range(int(n_train_batches)):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print('training @ iter = ', iter)
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in range(int(n_valid_batches))]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase) 
                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i)
                        for i in range(int(n_test_batches))
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done = True
                break
        if done:
            break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    weights = []
    weights.append([layer0.W.get_value(), layer0.b.get_value()])
    weights.append([layer1.W.get_value(), layer1.b.get_value()])
    weights.append([layer2.W.get_value(), layer2.b.get_value()])
    weights.append([layer3.W.get_value(), layer3.b.get_value()])

    pickle.dump(weights, open("mlp.pkl", "wb"))
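
# --- Note on the weight persistence above ------------------------------------
# The function dumps the trained [W, b] pairs for layer0..layer3 to "mlp.pkl",
# and the try/except near the top reloads whatever file is passed as the first
# command-line argument into `old_weights`. So a plausible follow-up run (the
# script name here is only a placeholder) would be:
#
#     python <this_script>.py mlp.pkl
#
# and the pickle can also be inspected directly:
import pickle

with open("mlp.pkl", "rb") as f:
    weights = pickle.load(f)       # list of [W, b] numpy arrays, layer0..layer3
for W, b in weights:
    print(W.shape, b.shape)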
Beispiel #10
0
class DBN(object):
    def __init__(self,
                 input,
                 output,
                 n_in,
                 hidden_layers_sizes,
                 n_out,
                 dropout=None,
                 optimizer=SGD,
                 is_train=0):

        self.dense_layers = []
        self.rbm_layers = []
        self.params = []
        self.consider_constants = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        self.rng = np.random.RandomState(888)
        self.theano_rng = RandomStreams(self.rng.randint(2**30))

        for i in range(self.n_layers):
            if i == 0:
                input_size = n_in
                layer_input = input
            else:
                input_size = hidden_layers_sizes[i - 1]
                layer_input = self.dense_layers[-1].output

            dense_layer = DenseLayer(rng=self.rng,
                                     theano_rng=self.theano_rng,
                                     input=layer_input,
                                     n_in=input_size,
                                     n_out=hidden_layers_sizes[i],
                                     activation=T.nnet.softplus,
                                     dropout=dropout,
                                     is_train=is_train)

            rbm_layer = RBM(input=layer_input,
                            rng=self.rng,
                            theano_rng=self.theano_rng,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=dense_layer.W,
                            hbias=dense_layer.b,
                            dropout=dropout,
                            h_activation=T.nnet.softplus,
                            optimizer=optimizer,
                            is_train=is_train)

            self.dense_layers.append(dense_layer)
            self.rbm_layers.append(rbm_layer)
            self.params.extend(dense_layer.params)

            if dense_layer.consider_constant is not None:
                self.consider_constants.extend(dense_layer.consider_constant)
            # end-for

        self.logistic_layer = LogisticRegression(
            input=self.dense_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_out)

        self.params.extend(self.logistic_layer.params)

        self.finetune_cost = self.logistic_layer.negative_loglikelihood(output)
        self.finetune_errors = self.logistic_layer.errors(output)

        self.input = input
        self.output = output
        self.is_train = is_train

        # model updates
        self.finetune_opt = optimizer(self.params)

    def _finetune_updates(self, learning_rate):
        return self.finetune_opt.update(self.finetune_cost, self.params,
                                        learning_rate, self.consider_constants)

    def build_pretraining_functions(self, datasets, batch_size, k=1):

        train_set_x = datasets[0][0]
        valid_set_x = datasets[1][0]

        index = T.lscalar('index')  # index to a [mini]batch
        learning_rate = T.scalar('learning_rate')

        self.rbm_pretraining_fns = []
        self.rbm_pretraining_errors = []

        batch_begin = index * batch_size
        batch_end = batch_begin + batch_size

        for n, rbm_layer in enumerate(self.rbm_layers):

            persistent_chain = theano.shared(value=np.zeros(
                shape=(batch_size, rbm_layer.n_hidden),
                dtype=theano.config.floatX),
                                             borrow=True)

            rbm_cost, rbm_updates = rbm_layer.get_cost_updates(
                learning_rate, persistent_chain, k)

            train_rbm = theano.function(inputs=[index, learning_rate],
                                        outputs=rbm_cost,
                                        updates=rbm_updates,
                                        givens={
                                            self.input:
                                            train_set_x[batch_begin:batch_end],
                                            rbm_layer.is_train:
                                            T.cast(1, 'int32')
                                        },
                                        name='train_rbm' + '_' + str(n))
            self.rbm_pretraining_fns.append(train_rbm)

            validate_rbm = theano.function(
                inputs=[index],
                outputs=rbm_layer.get_valid_error(),
                givens={
                    self.input: valid_set_x[batch_begin:batch_end],
                    rbm_layer.is_train: T.cast(0, 'int32')
                },
                name='valid_rbm' + '_' + str(n))
            self.rbm_pretraining_errors.append(validate_rbm)
            # end-for

        return self.rbm_pretraining_fns, self.rbm_pretraining_errors

    def build_finetune_functions(self, datasets, batch_size):

        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        index = T.lscalar('index')  # index to a [mini]batch
        learning_rate = T.scalar('learning_rate')

        batch_begin = index * batch_size
        batch_end = batch_begin + batch_size

        test_model = theano.function(inputs=[index],
                                     outputs=self.finetune_errors,
                                     givens={
                                         self.input:
                                         test_set_x[batch_begin:batch_end],
                                         self.output:
                                         test_set_y[batch_begin:batch_end],
                                         self.is_train:
                                         T.cast(0, 'int32')
                                     })

        validate_model = theano.function(
            inputs=[index],
            outputs=self.finetune_errors,
            givens={
                self.input: valid_set_x[batch_begin:batch_end],
                self.output: valid_set_y[batch_begin:batch_end],
                self.is_train: T.cast(0, 'int32')
            })

        train_model = theano.function(
            inputs=[index, learning_rate],
            outputs=self.finetune_cost,
            updates=self._finetune_updates(learning_rate),
            givens={
                self.input: train_set_x[batch_begin:batch_end],
                self.output: train_set_y[batch_begin:batch_end],
                self.is_train: T.cast(1, 'int32')
            })

        return train_model, validate_model, test_model
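
# --- Illustrative usage sketch (assumptions noted below) ---------------------
# One plausible way to drive the DBN class above. The toy datasets, sizes and
# learning rates are made up; DenseLayer, RBM, LogisticRegression and SGD are
# assumed to come from the same repository; and `is_train` is assumed to be a
# symbolic int32 scalar, since it is substituted via `givens` above.
import numpy as np
import theano
import theano.tensor as T


def _toy_split(n):
    xs = theano.shared(np.random.rand(n, 784).astype(theano.config.floatX))
    ys = theano.shared(np.random.randint(0, 10, n).astype('int32'))
    return xs, ys


datasets = [_toy_split(200), _toy_split(100), _toy_split(100)]

x = T.matrix('x')
y = T.ivector('y')
dbn = DBN(input=x, output=y, n_in=784,
          hidden_layers_sizes=[500, 250], n_out=10,
          dropout=0.1, optimizer=SGD,
          is_train=T.iscalar('is_train'))

# layer-wise pretraining: one (train_fn, validate_fn) pair per RBM
pretrain_fns, pretrain_errs = dbn.build_pretraining_functions(datasets,
                                                              batch_size=20, k=1)
for train_rbm, valid_rbm in zip(pretrain_fns, pretrain_errs):
    for batch in range(10):
        train_rbm(batch, 0.01)     # arguments: (minibatch index, learning rate)
    valid_rbm(0)

# supervised fine-tuning of the whole stack
train_fn, valid_fn, test_fn = dbn.build_finetune_functions(datasets, batch_size=20)
train_fn(0, 0.1)                   # arguments: (minibatch index, learning rate)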
Beispiel #11
0
class SentConv(object):
    def __init__(self,
                 learning_rate=0.1,
                 L1_reg=0.00,
                 L2_reg=0.0001,
                 filter_hs=[3, 4, 5],
                 filter_num=100,
                 n_hidden=100,
                 n_out=2,
                 word_idx_map=None,
                 wordvec=None,
                 k=300,
                 adjust_input=False):
        """
        :type learning_rate: float
        :param learning_rate: learning rate used (factor for the stochastic
        gradient)

        :type L1_reg: float
        :param L1_reg: L1-norm's weight when added to the cost (see
        regularization)

        :type L2_reg: float
        :param L2_reg: L2-norm's weight when added to the cost (see
        regularization)
        """
        self.learning_rate = learning_rate
        self.L1_reg = L1_reg
        self.L2_reg = L2_reg
        self.word_idx_map = word_idx_map
        rng = np.random.RandomState(3435)
        self.rng = rng
        self.k = k
        self.filter_num = filter_num
        self.filter_hs = filter_hs
        # Can be assigned at the fit step.
        self.batch_size = None

        self.epoch = 0

        self.Words = theano.shared(value=wordvec, name="Words")
        X = T.matrix('X')
        Y = T.ivector('Y')
        self.X = X
        self.Y = Y

        layer0_input = self.Words[T.cast(X.flatten(), dtype='int32')].reshape((X.shape[0], X.shape[1], self.Words.shape[1]))
        self.layer0_input = layer0_input
        c_max_list = []
        self.conv_layer_s = []
        test_case = []

        for filter_h in filter_hs:
            conv_layer = ConvLayer(rng, layer0_input, filter_h=filter_h, filter_num=filter_num, k=k)
            self.conv_layer_s.append(conv_layer)
            c_max_list.append(conv_layer.c_max)
        max_pooling_out = T.concatenate(c_max_list, axis=1)
        max_pooling_out_size = filter_num * len(filter_hs)

        self.hidden_layer = HiddenLayer(rng, max_pooling_out, max_pooling_out_size, n_hidden)

        self.lr_layer = LogisticRegression(
            input=self.hidden_layer.output,
            n_in=n_hidden,
            n_out=n_out,
        )
        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self.L1 = (
            sum([abs(conv_layer.W).sum() for conv_layer in self.conv_layer_s])
            + abs(self.hidden_layer.W).sum()
            + abs(self.lr_layer.W).sum()
        )

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = (
            sum([(conv_layer.W ** 2).sum() for conv_layer in self.conv_layer_s])
            + (self.hidden_layer.W ** 2).sum()
            + (self.lr_layer.W ** 2).sum()
        )



        # the cost we minimize during training is the negative log likelihood of
        # the model plus the regularization terms (L1 and L2); cost is expressed
        # here symbolically
        self.cost = (
            self.negative_log_likelihood(Y)
            + self.L1_reg * self.L1
            + self.L2_reg * self.L2_sqr
        )

        # the parameters of the model are the parameters of the two layer it is
        # made out of
        self.params = []
        # also adjust the input word vectors
        if adjust_input:
            self.params.append(self.Words)
        for conv_layer in self.conv_layer_s:
            self.params += conv_layer.params
        self.params += self.hidden_layer.params
        self.params += self.lr_layer.params

    # negative log likelihood of the MLP is given by the negative
    # log likelihood of the output of the model, computed in the
    # logistic regression layer
    def negative_log_likelihood(self, Y):
        return self.lr_layer.negative_log_likelihood(Y)

    # same holds for the function computing the number of errors
    def errors(self, Y):
        return self.lr_layer.errors(Y)

    def fit(self, datasets, batch_size=50, n_epochs=400):
        train_x, train_y, valid_x, valid_y = datasets
        self.batch_size = batch_size

        # compute number of minibatches for training, validation and testing
        train_len = train_x.get_value(borrow=True).shape[0]
        valid_len = valid_x.get_value(borrow=True).shape[0]
        n_train_batches = train_len / batch_size
        if train_len % batch_size != 0:
            n_train_batches += 1
        n_valid_batches = valid_len / batch_size
        if valid_len % batch_size != 0:
            n_valid_batches += 1

        print 'number of training mini-batches: %s' % n_train_batches

        ######################
        # BUILD ACTUAL MODEL #
        ######################
        print '... building the model'

        # allocate symbolic variables for the data
        index = T.lscalar()  # index to a [mini]batch
        X = self.X
        Y = self.Y
        learn_rate = T.scalar('Learning Rate')

        # compute the gradient of cost with respect to theta (stored in params)
        # the resulting gradients will be stored in a list gparams
        gparams = [T.grad(self.cost, param) for param in self.params]

        # specify how to update the parameters of the model as a list of
        # (variable, update expression) pairs

        # given two lists of the same length, A = [a1, a2, a3, a4] and
        # B = [b1, b2, b3, b4], zip generates a list C of the same size, where each
        # element is a pair formed from the two lists :
        #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
        updates = [
            (param, param - learn_rate * gparam)
            for param, gparam in zip(self.params, gparams)
        ]

        # compiling a Theano function `train_model` that returns the cost, but
        # in the same time updates the parameter of the model based on the rules
        # defined in `updates`
        train_model = theano.function(
            inputs=[index, learn_rate],
            outputs=self.cost,
            updates=updates,
            givens={
                X: train_x[index * batch_size: (index + 1) * batch_size],
                Y: train_y[index * batch_size: (index + 1) * batch_size]
            }
        )

        test_train_model = theano.function(
            inputs=[index],
            outputs=self.errors(Y),
            givens={
                X: train_x[index * batch_size: (index + 1) * batch_size],
                Y: train_y[index * batch_size: (index + 1) * batch_size]
            }
        )

        validate_model = theano.function(
            inputs=[index],
            outputs=self.errors(Y),
            givens={
                X: valid_x[index * batch_size:(index + 1) * batch_size],
                Y: valid_y[index * batch_size:(index + 1) * batch_size]
            }
        )

        ###############
        # TRAIN MODEL #
        ###############
        print '... training'

        # early-stopping parameters
        patience = 1000000  # look at this many examples regardless
        patience_increase = 2  # wait this much longer when a new best is
                               # found
        improvement_threshold = 0.9999  # a relative improvement of this much is
                                        # considered significant
        validation_frequency = min(n_train_batches, patience / 2)
                                      # go through this many
                                      # minibatches before checking the network
                                      # on the validation set; in this case we
                                      # check every epoch

        best_validation_loss = np.inf
        best_iter = 0
        test_score = 0.
        start_time = timeit.default_timer()

        done_looping = False
        last_cost = np.inf
        sys.stdout.flush()
        logger.info('number of epochs already trained: %s' % self.epoch)
        epoch = self.epoch
        while (epoch < n_epochs) and (not done_looping):
            epoch += 1
            avg_cost_list = []
            for minibatch_index in xrange(n_train_batches):

                minibatch_avg_cost = train_model(minibatch_index, self.learning_rate)
                avg_cost_list.append(minibatch_avg_cost)
                # print self.lr_layer.W.get_value()
                # iteration number
                iter = (epoch - 1) * n_train_batches + minibatch_index

                if (iter + 1) % validation_frequency == 0:
                    # print self.lr_layer.W.get_value()
                    # print self.lr_layer.b.get_value()

                    # train_losses = [test_train_model(i) for i in xrange(n_train_batches)]
                    # this_train_loss = np.mean(train_losses)


                    # # compute zero-one loss on validation set
                    # validation_losses = [validate_model(i) for i
                    #                      in xrange(n_valid_batches)]
                    # this_validation_loss = np.mean(validation_losses)
                    # train_all_precison, train_label_precision, train_label_recall = \
                    #     self.test(train_x, train_y.eval())
                    # this_train_loss = 1 - train_all_precison

                    valid_all_precision, valid_label_precision, valid_label_recall = \
                        self.test(valid_x, valid_y.eval())
                    this_validation_loss = 1 - valid_all_precision

                    avg_cost = np.mean(avg_cost_list)
                    if avg_cost >= last_cost:
                        self.learning_rate *= 0.95
                    last_cost = avg_cost


                    logger.info(
                        'epoch %i, learning rate: %f, avg_cost: %f, valid P: %f %%, valid_1_P: %s, valid_1_R: %s' %
                        (
                            epoch,
                            self.learning_rate,
                            avg_cost,
                            # (1 - this_train_loss) * 100,
                            (1 - this_validation_loss) * 100.,
                            # train_label_precision[1],
                            # train_label_recall[1],
                            valid_label_precision[1],
                            valid_label_recall[1]
                        )
                    )
                    sys.stdout.flush()

                    # if we got the best validation score until now
                    if this_validation_loss < best_validation_loss:
                        #improve patience if loss improvement is good enough
                        if (
                            this_validation_loss < best_validation_loss *
                            improvement_threshold
                        ):
                            # extend patience to patience_increase times the current iteration
                            patience = max(patience, iter * patience_increase)

                        best_validation_loss = this_validation_loss
                        best_iter = iter

                if patience <= iter:
                    done_looping = True
                    break
                self.epoch = epoch

        end_time = timeit.default_timer()
        logger.info(('Optimization complete. Best validation score of %f %% '
               'obtained at iteration %i') %
              (( 1 - best_validation_loss) * 100., best_iter + 1))
        logger.info('The code for file ' +
                              os.path.split(__file__)[1] +
                              ' ran for %.2fm' % ((end_time - start_time) / 60.))

    def save(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self, f, -1)
        logger.info('save model to path %s' % path)
        return None

    @classmethod
    def load(cls, path):
        with open(path, 'rb') as f:
            return pickle.load(f)

    def predict(self, shared_x, batch_size=None):
        if not batch_size:
            batch_size = self.batch_size
        shared_x_len = shared_x.get_value(borrow=True).shape[0]
        n_batches = shared_x_len / batch_size
        if shared_x_len % batch_size != 0:
            n_batches += 1

        index = T.lscalar()  # index to a [mini]batch
        X = self.X

        predict_model = theano.function(
            inputs=[index],
            outputs=self.lr_layer.y_pred,
            givens={
                X: shared_x[index * batch_size:(index + 1) * batch_size]
            }
        )
        pred_y = np.concatenate([predict_model(i) for i in range(n_batches)])
        return pred_y

    def test(self, shared_x, data_y, out_path=None):
        pred_y = self.predict(shared_x)
        if out_path:
            with codecs.open(out_path, 'wb') as f:
                f.writelines(['%s\t%s\n' % (x, y) for x, y in zip(data_y, pred_y)])
        return evaluate(data_y, pred_y)

    def test_from_file(self, path, out_path=None, encoding='utf-8'):
        data_x = []
        data_y = []
        with codecs.open(path, 'rb', encoding=encoding) as f:
            for i, line in enumerate(f):
                tokens = line.strip('\n').split('\t')
                if len(tokens) != 2:
                    raise ValueError('invalid line %s' % (i+1))
                label = int(tokens[0])
                sent = tokens[1]
                s = get_idx_from_sent(sent, self.word_idx_map)
                data_x.append(s)
                data_y.append(label)
        shared_x = theano.shared(
            value=np.asarray(data_x, dtype=theano.config.floatX),
            borrow=True
        )
        return self.test(shared_x, data_y, out_path=out_path)
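
# --- Illustrative usage sketch (assumptions noted below) ---------------------
# One plausible way to drive SentConv end to end. The embedding matrix, the
# vocabulary map, the toy dataset and the file names are all assumptions made
# for illustration; only the SentConv API calls themselves come from the class
# above (get_idx_from_sent and evaluate are assumed to come from the same
# repository).
import numpy as np
import theano

vocab_size, k = 10000, 300
wordvec = np.random.randn(vocab_size, k).astype(theano.config.floatX)
word_idx_map = {}                  # token -> row index into wordvec, built elsewhere

clf = SentConv(learning_rate=0.1, filter_hs=[3, 4, 5], filter_num=100,
               n_hidden=100, n_out=2, word_idx_map=word_idx_map,
               wordvec=wordvec, k=k)

# toy dataset: padded word-index matrices and int labels in shared variables
n_sent, max_len = 200, 40
train_x = theano.shared(np.zeros((n_sent, max_len), dtype=theano.config.floatX))
train_y = theano.shared(np.random.randint(0, 2, n_sent).astype('int32'))
datasets = (train_x, train_y, train_x, train_y)

clf.fit(datasets, batch_size=50, n_epochs=10)
clf.save('sent_conv.pkl')

# later: reload the model and score a tab-separated "label<TAB>sentence" file
clf = SentConv.load('sent_conv.pkl')
all_precision, label_precision, label_recall = clf.test_from_file('test.tsv')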