コード例 #1
0
def cva_6layer_dropout_mnist_60000(seed=0,
                                   dropout_flag=1,
                                   drop_inverses_flag=0,
                                   learning_rate=3e-4,
                                   predir=None,
                                   n_batch=144,
                                   dataset='mnist.pkl.gz',
                                   batch_size=500,
                                   nkerns=[20, 50],
                                   n_hidden=[500, 50]):
    """
    Implementation of convolutional VA
    """
    #cp->cd->cpd->cd->c
    nkerns = [32, 32, 64, 64, 64]
    drops = [1, 0, 1, 0, 0]
    #skerns=[5, 3, 3, 3, 3]
    #pools=[2, 1, 1, 2, 1]
    #modes=['same']*5
    n_hidden = [500, 50]
    drop_inverses = [
        1,
    ]
    # 28->12->12->5->5/5*5*64->500->50->500->5*5*64/5->5->12->12->28

    if dataset == 'mnist.pkl.gz':
        dim_input = (28, 28)
        colorImg = False

    logdir = 'results/supervised/cva/mnist/cva_6layer_mnist_60000' + str(
        nkerns) + str(n_hidden) + '_' + str(learning_rate) + '_'
    if predir is not None:
        logdir += 'pre_'
    if dropout_flag == 1:
        logdir += ('dropout_' + str(drops) + '_')
    if drop_inverses_flag == 1:
        logdir += ('inversedropout_' + str(drop_inverses) + '_')
    logdir += str(int(time.time())) + '/'

    if not os.path.exists(logdir): os.makedirs(logdir)
    print 'logdir:', logdir, 'predir', predir
    print 'cva_6layer_mnist_60000', nkerns, n_hidden, seed, drops, drop_inverses, dropout_flag, drop_inverses_flag
    with open(logdir + 'hook.txt', 'a') as f:
        print >> f, 'logdir:', logdir, 'predir', predir
        print >> f, 'cva_6layer_mnist_60000', nkerns, n_hidden, seed, drops, drop_inverses, dropout_flag, drop_inverses_flag

    datasets = datapy.load_data_gpu_60000(dataset, have_matrix=True)

    train_set_x, train_set_y, train_y_matrix = datasets[0]
    valid_set_x, valid_set_y, valid_y_matrix = datasets[1]
    test_set_x, test_set_y, test_y_matrix = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels
    random_z = T.matrix('random_z')

    drop = T.iscalar('drop')
    drop_inverse = T.iscalar('drop_inverse')

    activation = nonlinearity.relu

    rng = np.random.RandomState(seed)
    rng_share = theano.tensor.shared_randomstreams.RandomStreams(0)
    input_x = x.reshape((batch_size, 1, 28, 28))

    recg_layer = []
    cnn_output = []

    #1
    recg_layer.append(
        ConvMaxPool.ConvMaxPool(rng,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2),
                                border_mode='valid',
                                activation=activation))
    if drops[0] == 1:
        cnn_output.append(recg_layer[-1].drop_output(input=input_x,
                                                     drop=drop,
                                                     rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(input=input_x))

    #2
    recg_layer.append(
        ConvMaxPool.ConvMaxPool(rng,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 3, 3),
                                poolsize=(1, 1),
                                border_mode='same',
                                activation=activation))
    if drops[1] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))

    #3
    recg_layer.append(
        ConvMaxPool.ConvMaxPool(rng,
                                image_shape=(batch_size, nkerns[1], 12, 12),
                                filter_shape=(nkerns[2], nkerns[1], 3, 3),
                                poolsize=(2, 2),
                                border_mode='valid',
                                activation=activation))
    if drops[2] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))

    #4
    recg_layer.append(
        ConvMaxPool.ConvMaxPool(rng,
                                image_shape=(batch_size, nkerns[2], 5, 5),
                                filter_shape=(nkerns[3], nkerns[2], 3, 3),
                                poolsize=(1, 1),
                                border_mode='same',
                                activation=activation))
    if drops[3] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    #5
    recg_layer.append(
        ConvMaxPool.ConvMaxPool(rng,
                                image_shape=(batch_size, nkerns[3], 5, 5),
                                filter_shape=(nkerns[4], nkerns[3], 3, 3),
                                poolsize=(1, 1),
                                border_mode='same',
                                activation=activation))
    if drops[4] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))

    mlp_input_x = cnn_output[-1].flatten(2)

    activations = []

    #1
    recg_layer.append(
        FullyConnected.FullyConnected(rng=rng,
                                      n_in=5 * 5 * nkerns[-1],
                                      n_out=n_hidden[0],
                                      activation=activation))
    if drops[-1] == 1:
        activations.append(recg_layer[-1].drop_output(input=mlp_input_x,
                                                      drop=drop,
                                                      rng=rng_share))
    else:
        activations.append(recg_layer[-1].output(input=mlp_input_x))

    #stochastic layer
    recg_layer.append(
        GaussianHidden.GaussianHidden(rng=rng,
                                      input=activations[-1],
                                      n_in=n_hidden[0],
                                      n_out=n_hidden[1],
                                      activation=None))

    z = recg_layer[-1].sample_z(rng_share)

    gene_layer = []
    z_output = []
    random_z_output = []

    #1
    gene_layer.append(
        FullyConnected.FullyConnected(rng=rng,
                                      n_in=n_hidden[1],
                                      n_out=n_hidden[0],
                                      activation=activation))

    z_output.append(gene_layer[-1].output(input=z))
    random_z_output.append(gene_layer[-1].output(input=random_z))

    #2
    gene_layer.append(
        FullyConnected.FullyConnected(rng=rng,
                                      n_in=n_hidden[0],
                                      n_out=5 * 5 * nkerns[-1],
                                      activation=activation))

    if drop_inverses[0] == 1:
        z_output.append(gene_layer[-1].drop_output(input=z_output[-1],
                                                   drop=drop_inverse,
                                                   rng=rng_share))
        random_z_output.append(gene_layer[-1].drop_output(
            input=random_z_output[-1], drop=drop_inverse, rng=rng_share))
    else:
        z_output.append(gene_layer[-1].output(input=z_output[-1]))
        random_z_output.append(
            gene_layer[-1].output(input=random_z_output[-1]))

    input_z = z_output[-1].reshape((batch_size, nkerns[-1], 5, 5))
    input_random_z = random_z_output[-1].reshape((n_batch, nkerns[-1], 5, 5))

    #1
    gene_layer.append(
        UnpoolConvNon.UnpoolConvNon(rng,
                                    image_shape=(batch_size, nkerns[-1], 5, 5),
                                    filter_shape=(nkerns[-2], nkerns[-1], 3,
                                                  3),
                                    poolsize=(1, 1),
                                    border_mode='same',
                                    activation=activation))

    z_output.append(gene_layer[-1].output(input=input_z))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=input_random_z, n_batch=n_batch))

    #2
    gene_layer.append(
        UnpoolConvNon.UnpoolConvNon(rng,
                                    image_shape=(batch_size, nkerns[-2], 5, 5),
                                    filter_shape=(nkerns[-3], nkerns[-2], 3,
                                                  3),
                                    poolsize=(2, 2),
                                    border_mode='full',
                                    activation=activation))

    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch))

    #3
    gene_layer.append(
        UnpoolConvNon.UnpoolConvNon(rng,
                                    image_shape=(batch_size, nkerns[-3], 12,
                                                 12),
                                    filter_shape=(nkerns[-4], nkerns[-3], 3,
                                                  3),
                                    poolsize=(1, 1),
                                    border_mode='same',
                                    activation=activation))

    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch))

    #4
    gene_layer.append(
        UnpoolConvNon.UnpoolConvNon(rng,
                                    image_shape=(batch_size, nkerns[-4], 12,
                                                 12),
                                    filter_shape=(nkerns[-5], nkerns[-4], 3,
                                                  3),
                                    poolsize=(1, 1),
                                    border_mode='same',
                                    activation=activation))

    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch))

    #5 stochastic layer
    # for the last layer, the nonliearity should be sigmoid to achieve mean of Bernoulli
    gene_layer.append(
        UnpoolConvNon.UnpoolConvNon(rng,
                                    image_shape=(batch_size, nkerns[-5], 12,
                                                 12),
                                    filter_shape=(1, nkerns[-5], 5, 5),
                                    poolsize=(2, 2),
                                    border_mode='full',
                                    activation=nonlinearity.sigmoid))

    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch))

    gene_layer.append(
        NoParamsBernoulliVisiable.NoParamsBernoulliVisiable(
            #rng=rng,
            #mean=z_output[-1],
            #data=input_x,
        ))
    logpx = gene_layer[-1].logpx(mean=z_output[-1], data=input_x)

    # 4-D tensor of random generation
    random_x_mean = random_z_output[-1]
    random_x = gene_layer[-1].sample_x(rng_share, random_x_mean)

    #L = (logpx + logpz - logqz).sum()
    cost = ((logpx + recg_layer[-1].logpz - recg_layer[-1].logqz).sum())

    px = (logpx.sum())
    pz = (recg_layer[-1].logpz.sum())
    qz = (-recg_layer[-1].logqz.sum())

    params = []
    for g in gene_layer:
        params += g.params
    for r in recg_layer:
        params += r.params
    gparams = [T.grad(cost, param) for param in params]

    weight_decay = 1.0 / n_train_batches
    l_r = theano.shared(np.asarray(learning_rate, dtype=np.float32))
    #get_optimizer = optimizer.get_adam_optimizer(learning_rate=learning_rate)
    get_optimizer = optimizer.get_adam_optimizer_max(learning_rate=l_r,
                                                     decay1=0.1,
                                                     decay2=0.001,
                                                     weight_decay=weight_decay,
                                                     epsilon=1e-8)
    with open(logdir + 'hook.txt', 'a') as f:
        print >> f, 'AdaM', learning_rate, weight_decay
    updates = get_optimizer(params, gparams)

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=cost,
        #outputs=layer[-1].errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            #y: test_set_y[index * batch_size:(index + 1) * batch_size],
            #y_matrix: test_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=cost,
        #outputs=layer[-1].errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            #y: valid_set_y[index * batch_size:(index + 1) * batch_size],
            #y_matrix: valid_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        })
    '''
    Save parameters and activations
    '''

    parameters = theano.function(
        inputs=[],
        outputs=params,
    )

    train_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #drop_inverse: np.cast['int32'](0)
            #y: train_set_y[index * batch_size: (index + 1) * batch_size]
        })

    valid_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #drop_inverse: np.cast['int32'](0)
            #y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        })

    test_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #drop_inverse: np.cast['int32'](0)
            #y: test_set_y[index * batch_size: (index + 1) * batch_size]
        })

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`

    debug_model = theano.function(
        inputs=[index],
        outputs=[cost, px, pz, qz],
        #updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            #y: train_set_y[index * batch_size: (index + 1) * batch_size],
            #y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](dropout_flag),
            drop_inverse: np.cast['int32'](drop_inverses_flag)
        })

    random_generation = theano.function(
        inputs=[random_z],
        outputs=[random_x_mean.flatten(2),
                 random_x.flatten(2)],
        givens={
            #drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        })

    train_bound_without_dropout = theano.function(
        inputs=[index],
        outputs=cost,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            #y: train_set_y[index * batch_size: (index + 1) * batch_size],
            #y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        })

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            #y: train_set_y[index * batch_size: (index + 1) * batch_size],
            #y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](dropout_flag),
            drop_inverse: np.cast['int32'](drop_inverses_flag)
        })

    ##################
    # Pretrain MODEL #
    ##################
    if predir is not None:
        color.printBlue('... setting parameters')
        color.printBlue(predir)
        pre_train = np.load(predir + 'model.npz')
        pre_train = pre_train['model']
        for (para, pre) in zip(params, pre_train):
            para.set_value(pre)
        tmp = [debug_model(i) for i in xrange(n_train_batches)]
        tmp = (np.asarray(tmp)).mean(axis=0) / float(batch_size)
        print '------------------', tmp

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_bound = -1000000.0
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    NaN_count = 0
    epoch = 0
    threshold = 0
    validation_frequency = 1
    generatition_frequency = 10
    if predir is not None:
        threshold = 0
    color.printRed('threshold, ' + str(threshold) +
                   ' generatition_frequency, ' + str(generatition_frequency) +
                   ' validation_frequency, ' + str(validation_frequency))
    done_looping = False
    n_epochs = 600
    decay_epochs = 500
    '''
    print 'test initialization...'
    pre_model = parameters()
    for i in xrange(len(pre_model)):
        pre_model[i] = np.asarray(pre_model[i])
        print pre_model[i].shape, np.mean(pre_model[i]), np.var(pre_model[i])
    print 'end test...'
    '''
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        minibatch_avg_cost = 0

        tmp_start1 = time.clock()

        test_epoch = epoch - decay_epochs
        if test_epoch > 0 and test_epoch % 10 == 0:
            print l_r.get_value()
            with open(logdir + 'hook.txt', 'a') as f:
                print >> f, l_r.get_value()
            l_r.set_value(np.cast['float32'](l_r.get_value() / 3.0))

        for minibatch_index in xrange(n_train_batches):
            #print minibatch_index
            '''
            color.printRed('lalala')
            xxx = dims(minibatch_index)
            print xxx.shape
            '''
            #print n_train_batches
            minibatch_avg_cost += train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

        if math.isnan(minibatch_avg_cost):
            NaN_count += 1
            color.printRed("NaN detected. Reverting to saved best parameters")
            print '---------------NaN_count:', NaN_count
            with open(logdir + 'hook.txt', 'a') as f:
                print >> f, '---------------NaN_count:', NaN_count

            tmp = [debug_model(i) for i in xrange(n_train_batches)]
            tmp = (np.asarray(tmp)).mean(axis=0) / float(batch_size)
            print '------------------NaN check:', tmp
            with open(logdir + 'hook.txt', 'a') as f:
                print >> f, '------------------NaN check:', tmp

            model = parameters()
            for i in xrange(len(model)):
                model[i] = np.asarray(model[i]).astype(np.float32)
                print model[i].shape, np.mean(model[i]), np.var(model[i])
                print np.max(model[i]), np.min(model[i])
                print np.all(np.isfinite(model[i])), np.any(np.isnan(model[i]))
                with open(logdir + 'hook.txt', 'a') as f:
                    print >> f, model[i].shape, np.mean(model[i]), np.var(
                        model[i])
                    print >> f, np.max(model[i]), np.min(model[i])
                    print >> f, np.all(np.isfinite(model[i])), np.any(
                        np.isnan(model[i]))

            best_before = np.load(logdir + 'model.npz')
            best_before = best_before['model']
            for (para, pre) in zip(params, best_before):
                para.set_value(pre)
            tmp = [debug_model(i) for i in xrange(n_train_batches)]
            tmp = (np.asarray(tmp)).mean(axis=0) / float(batch_size)
            print '------------------', tmp
            return

        #print 'optimization_time', time.clock() - tmp_start1
        print epoch, 'stochastic training error', minibatch_avg_cost / float(
            n_train_batches * batch_size)
        with open(logdir + 'hook.txt', 'a') as f:
            print >> f, epoch, 'stochastic training error', minibatch_avg_cost / float(
                n_train_batches * batch_size)

        if epoch % validation_frequency == 0:
            tmp_start2 = time.clock()

            test_losses = [test_model(i) for i in xrange(n_test_batches)]
            this_test_bound = np.mean(test_losses) / float(batch_size)

            #tmp =  [debug_model(i) for i
            #                     in xrange(n_train_batches)]
            #tmp = (np.asarray(tmp)).mean(axis=0) / float(batch_size)

            print epoch, 'test bound', this_test_bound
            #print tmp
            with open(logdir + 'hook.txt', 'a') as f:
                print >> f, epoch, 'test bound', this_test_bound

        if epoch % 100 == 0:

            model = parameters()
            for i in xrange(len(model)):
                model[i] = np.asarray(model[i]).astype(np.float32)
            np.savez(logdir + 'model-' + str(epoch), model=model)

            for i in xrange(n_train_batches):
                if i == 0:
                    train_features = np.asarray(train_activations(i))
                else:
                    train_features = np.vstack(
                        (train_features, np.asarray(train_activations(i))))

            for i in xrange(n_valid_batches):
                if i == 0:
                    valid_features = np.asarray(valid_activations(i))
                else:
                    valid_features = np.vstack(
                        (valid_features, np.asarray(valid_activations(i))))

            for i in xrange(n_test_batches):
                if i == 0:
                    test_features = np.asarray(test_activations(i))
                else:
                    test_features = np.vstack(
                        (test_features, np.asarray(test_activations(i))))
            np.save(logdir + 'train_features', train_features)
            np.save(logdir + 'valid_features', valid_features)
            np.save(logdir + 'test_features', test_features)

        tmp_start4 = time.clock()
        if epoch % generatition_frequency == 0:
            tail = '-' + str(epoch) + '.png'
            random_z = np.random.standard_normal(
                (n_batch, n_hidden[-1])).astype(np.float32)
            _x_mean, _x = random_generation(random_z)
            #print _x.shape
            #print _x_mean.shape
            image = paramgraphics.mat_to_img(_x.T,
                                             dim_input,
                                             colorImg=colorImg)
            image.save(logdir + 'samples' + tail, 'PNG')
            image = paramgraphics.mat_to_img(_x_mean.T,
                                             dim_input,
                                             colorImg=colorImg)
            image.save(logdir + 'mean_samples' + tail, 'PNG')
        #print 'generation_time', time.clock() - tmp_start4

    end_time = time.clock()
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    if NaN_count > 0:
        print '---------------NaN_count:', NaN_count
        with open(logdir + 'hook.txt', 'a') as f:
            print >> f, '---------------NaN_count:', NaN_count
コード例 #2
0
def cmmva_6layer_dropout_mnist_60000(seed=0, start_layer=0, end_layer=1, dropout_flag=1, drop_inverses_flag=0, learning_rate=3e-5, predir=None, n_batch=144,
             dataset='mnist.pkl.gz', batch_size=500, nkerns=[20, 50], n_hidden=[500, 50]):

    """
    Implementation of convolutional MMVA
    """    
    #cp->cd->cpd->cd->c
    nkerns=[32, 32, 64, 64, 64]
    drops=[1, 0, 1, 0, 0, 1]
    #skerns=[5, 3, 3, 3, 3]
    #pools=[2, 1, 1, 2, 1]
    #modes=['same']*5
    n_hidden=[500, 50]
    drop_inverses=[1,]
    # 28->12->12->5->5/5*5*64->500->50->500->5*5*64/5->5->12->12->28
    
    if dataset=='mnist.pkl.gz':
        dim_input=(28, 28)
        colorImg=False
    D = 1.0
    C = 1.0
    if os.environ.has_key('C'):
        C = np.cast['float32'](float((os.environ['C'])))
    if os.environ.has_key('D'):
        D = np.cast['float32'](float((os.environ['D'])))
    color.printRed('D '+str(D)+' C '+str(C))

    logdir = 'results/supervised/cmmva/mnist/cmmva_6layer_60000_'+str(nkerns)+str(n_hidden)+'_D_'+str(D)+'_C_'+str(C)+'_'+str(learning_rate)+'_'
    if predir is not None:
        logdir +='pre_'
    if dropout_flag == 1:
        logdir += ('dropout_'+str(drops)+'_')
    if drop_inverses_flag==1:
        logdir += ('inversedropout_'+str(drop_inverses)+'_')
    logdir += str(int(time.time()))+'/'

    if not os.path.exists(logdir): os.makedirs(logdir)
    print 'logdir:', logdir, 'predir', predir
    print 'cmmva_6layer_mnist_60000', nkerns, n_hidden, seed, drops, drop_inverses, dropout_flag, drop_inverses_flag
    with open(logdir+'hook.txt', 'a') as f:
        print >>f, 'logdir:', logdir, 'predir', predir
        print >>f, 'cmmva_6layer_mnist_60000', nkerns, n_hidden, seed, drops, drop_inverses, dropout_flag, drop_inverses_flag

    datasets = datapy.load_data_gpu_60000(dataset, have_matrix=True)

    train_set_x, train_set_y, train_y_matrix = datasets[0]
    valid_set_x, valid_set_y, valid_y_matrix = datasets[1]
    test_set_x, test_set_y, test_y_matrix = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels
    y_matrix = T.imatrix('y_matrix')
    random_z = T.matrix('random_z')

    drop = T.iscalar('drop')
    drop_inverse = T.iscalar('drop_inverse')
    
    activation = nonlinearity.relu

    rng = np.random.RandomState(seed)
    rng_share = theano.tensor.shared_randomstreams.RandomStreams(0)
    input_x = x.reshape((batch_size, 1, 28, 28))
    
    recg_layer = []
    cnn_output = []

    #1
    recg_layer.append(ConvMaxPool.ConvMaxPool(
            rng,
            image_shape=(batch_size, 1, 28, 28),
            filter_shape=(nkerns[0], 1, 5, 5),
            poolsize=(2, 2),
            border_mode='valid',
            activation=activation
        ))
    if drops[0]==1:
        cnn_output.append(recg_layer[-1].drop_output(input=input_x, drop=drop, rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(input=input_x))

    #2
    recg_layer.append(ConvMaxPool.ConvMaxPool(
        rng,
        image_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 3, 3),
        poolsize=(1, 1),
        border_mode='same', 
        activation=activation
    ))
    if drops[1]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    
    #3
    recg_layer.append(ConvMaxPool.ConvMaxPool(
        rng,
        image_shape=(batch_size, nkerns[1], 12, 12),
        filter_shape=(nkerns[2], nkerns[1], 3, 3),
        poolsize=(2, 2),
        border_mode='valid', 
        activation=activation
    ))
    if drops[2]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))

    #4
    recg_layer.append(ConvMaxPool.ConvMaxPool(
        rng,
        image_shape=(batch_size, nkerns[2], 5, 5),
        filter_shape=(nkerns[3], nkerns[2], 3, 3),
        poolsize=(1, 1),
        border_mode='same', 
        activation=activation
    ))
    if drops[3]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    #5
    recg_layer.append(ConvMaxPool.ConvMaxPool(
        rng,
        image_shape=(batch_size, nkerns[3], 5, 5),
        filter_shape=(nkerns[4], nkerns[3], 3, 3),
        poolsize=(1, 1),
        border_mode='same', 
        activation=activation
    ))
    if drops[4]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
   
    mlp_input_x = cnn_output[-1].flatten(2)

    activations = []

    #1
    recg_layer.append(FullyConnected.FullyConnected(
            rng=rng,
            n_in= 5 * 5 * nkerns[-1],
            n_out=n_hidden[0],
            activation=activation
        ))
    if drops[-1]==1:
        activations.append(recg_layer[-1].drop_output(input=mlp_input_x, drop=drop, rng=rng_share))
    else:
        activations.append(recg_layer[-1].output(input=mlp_input_x))
    
    features = T.concatenate(activations[start_layer:end_layer], axis=1)
    color.printRed('feature dimension: '+str(np.sum(n_hidden[start_layer:end_layer])))
    
    classifier = Pegasos.Pegasos(
            input= features,
            rng=rng,
            n_in=np.sum(n_hidden[start_layer:end_layer]),
            n_out=10,
            weight_decay=0,
            loss=1,
            std=1e-2
        )

    recg_layer.append(GaussianHidden.GaussianHidden(
            rng=rng,
            input=activations[-1],
            n_in=n_hidden[0],
            n_out = n_hidden[1],
            activation=None
        ))

    z = recg_layer[-1].sample_z(rng_share)


    gene_layer = []
    z_output = []
    random_z_output = []

    #1
    gene_layer.append(FullyConnected.FullyConnected(
            rng=rng,
            n_in=n_hidden[1],
            n_out = n_hidden[0],
            activation=activation
        ))
    
    z_output.append(gene_layer[-1].output(input=z))
    random_z_output.append(gene_layer[-1].output(input=random_z))

    #2
    gene_layer.append(FullyConnected.FullyConnected(
            rng=rng,
            n_in=n_hidden[0],
            n_out = 5*5*nkerns[-1],
            activation=activation
        ))

    if drop_inverses[0]==1:
        z_output.append(gene_layer[-1].drop_output(input=z_output[-1], drop=drop_inverse, rng=rng_share))
        random_z_output.append(gene_layer[-1].drop_output(input=random_z_output[-1], drop=drop_inverse, rng=rng_share))
    else:
        z_output.append(gene_layer[-1].output(input=z_output[-1]))
        random_z_output.append(gene_layer[-1].output(input=random_z_output[-1]))

    input_z = z_output[-1].reshape((batch_size, nkerns[-1], 5, 5))
    input_random_z = random_z_output[-1].reshape((n_batch, nkerns[-1], 5, 5))

    #1
    gene_layer.append(UnpoolConvNon.UnpoolConvNon(
            rng,
            image_shape=(batch_size, nkerns[-1], 5, 5),
            filter_shape=(nkerns[-2], nkerns[-1], 3, 3),
            poolsize=(1, 1),
            border_mode='same', 
            activation=activation
        ))
    
    z_output.append(gene_layer[-1].output(input=input_z))
    random_z_output.append(gene_layer[-1].output_random_generation(input=input_random_z, n_batch=n_batch))
    
    #2
    gene_layer.append(UnpoolConvNon.UnpoolConvNon(
            rng,
            image_shape=(batch_size, nkerns[-2], 5, 5),
            filter_shape=(nkerns[-3], nkerns[-2], 3, 3),
            poolsize=(2, 2),
            border_mode='full', 
            activation=activation
        ))
    
    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(input=random_z_output[-1], n_batch=n_batch))

    #3
    gene_layer.append(UnpoolConvNon.UnpoolConvNon(
            rng,
            image_shape=(batch_size, nkerns[-3], 12, 12),
            filter_shape=(nkerns[-4], nkerns[-3], 3, 3),
            poolsize=(1, 1),
            border_mode='same', 
            activation=activation
        ))
    
    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(input=random_z_output[-1], n_batch=n_batch))

    #4
    gene_layer.append(UnpoolConvNon.UnpoolConvNon(
            rng,
            image_shape=(batch_size, nkerns[-4], 12, 12),
            filter_shape=(nkerns[-5], nkerns[-4], 3, 3),
            poolsize=(1, 1),
            border_mode='same', 
            activation=activation
        ))
    
    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(input=random_z_output[-1], n_batch=n_batch))

    #5 stochastic layer 
    # for the last layer, the nonliearity should be sigmoid to achieve mean of Bernoulli
    gene_layer.append(UnpoolConvNon.UnpoolConvNon(
            rng,
            image_shape=(batch_size, nkerns[-5], 12, 12),
            filter_shape=(1, nkerns[-5], 5, 5),
            poolsize=(2, 2),
            border_mode='full', 
            activation=nonlinearity.sigmoid
        ))

    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(input=random_z_output[-1], n_batch=n_batch))
   
    gene_layer.append(NoParamsBernoulliVisiable.NoParamsBernoulliVisiable(
            #rng=rng,
            #mean=z_output[-1],
            #data=input_x,
        ))
    logpx = gene_layer[-1].logpx(mean=z_output[-1], data=input_x)


    # 4-D tensor of random generation
    random_x_mean = random_z_output[-1]
    random_x = gene_layer[-1].sample_x(rng_share, random_x_mean)

    #L = (logpx + logpz - logqz).sum()
    lowerbound = (
        (logpx + recg_layer[-1].logpz - recg_layer[-1].logqz).sum()
    )

    hinge_loss = classifier.hinge_loss(10, y, y_matrix) * batch_size

    #
    # D is redundent, you could just set D = 1 and tune C and weight decay parameters
    # beacuse AdaM is scale-invariant
    #
    cost = D * lowerbound - C * hinge_loss #- classifier.L2_reg
    
    px = (logpx.sum())
    pz = (recg_layer[-1].logpz.sum())
    qz = (- recg_layer[-1].logqz.sum())

    params=[]
    for g in gene_layer:
        params+=g.params
    for r in recg_layer:
        params+=r.params
    params+=classifier.params
    gparams = [T.grad(cost, param) for param in params]

    weight_decay=1.0/n_train_batches
    epsilon=1e-8
    
    #get_optimizer = optimizer.get_adam_optimizer(learning_rate=learning_rate)
    l_r = theano.shared(np.asarray(learning_rate, dtype=np.float32))
    get_optimizer = optimizer.get_adam_optimizer_max(learning_rate=l_r, 
        decay1=0.1, decay2=0.001, weight_decay=weight_decay, epsilon=epsilon)
    with open(logdir+'hook.txt', 'a') as f:
        print >>f, 'AdaM', learning_rate, weight_decay, epsilon
    updates = get_optimizer(params,gparams)

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=[classifier.errors(y), lowerbound, hinge_loss, cost],
        #outputs=layer[-1].errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size],
            y_matrix: test_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=[classifier.errors(y), lowerbound, hinge_loss, cost],
        #outputs=layer[-1].errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size],
            y_matrix: valid_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        }
    )

    
    '''
    Save parameters and activations
    '''

    parameters = theano.function(
        inputs=[],
        outputs=params,
    )

    train_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #drop_inverse: np.cast['int32'](0)
            #y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    
    valid_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #drop_inverse: np.cast['int32'](0)
            #y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    test_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #drop_inverse: np.cast['int32'](0)
            #y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`

    debug_model = theano.function(
        inputs=[index],
        outputs=[classifier.errors(y), lowerbound, px, pz, qz, hinge_loss, cost],
        #updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
            y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](dropout_flag),
            drop_inverse: np.cast['int32'](drop_inverses_flag)
        }
    )

    random_generation = theano.function(
        inputs=[random_z],
        outputs=[random_x_mean.flatten(2), random_x.flatten(2)],
        givens={
            #drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        }
    )
    
    train_bound_without_dropout = theano.function(
        inputs=[index],
        outputs=[classifier.errors(y), lowerbound, hinge_loss, cost],
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
            y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        }
    )

    train_model = theano.function(
        inputs=[index],
        outputs=[classifier.errors(y), lowerbound, hinge_loss, cost],
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
            y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](dropout_flag),
            drop_inverse: np.cast['int32'](drop_inverses_flag)
        }
    )
    # end-snippet-5

    ##################
    # Pretrain MODEL #
    ##################
    if predir is not None:
        color.printBlue('... setting parameters')
        color.printBlue(predir)
        pre_train = np.load(predir+'model.npz')
        pre_train = pre_train['model']
        # params include w and b, exclude it
        for (para, pre) in zip(params[:-2], pre_train):
            #print pre.shape
            para.set_value(pre)
        tmp =  [debug_model(i) for i in xrange(n_train_batches)]
        tmp = (np.asarray(tmp)).mean(axis=0) / float(batch_size)
        print '------------------', tmp[1:5]
    
    # valid_error test_error  epochs
    predy_test_stats = [1, 1, 0]
    predy_valid_stats = [1, 1, 0]

    best_validation_bound = -1000000.0
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    NaN_count = 0
    epoch = 0
    threshold = 0
    validation_frequency = 1
    generatition_frequency = 10
    if predir is not None:
        threshold = 0
    color.printRed('threshold, '+str(threshold) + 
        ' generatition_frequency, '+str(generatition_frequency)
        +' validation_frequency, '+str(validation_frequency))
    done_looping = False
    decay_epochs=500
    n_epochs=600

    '''
    print 'test initialization...'
    pre_model = parameters()
    for i in xrange(len(pre_model)):
        pre_model[i] = np.asarray(pre_model[i])
        print pre_model[i].shape, np.mean(pre_model[i]), np.var(pre_model[i])
    print 'end test...'
    '''
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        train_error = 0
        train_lowerbound = 0
        train_hinge_loss = 0
        train_obj = 0
        
        test_epoch = epoch - decay_epochs
        if test_epoch > 0 and test_epoch % 10 == 0:
            print l_r.get_value()
            with open(logdir+'hook.txt', 'a') as f:
                print >>f,l_r.get_value()
            l_r.set_value(np.cast['float32'](l_r.get_value()/3.0))

        tmp_start1 = time.clock()
        for minibatch_index in xrange(n_train_batches):
            #print n_train_batches
            e, l, h, o = train_model(minibatch_index)
            train_error += e
            train_lowerbound += l
            train_hinge_loss += h
            train_obj += o
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

        
        if math.isnan(train_lowerbound):
            NaN_count+=1
            color.printRed("NaN detected. Reverting to saved best parameters")
            print '---------------NaN_count:', NaN_count
            with open(logdir+'hook.txt', 'a') as f:
                print >>f, '---------------NaN_count:', NaN_count
            
            tmp =  [debug_model(i) for i in xrange(n_train_batches)]
            tmp = (np.asarray(tmp)).mean(axis=0) / float(batch_size)
            tmp[0]*=batch_size
            print '------------------NaN check:', tmp
            with open(logdir+'hook.txt', 'a') as f:
                print >>f, '------------------NaN check:', tmp

            model = parameters()
            for i in xrange(len(model)):
                model[i] = np.asarray(model[i]).astype(np.float32)
                print model[i].shape, np.mean(model[i]), np.var(model[i])
                print np.max(model[i]), np.min(model[i])
                print np.all(np.isfinite(model[i])), np.any(np.isnan(model[i]))
                with open(logdir+'hook.txt', 'a') as f:
                    print >>f, model[i].shape, np.mean(model[i]), np.var(model[i])
                    print >>f, np.max(model[i]), np.min(model[i])
                    print >>f, np.all(np.isfinite(model[i])), np.any(np.isnan(model[i]))

            best_before = np.load(logdir+'model.npz')
            best_before = best_before['model']
            for (para, pre) in zip(params, best_before):
                para.set_value(pre)
            tmp =  [debug_model(i) for i in xrange(n_train_batches)]
            tmp = (np.asarray(tmp)).mean(axis=0) / float(batch_size)
            tmp[0]*=batch_size
            print '------------------', tmp
            continue

        n_train=n_train_batches*batch_size
        #print 'optimization_time', time.clock() - tmp_start1
        print epoch, 'stochastic training error', train_error / float(batch_size), train_lowerbound / float(n_train), train_hinge_loss / float(n_train), train_obj / float(n_train)
        with open(logdir+'hook.txt', 'a') as f:
            print >>f, epoch, 'stochastic training error', train_error / float(batch_size), train_lowerbound / float(n_train), train_hinge_loss / float(n_train), train_obj / float(n_train)

        if epoch % validation_frequency == 0:
            tmp_start2 = time.clock()
            # compute zero-one loss on validation set
            #train_stats = [train_bound_without_dropout(i) for i
            #                     in xrange(n_train_batches)]
            #this_train_stats = np.mean(train_stats, axis=0)
            #this_train_stats[1:] = this_train_stats[1:]/ float(batch_size)

            test_stats = [test_model(i) for i in xrange(n_test_batches)]
            this_test_stats = np.mean(test_stats, axis=0)
            this_test_stats[1:] = this_test_stats[1:]/ float(batch_size)
            
            print epoch, 'test error', this_test_stats
            with open(logdir+'hook.txt', 'a') as f:
                print >>f, epoch, 'test error', this_test_stats

        if epoch%100==0:
            model = parameters()
            for i in xrange(len(model)):
                model[i] = np.asarray(model[i]).astype(np.float32)
                #print model[i].shape, np.mean(model[i]), np.var(model[i])
                            
            np.savez(logdir+'model-'+str(epoch), model=model)
                
        
        tmp_start4=time.clock()
        if epoch % generatition_frequency == 0:
            tail='-'+str(epoch)+'.png'
            random_z = np.random.standard_normal((n_batch, n_hidden[-1])).astype(np.float32)
            _x_mean, _x = random_generation(random_z)
            #print _x.shape
            #print _x_mean.shape
            image = paramgraphics.mat_to_img(_x.T, dim_input, colorImg=colorImg)
            image.save(logdir+'samples'+tail, 'PNG')
            image = paramgraphics.mat_to_img(_x_mean.T, dim_input, colorImg=colorImg)
            image.save(logdir+'mean_samples'+tail, 'PNG')
        #print 'generation_time', time.clock() - tmp_start4
        

    end_time = time.clock()
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    if NaN_count > 0:
        print '---------------NaN_count:', NaN_count
        with open(logdir+'hook.txt', 'a') as f:
            print >>f, '---------------NaN_count:', NaN_count
コード例 #3
0
def cmmd(dataset='mnist.pkl.gz',
         batch_size=100,
         layer_num=3,
         hidden_dim=5,
         seed=0,
         layer_size=[64, 256, 256, 512]):

    validation_frequency = 1
    test_frequency = 1
    pre_train = 1

    dim_input = (28, 28)
    colorImg = False

    print "Loading data ......."
    #datasets = datapy.load_data_gpu_60000_with_noise(dataset, have_matrix = True)
    datasets = datapy.load_data_gpu_60000(dataset, have_matrix=True)
    train_set_x, train_set_y, train_y_matrix = datasets[0]
    valid_set_x, valid_set_y, valid_y_matrix = datasets[1]
    test_set_x, test_set_y, test_y_matrix = datasets[2]

    rng = np.random.RandomState(seed)
    rng_share = theano.tensor.shared_randomstreams.RandomStreams(0)

    n_train_batches = train_set_x.get_value().shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    aImage = paramgraphics.mat_to_img(train_set_x.get_value()[0:169].T,
                                      dim_input,
                                      colorImg=colorImg)
    aImage.save('mnist_sample', 'PNG')

    ################################
    ##        build model         ##
    ################################
    print "Building model ......."

    index = T.lscalar()
    x = T.matrix('x')  ##### batch_size * 28^2
    y = T.vector('y')
    y_matrix = T.matrix('y_matrix')
    random_z = T.matrix('random_z')  ### batch_size * hidden_dim
    Inv_K_d = T.matrix('Inv_K_d')

    layers = []
    layer_output = []

    activation = nonlinearity.relu
    #activation = Tnn.sigmoid
    #### first layer
    layers.append(
        FullyConnected.FullyConnected(
            rng=rng,
            n_in=10 + hidden_dim,
            #n_in = 10,
            n_out=layer_size[0],
            activation=activation))
    layer_output.append(layers[-1].output_mix(input=[y_matrix, random_z]))
    #layer_output.append(layers[-1].output_mix2(input=[y_matrix,random_z]))
    #layer_output.append(layers[-1].output(input=x))
    #layer_output.append(layers[-1].output(input=random_z))

    #### middle layer
    for i in range(layer_num):
        layers.append(
            FullyConnected.FullyConnected(rng=rng,
                                          n_in=layer_size[i],
                                          n_out=layer_size[i + 1],
                                          activation=activation))
        layer_output.append(layers[-1].output(input=layer_output[-1]))

    #### last layer
    activation = Tnn.sigmoid
    #activation = nonlinearity.relu
    layers.append(
        FullyConnected.FullyConnected(rng=rng,
                                      n_in=layer_size[-1],
                                      n_out=28 * 28,
                                      activation=activation))
    x_gen = layers[-1].output(input=layer_output[-1])

    lambda1_ = 100
    lambda_ = theano.shared(np.asarray(lambda1_, dtype=np.float32))

    K_d = kernel_gram_for_y(y_matrix, y_matrix, batch_size, 10)
    K_s = K_d
    K_sd = K_d

    Invv_1 = T.sum(y_matrix, axis=0) / batch_size
    Invv = NL.alloc_diag(1 / Invv_1)
    Inv_K_d = Invv
    #Inv_K_d = NL.matrix_inverse(K_d +lambda_ * T.identity_like(K_d))
    Inv_K_s = Inv_K_d

    L_d = kernel_gram_for_x(x, x, batch_size, 28 * 28)
    L_s = kernel_gram_for_x(x_gen, x_gen, batch_size, 28 * 28)
    L_ds = kernel_gram_for_x(x, x_gen, batch_size, 28 * 28)
    '''
	cost = -(NL.trace(T.dot(T.dot(T.dot(K_d, Inv_K_d), L_d), Inv_K_d)) +\
			NL.trace(T.dot(T.dot(T.dot(K_s, Inv_K_s), L_s),Inv_K_s))- \
			2 * NL.trace(T.dot(T.dot(T.dot(K_sd, Inv_K_d) ,L_ds ), Inv_K_s)))
	'''
    '''
	cost = -(NL.trace(T.dot(L_d, T.ones_like(L_d) )) +\
			NL.trace(T.dot(L_s,T.ones_like(L_s)))- \
			2 * NL.trace(T.dot(L_ds,T.ones_like(L_ds) )))


	cost2 =  2 * T.sum(L_ds) - T.sum(L_s)  + NL.trace(T.dot(L_s, T.ones_like(L_s)))\
			- 2 * NL.trace( T.dot(L_ds , T.ones_like(L_ds)))
	cost2 = T.dot(T.dot(Inv_K_d, K_d),Inv_K_d)
	'''
    cost2 = K_d
    #cost2 = T.dot(T.dot(Inv_K_d,K_d),Inv_K_d)
    #cost =  - T.sum(L_d) +2 * T.sum(L_ds) - T.sum(L_s)
    cost2 = K_d
    cost2 = T.dot(T.dot(T.dot(y_matrix, Inv_K_d), Inv_K_d), y_matrix.T)

    cost = -(NL.trace(T.dot(T.dot(T.dot(T.dot(L_d, y_matrix),Inv_K_d), Inv_K_d),y_matrix.T)) +\
      NL.trace(T.dot(T.dot(T.dot(T.dot(L_s, y_matrix),Inv_K_s), Inv_K_s),y_matrix.T))- \
      2 * NL.trace(T.dot(T.dot(T.dot(T.dot(L_ds, y_matrix),Inv_K_d), Inv_K_s),y_matrix.T)))
    '''
	cost =  - T.sum(L_d) +2 * T.sum(L_ds) - T.sum(L_s)
	cost =  - NL.trace(K_s * Inv_K_s * L_s * Inv_K_s)+ \
			2 * NL.trace(K_sd * Inv_K_d * L_ds * Inv_K_s)
	'''

    ################################
    ##        updates             ##
    ################################
    params = []
    for aLayer in layers:
        params += aLayer.params
    gparams = [T.grad(cost, param) for param in params]

    learning_rate = 3e-4
    weight_decay = 1.0 / n_train_batches
    epsilon = 1e-8

    l_r = theano.shared(np.asarray(learning_rate, dtype=np.float32))
    get_optimizer = optimizer.get_adam_optimizer_max(learning_rate=l_r,
                                                     decay1=0.1,
                                                     decay2=0.001,
                                                     weight_decay=weight_decay,
                                                     epsilon=epsilon)
    updates = get_optimizer(params, gparams)

    ################################
    ##         pretrain model     ##
    ################################
    parameters = theano.function(
        inputs=[],
        outputs=params,
    )

    gen_fig = theano.function(
        inputs=[y_matrix, random_z],
        outputs=x_gen,
        on_unused_input='warn',
    )

    if pre_train == 1:
        print "pre-training model....."
        pre_train = np.load('./result/MMD-100-5-64-256-256-512.npz')['model']
        for (para, pre) in zip(params, pre_train):
            para.set_value(pre)

        s = 8
        for jj in range(10):
            a = np.zeros((s, 10), dtype=np.float32)
            for ii in range(s):
                kk = random.randint(0, 9)
                a[ii, kk] = 1

            x_gen = gen_fig(a, gen_random_z(s, hidden_dim))

            ttt = train_set_x.get_value()
            for ll in range(s):
                minn = 1000000
                ss = 0
                for kk in range(ttt.shape[0]):
                    tt = np.linalg.norm(x_gen[ll] - ttt[kk])
                    if tt < minn:
                        minn = tt
                        ss = kk
                #np.concatenate(x_gen,ttt[ss])
                x_gen = np.vstack((x_gen, ttt[ss]))

            aImage = paramgraphics.mat_to_img(x_gen.T,
                                              dim_input,
                                              colorImg=colorImg)
            aImage.save('samples_' + str(jj) + '_similar', 'PNG')

    ################################
    ##         prepare data       ##
    ################################

    #### compute matrix inverse
    #print "Preparing data ...."
    #Invv = NL.matrix_inverse(K_d +lambda_ * T.identity_like(K_d))
    '''
	Invv_1 = T.sum(y_matrix,axis=0)/batch_size
	Invv = NL.alloc_diag(1/Invv_1)
	Inv_K_d = Invv

	prepare_data = theano.function(
			inputs = [index],
			outputs = [Invv,K_d],
			givens = {
				#x:train_set_x[index * batch_size:(index + 1) * batch_size],
				y_matrix:train_y_matrix[index * batch_size:(index + 1) * batch_size],
				}
			)

	Inv_K_d_l, K_d_l =  prepare_data(0)
	print Inv_K_d_l

	for minibatch_index in range(1, n_train_batches):
		if minibatch_index % 10 == 0:
			print 'minibatch_index:', minibatch_index
		Inv_pre_mini, K_d_pre_mini = prepare_data(minibatch_index)
		Inv_K_d_l = np.vstack((Inv_K_d_l,Inv_pre_mini))
		K_d_l = np.vstack((K_d_l,K_d_pre_mini))

	Inv_K_d_g = theano.shared(Inv_K_d_l,borrow=True)
	K_d_g = theano.shared(K_d_l, borrow=True)
	'''

    ################################
    ##         train model        ##
    ################################

    train_model = theano.function(
        inputs=[index, random_z],
        outputs=[cost, x_gen, cost2],
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size],
            y_matrix:
            train_y_matrix[index * batch_size:(index + 1) * batch_size],
            #K_d:K_d_g[index * batch_size:(index + 1) * batch_size],
            #Inv_K_d:Inv_K_d_g[index * batch_size:(index + 1) * batch_size],
        },
        on_unused_input='warn')

    n_epochs = 500
    cur_epoch = 0

    print "Training model ......"

    while (cur_epoch < n_epochs):
        cur_epoch = cur_epoch + 1
        cor = 0
        for minibatch_index in xrange(n_train_batches):
            print minibatch_index,
            print " : ",
            cost, x_gen, cost2 = train_model(
                minibatch_index, gen_random_z(batch_size, hidden_dim))
            print 'cost: ', cost
            print 'cost2: ', cost2
            if minibatch_index % 30 == 0:
                aImage = paramgraphics.mat_to_img(x_gen[0:1].T,
                                                  dim_input,
                                                  colorImg=colorImg)
                aImage.save(
                    'samples_epoch_' + str(cur_epoch) + '_mini_' +
                    str(minibatch_index), 'PNG')

        if cur_epoch % 1 == 0:
            model = parameters()
            for i in range(len(model)):
                model[i] = np.asarray(model[i]).astype(np.float32)
            np.savez('model-' + str(cur_epoch), model=model)
コード例 #4
0
def cva_6layer_dropout_mnist_60000(seed=0, dropout_flag=1, drop_inverses_flag=0, learning_rate=3e-4, predir=None, n_batch=144,
             dataset='mnist.pkl.gz', batch_size=500, nkerns=[20, 50], n_hidden=[500, 50]):

    """
    Implementation of convolutional VA
    """    
    #cp->cd->cpd->cd->c
    nkerns=[32, 32, 64, 64, 64]
    drops=[1, 0, 1, 0, 0]
    #skerns=[5, 3, 3, 3, 3]
    #pools=[2, 1, 1, 2, 1]
    #modes=['same']*5
    n_hidden=[500, 50]
    drop_inverses=[1,]
    # 28->12->12->5->5/5*5*64->500->50->500->5*5*64/5->5->12->12->28
    
    if dataset=='mnist.pkl.gz':
        dim_input=(28, 28)
        colorImg=False

    logdir = 'results/supervised/cva/mnist/cva_6layer_mnist_60000'+str(nkerns)+str(n_hidden)+'_'+str(learning_rate)+'_'
    if predir is not None:
        logdir +='pre_'
    if dropout_flag == 1:
        logdir += ('dropout_'+str(drops)+'_')
    if drop_inverses_flag==1:
        logdir += ('inversedropout_'+str(drop_inverses)+'_')
    logdir += str(int(time.time()))+'/'

    if not os.path.exists(logdir): os.makedirs(logdir)
    print 'logdir:', logdir, 'predir', predir
    print 'cva_6layer_mnist_60000', nkerns, n_hidden, seed, drops, drop_inverses, dropout_flag, drop_inverses_flag
    with open(logdir+'hook.txt', 'a') as f:
        print >>f, 'logdir:', logdir, 'predir', predir
        print >>f, 'cva_6layer_mnist_60000', nkerns, n_hidden, seed, drops, drop_inverses, dropout_flag, drop_inverses_flag

    datasets = datapy.load_data_gpu_60000(dataset, have_matrix=True)

    train_set_x, train_set_y, train_y_matrix = datasets[0]
    valid_set_x, valid_set_y, valid_y_matrix = datasets[1]
    test_set_x, test_set_y, test_y_matrix = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels
    random_z = T.matrix('random_z')

    drop = T.iscalar('drop')
    drop_inverse = T.iscalar('drop_inverse')
    
    activation = nonlinearity.relu

    rng = np.random.RandomState(seed)
    rng_share = theano.tensor.shared_randomstreams.RandomStreams(0)
    input_x = x.reshape((batch_size, 1, 28, 28))
    
    recg_layer = []
    cnn_output = []

    #1
    recg_layer.append(ConvMaxPool.ConvMaxPool(
            rng,
            image_shape=(batch_size, 1, 28, 28),
            filter_shape=(nkerns[0], 1, 5, 5),
            poolsize=(2, 2),
            border_mode='valid',
            activation=activation
        ))
    if drops[0]==1:
        cnn_output.append(recg_layer[-1].drop_output(input=input_x, drop=drop, rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(input=input_x))

    #2
    recg_layer.append(ConvMaxPool.ConvMaxPool(
        rng,
        image_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 3, 3),
        poolsize=(1, 1),
        border_mode='same', 
        activation=activation
    ))
    if drops[1]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    
    #3
    recg_layer.append(ConvMaxPool.ConvMaxPool(
        rng,
        image_shape=(batch_size, nkerns[1], 12, 12),
        filter_shape=(nkerns[2], nkerns[1], 3, 3),
        poolsize=(2, 2),
        border_mode='valid', 
        activation=activation
    ))
    if drops[2]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))

    #4
    recg_layer.append(ConvMaxPool.ConvMaxPool(
        rng,
        image_shape=(batch_size, nkerns[2], 5, 5),
        filter_shape=(nkerns[3], nkerns[2], 3, 3),
        poolsize=(1, 1),
        border_mode='same', 
        activation=activation
    ))
    if drops[3]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    #5
    recg_layer.append(ConvMaxPool.ConvMaxPool(
        rng,
        image_shape=(batch_size, nkerns[3], 5, 5),
        filter_shape=(nkerns[4], nkerns[3], 3, 3),
        poolsize=(1, 1),
        border_mode='same', 
        activation=activation
    ))
    if drops[4]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))

    mlp_input_x = cnn_output[-1].flatten(2)

    activations = []

    #1
    recg_layer.append(FullyConnected.FullyConnected(
            rng=rng,
            n_in= 5 * 5 * nkerns[-1],
            n_out=n_hidden[0],
            activation=activation
        ))
    if drops[-1]==1:
        activations.append(recg_layer[-1].drop_output(input=mlp_input_x, drop=drop, rng=rng_share))
    else:
        activations.append(recg_layer[-1].output(input=mlp_input_x))

    #stochastic layer
    recg_layer.append(GaussianHidden.GaussianHidden(
            rng=rng,
            input=activations[-1],
            n_in=n_hidden[0],
            n_out = n_hidden[1],
            activation=None
        ))

    z = recg_layer[-1].sample_z(rng_share)


    gene_layer = []
    z_output = []
    random_z_output = []

    #1
    gene_layer.append(FullyConnected.FullyConnected(
            rng=rng,
            n_in=n_hidden[1],
            n_out = n_hidden[0],
            activation=activation
        ))
    
    z_output.append(gene_layer[-1].output(input=z))
    random_z_output.append(gene_layer[-1].output(input=random_z))

    #2
    gene_layer.append(FullyConnected.FullyConnected(
            rng=rng,
            n_in=n_hidden[0],
            n_out = 5*5*nkerns[-1],
            activation=activation
        ))

    if drop_inverses[0]==1:
        z_output.append(gene_layer[-1].drop_output(input=z_output[-1], drop=drop_inverse, rng=rng_share))
        random_z_output.append(gene_layer[-1].drop_output(input=random_z_output[-1], drop=drop_inverse, rng=rng_share))
    else:
        z_output.append(gene_layer[-1].output(input=z_output[-1]))
        random_z_output.append(gene_layer[-1].output(input=random_z_output[-1]))

    input_z = z_output[-1].reshape((batch_size, nkerns[-1], 5, 5))
    input_random_z = random_z_output[-1].reshape((n_batch, nkerns[-1], 5, 5))

    #1
    gene_layer.append(UnpoolConvNon.UnpoolConvNon(
            rng,
            image_shape=(batch_size, nkerns[-1], 5, 5),
            filter_shape=(nkerns[-2], nkerns[-1], 3, 3),
            poolsize=(1, 1),
            border_mode='same', 
            activation=activation
        ))
    
    z_output.append(gene_layer[-1].output(input=input_z))
    random_z_output.append(gene_layer[-1].output_random_generation(input=input_random_z, n_batch=n_batch))
    
    #2
    gene_layer.append(UnpoolConvNon.UnpoolConvNon(
            rng,
            image_shape=(batch_size, nkerns[-2], 5, 5),
            filter_shape=(nkerns[-3], nkerns[-2], 3, 3),
            poolsize=(2, 2),
            border_mode='full', 
            activation=activation
        ))
    
    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(input=random_z_output[-1], n_batch=n_batch))

    #3
    gene_layer.append(UnpoolConvNon.UnpoolConvNon(
            rng,
            image_shape=(batch_size, nkerns[-3], 12, 12),
            filter_shape=(nkerns[-4], nkerns[-3], 3, 3),
            poolsize=(1, 1),
            border_mode='same', 
            activation=activation
        ))
    
    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(input=random_z_output[-1], n_batch=n_batch))

    #4
    gene_layer.append(UnpoolConvNon.UnpoolConvNon(
            rng,
            image_shape=(batch_size, nkerns[-4], 12, 12),
            filter_shape=(nkerns[-5], nkerns[-4], 3, 3),
            poolsize=(1, 1),
            border_mode='same', 
            activation=activation
        ))
    
    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(input=random_z_output[-1], n_batch=n_batch))

    #5 stochastic layer 
    # for the last layer, the nonliearity should be sigmoid to achieve mean of Bernoulli
    gene_layer.append(UnpoolConvNon.UnpoolConvNon(
            rng,
            image_shape=(batch_size, nkerns[-5], 12, 12),
            filter_shape=(1, nkerns[-5], 5, 5),
            poolsize=(2, 2),
            border_mode='full', 
            activation=nonlinearity.sigmoid
        ))

    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(input=random_z_output[-1], n_batch=n_batch))
   
    gene_layer.append(NoParamsBernoulliVisiable.NoParamsBernoulliVisiable(
            #rng=rng,
            #mean=z_output[-1],
            #data=input_x,
        ))
    logpx = gene_layer[-1].logpx(mean=z_output[-1], data=input_x)


    # 4-D tensor of random generation
    random_x_mean = random_z_output[-1]
    random_x = gene_layer[-1].sample_x(rng_share, random_x_mean)

    #L = (logpx + logpz - logqz).sum()
    cost = (
        (logpx + recg_layer[-1].logpz - recg_layer[-1].logqz).sum()
    )
    
    px = (logpx.sum())
    pz = (recg_layer[-1].logpz.sum())
    qz = (- recg_layer[-1].logqz.sum())

    params=[]
    for g in gene_layer:
        params+=g.params
    for r in recg_layer:
        params+=r.params
    gparams = [T.grad(cost, param) for param in params]

    weight_decay=1.0/n_train_batches
    l_r = theano.shared(np.asarray(learning_rate, dtype=np.float32))
    #get_optimizer = optimizer.get_adam_optimizer(learning_rate=learning_rate)
    get_optimizer = optimizer.get_adam_optimizer_max(learning_rate=l_r, 
        decay1=0.1, decay2=0.001, weight_decay=weight_decay, epsilon=1e-8)
    with open(logdir+'hook.txt', 'a') as f:
        print >>f, 'AdaM', learning_rate, weight_decay
    updates = get_optimizer(params,gparams)

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=cost,
        #outputs=layer[-1].errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            #y: test_set_y[index * batch_size:(index + 1) * batch_size],
            #y_matrix: test_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=cost,
        #outputs=layer[-1].errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            #y: valid_set_y[index * batch_size:(index + 1) * batch_size],
            #y_matrix: valid_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        }
    )
    
    '''
    Save parameters and activations
    '''

    parameters = theano.function(
        inputs=[],
        outputs=params,
    )

    train_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #drop_inverse: np.cast['int32'](0)
            #y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    
    valid_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #drop_inverse: np.cast['int32'](0)
            #y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    test_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #drop_inverse: np.cast['int32'](0)
            #y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`

    debug_model = theano.function(
        inputs=[index],
        outputs=[cost, px, pz, qz],
        #updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            #y: train_set_y[index * batch_size: (index + 1) * batch_size],
            #y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](dropout_flag),
            drop_inverse: np.cast['int32'](drop_inverses_flag)
        }
    )

    random_generation = theano.function(
        inputs=[random_z],
        outputs=[random_x_mean.flatten(2), random_x.flatten(2)],
        givens={
            #drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        }
    )

    train_bound_without_dropout = theano.function(
        inputs=[index],
        outputs=cost,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            #y: train_set_y[index * batch_size: (index + 1) * batch_size],
            #y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        }
    )

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            #y: train_set_y[index * batch_size: (index + 1) * batch_size],
            #y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](dropout_flag),
            drop_inverse: np.cast['int32'](drop_inverses_flag)
        }
    )

    ##################
    # Pretrain MODEL #
    ##################
    if predir is not None:
        color.printBlue('... setting parameters')
        color.printBlue(predir)
        pre_train = np.load(predir+'model.npz')
        pre_train = pre_train['model']
        for (para, pre) in zip(params, pre_train):
            para.set_value(pre)
        tmp =  [debug_model(i) for i in xrange(n_train_batches)]
        tmp = (np.asarray(tmp)).mean(axis=0) / float(batch_size)
        print '------------------', tmp

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_bound = -1000000.0
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    NaN_count = 0
    epoch = 0
    threshold = 0
    validation_frequency = 1
    generatition_frequency = 10
    if predir is not None:
        threshold = 0
    color.printRed('threshold, '+str(threshold) + 
        ' generatition_frequency, '+str(generatition_frequency)
        +' validation_frequency, '+str(validation_frequency))
    done_looping = False
    n_epochs = 600
    decay_epochs = 500

    '''
    print 'test initialization...'
    pre_model = parameters()
    for i in xrange(len(pre_model)):
        pre_model[i] = np.asarray(pre_model[i])
        print pre_model[i].shape, np.mean(pre_model[i]), np.var(pre_model[i])
    print 'end test...'
    '''
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        minibatch_avg_cost = 0
        
        tmp_start1 = time.clock()

        test_epoch = epoch - decay_epochs
        if test_epoch > 0 and test_epoch % 10 == 0:
            print l_r.get_value()
            with open(logdir+'hook.txt', 'a') as f:
                print >>f,l_r.get_value()
            l_r.set_value(np.cast['float32'](l_r.get_value()/3.0))


        for minibatch_index in xrange(n_train_batches):
            #print minibatch_index
            '''
            color.printRed('lalala')
            xxx = dims(minibatch_index)
            print xxx.shape
            '''
            #print n_train_batches
            minibatch_avg_cost += train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index
        
        if math.isnan(minibatch_avg_cost):
            NaN_count+=1
            color.printRed("NaN detected. Reverting to saved best parameters")
            print '---------------NaN_count:', NaN_count
            with open(logdir+'hook.txt', 'a') as f:
                print >>f, '---------------NaN_count:', NaN_count
            
            tmp =  [debug_model(i) for i in xrange(n_train_batches)]
            tmp = (np.asarray(tmp)).mean(axis=0) / float(batch_size)
            print '------------------NaN check:', tmp
            with open(logdir+'hook.txt', 'a') as f:
               print >>f, '------------------NaN check:', tmp
               
            model = parameters()
            for i in xrange(len(model)):
                model[i] = np.asarray(model[i]).astype(np.float32)
                print model[i].shape, np.mean(model[i]), np.var(model[i])
                print np.max(model[i]), np.min(model[i])
                print np.all(np.isfinite(model[i])), np.any(np.isnan(model[i]))
                with open(logdir+'hook.txt', 'a') as f:
                    print >>f, model[i].shape, np.mean(model[i]), np.var(model[i])
                    print >>f, np.max(model[i]), np.min(model[i])
                    print >>f, np.all(np.isfinite(model[i])), np.any(np.isnan(model[i]))

            best_before = np.load(logdir+'model.npz')
            best_before = best_before['model']
            for (para, pre) in zip(params, best_before):
                para.set_value(pre)
            tmp =  [debug_model(i) for i in xrange(n_train_batches)]
            tmp = (np.asarray(tmp)).mean(axis=0) / float(batch_size)
            print '------------------', tmp
            return

        #print 'optimization_time', time.clock() - tmp_start1
        print epoch, 'stochastic training error', minibatch_avg_cost / float(n_train_batches*batch_size)
        with open(logdir+'hook.txt', 'a') as f:
            print >>f, epoch, 'stochastic training error', minibatch_avg_cost / float(n_train_batches*batch_size)

        if epoch % validation_frequency == 0:
            tmp_start2 = time.clock()

            test_losses = [test_model(i) for i
                                 in xrange(n_test_batches)]
            this_test_bound = np.mean(test_losses)/float(batch_size)
            
            #tmp =  [debug_model(i) for i
            #                     in xrange(n_train_batches)]
            #tmp = (np.asarray(tmp)).mean(axis=0) / float(batch_size)
            
            print epoch, 'test bound', this_test_bound
            #print tmp
            with open(logdir+'hook.txt', 'a') as f:
                print >>f, epoch, 'test bound', this_test_bound
            
        if epoch%100==0:    
            
            model = parameters()
            for i in xrange(len(model)):
                model[i] = np.asarray(model[i]).astype(np.float32)
            np.savez(logdir+'model-'+str(epoch), model=model)
            
            for i in xrange(n_train_batches):
                if i == 0:
                    train_features = np.asarray(train_activations(i))
                else:
                    train_features = np.vstack((train_features, np.asarray(train_activations(i))))
            
            for i in xrange(n_valid_batches):
                if i == 0:
                    valid_features = np.asarray(valid_activations(i))
                else:
                    valid_features = np.vstack((valid_features, np.asarray(valid_activations(i))))

            for i in xrange(n_test_batches):
                if i == 0:
                    test_features = np.asarray(test_activations(i))
                else:
                    test_features = np.vstack((test_features, np.asarray(test_activations(i))))
            np.save(logdir+'train_features', train_features)
            np.save(logdir+'valid_features', valid_features)
            np.save(logdir+'test_features', test_features)
        
        tmp_start4=time.clock()
        if epoch % generatition_frequency == 0:
            tail='-'+str(epoch)+'.png'
            random_z = np.random.standard_normal((n_batch, n_hidden[-1])).astype(np.float32)
            _x_mean, _x = random_generation(random_z)
            #print _x.shape
            #print _x_mean.shape
            image = paramgraphics.mat_to_img(_x.T, dim_input, colorImg=colorImg)
            image.save(logdir+'samples'+tail, 'PNG')
            image = paramgraphics.mat_to_img(_x_mean.T, dim_input, colorImg=colorImg)
            image.save(logdir+'mean_samples'+tail, 'PNG')
        #print 'generation_time', time.clock() - tmp_start4

    end_time = time.clock()
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    if NaN_count > 0:
        print '---------------NaN_count:', NaN_count
        with open(logdir+'hook.txt', 'a') as f:
            print >>f, '---------------NaN_count:', NaN_count
コード例 #5
0
def cmmd(dataset='mnist.pkl.gz',batch_size=500, layer_num = 2, hidden_dim = 20,seed = 0,layer_size=[500,200,100]):

	validation_frequency = 1
	test_frequency = 1
	pre_train = 0
	pre_train_epoch = 30

	print "Loading data ......."
	datasets = datapy.load_data_gpu_60000(dataset, have_matrix = True)
	train_set_x, train_set_y, train_y_matrix = datasets[0]
	valid_set_x, valid_set_y, valid_y_matrix = datasets[1]
	test_set_x, test_set_y, test_y_matrix = datasets[2]

	n_train_batches = train_set_x.get_value().shape[0] / batch_size
	n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
	n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

	rng = np.random.RandomState(seed)                                                          
	rng_share = theano.tensor.shared_randomstreams.RandomStreams(0)

	################################
	##        build model         ##
	################################
	print "Building model ......."

	index = T.lscalar()
	x = T.matrix('x')  ##### batch_size * 28^2
	y = T.vector('y') 
	y_matrix = T.matrix('y_matrix') 
	random_z = T.matrix('random_z') ### batch_size * hidden_dim
	Inv_K_d = T.matrix('Inv_K_d')

	layers = []
	layer_output= []

	activation = nonlinearity.relu
	#activation = Tnn.sigmoid
	#### first layer
	layers.append(FullyConnected.FullyConnected(
			rng = rng,
			n_in = 28*28 + hidden_dim, 
			#n_in = 28*28, 
			n_out = layer_size[0],
			activation = activation
	))
	layer_output.append(layers[-1].output_mix(input=[x,random_z]))
	#layer_output.append(layers[-1].output(input=x))

	#### middle layer
	for i in range(layer_num):
		layers.append(FullyConnected.FullyConnected(
			rng = rng,
			n_in = layer_size[i], 
			n_out = layer_size[i+1],
			activation = activation
		))
		layer_output.append(layers[-1].output(input= layer_output[-1]))

	#### last layer
	activation = Tnn.sigmoid
	layers.append(FullyConnected.FullyConnected(
		rng = rng,
		n_in = layer_size[-1],
		n_out = 10,
		activation = activation
	))
	y_gen = layers[-1].output(input = layer_output[-1])
	
	lambda1_ = 1e-3
	lambda_= theano.shared(np.asarray(lambda1_, dtype=np.float32))


	K_d = kernel_gram_for_x(x,x,batch_size,28*28)
	K_s = K_d 
	K_sd = K_d
	#Inv_K_d = NL.matrix_inverse(K_d +lambda_ * T.identity_like(K_d))
	Inv_K_s = Inv_K_d

	L_d = kernel_gram(y_matrix,y_matrix,batch_size,10)
	L_s = kernel_gram(y_gen,y_gen,batch_size,10)
	L_ds = kernel_gram(y_matrix,y_gen,batch_size,10)
	
	cost = -(NL.trace(K_d * Inv_K_d * L_d * Inv_K_d) +\
			NL.trace(K_s * Inv_K_s * L_s * Inv_K_s)- \
			NL.trace(K_sd * Inv_K_d * L_ds * Inv_K_s))
	cost_pre = -T.sum(T.sqr(y_matrix - y_gen))

	cc = T.argmax(y_gen,axis=1)
	correct = T.sum(T.eq(T.cast(T.argmax(y_gen,axis=1),'int32'),T.cast(y,'int32')))

	################################
	##        updates             ##
	################################
	params = []
	for aLayer in layers:
		params += aLayer.params
	gparams = [T.grad(cost,param) for param in params]
	gparams_pre = [T.grad(cost_pre,param) for param in params]

	learning_rate = 3e-4
	weight_decay=1.0/n_train_batches
	epsilon=1e-8

	l_r = theano.shared(np.asarray(learning_rate, dtype=np.float32))
	get_optimizer = optimizer.get_adam_optimizer_max(learning_rate=l_r,
		decay1=0.1, decay2=0.001, weight_decay=weight_decay, epsilon=epsilon)
	updates = get_optimizer(params,gparams)

	updates_pre = get_optimizer(params,gparams_pre)


	################################
	##         pretrain model     ##
	################################
	parameters = theano.function(
			inputs = [],
			outputs = params,
			)

	'''
	pre_train_model = theano.function(
		inputs = [index,random_z],
		outputs = [cost_pre, correct],
		updates=updates_pre,
		givens={
			x:train_set_x[index * batch_size:(index + 1) * batch_size],
			y:train_set_y[index * batch_size:(index + 1) * batch_size],
			y_matrix:train_y_matrix[index * batch_size:(index + 1) * batch_size],
		},
		on_unused_input='warn'
		)
	cur_epoch = 0
	if pre_train == 1:
		for cur_epoch in range(pre_train_epoch):
			print 'cur_epoch: ', cur_epoch,
			cor = 0 
			for minibatch_index in range(n_train_batches):
				cost_pre_mini,correct_pre_mini = pre_train_model(minibatch_index,gen_random_z(batch_size,hidden_dim))
				cor = cor + correct_pre_mini
			print 'correct number: ' , cor
		#np.savez(,model = model)
		'''

	if pre_train == 1:
		print "pre-training model....."
		pre_train = np.load('model.npz')['model']
		for (para, pre) in zip(params, pre_train):
			para.set_value(pre)

	################################
	##         prepare data       ##
	################################

	#### compute matrix inverse
	print "Preparing data ...."
	Invv = NL.matrix_inverse(K_d +lambda_ * T.identity_like(K_d))
	prepare_data = theano.function(
			inputs = [index],
			outputs = [Invv,K_d],
			givens = {
				x:train_set_x[index * batch_size:(index + 1) * batch_size],
				}
			)

	Inv_K_d_l, K_d_l =  prepare_data(0)

	for minibatch_index in range(1, n_train_batches):
		if minibatch_index % 10 == 0:
			print 'minibatch_index:', minibatch_index
		Inv_pre_mini, K_d_pre_mini = prepare_data(minibatch_index)
		Inv_K_d_l = np.vstack((Inv_K_d_l,Inv_pre_mini))
		K_d_l = np.vstack((K_d_l,K_d_pre_mini))

	Inv_K_d_g = theano.shared(Inv_K_d_l,borrow=True)
	K_d_g = theano.shared(K_d_l, borrow=True)


	################################
	##         train model        ##
	################################

	train_model = theano.function(
		inputs = [index,random_z],
		outputs = [correct,cost,y,cc,y_gen],
		updates=updates,
		givens={
			x:train_set_x[index * batch_size:(index + 1) * batch_size],
			y:train_set_y[index * batch_size:(index + 1) * batch_size],
			y_matrix:train_y_matrix[index * batch_size:(index + 1) * batch_size],
			#K_d:K_d_g[index * batch_size:(index + 1) * batch_size],
			Inv_K_d:Inv_K_d_g[index * batch_size:(index + 1) * batch_size],
		},
		on_unused_input='warn'
	)

	valid_model = theano.function(
		inputs = [index,random_z],
		outputs = correct,
		#updates=updates,
		givens={
			x:valid_set_x[index * batch_size:(index + 1) * batch_size],
			y:valid_set_y[index * batch_size:(index + 1) * batch_size],
			y_matrix:valid_y_matrix[index * batch_size:(index + 1) * batch_size],
		},
		on_unused_input='warn'
	)

	test_model = theano.function(
		inputs = [index,random_z],
		outputs = [correct,y_gen],
		#updates=updates,
		givens={
			x:test_set_x[index * batch_size:(index + 1) * batch_size],
			y:test_set_y[index * batch_size:(index + 1) * batch_size],
			y_matrix:test_y_matrix[index * batch_size:(index + 1) * batch_size],
		},
		on_unused_input='warn'
	)

	n_epochs = 500
	cur_epoch = 0



	print "Training model ......"

	while (cur_epoch < n_epochs) :
		cur_epoch = cur_epoch + 1
		cor = 0
		for minibatch_index in xrange(n_train_batches):
			print minibatch_index,
			print " : ",
			correct,cost,a,b,y_gen = train_model(minibatch_index,gen_random_z(batch_size,hidden_dim))
			cor = cor + correct
			print correct
			print b
			print y_gen
		with open('log.txt','a') as f:
			print >>f , "epoch: " , cur_epoch, "training_correct: " , cor

		if cur_epoch % validation_frequency == 0:
			cor2 = 0
			for minibatch_index in xrange(n_valid_batches):
				correct = valid_model(minibatch_index,gen_random_z(batch_size,hidden_dim))
				cor2 = cor2 + correct
			with open('log.txt','a') as f:
				print >>f , "	validation_correct: " , cor2

		if cur_epoch % test_frequency == 0:
			cor2 = 0
			for minibatch_index in xrange(n_test_batches):
				correct,y_gen = test_model(minibatch_index,gen_random_z(batch_size,hidden_dim))
				with open('log.txt','a') as f:
					for index in range(batch_size):
						if not np.argmax(y_gen[index]) == test_set_y[minibatch_index * batch_size + index]:
							print >>f , "index: " , minibatch_index * batch_size + index, 'true Y: ', test_set_y[minibatch_index * batch_size + index]
							print >>f , 'gen_y: ' , y_gen[index]

				cor2 = cor2 + correct
			with open('log.txt','a') as f:
				print >>f , "	test_correct: " , cor2
		
		if epoch %1 == 0:
			model = parameters()
			for i in range(len(model)):
				model[i] = np.asarray(model[i]).astype(np.float32)
			np.savez('model-'+str(epoch),model=model)