Example #1
def main(n_z, n_hidden, dataset, seed, comment, gfx=True):

    # Initialize logdir
    #---------------------
    # Setasouto:
    # Create the directory to save the outputs files and log.
    #---------------------
    import time
    logdir = 'results/gpulearn_z_x_' + dataset + '_' + str(n_z) + '-' + str(
        n_hidden) + '_' + comment + '_' + str(int(time.time())) + '/'
    if not os.path.exists(logdir): os.makedirs(logdir)
    print('logdir:', logdir)
    print('gpulearn_z_x', n_z, n_hidden, dataset, seed)
    with open(logdir + 'hook.txt', 'a') as f:
        print('learn_z_x', n_z, n_hidden, dataset, seed, file=f)

    np.random.seed(seed)

    gfx_freq = 1

    weight_decay = 0
    f_enc, f_dec = lambda x: x, lambda x: x

    # Init data
    if dataset == 'mnist':
        import anglepy.data.mnist as mnist

        # MNIST
        size = 28
        train_x, train_y, valid_x, valid_y, test_x, test_y = mnist.load_numpy(
            size)
        x = {'x': train_x.astype(np.float32)}
        x_valid = {'x': valid_x.astype(np.float32)}
        x_test = {'x': test_x.astype(np.float32)}
        L_valid = 1
        dim_input = (size, size)
        n_x = size * size
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        nonlinear = 'softplus'
        type_px = 'bernoulli'
        n_train = 50000
        n_batch = 1000
        colorImg = False
        bernoulli_x = True
        byteToFloat = False
        weight_decay = float(n_batch) / n_train

    elif dataset == 'mnist_binarized':
        import anglepy.data.mnist_binarized as mnist_binarized
        # MNIST
        train_x, valid_x, test_x = mnist_binarized.load_numpy(28)
        x = {'x': np.hstack((train_x, valid_x)).astype(np.float32)}
        x_valid = {'x': test_x.astype(np.float32)}
        L_valid = 1
        dim_input = (28, 28)
        n_x = 28 * 28
        n_y = 10
        type_qz = 'gaussianmarg'
        type_pz = 'mog'
        nonlinear = 'rectlin'
        type_px = 'bernoulli'
        n_train = 60000
        n_batch = 1000
        colorImg = False
        bernoulli_x = False
        byteToFloat = False
        weight_decay = float(n_batch) / n_train

    elif dataset == 'freyface':
        # Frey's face
        import anglepy.data.freyface as freyface
        n_train = 1600
        train_x = freyface.load_numpy()
        np.random.shuffle(train_x)
        x = {'x': train_x.T[:, 0:n_train]}
        x_valid = {'x': train_x.T[:, n_train:]}
        L_valid = 1
        dim_input = (28, 20)
        n_x = 20 * 28
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        type_px = 'bounded01'
        nonlinear = 'tanh'  #tanh works better with freyface #'softplus'
        n_batch = 100
        colorImg = False
        bernoulli_x = False
        byteToFloat = False
        weight_decay = float(n_batch) / n_train

    elif dataset == 'freyface_pca':
        # Frey's face
        import anglepy.data.freyface as freyface
        n_train = 1600
        train_x = freyface.load_numpy().T
        np.random.shuffle(train_x.T)

        f_enc, f_dec, _ = pp.PCA(train_x, 0.99)
        train_x = f_enc(train_x)

        x = {'x': train_x[:, 0:n_train].astype(np.float32)}
        x_valid = {'x': train_x[:, n_train:].astype(np.float32)}
        L_valid = 1
        dim_input = (28, 20)
        n_x = train_x.shape[0]
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        type_px = 'gaussian'
        nonlinear = 'softplus'
        n_batch = 100
        colorImg = False
        bernoulli_x = False
        byteToFloat = False

    elif dataset == 'freyface_bernoulli':
        # Frey's face
        import anglepy.data.freyface as freyface
        n_train = 1600
        train_x = freyface.load_numpy().T
        np.random.shuffle(train_x.T)

        x = {'x': train_x[:, 0:n_train].astype(np.float32)}
        x_valid = {'x': train_x[:, n_train:].astype(np.float32)}
        L_valid = 1
        dim_input = (28, 20)
        n_x = train_x.shape[0]
        type_pz = 'gaussianmarg'
        type_px = 'bernoulli'
        nonlinear = 'softplus'
        n_batch = 100
        colorImg = False
        bernoulli_x = False
        byteToFloat = False

    elif dataset == 'norb':
        # small NORB dataset
        import anglepy.data.norb as norb
        size = 48
        train_x, train_y, test_x, test_y = norb.load_resized(size,
                                                             binarize_y=True)

        x = {'x': train_x.astype(np.float32)}
        x_valid = {'x': test_x.astype(np.float32)}
        L_valid = 1
        n_x = train_x.shape[0]
        dim_input = (size, size)
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        type_px = 'gaussian'
        nonlinear = 'softplus'
        n_batch = 900  #24300/900 = 27
        colorImg = False
        #binarize = False
        byteToFloat = False
        bernoulli_x = False
        weight_decay = float(n_batch) / train_x.shape[1]

    elif dataset == 'norb_pca':
        # small NORB dataset
        import anglepy.data.norb as norb
        size = 48
        train_x, train_y, test_x, test_y = norb.load_resized(size,
                                                             binarize_y=True)

        f_enc, f_dec, _ = pp.PCA(train_x, 0.999)
        #f_enc, f_dec, _ = pp.normalize_random(train_x)
        train_x = f_enc(train_x)
        test_x = f_enc(test_x)

        x = {'x': train_x.astype(np.float32)}
        x_valid = {'x': test_x.astype(np.float32)}
        L_valid = 1
        n_x = train_x.shape[0]
        dim_input = (size, size)
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        type_px = 'gaussian'
        nonlinear = 'softplus'
        n_batch = 900  #24300/900 = 27
        colorImg = False
        #binarize = False
        bernoulli_x = False
        byteToFloat = False
        weight_decay = float(n_batch) / train_x.shape[1]

    elif dataset == 'norb_normalized':
        # small NORB dataset
        import anglepy.data.norb as norb
        size = 48
        train_x, train_y, test_x, test_y = norb.load_resized(size,
                                                             binarize_y=True)

        #f_enc, f_dec, _ = pp.PCA(train_x, 0.99)
        #f_enc, f_dec, _ = pp.normalize_random(train_x)
        f_enc, f_dec, _ = pp.normalize(train_x)
        train_x = f_enc(train_x)
        test_x = f_enc(test_x)

        x = {'x': train_x.astype(np.float32)}
        x_valid = {'x': test_x.astype(np.float32)}
        L_valid = 1
        n_x = train_x.shape[0]
        dim_input = (size, size)
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        type_px = 'gaussian'
        nonlinear = 'softplus'
        n_batch = 900  #24300/900 = 27
        colorImg = False
        #binarize = False
        bernoulli_x = False
        byteToFloat = False
        weight_decay = float(n_batch) / train_x.shape[1]

    elif dataset == 'svhn':
        # SVHN dataset
        import anglepy.data.svhn as svhn
        size = 32
        train_x, train_y, test_x, test_y = svhn.load_numpy(
            False, binarize_y=True)  #norb.load_resized(size, binarize_y=True)
        extra_x, extra_y = svhn.load_numpy_extra(False, binarize_y=True)
        x = {
            'x': np.hstack((train_x, extra_x)),
            'y': np.hstack((train_y, extra_y))
        }
        ndict.shuffleCols(x)

        print('Performing PCA, can take a few minutes... ', end='')
        f_enc, f_dec, pca_params = pp.PCA(x['x'][:, :10000],
                                          cutoff=600,
                                          toFloat=True)
        ndict.savez(pca_params, logdir + 'pca_params')
        print('Done.')

        n_y = 10
        x = {'x': f_enc(x['x']).astype(np.float32)}
        x_valid = {'x': f_enc(test_x).astype(np.float32)}
        L_valid = 1
        n_x = x['x'].shape[0]
        dim_input = (size, size)
        n_batch = 5000
        colorImg = True
        bernoulli_x = False
        byteToFloat = False
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        type_px = 'gaussian'
        nonlinear = 'softplus'

    elif dataset == 'hyper':
        # Hyperspectral images:

        # Import 1 file of the dataset
        # TODO: import more files: Edit hyperspectralData.py

        # The hyperspectralData module was added to the anglepy library.
        from hyperspectralData import HyperspectralData

        train_x, train_y, valid_x, valid_y, test_x, test_y = HyperspectralData(
        ).load_numpy(100000)

        # dim_input: the shape used when a data vector is rendered as an image; chosen here as:
        dim_input = (67, 4)
        n_x = train_x.shape[0]  #Dimension of our data vector.

        x = {'x': train_x.astype(np.float32)}
        x_valid = {'x': valid_x.astype(np.float32)}
        x_test = {'x': test_x.astype(np.float32)}
        L_valid = 1
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        nonlinear = 'softplus'
        type_px = 'bernoulli'
        n_train = train_x.shape[1]
        n_batch = 1000
        colorImg = False
        bernoulli_x = False
        byteToFloat = False
        weight_decay = float(n_batch) / n_train
        #Write the hyperparameters used:
        with open(logdir + 'AA_hyperparameters.txt', 'w') as file:
            file.write("L_valid: " + str(L_valid) + '\n')
            file.write("type_qz: " + type_qz + '\n')
            file.write("type_pz: " + type_pz + '\n')
            file.write("Nonlinear: " + nonlinear + '\n')
            file.write("type_px: " + type_px + '\n')
            file.write("n_train: " + str(n_train) + '\n')
            file.write("n_batch: " + str(n_batch) + '\n')
            file.write("colorImg: " + str(colorImg) + '\n')
            file.write("bernoulli_x: " + str(bernoulli_x) + '\n')
            file.write("byteToFloat: " + str(byteToFloat) + '\n')
        # Write the headers for the csv file output:
        with open(logdir + 'AA_results.txt', 'w') as file:
            # Like a csv file:
            file.write("Step" + ',' + "TimeElapsed" + ',' +
                       "LowerboundMinibatch" + ',' + "LowerboundValid" + ',' +
                       "NumStepNotImproving" + '\n')

    # Construct model
    from anglepy.models import GPUVAE_Z_X
    updates = get_adam_optimizer(learning_rate=3e-4, weight_decay=weight_decay)
    model = GPUVAE_Z_X(updates,
                       n_x,
                       n_hidden,
                       n_z,
                       n_hidden[::-1],
                       nonlinear,
                       nonlinear,
                       type_px,
                       type_qz=type_qz,
                       type_pz=type_pz,
                       prior_sd=100,
                       init_sd=1e-3)
    #---------------
    # SetaSouto:
    # The [::-1] is to reverse the list.
    #---------------

    if False:
        #dir = '/Users/dpkingma/results/learn_z_x_mnist_binarized_50-(500, 500)_mog_1412689061/'
        #dir = '/Users/dpkingma/results/learn_z_x_svhn_bernoulli_300-(1000, 1000)_l1l2_sharing_and_1000HU_1412676966/'
        #dir = '/Users/dpkingma/results/learn_z_x_svhn_bernoulli_300-(1000, 1000)_l1l2_sharing_and_1000HU_1412695481/'
        #dir = '/Users/dpkingma/results/learn_z_x_mnist_binarized_50-(500, 500)_mog_1412695455/'
        #dir = '/Users/dpkingma/results/gpulearn_z_x_svhn_pca_300-(500, 500)__1413904756/'
        dir = '/home/ubuntu/results/gpulearn_z_x_mnist_50-[500, 500]__1414259423/'
        w = ndict.loadz(dir + 'w_best.ndict.tar.gz')
        v = ndict.loadz(dir + 'v_best.ndict.tar.gz')
        ndict.set_value(model.w, w)
        ndict.set_value(model.v, v)

    # Some statistics for optimization
    ll_valid_stats = [-1e99, 0]

    # Progress hook
    def hook(epoch, t, ll):
        '''
        Documented by SetaSouto; may contain errors.

        :epoch: Number of the current training step.
        :t: Time elapsed since the beginning of training.
        :ll: Log-likelihood of the current minibatch.
        '''

        if epoch % 10 != 0: return

        ll_valid, _ = model.est_loglik(x_valid,
                                       n_samples=L_valid,
                                       n_batch=n_batch,
                                       byteToFloat=byteToFloat)

        # Log
        ndict.savez(ndict.get_value(model.v), logdir + 'v')
        ndict.savez(ndict.get_value(model.w), logdir + 'w')

        if ll_valid > ll_valid_stats[0]:
            ll_valid_stats[0] = ll_valid
            ll_valid_stats[1] = 0
            ndict.savez(ndict.get_value(model.v), logdir + 'v_best')
            ndict.savez(ndict.get_value(model.w), logdir + 'w_best')
        else:
            ll_valid_stats[1] += 1
            # Stop when not improving validation set performance in 100 iterations
            if ll_valid_stats[1] > 100:
                print("Finished")
                with open(logdir + 'hook.txt', 'a') as f:
                    print(f, "Finished")
                exit()

        # Show the current results and append them to the results file:
        with open(logdir + 'AA_results.txt', 'a') as file:
            # Like a csv file:
            file.write(
                str(epoch) + ',' + str(t) + ',' + str(ll) + ',' +
                str(ll_valid) + ',' + str(ll_valid_stats[1]) + '\n')
        print("-------------------------")
        print("Current results:")
        print(" ")
        print("Step:", epoch)
        print("Time elapsed:", t)
        print("Loglikelihood minibatch:", ll)
        print("Loglikelihood validSet:", ll_valid)
        print("N not improving:", ll_valid_stats[1])
        #print(epoch, t, ll, ll_valid, ll_valid_stats)

        # Optionally, the same stats could also be appended to hook.txt:
        #with open(logdir+'hook.txt', 'a') as f:
        #print(f, epoch, t, ll, ll_valid, ll_valid_stats)

        # Graphics
        if gfx and epoch % gfx_freq == 0:

            #tail = '.png'
            tail = '-' + str(epoch) + '.png'

            v = {i: model.v[i].get_value() for i in model.v}
            w = {i: model.w[i].get_value() for i in model.w}

            if 'pca' not in dataset and 'random' not in dataset and 'normalized' not in dataset:

                if 'w0' in v:
                    image = paramgraphics.mat_to_img(f_dec(v['w0'][:].T),
                                                     dim_input,
                                                     True,
                                                     colorImg=colorImg)
                    image.save(logdir + 'q_w0' + tail, 'PNG')

                image = paramgraphics.mat_to_img(f_dec(w['out_w'][:]),
                                                 dim_input,
                                                 True,
                                                 colorImg=colorImg)
                image.save(logdir + 'out_w' + tail, 'PNG')

                if 'out_unif' in w:
                    image = paramgraphics.mat_to_img(f_dec(
                        w['out_unif'].reshape((-1, 1))),
                                                     dim_input,
                                                     True,
                                                     colorImg=colorImg)
                    image.save(logdir + 'out_unif' + tail, 'PNG')

                if n_z == 2:
                    n_width = 10
                    import scipy.stats
                    z = {'z': np.zeros((2, n_width**2))}
                    for i in range(0, n_width):
                        for j in range(0, n_width):
                            z['z'][0, n_width * i + j] = scipy.stats.norm.ppf(
                                float(i) / n_width + 0.5 / n_width)
                            z['z'][1, n_width * i + j] = scipy.stats.norm.ppf(
                                float(j) / n_width + 0.5 / n_width)

                    x, _, _z = model.gen_xz({}, z, n_width**2)
                    if dataset == 'mnist':
                        x = 1 - _z['x']
                    image = paramgraphics.mat_to_img(f_dec(_z['x']), dim_input)
                    image.save(logdir + '2dmanifold' + tail, 'PNG')
                else:
                    _x, _, _z_confab = model.gen_xz({}, {}, n_batch=144)
                    x_samples = _z_confab['x']
                    image = paramgraphics.mat_to_img(f_dec(x_samples),
                                                     dim_input,
                                                     colorImg=colorImg)
                    image.save(logdir + 'samples' + tail, 'PNG')

                    #x_samples = _x['x']
                    #image = paramgraphics.mat_to_img(x_samples, dim_input, colorImg=colorImg)
                    #image.save(logdir+'samples2'+tail, 'PNG')

            else:
                # Model with preprocessing

                if 'w0' in v:
                    image = paramgraphics.mat_to_img(f_dec(v['w0'][:].T),
                                                     dim_input,
                                                     True,
                                                     colorImg=colorImg)
                    image.save(logdir + 'q_w0' + tail, 'PNG')

                image = paramgraphics.mat_to_img(f_dec(w['out_w'][:]),
                                                 dim_input,
                                                 True,
                                                 colorImg=colorImg)
                image.save(logdir + 'out_w' + tail, 'PNG')

                _x, _, _z_confab = model.gen_xz({}, {}, n_batch=144)
                x_samples = f_dec(_z_confab['x'])
                x_samples = np.minimum(np.maximum(x_samples, 0), 1)
                image = paramgraphics.mat_to_img(x_samples,
                                                 dim_input,
                                                 colorImg=colorImg)
                image.save(logdir + 'samples' + tail, 'PNG')

    # Optimize
    #SFO
    dostep = epoch_vae_adam(model,
                            x,
                            n_batch=n_batch,
                            bernoulli_x=bernoulli_x,
                            byteToFloat=byteToFloat)
    loop_va(dostep, hook)

    pass
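Below is a minimal sketch of how this entry point might be invoked. It assumes the module-level dependencies used above (os, numpy as np, anglepy's ndict, paramgraphics and pp, plus get_adam_optimizer, epoch_vae_adam and loop_va) are importable; the hyperparameter values are illustrative only, not the settings of the original experiments.

if __name__ == '__main__':
    # Illustrative call: 50 latent units, encoder/decoder with two hidden
    # layers of 500 units each, trained on MNIST with a fixed random seed.
    # The comment string is only embedded in the logdir name.
    main(n_z=50, n_hidden=(500, 500), dataset='mnist', seed=0, comment='demo')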
Example #2
def main(n_z, n_hidden, dataset, seed, comment, gfx=True):

    # Initialize logdir
    import time
    logdir = 'results/gpulearn_z_x_' + dataset + '_' + str(n_z) + '-' + str(
        n_hidden) + '_' + comment + '_' + str(int(time.time())) + '/'
    if not os.path.exists(logdir): os.makedirs(logdir)
    print 'logdir:', logdir
    print 'gpulearn_z_x', n_z, n_hidden, dataset, seed
    with open(logdir + 'hook.txt', 'a') as f:
        print >> f, 'learn_z_x', n_z, n_hidden, dataset, seed

    np.random.seed(seed)

    gfx_freq = 1

    weight_decay = 0
    f_enc, f_dec = lambda x: x, lambda x: x

    # Init data
    if dataset == 'mnist':
        import anglepy.data.mnist as mnist

        # MNIST
        size = 28
        train_x, train_y, valid_x, valid_y, test_x, test_y = mnist.load_numpy(
            size)
        x = {'x': train_x.astype(np.float32)}
        x_valid = {'x': valid_x.astype(np.float32)}
        x_test = {'x': test_x.astype(np.float32)}
        L_valid = 1
        dim_input = (size, size)
        n_x = size * size
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        nonlinear = 'softplus'
        type_px = 'bernoulli'
        n_train = 50000
        n_batch = 1000
        colorImg = False
        bernoulli_x = True
        byteToFloat = False
        weight_decay = float(n_batch) / n_train

    elif dataset == 'mnist_binarized':
        import anglepy.data.mnist_binarized as mnist_binarized
        # MNIST
        train_x, valid_x, test_x = mnist_binarized.load_numpy(28)
        x = {'x': np.hstack((train_x, valid_x)).astype(np.float32)}
        x_valid = {'x': test_x.astype(np.float32)}
        L_valid = 1
        dim_input = (28, 28)
        n_x = 28 * 28
        n_y = 10
        type_qz = 'gaussianmarg'
        type_pz = 'mog'
        nonlinear = 'rectlin'
        type_px = 'bernoulli'
        n_train = 60000
        n_batch = 1000
        colorImg = False
        bernoulli_x = False
        byteToFloat = False
        weight_decay = float(n_batch) / n_train

    elif dataset == 'freyface':
        # Frey's face
        import anglepy.data.freyface as freyface
        n_train = 1600
        train_x = freyface.load_numpy()
        np.random.shuffle(train_x)
        x = {'x': train_x.T[:, 0:n_train]}
        x_valid = {'x': train_x.T[:, n_train:]}
        L_valid = 1
        dim_input = (28, 20)
        n_x = 20 * 28
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        type_px = 'bounded01'
        nonlinear = 'tanh'  #tanh works better with freyface #'softplus'
        n_batch = 100
        colorImg = False
        bernoulli_x = False
        byteToFloat = False
        weight_decay = float(n_batch) / n_train

    elif dataset == 'freyface_pca':
        # Frey's face
        import anglepy.data.freyface as freyface
        n_train = 1600
        train_x = freyface.load_numpy().T
        np.random.shuffle(train_x.T)

        f_enc, f_dec, _ = pp.PCA(train_x, 0.99)
        train_x = f_enc(train_x)

        x = {'x': train_x[:, 0:n_train].astype(np.float32)}
        x_valid = {'x': train_x[:, n_train:].astype(np.float32)}
        L_valid = 1
        dim_input = (28, 20)
        n_x = train_x.shape[0]
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        type_px = 'gaussian'
        nonlinear = 'softplus'
        n_batch = 100
        colorImg = False
        bernoulli_x = False
        byteToFloat = False

    elif dataset == 'freyface_bernoulli':
        # Frey's face
        import anglepy.data.freyface as freyface
        n_train = 1600
        train_x = freyface.load_numpy().T
        np.random.shuffle(train_x.T)

        x = {'x': train_x[:, 0:n_train].astype(np.float32)}
        x_valid = {'x': train_x[:, n_train:].astype(np.float32)}
        L_valid = 1
        dim_input = (28, 20)
        n_x = train_x.shape[0]
        type_pz = 'gaussianmarg'
        type_px = 'bernoulli'
        nonlinear = 'softplus'
        n_batch = 100
        colorImg = False
        bernoulli_x = False
        byteToFloat = False

    elif dataset == 'norb':
        # small NORB dataset
        import anglepy.data.norb as norb
        size = 48
        train_x, train_y, test_x, test_y = norb.load_resized(size,
                                                             binarize_y=True)

        x = {'x': train_x.astype(np.float32)}
        x_valid = {'x': test_x.astype(np.float32)}
        L_valid = 1
        n_x = train_x.shape[0]
        dim_input = (size, size)
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        type_px = 'gaussian'
        nonlinear = 'softplus'
        n_batch = 900  #24300/900 = 27
        colorImg = False
        #binarize = False
        byteToFloat = False
        bernoulli_x = False
        weight_decay = float(n_batch) / train_x.shape[1]

    elif dataset == 'norb_pca':
        # small NORB dataset
        import anglepy.data.norb as norb
        size = 48
        train_x, train_y, test_x, test_y = norb.load_resized(size,
                                                             binarize_y=True)

        f_enc, f_dec, _ = pp.PCA(train_x, 0.999)
        #f_enc, f_dec, _ = pp.normalize_random(train_x)
        train_x = f_enc(train_x)
        test_x = f_enc(test_x)

        x = {'x': train_x.astype(np.float32)}
        x_valid = {'x': test_x.astype(np.float32)}
        L_valid = 1
        n_x = train_x.shape[0]
        dim_input = (size, size)
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        type_px = 'gaussian'
        nonlinear = 'softplus'
        n_batch = 900  #24300/900 = 27
        colorImg = False
        #binarize = False
        bernoulli_x = False
        byteToFloat = False
        weight_decay = float(n_batch) / train_x.shape[1]

    elif dataset == 'norb_normalized':
        # small NORB dataset
        import anglepy.data.norb as norb
        size = 48
        train_x, train_y, test_x, test_y = norb.load_resized(size,
                                                             binarize_y=True)

        #f_enc, f_dec, _ = pp.PCA(train_x, 0.99)
        #f_enc, f_dec, _ = pp.normalize_random(train_x)
        f_enc, f_dec, _ = pp.normalize(train_x)
        train_x = f_enc(train_x)
        test_x = f_enc(test_x)

        x = {'x': train_x.astype(np.float32)}
        x_valid = {'x': test_x.astype(np.float32)}
        L_valid = 1
        n_x = train_x.shape[0]
        dim_input = (size, size)
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        type_px = 'gaussian'
        nonlinear = 'softplus'
        n_batch = 900  #24300/900 = 27
        colorImg = False
        #binarize = False
        bernoulli_x = False
        byteToFloat = False
        weight_decay = float(n_batch) / train_x.shape[1]

    elif dataset == 'svhn':
        # SVHN dataset
        import anglepy.data.svhn as svhn
        size = 32
        train_x, train_y, test_x, test_y = svhn.load_numpy(
            False, binarize_y=True)  #norb.load_resized(size, binarize_y=True)
        extra_x, extra_y = svhn.load_numpy_extra(False, binarize_y=True)
        x = {
            'x': np.hstack((train_x, extra_x)),
            'y': np.hstack((train_y, extra_y))
        }
        ndict.shuffleCols(x)

        print 'Performing PCA, can take a few minutes... ',
        f_enc, f_dec, pca_params = pp.PCA(x['x'][:, :10000],
                                          cutoff=600,
                                          toFloat=True)
        ndict.savez(pca_params, logdir + 'pca_params')
        print 'Done.'

        n_y = 10
        x = {'x': f_enc(x['x']).astype(np.float32)}
        x_valid = {'x': f_enc(test_x).astype(np.float32)}
        L_valid = 1
        n_x = x['x'].shape[0]
        dim_input = (size, size)
        n_batch = 5000
        colorImg = True
        bernoulli_x = False
        byteToFloat = False
        type_qz = 'gaussianmarg'
        type_pz = 'gaussianmarg'
        type_px = 'gaussian'
        nonlinear = 'softplus'

    # Construct model
    from anglepy.models import GPUVAE_Z_X
    updates = get_adam_optimizer(learning_rate=3e-4, weight_decay=weight_decay)
    model = GPUVAE_Z_X(updates,
                       n_x,
                       n_hidden,
                       n_z,
                       n_hidden[::-1],
                       nonlinear,
                       nonlinear,
                       type_px,
                       type_qz=type_qz,
                       type_pz=type_pz,
                       prior_sd=100,
                       init_sd=1e-3)

    if False:
        #dir = '/Users/dpkingma/results/learn_z_x_mnist_binarized_50-(500, 500)_mog_1412689061/'
        #dir = '/Users/dpkingma/results/learn_z_x_svhn_bernoulli_300-(1000, 1000)_l1l2_sharing_and_1000HU_1412676966/'
        #dir = '/Users/dpkingma/results/learn_z_x_svhn_bernoulli_300-(1000, 1000)_l1l2_sharing_and_1000HU_1412695481/'
        #dir = '/Users/dpkingma/results/learn_z_x_mnist_binarized_50-(500, 500)_mog_1412695455/'
        #dir = '/Users/dpkingma/results/gpulearn_z_x_svhn_pca_300-(500, 500)__1413904756/'
        dir = '/home/ubuntu/results/gpulearn_z_x_mnist_50-[500, 500]__1414259423/'
        w = ndict.loadz(dir + 'w_best.ndict.tar.gz')
        v = ndict.loadz(dir + 'v_best.ndict.tar.gz')
        ndict.set_value(model.w, w)
        ndict.set_value(model.v, v)

    # Some statistics for optimization
    ll_valid_stats = [-1e99, 0]

    # Progress hook
    def hook(epoch, t, ll):

        if epoch % 10 != 0: return

        ll_valid, _ = model.est_loglik(x_valid,
                                       n_samples=L_valid,
                                       n_batch=n_batch,
                                       byteToFloat=byteToFloat)

        # Log
        ndict.savez(ndict.get_value(model.v), logdir + 'v')
        ndict.savez(ndict.get_value(model.w), logdir + 'w')

        if ll_valid > ll_valid_stats[0]:
            ll_valid_stats[0] = ll_valid
            ll_valid_stats[1] = 0
            ndict.savez(ndict.get_value(model.v), logdir + 'v_best')
            ndict.savez(ndict.get_value(model.w), logdir + 'w_best')
        else:
            ll_valid_stats[1] += 1
            # Stop when not improving validation set performance in 1000 iterations
            if ll_valid_stats[1] > 1000:
                print "Finished"
                with open(logdir + 'hook.txt', 'a') as f:
                    print >> f, "Finished"
                exit()

        print epoch, t, ll, ll_valid, ll_valid_stats
        with open(logdir + 'hook.txt', 'a') as f:
            print >> f, epoch, t, ll, ll_valid, ll_valid_stats

        # Graphics
        if gfx and epoch % gfx_freq == 0:

            #tail = '.png'
            tail = '-' + str(epoch) + '.png'

            v = {i: model.v[i].get_value() for i in model.v}
            w = {i: model.w[i].get_value() for i in model.w}

            if 'pca' not in dataset and 'random' not in dataset and 'normalized' not in dataset:

                if 'w0' in v:
                    image = paramgraphics.mat_to_img(f_dec(v['w0'][:].T),
                                                     dim_input,
                                                     True,
                                                     colorImg=colorImg)
                    image.save(logdir + 'q_w0' + tail, 'PNG')

                image = paramgraphics.mat_to_img(f_dec(w['out_w'][:]),
                                                 dim_input,
                                                 True,
                                                 colorImg=colorImg)
                image.save(logdir + 'out_w' + tail, 'PNG')

                if 'out_unif' in w:
                    image = paramgraphics.mat_to_img(f_dec(
                        w['out_unif'].reshape((-1, 1))),
                                                     dim_input,
                                                     True,
                                                     colorImg=colorImg)
                    image.save(logdir + 'out_unif' + tail, 'PNG')

                if n_z == 2:
                    n_width = 10
                    import scipy.stats
                    z = {'z': np.zeros((2, n_width**2))}
                    for i in range(0, n_width):
                        for j in range(0, n_width):
                            z['z'][0, n_width * i + j] = scipy.stats.norm.ppf(
                                float(i) / n_width + 0.5 / n_width)
                            z['z'][1, n_width * i + j] = scipy.stats.norm.ppf(
                                float(j) / n_width + 0.5 / n_width)

                    x, _, _z = model.gen_xz({}, z, n_width**2)
                    if dataset == 'mnist':
                        x = 1 - _z['x']
                    image = paramgraphics.mat_to_img(f_dec(_z['x']), dim_input)
                    image.save(logdir + '2dmanifold' + tail, 'PNG')
                else:
                    _x, _, _z_confab = model.gen_xz({}, {}, n_batch=144)
                    x_samples = _z_confab['x']
                    image = paramgraphics.mat_to_img(f_dec(x_samples),
                                                     dim_input,
                                                     colorImg=colorImg)
                    image.save(logdir + 'samples' + tail, 'PNG')

                    #x_samples = _x['x']
                    #image = paramgraphics.mat_to_img(x_samples, dim_input, colorImg=colorImg)
                    #image.save(logdir+'samples2'+tail, 'PNG')

            else:
                # Model with preprocessing

                if 'w0' in v:
                    image = paramgraphics.mat_to_img(f_dec(v['w0'][:].T),
                                                     dim_input,
                                                     True,
                                                     colorImg=colorImg)
                    image.save(logdir + 'q_w0' + tail, 'PNG')

                image = paramgraphics.mat_to_img(f_dec(w['out_w'][:]),
                                                 dim_input,
                                                 True,
                                                 colorImg=colorImg)
                image.save(logdir + 'out_w' + tail, 'PNG')

                _x, _, _z_confab = model.gen_xz({}, {}, n_batch=144)
                x_samples = f_dec(_z_confab['x'])
                x_samples = np.minimum(np.maximum(x_samples, 0), 1)
                image = paramgraphics.mat_to_img(x_samples,
                                                 dim_input,
                                                 colorImg=colorImg)
                image.save(logdir + 'samples' + tail, 'PNG')

    # Optimize
    #SFO
    dostep = epoch_vae_adam(model,
                            x,
                            n_batch=n_batch,
                            bernoulli_x=bernoulli_x,
                            byteToFloat=byteToFloat)
    loop_va(dostep, hook)

    pass
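Both hooks above build their 2-D latent manifold by mapping an evenly spaced n_width x n_width grid through the inverse CDF of the standard normal, so the decoded images cover the prior p(z) evenly. The standalone sketch below restates that grid construction using only numpy and scipy; latent_grid is a hypothetical helper name, and n_width mirrors the hard-coded value of 10 above.

import numpy as np
import scipy.stats


def latent_grid(n_width=10):
    # Each axis takes the Gaussian quantiles of the midpoints of n_width
    # equal-probability bins, giving a (2, n_width**2) array of latent codes.
    z = np.zeros((2, n_width ** 2))
    for i in range(n_width):
        for j in range(n_width):
            z[0, n_width * i + j] = scipy.stats.norm.ppf((i + 0.5) / n_width)
            z[1, n_width * i + j] = scipy.stats.norm.ppf((j + 0.5) / n_width)
    return z

Passing {'z': latent_grid()} to model.gen_xz, as the hooks do, decodes this grid into the '2dmanifold' images.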
Example #3
def main(n_z, n_hidden, dataset, seed, gfx=True, _size=None):
    '''Learn a variational auto-encoder with generative model p(x,y,z)=p(y)p(z)p(x|y,z).
    x and y are (always) observed.
    I.e. this cannot be used for semi-supervised learning
    '''
    assert (type(n_hidden) == tuple or type(n_hidden) == list)
    assert type(n_z) == int
    assert isinstance(dataset, str)

    print('gpulearn_yz_x', n_z, n_hidden, dataset, seed)

    import time
    logdir = 'results/gpulearn_yz_x_' + dataset + '_' + str(n_z) + '-' + str(
        n_hidden) + '-' + str(int(time.time())) + '/'
    if not os.path.exists(logdir): os.makedirs(logdir)
    print('logdir:', logdir)

    np.random.seed(seed)

    # Init data
    if dataset == 'mnist':
        '''
        What works well:
        100-2-100 (Generated digits stay bit shady)
        1000-2-1000 (Needs pretty long training)
        '''
        import anglepy.data.mnist as mnist

        # MNIST
        size = 28
        train_x, train_y, valid_x, valid_y, test_x, test_y = mnist.load_numpy(
            size, binarize_y=True)
        f_enc, f_dec = lambda x: x, lambda x: x
        x = {
            'x': train_x[:, :].astype(np.float32),
            'y': train_y[:, :].astype(np.float32)
        }
        x_valid = {
            'x': valid_x.astype(np.float32),
            'y': valid_y.astype(np.float32)
        }
        L_valid = 1
        dim_input = (size, size)
        n_x = size * size
        n_y = 10
        n_batch = 1000
        colorImg = False
        bernoulli_x = True
        byteToFloat = False
        mosaic_w = 5
        mosaic_h = 2
        type_px = 'bernoulli'

    elif dataset == 'norb':
        # resized NORB dataset, reshuffled
        import anglepy.data.norb as norb
        size = _size  #48
        train_x, train_y, test_x, test_y = norb.load_resized(size,
                                                             binarize_y=True)
        _x = {'x': train_x, 'y': train_y}
        ndict.shuffleCols(_x)
        train_x = _x['x']
        train_y = _x['y']

        # Do PCA
        f_enc, f_dec, pca_params = pp.PCA(_x['x'][:, :10000],
                                          cutoff=2000,
                                          toFloat=False)
        ndict.savez(pca_params, logdir + 'pca_params')

        x = {
            'x': f_enc(train_x).astype(np.float32),
            'y': train_y.astype(np.float32)
        }
        x_valid = {
            'x': f_enc(test_x).astype(np.float32),
            'y': test_y.astype(np.float32)
        }

        L_valid = 1
        n_x = x['x'].shape[0]
        n_y = 5
        dim_input = (size, size)
        n_batch = 1000
        colorImg = False
        bernoulli_x = False
        byteToFloat = False
        mosaic_w = 5
        mosaic_h = 1
        type_px = 'gaussian'

    elif dataset == 'norb_instances':
        # resized NORB dataset with the instances as classes
        import anglepy.data.norb2 as norb2
        size = _size  #48
        x, y = norb2.load_numpy_subclasses(size, binarize_y=True)
        _x = {'x': x, 'y': y}
        ndict.shuffleCols(_x)

        # Do pre-processing
        if True:
            # Works
            f_enc, f_dec, pca_params = pp.PCA(_x['x'][:, :10000],
                                              cutoff=600,
                                              global_sd=True,
                                              toFloat=True)
            ndict.savez(pca_params, logdir + 'pca_params')
        elif False:
            # Doesn't work
            f_enc, f_dec, pp_params = pp.normalize_noise(_x['x'][:, :50000],
                                                         noise_sd=0.01,
                                                         global_sd=True,
                                                         toFloat=True)
        else:
            # Doesn't work
            f_enc, f_dec, params = pp.normalize_random(x=x[:, :10000],
                                                       global_sd=True,
                                                       toFloat=True)
            ndict.savez(params, logdir + 'normalize_random_params')

        n_valid = 5000
        x = {
            'x': f_enc(_x['x'][:, :-n_valid]).astype(np.float32),
            'y': _x['y'][:, :-n_valid].astype(np.float32)
        }
        x_valid = {
            'x': f_enc(_x['x'][:, :n_valid]).astype(np.float32),
            'y': _x['y'][:, :n_valid].astype(np.float32)
        }

        L_valid = 1
        n_x = x['x'].shape[0]
        n_y = 50
        dim_input = (size, size)
        n_batch = 5000
        colorImg = False
        bernoulli_x = False
        byteToFloat = False
        mosaic_w = 5
        mosaic_h = 1
        type_px = 'gaussian'

    elif dataset == 'svhn':
        # SVHN dataset
        import anglepy.data.svhn as svhn
        size = 32
        train_x, train_y, test_x, test_y = svhn.load_numpy(
            False, binarize_y=True)  #norb.load_resized(size, binarize_y=True)
        extra_x, extra_y = svhn.load_numpy_extra(False, binarize_y=True)
        x = {
            'x': np.hstack((train_x, extra_x)),
            'y': np.hstack((train_y, extra_y))
        }
        ndict.shuffleCols(x)

        #f_enc, f_dec, (x_sd, x_mean) = pp.preprocess_normalize01(train_x, True)
        f_enc, f_dec, pca_params = pp.PCA(x['x'][:, :10000],
                                          cutoff=1000,
                                          toFloat=True)
        ndict.savez(pca_params, logdir + 'pca_params')

        n_y = 10
        x = {
            'x': f_enc(x['x']).astype(np.float32),
            'y': x['y'].astype(np.float32)
        }
        x_valid = {
            'x': f_enc(test_x).astype(np.float32),
            'y': test_y.astype(np.float32)
        }
        L_valid = 1
        n_x = x['x'].shape[0]
        dim_input = (size, size)
        n_batch = 5000
        colorImg = True
        bernoulli_x = False
        byteToFloat = False
        mosaic_w = 5
        mosaic_h = 2
        type_px = 'gaussian'

    # Init model
    n_hidden_q = n_hidden
    n_hidden_p = n_hidden
    from anglepy.models import GPUVAE_YZ_X
    updates = get_adam_optimizer(alpha=3e-4,
                                 beta1=0.9,
                                 beta2=0.999,
                                 weight_decay=0)
    model = GPUVAE_YZ_X(updates,
                        n_x,
                        n_y,
                        n_hidden_q,
                        n_z,
                        n_hidden_p[::-1],
                        'softplus',
                        'softplus',
                        type_px=type_px,
                        type_qz='gaussianmarg',
                        type_pz='gaussianmarg',
                        prior_sd=1,
                        uniform_y=True)

    if False:
        dir = '/home/ubuntu/results/gpulearn_yz_x_svhn_300-(500, 500)-1414094291/'
        dir = '/home/ubuntu/results/gpulearn_yz_x_svhn_300-(500, 500)-1414163488/'
        w = ndict.loadz(dir + 'w_best.ndict.tar.gz')
        v = ndict.loadz(dir + 'v_best.ndict.tar.gz')
        ndict.set_value(model.w, w)
        ndict.set_value(model.v, v)

    # Some statistics for optimization
    ll_valid_stats = [-1e99, 0]

    # Fixed sample for visualisation
    z_sample = {
        'z':
        np.repeat(np.random.standard_normal(size=(n_z, 12)), 12,
                  axis=1).astype(np.float32)
    }
    y_sample = {
        'y':
        np.tile(
            np.random.multinomial(1, [1. / n_y] * n_y, size=12).T, (1, 12))
    }

    # Progress hook
    def hook(epoch, t, ll):

        if epoch % 10 != 0:
            return

        ll_valid, _ = model.est_loglik(x_valid,
                                       n_samples=L_valid,
                                       n_batch=n_batch,
                                       byteToFloat=byteToFloat)

        if math.isnan(ll_valid):
            print("NaN detected. Reverting to saved best parameters")
            ndict.set_value(model.v, ndict.loadz(logdir + 'v.ndict.tar.gz'))
            ndict.set_value(model.w, ndict.loadz(logdir + 'w.ndict.tar.gz'))
            return

        if ll_valid > ll_valid_stats[0]:
            ll_valid_stats[0] = ll_valid
            ll_valid_stats[1] = 0
            ndict.savez(ndict.get_value(model.v), logdir + 'v_best')
            ndict.savez(ndict.get_value(model.w), logdir + 'w_best')
        else:
            ll_valid_stats[1] += 1
            # Early stopping is disabled here; it would otherwise stop after 1000 checks without validation improvement
            if False and ll_valid_stats[1] > 1000:
                print("Finished")
                with open(logdir + 'hook.txt', 'a') as f:
                    print("Finished", file=f)
                exit()

        # Log
        ndict.savez(ndict.get_value(model.v), logdir + 'v')
        ndict.savez(ndict.get_value(model.w), logdir + 'w')
        print(epoch, t, ll, ll_valid)
        with open(logdir + 'hook.txt', 'a') as f:
            print(t, ll, ll_valid, file=f)

        if gfx:
            # Graphics

            v = {i: model.v[i].get_value() for i in model.v}
            w = {i: model.w[i].get_value() for i in model.w}

            tail = '-' + str(epoch) + '.png'

            image = paramgraphics.mat_to_img(f_dec(v['w0x'][:].T),
                                             dim_input,
                                             True,
                                             colorImg=colorImg)
            image.save(logdir + 'q_w0x' + tail, 'PNG')

            image = paramgraphics.mat_to_img(f_dec(w['out_w'][:]),
                                             dim_input,
                                             True,
                                             colorImg=colorImg)
            image.save(logdir + 'out_w' + tail, 'PNG')

            _x = {'y': np.random.multinomial(1, [1. / n_y] * n_y, size=144).T}
            _, _, _z_confab = model.gen_xz(_x, {}, n_batch=144)
            image = paramgraphics.mat_to_img(f_dec(_z_confab['x']),
                                             dim_input,
                                             colorImg=colorImg)
            image.save(logdir + 'samples' + tail, 'PNG')

            _, _, _z_confab = model.gen_xz(y_sample, z_sample, n_batch=144)
            image = paramgraphics.mat_to_img(f_dec(_z_confab['x']),
                                             dim_input,
                                             colorImg=colorImg)
            image.save(logdir + 'samples_fixed' + tail, 'PNG')

            if n_z == 2:

                from PIL import Image, ImageDraw, ImageFont
                import scipy.stats  # used below for the quantile grid

                n_width = 10
                submosaic_offset = 15
                submosaic_width = (dim_input[1] * n_width)
                submosaic_height = (dim_input[0] * n_width)
                mosaic = Image.new(
                    "RGB", (submosaic_width * mosaic_w,
                            submosaic_offset + submosaic_height * mosaic_h))

                for digit in range(0, n_y):
                    if digit >= mosaic_h * mosaic_w: continue

                    _x = {}
                    n_batch_plot = n_width * n_width
                    _x['y'] = np.zeros((n_y, n_batch_plot))
                    _x['y'][digit, :] = 1
                    _z = {'z': np.zeros((2, n_width**2))}
                    for i in range(0, n_width):
                        for j in range(0, n_width):
                            _z['z'][0, n_width * i + j] = scipy.stats.norm.ppf(
                                float(i) / n_width + 0.5 / n_width)
                            _z['z'][1, n_width * i + j] = scipy.stats.norm.ppf(
                                float(j) / n_width + 0.5 / n_width)

                    _x, _, _z_confab = model.gen_xz(_x,
                                                    _z,
                                                    n_batch=n_batch_plot)
                    x_samples = _z_confab['x']
                    image = paramgraphics.mat_to_img(f_dec(x_samples),
                                                     dim_input,
                                                     colorImg=colorImg,
                                                     tile_spacing=(0, 0))

                    #image.save(logdir+'samples_digit_'+str(digit)+'_'+tail, 'PNG')
                    mosaic_x = (digit % mosaic_w) * submosaic_width
                    mosaic_y = submosaic_offset + int(
                        digit / mosaic_w) * submosaic_height
                    mosaic.paste(image, (mosaic_x, mosaic_y))

                draw = ImageDraw.Draw(mosaic)
                draw.text((1, 1),
                          "Epoch #" + str(epoch) + " Loss=" + str(int(ll)))

                #plt.savefig(logdir+'mosaic'+tail, format='PNG')
                mosaic.save(logdir + 'mosaic' + tail, 'PNG')

                #x_samples = _x['x']
                #image = paramgraphics.mat_to_img(f_dec(x_samples), dim_input, colorImg=colorImg)
                #image.save(logdir+'samples2'+tail, 'PNG')

    # Optimize
    dostep = epoch_vae_adam(model,
                            x,
                            n_batch=n_batch,
                            bernoulli_x=bernoulli_x,
                            byteToFloat=byteToFloat)
    loop_va(dostep, hook)

    pass
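For reference, the fixed visualisation batch defined above pairs 12 random latent codes (each repeated 12 times) with 12 random one-hot labels (tiled 12 times), so the 144 decoded columns show every code under every label. A small numpy-only sketch that reproduces those shapes, with illustrative sizes for n_z and n_y:

import numpy as np

n_z, n_y, k = 50, 10, 12  # illustrative sizes; k matches the 12 used above

# 12 latent codes, each repeated 12 times -> shape (n_z, 144)
z_fixed = np.repeat(np.random.standard_normal(size=(n_z, k)), k, axis=1)

# 12 one-hot class labels, tiled 12 times -> shape (n_y, 144)
y_fixed = np.tile(np.random.multinomial(1, [1. / n_y] * n_y, size=k).T, (1, k))

assert z_fixed.shape == (n_z, k * k)
assert y_fixed.shape == (n_y, k * k)

Column j of this batch therefore combines latent code j // 12 with label j % 12, which is what model.gen_xz(y_sample, z_sample, n_batch=144) renders into the 'samples_fixed' images.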