Esempio n. 1
0
def test_pca_compare_var():
    # create some random data
    num_samples = 10000
    dim = 10
    batch_size = 100
    num_components = 3

    # generate some data
    mean = np.random.random(dim)
    cov_factor = np.random.random((dim, dim))
    cov = np.dot(cov_factor, cov_factor.T)
    samples = be.float_tensor(
        np.random.multivariate_normal(mean, cov, size=num_samples))

    samples_train, samples_validate = batch.split_tensor(samples, 0.9)
    data = batch.Batch({
        'train':
        batch.InMemoryTable(samples_train, batch_size),
        'validate':
        batch.InMemoryTable(samples_validate, batch_size)
    })

    # find the principal directions
    pca_sgd = factorization.PCA.from_batch(data,
                                           num_components,
                                           epochs=10,
                                           grad_steps_per_minibatch=1,
                                           stepsize=0.01)
    pca_svd = factorization.PCA.from_svd(samples_train, num_components)

    assert be.norm(pca_sgd.var - pca_svd.var) / be.norm(pca_sgd.var) < 1e-1
Esempio n. 2
0
def test_grbm_reload():
    vis_layer = layers.BernoulliLayer(num_vis, center=True)
    hid_layer = layers.GaussianLayer(num_hid, center=True)

    # create some extrinsics
    grbm = BoltzmannMachine([vis_layer, hid_layer])
    data = batch.Batch({
        'train':
        batch.InMemoryTable(be.randn((10 * num_samples, num_vis)), num_samples)
    })
    grbm.initialize(data)
    with tempfile.NamedTemporaryFile() as file:
        # save the model
        store = pandas.HDFStore(file.name, mode='w')
        grbm.save(store)
        store.close()
        # reload
        store = pandas.HDFStore(file.name, mode='r')
        grbm_reload = BoltzmannMachine.from_saved(store)
        store.close()
    # check the two models are consistent
    vis_data = vis_layer.random((num_samples, num_vis))
    data_state = State.from_visible(vis_data, grbm)
    vis_orig = grbm.deterministic_iteration(1, data_state)[0]
    vis_reload = grbm_reload.deterministic_iteration(1, data_state)[0]
    assert be.allclose(vis_orig, vis_reload)
    assert be.allclose(grbm.layers[0].moments.mean,
                       grbm_reload.layers[0].moments.mean)
    assert be.allclose(grbm.layers[0].moments.var,
                       grbm_reload.layers[0].moments.var)
    assert be.allclose(grbm.layers[1].moments.mean,
                       grbm_reload.layers[1].moments.mean)
    assert be.allclose(grbm.layers[1].moments.var,
                       grbm_reload.layers[1].moments.var)
Esempio n. 3
0
def example_mnist_rbm(paysage_path=None, show_plot = False):
    num_hidden_units = 500
    batch_size = 50
    num_epochs = 10
    learning_rate = 0.01
    mc_steps = 1

    (_, _, shuffled_filepath) = \
            util.default_paths(paysage_path)

    # set up the reader to get minibatches
    data = batch.Batch(shuffled_filepath,
                       'train/images',
                       batch_size,
                       transform=batch.binarize_color,
                       train_fraction=0.99)

    # set up the model and initialize the parameters
    vis_layer = layers.BernoulliLayer(data.ncols)
    hid_layer = layers.BernoulliLayer(num_hidden_units)

    rbm = hidden.Model([vis_layer, hid_layer])
    rbm.initialize(data)

    # set up the optimizer and the fit method
    opt = optimizers.ADAM(rbm,
                          stepsize=learning_rate,
                          scheduler=optimizers.PowerLawDecay(0.1))

    sampler = fit.DrivenSequentialMC.from_batch(rbm, data,
                                                method='stochastic')

    cd = fit.PCD(rbm, data, opt, sampler,
                 num_epochs, mcsteps=mc_steps, skip=200,
                 metrics=[M.ReconstructionError(),
                          M.EnergyDistance(),
                          M.EnergyGap(),
                          M.EnergyZscore()])

    # fit the model
    print('training with contrastive divergence')
    cd.train()

    # evaluate the model
    # this will be the same as the final epoch results
    # it is repeated here to be consistent with the sklearn rbm example
    metrics = [M.ReconstructionError(), M.EnergyDistance(),
               M.EnergyGap(), M.EnergyZscore()]
    performance = fit.ProgressMonitor(0, data, metrics=metrics)

    util.show_metrics(rbm, performance)
    util.show_reconstructions(rbm, data.get('validate'), fit, show_plot)
    util.show_fantasy_particles(rbm, data.get('validate'), fit, show_plot)
    util.show_weights(rbm, show_plot)

    # close the HDF5 store
    data.close()
    print("Done")
Esempio n. 4
0
def example_mnist_hopfield(paysage_path=None, num_epochs=10, show_plot=False):

    num_hidden_units = 500
    batch_size = 50
    learning_rate = 0.001
    mc_steps = 1

    (_, _, shuffled_filepath) = \
        util.default_paths(paysage_path)

    # set up the reader to get minibatches
    data = batch.Batch(shuffled_filepath,
                       'train/images',
                       batch_size,
                       transform=batch.binarize_color,
                       train_fraction=0.99)

    # set up the model and initialize the parameters
    vis_layer = layers.BernoulliLayer(data.ncols)
    hid_layer = layers.GaussianLayer(num_hidden_units)

    rbm = model.Model([vis_layer, hid_layer])
    rbm.initialize(data)

    metrics = [
        'ReconstructionError', 'EnergyDistance', 'EnergyGap', 'EnergyZscore'
    ]
    perf = fit.ProgressMonitor(data, metrics=metrics)

    # set up the optimizer and the fit method
    opt = optimizers.ADAM(stepsize=learning_rate,
                          scheduler=optimizers.PowerLawDecay(0.1))

    sampler = fit.DrivenSequentialMC.from_batch(rbm, data, method='stochastic')

    cd = fit.SGD(rbm,
                 data,
                 opt,
                 num_epochs,
                 method=fit.pcd,
                 sampler=sampler,
                 mcsteps=mc_steps,
                 monitor=perf)

    # fit the model
    print('training with contrastive divergence')
    cd.train()

    # evaluate the model
    util.show_metrics(rbm, perf)
    util.show_reconstructions(rbm, data.get('validate'), fit, show_plot)
    util.show_fantasy_particles(rbm, data.get('validate'), fit, show_plot)
    util.show_weights(rbm, show_plot)

    # close the HDF5 store
    data.close()
    print("Done")
Esempio n. 5
0
def test_gaussian_1D_1mode_train():
    # create some example data
    num = 10000
    mu = 3
    sigma = 1
    samples = be.randn((num, 1)) * sigma + mu

    # set up the reader to get minibatches
    batch_size = 100
    samples_train, samples_validate = batch.split_tensor(samples, 0.9)
    data = batch.Batch({
        'train':
        batch.InMemoryTable(samples_train, batch_size),
        'validate':
        batch.InMemoryTable(samples_validate, batch_size)
    })

    # parameters
    learning_rate = schedules.PowerLawDecay(initial=0.1, coefficient=0.1)
    mc_steps = 1
    num_epochs = 10
    num_sample_steps = 100

    # set up the model and initialize the parameters
    vis_layer = layers.GaussianLayer(1)
    hid_layer = layers.OneHotLayer(1)

    rbm = BoltzmannMachine([vis_layer, hid_layer])
    rbm.initialize(data, method='hinton')

    # modify the parameters to shift the initialized model from the data
    # this forces it to train
    rbm.layers[0].params = layers.ParamsGaussian(
        rbm.layers[0].params.loc - 3, rbm.layers[0].params.log_var - 1)

    # set up the optimizer and the fit method
    opt = optimizers.ADAM(stepsize=learning_rate)
    cd = fit.SGD(rbm, data)

    # fit the model
    print('training with persistent contrastive divergence')
    cd.train(opt, num_epochs, method=fit.pcd, mcsteps=mc_steps)

    # sample data from the trained model
    model_state = \
        samplers.SequentialMC.generate_fantasy_state(rbm, num, num_sample_steps)
    pts_trained = model_state[0]

    percent_error = 10
    mu_trained = be.mean(pts_trained)
    assert numpy.abs(mu_trained / mu - 1) < (percent_error / 100)

    sigma_trained = numpy.sqrt(be.var(pts_trained))
    assert numpy.abs(sigma_trained / sigma - 1) < (percent_error / 100)
Esempio n. 6
0
def test_grbm_save():
    vis_layer = layers.BernoulliLayer(num_vis, center=True)
    hid_layer = layers.GaussianLayer(num_hid, center=True)
    grbm = BoltzmannMachine([vis_layer, hid_layer])
    data = batch.Batch({
        'train':
        batch.InMemoryTable(be.randn((10 * num_samples, num_vis)), num_samples)
    })
    grbm.initialize(data)
    with tempfile.NamedTemporaryFile() as file:
        store = pandas.HDFStore(file.name, mode='w')
        grbm.save(store)
        store.close()
def example_mnist_tap_machine(paysage_path=None, num_epochs = 10, show_plot=True):

    num_hidden_units = 256
    batch_size = 100
    learning_rate = 0.1

    (_, _, shuffled_filepath) = \
            util.default_paths(paysage_path)

    # set up the reader to get minibatches
    data = batch.Batch(shuffled_filepath,
                       'train/images',
                       batch_size,
                       transform=batch.binarize_color,
                       train_fraction=0.95)

    # set up the model and initialize the parameters
    vis_layer = layers.BernoulliLayer(data.ncols)
    hid_layer = layers.BernoulliLayer(num_hidden_units)

    rbm = tap_machine.TAP_rbm([vis_layer, hid_layer], num_persistent_samples=0,
                              tolerance_EMF=1e-4, max_iters_EMF=25, terms=2)
    rbm.initialize(data, 'glorot_normal')

    perf  = fit.ProgressMonitor(data,
                                metrics=['ReconstructionError',
                                         'EnergyDistance'])

    opt = optimizers.Gradient(stepsize=learning_rate,
                              scheduler=optimizers.PowerLawDecay(0.1),
                              tolerance=1e-4,
                              ascent=True)

    sgd = fit.SGD(rbm, data, opt, num_epochs, method=fit.tap, monitor=perf)

    # fit the model
    print('training with stochastic gradient ascent ')
    sgd.train()

    util.show_metrics(rbm, perf)
    util.show_reconstructions(rbm, data.get('validate'), fit, show_plot)
    util.show_fantasy_particles(rbm, data.get('validate'), fit, show_plot)
    util.show_weights(rbm, show_plot)

    # close the HDF5 store
    data.close()
    print("Done")
Esempio n. 8
0
def test_pca_save_read_num_components():
    # create some random data
    num_samples = 10000
    dim = 10
    batch_size = 100
    num_components = 3
    num_components_save = 2

    # generate some data
    mean = np.random.random(dim)
    cov_factor = np.random.random((dim, dim))
    cov = np.dot(cov_factor, cov_factor.T)
    samples = be.float_tensor(
        np.random.multivariate_normal(mean, cov, size=num_samples))

    samples_train, samples_validate = batch.split_tensor(samples, 0.9)
    data = batch.Batch({
        'train':
        batch.InMemoryTable(samples_train, batch_size),
        'validate':
        batch.InMemoryTable(samples_validate, batch_size)
    })

    # find the principal directions
    pca = factorization.PCA.from_batch(data,
                                       num_components,
                                       epochs=10,
                                       grad_steps_per_minibatch=1,
                                       stepsize=0.01)

    # save it
    pca_file = tempfile.NamedTemporaryFile()
    store = pd.HDFStore(pca_file.name, mode="w")
    pca.save(store, num_components_save=num_components_save)

    # read it
    pca_read = factorization.PCA.from_saved(store)
    store.close()

    # check it
    assert be.allclose(pca.W[:, :num_components_save], pca_read.W)
    assert be.allclose(pca.var[:num_components_save], pca_read.var)
    assert pca.stepsize == pca_read.stepsize
    assert pca_read.num_components == num_components_save
Esempio n. 9
0
def test_hdf_batch():
    # the temporary storage file
    store_file = tempfile.NamedTemporaryFile()

    # create data
    num_rows = 10000
    num_cols = 10
    df_A = pd.DataFrame(np.arange(num_rows*num_cols).reshape(num_rows, num_cols))
    df_B = df_A + num_rows*num_cols

    # save it
    with pd.HDFStore(store_file.name, mode="w", format="table") as store:
        store.append("train", df_A)
        store.append("validate", df_B)

    # read it back with the HDFtable
    batch_size = 1000
    num_train_batches = num_rows // batch_size
    data = batch.Batch(
            {"train": batch.HDFtable(store_file.name, "train", batch_size),
             "validate": batch.HDFtable(store_file.name, "validate", batch_size)})

    # loop through thrice, checking the data
    for i_loop in range(3):
        i_batch = 0
        while True:
            # get the data
            try:
                batch_data_train = data.get("train")
                batch_data_validate = data.get("validate")
            except StopIteration:
                assert i_batch == num_train_batches
                i_batch = 0
                data.reset_generator("all")
                break

            # check it
            assert np.all(be.to_numpy_array(batch_data_train) == \
                df_A.values[i_batch * batch_size: (i_batch + 1) * batch_size])
            assert np.all(be.to_numpy_array(batch_data_validate) == \
                df_B.values[i_batch * batch_size: (i_batch + 1) * batch_size])

            i_batch += 1
Esempio n. 10
0
def test_in_memory_batch():
    # create data
    num_rows = 10000
    num_cols = 10
    tensor = be.rand((num_rows, num_cols))

    # read it back with Batch
    batch_size = 1000
    num_train_batches = num_rows // batch_size
    with batch.Batch({
            'train': batch.InMemoryTable(tensor, batch_size),
            'validate': batch.InMemoryTable(tensor, batch_size)
    }) as data:

        # loop through thrice, checking the data
        i_batch = 0
        while True:
            # get the data
            try:
                batch_data_train = data.get("train")
                batch_data_validate = data.get("validate")
            except StopIteration:
                assert i_batch == num_train_batches
                i_batch = 0
                data.reset_generator("all")
                break

            # check it
            assert be.allclose(
                batch_data_train,
                tensor[i_batch * batch_size:(i_batch + 1) * batch_size])
            assert be.allclose(
                batch_data_validate,
                tensor[i_batch * batch_size:(i_batch + 1) * batch_size])

            i_batch += 1
Esempio n. 11
0
def test_rbm(paysage_path=None):

    num_hidden_units = 50
    batch_size = 50
    num_epochs = 1
    learning_rate = schedules.PowerLawDecay(initial=0.01, coefficient=0.1)
    mc_steps = 1

    if not paysage_path:
        paysage_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
    filepath = os.path.join(paysage_path, 'examples', 'mnist', 'mnist.h5')

    if not os.path.exists(filepath):
        raise IOError(
            "{} does not exist. run mnist/download_mnist.py to fetch from the web"
            .format(filepath))

    shuffled_filepath = os.path.join(paysage_path, 'examples', 'mnist',
                                     'shuffled_mnist.h5')

    # shuffle the data
    if not os.path.exists(shuffled_filepath):
        shuffler = batch.DataShuffler(filepath, shuffled_filepath, complevel=0)
        shuffler.shuffle()

    # set a seed for the random number generator
    be.set_seed()

    import pandas
    samples = pre.binarize_color(
        be.float_tensor(
            pandas.read_hdf(shuffled_filepath,
                            key='train/images').values[:10000]))
    samples_train, samples_validate = batch.split_tensor(samples, 0.95)
    data = batch.Batch({
        'train':
        batch.InMemoryTable(samples_train, batch_size),
        'validate':
        batch.InMemoryTable(samples_validate, batch_size)
    })

    # set up the model and initialize the parameters
    vis_layer = layers.BernoulliLayer(data.ncols)
    hid_layer = layers.BernoulliLayer(num_hidden_units)

    rbm = BoltzmannMachine([vis_layer, hid_layer])
    rbm.initialize(data)

    # obtain initial estimate of the reconstruction error
    perf = ProgressMonitor()
    untrained_performance = perf.epoch_update(data,
                                              rbm,
                                              store=True,
                                              show=False)

    # set up the optimizer and the fit method
    opt = optimizers.RMSProp(stepsize=learning_rate)
    cd = fit.SGD(rbm, data)

    # fit the model
    print('training with contrastive divergence')
    cd.train(opt, num_epochs, method=fit.pcd, mcsteps=mc_steps)

    # obtain an estimate of the reconstruction error after 1 epoch
    trained_performance = cd.monitor.memory[-1]

    assert (trained_performance['ReconstructionError'] <
            untrained_performance['ReconstructionError']), \
    "Reconstruction error did not decrease"

    # close the HDF5 store
    data.close()
Esempio n. 12
0
def test_tap_machine(paysage_path=None):
    num_hidden_units = 10
    batch_size = 50
    num_epochs = 1
    learning_rate = 0.01

    if not paysage_path:
        paysage_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
    filepath = os.path.join(paysage_path, 'mnist', 'mnist.h5')

    if not os.path.exists(filepath):
        raise IOError(
            "{} does not exist. run mnist/download_mnist.py to fetch from the web"
            .format(filepath))

    shuffled_filepath = os.path.join(paysage_path, 'mnist',
                                     'shuffled_mnist.h5')

    # shuffle the data
    if not os.path.exists(shuffled_filepath):
        shuffler = batch.DataShuffler(filepath, shuffled_filepath, complevel=0)
        shuffler.shuffle()

    # set a seed for the random number generator
    be.set_seed()

    # set up the reader to get minibatches
    data = batch.Batch(shuffled_filepath,
                       'train/images',
                       batch_size,
                       transform=batch.binarize_color,
                       train_fraction=0.1)

    # set up the model and initialize the parameters
    vis_layer = layers.BernoulliLayer(data.ncols)
    hid_layer = layers.BernoulliLayer(num_hidden_units)

    rbm = tap_machine.TAP_rbm([vis_layer, hid_layer],
                              tolerance_EMF=1e-2,
                              max_iters_EMF=50)
    rbm.initialize(data)

    # obtain initial estimate of the reconstruction error
    perf = fit.ProgressMonitor(data, metrics=['ReconstructionError'])
    untrained_performance = perf.check_progress(rbm)

    # set up the optimizer and the fit method
    opt = optimizers.Gradient(stepsize=learning_rate,
                              scheduler=optimizers.PowerLawDecay(0.1),
                              tolerance=1e-3,
                              ascent=True)

    solver = fit.SGD(rbm, data, opt, num_epochs, method=fit.tap, monitor=perf)

    # fit the model
    print('training with stochastic gradient ascent')
    solver.train()

    # obtain an estimate of the reconstruction error after 1 epoch
    trained_performance = perf.check_progress(rbm)

    assert (trained_performance['ReconstructionError'] <
            untrained_performance['ReconstructionError']), \
    "Reconstruction error did not decrease"

    # close the HDF5 store
    data.close()
Esempio n. 13
0
def test_tap_machine(paysage_path=None):
    num_hidden_units = 10
    batch_size = 100
    num_epochs = 5
    learning_rate = schedules.PowerLawDecay(initial=0.1, coefficient=1.0)

    if not paysage_path:
        paysage_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
    filepath = os.path.join(paysage_path, 'examples', 'mnist', 'mnist.h5')

    if not os.path.exists(filepath):
        raise IOError(
            "{} does not exist. run mnist/download_mnist.py to fetch from the web"
            .format(filepath))

    shuffled_filepath = os.path.join(paysage_path, 'examples', 'mnist',
                                     'shuffled_mnist.h5')

    # shuffle the data
    if not os.path.exists(shuffled_filepath):
        shuffler = batch.DataShuffler(filepath, shuffled_filepath, complevel=0)
        shuffler.shuffle()

    # set a seed for the random number generator
    be.set_seed()

    # set up the reader to get minibatches
    samples = pre.binarize_color(
        be.float_tensor(
            pandas.read_hdf(shuffled_filepath,
                            key='train/images').as_matrix()[:10000]))
    samples_train, samples_validate = batch.split_tensor(samples, 0.95)
    data = batch.Batch({
        'train':
        batch.InMemoryTable(samples_train, batch_size),
        'validate':
        batch.InMemoryTable(samples_validate, batch_size)
    })

    # set up the model and initialize the parameters
    vis_layer = layers.BernoulliLayer(data.ncols)
    hid_layer = layers.BernoulliLayer(num_hidden_units)

    rbm = BoltzmannMachine([vis_layer, hid_layer])
    rbm.initialize(data)

    # obtain initial estimate of the reconstruction error
    perf = ProgressMonitor(generator_metrics = \
            [ReconstructionError(), TAPLogLikelihood(10), TAPFreeEnergy(10)])
    untrained_performance = perf.epoch_update(data,
                                              rbm,
                                              store=True,
                                              show=False)

    # set up the optimizer and the fit method
    opt = optimizers.Gradient(stepsize=learning_rate, tolerance=1e-5)
    tap = fit.TAP(True, 0.1, 0.01, 25, True, 0.5, 0.001, 0.0)
    solver = fit.SGD(rbm, data)
    solver.monitor.generator_metrics.append(TAPLogLikelihood(10))
    solver.monitor.generator_metrics.append(TAPFreeEnergy(10))

    # fit the model
    print('training with stochastic gradient ascent')
    solver.train(opt, num_epochs, method=tap.tap_update)

    # obtain an estimate of the reconstruction error after 1 epoch
    trained_performance = solver.monitor.memory[-1]

    assert (trained_performance['TAPLogLikelihood'] >
            untrained_performance['TAPLogLikelihood']), \
    "TAP log-likelihood did not increase"
    assert (trained_performance['ReconstructionError'] <
            untrained_performance['ReconstructionError']), \
    "Reconstruction error did not decrease"

    # close the HDF5 store
    data.close()
Esempio n. 14
0
def test_rbm(paysage_path=None):
    """TODO : this is just a placeholder, need to clean up & simplifiy setup. Also
    need to figure how to deal with consistent random seeding throughout the
    codebase to obtain deterministic checkable results.
    """
    num_hidden_units = 50
    batch_size = 50
    num_epochs = 1
    learning_rate = 0.01
    mc_steps = 1

    if not paysage_path:
        paysage_path = os.path.dirname(
            os.path.dirname(os.path.abspath(__file__)))
    filepath = os.path.join(paysage_path, 'mnist', 'mnist.h5')

    if not os.path.exists(filepath):
        raise IOError(
            "{} does not exist. run mnist/download_mnist.py to fetch from the web"
            .format(filepath))

    shuffled_filepath = os.path.join(paysage_path, 'mnist',
                                     'shuffled_mnist.h5')

    # shuffle the data
    if not os.path.exists(shuffled_filepath):
        shuffler = batch.DataShuffler(filepath, shuffled_filepath, complevel=0)
        shuffler.shuffle()

    # set a seed for the random number generator
    be.set_seed()

    # set up the reader to get minibatches
    data = batch.Batch(shuffled_filepath,
                       'train/images',
                       batch_size,
                       transform=batch.binarize_color,
                       train_fraction=0.99)

    # set up the model and initialize the parameters
    vis_layer = layers.BernoulliLayer(data.ncols)
    hid_layer = layers.BernoulliLayer(num_hidden_units)

    rbm = hidden.Model([vis_layer, hid_layer])
    rbm.initialize(data)

    # obtain initial estimate of the reconstruction error
    perf = fit.ProgressMonitor(0, data, metrics=[M.ReconstructionError()])
    untrained_performance = perf.check_progress(rbm, 0)

    # set up the optimizer and the fit method
    opt = optimizers.RMSProp(rbm,
                             stepsize=learning_rate,
                             scheduler=optimizers.PowerLawDecay(0.1))

    sampler = fit.DrivenSequentialMC.from_batch(rbm, data, method='stochastic')

    cd = fit.PCD(rbm,
                 data,
                 opt,
                 sampler,
                 num_epochs,
                 mcsteps=mc_steps,
                 skip=200,
                 metrics=[M.ReconstructionError()])

    # fit the model
    print('training with contrastive divergence')
    cd.train()

    # obtain an estimate of the reconstruction error after 1 epoch
    trained_performance = perf.check_progress(rbm, 0)

    assert (trained_performance['ReconstructionError'] <
            untrained_performance['ReconstructionError']), \
    "Reconstruction error did not decrease"

    # close the HDF5 store
    data.close()