def test_mean_variance_calculator():
    # create some random data
    s = be.rand((100000,))
    # reference result
    ref_mean = be.mean(s)
    # do the online calculation
    mv = math_utils.MeanVarianceCalculator()
    for i in range(10):
        mv.update(s[i * 10000:(i + 1) * 10000])
    assert be.allclose(be.float_tensor(np.array([ref_mean])),
                       be.float_tensor(np.array([mv.mean])))

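# A minimal numpy sketch of the pooled update that MeanVarianceCalculator
# presumably performs on each chunk (assumption: the standard parallel
# mean/variance combination, e.g. Chan et al.); all names here are
# illustrative, not paysage's API.

import numpy as np

class _StreamingMeanVar:
    """Combine per-chunk statistics into a running mean and variance."""
    def __init__(self):
        self.num = 0
        self.mean = 0.0
        self.m2 = 0.0  # sum of squared deviations from the running mean

    def update(self, chunk):
        chunk = np.asarray(chunk, dtype=float)
        n_b = chunk.size
        mean_b = chunk.mean()
        m2_b = ((chunk - mean_b) ** 2).sum()
        delta = mean_b - self.mean
        n_total = self.num + n_b
        # pooled mean and pooled sum of squared deviations
        self.mean += delta * n_b / n_total
        self.m2 += m2_b + delta ** 2 * self.num * n_b / n_total
        self.num = n_total

    @property
    def var(self):
        return self.m2 / self.num  # population variance

# sanity check against a one-shot computation
x = np.random.rand(100000)
smv = _StreamingMeanVar()
for c in np.split(x, 10):
    smv.update(c)
assert np.isclose(smv.mean, x.mean()) and np.isclose(smv.var, x.var())
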
def test_pca_compare_var():
    # create some random data
    num_samples = 10000
    dim = 10
    batch_size = 100
    num_components = 3

    # generate some data
    mean = np.random.random(dim)
    cov_factor = np.random.random((dim, dim))
    cov = np.dot(cov_factor, cov_factor.T)
    samples = be.float_tensor(
        np.random.multivariate_normal(mean, cov, size=num_samples))

    samples_train, samples_validate = batch.split_tensor(samples, 0.9)
    data = batch.Batch(
        {'train': batch.InMemoryTable(samples_train, batch_size),
         'validate': batch.InMemoryTable(samples_validate, batch_size)})

    # find the principal directions
    pca_sgd = factorization.PCA.from_batch(data, num_components, epochs=10,
                                           grad_steps_per_minibatch=1,
                                           stepsize=0.01)
    pca_svd = factorization.PCA.from_svd(samples_train, num_components)

    # the SGD estimate of the component variances should be within 10%
    # (relative, in norm) of the exact SVD result
    assert be.norm(pca_sgd.var - pca_svd.var) / be.norm(pca_sgd.var) < 1e-1

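# from_batch estimates the principal directions with minibatch gradient
# steps. For intuition, a minimal numpy sketch of Oja's rule for the single
# leading component (an illustration of the general idea, not paysage's
# actual update):

import numpy as np

def oja_top_component(X, stepsize=0.01, epochs=10, batch_size=100, seed=0):
    """Estimate the leading principal direction of centered data X by SGD."""
    rng = np.random.default_rng(seed)
    n, d = X.shape
    w = rng.standard_normal(d)
    w /= np.linalg.norm(w)
    for _ in range(epochs):
        for start in range(0, n, batch_size):
            xb = X[start:start + batch_size]
            y = xb @ w                                    # projections onto w
            w += stepsize * (xb.T @ y - (y @ y) * w) / len(xb)
            w /= np.linalg.norm(w)                        # keep w on the sphere
    return w

# w should align (up to sign) with the top right-singular vector of X
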
def test_pca_svd_save_read():
    # create some random data
    num_samples = 10000
    dim = 10
    num_components = 3

    # generate some data
    mean = np.random.random(dim)
    cov_factor = np.random.random((dim, dim))
    cov = np.dot(cov_factor, cov_factor.T)
    samples = be.float_tensor(
        np.random.multivariate_normal(mean, cov, size=num_samples))

    # find the principal directions
    pca = factorization.PCA.from_svd(samples, num_components)

    # save it
    pca_file = tempfile.NamedTemporaryFile()
    store = pd.HDFStore(pca_file.name, mode="w")
    pca.save(store)

    # read it
    pca_read = factorization.PCA.from_saved(store)
    store.close()

    # check it
    assert be.allclose(pca.W, pca_read.W)
    assert be.allclose(pca.var, pca_read.var)
    assert pca.stepsize == pca_read.stepsize
    assert pca.num_components == pca_read.num_components

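# The save/read pattern above rides on pandas' HDFStore. A stripped-down
# round trip of the same idea in plain pandas (the key names here are
# hypothetical, not the ones pca.save actually writes):

import tempfile
import pandas as pd

f = tempfile.NamedTemporaryFile()
with pd.HDFStore(f.name, mode="w") as store:
    store.put("pca/W", pd.DataFrame([[0.1, 0.2], [0.3, 0.4]]))
    store.put("pca/var", pd.DataFrame([1.5, 0.5]))
with pd.HDFStore(f.name, mode="r") as store:
    W = store.get("pca/W")
    var = store.get("pca/var")
assert W.shape == (2, 2) and var.shape == (2, 1)
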
def test_mean():
    # create some random data
    num = 100000
    num_steps = 10
    stepsize = num // num_steps
    s = be.rand((num,))
    # reference result
    ref_mean = be.mean(s)
    # do the online calculation
    mv = math_utils.MeanCalculator()
    for i in range(num_steps):
        mv.update(s[i*stepsize:(i+1)*stepsize])
    assert be.allclose(be.float_tensor(np.array([ref_mean])),
                       be.float_tensor(np.array([mv.mean])))

@classmethod
def from_dataframe(cls, df):
    """
    Create a MeanVarianceArrayCalculator from a DataFrame.

    Args:
        df (DataFrame): the parameters, stored as a DataFrame.

    Returns:
        MeanVarianceArrayCalculator

    """
    mvac = cls()
    mvac.num = (df["num"].astype(int))[0]  # constant column
    mvac.mean = be.float_tensor(df["mean"].astype(float))
    mvac.var = be.float_tensor(df["var"].astype(float))
    mvac.square = be.float_tensor(df["square"].astype(float))
    return mvac

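# For round-tripping, the calculator presumably serializes itself into the
# same single-DataFrame layout this reads: array-valued columns plus a
# constant "num" column. A hypothetical to_dataframe counterpart, sketched
# under that assumption:

import pandas as pd

def to_dataframe(mvac):
    """Sketch of the inverse of from_dataframe (assumed layout)."""
    n = len(mvac.mean)
    return pd.DataFrame({
        "num": [int(mvac.num)] * n,  # scalar broadcast to a constant column
        "mean": be.to_numpy_array(mvac.mean),
        "var": be.to_numpy_array(mvac.var),
        "square": be.to_numpy_array(mvac.square),
    })
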
def create_batch(batch_size, train_fraction=0.95, transform=be.do_nothing):
    """
    Create a Batch reader.

    Args:
        batch_size (int): the minibatch size.
        train_fraction (float): the fraction of the data used for training.
        transform (callable): the transform function.

    Returns:
        data (Batch): a batcher.

    """
    samples = be.float_tensor(
        pandas.read_hdf(default_paths(), key='train/images').values)
    return batch.in_memory_batch(samples, batch_size, train_fraction,
                                 transform)

def test_pca_svd():
    # create some random data
    num_samples = 10000
    dim = 10
    num_components = 3

    # generate some data
    mean = np.random.random(dim)
    cov_factor = np.random.random((dim, dim))
    cov = np.dot(cov_factor, cov_factor.T)
    samples = be.float_tensor(
        np.random.multivariate_normal(mean, cov, size=num_samples))

    # find the principal directions
    pca = factorization.PCA.from_svd(samples, num_components)

    assert be.shape(pca.W) == (dim, num_components)
    assert be.shape(pca.var) == (num_components,)

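# For reference, PCA via SVD in plain numpy (a sketch of the standard
# construction from_svd presumably follows: center, factor, truncate):

import numpy as np

def pca_svd(X, k):
    """Return the top-k principal directions W (dim x k) and their variances."""
    Xc = X - X.mean(axis=0)                  # center each feature
    _, s, Vt = np.linalg.svd(Xc, full_matrices=False)
    W = Vt[:k].T                             # principal directions as columns
    var = s[:k] ** 2 / (len(X) - 1)          # variance along each direction
    return W, var
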
def test_pca_save_read_num_components():
    # create some random data
    num_samples = 10000
    dim = 10
    batch_size = 100
    num_components = 3
    num_components_save = 2

    # generate some data
    mean = np.random.random(dim)
    cov_factor = np.random.random((dim, dim))
    cov = np.dot(cov_factor, cov_factor.T)
    samples = be.float_tensor(
        np.random.multivariate_normal(mean, cov, size=num_samples))

    samples_train, samples_validate = batch.split_tensor(samples, 0.9)
    data = batch.Batch(
        {'train': batch.InMemoryTable(samples_train, batch_size),
         'validate': batch.InMemoryTable(samples_validate, batch_size)})

    # find the principal directions
    pca = factorization.PCA.from_batch(data, num_components, epochs=10,
                                       grad_steps_per_minibatch=1,
                                       stepsize=0.01)

    # save it
    pca_file = tempfile.NamedTemporaryFile()
    store = pd.HDFStore(pca_file.name, mode="w")
    pca.save(store, num_components_save=num_components_save)

    # read it
    pca_read = factorization.PCA.from_saved(store)
    store.close()

    # check it
    assert be.allclose(pca.W[:, :num_components_save], pca_read.W)
    assert be.allclose(pca.var[:num_components_save], pca_read.var)
    assert pca.stepsize == pca_read.stepsize
    assert pca_read.num_components == num_components_save

def test_rbm(paysage_path=None):
    num_hidden_units = 50
    batch_size = 50
    num_epochs = 1
    learning_rate = schedules.PowerLawDecay(initial=0.01, coefficient=0.1)
    mc_steps = 1

    if not paysage_path:
        paysage_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    filepath = os.path.join(paysage_path, 'examples', 'mnist', 'mnist.h5')

    if not os.path.exists(filepath):
        raise IOError(
            "{} does not exist. Run mnist/download_mnist.py to fetch it from the web."
            .format(filepath))

    shuffled_filepath = os.path.join(paysage_path, 'examples', 'mnist',
                                     'shuffled_mnist.h5')

    # shuffle the data
    if not os.path.exists(shuffled_filepath):
        shuffler = batch.DataShuffler(filepath, shuffled_filepath, complevel=0)
        shuffler.shuffle()

    # set a seed for the random number generator
    be.set_seed()

    import pandas
    samples = pre.binarize_color(
        be.float_tensor(
            pandas.read_hdf(shuffled_filepath, key='train/images').values[:10000]))
    samples_train, samples_validate = batch.split_tensor(samples, 0.95)
    data = batch.Batch(
        {'train': batch.InMemoryTable(samples_train, batch_size),
         'validate': batch.InMemoryTable(samples_validate, batch_size)})

    # set up the model and initialize the parameters
    vis_layer = layers.BernoulliLayer(data.ncols)
    hid_layer = layers.BernoulliLayer(num_hidden_units)

    rbm = BoltzmannMachine([vis_layer, hid_layer])
    rbm.initialize(data)

    # obtain initial estimate of the reconstruction error
    perf = ProgressMonitor()
    untrained_performance = perf.epoch_update(data, rbm, store=True, show=False)

    # set up the optimizer and the fit method
    opt = optimizers.RMSProp(stepsize=learning_rate)
    cd = fit.SGD(rbm, data)

    # fit the model
    print('training with contrastive divergence')
    cd.train(opt, num_epochs, method=fit.pcd, mcsteps=mc_steps)

    # obtain an estimate of the reconstruction error after 1 epoch
    trained_performance = cd.monitor.memory[-1]

    assert (trained_performance['ReconstructionError'] <
            untrained_performance['ReconstructionError']), \
        "Reconstruction error did not decrease"

    # close the HDF5 store
    data.close()

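# What fit.pcd optimizes, in miniature: one contrastive-divergence gradient
# step for a Bernoulli-Bernoulli RBM in plain numpy. This is a sketch of the
# textbook CD-1 update; the persistent variant used above additionally keeps
# the negative chain alive between steps.

import numpy as np

def cd1_step(W, a, b, v_data, lr, rng):
    """One CD-1 update of weights W (nv x nh) and biases a (nv,), b (nh,)."""
    sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
    # positive phase: hidden probabilities driven by the data
    ph_data = sigmoid(v_data @ W + b)
    h_sample = (rng.random(ph_data.shape) < ph_data).astype(float)
    # negative phase: one step of Gibbs sampling back to the visible layer
    pv_model = sigmoid(h_sample @ W.T + a)
    v_model = (rng.random(pv_model.shape) < pv_model).astype(float)
    ph_model = sigmoid(v_model @ W + b)
    # log-likelihood gradient approximated by <vh>_data - <vh>_model
    n = len(v_data)
    W += lr * (v_data.T @ ph_data - v_model.T @ ph_model) / n
    a += lr * (v_data - v_model).mean(axis=0)
    b += lr * (ph_data - ph_model).mean(axis=0)
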
def test_binarize_color():
    result_pre = [pre.binarize_color(pre.scale(tensor, 1/255))
                  for tensor in tensors]
    result_ref = [be.float_tensor(be.tround(tensor)) for tensor in tensors]
    assert compare_lists(result_pre, result_ref)

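# The identity the test exercises, assuming pre.scale(t, d) divides by d and
# pre.binarize_color(x) rounds x / 255 (both are assumptions about the
# helpers, sketched in plain numpy):

import numpy as np

t = np.random.rand(5, 5)               # pixels already scaled into [0, 1]
scaled = t / (1 / 255)                 # pre.scale(t, 1/255): multiply by 255
binarized = np.round(scaled / 255.0)   # pre.binarize_color: threshold at 0.5
assert np.array_equal(binarized, np.round(t))
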
def run(paysage_path=None, num_epochs=10, show_plot=False):
    num_hidden_units = 256
    batch_size = 100
    learning_rate = schedules.PowerLawDecay(initial=0.01, coefficient=0.1)
    mc_steps = 1

    (_, _, shuffled_filepath) = util.default_paths(paysage_path)

    # set up the reader to get minibatches
    import pandas
    data = batch.InMemoryBatch(
        pre.binarize_color(
            be.float_tensor(
                pandas.read_hdf(shuffled_filepath, key='train/images').values)),
        batch_size,
        train_fraction=0.95)

    # set up the model and initialize the parameters
    vis_layer = layers.BernoulliLayer(data.ncols)
    hid_layer = layers.BernoulliLayer(num_hidden_units)

    rbm = model.Model([vis_layer, hid_layer])
    rbm.weights[0].add_penalty({'matrix': pen.l2_penalty(0.001)})
    rbm.initialize(data, method='glorot_normal')

    metrics = ['ReconstructionError', 'EnergyDistance', 'EnergyGap',
               'EnergyZscore', 'HeatCapacity', 'WeightSparsity', 'WeightSquare']
    perf = fit.ProgressMonitor(data, metrics=metrics)

    # set up the optimizer and the fit method
    opt = optimizers.ADAM(stepsize=learning_rate)
    sampler = fit.DrivenSequentialMC.from_batch(rbm, data)

    cd = fit.SGD(rbm, data, opt, num_epochs, sampler, method=fit.pcd,
                 mcsteps=mc_steps, monitor=perf)

    # fit the model
    print('training with contrastive divergence')
    cd.train()

    # evaluate the model
    util.show_metrics(rbm, perf)
    valid = data.get('validate')
    util.show_reconstructions(rbm, valid, fit, show_plot,
                              n_recon=10, vertical=False, num_to_avg=10)
    util.show_fantasy_particles(rbm, valid, fit, show_plot, n_fantasy=25)
    util.show_weights(rbm, show_plot, n_weights=25)

    # close the HDF5 store
    data.close()
    print("Done")

def test_tap_machine(paysage_path=None):
    num_hidden_units = 10
    batch_size = 100
    num_epochs = 5
    learning_rate = schedules.PowerLawDecay(initial=0.1, coefficient=1.0)

    if not paysage_path:
        paysage_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    filepath = os.path.join(paysage_path, 'examples', 'mnist', 'mnist.h5')

    if not os.path.exists(filepath):
        raise IOError(
            "{} does not exist. Run mnist/download_mnist.py to fetch it from the web."
            .format(filepath))

    shuffled_filepath = os.path.join(paysage_path, 'examples', 'mnist',
                                     'shuffled_mnist.h5')

    # shuffle the data
    if not os.path.exists(shuffled_filepath):
        shuffler = batch.DataShuffler(filepath, shuffled_filepath, complevel=0)
        shuffler.shuffle()

    # set a seed for the random number generator
    be.set_seed()

    # set up the reader to get minibatches
    samples = pre.binarize_color(
        be.float_tensor(
            pandas.read_hdf(shuffled_filepath, key='train/images').values[:10000]))
    samples_train, samples_validate = batch.split_tensor(samples, 0.95)
    data = batch.Batch(
        {'train': batch.InMemoryTable(samples_train, batch_size),
         'validate': batch.InMemoryTable(samples_validate, batch_size)})

    # set up the model and initialize the parameters
    vis_layer = layers.BernoulliLayer(data.ncols)
    hid_layer = layers.BernoulliLayer(num_hidden_units)

    rbm = BoltzmannMachine([vis_layer, hid_layer])
    rbm.initialize(data)

    # obtain initial estimates of the reconstruction error and TAP metrics
    perf = ProgressMonitor(generator_metrics=[
        ReconstructionError(), TAPLogLikelihood(10), TAPFreeEnergy(10)])
    untrained_performance = perf.epoch_update(data, rbm, store=True, show=False)

    # set up the optimizer and the fit method
    opt = optimizers.Gradient(stepsize=learning_rate, tolerance=1e-5)
    tap = fit.TAP(True, 0.1, 0.01, 25, True, 0.5, 0.001, 0.0)
    solver = fit.SGD(rbm, data)
    solver.monitor.generator_metrics.append(TAPLogLikelihood(10))
    solver.monitor.generator_metrics.append(TAPFreeEnergy(10))

    # fit the model
    print('training with stochastic gradient ascent')
    solver.train(opt, num_epochs, method=tap.tap_update)

    # obtain an estimate of the performance after training
    trained_performance = solver.monitor.memory[-1]

    assert (trained_performance['TAPLogLikelihood'] >
            untrained_performance['TAPLogLikelihood']), \
        "TAP log-likelihood did not increase"
    assert (trained_performance['ReconstructionError'] <
            untrained_performance['ReconstructionError']), \
        "Reconstruction error did not decrease"

    # close the HDF5 store
    data.close()

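# The TAP metrics above score the model with the second-order TAP free energy
# (Plefka expansion with the Onsager correction). A numpy sketch of the
# standard expression for a Bernoulli-Bernoulli RBM, evaluated at visible
# magnetizations m and hidden magnetizations t (a sketch of the textbook
# quantity, e.g. Gabrie, Tramel & Krzakala 2015, not paysage's exact code):

import numpy as np

def tap2_free_energy(m, t, W, a, b, eps=1e-12):
    """-log Z approximated at second order in the Plefka expansion."""
    S = lambda p: -(p * np.log(p + eps) + (1 - p) * np.log(1 - p + eps))
    entropy = S(m).sum() + S(t).sum()
    energy = a @ m + b @ t + m @ W @ t
    # Onsager correction: second-order term in the expansion
    onsager = 0.5 * ((m - m**2) @ (W**2) @ (t - t**2))
    return -(entropy + energy + onsager)
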