def test_weights_derivative():
    ly = layers.Weights((num_vis, num_hid))
    p = penalties.l2_penalty(0.37)
    ly.add_penalty({'matrix': p})
    vis = be.randn((num_samples, num_vis))
    hid = be.randn((num_samples, num_hid))
    derivs = ly.derivatives(vis, hid)

def test_state_for_grad_DrivenSequentialMC():
    num_visible_units = 100
    num_hidden_units = 50
    batch_size = 25

    # set a seed for the random number generator
    be.set_seed()

    # set up some layer and model objects
    vis_layer = layers.BernoulliLayer(num_visible_units)
    hid_layer = layers.BernoulliLayer(num_hidden_units)
    rbm = model.Model([vis_layer, hid_layer])

    # randomly set the intrinsic model parameters
    a = be.randn((num_visible_units,))
    b = be.randn((num_hidden_units,))
    W = be.randn((num_visible_units, num_hidden_units))

    rbm.layers[0].params.loc[:] = a
    rbm.layers[1].params.loc[:] = b
    rbm.weights[0].params.matrix[:] = W

    # generate a random batch of data
    vdata = rbm.layers[0].random((batch_size, num_visible_units))
    data_state = State.from_visible(vdata, rbm)
    dropout_scale = State.dropout_rescale(rbm)

    # since we set no dropout, dropout_scale should be None
    assert dropout_scale is None

    for u in ['markov_chain', 'mean_field_iteration', 'deterministic_iteration']:
        # set up the sampler
        sampler = fit.DrivenSequentialMC(rbm, updater=u, clamped=[0])
        sampler.set_state(data_state)

        # update the state of the hidden layer
        grad_state = sampler.state_for_grad(1, dropout_scale)

        assert be.allclose(data_state.units[0], grad_state.units[0]), \
            "visible layer is clamped, and shouldn't get updated: {}".format(u)

        assert not be.allclose(data_state.units[1], grad_state.units[1]), \
            "hidden layer is not clamped, and should get updated: {}".format(u)

        # compute the conditional mean with the layer function
        ave = rbm.layers[1].conditional_mean(
            rbm._connected_rescaled_units(1, data_state, dropout_scale),
            rbm._connected_weights(1))

        assert be.allclose(ave, grad_state.units[1]), \
            "hidden layer of grad_state should be conditional mean: {}".format(u)

def test_bernoulli_derivatives():
    num_visible_units = 100
    num_hidden_units = 50
    batch_size = 25

    # set a seed for the random number generator
    be.set_seed()

    # set up some layer and model objects
    vis_layer = layers.BernoulliLayer(num_visible_units)
    hid_layer = layers.BernoulliLayer(num_hidden_units)
    rbm = hidden.Model([vis_layer, hid_layer])

    # randomly set the intrinsic model parameters
    a = be.randn((num_visible_units,))
    b = be.randn((num_hidden_units,))
    W = be.randn((num_visible_units, num_hidden_units))

    rbm.layers[0].int_params.loc[:] = a
    rbm.layers[1].int_params.loc[:] = b
    rbm.weights[0].int_params.matrix[:] = W

    # generate a random batch of data
    vdata = rbm.layers[0].random((batch_size, num_visible_units))
    vdata_scaled = rbm.layers[0].rescale(vdata)

    # compute the mean of the hidden layer
    rbm.layers[1].update([vdata], [rbm.weights[0].W()])
    hid_mean = rbm.layers[1].mean()
    hid_mean_scaled = rbm.layers[1].rescale(hid_mean)

    # compute the derivatives
    d_visible_loc = -be.mean(vdata, axis=0)
    d_hidden_loc = -be.mean(hid_mean_scaled, axis=0)
    d_W = -be.batch_outer(vdata, hid_mean_scaled) / len(vdata)

    # compute the derivatives using the layer functions
    vis_derivs = rbm.layers[0].derivatives(vdata, [hid_mean_scaled],
                                           [rbm.weights[0].W()])
    hid_derivs = rbm.layers[1].derivatives(hid_mean, [vdata_scaled],
                                           [rbm.weights[0].W_T()])
    weight_derivs = rbm.weights[0].derivatives(vdata, hid_mean_scaled)

    assert be.allclose(d_visible_loc, vis_derivs.loc), \
        "derivative of visible loc wrong in bernoulli-bernoulli rbm"

    assert be.allclose(d_hidden_loc, hid_derivs.loc), \
        "derivative of hidden loc wrong in bernoulli-bernoulli rbm"

    assert be.allclose(d_W, weight_derivs.matrix), \
        "derivative of weights wrong in bernoulli-bernoulli rbm"

def test_exponential_conditional_params():
    ly = layers.ExponentialLayer(num_vis)
    w = layers.Weights((num_vis, num_hid))
    scaled_units = [be.randn((num_samples, num_hid))]
    weights = [w.W_T()]
    beta = be.rand((num_samples, 1))
    ly._conditional_params(scaled_units, weights, beta)

def test_onehot_conditional_params():
    ly = layers.OneHotLayer(num_vis)
    w = layers.Weights((num_vis, num_hid))
    scaled_units = [be.randn((num_samples, num_hid))]
    weights = [w.W(trans=True)]
    beta = be.rand((num_samples, 1))
    ly.conditional_params(scaled_units, weights, beta)

def test_grbm_reload():
    vis_layer = layers.BernoulliLayer(num_vis, center=True)
    hid_layer = layers.GaussianLayer(num_hid, center=True)

    # create the model and initialize it from data
    grbm = BoltzmannMachine([vis_layer, hid_layer])
    data = batch.Batch({
        'train': batch.InMemoryTable(be.randn((10 * num_samples, num_vis)),
                                     num_samples)})
    grbm.initialize(data)

    with tempfile.NamedTemporaryFile() as file:
        # save the model
        store = pandas.HDFStore(file.name, mode='w')
        grbm.save(store)
        store.close()

        # reload
        store = pandas.HDFStore(file.name, mode='r')
        grbm_reload = BoltzmannMachine.from_saved(store)
        store.close()

    # check the two models are consistent
    vis_data = vis_layer.random((num_samples, num_vis))
    data_state = State.from_visible(vis_data, grbm)
    vis_orig = grbm.deterministic_iteration(1, data_state)[0]
    vis_reload = grbm_reload.deterministic_iteration(1, data_state)[0]

    assert be.allclose(vis_orig, vis_reload)
    assert be.allclose(grbm.layers[0].moments.mean,
                       grbm_reload.layers[0].moments.mean)
    assert be.allclose(grbm.layers[0].moments.var,
                       grbm_reload.layers[0].moments.var)
    assert be.allclose(grbm.layers[1].moments.mean,
                       grbm_reload.layers[1].moments.mean)
    assert be.allclose(grbm.layers[1].moments.var,
                       grbm_reload.layers[1].moments.var)

def test_gaussian_derivatives():
    ly = layers.GaussianLayer(num_vis)
    w = layers.Weights((num_vis, num_hid))
    vis = ly.random((num_samples, num_vis))
    hid = [be.randn((num_samples, num_hid))]
    weights = [w.W_T()]
    ly.derivatives(vis, hid, weights)

def test_bernoulli_derivatives():
    ly = layers.BernoulliLayer(num_vis)
    w = layers.Weights((num_vis, num_hid))
    vis = ly.random((num_samples, num_vis))
    hid = [be.randn((num_samples, num_hid))]
    weights = [w.W_T()]
    ly.derivatives(vis, hid, weights)

def test_exponential_update():
    ly = layers.ExponentialLayer(num_vis)
    w = layers.Weights((num_vis, num_hid))
    scaled_units = [be.randn((num_samples, num_hid))]
    weights = [w.W_T()]
    beta = be.rand((num_samples, 1))
    ly.update(scaled_units, weights, beta)

def test_ising_update():
    ly = layers.IsingLayer(num_vis)
    w = layers.Weights((num_vis, num_hid))
    scaled_units = [be.randn((num_samples, num_hid))]
    weights = [w.W_T()]
    beta = be.rand((num_samples, 1))
    ly.update(scaled_units, weights, beta)

def test_onehot_derivatives():
    ly = layers.OneHotLayer(num_vis)
    w = layers.Weights((num_vis, num_hid))
    vis = ly.random((num_samples, num_vis))
    hid = [be.randn((num_samples, num_hid))]
    weights = [w.W_T()]
    ly.derivatives(vis, hid, weights)

def test_clamped_SequentialMC():
    num_visible_units = 100
    num_hidden_units = 50
    batch_size = 25
    steps = 1

    # set a seed for the random number generator
    be.set_seed()

    # set up some layer and model objects
    vis_layer = layers.BernoulliLayer(num_visible_units)
    hid_layer = layers.BernoulliLayer(num_hidden_units)
    rbm = model.Model([vis_layer, hid_layer])

    # randomly set the intrinsic model parameters
    a = be.randn((num_visible_units,))
    b = be.randn((num_hidden_units,))
    W = be.randn((num_visible_units, num_hidden_units))

    rbm.layers[0].params.loc[:] = a
    rbm.layers[1].params.loc[:] = b
    rbm.weights[0].params.matrix[:] = W

    # generate a random batch of data
    vdata = rbm.layers[0].random((batch_size, num_visible_units))
    data_state = State.from_visible(vdata, rbm)
    dropout_scale = State.dropout_rescale(rbm)

    # since we set no dropout, dropout_scale should be None
    assert dropout_scale is None

    for u in ['markov_chain', 'mean_field_iteration', 'deterministic_iteration']:
        # set up the sampler with the visible layer clamped
        sampler = fit.SequentialMC(rbm, updater=u, clamped=[0])
        sampler.set_state(data_state)

        # update the sampler state and check the output
        sampler.update_state(steps, dropout_scale)

        assert be.allclose(data_state.units[0], sampler.state.units[0]), \
            "visible layer is clamped, and shouldn't get updated: {}".format(u)

        assert not be.allclose(data_state.units[1], sampler.state.units[1]), \
            "hidden layer is not clamped, and should get updated: {}".format(u)

def test_exponential_derivatives():
    ly = layers.ExponentialLayer(num_vis)
    w = layers.Weights((num_vis, num_hid))
    vis = ly.random((num_samples, num_vis))
    hid = [be.randn((num_samples, num_hid))]
    weights = [w.W_T()]
    beta = be.rand((num_samples, 1))
    ly.derivatives(vis, hid, weights, beta)

def test_ising_derivatives():
    ly = layers.IsingLayer(num_vis)
    w = layers.Weights((num_vis, num_hid))
    vis = ly.random((num_samples, num_vis))
    hid = [be.randn((num_samples, num_hid))]
    weights = [w.W()]
    beta = be.rand((num_samples, 1))
    ly.derivatives(vis, hid, weights, beta)

def test_bernoulli_conditional_params():
    num_visible_units = 100
    num_hidden_units = 50
    batch_size = 25

    # set a seed for the random number generator
    be.set_seed()

    # set up some layer and model objects
    vis_layer = layers.BernoulliLayer(num_visible_units)
    hid_layer = layers.BernoulliLayer(num_hidden_units)
    rbm = model.Model([vis_layer, hid_layer])

    # randomly set the intrinsic model parameters
    a = be.randn((num_visible_units,))
    b = be.randn((num_hidden_units,))
    W = be.randn((num_visible_units, num_hidden_units))

    rbm.layers[0].params.loc[:] = a
    rbm.layers[1].params.loc[:] = b
    rbm.weights[0].params.matrix[:] = W

    # generate a random batch of data
    vdata = rbm.layers[0].random((batch_size, num_visible_units))
    hdata = rbm.layers[1].random((batch_size, num_hidden_units))

    # compute conditional parameters
    hidden_field = be.dot(vdata, W)  # (batch_size, num_hidden_units)
    hidden_field += b

    visible_field = be.dot(hdata, be.transpose(W))  # (batch_size, num_visible_units)
    visible_field += a

    # compute conditional parameters with the layer functions
    hidden_field_layer = rbm.layers[1]._conditional_params(
        [vdata], [rbm.weights[0].W()])
    visible_field_layer = rbm.layers[0]._conditional_params(
        [hdata], [rbm.weights[0].W_T()])

    assert be.allclose(hidden_field, hidden_field_layer), \
        "hidden field wrong in bernoulli-bernoulli rbm"

    assert be.allclose(visible_field, visible_field_layer), \
        "visible field wrong in bernoulli-bernoulli rbm"

def test_pdist():
    n = 500
    a_shape = (1000, n)
    b_shape = (1000, n)

    # distance distributions
    a_mean, a_scale = 1, 1
    b_mean, b_scale = -1, 1

    be.set_seed()
    a = a_mean + a_scale * be.randn(a_shape)
    b = b_mean + b_scale * be.randn(b_shape)

    dists = math_utils.pdist(a, b)
    dists_t = math_utils.pdist(b, a)

    assert be.shape(dists) == (1000, 1000)
    assert be.allclose(be.transpose(dists_t), dists)
    assert 2 * math.sqrt(n) < be.mean(dists) < 3 * math.sqrt(n)

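# A note on the bounds in test_pdist: each coordinate of a - b is distributed as
# N(2, 2), so E[(a_i - b_i)^2] = 2^2 + 2 = 6 and the expected pairwise distance is
# roughly sqrt(6 * n) ~= 2.45 * sqrt(n), which lies between 2*sqrt(n) and 3*sqrt(n).
# The helper below is a minimal NumPy-only sketch of an all-pairs Euclidean distance,
# included for illustration; it is not the math_utils.pdist implementation.
def _pairwise_distances_sketch(a, b):
    """Return the (len(a), len(b)) matrix of Euclidean distances between rows."""
    import numpy as np
    diff = a[:, None, :] - b[None, :, :]      # shape (len(a), len(b), n)
    return np.sqrt((diff ** 2).sum(axis=-1))  # reduce over the feature axis
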
def test_bernoulli_update():
    num_visible_units = 100
    num_hidden_units = 50
    batch_size = 25

    # set a seed for the random number generator
    be.set_seed()

    # set up some layer and model objects
    vis_layer = layers.BernoulliLayer(num_visible_units)
    hid_layer = layers.BernoulliLayer(num_hidden_units)
    rbm = hidden.Model([vis_layer, hid_layer])

    # randomly set the intrinsic model parameters
    a = be.randn((num_visible_units,))
    b = be.randn((num_hidden_units,))
    W = be.randn((num_visible_units, num_hidden_units))

    rbm.layers[0].int_params.loc[:] = a
    rbm.layers[1].int_params.loc[:] = b
    rbm.weights[0].int_params.matrix[:] = W

    # generate a random batch of data
    vdata = rbm.layers[0].random((batch_size, num_visible_units))
    hdata = rbm.layers[1].random((batch_size, num_hidden_units))

    # compute extrinsic parameters
    hidden_field = be.dot(vdata, W)  # (batch_size, num_hidden_units)
    hidden_field += be.broadcast(b, hidden_field)

    visible_field = be.dot(hdata, be.transpose(W))  # (batch_size, num_visible_units)
    visible_field += be.broadcast(a, visible_field)

    # update the extrinsic parameters using the layer functions
    rbm.layers[0].update([hdata], [rbm.weights[0].W_T()])
    rbm.layers[1].update([vdata], [rbm.weights[0].W()])

    assert be.allclose(hidden_field, rbm.layers[1].ext_params.field), \
        "hidden field wrong in bernoulli-bernoulli rbm"

    assert be.allclose(visible_field, rbm.layers[0].ext_params.field), \
        "visible field wrong in bernoulli-bernoulli rbm"

def test_gaussian_GFE_entropy_gradient():
    num_units = 5
    lay = layers.GaussianLayer(num_units)

    lay.params.loc[:] = be.rand_like(lay.params.loc)
    lay.params.log_var[:] = be.randn(be.shape(lay.params.loc))

    from cytoolz import compose
    sum_square = compose(be.tsum, be.square)

    for itr in range(10):
        mag = lay.get_random_magnetization()
        lms = lay.lagrange_multipliers_analytic(mag)
        entropy = lay.TAP_entropy(mag)
        lr = 0.001
        gogogo = True
        grad = lay.TAP_magnetization_grad(mag, [], [], [])
        grad_mag = math.sqrt(be.float_scalar(be.accumulate(sum_square, grad)))
        normit = partial(be.tmul_, be.float_scalar(1.0 / grad_mag))
        be.apply_(normit, grad)
        rand_grad = lay.get_random_magnetization()
        grad_mag = math.sqrt(be.float_scalar(be.accumulate(sum_square, rand_grad)))
        normit = partial(be.tmul_, be.float_scalar(1.0 / grad_mag))
        be.apply_(normit, rand_grad)
        while gogogo:
            cop1_mag = deepcopy(mag)
            cop1_lms = deepcopy(lms)
            cop2_mag = deepcopy(mag)
            cop2_lms = deepcopy(lms)

            cop1_mag.mean[:] = mag.mean + lr * grad.mean
            cop2_mag.mean[:] = mag.mean + lr * rand_grad.mean
            cop1_mag.variance[:] = mag.variance + lr * grad.variance
            cop2_mag.variance[:] = mag.variance + lr * rand_grad.variance
            lay.clip_magnetization_(cop1_mag)
            lay.clip_magnetization_(cop2_mag)
            cop1_lms = lay.lagrange_multipliers_analytic(cop1_mag)
            cop2_lms = lay.lagrange_multipliers_analytic(cop2_mag)

            entropy_1 = lay.TAP_entropy(cop1_mag)
            entropy_2 = lay.TAP_entropy(cop2_mag)

            regress = entropy_1 - entropy_2 < 0.0
            #print(itr, "[", lr, "] ", entropy, entropy_1, entropy_2, regress)
            if regress:
                #print(grad, rand_grad)
                if lr < 1e-6:
                    assert False, \
                        "Gaussian GFE magnetization gradient is wrong"
                    break
                else:
                    lr *= 0.5
            else:
                break

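# The TAP/GFE gradient tests in this file (the entropy-gradient test above and the
# free-energy tests further below) share the same backtracking check: take a step of
# size lr along the computed gradient (and, where applicable, along a random direction
# of the same norm), compare the resulting objective values, and if the comparison
# comes out the wrong way halve lr and retry. A test only fails once lr drops below
# 1e-6 without finding any step size for which the gradient direction behaves as
# expected, so the check validates the gradient direction without relying on exact
# finite-difference values.
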
def test_gaussian_1D_1mode_train():
    # create some example data
    num = 10000
    mu = 3
    sigma = 1
    samples = be.randn((num, 1)) * sigma + mu

    # set up the reader to get minibatches
    batch_size = 100
    samples_train, samples_validate = batch.split_tensor(samples, 0.9)
    data = batch.Batch({
        'train': batch.InMemoryTable(samples_train, batch_size),
        'validate': batch.InMemoryTable(samples_validate, batch_size)})

    # parameters
    learning_rate = schedules.PowerLawDecay(initial=0.1, coefficient=0.1)
    mc_steps = 1
    num_epochs = 10
    num_sample_steps = 100

    # set up the model and initialize the parameters
    vis_layer = layers.GaussianLayer(1)
    hid_layer = layers.OneHotLayer(1)

    rbm = BoltzmannMachine([vis_layer, hid_layer])
    rbm.initialize(data, method='hinton')

    # modify the parameters to shift the initialized model from the data;
    # this forces it to train
    rbm.layers[0].params = layers.ParamsGaussian(
        rbm.layers[0].params.loc - 3,
        rbm.layers[0].params.log_var - 1)

    # set up the optimizer and the fit method
    opt = optimizers.ADAM(stepsize=learning_rate)
    cd = fit.SGD(rbm, data)

    # fit the model
    print('training with persistent contrastive divergence')
    cd.train(opt, num_epochs, method=fit.pcd, mcsteps=mc_steps)

    # sample data from the trained model
    model_state = \
        samplers.SequentialMC.generate_fantasy_state(rbm, num, num_sample_steps)
    pts_trained = model_state[0]

    percent_error = 10
    mu_trained = be.mean(pts_trained)
    assert numpy.abs(mu_trained / mu - 1) < (percent_error / 100)

    sigma_trained = numpy.sqrt(be.var(pts_trained))
    assert numpy.abs(sigma_trained / sigma - 1) < (percent_error / 100)

def test_unclamped_DrivenSequentialMC():
    num_visible_units = 100
    num_hidden_units = 50
    batch_size = 25
    steps = 1

    # set a seed for the random number generator
    be.set_seed()

    # set up some layer and model objects
    vis_layer = layers.BernoulliLayer(num_visible_units)
    hid_layer = layers.BernoulliLayer(num_hidden_units)
    rbm = BoltzmannMachine([vis_layer, hid_layer])

    # randomly set the intrinsic model parameters
    a = be.randn((num_visible_units,))
    b = be.randn((num_hidden_units,))
    W = be.randn((num_visible_units, num_hidden_units))

    rbm.layers[0].params.loc[:] = a
    rbm.layers[1].params.loc[:] = b
    rbm.connections[0].weights.params.matrix[:] = W

    # generate a random batch of data
    vdata = rbm.layers[0].random((batch_size, num_visible_units))
    data_state = State.from_visible(vdata, rbm)

    for u in ['markov_chain', 'mean_field_iteration', 'deterministic_iteration']:
        # set up the sampler with no layers clamped
        sampler = samplers.SequentialMC(rbm, updater=u)
        sampler.set_state(data_state)

        # update the sampler state and check the output
        sampler.update_state(steps)

        assert not be.allclose(data_state[0], sampler.state[0]), \
            "visible layer is not clamped, and should get updated: {}".format(u)

        assert not be.allclose(data_state[1], sampler.state[1]), \
            "hidden layer is not clamped, and should get updated: {}".format(u)

def test_grbm_save():
    vis_layer = layers.BernoulliLayer(num_vis, center=True)
    hid_layer = layers.GaussianLayer(num_hid, center=True)

    grbm = BoltzmannMachine([vis_layer, hid_layer])
    data = batch.Batch({
        'train': batch.InMemoryTable(be.randn((10 * num_samples, num_vis)),
                                     num_samples)})
    grbm.initialize(data)

    with tempfile.NamedTemporaryFile() as file:
        store = pandas.HDFStore(file.name, mode='w')
        grbm.save(store)
        store.close()

def test_find_k_nearest_neighbors():
    n = 20
    shp = (20, n)
    perm = be.rand_int(0, 20, (20,))
    k = 1

    be.set_seed()
    y = be.randn(shp)
    x = y[perm]

    indices, _distances = math_utils.find_k_nearest_neighbors(x, y, k)

    assert be.allclose(indices, perm)
    assert be.allclose(_distances, be.zeros((20,)), 1e-2, 1e-2)

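# In test_find_k_nearest_neighbors, every row of x is an exact copy of row perm[i] of
# y, so a 1-nearest-neighbor search should recover the index array perm with
# (numerically) zero distances; the loose 1e-2 tolerances leave room for
# floating-point error in the distance computation.
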
def test_bernoulli_GFE_derivatives():
    # Tests that the GFE derivative update increases GFE versus 100
    # random update vectors
    num_units = 5

    layer_1 = layers.BernoulliLayer(num_units)
    layer_2 = layers.BernoulliLayer(num_units)
    layer_3 = layers.BernoulliLayer(num_units)
    rbm = BoltzmannMachine([layer_1, layer_2, layer_3])

    for i in range(len(rbm.connections)):
        rbm.connections[i].weights.params.matrix[:] = \
            0.01 * be.randn(rbm.connections[i].shape)

    for lay in rbm.layers:
        lay.params.loc[:] = be.rand_like(lay.params.loc)

    state, cop1_GFE = rbm.compute_StateTAP(init_lr=0.1, tol=1e-7, max_iters=50)
    grad = rbm._grad_gibbs_free_energy(state)
    gu.grad_normalize_(grad)

    for i in range(100):
        lr = 1.0
        gogogo = True
        random_grad = gu.random_grad(rbm)
        gu.grad_normalize_(random_grad)
        while gogogo:
            cop1 = deepcopy(rbm)
            lr_mul = partial(be.tmul, lr)

            cop1.parameter_update(gu.grad_apply(lr_mul, grad))
            cop1_state, cop1_GFE = cop1.compute_StateTAP(init_lr=0.1, tol=1e-7,
                                                         max_iters=50)

            cop2 = deepcopy(rbm)
            cop2.parameter_update(gu.grad_apply(lr_mul, random_grad))
            cop2_state, cop2_GFE = cop2.compute_StateTAP(init_lr=0.1, tol=1e-7,
                                                         max_iters=50)

            regress = cop2_GFE - cop1_GFE < 0.0

            if regress:
                if lr < 1e-6:
                    assert False, \
                        "TAP FE gradient is not working properly for Bernoulli models"
                    break
                else:
                    lr *= 0.5
            else:
                break

def test_gaussian_GFE_derivatives_gradient_descent():
    num_units = 5

    layer_1 = layers.GaussianLayer(num_units)
    layer_2 = layers.BernoulliLayer(num_units)
    rbm = BoltzmannMachine([layer_1, layer_2])

    for i in range(len(rbm.connections)):
        rbm.connections[i].weights.params.matrix[:] = \
            0.01 * be.randn(rbm.connections[i].shape)

    for lay in rbm.layers:
        lay.params.loc[:] = be.rand_like(lay.params.loc)

    state, GFE = rbm.compute_StateTAP(use_GD=False, tol=1e-7, max_iters=50)
    grad = rbm._grad_gibbs_free_energy(state)
    gu.grad_normalize_(grad)

    for i in range(100):
        lr = 0.001
        gogogo = True
        random_grad = gu.random_grad(rbm)
        gu.grad_normalize_(random_grad)
        while gogogo:
            cop1 = deepcopy(rbm)
            lr_mul = partial(be.tmul, lr)

            cop1.parameter_update(gu.grad_apply(lr_mul, grad))
            cop1_state, cop1_GFE = cop1.compute_StateTAP(use_GD=False, tol=1e-7,
                                                         max_iters=50)

            cop2 = deepcopy(rbm)
            cop2.parameter_update(gu.grad_apply(lr_mul, random_grad))
            cop2_state, cop2_GFE = cop2.compute_StateTAP(use_GD=False, tol=1e-7,
                                                         max_iters=50)

            regress = cop2_GFE - cop1_GFE < 0

            if regress:
                if lr < 1e-6:
                    assert False, \
                        "TAP FE gradient is not working properly for Gaussian models"
                    break
                else:
                    lr *= 0.5
            else:
                break

def test_bernoulli_GFE_derivatives():
    num_units = 500

    layer_1 = layers.BernoulliLayer(num_units)
    layer_2 = layers.BernoulliLayer(num_units)
    layer_3 = layers.BernoulliLayer(num_units)
    rbm = model.Model([layer_1, layer_2, layer_3])

    for i in range(len(rbm.weights)):
        rbm.weights[i].params.matrix[:] = \
            0.01 * be.randn(rbm.weights[i].shape)

    for lay in rbm.layers:
        lay.params.loc[:] = be.rand_like(lay.params.loc)

    state = rbm.compute_StateTAP(init_lr=0.1, tol=1e-7, max_iters=50)
    GFE = rbm.gibbs_free_energy(state)

    lr = 0.1
    gogogo = True
    grad = rbm.grad_TAP_free_energy(0.1, 1e-7, 50)
    while gogogo:
        cop = deepcopy(rbm)
        lr_mul = partial(be.tmul, -lr)

        delta = gu.grad_apply(lr_mul, grad)
        cop.parameter_update(delta)

        cop_state = cop.compute_StateTAP(init_lr=0.1, tol=1e-7, max_iters=50)
        cop_GFE = cop.gibbs_free_energy(cop_state)

        regress = cop_GFE - GFE < 0.0
        print(lr, cop_GFE, GFE, cop_GFE - GFE, regress)

        if regress:
            if lr < 1e-6:
                assert False, \
                    "TAP FE gradient is not working properly for Bernoulli models"
                break
            else:
                lr *= 0.5
        else:
            break

def test_gaussian_Compute_StateTAP_GD():
    num_units = 10

    layer_1 = layers.GaussianLayer(num_units)
    layer_2 = layers.BernoulliLayer(num_units)
    rbm = BoltzmannMachine([layer_1, layer_2])

    for i in range(len(rbm.connections)):
        rbm.connections[i].weights.params.matrix[:] = \
            0.01 * be.randn(rbm.connections[i].shape)

    for lay in rbm.layers:
        lay.params.loc[:] = be.rand_like(lay.params.loc)

    for i in range(100):
        random_state = StateTAP.from_model_rand(rbm)
        GFE = rbm.gibbs_free_energy(random_state.cumulants)

        _, min_GFE = rbm._compute_StateTAP_GD(seed=random_state)

        if GFE - min_GFE < 0.0:
            assert False, \
                "_compute_StateTAP_GD is not reducing the GFE"

def test_bernoulli_GFE_magnetization_gradient():
    num_units = 500

    layer_1 = layers.BernoulliLayer(num_units)
    layer_2 = layers.BernoulliLayer(num_units)
    layer_3 = layers.BernoulliLayer(num_units)
    layer_4 = layers.BernoulliLayer(num_units)
    rbm = model.Model([layer_1, layer_2, layer_3, layer_4])

    for i in range(len(rbm.weights)):
        rbm.weights[i].params.matrix[:] = \
            0.01 * be.randn(rbm.weights[i].shape)

    for lay in rbm.layers:
        lay.params.loc[:] = be.rand_like(lay.params.loc)

    state = mu.StateTAP.from_model_rand(rbm)
    GFE = rbm.gibbs_free_energy(state)

    lr = 0.001
    gogogo = True
    grad = rbm._TAP_magnetization_grad(state)
    while gogogo:
        cop = deepcopy(state)
        for i in range(rbm.num_layers):
            cop.cumulants[i].mean[:] = state.cumulants[i].mean + lr * grad[i].mean

        GFE_next = rbm.gibbs_free_energy(cop)
        regress = GFE_next - GFE < 0.0

        if regress:
            if lr < 1e-6:
                assert False, \
                    "Bernoulli GFE magnetization gradient is wrong"
                break
            else:
                lr *= 0.5
        else:
            break

def test_weights_energy():
    ly = layers.Weights((num_vis, num_hid))
    vis = be.randn((num_samples, num_vis))
    hid = be.randn((num_samples, num_hid))
    ly.energy(vis, hid)

def test_parameter_step():
    ly = layers.Weights((num_vis, num_hid))
    deltas = layers.ParamsWeights(be.randn(ly.shape))
    ly.parameter_step(deltas)

def test_gaussian_derivatives():
    num_visible_units = 100
    num_hidden_units = 50
    batch_size = 25

    # set a seed for the random number generator
    be.set_seed()

    # set up some layer and model objects
    vis_layer = layers.GaussianLayer(num_visible_units)
    hid_layer = layers.GaussianLayer(num_hidden_units)
    rbm = hidden.Model([vis_layer, hid_layer])

    # randomly set the intrinsic model parameters
    a = be.randn((num_visible_units,))
    b = be.randn((num_hidden_units,))
    log_var_a = 0.1 * be.randn((num_visible_units,))
    log_var_b = 0.1 * be.randn((num_hidden_units,))
    W = be.randn((num_visible_units, num_hidden_units))

    rbm.layers[0].int_params.loc[:] = a
    rbm.layers[1].int_params.loc[:] = b
    rbm.layers[0].int_params.log_var[:] = log_var_a
    rbm.layers[1].int_params.log_var[:] = log_var_b
    rbm.weights[0].int_params.matrix[:] = W

    # generate a random batch of data
    vdata = rbm.layers[0].random((batch_size, num_visible_units))
    visible_var = be.exp(log_var_a)
    vdata_scaled = vdata / be.broadcast(visible_var, vdata)

    # compute the mean of the hidden layer
    rbm.layers[1].update([vdata_scaled], [rbm.weights[0].W()])
    hidden_var = be.exp(log_var_b)
    hid_mean = rbm.layers[1].mean()
    hid_mean_scaled = rbm.layers[1].rescale(hid_mean)

    # compute the derivatives
    d_vis_loc = -be.mean(vdata_scaled, axis=0)
    d_vis_logvar = -0.5 * be.mean(be.square(be.subtract(a, vdata)), axis=0)
    d_vis_logvar += be.batch_dot(hid_mean_scaled, be.transpose(W), vdata,
                                 axis=0) / len(vdata)
    d_vis_logvar /= visible_var

    d_hid_loc = -be.mean(hid_mean_scaled, axis=0)
    d_hid_logvar = -0.5 * be.mean(be.square(hid_mean - be.broadcast(b, hid_mean)),
                                  axis=0)
    d_hid_logvar += be.batch_dot(vdata_scaled, W, hid_mean,
                                 axis=0) / len(hid_mean)
    d_hid_logvar /= hidden_var

    d_W = -be.batch_outer(vdata_scaled, hid_mean_scaled) / len(vdata_scaled)

    # compute the derivatives using the layer functions
    rbm.layers[1].update([vdata_scaled], [rbm.weights[0].W()])
    rbm.layers[0].update([hid_mean_scaled], [rbm.weights[0].W_T()])

    vis_derivs = rbm.layers[0].derivatives(vdata, [hid_mean_scaled],
                                           [rbm.weights[0].W()])
    hid_derivs = rbm.layers[1].derivatives(hid_mean, [vdata_scaled],
                                           [rbm.weights[0].W_T()])
    weight_derivs = rbm.weights[0].derivatives(vdata_scaled, hid_mean_scaled)

    assert be.allclose(d_vis_loc, vis_derivs.loc), \
        "derivative of visible loc wrong in gaussian-gaussian rbm"

    assert be.allclose(d_hid_loc, hid_derivs.loc), \
        "derivative of hidden loc wrong in gaussian-gaussian rbm"

    assert be.allclose(d_vis_logvar, vis_derivs.log_var, rtol=1e-05, atol=1e-01), \
        "derivative of visible log_var wrong in gaussian-gaussian rbm"

    assert be.allclose(d_hid_logvar, hid_derivs.log_var, rtol=1e-05, atol=1e-01), \
        "derivative of hidden log_var wrong in gaussian-gaussian rbm"

    assert be.allclose(d_W, weight_derivs.matrix), \
        "derivative of weights wrong in gaussian-gaussian rbm"