def test_exponential_derivatives():
    num_visible_units = 100
    num_hidden_units = 50
    batch_size = 25

    # set a seed for the random number generator
    be.set_seed()

    # set up some layer and model objects
    vis_layer = layers.ExponentialLayer(num_visible_units)
    hid_layer = layers.ExponentialLayer(num_hidden_units)
    rbm = hidden.Model([vis_layer, hid_layer])

    # randomly set the intrinsic model parameters
    # for the exponential layers, we need a > 0, b > 0, and W < 0
    a = be.rand((num_visible_units,))
    b = be.rand((num_hidden_units,))
    W = -be.rand((num_visible_units, num_hidden_units))

    rbm.layers[0].int_params.loc[:] = a
    rbm.layers[1].int_params.loc[:] = b
    rbm.weights[0].int_params.matrix[:] = W

    # generate a random batch of data
    vdata = rbm.layers[0].random((batch_size, num_visible_units))
    vdata_scaled = rbm.layers[0].rescale(vdata)

    # compute the mean of the hidden layer
    rbm.layers[1].update([vdata], [rbm.weights[0].W()])
    hid_mean = rbm.layers[1].mean()
    hid_mean_scaled = rbm.layers[1].rescale(hid_mean)

    # compute the derivatives
    d_visible_loc = be.mean(vdata, axis=0)
    d_hidden_loc = be.mean(hid_mean_scaled, axis=0)
    d_W = -be.batch_outer(vdata, hid_mean_scaled) / len(vdata)

    # compute the derivatives using the layer functions
    vis_derivs = rbm.layers[0].derivatives(vdata, [hid_mean_scaled],
                                           [rbm.weights[0].W()])
    hid_derivs = rbm.layers[1].derivatives(hid_mean, [vdata_scaled],
                                           [rbm.weights[0].W_T()])
    weight_derivs = rbm.weights[0].derivatives(vdata, hid_mean_scaled)

    assert be.allclose(d_visible_loc, vis_derivs.loc), \
        "derivative of visible loc wrong in exponential-exponential rbm"

    assert be.allclose(d_hidden_loc, hid_derivs.loc), \
        "derivative of hidden loc wrong in exponential-exponential rbm"

    assert be.allclose(d_W, weight_derivs.matrix), \
        "derivative of weights wrong in exponential-exponential rbm"
def test_bernoulli_derivatives():
    num_visible_units = 100
    num_hidden_units = 50
    batch_size = 25

    # set a seed for the random number generator
    be.set_seed()

    # set up some layer and model objects
    vis_layer = layers.BernoulliLayer(num_visible_units)
    hid_layer = layers.BernoulliLayer(num_hidden_units)
    rbm = hidden.Model([vis_layer, hid_layer])

    # randomly set the intrinsic model parameters
    a = be.randn((num_visible_units,))
    b = be.randn((num_hidden_units,))
    W = be.randn((num_visible_units, num_hidden_units))

    rbm.layers[0].int_params['loc'] = a
    rbm.layers[1].int_params['loc'] = b
    rbm.weights[0].int_params['matrix'] = W

    # generate a random batch of data
    vdata = rbm.layers[0].random((batch_size, num_visible_units))
    vdata_scaled = rbm.layers[0].rescale(vdata)

    # compute the mean of the hidden layer
    rbm.layers[1].update(vdata, rbm.weights[0].W())
    hid_mean = rbm.layers[1].mean()
    hid_mean_scaled = rbm.layers[1].rescale(hid_mean)

    # compute the derivatives
    d_visible_loc = -be.mean(vdata, axis=0)
    d_hidden_loc = -be.mean(hid_mean_scaled, axis=0)
    d_W = -be.batch_outer(vdata, hid_mean_scaled) / len(vdata)

    # compute the derivatives using the layer functions
    vis_derivs = rbm.layers[0].derivatives(vdata, hid_mean_scaled,
                                           rbm.weights[0].W())
    hid_derivs = rbm.layers[1].derivatives(hid_mean, vdata_scaled,
                                           be.transpose(rbm.weights[0].W()))
    weight_derivs = rbm.weights[0].derivatives(vdata, hid_mean_scaled)

    assert be.allclose(d_visible_loc, vis_derivs['loc']), \
        "derivative of visible loc wrong in bernoulli-bernoulli rbm"

    assert be.allclose(d_hidden_loc, hid_derivs['loc']), \
        "derivative of hidden loc wrong in bernoulli-bernoulli rbm"

    assert be.allclose(d_W, weight_derivs['matrix']), \
        "derivative of weights wrong in bernoulli-bernoulli rbm"
def update(self, samples, axis=0) -> None:
    """
    Update the online calculation of the mean.

    Notes:
        Modifies the metrics in place.

    Args:
        samples: data samples
        axis (int): the axis along which to compute the sample mean

    Returns:
        None

    """
    n = len(samples)
    sample_mean = be.mean(samples, axis=axis)

    # initialize the num and mean attributes if necessary
    if self.mean is None:
        self.mean = be.zeros_like(sample_mean)
        self.num = 0

    # update the num and mean attributes
    tmp = self.num * self.mean + n * sample_mean
    self.num += n
    self.mean = tmp / max(self.num, 1)
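# Illustrative sketch (added for clarity, not part of the library): the
# count-weighted update above reproduces the column-wise mean of the full
# data set. numpy stands in for the `be` backend here, assuming be.mean(x, axis=0)
# behaves like numpy.mean(x, axis=0); the function name is hypothetical.
def demo_online_mean_array():
    import numpy as np
    data = np.random.rand(1000, 5)
    mean, num = None, 0
    for chunk in np.split(data, 4):
        n = len(chunk)
        sample_mean = np.mean(chunk, axis=0)
        if mean is None:
            # lazy initialization, mirroring the method above
            mean = np.zeros_like(sample_mean)
            num = 0
        # combine the running mean and the chunk mean, weighted by counts
        tmp = num * mean + n * sample_mean
        num += n
        mean = tmp / max(num, 1)
    assert np.allclose(mean, np.mean(data, axis=0))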
def test_pdist():
    n = 500
    a_shape = (1000, n)
    b_shape = (1000, n)

    # distance distributions
    a_mean, a_scale = 1, 1
    b_mean, b_scale = -1, 1

    be.set_seed()
    a = a_mean + a_scale * be.randn(a_shape)
    b = b_mean + b_scale * be.randn(b_shape)

    dists = math_utils.pdist(a, b)
    dists_t = math_utils.pdist(b, a)

    assert be.shape(dists) == (1000, 1000)
    assert be.allclose(be.transpose(dists_t), dists)
    assert 2 * math.sqrt(n) < be.mean(dists) < 3 * math.sqrt(n)
def test_gaussian_1D_1mode_train():
    # create some example data
    num = 10000
    mu = 3
    sigma = 1
    samples = be.randn((num, 1)) * sigma + mu

    # set up the reader to get minibatches
    batch_size = 100
    samples_train, samples_validate = batch.split_tensor(samples, 0.9)
    data = batch.Batch({'train': batch.InMemoryTable(samples_train, batch_size),
                        'validate': batch.InMemoryTable(samples_validate, batch_size)})

    # parameters
    learning_rate = schedules.PowerLawDecay(initial=0.1, coefficient=0.1)
    mc_steps = 1
    num_epochs = 10
    num_sample_steps = 100

    # set up the model and initialize the parameters
    vis_layer = layers.GaussianLayer(1)
    hid_layer = layers.OneHotLayer(1)

    rbm = BoltzmannMachine([vis_layer, hid_layer])
    rbm.initialize(data, method='hinton')

    # modify the parameters to shift the initialized model from the data
    # this forces it to train
    rbm.layers[0].params = layers.ParamsGaussian(rbm.layers[0].params.loc - 3,
                                                 rbm.layers[0].params.log_var - 1)

    # set up the optimizer and the fit method
    opt = optimizers.ADAM(stepsize=learning_rate)
    cd = fit.SGD(rbm, data)

    # fit the model
    print('training with persistent contrastive divergence')
    cd.train(opt, num_epochs, method=fit.pcd, mcsteps=mc_steps)

    # sample data from the trained model
    model_state = samplers.SequentialMC.generate_fantasy_state(rbm, num,
                                                               num_sample_steps)
    pts_trained = model_state[0]

    percent_error = 10
    mu_trained = be.mean(pts_trained)
    assert numpy.abs(mu_trained / mu - 1) < (percent_error / 100)

    sigma_trained = numpy.sqrt(be.var(pts_trained))
    assert numpy.abs(sigma_trained / sigma - 1) < (percent_error / 100)
def test_mean():
    # create some random data
    s = be.rand((100000,))

    # reference result
    ref_mean = be.mean(s)

    # do the online calculation
    mv = math_utils.MeanVarianceCalculator()
    for i in range(10):
        mv.update(s[i * 10000:(i + 1) * 10000])

    assert be.allclose(be.float_tensor(np.array([ref_mean])),
                       be.float_tensor(np.array([mv.mean])))
def test_bernoulli_log_partition_gradient():
    lay = layers.BernoulliLayer(500)
    lay.params.loc[:] = be.rand_like(lay.params.loc) * 2.0 - 1.0

    A = be.rand((1, 500))
    B = be.rand_like(A)
    grad = lay.grad_log_partition_function(A, B)
    logZ = be.mean(lay.log_partition_function(A, B), axis=0)

    # take a step along the gradient; the log partition function should not
    # decrease for a sufficiently small learning rate
    lr = 0.01
    gogogo = True
    while gogogo:
        cop = deepcopy(lay)
        cop.params.loc[:] = lay.params.loc + lr * grad.loc
        logZ_next = be.mean(cop.log_partition_function(A, B), axis=0)
        regress = logZ_next - logZ < 0.0
        if True in regress:
            if lr < 1e-6:
                assert False, \
                    "gradient of Bernoulli log partition function is wrong"
            else:
                lr *= 0.5
        else:
            break
def test_mean_2d():
    # create some random data
    num = 5000
    num_steps = 10
    stepsize = num // num_steps
    s = be.rand((num, 10))

    # reference result
    ref_mean = be.mean(s, axis=0)

    # do the online calculation
    mv = math_utils.MeanArrayCalculator()
    for i in range(num_steps):
        mv.update(s[i*stepsize:(i+1)*stepsize], axis=0)

    assert be.allclose(ref_mean, mv.mean)
def test_mean():
    # create some random data
    num = 100000
    num_steps = 10
    stepsize = num // num_steps
    s = be.rand((num,))

    # reference result
    ref_mean = be.mean(s)

    # do the online calculation
    mv = math_utils.MeanCalculator()
    for i in range(num_steps):
        mv.update(s[i*stepsize:(i+1)*stepsize])

    assert be.allclose(be.float_tensor(np.array([ref_mean])),
                       be.float_tensor(np.array([mv.mean])))
def update(self, samples, **kwargs) -> None:
    """
    Update the online calculation of the mean.

    Notes:
        Modifies the metrics in place.

    Args:
        samples: data samples

    Returns:
        None

    """
    num_samples = len(samples)
    self.num += num_samples
    self.mean = self.mean + \
        (be.mean(samples, **kwargs) - self.mean) * num_samples / self.num
def test_mean_variance_2d():
    # create some random data
    num = 10000
    dim2 = 10
    num_steps = 10
    stepsize = num // num_steps
    s = be.rand((num, dim2))

    # reference result
    ref_mean = be.mean(s, axis=0)
    ref_var = be.var(s, axis=0)

    # do the online calculation
    mv = math_utils.MeanVarianceArrayCalculator()
    for i in range(num_steps):
        mv.update(s[i*stepsize:(i+1)*stepsize])

    assert be.allclose(ref_mean, mv.mean)
    assert be.allclose(ref_var, mv.var, rtol=1e-3, atol=1e-5)
def update(self, samples) -> None:
    """
    Update the online calculation of the mean.

    Notes:
        Modifies the metrics in place.

    Args:
        samples (tensor): data samples

    Returns:
        None

    """
    n = len(samples)
    sample_mean = be.mean(samples)

    # update the num and mean attributes
    self.num += n
    self.mean += (sample_mean - self.mean) * n / max(self.num, 1)
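# Illustrative sketch (added for clarity, not part of the library): the
# delta-form update above is equivalent to the mean of all samples seen so
# far. numpy stands in for the `be` backend, assuming be.mean behaves like
# numpy.mean; the function name is hypothetical.
def demo_online_mean_scalar():
    import numpy as np
    data = np.random.rand(1000)
    mean, num = 0.0, 0
    for chunk in np.split(data, 10):
        n = len(chunk)
        num += n
        # shift the running mean toward the chunk mean in proportion to n
        mean += (np.mean(chunk) - mean) * n / max(num, 1)
    assert np.allclose(mean, np.mean(data))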
def test_gaussian_derivatives():
    num_visible_units = 100
    num_hidden_units = 50
    batch_size = 25

    # set a seed for the random number generator
    be.set_seed()

    # set up some layer and model objects
    vis_layer = layers.GaussianLayer(num_visible_units)
    hid_layer = layers.GaussianLayer(num_hidden_units)
    rbm = hidden.Model([vis_layer, hid_layer])

    # randomly set the intrinsic model parameters
    a = be.randn((num_visible_units,))
    b = be.randn((num_hidden_units,))
    log_var_a = 0.1 * be.randn((num_visible_units,))
    log_var_b = 0.1 * be.randn((num_hidden_units,))
    W = be.randn((num_visible_units, num_hidden_units))

    rbm.layers[0].int_params.loc[:] = a
    rbm.layers[1].int_params.loc[:] = b
    rbm.layers[0].int_params.log_var[:] = log_var_a
    rbm.layers[1].int_params.log_var[:] = log_var_b
    rbm.weights[0].int_params.matrix[:] = W

    # generate a random batch of data
    vdata = rbm.layers[0].random((batch_size, num_visible_units))
    visible_var = be.exp(log_var_a)
    vdata_scaled = vdata / be.broadcast(visible_var, vdata)

    # compute the mean of the hidden layer
    rbm.layers[1].update([vdata_scaled], [rbm.weights[0].W()])
    hidden_var = be.exp(log_var_b)
    hid_mean = rbm.layers[1].mean()
    hid_mean_scaled = rbm.layers[1].rescale(hid_mean)

    # compute the derivatives
    d_vis_loc = -be.mean(vdata_scaled, axis=0)

    d_vis_logvar = -0.5 * be.mean(be.square(be.subtract(a, vdata)), axis=0)
    d_vis_logvar += be.batch_dot(hid_mean_scaled, be.transpose(W), vdata,
                                 axis=0) / len(vdata)
    d_vis_logvar /= visible_var

    d_hid_loc = -be.mean(hid_mean_scaled, axis=0)

    d_hid_logvar = -0.5 * be.mean(be.square(hid_mean - be.broadcast(b, hid_mean)),
                                  axis=0)
    d_hid_logvar += be.batch_dot(vdata_scaled, W, hid_mean,
                                 axis=0) / len(hid_mean)
    d_hid_logvar /= hidden_var

    d_W = -be.batch_outer(vdata_scaled, hid_mean_scaled) / len(vdata_scaled)

    # compute the derivatives using the layer functions
    rbm.layers[1].update([vdata_scaled], [rbm.weights[0].W()])
    rbm.layers[0].update([hid_mean_scaled], [rbm.weights[0].W_T()])

    vis_derivs = rbm.layers[0].derivatives(vdata, [hid_mean_scaled],
                                           [rbm.weights[0].W()])
    hid_derivs = rbm.layers[1].derivatives(hid_mean, [vdata_scaled],
                                           [rbm.weights[0].W_T()])
    weight_derivs = rbm.weights[0].derivatives(vdata_scaled, hid_mean_scaled)

    assert be.allclose(d_vis_loc, vis_derivs.loc), \
        "derivative of visible loc wrong in gaussian-gaussian rbm"

    assert be.allclose(d_hid_loc, hid_derivs.loc), \
        "derivative of hidden loc wrong in gaussian-gaussian rbm"

    assert be.allclose(d_vis_logvar, vis_derivs.log_var, rtol=1e-05, atol=1e-01), \
        "derivative of visible log_var wrong in gaussian-gaussian rbm"

    assert be.allclose(d_hid_logvar, hid_derivs.log_var, rtol=1e-05, atol=1e-01), \
        "derivative of hidden log_var wrong in gaussian-gaussian rbm"

    assert be.allclose(d_W, weight_derivs.matrix), \
        "derivative of weights wrong in gaussian-gaussian rbm"
def test_onehot_derivatives():
    num_visible_units = 100
    num_hidden_units = 50
    batch_size = 25

    # set a seed for the random number generator
    be.set_seed()

    # set up some layer and model objects
    vis_layer = layers.OneHotLayer(num_visible_units)
    hid_layer = layers.OneHotLayer(num_hidden_units)
    rbm = BoltzmannMachine([vis_layer, hid_layer])

    # randomly set the intrinsic model parameters
    a = be.randn((num_visible_units,))
    b = be.randn((num_hidden_units,))
    W = be.randn((num_visible_units, num_hidden_units))

    rbm.layers[0].params.loc[:] = a
    rbm.layers[1].params.loc[:] = b
    rbm.connections[0].weights.params.matrix[:] = W

    # generate a random batch of data
    vdata = rbm.layers[0].random((batch_size, num_visible_units))
    vdata_scaled = rbm.layers[0].rescale(vdata)

    # compute the conditional mean of the hidden layer
    hid_mean = rbm.layers[1].conditional_mean([vdata], [rbm.connections[0].W()])
    hid_mean_scaled = rbm.layers[1].rescale(hid_mean)

    # compute the derivatives
    d_visible_loc = -be.mean(vdata, axis=0)
    d_hidden_loc = -be.mean(hid_mean_scaled, axis=0)
    d_W = -be.batch_outer(vdata, hid_mean_scaled) / len(vdata)

    # compute the derivatives using the layer functions
    vis_derivs = rbm.layers[0].derivatives(vdata, [hid_mean_scaled],
                                           [rbm.connections[0].W(trans=True)])
    hid_derivs = rbm.layers[1].derivatives(hid_mean, [vdata_scaled],
                                           [rbm.connections[0].W()])
    weight_derivs = rbm.connections[0].weights.derivatives(vdata, hid_mean_scaled)

    # compute simple weighted derivatives using the layer functions
    scale = 2
    scale_func = partial(be.multiply, be.float_scalar(scale))
    vis_derivs_scaled = rbm.layers[0].derivatives(vdata, [hid_mean_scaled],
                                                  [rbm.connections[0].W(trans=True)],
                                                  weighting_function=scale_func)
    hid_derivs_scaled = rbm.layers[1].derivatives(hid_mean, [vdata_scaled],
                                                  [rbm.connections[0].W()],
                                                  weighting_function=scale_func)
    weight_derivs_scaled = rbm.connections[0].weights.derivatives(vdata, hid_mean_scaled,
                                                                  weighting_function=scale_func)

    assert be.allclose(d_visible_loc, vis_derivs[0].loc), \
        "derivative of visible loc wrong in onehot-onehot rbm"

    assert be.allclose(d_hidden_loc, hid_derivs[0].loc), \
        "derivative of hidden loc wrong in onehot-onehot rbm"

    assert be.allclose(d_W, weight_derivs[0].matrix), \
        "derivative of weights wrong in onehot-onehot rbm"

    assert be.allclose(scale * d_visible_loc, vis_derivs_scaled[0].loc), \
        "weighted derivative of visible loc wrong in onehot-onehot rbm"

    assert be.allclose(scale * d_hidden_loc, hid_derivs_scaled[0].loc), \
        "weighted derivative of hidden loc wrong in onehot-onehot rbm"

    assert be.allclose(scale * d_W, weight_derivs_scaled[0].matrix), \
        "weighted derivative of weights wrong in onehot-onehot rbm"
def test_gaussian_derivatives():
    num_visible_units = 100
    num_hidden_units = 50
    batch_size = 25

    # set a seed for the random number generator
    be.set_seed()

    # set up some layer and model objects
    vis_layer = layers.GaussianLayer(num_visible_units)
    hid_layer = layers.GaussianLayer(num_hidden_units)
    rbm = BoltzmannMachine([vis_layer, hid_layer])

    # randomly set the intrinsic model parameters
    a = be.randn((num_visible_units,))
    b = be.randn((num_hidden_units,))
    log_var_a = 0.1 * be.randn((num_visible_units,))
    log_var_b = 0.1 * be.randn((num_hidden_units,))
    W = be.randn((num_visible_units, num_hidden_units))

    rbm.layers[0].params.loc[:] = a
    rbm.layers[1].params.loc[:] = b
    rbm.layers[0].params.log_var[:] = log_var_a
    rbm.layers[1].params.log_var[:] = log_var_b
    rbm.connections[0].weights.params.matrix[:] = W

    # generate a random batch of data
    vdata = rbm.layers[0].random((batch_size, num_visible_units))
    visible_var = be.exp(log_var_a)
    vdata_scaled = vdata / visible_var

    # compute the mean of the hidden layer
    hid_mean = rbm.layers[1].conditional_mean([vdata_scaled],
                                              [rbm.connections[0].W()])
    hidden_var = be.exp(log_var_b)
    hid_mean_scaled = rbm.layers[1].rescale(hid_mean)

    # compute the derivatives
    d_vis_loc = be.mean((a - vdata) / visible_var, axis=0)

    d_vis_logvar = -0.5 * be.mean(be.square(be.subtract(a, vdata)), axis=0)
    d_vis_logvar += be.batch_quadratic(hid_mean_scaled, be.transpose(W), vdata,
                                       axis=0) / len(vdata)
    d_vis_logvar /= visible_var

    d_hid_loc = be.mean((b - hid_mean) / hidden_var, axis=0)

    d_hid_logvar = -0.5 * be.mean(be.square(hid_mean - b), axis=0)
    d_hid_logvar += be.batch_quadratic(vdata_scaled, W, hid_mean,
                                       axis=0) / len(hid_mean)
    d_hid_logvar /= hidden_var

    d_W = -be.batch_outer(vdata_scaled, hid_mean_scaled) / len(vdata_scaled)

    # compute the derivatives using the layer functions
    vis_derivs = rbm.layers[0].derivatives(vdata, [hid_mean_scaled],
                                           [rbm.connections[0].W(trans=True)])
    hid_derivs = rbm.layers[1].derivatives(hid_mean, [vdata_scaled],
                                           [rbm.connections[0].W()])
    weight_derivs = rbm.connections[0].weights.derivatives(vdata_scaled,
                                                           hid_mean_scaled)

    # compute simple weighted derivatives using the layer functions
    scale = 2
    scale_func = partial(be.multiply, be.float_scalar(scale))
    vis_derivs_scaled = rbm.layers[0].derivatives(vdata, [hid_mean_scaled],
                                                  [rbm.connections[0].W(trans=True)],
                                                  weighting_function=scale_func)
    hid_derivs_scaled = rbm.layers[1].derivatives(hid_mean, [vdata_scaled],
                                                  [rbm.connections[0].W()],
                                                  weighting_function=scale_func)
    weight_derivs_scaled = rbm.connections[0].weights.derivatives(vdata_scaled,
                                                                  hid_mean_scaled,
                                                                  weighting_function=scale_func)

    assert be.allclose(d_vis_loc, vis_derivs[0].loc), \
        "derivative of visible loc wrong in gaussian-gaussian rbm"

    assert be.allclose(d_hid_loc, hid_derivs[0].loc), \
        "derivative of hidden loc wrong in gaussian-gaussian rbm"

    assert be.allclose(d_vis_logvar, vis_derivs[0].log_var, rtol=1e-05, atol=1e-01), \
        "derivative of visible log_var wrong in gaussian-gaussian rbm"

    assert be.allclose(d_hid_logvar, hid_derivs[0].log_var, rtol=1e-05, atol=1e-01), \
        "derivative of hidden log_var wrong in gaussian-gaussian rbm"

    assert be.allclose(d_W, weight_derivs[0].matrix), \
        "derivative of weights wrong in gaussian-gaussian rbm"

    assert be.allclose(scale * d_vis_loc, vis_derivs_scaled[0].loc), \
        "weighted derivative of visible loc wrong in gaussian-gaussian rbm"

    assert be.allclose(scale * d_hid_loc, hid_derivs_scaled[0].loc), \
        "weighted derivative of hidden loc wrong in gaussian-gaussian rbm"

    assert be.allclose(scale * d_vis_logvar,
                       vis_derivs_scaled[0].log_var, rtol=1e-05, atol=1e-01), \
        "weighted derivative of visible log_var wrong in gaussian-gaussian rbm"

    assert be.allclose(scale * d_hid_logvar,
                       hid_derivs_scaled[0].log_var, rtol=1e-05, atol=1e-01), \
        "weighted derivative of hidden log_var wrong in gaussian-gaussian rbm"

    assert be.allclose(scale * d_W, weight_derivs_scaled[0].matrix), \
        "weighted derivative of weights wrong in gaussian-gaussian rbm"
def test_independent():
    """
    Test sampling from an rbm with two layers connected by a weight matrix
    that contains all zeros, so that the layers are independent.

    Note:
        This test compares values estimated by *sampling* to values computed
        analytically. It can fail for small batch_size, or strict tolerances,
        even if everything is working properly.

    """
    num_visible_units = 20
    num_hidden_units = 10
    batch_size = 1000
    steps = 100
    mean_tol = 0.2
    corr_tol = 0.2

    # set a seed for the random number generator
    be.set_seed()

    layer_types = [layers.BernoulliLayer,
                   layers.GaussianLayer]

    for layer_type in layer_types:
        # set up some layer and model objects
        vis_layer = layer_type(num_visible_units)
        hid_layer = layer_type(num_hidden_units)
        rbm = BoltzmannMachine([vis_layer, hid_layer])

        # randomly set the intrinsic model parameters
        a = be.rand((num_visible_units,))
        b = be.rand((num_hidden_units,))
        W = be.zeros((num_visible_units, num_hidden_units))

        rbm.layers[0].params.loc[:] = a
        rbm.layers[1].params.loc[:] = b
        rbm.connections[0].weights.params.matrix[:] = W

        if layer_type == layers.GaussianLayer:
            log_var_a = be.randn((num_visible_units,))
            log_var_b = be.randn((num_hidden_units,))
            rbm.layers[0].params.log_var[:] = log_var_a
            rbm.layers[1].params.log_var[:] = log_var_b

        # initialize a state
        state = State.from_model(batch_size, rbm)

        # run a markov chain to update the state
        state = rbm.markov_chain(steps, state)

        # compute the mean
        state_for_moments = State.from_model(1, rbm)
        sample_mean = [be.mean(state[i], axis=0) for i in range(state.len)]
        model_mean = [rbm.layers[i].conditional_mean(
            rbm._connected_rescaled_units(i, state_for_moments),
            rbm._connected_weights(i)) for i in range(rbm.num_layers)]

        # check that the means are roughly equal
        for i in range(rbm.num_layers):
            ave = sample_mean[i]
            close = be.allclose(ave, model_mean[i][0], rtol=mean_tol, atol=mean_tol)
            assert close, \
                "{0} {1}: sample mean does not match model mean".format(layer_type, i)

        # check the cross correlation between the layers
        crosscov = be.cov(state[0], state[1])
        norm = be.outer(be.std(state[0], axis=0), be.std(state[1], axis=0))
        crosscorr = be.divide(norm, crosscov)
        assert be.tmax(be.tabs(crosscorr)) < corr_tol, \
            "{} cross correlation too large".format(layer_type)