def setup():
    """
    Create a pickle file with a simple model.
    """
    # tearDown is guaranteed to run pop_load_data.
    control.push_load_data(False)
    with open('dbm.pkl', 'wb') as f:
        dataset = MNIST(which_set='train', start=0, stop=100, binarize=True)
        vis_layer = BinaryVector(nvis=784, bias_from_marginals=dataset)
        hid_layer1 = BinaryVectorMaxPool(layer_name='h1', pool_size=1,
                                         irange=.05, init_bias=-2.,
                                         detector_layer_dim=50)
        hid_layer2 = BinaryVectorMaxPool(layer_name='h2', pool_size=1,
                                         irange=.05, init_bias=-2.,
                                         detector_layer_dim=10)
        model = DBM(batch_size=20, niter=2, visible_layer=vis_layer,
                    hidden_layers=[hid_layer1, hid_layer2])
        model.dataset_yaml_src = """
!obj:pylearn2.datasets.binarizer.Binarizer {
    raw: !obj:pylearn2.datasets.mnist.MNIST {
        which_set: "train",
        start: 0,
        stop: 100
    }
}
"""
        model.layer_to_chains = model.make_layer_to_state(1)
        cPickle.dump(model, f, protocol=cPickle.HIGHEST_PROTOCOL)
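
# A minimal sketch (hypothetical, not the original teardown) of the cleanup
# step the comment in setup() refers to: pop the load-data flag pushed above
# and remove the pickle written by setup(). `control` is the same module used
# in setup(); deleting 'dbm.pkl' here is an assumption about the cleanup
# policy, not something taken from the source.
def _teardown_sketch():
    import os
    control.pop_load_data()
    if os.path.exists('dbm.pkl'):
        os.remove('dbm.pkl')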
def test_softmax_mf_sample_consistent():
    # A test of the Softmax class.
    # Verifies that the mean field update is consistent with
    # the sampling function.
    # Since a Softmax layer contains only one random variable
    # (with n_classes possible values), the mean field assumption
    # does not impose any restriction, so mf_update simply gives
    # the true expected value of h given v.
    # We can thus use mf_update to compute the expected value
    # of a sample of y conditioned on v, and check that samples
    # drawn using the layer's sample method converge to that
    # value.

    rng = np.random.RandomState([2012, 11, 1, 1154])
    theano_rng = MRG_RandomStreams(2012 + 11 + 1 + 1154)
    num_samples = 1000
    tol = .042

    # Make DBM
    num_vis = rng.randint(1, 11)
    n_classes = rng.randint(1, 11)

    v = BinaryVector(num_vis)
    v.set_biases(rng.uniform(-1., 1., (num_vis,)).astype(config.floatX))

    y = Softmax(n_classes=n_classes, layer_name='y', irange=1.)
    y.set_biases(rng.uniform(-1., 1., (n_classes,)).astype(config.floatX))

    dbm = DBM(visible_layer=v, hidden_layers=[y],
              batch_size=1, niter=50)

    # Randomly pick a v to condition on
    # (Random numbers are generated via dbm.rng)
    layer_to_state = dbm.make_layer_to_state(1)
    v_state = layer_to_state[v]
    y_state = layer_to_state[y]

    # Infer P(y | v) using mean field
    expected_y = y.mf_update(state_below=v.upward_state(v_state))
    expected_y = expected_y[0, :]
    expected_y = expected_y.eval()

    # Copy all the states out into a batch of size num_samples
    cause_copy = sharedX(np.zeros((num_samples,))).dimshuffle(0, 'x')
    v_state = v_state[0, :] + cause_copy
    y_state = y_state[0, :] + cause_copy

    y_samples = y.sample(state_below=v.upward_state(v_state),
                         theano_rng=theano_rng)
    y_samples = function([], y_samples)()

    check_multinomial_samples(y_samples, (num_samples, n_classes),
                              expected_y, tol)
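
# A rough sketch (assumption: the real check_multinomial_samples helper is
# defined elsewhere in this test module) of the kind of consistency check the
# test above relies on: every row should be a one-hot multinomial sample, and
# the empirical class frequencies should match the mean field expectation to
# within tol.
def _check_multinomial_samples_sketch(samples, expected_shape, expected_mean,
                                      tol):
    assert samples.shape == expected_shape
    assert np.all((samples == 0.) | (samples == 1.))   # one-hot entries
    assert np.allclose(samples.sum(axis=1), 1.)        # one class per row
    empirical_mean = samples.mean(axis=0)
    assert np.abs(empirical_mean - expected_mean).max() < tol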
def test_variational_cd():
    # Verifies that VariationalCD works well with
    # make_layer_to_symbolic_state

    visible_layer = BinaryVector(nvis=100)
    hidden_layer = BinaryVectorMaxPool(detector_layer_dim=500,
                                       pool_size=1,
                                       layer_name='h',
                                       irange=0.05,
                                       init_bias=-2.0)
    model = DBM(visible_layer=visible_layer,
                hidden_layers=[hidden_layer],
                batch_size=100,
                niter=1)

    cost = VariationalCD(num_chains=100, num_gibbs_steps=2)

    data_specs = cost.get_data_specs(model)
    mapping = DataSpecsMapping(data_specs)
    space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
    source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

    theano_args = []
    for space, source in safe_zip(space_tuple, source_tuple):
        name = '%s' % (source)
        arg = space.make_theano_batch(name=name)
        theano_args.append(arg)
    theano_args = tuple(theano_args)
    nested_args = mapping.nest(theano_args)

    grads, updates = cost.get_gradients(model, nested_args)
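
# A rough sketch (hypothetical continuation, not in the original test) of how
# the symbolic gradients returned above could be compiled and sanity-checked
# on a random binary batch. It assumes the cost's data specs reduce to a
# single 100-dimensional feature space, so `theano_args` holds exactly one
# Theano matrix.
def _check_variational_cd_grads_sketch(theano_args, grads, updates):
    from theano import config as _config, function as _function
    f = _function(list(theano_args), list(grads.values()), updates=updates,
                  on_unused_input='ignore')
    batch = np.random.RandomState(0).binomial(
        n=1, p=0.5, size=(100, 100)).astype(_config.floatX)
    for g in f(batch):
        assert np.all(np.isfinite(g))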
def make_random_basic_binary_dbm(
        rng,
        pool_size_1,
        num_vis=None,
        num_pool_1=None,
        num_pool_2=None,
        pool_size_2=None,
        center=False
        ):
    """
    Makes a DBM with BinaryVector for the visible layer, and two hidden
    layers of type BinaryVectorMaxPool. The weights and biases are
    initialized randomly with somewhat large values (i.e., not what you'd
    want to use for learning).

    rng: A numpy RandomState.
    pool_size_1: The size of the pools to use in the first layer.
    """

    if num_vis is None:
        num_vis = rng.randint(1, 11)
    if num_pool_1 is None:
        num_pool_1 = rng.randint(1, 11)
    if num_pool_2 is None:
        num_pool_2 = rng.randint(1, 11)
    if pool_size_2 is None:
        pool_size_2 = rng.randint(1, 6)

    num_h1 = num_pool_1 * pool_size_1
    num_h2 = num_pool_2 * pool_size_2

    v = BinaryVector(num_vis, center=center)
    v.set_biases(rng.uniform(-1., 1., (num_vis,)).astype(config.floatX),
                 recenter=center)

    h1 = BinaryVectorMaxPool(
        detector_layer_dim=num_h1,
        pool_size=pool_size_1,
        layer_name='h1',
        center=center,
        irange=1.)
    h1.set_biases(rng.uniform(-1., 1., (num_h1,)).astype(config.floatX),
                  recenter=center)

    h2 = BinaryVectorMaxPool(
        center=center,
        detector_layer_dim=num_h2,
        pool_size=pool_size_2,
        layer_name='h2',
        irange=1.)
    h2.set_biases(rng.uniform(-1., 1., (num_h2,)).astype(config.floatX),
                  recenter=center)

    dbm = DBM(visible_layer=v,
              hidden_layers=[h1, h2],
              batch_size=1,
              niter=50)

    return dbm
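
# A short usage sketch (not part of the original module) showing how the
# helper above is typically driven from a test: fix a seed, pick the layer
# sizes, and check the resulting layer layout. It assumes BinaryVectorMaxPool
# stores detector_layer_dim as an attribute.
def _make_random_dbm_usage_sketch():
    rng = np.random.RandomState([2014, 1, 1])
    dbm = make_random_basic_binary_dbm(rng, pool_size_1=2, num_vis=5,
                                       num_pool_1=3, num_pool_2=4,
                                       pool_size_2=1)
    assert len(dbm.hidden_layers) == 2
    assert dbm.hidden_layers[0].detector_layer_dim == 3 * 2
    assert dbm.hidden_layers[1].detector_layer_dim == 4 * 1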
def stitch_rbms(batch_size, rbm_list, niter, inference_procedure=None,
                targets=False):
    """
    Returns a DBM initialized with pre-trained RBMs, with weights and
    biases initialized according to R. Salakhutdinov's policy.

    This method assumes the RBMs were trained normally. It divides the
    first and last hidden layers' weights by two, and initializes each
    intermediary hidden layer's biases as the mean of its own biases and
    the biases of the visible layer of the RBM above it.
    """
    assert len(rbm_list) > 1

    # For intermediary hidden layers, there are two sets of biases to
    # choose from: those from the hidden layer of the given RBM, and those
    # from the visible layer of the RBM above it. As in R. Salakhutdinov's
    # code, we handle this by computing the mean of those two sets of
    # biases.
    for this_rbm, above_rbm in zip(rbm_list[:-1], rbm_list[1:]):
        hidden_layer = this_rbm.hidden_layers[0]
        visible_layer = above_rbm.visible_layer
        new_biases = 0.5 * (hidden_layer.get_biases() +
                            visible_layer.get_biases())
        hidden_layer.set_biases(new_biases)

    visible_layer = rbm_list[0].visible_layer
    visible_layer.dbm = None

    hidden_layers = []

    for rbm in rbm_list:
        # Make sure all RBMs have only one hidden layer, except for the
        # last one, which can have an optional target layer
        if rbm == rbm_list[-1]:
            if targets:
                assert len(rbm.hidden_layers) == 2
            else:
                assert len(rbm.hidden_layers) == 1
        else:
            assert len(rbm.hidden_layers) == 1
        hidden_layers = hidden_layers + rbm.hidden_layers

    for hidden_layer in hidden_layers:
        hidden_layer.dbm = None

    # Divide the first and last hidden layers' weights by two, as described
    # in R. Salakhutdinov's paper (equivalent to training with RBMs with
    # doubled weights)
    first_hidden_layer = hidden_layers[0]
    if targets:
        last_hidden_layer = hidden_layers[-2]
    else:
        last_hidden_layer = hidden_layers[-1]
    first_hidden_layer.set_weights(0.5 * first_hidden_layer.get_weights())
    last_hidden_layer.set_weights(0.5 * last_hidden_layer.get_weights())

    return DBM(batch_size, visible_layer, hidden_layers, niter,
               inference_procedure)
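
# A hedged usage sketch (not part of the original module): stitching two
# single-hidden-layer DBMs ("RBMs") into one two-layer DBM. The layer sizes
# and hyperparameters below are arbitrary illustration values; the only
# structural requirement is that the first RBM's hidden dimension matches the
# second RBM's visible dimension so their biases can be averaged.
def _stitch_rbms_usage_sketch():
    rbm1 = DBM(batch_size=10,
               visible_layer=BinaryVector(nvis=20),
               hidden_layers=[BinaryVectorMaxPool(detector_layer_dim=15,
                                                  pool_size=1,
                                                  layer_name='h1',
                                                  irange=0.05)],
               niter=1)
    rbm2 = DBM(batch_size=10,
               visible_layer=BinaryVector(nvis=15),
               hidden_layers=[BinaryVectorMaxPool(detector_layer_dim=5,
                                                  pool_size=1,
                                                  layer_name='h2',
                                                  irange=0.05)],
               niter=1)
    dbm = stitch_rbms(batch_size=10, rbm_list=[rbm1, rbm2], niter=5)
    assert len(dbm.hidden_layers) == 2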
def test_softmax_mf_energy_consistent_centering():
    # A test of the Softmax class.
    # Verifies that the mean field update is consistent with
    # the energy function when using the centering trick.
    # Since a Softmax layer contains only one random variable
    # (with n_classes possible values), the mean field assumption
    # does not impose any restriction, so mf_update simply gives
    # the true expected value of h given v.
    # We also know P(h | v)
    #  = P(h, v) / P(v)
    #  = P(h, v) / sum_h P(h, v)
    #  = exp(-E(h, v)) / sum_h exp(-E(h, v))
    # So we can check that computing P(h | v) with both
    # methods gives the same result.

    rng = np.random.RandomState([2012, 11, 1, 1131])

    # Make DBM
    num_vis = rng.randint(1, 11)
    n_classes = rng.randint(1, 11)

    v = BinaryVector(num_vis, center=True)
    v.set_biases(rng.uniform(-1., 1., (num_vis,)).astype(config.floatX),
                 recenter=True)

    y = Softmax(
        n_classes=n_classes,
        layer_name='y',
        irange=1.,
        center=True)
    y.set_biases(rng.uniform(-1., 1., (n_classes,)).astype(config.floatX),
                 recenter=True)

    dbm = DBM(visible_layer=v,
              hidden_layers=[y],
              batch_size=1,
              niter=50)

    # Randomly pick a v to condition on
    # (Random numbers are generated via dbm.rng)
    layer_to_state = dbm.make_layer_to_state(1)
    v_state = layer_to_state[v]
    y_state = layer_to_state[y]

    # Infer P(y | v) using mean field
    expected_y = y.mf_update(state_below=v.upward_state(v_state))
    expected_y = expected_y[0, :]
    expected_y = expected_y.eval()

    # Infer P(y | v) using the energy function
    energy = dbm.energy(V=v_state, hidden=[y_state])
    unnormalized_prob = T.exp(-energy)
    assert unnormalized_prob.ndim == 1
    unnormalized_prob = unnormalized_prob[0]
    unnormalized_prob = function([], unnormalized_prob)

    def compute_unnormalized_prob(which):
        write_y = np.zeros((n_classes,))
        write_y[which] = 1.

        y_value = y_state.get_value()
        y_value[0, :] = write_y
        y_state.set_value(y_value)

        return unnormalized_prob()

    probs = [compute_unnormalized_prob(idx) for idx in xrange(n_classes)]
    denom = sum(probs)
    probs = [on_prob / denom for on_prob in probs]

    # np.asarray(probs) doesn't make a numpy vector, so I do it manually
    wtf_numpy = np.zeros((n_classes,))
    for i in xrange(n_classes):
        wtf_numpy[i] = probs[i]
    probs = wtf_numpy

    if not np.allclose(expected_y, probs):
        print 'mean field expectation of h:', expected_y
        print 'expectation of h based on enumerating energy function values:', probs
        assert False
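
# A small numeric illustration (not part of the test) of the identity the
# comments above rely on: normalizing exp(-E(y=k, v)) over k is exactly a
# softmax of the negative energies, which is what the mean field update
# computes when the Softmax layer is the only hidden layer. The energy values
# below are arbitrary.
def _energy_softmax_identity_sketch():
    energies = np.array([0.3, -1.2, 2.0, 0.0])          # arbitrary E(y=k, v)
    unnormalized = np.exp(-energies)
    by_enumeration = unnormalized / unnormalized.sum()  # exp(-E) / sum exp(-E)
    shifted = -energies - (-energies).max()             # numerically stable softmax
    by_softmax = np.exp(shifted) / np.exp(shifted).sum()
    assert np.allclose(by_enumeration, by_softmax)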
def test_ais():
    """
    Test AIS computation by comparing the output of estimate_likelihood
    to the output of Russ's code for the same parameters.
    """
    try:
        # TODO: the one_hot=True is only necessary because one_hot=False is
        # broken, remove it after one_hot=False is fixed.
        trainset = MNIST(which_set='train', one_hot=True)
        testset = MNIST(which_set='test', one_hot=True)
    except NoDataPathError:
        raise SkipTest("PYLEARN2_DATA_PATH environment variable not defined")

    nvis = 784
    nhid = 20

    # Random initialization of RBM parameters
    numpy.random.seed(98734)
    w_hid = 10 * numpy.cast[theano.config.floatX](numpy.random.randn(nvis,
                                                                     nhid))
    b_vis = 10 * numpy.cast[theano.config.floatX](numpy.random.randn(nvis))
    b_hid = 10 * numpy.cast[theano.config.floatX](numpy.random.randn(nhid))

    # Initialization of the RBM (a DBM with a single hidden layer)
    visible_layer = BinaryVector(nvis)
    hidden_layer = BinaryVectorMaxPool(detector_layer_dim=nhid, pool_size=1,
                                       layer_name='h', irange=0.1)
    rbm = DBM(100, visible_layer, [hidden_layer], 1)
    rbm.visible_layer.set_biases(b_vis)
    rbm.hidden_layers[0].set_weights(w_hid)
    rbm.hidden_layers[0].set_biases(b_hid)
    rbm.nvis = nvis
    rbm.nhid = nhid

    # Compute the real logz and the associated train_ll and test_ll using
    # rbm_tools
    v_sample = T.matrix('v_sample')
    h_sample = T.matrix('h_sample')
    W = theano.shared(rbm.hidden_layers[0].get_weights())
    hbias = theano.shared(rbm.hidden_layers[0].get_biases())
    vbias = theano.shared(rbm.visible_layer.get_biases())

    wx_b = T.dot(v_sample, W) + hbias
    vbias_term = T.dot(v_sample, vbias)
    hidden_term = T.sum(T.log(1 + T.exp(wx_b)), axis=1)
    free_energy_v = -hidden_term - vbias_term
    free_energy_v_fn = theano.function(inputs=[v_sample],
                                       outputs=free_energy_v)

    wh_c = T.dot(h_sample, W.T) + vbias
    hbias_term = T.dot(h_sample, hbias)
    visible_term = T.sum(T.log(1 + T.exp(wh_c)), axis=1)
    free_energy_h = -visible_term - hbias_term
    free_energy_h_fn = theano.function(inputs=[h_sample],
                                       outputs=free_energy_h)

    real_logz = rbm_tools.compute_log_z(rbm, free_energy_h_fn)
    real_ais_train_ll = -rbm_tools.compute_nll(rbm,
                                               trainset.get_design_matrix(),
                                               real_logz, free_energy_v_fn)
    real_ais_test_ll = -rbm_tools.compute_nll(rbm,
                                              testset.get_design_matrix(),
                                              real_logz, free_energy_v_fn)

    # Compute train_ll, test_ll and logz using dbm_metrics
    train_ll, test_ll, logz = dbm_metrics.estimate_likelihood(
        [W], [vbias, hbias], trainset, testset, pos_mf_steps=100)

    assert (real_logz - logz) < 2.0
    assert (real_ais_train_ll - train_ll) < 2.0
    assert (real_ais_test_ll - test_ll) < 2.0
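
# A brief numpy cross-check (illustration only, not part of the test) of the
# visible free energy built symbolically above:
#     F(v) = -v . b_vis - sum_j log(1 + exp((v W + b_hid)_j))
# For a design-matrix batch v of plain numpy arrays it should agree with
# free_energy_v_fn up to numerical precision; w, bv, bh are assumed to be the
# numpy values of W, vbias, hbias.
def _free_energy_numpy_sketch(v, w, bv, bh):
    wx_b = numpy.dot(v, w) + bh
    return -numpy.dot(v, bv) - numpy.sum(numpy.log1p(numpy.exp(wx_b)), axis=1)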