def up_pass(params, pixels):
    """
    Perform an upward pass from the visible pixels to the visible
    units of the top-level RBM.
    """
    # This is deterministic. (i.e. It uses the real-valued
    # probabilities rather than sampling.)
    hid1_mean = logistic(pixels.dot(params[0].W_r) + params[0].b_r)
    hid2_mean = logistic(hid1_mean.dot(params[1].W_r) + params[1].b_r)
    return hid2_mean
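The `logistic` helper used throughout these excerpts is not shown; a minimal sketch, assuming it is the standard elementwise sigmoid:

import numpy as np

def logistic(x):
    # Elementwise sigmoid, mapping real-valued activations into (0, 1).
    return 1.0 / (1.0 + np.exp(-x))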
def down_pass(params, v):
    """
    Perform a deterministic downward pass from the visible units of
    the top-level RBM to the visible pixels.
    """
    # The visible units of the top-level RBM include a softmax group
    # which is not directly connected to the visible pixels.
    hid2_mean = v[:, mnist.NUM_CLASSES:]
    hid1_mean = logistic(hid2_mean.dot(params[1].W_g) + params[1].b_g)
    vis_mean = logistic(hid1_mean.dot(params[0].W_g) + params[0].b_g)
    return vis_mean
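For illustration, a hypothetical round trip through the recognition and then the generative weights. Prepending `targets` mirrors the `np.hstack((targets, inputs))` used when training the top-level RBM below:

# Hypothetical usage: recognize a batch of digits, then reconstruct them.
hid2_mean = up_pass(params, pixels)
v = np.hstack((targets, hid2_mean))  # prepend the softmax label group
recon = down_pass(params, v)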
def sample_h(rbm, v, end_of_chain):
    # Compute the hidden unit activation probabilities, and sample
    # binary states unless this is the end of the chain, where only
    # the means are needed.
    h_mean = logistic((v / rbm.sigma).dot(rbm.W.T) + rbm.h_bias)
    if not end_of_chain:
        h = h_mean > np.random.random(h_mean.shape)
    else:
        h = None
    return h, h_mean
def sample_h_noisy_relu(rbm, v, end_of_chain):
    # Noisy rectified linear units: activations are max(0, x + eps)
    # with eps ~ N(0, sigmoid(x)). (cf. Nair & Hinton, 2010.)
    propup = (v / rbm.sigma).dot(rbm.W.T) + rbm.h_bias
    h_mean = np.maximum(0, propup)
    if not end_of_chain:
        noise = np.sqrt(logistic(propup)) * np.random.standard_normal(propup.shape)
        h = np.maximum(0, propup + noise)
    else:
        h = None
    return h, h_mean
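The `rbm.sample_bernoulli` helper used during fine-tuning below is also not shown; a minimal sketch, assuming it thresholds probabilities against uniform noise exactly as the sampling line in `sample_h` does:

import numpy as np

def sample_bernoulli(p):
    # Draw binary states; each unit turns on with probability p.
    return p > np.random.random(p.shape)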
targets = targets[0:n]
labels = labels[0:n]

# These layers differ slightly from those in the paper. My main
# motivation is to avoid square weight matrices between hidden
# layers, so that matrix transpose errors show up as shape errors.
num_vis = inputs.shape[1]
num_hid1 = 529   # 23^2
num_hid2 = 484   # 22^2
num_top = 1936   # 44^2

batches = data.BatchIterator(inputs)
initial_params = rbm.initial_params(num_hid1, num_vis)
params = sgd(rbm_obj, initial_params, batches, momentum)

inputs = logistic(inputs.dot(params.W.T) + params.h_bias)
batches = data.BatchIterator(inputs)
initial_params = rbm.initial_params(num_hid2, num_hid1)
params = sgd(rbm_obj, initial_params, batches, momentum)

inputs = logistic(inputs.dot(params.W.T) + params.h_bias)
batches = data.BatchIterator(np.hstack((targets, inputs)))
initial_params = rbm.initial_params(num_top, num_hid2 + mnist.NUM_CLASSES)

def post_epoch(*args):
    print 'Mean hidden activation prob. is %.2f' % pcd.q

# Optimization objective for the top-level RBM.
pcd = rbm.pcd(rbm.sample_h, sample_v_softmax, rbm.neg_free_energy_grad,
              weight_decay)
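`sample_v_softmax` isn't shown in these excerpts. A minimal sketch of what it presumably does, assuming the first `mnist.NUM_CLASSES` visible units form the softmax label group, that the RBM exposes a `v_bias` attribute analogous to `h_bias` with unit sigma, and that it mirrors the `(sample, mean)` return convention of `sample_h`:

import numpy as np

def sample_v_softmax(rbm, h, end_of_chain):
    # Mean activations: a softmax group over the class labels,
    # followed by logistic units for the remaining visible units.
    propdown = h.dot(rbm.W) + rbm.v_bias
    exp_labels = np.exp(propdown[:, 0:mnist.NUM_CLASSES])
    label_mean = exp_labels / exp_labels.sum(1)[:, np.newaxis]
    rest_mean = logistic(propdown[:, mnist.NUM_CLASSES:])
    v_mean = np.hstack((label_mean, rest_mean))
    if end_of_chain:
        return None, v_mean
    # One-hot sample from each softmax row, Bernoulli sample elsewhere.
    u = np.random.random((h.shape[0], 1))
    label_sample = np.eye(mnist.NUM_CLASSES)[(label_mean.cumsum(1) < u).sum(1)]
    rest_sample = rest_mean > np.random.random(rest_mean.shape)
    v = np.hstack((label_sample, rest_sample))
    return v, v_mean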
def contrastive_wake_sleep(params, data, weight_decay=None, cd_k=1):
    inputs, targets = data.inputs, data.targets
    num_cases = inputs.shape[0]

    # Turn the single tuple of parameters into something easier to
    # work with.
    dbn_params = dbn.stack_params(params)
    grad = []

    # Wake phase.
    wake_hid1_states = rbm.sample_bernoulli(
        logistic(inputs.dot(dbn_params[0].W_r) + dbn_params[0].b_r))
    wake_hid2_states = rbm.sample_bernoulli(
        logistic(wake_hid1_states.dot(dbn_params[1].W_r) + dbn_params[1].b_r))

    # Contrastive divergence.
    gc = rbm.gibbs_chain(np.hstack((targets, wake_hid2_states)),
                         dbn_params[-1],
                         rbm.sample_h, sample_v_softmax,
                         cd_k + 1)
    pos_sample = gc.next()
    if cd_k == 1:
        neg_sample = gc.next()
    else:
        recon_sample = gc.next()
        neg_sample = itertools.islice(gc, cd_k - 2, None).next()

    # Sleep phase.
    sleep_hid2_states = neg_sample[0][:, mnist.NUM_CLASSES:]
    sleep_hid1_states = rbm.sample_bernoulli(
        logistic(sleep_hid2_states.dot(dbn_params[1].W_g) + dbn_params[1].b_g))
    sleep_vis_probs = logistic(
        sleep_hid1_states.dot(dbn_params[0].W_g) + dbn_params[0].b_g)

    # Predictions.
    p_sleep_hid2 = logistic(
        sleep_hid1_states.dot(dbn_params[1].W_r) + dbn_params[1].b_r)
    p_sleep_hid1 = logistic(
        sleep_vis_probs.dot(dbn_params[0].W_r) + dbn_params[0].b_r)
    p_wake_vis = logistic(
        wake_hid1_states.dot(dbn_params[0].W_g) + dbn_params[0].b_g)
    p_wake_hid1 = logistic(
        wake_hid2_states.dot(dbn_params[1].W_g) + dbn_params[1].b_g)

    # Gradients.
    # Layer 0.
    W_r_grad = sleep_vis_probs.T.dot(p_sleep_hid1 - sleep_hid1_states) / num_cases
    b_r_grad = np.mean(p_sleep_hid1 - sleep_hid1_states, 0)
    W_g_grad = wake_hid1_states.T.dot(p_wake_vis - inputs) / num_cases
    b_g_grad = np.mean(p_wake_vis - inputs, 0)
    grad.extend([W_r_grad, b_r_grad, W_g_grad, b_g_grad])

    # Layer 1.
    W_r_grad = sleep_hid1_states.T.dot(p_sleep_hid2 - sleep_hid2_states) / num_cases
    b_r_grad = np.mean(p_sleep_hid2 - sleep_hid2_states, 0)
    W_g_grad = wake_hid2_states.T.dot(p_wake_hid1 - wake_hid1_states) / num_cases
    b_g_grad = np.mean(p_wake_hid1 - wake_hid1_states, 0)
    grad.extend([W_r_grad, b_r_grad, W_g_grad, b_g_grad])

    # Top-level RBM.
    pos_grad = rbm.neg_free_energy_grad(dbn_params[-1], pos_sample)
    neg_grad = rbm.neg_free_energy_grad(dbn_params[-1], neg_sample)
    rbm_grad = map(operator.sub, neg_grad, pos_grad)
    grad.extend(rbm_grad)

    # Weight decay.
    if weight_decay:
        weight_grad = (weight_decay(p)[1] for p in params)
        grad = map(operator.add, grad, weight_grad)

    # One-step reconstruction error.
    if cd_k == 1:
        recon = sleep_vis_probs
    else:
        # Perform a deterministic down pass from the first sample of
        # the Gibbs chain in order to compute the one-step
        # reconstruction error.
        recon_hid2_probs = recon_sample[1][:, mnist.NUM_CLASSES:]
        recon_hid1_probs = rbm.sample_bernoulli(
            logistic(recon_hid2_probs.dot(dbn_params[1].W_g) + dbn_params[1].b_g))
        recon = logistic(
            recon_hid1_probs.dot(dbn_params[0].W_g) + dbn_params[0].b_g)
    error = np.sum((inputs - recon) ** 2) / num_cases

    return error, grad
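Like the pre-training objectives, `contrastive_wake_sleep` returns an `(error, grad)` pair, so it can plausibly be handed to the same `sgd` driver. A hypothetical fine-tuning call, assuming `BatchIterator` can carry input/target pairs and that objectives are specialized with `functools.partial`:

import functools

# Hypothetical fine-tuning stage; the sgd signature is copied from the
# pre-training calls above, everything else is an assumption.
cws_obj = functools.partial(contrastive_wake_sleep, weight_decay=None, cd_k=3)
batches = data.BatchIterator(inputs, targets)
params = sgd(cws_obj, params, batches, momentum)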