def dbn_forward_pass(ws_vh, ws_v, ws_h, x, y=None):
    """
    Deep belief net forward pass.

    x: input data (N x D matrix)
    y: Class labels (1-of-K coded, N x K matrix). If not None, they are
        concatenated to the input of the top layer RBM when calculating the
        output of the DBN.
    ws_vh: list of layer weights (L x D x H)
    ws_v: list of layer input biases (L x D x 1)
    ws_h: list of layer output biases (L x H x 1)
    Returns activations (continuous) and outputs (0-1, sigmoid(activations))
    of the top layer.
    """
    L = len(ws_vh)
    h = x.T

    # forward (bottom-up) pass
    for l in range(L - 1):
        ah = gnp.dot(ws_vh[l].T, h) + ws_h[l]
        h = gnp.logistic(ah)

    # if supervised, concatenate class labels to the input of the top layer RBM
    if y is not None:
        h = gnp.concatenate((y.T, h))

    ah = gnp.dot(ws_vh[-1].T, h) + ws_h[-1]
    h = gnp.logistic(ah)

    return ah.T, h.T
def check_fisher_information_indep():
    """Fisher information should agree with analytic solution for base rate RBM."""
    with misc.gnumpy_conversion_check('allow'):
        rbm = random_base_rate_rbm()
        E_v = gnp.logistic(rbm.vbias)
        E_h = gnp.logistic(rbm.hbias)

        G = tractable.exact_fisher_information(rbm, batch_units=BATCH_UNITS)
        assert_close(G, G.T, 'G not symmetric')

        G_vis_vishid = G[:NVIS, NVIS + NHID:].reshape((NVIS, NVIS, NHID))
        G_hid_vishid = G[NVIS:NVIS + NHID, NVIS + NHID:].reshape((NHID, NVIS, NHID))
        G_vishid_vishid = G[NVIS + NHID:, NVIS + NHID:].reshape((NVIS, NHID, NVIS, NHID))

        assert_close(G_vis_vishid[0, 0, 1], E_v[0] * (1. - E_v[0]) * E_h[1])
        assert_close(G_vis_vishid[0, 1, 2], 0.)
        assert_close(G_hid_vishid[0, 1, 0], E_h[0] * (1. - E_h[0]) * E_v[1])
        assert_close(G_hid_vishid[0, 1, 2], 0.)
        assert_close(G_vishid_vishid[0, 1, 0, 1], E_v[0] * E_h[1] * (1. - E_v[0] * E_h[1]))
        assert_close(G_vishid_vishid[0, 1, 0, 2], E_v[0] * (1. - E_v[0]) * E_h[1] * E_h[2])
        assert_close(G_vishid_vishid[0, 2, 1, 2], E_h[2] * (1. - E_h[2]) * E_v[0] * E_v[1])
        assert_close(G_vishid_vishid[0, 1, 2, 3], 0.)
def rbm_sample(w_vh, w_v, w_h, x, k=1, clamped=None):
    """
    Sample from an RBM with k steps of Gibbs sampling.

    w_vh: Weights between visible and hidden units (matrix of size D x H)
    w_v: Visible unit biases (column vector of size D x 1)
    w_h: Hidden unit biases (column vector of size H x 1)
    x: Input (matrix of size D x N)
    k: Number of Gibbs steps. Default is 1.
    clamped: If not None, keeps the given elements of x clamped (constant)
        while sampling. clamped is a two-tuple giving the start and end
        indices of the clamped elements.
    Returns hidden and visible unit activations (matrices of size H x N, D x N)
    """
    if clamped is not None:
        cx = x[clamped[0]:clamped[1], :]

    v = x
    for i in range(k):
        # sample hiddens
        ah = gnp.dot(w_vh.T, v) + w_h
        h = gnp.logistic(ah)
        hs = h > gnp.rand(h.shape[0], h.shape[1])

        # sample visibles
        av = gnp.dot(w_vh, hs) + w_v
        v = gnp.logistic(av)

        if clamped is not None:
            v[clamped[0]:clamped[1], :] = cx

    return h, v
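# Usage sketch (added for illustration, not from the original source): build a
# tiny random RBM and run clamped Gibbs sampling with rbm_sample above. All
# names and sizes below are hypothetical.
import numpy as np
import gnumpy as gnp

D, H, N, K = 20, 10, 5, 4                  # visibles, hiddens, chains, clamped units
w_vh = gnp.randn((D, H)) * 0.1
w_v = gnp.zeros((D, 1))
w_h = gnp.zeros((H, 1))
x0 = gnp.rand(D, N) > 0.5                  # random binary initial visibles
# keep visible units 0..K-1 fixed to their initial values for all k steps
h, v = rbm_sample(w_vh, w_v, w_h, x0, k=50, clamped=(0, K))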
def pt_grad(self, params, inpts, **kwargs):
    g = gzeros(params.shape)
    m, _ = inpts.shape

    hddn = logistic(gpu.dot(inpts, params[:self.m_end].reshape(self.shape))
                    + params[self.m_end:self.m_end + self.shape[1]])
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]

    w = params[:self.m_end].reshape(self.shape)
    cae = gpu.sum(gpu.mean(Dsigmoid(hddn)**2, axis=0) * gpu.sum(w**2, axis=0))
    cae *= self.cae

    _, delta = self.score(Z, inpts, error=True, addon=cae)

    g[:self.m_end] = gdot(delta.T, hddn).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)

    cae_grad = gpu.mean(Dsigmoid(hddn)**2, axis=0) * w
    cae_grad += (gdot(inpts.T, (Dsigmoid(hddn)**2 * (1 - 2 * hddn))) / m
                 * gpu.sum(w**2, axis=0))
    g[:self.m_end] += self.cae * 2 * cae_grad.ravel()

    dsc_dha = Dsigmoid(hddn) * gdot(delta, params[:self.m_end].reshape(self.shape))

    g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end:-self.shape[0]] = dsc_dha.sum(axis=0)

    # clean up
    del delta, hddn, Z
    return g
def exact_fisher_information_biases(rbm, batch_units=10, show_progress=False):
    batch_size = 2 ** batch_units

    nvis, nhid = rbm.nvis, rbm.nhid
    num_params = nvis + nhid

    s = gnp.zeros(num_params)
    G = gnp.zeros((num_params, num_params))

    for hid, p in iter_configurations(rbm, batch_units=batch_units,
                                      show_progress=show_progress):
        g = gnp.zeros((batch_size, num_params))
        cond_vis = gnp.logistic(rbm.vis_inputs(hid))
        g[:, :nvis] = cond_vis
        g[:, nvis:] = hid

        s += gnp.dot(p, g)
        G += gnp.dot(g.T * p, g)

        diag_term = gnp.dot(p, g * (1. - g))
        G += np.diag(diag_term.as_numpy_array())

    G -= s[:, nax] * s[nax, :]

    return G
def pt_grad(self, params, inpts, **kwargs):
    g = gzeros(params.shape)
    m, _ = inpts.shape

    hddn = logistic(gdot(inpts, params[:self.m_end].reshape(self.shape))
                    + params[self.m_end:self.size])
    Z = gdot(hddn, params[self.size:-self.shape[0]].reshape(self.Tshape)) + params[-self.shape[0]:]

    if self.rho_hat_grad is None:
        self.rho_hat_grad = hddn.mean(axis=0)
    else:
        self.rho_hat_grad *= 0.9
        self.rho_hat_grad += 0.1 * hddn.mean(axis=0)

    # rho_hat = hddn.mean(axis=0)
    rho_hat = self.rho_hat_grad
    rho = self.rho
    sparsity = self.beta * gpu.sum(bKL(rho, rho_hat))

    _, delta = self.score(Z, inpts, error=True, addon=sparsity)

    g[self.size:-self.shape[0]] = gdot(hddn.T, delta).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)

    diff = Dsigmoid(hddn)
    dsparse_dha = -rho / rho_hat + (1 - rho) / (1 - rho_hat)
    dsc_dha = diff * (gdot(delta, params[:self.m_end].reshape(self.shape))
                      + self.beta * dsparse_dha / m)

    g[:self.m_end] = gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end:self.size] = dsc_dha.sum(axis=0)

    # clean up
    del delta, hddn, Z
    return g
def exact_samples(rbm, num, batch_units=10, show_progress=False):
    scores = get_scores(rbm, batch_units=batch_units).as_numpy_array()
    scores -= np.logaddexp.reduce(scores.ravel())
    p = np.exp(scores)

    prefix_len = rbm.nhid - batch_units
    prefixes = combinations_array(prefix_len).as_numpy_array()
    postfixes = combinations_array(batch_units).as_numpy_array()

    p_row = p.sum(1)
    p_row /= p_row.sum()
    cond_p_col = p / p_row[:, nax]
    # keep np.random.multinomial from choking because the sum is greater than 1
    cond_p_col *= (1. - 1e-8)

    vis = np.zeros((num, rbm.nvis))
    hid = np.zeros((num, rbm.nhid))

    with misc.gnumpy_conversion_check('allow'):
        rows = np.random.multinomial(1, p_row, size=num).argmax(1)
        #cols = np.random.multinomial(1, cond_p_col[rows, :]).argmax(1)
        cols = np.array([np.random.multinomial(1, cond_p_col[row, :]).argmax()
                         for row in rows])
        hid = np.hstack([prefixes[rows, :], postfixes[cols, :]])
        vis = np.random.binomial(1, gnp.logistic(rbm.vis_inputs(hid)))

    return binary_rbms.RBMState(gnp.garray(vis), gnp.garray(hid))
def pt_grad(self, params, inpts, **kwargs):
    g = gzeros(params.shape)
    m, _ = inpts.shape

    hddn = logistic(gpu.dot(inpts, params[:self.m_end].reshape(self.shape))
                    + params[self.m_end:self.m_end + self.shape[1]])
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]

    if self.rho_hat_grad is None:
        self.rho_hat_grad = hddn.mean(axis=0)
    else:
        self.rho_hat_grad *= 0.9
        self.rho_hat_grad += 0.1 * hddn.mean(axis=0)

    # rho_hat = hddn.mean(axis=0)
    rho_hat = self.rho_hat_grad
    rho = self.rho
    sparsity = self.beta * gpu.sum(bKL(rho, rho_hat))

    _, delta = self.score(Z, inpts, error=True, addon=sparsity)

    g[:self.m_end] = gdot(delta.T, hddn).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)

    diff = Dsigmoid(hddn)
    dsparse_dha = -rho / rho_hat + (1 - rho) / (1 - rho_hat)
    dsc_dha = diff * (gdot(delta, params[:self.m_end].reshape(self.shape))
                      + self.beta * dsparse_dha / m)

    g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end:-self.shape[0]] = dsc_dha.sum(axis=0)

    # clean up
    del delta, hddn, Z
    return g
def dbn_supervised_predict_sample(ws_vh, ws_v, ws_h, x, k=20):
    """
    Predict the class label of input x from a supervised DBN.
    WARNING: THIS IS PRETTY SLOW AND LESS RELIABLE THAN THE EXACT METHOD
    Uses the sampling method mentioned in section 6.2 of Hinton, Osindero, Teh 2006.

    x: Input data. (N x D matrix)
    k: Number of Gibbs steps
    """
    L = len(ws_vh)
    N = x.shape[0]

    # make a forward pass to get from the input layer to the visible layer of
    # the top level RBM
    h_prev = x.T

    # forward (bottom-up) pass; deterministic, i.e. we pass the activations,
    # not stochastically sampled states
    for l in range(L - 1):
        ah = gnp.dot(ws_vh[l].T, h_prev) + ws_h[l]
        h_prev = gnp.logistic(ah)

    H = ws_vh[-1].shape[0]  # number of visible units of the top level RBM
    Hx = h_prev.shape[0]    # number of hidden units in the penultimate layer
    K = H - Hx              # (H - Hx) is the number of supervised inputs to the top level RBM

    # we give random values to the supervised portion of the input
    v = gnp.concatenate((gnp.ones((K, N)) / K, h_prev))

    # we keep the penultimate-layer portion of the visible units clamped while sampling
    h, v = rbm_sample(ws_vh[-1], ws_v[-1], ws_h[-1], v, k, clamped=(K, H))

    # return the sampled label (supervised) portion of the visible units
    return v[0:K, :].T
def sigmoid(x, computeGrad=False):
    if not computeGrad:
        f = gp.logistic(x)
        return f
    g = x * (1. - x)
    return g
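# Usage note (added, hedged): the gradient branch assumes x already holds sigmoid
# *outputs*, so calling it on f = sigmoid(a) yields f * (1 - f), the derivative of
# the logistic with respect to the pre-activation a.
import gnumpy as gp

a = gp.garray([[-2.0, 0.0, 2.0]])
f = sigmoid(a)                        # forward: gp.logistic(a)
df = sigmoid(f, computeGrad=True)     # backward: f * (1 - f)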
def pseudo_likelihood_for_bit(self, vis, i):
    """Returns the likelihood of bit i of vis given all other bits of vis."""
    fe = self.free_energy(vis)
    vis_flip = vis.copy()  # copy so the caller's array is not modified in place
    vis_flip[:, i] = 1 - vis[:, i]
    fe_flip = self.free_energy(vis_flip)
    pl = gp.log(gp.logistic(fe_flip - fe))
    return pl
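# Sketch (added, hypothetical helper): the usual stochastic pseudo-likelihood
# estimate built on pseudo_likelihood_for_bit above -- flip one randomly chosen
# bit per call and scale by the number of visible units.
import numpy as np

def mean_pseudo_likelihood(rbm, vis):
    """Monte Carlo estimate of the pseudo-likelihood of a batch vis."""
    i = np.random.randint(vis.shape[1])
    return vis.shape[1] * rbm.pseudo_likelihood_for_bit(vis, i).mean()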
def __init__(self, m, n, q=100, name=""): # name of layer self.name = name self.m = m # input layer size self.n = n # output layer size #self.p = p # piece group self.q = q # batch size self.dropout = 0.1 # dropout rate self.learn = 10**-3 self.l2reg = (1.0 - 10**-100) # activation function #self.f = lambda z: 1.0/(gpu.exp(-z)+1.0) self.f = lambda z: gpu.logistic(z) #self.f = lambda z: z # deriviative of activation function #self.g = lambda z: self.f(z)*(1.0-self.f(z)) self.g = lambda z: z * (1.0 - z) #self.g = lambda z: 1.0 d = 10**-8 # weight matrix self.w = gpu.garray( np.random.uniform(low=-d, high=d, size=(m, n)).astype(np.float32)) # bias vector self.b = gpu.garray( np.random.uniform(low=-d, high=d, size=(n)).astype(np.float32)) # input of forward propagation self.x = gpu.garray( np.random.uniform(low=-d, high=d, size=(q, m)).astype(np.float32)) # output of forward propagation self.s = gpu.garray( np.random.uniform(low=-d, high=d, size=(q, n)).astype(np.float32)) # input of back propagation self.d = gpu.garray( np.random.uniform(low=-d, high=d, size=(q, n)).astype(np.float32)) # output of back propagation self.e = gpu.garray( np.random.uniform(low=-d, high=d, size=(q, m)).astype(np.float32)) # temporary array for error #self.u = gpu.garray(np.random.uniform(low=-d, high=d, size=(q, n, m)).astype(np.float32)) # novelty key ****-> set self.t.size to (n, 1, p, 1) ---> group max # mask for dropout self.r = gpu.garray( np.random.uniform(low=0., high=1., size=(self.m)).astype( np.float32) > self.dropout) / (1.0 - self.dropout) #print self.r # mask for piece group #self.t = gpu.garray(np.random.randint(low=0, high=2, size=(1, n, q)).astype(np.float32)) # outward connections self.next = [] # inward connections self.prev = []
def dbn_sample(ws_vh, ws_v, ws_h, x, y=None, k=1):
    """
    Sample from a DBN.

    ws_vh, ws_v, ws_h: Lists of layer weights for the DBN
    x: Initial sample. This is the input to the DBN. (1 x D vector)
    y: Class label for the sample. This corresponds to sampling from the class
        conditionals. (1-of-K coded, row vector)
    k: Number of Gibbs steps
    Returns a sample from the DBN (1 x D vector)
    """
    L = len(ws_vh)

    # make a forward pass to get from the input layer to the visible layer of
    # the top level RBM
    h_prev = x.T

    # forward (bottom-up) pass
    for l in range(L - 1):
        ah = gnp.dot(ws_vh[l].T, h_prev) + ws_h[l]
        h_prev = gnp.logistic(ah)
        h_prev = h_prev > gnp.rand(h_prev.shape[0], h_prev.shape[1])

    # if not supervised, sample from the top layer RBM without clamping any of
    # its inputs
    if y is None:
        # sample from top layer RBM
        h, v = rbm_sample(ws_vh[-1], ws_v[-1], ws_h[-1], h_prev, k)
    else:
        K = y.shape[1]  # number of classes
        H = ws_vh[-1].shape[0]
        # generate a random input to the top layer RBM with the class label
        # units clamped to y
        v = gnp.concatenate((y.T, h_prev))
        # sample from top layer RBM
        h, v = rbm_sample(ws_vh[-1], ws_v[-1], ws_h[-1], v, k, clamped=(0, K))
        v = v[K:H, :]

    # backward (top-down) pass
    # propagate the sample from the RBM back to the input
    for l in range(L - 2, -1, -1):
        av = gnp.dot(ws_vh[l], v) + ws_v[l]
        v = gnp.logistic(av)

    return v.T
def check_fisher_information_biases_indep():
    """Fisher information should agree with analytic solution for base rate RBM."""
    with misc.gnumpy_conversion_check('allow'):
        rbm = random_base_rate_rbm()
        E_v = gnp.logistic(rbm.vbias)
        E_h = gnp.logistic(rbm.hbias)

        G = tractable.exact_fisher_information_biases(rbm, batch_units=BATCH_UNITS)
        assert_close(G, G.T, 'G not symmetric')

        G_vis_vis = G[:NVIS, :NVIS]
        G_vis_hid = G[:NVIS, NVIS:]
        G_hid_hid = G[NVIS:, NVIS:]

        assert_close(G_vis_vis[0, 0], E_v[0] * (1. - E_v[0]))
        assert_close(G_vis_vis[0, 1], 0.)
        assert_close(G_vis_hid[0, 0], 0.)
        assert_close(G_hid_hid[0, 0], E_h[0] * (1. - E_h[0]))
        assert_close(G_hid_hid[0, 1], 0.)
def nn_forward_pass(x, w, b, return_all=True):
    """
    Forward pass for a multilayer feed-forward sigmoid neural network.

    Hidden units have sigmoid non-linearity. Output is soft-max.

    x: D x N matrix of input data
    w: Weights. List of weight matrices for each layer.
    b: Biases. List of bias vectors for each layer.
    return_all: If True, returns hidden unit activations for each layer.
        If False, just returns the output layer activations.
    Returns a list h where each element is a matrix containing the activations
    for that layer. h[0] is the input data x.
    """
    # ---- TEMP HACK --------------
    # I should find a more seamless way of running in mixed mode (some
    # operations with numpy, some with gnumpy). I had to resort to this because
    # I needed the validation classification step in nn_train to run on the CPU
    # with numpy; the GPU ran out of memory.
    if isinstance(x, gnp.garray):
        use_gpu = True
    else:
        use_gpu = False

    layer_count = len(w)
    if return_all:
        hs = [x]  # unit activations for each layer
    h = x

    # all layers except the output layer
    for l in range(layer_count - 1):
        if use_gpu:
            a = gnp.dot(w[l].T, h) + b[l]
            h = gnp.logistic(a)
        else:
            a = np.dot(gnp.as_numpy_array(w[l]).T, h) + gnp.as_numpy_array(b[l])
            h = 1.0 / (1 + np.exp(-a))
        if return_all:
            hs.append(h)

    # output layer
    if use_gpu:
        h = gnp.dot(w[-1].T, h) + b[-1]
        h = gnp.exp(h) / gnp.sum(gnp.exp(h), axis=0)  # soft-max
    else:
        h = np.dot(gnp.as_numpy_array(w[-1]).T, h) + gnp.as_numpy_array(b[-1])
        h = np.exp(h) / np.sum(np.exp(h), axis=0)  # soft-max

    if return_all:
        hs.append(h)
        return hs
    else:
        return h
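# Usage sketch (added, not part of the original code): a single-hidden-layer
# network on random data; shapes follow the D x N convention of nn_forward_pass.
import gnumpy as gnp

D, H, K, N = 8, 16, 3, 5
w = [gnp.randn((D, H)) * 0.1, gnp.randn((H, K)) * 0.1]
b = [gnp.zeros((H, 1)), gnp.zeros((K, 1))]
x = gnp.rand(D, N)
probs = nn_forward_pass(x, w, b, return_all=False)   # K x N soft-max outputs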
def dbn_supervised_predict_exact(ws_vh, ws_v, ws_h, x):
    """
    Predict the class label of input x from a supervised DBN.
    Uses the exact method mentioned in section 6.2 of Hinton, Osindero, Teh 2006.
    The free energy formula is taken from http://deeplearning.net/tutorial/rbm.html

    x: Input data. (N x D matrix)
    """
    L = len(ws_vh)
    N = x.shape[0]

    # make a forward pass to get from the input layer to the visible layer of
    # the top level RBM
    h_prev = x.T

    # forward (bottom-up) pass; deterministic, i.e. we pass the activations,
    # not stochastically sampled states
    for l in range(L - 1):
        ah = gnp.dot(ws_vh[l].T, h_prev) + ws_h[l]
        h_prev = gnp.logistic(ah)

    H = ws_vh[-1].shape[0]  # number of visible units of the top level RBM
    Hx = h_prev.shape[0]    # number of hidden units in the penultimate layer
    K = H - Hx              # (H - Hx) is the number of supervised inputs to the top level RBM

    # for every class, assume it is the correct label and calculate its free energy
    y = gnp.zeros((K, N))
    free_energy = gnp.zeros((N, K))  # we actually calculate -free_energy
    for k in range(K):
        # set the current assumed class label
        y[k, :] = 1.0

        # visible unit vector
        v = gnp.concatenate((y, h_prev))
        e_v = gnp.dot(ws_v[-1].T, v)  # bias energy term

        ah = gnp.dot(ws_vh[-1].T, v) + ws_h[-1]
        e_h = gnp.sum(gnp.log(gnp.exp(ah) + 1.0), axis=0)

        free_energy[:, k] = e_v + e_h

        # zero the class labels for the next iteration
        y[:, :] = 0.0

    # since these numbers may get pretty small, subtract the per-row maximum
    # (log-sum-exp trick) before exponentiating to convert them to probabilities
    pred_y = (gnp.exp(free_energy - gnp.max(free_energy, axis=1)[:, gnp.newaxis])
              / gnp.sum(gnp.exp(free_energy - gnp.max(free_energy, axis=1)[:, gnp.newaxis]),
                        axis=1)[:, gnp.newaxis])

    return pred_y
def pt_score(self, params, inpts, **kwargs):
    hddn = logistic(gpu.dot(inpts, params[:self.m_end].reshape(self.shape))
                    + params[self.m_end:self.m_end + self.shape[1]])
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]

    w = params[:self.m_end].reshape(self.shape)
    cae = gpu.sum(gpu.mean(Dsigmoid(hddn)**2, axis=0) * gpu.sum(w**2, axis=0))
    cae *= self.cae

    sc = self.score(Z, inpts, addon=cae)
    return np.array([sc, cae])
def exact_moments(rbm, batch_units=10, show_progress=False):
    expect_vis = gnp.zeros(rbm.nvis)
    expect_hid = gnp.zeros(rbm.nhid)
    expect_prod = gnp.zeros((rbm.nvis, rbm.nhid))

    for hid, p in iter_configurations(rbm, batch_units=batch_units,
                                      show_progress=show_progress):
        cond_vis = gnp.logistic(rbm.vis_inputs(hid))
        expect_vis += gnp.dot(p, cond_vis)
        expect_hid += gnp.dot(p, hid)
        expect_prod += gnp.dot(cond_vis.T * p, hid)

    return binary_rbms.Moments(expect_vis, expect_hid, expect_prod)
def __init__(self, m, n, q=100, name=""): # name of layer self.name = name self.m = m # input layer size self.n = n # output layer size # self.p = p # piece group self.q = q # batch size self.dropout = 0.1 # dropout rate self.learn = 10 ** -3 self.l2reg = 1.0 - 10 ** -100 # activation function # self.f = lambda z: 1.0/(gpu.exp(-z)+1.0) self.f = lambda z: gpu.logistic(z) # self.f = lambda z: z # deriviative of activation function # self.g = lambda z: self.f(z)*(1.0-self.f(z)) self.g = lambda z: z * (1.0 - z) # self.g = lambda z: 1.0 d = 10 ** -8 # weight matrix self.w = gpu.garray(np.random.uniform(low=-d, high=d, size=(m, n)).astype(np.float32)) # bias vector self.b = gpu.garray(np.random.uniform(low=-d, high=d, size=(n)).astype(np.float32)) # input of forward propagation self.x = gpu.garray(np.random.uniform(low=-d, high=d, size=(q, m)).astype(np.float32)) # output of forward propagation self.s = gpu.garray(np.random.uniform(low=-d, high=d, size=(q, n)).astype(np.float32)) # input of back propagation self.d = gpu.garray(np.random.uniform(low=-d, high=d, size=(q, n)).astype(np.float32)) # output of back propagation self.e = gpu.garray(np.random.uniform(low=-d, high=d, size=(q, m)).astype(np.float32)) # temporary array for error # self.u = gpu.garray(np.random.uniform(low=-d, high=d, size=(q, n, m)).astype(np.float32)) # novelty key ****-> set self.t.size to (n, 1, p, 1) ---> group max # mask for dropout self.r = gpu.garray(np.random.uniform(low=0.0, high=1.0, size=(self.m)).astype(np.float32) > self.dropout) / ( 1.0 - self.dropout ) # print self.r # mask for piece group # self.t = gpu.garray(np.random.randint(low=0, high=2, size=(1, n, q)).astype(np.float32)) # outward connections self.next = [] # inward connections self.prev = []
def pt_score(self, params, inpts, **kwargs):
    hddn = logistic(gdot(inpts, params[:self.m_end].reshape(self.shape))
                    + params[self.m_end:self.size])
    Z = gdot(hddn, params[self.size:-self.shape[0]].reshape(self.Tshape)) + params[-self.shape[0]:]

    if self.rho_hat is None:
        self.rho_hat = hddn.mean(axis=0)
    else:
        self.rho_hat *= 0.9
        self.rho_hat += 0.1 * hddn.mean(axis=0)

    sparsity = self.beta * gpu.sum(bKL(self.rho, self.rho_hat))

    sc = self.score(Z, inpts, addon=sparsity)
    return sc
def pt_score(self, params, inpts, **kwargs):
    hddn = logistic(gpu.dot(inpts, params[:self.m_end].reshape(self.shape))
                    + params[self.m_end:self.m_end + self.shape[1]])
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]

    if self.rho_hat is None:
        self.rho_hat = hddn.mean(axis=0)
    else:
        self.rho_hat *= 0.9
        self.rho_hat += 0.1 * hddn.mean(axis=0)

    sparsity = self.beta * gpu.sum(bKL(self.rho, self.rho_hat))

    sc = self.score(Z, inpts, addon=sparsity)
    return np.array([sc, sc - sparsity, sparsity, gpu.mean(self.rho_hat)])
def sig_ssd(z, targets, weight=0.5, predict=False, error=False, addon=0):
    """
    Sigmoid SSD (sum of squared differences).
    """
    bern = gpu.logistic(z)
    if predict:
        return bern

    n, m = bern.shape
    err = bern - targets

    if error:
        # reconstruction error + first derivative
        return weight * gpu.sum(err**2) / n + addon, 2. * weight * err / n
    else:
        # only return reconstruction error
        return weight * gpu.sum(err**2) / n + addon
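# Usage sketch (added, hedged): squared reconstruction error between sigmoid
# outputs and 0/1 targets; the second return value is the derivative with
# respect to the Bernoulli outputs, as noted in the comment above.
import gnumpy as gpu

z = gpu.randn((4, 3))                  # pre-sigmoid model outputs
t = gpu.rand(4, 3) > 0.5               # binary targets
loss, d_bern = sig_ssd(z, t, error=True)
preds = sig_ssd(z, t, predict=True)    # just gpu.logistic(z)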
def mia(z, targets, predict=False, error=False, addon=0, tiny=1e-10):
    """
    Multiple independent attributes (i.e. independent binary cross entropy
    errors).

    Feed model output _z_ through the logistic to get Bernoulli distributed
    variables.
    """
    bern = gpu.logistic(z)
    if predict:
        return bern

    n, _ = bern.shape
    # loss is binary cross entropy for every output variable
    bce = -(targets * (bern + tiny).log()
            + (1 - targets) * (1 - bern + tiny).log()).sum()

    if error:
        return bce + addon, (bern - targets) / n
    else:
        return bce + addon
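# Usage sketch (added, hedged): independent binary cross-entropy over a few
# attributes; delta is the (bern - targets) / n term returned for backprop.
import gnumpy as gpu

z = gpu.randn((5, 3))                  # pre-sigmoid model outputs
targets = gpu.rand(5, 3) > 0.5         # independent binary labels
bce, delta = mia(z, targets, error=True)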
def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):
    y = gnp.logistic(gnp.as_garray(pred))
    return (-((1 - self.target) * safe_log(1 - y) + self.target * safe_log(y)).sum(),
            y - self.target)
import numpy as np
import gnumpy as g
from bkputils import *

import scipy.special

# no idea where this comes from, but it makes gnumpy not crash (empiric value)
g.max_memory_usage = 285 * (1024 * 1024)

w = 10000
h = 10000

write("making random matrices")
m1 = np.random.rand(w, h)
m2 = np.random.rand(h, w)
writeDone()

write("numpy multiply")
n = np.dot(m1, m2)
p = scipy.special.expit(n)
writeDone()

write("gnumpy setup")
a = g.garray(m1)
b = g.garray(m2)
writeDone()

write("gnumpy multiply")
c = g.dot(a, b)
c = g.logistic(c)
writeDone()
def rbm_train(dataset, H, batch_size, epoch_count, epsilon, momentum,
              return_hidden=True, verbose=True):
    """
    Train a (binary) restricted Boltzmann machine.

    dataset: Input data. DataSet instance or matrix of size N (number of data
        points) x D (input dimension)
    H: Number of hidden units
    batch_size: Number of data points in each batch
    epoch_count: Number of training epochs
    epsilon: Learning rate, either a scalar or an array (one value for each epoch)
    momentum: Momentum parameter, either a scalar or an array (one value for
        each epoch)
    return_hidden: If True, returns hidden unit activations for training data.
    verbose: If True, prints progress information
    Returns w_vh (weights between visible-hidden units), w_v (visible unit
    biases), w_h (hidden unit biases), h (hidden unit activations for input
    data), error (reconstruction error at each epoch)
    """
    if isinstance(dataset, ds.DataSet):
        train_x = dataset.train.x
        N = dataset.train.N
        D = dataset.train.D
    else:
        train_x = dataset
        N = train_x.shape[0]
        D = train_x.shape[1]

    batch_count = int(np.ceil(N / float(batch_size)))

    # if momentum is a scalar, create a list with the same value for all epochs
    if not isinstance(momentum, list):
        momentum = [momentum] * epoch_count
    if not isinstance(epsilon, list):
        epsilon = [epsilon] * epoch_count

    # initialize weights
    w_vh = gnp.randn((D, H)) * 0.1
    w_v = gnp.zeros((D, 1))
    w_h = gnp.zeros((H, 1))

    # weight updates
    dw_vh = gnp.zeros((D, H))
    dw_v = gnp.zeros((D, 1))
    dw_h = gnp.zeros((H, 1))

    # hidden unit activations
    if return_hidden:
        h = np.zeros((N, H))  # keep this a numpy array to save memory
    else:
        h = []

    start_time = time.time()
    # reconstruction errors over epochs
    error = []

    batch_order = range(batch_count)

    for e in range(epoch_count):
        if verbose:
            print("Epoch " + repr(e + 1))

        batch_error = []
        processed_batch = 0
        for b in range(batch_count):
            processed_batch += 1
            if verbose:
                print("\r%d/%d" % (processed_batch, batch_count)),

            start = b * batch_size
            end = (b + 1) * batch_size if (b + 1) * batch_size < N else N
            x = train_x[start:end, :].T

            # apply momentum
            dw_vh *= momentum[e]
            dw_v *= momentum[e]
            dw_h *= momentum[e]

            # positive phase
            ahp = gnp.dot(w_vh.T, x) + w_h
            hp = gnp.logistic(ahp)

            # if it is the last epoch, store hidden unit activations
            if return_hidden and e == epoch_count - 1:
                h[start:end, :] = gnp.as_numpy_array(hp.T)

            # add positive gradient term
            dw_vh += gnp.dot(x, hp.T)
            dw_v += gnp.sum(x, axis=1)[:, gnp.newaxis]
            dw_h += gnp.sum(hp, axis=1)[:, gnp.newaxis]

            # sample hiddens
            hs = hp > gnp.rand(hp.shape[0], hp.shape[1])

            # negative phase
            avn = gnp.dot(w_vh, hs) + w_v
            vn = gnp.logistic(avn)
            ahn = gnp.dot(w_vh.T, vn) + w_h
            hn = gnp.logistic(ahn)

            dw_vh -= gnp.dot(vn, hn.T)
            dw_v -= gnp.sum(vn, axis=1)[:, gnp.newaxis]
            dw_h -= gnp.sum(hn, axis=1)[:, gnp.newaxis]

            # update weights
            w_vh += epsilon[e] / (end - start) * dw_vh
            w_v += epsilon[e] / (end - start) * dw_v
            w_h += epsilon[e] / (end - start) * dw_h

            batch_error.append(gnp.mean((vn - x) ** 2))

        # shuffle batch order
        np.random.shuffle(batch_order)

        error.append(np.mean(batch_error))
        if verbose:
            print("\nReconstruction error: " + repr(error[-1]))
            print("Elapsed time: " + str(time.time() - start_time))

    return w_vh, w_v, w_h, h, error
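# Usage sketch (added; data, sizes and hyperparameters are hypothetical): train
# a small RBM on random binary data, then draw fantasy samples with rbm_sample
# defined earlier.
import numpy as np
import gnumpy as gnp

data = (np.random.rand(500, 20) > 0.5).astype(np.float64)
w_vh, w_v, w_h, h, err = rbm_train(data, H=32, batch_size=50, epoch_count=5,
                                   epsilon=0.1, momentum=0.9, verbose=False)
_, fantasy = rbm_sample(w_vh, w_v, w_h, gnp.garray(data[:10].T), k=100)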
def negative_phase(self):
    self.h = gpu.logistic(gpu.dot(self.v, self.w) + self.bias_h)
    self.w_updt -= gpu.dot(self.v.T, self.h)
    self.bias_h_updt -= gpu.sum(self.h, axis=0)
    self.bias_v_updt -= gpu.sum(self.v, axis=0)
def p_vis_given_hid(self, hid):
    """Returns a vector whose ith component is the probability that the ith
    visible unit is active given the states of the hidden units."""
    return gp.logistic(gp.dot(hid, self.weights.T) + self.bias_vis)
def gibbs_updates(self):
    self.h = (self.h > gpu.rand(100, 800))
    self.v = gpu.logistic(gpu.dot(self.h, self.w.T) + self.bias_v)
def vis_expectations(self, h):
    return gnp.logistic(self.vis_inputs(h))
def forward(self, A):
    return gnp.logistic(A)
def hid_expectations(self, v):
    return gnp.logistic(self.hid_inputs(v))
def forward_prop(self, x):
    return gnp.logistic(x)
def p_hid_given_vis(self, vis):
    """Returns a vector whose ith component is the probability that the ith
    hidden unit is active given the states of the visible units."""
    return gp.logistic(gp.dot(vis, self.weights) + self.bias_hid)
m2 = (momentum * m2) - ((grad2 + n2 * L2) * alpha / (batch_size * 1.0))
mb1 = (momentum * mb1) - ((gradb1 + nb1 * L2) * alpha / (batch_size * 1.0))
mb2 = (momentum * mb2) - ((gradb2 + nb2 * L2) * alpha / (batch_size * 1.0))

w1 = w1 + m1
w2 = w2 + m2
b1 = b1 + mb1
b2 = b2 + mb2

momentum = momentum + 0.001
if momentum > 0.95:
    momentum = 0.95

batch_error_cv = 0
for i in range(100):
    batch_error_cv += 1.0 - (gpu.sum(
        np.argmax(gpu.dot(gpu.logistic(gpu.dot(X_val[i], w1)) * 0.5, w2),
                  axis=1) == y_val[i]) / 120.)

batch_error = 0
for i in xrange(batches):  # train error 5.9 sec
    z1 = gpu.dot(X[i], w1).logistic() * 0.5
    feedforward = gpu.dot(z1, w2)
    batch_error += 1. - (np.sum(
        np.equal(np.argmax(feedforward, axis=1),
                 y.as_numpy_array()[i].T) / (batch_size * 1.0)))

'''
if gpu.max(w1)**2 > 9:
    print 'halving the weights of w1'
    w1 = w1/2.
    m1 = m1/2.
if gpu.max(w2)**2 > 9:
    print 'halving the weights of w2'
    w2 = w2/2.
def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):
    y = gnp.logistic(pred)
    return (-((1 - self.target) * safe_log(1 - y) + self.target * safe_log(y)).sum(),
            y - self.target)
def base_sample_vis(self, n_samples):
    """Samples the visible units from the base rate RBM."""
    p = gp.logistic(self.base_bias_vis)
    r = gp.rand((n_samples, self.base_bias_vis.shape[0]))
    return r < p
def exact_fisher_information(rbm, batch_units=10, show_progress=False, vis_shape=None,
                             downsample=1, return_mean=False):
    batch_size = 2 ** batch_units

    if downsample == 1:
        vis_idxs = np.arange(rbm.nvis)
    else:
        temp = np.arange(rbm.nvis).reshape((28, 28))
        mask = np.zeros((28, 28), dtype=bool)
        mask[::downsample, ::downsample] = 1
        vis_idxs = temp[mask]

    nvis = vis_idxs.size
    nhid = rbm.nhid
    num_params = nvis + nhid + nvis * nhid

    E_vis = np.zeros(nvis)
    E_hid = np.zeros(nhid)
    E_vishid = np.zeros((nvis, nhid))
    E_vis_vis = np.zeros((nvis, nvis))
    E_vis_hid = np.zeros((nvis, nhid))
    E_vis_vishid = np.zeros((nvis, nvis, nhid))
    E_hid_hid = np.zeros((nhid, nhid))
    E_hid_vishid = np.zeros((nhid, nvis, nhid))
    E_vishid_vishid = np.zeros((nvis, nhid, nvis, nhid))

    for hid, p in iter_configurations(rbm, batch_units=batch_units,
                                      show_progress=show_progress):
        with misc.gnumpy_conversion_check('allow'):
            cond_vis = gnp.logistic(rbm.vis_inputs(hid))
            cond_vis = gnp.garray(cond_vis.as_numpy_array()[:, vis_idxs])
            vishid = (cond_vis[:, :, nax] * hid[:, nax, :]).reshape((batch_size, nvis * nhid))
            var_vis = cond_vis * (1. - cond_vis)

            E_vis += gnp.dot(p, cond_vis)
            E_hid += gnp.dot(p, hid)
            E_vishid += gnp.dot(cond_vis.T * p, hid)

            E_vis_vis += gnp.dot(cond_vis.T * p, cond_vis)
            diag_term = gnp.dot(p, cond_vis * (1. - cond_vis))
            E_vis_vis += gnp.garray(np.diag(diag_term.as_numpy_array()))

            E_vis_hid += gnp.dot(cond_vis.T * p, hid)

            E_hid_hid += gnp.dot(hid.T * p, hid)

            E_vis_vishid += gnp.dot(cond_vis.T * p, vishid).reshape((nvis, nvis, nhid))
            diag_term = gnp.dot(var_vis.T * p, hid)
            E_vis_vishid[np.arange(nvis), np.arange(nvis), :] += diag_term

            E_hid_vishid += gnp.dot(hid.T * p, vishid).reshape((nhid, nvis, nhid))

            E_vishid_vishid += gnp.dot(vishid.T * p, vishid).reshape((nvis, nhid, nvis, nhid))
            diag_term = ((cond_vis * (1. - cond_vis))[:, :, nax, nax]
                         * hid[:, nax, :, nax] * hid[:, nax, nax, :]
                         * p[:, nax, nax, nax]).sum(0)
            E_vishid_vishid[np.arange(nvis), :, np.arange(nvis), :] += diag_term

    G = np.zeros((num_params, num_params))
    vis_slc = slice(0, nvis)
    hid_slc = slice(nvis, nvis + nhid)
    vishid_slc = slice(nvis + nhid, None)

    G[vis_slc, vis_slc] = E_vis_vis
    G[vis_slc, hid_slc] = E_vis_hid
    G[vis_slc, vishid_slc] = E_vis_vishid.reshape((nvis, nvis * nhid))
    G[hid_slc, vis_slc] = E_vis_hid.T
    G[hid_slc, hid_slc] = E_hid_hid
    G[hid_slc, vishid_slc] = E_hid_vishid.reshape((nhid, nvis * nhid))
    G[vishid_slc, vis_slc] = E_vis_vishid.reshape((nvis, nvis * nhid)).T
    G[vishid_slc, hid_slc] = E_hid_vishid.reshape((nhid, nvis * nhid)).T
    G[vishid_slc, vishid_slc] = E_vishid_vishid.reshape((nvis * nhid, nvis * nhid))

    s = np.concatenate([E_vis, E_hid, E_vishid.ravel()])
    G -= np.outer(s, s)

    if return_mean:
        return G, s
    else:
        return G