def pt_grad(self, params, inpts, **kwargs):
    """Pretraining gradient of the sparse autoencoder (untied weights).

    *params* is a flat vector laid out as
    [W_enc (m_end), b_hidden (size - m_end), W_dec, b_visible (shape[0])].
    *inpts* is an (m, idim) minibatch. Returns the gradient as a flat
    vector with the same layout as *params*.
    """
    g = gzeros(params.shape)
    m, _ = inpts.shape

    # Forward pass: logistic hidden layer, linear reconstruction.
    hddn = logistic(gdot(inpts, params[:self.m_end].reshape(self.shape))
                    + params[self.m_end:self.size])
    Z = gdot(hddn, params[self.size:-self.shape[0]].reshape(self.Tshape)) \
        + params[-self.shape[0]:]

    # Exponentially decayed running mean of the hidden activations;
    # avoids a full pass over the dataset per sparsity estimate.
    # Fixed: compare to None with `is`, not `==` (PEP 8; `==` on array
    # types yields elementwise comparisons).
    if self.rho_hat_grad is None:
        self.rho_hat_grad = hddn.mean(axis=0)
    else:
        self.rho_hat_grad *= 0.9
        self.rho_hat_grad += 0.1 * hddn.mean(axis=0)

    rho_hat = self.rho_hat_grad
    rho = self.rho
    # Bernoulli-KL sparsity penalty added onto the reconstruction score.
    sparsity = self.beta * gpu.sum(bKL(rho, rho_hat))
    _, delta = self.score(Z, inpts, error=True, addon=sparsity)

    # Decoder weight and visible-bias gradients.
    g[self.size:-self.shape[0]] = gdot(hddn.T, delta).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)

    # Backprop through the logistic, plus the sparsity penalty derivative.
    diff = Dsigmoid(hddn)
    dsparse_dha = -rho/rho_hat + (1 - rho)/(1 - rho_hat)
    # NOTE(review): delta is backpropagated through the *encoder* weights
    # here even though this variant has separate decoder weights; the
    # exact formula would use the decoder weights transposed. Kept as-is
    # (may be deliberate tied-style approximation) — confirm with author.
    dsc_dha = diff * (gdot(delta, params[:self.m_end].reshape(self.shape))
                      + self.beta * dsparse_dha / m)

    # Encoder weight and hidden-bias gradients.
    g[:self.m_end] = gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end:self.size] = dsc_dha.sum(axis=0)

    # Release large GPU arrays early.
    del delta, hddn, Z
    return g
def score(weights, structure, inputs, predict=False, error=False, **params):
    """
    Reconstruction score with a sparsity penalty whose activation
    statistic is tracked by exponential weighting over minibatches.
    """
    n_hidden = structure["hdim"]
    act = structure["af"]
    _, n_visible = inputs.shape
    n_w = n_visible * n_hidden

    # Running mean activation; mutated in place below so the decayed
    # estimate persists inside *structure* across minibatches.
    avg_act = structure["rho_hat"]
    decay = structure["lmbd"]

    W = weights[:n_w].reshape(n_visible, n_hidden)
    hidden = act(np.dot(inputs, W) + weights[n_w:n_w + n_hidden])
    # Tied decoder: reuse W transposed, then add the visible bias.
    recon = np.dot(hidden, W.T) + weights[n_w + n_hidden:]

    # Bernoulli-KL sparsity penalty on the decayed mean activation;
    # the exponential decay avoids full passes over the dataset.
    avg_act *= decay
    avg_act += (1 - decay) * hidden.mean(axis=0)
    penalty = structure["beta"] * np.sum(bKL(structure["rho"], avg_act))

    if error:
        structure["hiddens"] = hidden
    return structure["score"](recon, inputs, predict=predict, error=error,
                              addon=penalty)
def pt_grad(self, params, inpts, **kwargs):
    """Pretraining gradient of the sparse autoencoder (tied weights).

    *params* layout: [W (m_end), b_hidden (shape[1]), ..., b_visible
    (shape[0])]; the decoder reuses W transposed. Returns the gradient
    as a flat vector matching *params*.
    """
    g = gzeros(params.shape)
    m, _ = inpts.shape

    # Forward pass; the decoder weight matrix is W transposed.
    hddn = logistic(gpu.dot(inpts, params[:self.m_end].reshape(self.shape))
                    + params[self.m_end:self.m_end + self.shape[1]])
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) \
        + params[-self.shape[0]:]

    # Exponentially decayed running mean of the hidden activations.
    # Fixed: compare to None with `is`, not `==` (PEP 8; `==` on array
    # types yields elementwise comparisons).
    if self.rho_hat_grad is None:
        self.rho_hat_grad = hddn.mean(axis=0)
    else:
        self.rho_hat_grad *= 0.9
        self.rho_hat_grad += 0.1 * hddn.mean(axis=0)

    rho_hat = self.rho_hat_grad
    rho = self.rho
    # Bernoulli-KL sparsity penalty added onto the reconstruction score.
    sparsity = self.beta * gpu.sum(bKL(rho, rho_hat))
    _, delta = self.score(Z, inpts, error=True, addon=sparsity)

    # Decoder contribution to the tied weight gradient, plus visible bias.
    g[:self.m_end] = gdot(delta.T, hddn).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)

    # Backprop through the logistic, plus the sparsity penalty derivative.
    diff = Dsigmoid(hddn)
    dsparse_dha = -rho/rho_hat + (1 - rho)/(1 - rho_hat)
    dsc_dha = diff * (gdot(delta, params[:self.m_end].reshape(self.shape))
                      + self.beta * dsparse_dha / m)

    # Encoder contribution (accumulated into the tied weights) and
    # hidden-bias gradient.
    g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end:-self.shape[0]] = dsc_dha.sum(axis=0)

    # Release large GPU arrays early.
    del delta, hddn, Z
    return g
def pt_score(self, params, inpts, **kwargs):
    """Pretraining score (untied weights): reconstruction cost plus a
    Bernoulli-KL sparsity penalty on the decayed mean hidden activation.
    """
    hddn = logistic(gdot(inpts, params[:self.m_end].reshape(self.shape))
                    + params[self.m_end:self.size])
    Z = gdot(hddn, params[self.size:-self.shape[0]].reshape(self.Tshape)) \
        + params[-self.shape[0]:]
    # Exponential decay of the running mean activation over minibatches.
    # Fixed: compare to None with `is`, not `==` (PEP 8; `==` on array
    # types yields elementwise comparisons).
    if self.rho_hat is None:
        self.rho_hat = hddn.mean(axis=0)
    else:
        self.rho_hat *= 0.9
        self.rho_hat += 0.1 * hddn.mean(axis=0)
    sparsity = self.beta * gpu.sum(bKL(self.rho, self.rho_hat))
    sc = self.score(Z, inpts, addon=sparsity)
    return sc
def pt_score(self, params, inpts, **kwargs):
    """Pretraining score (tied weights) with diagnostics.

    Returns an array [total score, reconstruction part, sparsity
    penalty, mean of the decayed activation estimate].
    """
    hddn = logistic(gpu.dot(inpts, params[:self.m_end].reshape(self.shape))
                    + params[self.m_end:self.m_end + self.shape[1]])
    # Tied decoder: encoder weights transposed, plus visible bias.
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) \
        + params[-self.shape[0]:]
    # Exponential decay of the running mean activation over minibatches.
    # Fixed: compare to None with `is`, not `==` (PEP 8; `==` on array
    # types yields elementwise comparisons).
    if self.rho_hat is None:
        self.rho_hat = hddn.mean(axis=0)
    else:
        self.rho_hat *= 0.9
        self.rho_hat += 0.1 * hddn.mean(axis=0)
    sparsity = self.beta * gpu.sum(bKL(self.rho, self.rho_hat))
    sc = self.score(Z, inpts, addon=sparsity)
    return np.array([sc, sc - sparsity, sparsity, gpu.mean(self.rho_hat)])
def pt_score(self, params, inpts, **kwargs):
    """Pretraining score (untied weights): reconstruction cost plus a
    Bernoulli-KL sparsity penalty on the decayed mean hidden activation.
    """
    hddn = logistic(
        gdot(inpts, params[:self.m_end].reshape(self.shape))
        + params[self.m_end:self.size])
    Z = gdot(hddn, params[self.size:-self.shape[0]].reshape(
        self.Tshape)) + params[-self.shape[0]:]
    # Exponential decay of the running mean activation over minibatches.
    # Fixed: compare to None with `is`, not `==` (PEP 8; `==` on array
    # types yields elementwise comparisons).
    if self.rho_hat is None:
        self.rho_hat = hddn.mean(axis=0)
    else:
        self.rho_hat *= 0.9
        self.rho_hat += 0.1 * hddn.mean(axis=0)
    sparsity = self.beta * gpu.sum(bKL(self.rho, self.rho_hat))
    sc = self.score(Z, inpts, addon=sparsity)
    return sc
def true_score(weights, structure, inputs, predict=False, error=False,
               **params):
    """
    Score with the sparsity penalty computed by the 'correct' formula:
    the mean activation over the complete batch that is passed in.
    Requires a full pass over the training set — intended for numerical
    gradient checks.
    """
    n_hidden = structure["hdim"]
    act = structure["af"]
    _, n_visible = inputs.shape
    n_w = n_visible * n_hidden

    W = weights[:n_w].reshape(n_visible, n_hidden)
    hidden = act(np.dot(inputs, W) + weights[n_w:n_w + n_hidden])
    # Tied decoder: reuse W transposed, then add the visible bias.
    recon = np.dot(hidden, W.T) + weights[n_w + n_hidden:]

    # Exact Bernoulli-KL penalty on this batch's mean activation.
    avg_act = hidden.mean(axis=0)
    penalty = structure["beta"] * np.sum(bKL(structure["rho"], avg_act))

    if error:
        structure["hiddens"] = hidden
        structure["rho_hat"] = avg_act
    return structure["score"](recon, inputs, predict=predict, error=error,
                              addon=penalty)