def pt_grad(self, params, inpts, **kwargs):
    """Pretraining gradient for the contractive autoencoder (CAE).

    Tied-weight autoencoder: W (the first self.m_end entries of
    *params*) encodes, and W.T decodes.  The contraction penalty
    self.cae * sum_j mean_i(h'_ij ** 2) * ||w_j||**2 is passed as an
    addon to the reconstruction score and differentiated here.

    Parameters
    ----------
    params : flat parameter vector laid out as
        [W.ravel(), b_hidden, b_visible].
    inpts : (m, n_visible) minibatch — assumed a gnumpy garray on the
        GPU (TODO confirm against caller).

    Returns
    -------
    g : gradient with the same layout and shape as *params*.
    """
    g = gzeros(params.shape)
    m, _ = inpts.shape

    # Hoist the tied weight matrix once; the original re-reshaped
    # params at every use.
    w = params[:self.m_end].reshape(self.shape)

    hddn = logistic(gpu.dot(inpts, w)
                    + params[self.m_end:self.m_end + self.shape[1]])
    Z = gdot(hddn, w.T) + params[-self.shape[0]:]

    # Dsigmoid(hddn)**2 appears three times below; compute it once.
    dh = Dsigmoid(hddn)
    dh2 = dh ** 2
    w2_colsum = gpu.sum(w ** 2, axis=0)

    cae = self.cae * gpu.sum(gpu.mean(dh2, axis=0) * w2_colsum)

    _, delta = self.score(Z, inpts, error=True, addon=cae)

    g[:self.m_end] = gdot(delta.T, hddn).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)

    # Gradient of the contraction penalty with respect to W.
    cae_grad = gpu.mean(dh2, axis=0) * w
    cae_grad += gdot(inpts.T, (dh2 * (1 - 2 * hddn))) / m * w2_colsum
    g[:self.m_end] += self.cae * 2 * cae_grad.ravel()

    # Backprop the reconstruction error through the (tied) encoder.
    dsc_dha = dh * gdot(delta, w)
    g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end:-self.shape[0]] = dsc_dha.sum(axis=0)

    # clean up
    del delta, hddn, Z
    return g
def score_grad(weights, structure, inputs, score=score, **params):
    """Gradient of the CAE score with respect to the flat *weights*.

    Runs the forward pass via *score* (with error=True) to obtain the
    output error signal, then accumulates the tied-weight gradient,
    the contraction-penalty gradient, and the hidden-bias gradient.

    Parameters
    ----------
    weights : flat vector [W.ravel(), b_hidden, b_visible]; W is tied.
    structure : dict with at least "hdim", "cae" (penalty weight) and,
        after the forward pass, "hiddens" (cached hidden activations).
    inputs : (m, idim) data matrix.
    score : forward/score function; defaults to the module-level score.

    Returns
    -------
    (sc, g) : the scalar score and the gradient (same shape as weights).
    """
    hdim = structure["hdim"]
    m, idim = inputs.shape
    ih = idim * hdim
    g = np.zeros(weights.shape, dtype=weights.dtype)

    # Forward pass through the model; we need the 'error' signal
    # (delta) at the output.
    sc, delta = score(weights, structure=structure, inputs=inputs,
                      predict=False, error=True, **params)
    # Recover hidden values saved by the forward pass.
    hddn = structure["hiddens"]

    # Weights are tied: W serves encoder and decoder.
    w = weights[:ih].reshape(idim, hdim)
    g[:ih] = np.dot(delta.T, hddn).ravel()
    g[ih + hdim:] = delta.sum(axis=0)

    # Derivative of the contraction (CAE) penalty; Dsigmoid(hddn)**2
    # is hoisted instead of being recomputed per term.
    dh2 = Dsigmoid(hddn) ** 2
    cae_grad = np.mean(dh2, axis=0) * w
    cae_grad += (np.dot(inputs.T, dh2 * (1 - 2 * hddn)) / m
                 * np.sum(w ** 2, axis=0))
    g[:ih] += structure["cae"] * 2 * cae_grad.ravel()

    # Backprop the reconstruction error through the encoder.
    dsc_dha = Dsigmoid(hddn) * np.dot(delta, w)
    g[:ih] += np.dot(inputs.T, dsc_dha).ravel()
    g[ih:ih + hdim] = dsc_dha.sum(axis=0)

    # Clean up the cached hiddens so *structure* is not left stateful.
    del structure["hiddens"]
    return sc, g
def pt_grad(self, params, inpts, **kwargs):
    """Pretraining gradient for the sparse (KL-penalty) autoencoder.

    Maintains an exponential moving average of the mean hidden
    activation (self.rho_hat_grad, decay 0.9/0.1) and penalizes its
    KL divergence from the target activation self.rho, weighted by
    self.beta.

    Parameters
    ----------
    params : flat vector [W.ravel(), b_hidden, b_visible]; W is tied.
    inpts : (m, n_visible) minibatch.

    Returns
    -------
    g : gradient with the same layout as *params*.
    """
    g = gzeros(params.shape)
    m, _ = inpts.shape

    w = params[:self.m_end].reshape(self.shape)
    hddn = logistic(gpu.dot(inpts, w)
                    + params[self.m_end:self.m_end + self.shape[1]])
    Z = gdot(hddn, w.T) + params[-self.shape[0]:]

    # BUGFIX: use `is None`, not `== None` — once rho_hat_grad holds
    # an array, `== None` is an elementwise comparison, not a bool.
    if self.rho_hat_grad is None:
        self.rho_hat_grad = hddn.mean(axis=0)
    else:
        # Exponential moving average of the mean activation.
        self.rho_hat_grad *= 0.9
        self.rho_hat_grad += 0.1 * hddn.mean(axis=0)

    rho_hat = self.rho_hat_grad
    rho = self.rho
    sparsity = self.beta * gpu.sum(bKL(rho, rho_hat))

    _, delta = self.score(Z, inpts, error=True, addon=sparsity)

    g[:self.m_end] = gdot(delta.T, hddn).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)

    diff = Dsigmoid(hddn)
    # d KL(rho || rho_hat) / d rho_hat.
    dsparse_dha = -rho / rho_hat + (1 - rho) / (1 - rho_hat)
    dsc_dha = diff * (gdot(delta, w) + self.beta * dsparse_dha / m)

    g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end:-self.shape[0]] = dsc_dha.sum(axis=0)

    # clean up
    del delta, hddn, Z
    return g
def pt_grad(self, params, inpts, **kwargs):
    """Gradient of the contractive-autoencoder pretraining score.

    The weight matrix is tied between encoder and decoder; the
    contraction penalty (scaled by self.cae) is added as an addon to
    the reconstruction score before the error signal is produced.

    Returns the gradient, laid out like *params*:
    [dW.ravel(), db_hidden, db_visible].
    """
    n_obs, _ = inpts.shape
    grad = gzeros(params.shape)

    # Unpack the flat parameter vector into named pieces.
    weights = params[:self.m_end].reshape(self.shape)
    bias_hidden = params[self.m_end:self.m_end + self.shape[1]]
    bias_visible = params[-self.shape[0]:]

    hidden = logistic(gpu.dot(inpts, weights) + bias_hidden)
    recon = gdot(hidden, weights.T) + bias_visible

    # Contraction penalty: mean squared sigmoid-derivative per hidden
    # unit, weighted by the squared column norms of W.
    penalty = self.cae * gpu.sum(
        gpu.mean(Dsigmoid(hidden) ** 2, axis=0)
        * gpu.sum(weights ** 2, axis=0))

    _, delta = self.score(recon, inpts, error=True, addon=penalty)

    grad[:self.m_end] = gdot(delta.T, hidden).ravel()
    grad[-self.shape[0]:] = delta.sum(axis=0)

    # Contraction-penalty gradient with respect to the tied weights.
    pen_grad = gpu.mean(Dsigmoid(hidden) ** 2, axis=0) * weights
    pen_grad += (gdot(inpts.T, Dsigmoid(hidden) ** 2 * (1 - 2 * hidden))
                 / n_obs * gpu.sum(weights ** 2, axis=0))
    grad[:self.m_end] += self.cae * 2 * pen_grad.ravel()

    # Error signal pushed back through the encoder nonlinearity.
    dscore_dh = Dsigmoid(hidden) * gdot(delta, weights)
    grad[:self.m_end] += gdot(inpts.T, dscore_dh).ravel()
    grad[self.m_end:-self.shape[0]] = dscore_dh.sum(axis=0)

    # clean up
    del delta, hidden, recon
    return grad
def pt_score(self, params, inpts, **kwargs):
    """Pretraining score of the contractive autoencoder.

    Runs the tied-weight forward pass, adds the contraction penalty
    (scaled by self.cae) to the reconstruction score, and returns a
    length-2 array [total score, penalty term].
    """
    # Unpack the flat parameter vector.
    weights = params[:self.m_end].reshape(self.shape)
    bias_hidden = params[self.m_end:self.m_end + self.shape[1]]
    bias_visible = params[-self.shape[0]:]

    hidden = logistic(gpu.dot(inpts, weights) + bias_hidden)
    recon = gdot(hidden, weights.T) + bias_visible

    # Contraction penalty, already weighted by self.cae.
    penalty = self.cae * gpu.sum(
        gpu.mean(Dsigmoid(hidden) ** 2, axis=0)
        * gpu.sum(weights ** 2, axis=0))

    total = self.score(recon, inpts, addon=penalty)
    return np.array([total, penalty])
def score(weights, structure, inputs, predict=False, error=False, **params):
    """Forward pass of a tied-weight contractive autoencoder (CAE).

    Computes hidden activations and the reconstruction z, then
    delegates to structure["score"] with the contraction penalty
    cae = structure["cae"] * sum_j mean_i(h'_ij ** 2) * ||w_j||**2
    passed as an addon.  (The code computes the CAE contraction
    penalty, not a sparsity penalty.)

    Parameters
    ----------
    weights : flat vector [W.ravel(), b_hidden, b_visible]; W is tied.
    structure : dict with "hdim", "cae" and "score"; when error=True
        the hidden activations are cached in structure["hiddens"] for
        the backward pass (see score_grad).
    inputs : (m, idim) data matrix.
    predict, error : forwarded to structure["score"].

    Returns
    -------
    Whatever structure["score"] returns (the score, or a
    (score, delta) pair when error=True).
    """
    hdim = structure["hdim"]
    _, idim = inputs.shape
    ih = idim * hdim

    # Tied weights: W encodes, W.T decodes.
    w = weights[:ih].reshape(idim, hdim)
    hddn = sigmoid(np.dot(inputs, w) + weights[ih:ih + hdim])
    z = np.dot(hddn, w.T) + weights[ih + hdim:]

    cae = structure["cae"] * np.sum(
        np.mean(Dsigmoid(hddn) ** 2, axis=0) * np.sum(w ** 2, axis=0))

    if error:
        # Cache hiddens so score_grad can reuse them.
        structure["hiddens"] = hddn
    return structure["score"](z, inputs, predict=predict, error=error,
                              addon=cae)