def pt_grad(self, params, inpts, **kwargs):
    g = gzeros(params.shape)
    m, _ = inpts.shape

    hddn = logistic(gpu.dot(inpts, params[:self.m_end].reshape(self.shape))
                    + params[self.m_end:self.m_end + self.shape[1]])
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]

    # Running average of the mean hidden activation,
    # used by the KL sparsity penalty.
    if self.rho_hat_grad is None:
        self.rho_hat_grad = hddn.mean(axis=0)
    else:
        self.rho_hat_grad *= 0.9
        self.rho_hat_grad += 0.1 * hddn.mean(axis=0)
    # rho_hat = hddn.mean(axis=0)
    rho_hat = self.rho_hat_grad
    rho = self.rho
    sparsity = self.beta * gpu.sum(bKL(rho, rho_hat))

    _, delta = self.score(Z, inpts, error=True, addon=sparsity)

    g[:self.m_end] = gdot(delta.T, hddn).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)

    diff = Dsigmoid(hddn)
    dsparse_dha = -rho/rho_hat + (1 - rho)/(1 - rho_hat)
    dsc_dha = diff * (gdot(delta, params[:self.m_end].reshape(self.shape))
                      + self.beta*dsparse_dha/m)

    g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end:-self.shape[0]] = dsc_dha.sum(axis=0)

    # clean up
    del delta, hddn, Z
    return g

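# Added note, not part of the original code: assuming bKL is the
# elementwise Bernoulli KL divergence,
#     bKL(rho, rho_hat_j) = rho*log(rho/rho_hat_j)
#                           + (1 - rho)*log((1 - rho)/(1 - rho_hat_j)),
# its derivative with respect to rho_hat_j is
#     -rho/rho_hat_j + (1 - rho)/(1 - rho_hat_j),
# which is the `dsparse_dha` term in pt_grad above. The extra factor
# self.beta/m comes from the penalty weight and from rho_hat being an
# average over the m examples in the minibatch.
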
def pt_grad(self, params, inpts, **kwargs):
    g = gzeros(params.shape)
    m, _ = inpts.shape

    hddn = logistic(gpu.dot(inpts, params[:self.m_end].reshape(self.shape))
                    + params[self.m_end:self.m_end + self.shape[1]])
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]

    w = params[:self.m_end].reshape(self.shape)
    cae = gpu.sum(gpu.mean(Dsigmoid(hddn)**2, axis=0) * gpu.sum(w**2, axis=0))
    cae *= self.cae

    _, delta = self.score(Z, inpts, error=True, addon=cae)

    g[:self.m_end] = gdot(delta.T, hddn).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)

    cae_grad = gpu.mean(Dsigmoid(hddn)**2, axis=0) * w
    cae_grad += (gdot(inpts.T, (Dsigmoid(hddn)**2 * (1 - 2*hddn))) / m
                 * gpu.sum(w**2, axis=0))
    g[:self.m_end] += self.cae * 2 * cae_grad.ravel()

    dsc_dha = Dsigmoid(hddn) * gdot(delta, params[:self.m_end].reshape(self.shape))
    g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end:-self.shape[0]] = dsc_dha.sum(axis=0)

    # clean up
    del delta, hddn, Z
    return g

def pt_grad(self, params, inpts, **kwargs):
    g = gzeros(params.shape)
    m, _ = inpts.shape

    hddn = logistic(gdot(inpts, params[:self.m_end].reshape(self.shape))
                    + params[self.m_end:self.size])
    Z = gdot(hddn, params[self.size:-self.shape[0]].reshape(self.Tshape)) + params[-self.shape[0]:]

    # Running average of the mean hidden activation,
    # used by the KL sparsity penalty.
    if self.rho_hat_grad is None:
        self.rho_hat_grad = hddn.mean(axis=0)
    else:
        self.rho_hat_grad *= 0.9
        self.rho_hat_grad += 0.1 * hddn.mean(axis=0)
    # rho_hat = hddn.mean(axis=0)
    rho_hat = self.rho_hat_grad
    rho = self.rho
    sparsity = self.beta * gpu.sum(bKL(rho, rho_hat))

    _, delta = self.score(Z, inpts, error=True, addon=sparsity)

    g[self.size:-self.shape[0]] = gdot(hddn.T, delta).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)

    diff = Dsigmoid(hddn)
    dsparse_dha = -rho/rho_hat + (1 - rho)/(1 - rho_hat)
    # Backpropagate through the (untied) decoder weights, not the encoder.
    dsc_dha = diff * (gdot(delta, params[self.size:-self.shape[0]].reshape(self.Tshape).T)
                      + self.beta*dsparse_dha/m)

    g[:self.m_end] = gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end:self.size] = dsc_dha.sum(axis=0)

    # clean up
    del delta, hddn, Z
    return g

def pt_score(self, params, inpts, **kwargs):
    hddn = self.activ(gdot(inpts, params[:self.m_end].reshape(self.shape))
                      + params[self.m_end:self.size])
    Z = gdot(hddn, params[self.size:-self.shape[0]].reshape(self.Tshape)) + params[-self.shape[0]:]
    sc = self.score(Z, inpts)
    return sc

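# Added sketch, not part of the original code: a finite-difference check
# for the pt_score/pt_grad pairs above. It relies only on names already
# used in this module (np, gpu, as_numpy_array, gpu.garray); `layer` is
# any object exposing the interface shown here. Note that the sparse
# variants update a running rho_hat inside pt_score, so small
# discrepancies are expected for them.
def check_pt_grad(layer, params, inpts, eps=1e-6, n_checks=20):
    analytic = layer.pt_grad(params, inpts).as_numpy_array()
    p = params.as_numpy_array()
    for i in np.random.randint(0, p.size, n_checks):
        p_pos, p_neg = p.copy(), p.copy()
        p_pos[i] += eps
        p_neg[i] -= eps
        s_pos = layer.pt_score(gpu.garray(p_pos), inpts)
        s_neg = layer.pt_score(gpu.garray(p_neg), inpts)
        # pt_score may return a scalar or an np.array of diagnostics;
        # compare against the total score in either case.
        if isinstance(s_pos, np.ndarray):
            s_pos, s_neg = s_pos[0], s_neg[0]
        print i, analytic[i], (s_pos - s_neg) / (2 * eps)
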
def pt_grad(self, params, inpts, **kwargs):
    g = gzeros(params.shape)

    hddn = self.activ(gpu.dot(inpts, params[:self.m_end].reshape(self.shape))
                      + params[self.m_end:self.m_end + self.shape[1]])

    # Sparsify: zero every activation whose ascending rank is >= self.ak,
    # leaving self.ak active units per sample.
    _hddn = hddn.as_numpy_array()
    idxs = np.argsort(_hddn, axis=1)
    _hddn[range(_hddn.shape[0]), idxs[:, self.ak:].T] = 0
    hddn = gpu.garray(_hddn)

    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]

    _, delta = self.score(Z, inpts, error=True)

    g[:self.m_end] = gdot(delta.T, hddn).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)

    dsc_dha = gdot(delta, params[:self.m_end].reshape(self.shape)) * diff_table[self.activ](hddn)

    g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end:-self.shape[0]] = dsc_dha.sum(axis=0)

    # clean up
    del delta
    return g

def bprop(self, params, grad, delta):
    dE_da = delta * diff_table[self.activ](self.Z)
    # gradient of the bias
    grad[self.m_end:] = dE_da.sum(axis=0)
    # gradient of the weights
    grad[:self.m_end] = gdot(self.data.T, dE_da).ravel()
    # backpropagate the delta
    delta = gdot(dE_da, params[:self.m_end].reshape(self.shape).T)
    del self.Z
    return delta

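# Added sketch, not part of the original code: how fprop/bprop compose
# across a stack of such layers. `layers`, `params_list` and `grads_list`
# are illustrative names; each layer fills its own gradient array in
# place and hands the delta down to the layer below.
def backward_pass(layers, params_list, grads_list, delta):
    # delta is dE/dZ at the output of the top layer.
    for layer, p, g in reversed(zip(layers, params_list, grads_list)):
        delta = layer.bprop(p, g, delta)
    return delta
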
def grad_cd1(self, params, inputs, **kwargs):
    """CD-1 gradient for a Gaussian-Bernoulli RBM with learned precisions."""
    g = gzeros(params.shape)
    n, _ = inputs.shape

    m_end = self.m_end
    V = self.shape[0]
    H = self.shape[1]
    wm = params[:m_end].reshape(self.shape)
    prec = params[-V:][:, gpu.newaxis]

    h1, h_sampled = self.H(inputs, wm=prec*wm, bias=params[m_end:m_end+H], sampling=True)
    v2, v_sampled = gauss(h_sampled, wm=(wm/prec).T, bias=params[-(2*V):-V], prec=prec.T, sampling=True)
    h2, _ = self.H(v2, wm=prec*wm, bias=params[m_end:m_end+H])

    # Note the negative sign: the gradient is
    # supposed to point into the 'wrong' direction.
    g[:m_end] = -gdot(inputs.T*prec, h1).ravel()
    g[:m_end] += gdot(v_sampled.T*prec, h2).ravel()
    g[:m_end] *= 1./n
    g[:m_end] += self.l2*params[:m_end]

    g[m_end:m_end+H] = -h1.sum(axis=0)
    g[m_end:m_end+H] += h2.sum(axis=0)
    g[m_end:m_end+H] *= 1./n

    g[-2*V:-V] = -inputs.sum(axis=0)
    g[-2*V:-V] += v_sampled.sum(axis=0)
    g[-2*V:-V] *= 1./n
    g[-2*V:-V] *= (prec**2).T

    # Gradient for the square root of the precision.
    g[-V:] = (-gsum(2*prec.T*inputs*(params[-2*V:-V] - inputs/2), axis=0)
              + gsum(gdot(inputs.T, h1)*wm, axis=1))
    g[-V:] += (gsum(2*prec.T*v_sampled*(params[-2*V:-V] - v_sampled/2), axis=0)
               + gsum(gdot(v_sampled.T, h2)*wm, axis=1))
    g[-V:] *= 1./n

    if self.lmbd > 0.:
        # Running estimate of the mean hidden activation
        # for the KL sparsity penalty.
        if self.rho_hat is None:
            self.rho_hat = h1.mean(axis=0)
        else:
            self.rho_hat *= 0.9
            self.rho_hat += 0.1 * h1.mean(axis=0)
        dKL_drho_hat = (self.rho - self.rho_hat)/(self.rho_hat*(1 - self.rho_hat))
        h1_1mh1 = h1*(1 - h1)
        g[m_end:m_end+H] -= self.lmbd/n * gsum(h1_1mh1, axis=0) * dKL_drho_hat
        g[:m_end] -= self.lmbd/n * (gdot(inputs.T * prec, h1_1mh1) * dKL_drho_hat).ravel()

    return g

def pt_score(self, params, inpts, **kwargs):
    hddn = logistic(gdot(inpts, params[:self.m_end].reshape(self.shape))
                    + params[self.m_end:self.size])
    Z = gdot(hddn, params[self.size:-self.shape[0]].reshape(self.Tshape)) + params[-self.shape[0]:]

    if self.rho_hat is None:
        self.rho_hat = hddn.mean(axis=0)
    else:
        self.rho_hat *= 0.9
        self.rho_hat += 0.1 * hddn.mean(axis=0)

    sparsity = self.beta * gpu.sum(bKL(self.rho, self.rho_hat))
    sc = self.score(Z, inpts, addon=sparsity)
    return sc

def bprop(self, params, grad, delta):
    # TODO: check the next line -- does it match the formula in the
    # paper? delta must be defined correctly! Is self.C necessary here?
    # The loss function has no C.
    dE_da = self.C * delta * diff_table[self.activ](self.Z)
    # gradient of the bias
    grad[self.m_end:] = dE_da.sum(axis=0)
    # gradient of the weights; the second addend is the weight 'decay' term
    grad[:self.m_end] = gdot(self.data.T, dE_da).ravel() + params[:self.m_end]
    # backpropagate the delta
    delta = gdot(dE_da, params[:self.m_end].reshape(self.shape).T)
    del self.Z
    return delta

def pt_score(self, params, inpts, **kwargs):
    # fprop in tied AE
    hddn = self.activ(gpu.dot(inpts, params[:self.m_end].reshape(self.shape)), self.theta)
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]
    sc = self.score(Z, inpts)
    return sc

def grad_cd1(self, params, inputs, **kwargs):
    """CD-1 gradient for an RBM with a KL sparsity penalty on the hiddens."""
    g = gzeros(params.shape)
    n, _ = inputs.shape

    m_end = self.m_end
    V = self.shape[0]
    H = self.shape[1]
    wm = params[:m_end].reshape(self.shape)

    h1, h_sampled = self.H(inputs, wm=wm, bias=params[m_end:-V], sampling=True)
    v2, _ = self.V(h_sampled, wm=wm.T, bias=params[-V:])
    h2, _ = self.H(v2, wm=wm, bias=params[m_end:-V])

    # Note the negative sign: the gradient is
    # supposed to point into the 'wrong' direction,
    # because the used optimizer likes to minimize.
    g[:m_end] = -gdot(inputs.T, h1).ravel()
    g[:m_end] += gdot(v2.T, h2).ravel()
    g[:m_end] *= 1./n
    g[:m_end] += self.l2*params[:m_end]

    g[m_end:-V] = -h1.mean(axis=0)
    g[m_end:-V] += h2.mean(axis=0)

    g[-V:] = -inputs.mean(axis=0)
    g[-V:] += v2.mean(axis=0)

    # Running estimate of the mean hidden activation.
    if self.rho_hat is None:
        self.rho_hat = h1.mean(axis=0)
    else:
        self.rho_hat *= 0.9
        self.rho_hat += 0.1 * h1.mean(axis=0)
    dKL_drho_hat = (self.rho - self.rho_hat)/(self.rho_hat*(1 - self.rho_hat))
    h1_1mh1 = h1*(1 - h1)
    g[m_end:-V] -= self.lmbd/n * gsum(h1_1mh1, axis=0) * dKL_drho_hat
    g[:m_end] -= self.lmbd/n * (gdot(inputs.T, h1_1mh1) * dKL_drho_hat).ravel()

    return g

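# Added sketch, not part of the original code: driving grad_cd1 with
# plain minibatch SGD. The gradient above is already negated for a
# minimizing optimizer, so a descent step applies it directly. `rbm`,
# `batches` and the hyperparameters are illustrative.
def train_cd1(rbm, params, batches, lr=0.05, epochs=10):
    for _ in xrange(epochs):
        for batch in batches:
            params -= lr * rbm.grad_cd1(params, batch)
    return params
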
def pt_grad(self, params, inpts, **kwargs):
    g = gzeros(params.shape)

    hddn = self.activ(gpu.dot(inpts, params[:self.m_end].reshape(self.shape)), self.theta)
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]

    _, delta = self.score(Z, inpts, error=True)

    g[:self.m_end] = gdot(delta.T, hddn).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)

    dsc_dha = gdot(delta, params[:self.m_end].reshape(self.shape)) * diff_table[self.activ](hddn)
    g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()

    # clean up
    del delta
    return g

def reconstruction(self, params, inputs, **kwargs):
    """Reconstruction errors and diagnostics for the factored gated model."""
    x, y = inputs
    n, _ = x.shape

    weights_xf = params[:self.xf_sz].reshape(self.xfshape)
    weights_yf = params[self.xf_sz:self._cum_xy].reshape(self.yfshape)
    weights_fh = params[self._cum_xy:self._cum_xyh].reshape(self.fhshape)
    bias_h = params[self._cum_xyh:self.size]
    bias_x = params[self.size:-self.shape[0][1]]
    bias_y = params[-self.shape[0][1]:]

    factors_x = gdot(x, weights_xf)
    factors_y = gdot(y, weights_yf)
    factors = factors_x * factors_y
    h, h_sampled = bernoulli(factors, wm=weights_fh, bias=bias_h, sampling=True)
    rho_hat = h.sum()
    factors_h = gdot(h, weights_fh.T)

    # Pick the reconstruction order at random, so neither modality
    # is systematically favored.
    way = np.random.rand() > 0.5
    if way:
        # reconstruct y (output) first ...
        tmp = factors_x * factors_h
        y1, _ = self.V(tmp, wm=weights_yf.T, bias=bias_y)
        factors_y[:] = gdot(y1, weights_yf)
        # ... then reconstruct x (input).
        tmp = factors_y * factors_h
        x1, _ = self.V(tmp, wm=weights_xf.T, bias=bias_x)
    else:
        # reconstruct x (input) first ...
        tmp = factors_y * factors_h
        x1, _ = self.V(tmp, wm=weights_xf.T, bias=bias_x)
        factors_x[:] = gdot(x1, weights_xf)
        # ... then reconstruct y (output).
        tmp = factors_x * factors_h
        y1, _ = self.V(tmp, wm=weights_yf.T, bias=bias_y)

    xrec = gsum((x - x1)**2)
    yrec = gsum((y - y1)**2)
    return np.array([xrec, yrec, self.lmbd*rho_hat, self.avg_nxyf, self.avg_nfh])

def pt_grad(self, params, noisy_inpts, targets, l2=0., **kwargs):
    g = gzeros(params.shape)

    hddn = self.activ(gpu.dot(noisy_inpts, params[:self.m_end].reshape(self.shape))
                      + params[self.m_end:self.m_end + self.shape[1]])
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]

    _, delta = self.score(Z, targets, error=True)

    g[:self.m_end] = gdot(delta.T, hddn).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)

    dsc_dha = gdot(delta, params[:self.m_end].reshape(self.shape)) * diff_table[self.activ](hddn)
    g[:self.m_end] += gdot(noisy_inpts.T, dsc_dha).ravel()
    g[self.m_end:-self.shape[0]] = dsc_dha.sum(axis=0)

    # clean up
    del delta
    return g

def pt_grad(self, params, inputs, targets, l2=0, **kwargs):
    g = gzeros(params.shape)

    Z = self.activ(gpu.dot(inputs, params[:self.m_end].reshape(self.shape)) + params[self.m_end:])
    _, delta = self.score(Z, targets, error=True)
    # necessary?
    delta = self.C * delta

    g[:self.m_end] = gdot(inputs.T, delta).ravel() + params[:self.m_end]
    g[self.m_end:] = delta.sum(axis=0)

    # clean up
    del delta
    return g

def pt_grad(self, params, inpts, **kwargs):
    g = gzeros(params.shape)
    m, _ = inpts.shape

    hddn = self.activ(gdot(inpts, params[:self.m_end].reshape(self.shape))
                      + params[self.m_end:self.size])
    Z = gdot(hddn, params[self.size:-self.shape[0]].reshape(self.Tshape)) + params[-self.shape[0]:]

    _, delta = self.score(Z, inpts, error=True)

    g[self.size:-self.shape[0]] = gdot(hddn.T, delta).ravel()
    g[-self.shape[0]:] = delta.sum(axis=0)

    diff = diff_table[self.activ](hddn)
    # Backpropagate through the (untied) decoder weights, not the encoder.
    dsc_dha = diff * gdot(delta, params[self.size:-self.shape[0]].reshape(self.Tshape).T)

    g[:self.m_end] = gdot(inpts.T, dsc_dha).ravel()
    g[self.m_end:self.size] = dsc_dha.sum(axis=0)

    # clean up
    del delta, hddn, Z
    return g

def pt_score(self, params, inpts, **kwargs):
    # fprop in tied AE
    hddn = self.activ(gpu.dot(inpts, params[:self.m_end].reshape(self.shape))
                      + params[self.m_end:self.m_end + self.shape[1]])

    # Sparsify: zero every activation whose ascending rank is >= self.ak,
    # leaving self.ak active units per sample.
    _hddn = hddn.as_numpy_array()
    idxs = np.argsort(_hddn, axis=1)
    _hddn[range(_hddn.shape[0]), idxs[:, self.ak:].T] = 0
    hddn = gpu.garray(_hddn)

    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]
    sc = self.score(Z, inpts)
    return sc

def pt_grad(self, params, inputs, targets, l2=0, **kwargs):
    g = gzeros(params.shape)

    Z = self.activ(gpu.dot(inputs, params[:self.m_end].reshape(self.shape)) + params[self.m_end:])
    _, delta = self.score(Z, targets, error=True)

    g[:self.m_end] = gdot(inputs.T, delta).ravel()
    g[self.m_end:] = delta.sum(axis=0)

    # clean up
    del delta
    return g

def pt_score(self, params, inpts, **kwargs):
    hddn = logistic(gpu.dot(inpts, params[:self.m_end].reshape(self.shape))
                    + params[self.m_end:self.m_end + self.shape[1]])
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]

    w = params[:self.m_end].reshape(self.shape)
    cae = gpu.sum(gpu.mean(Dsigmoid(hddn)**2, axis=0) * gpu.sum(w**2, axis=0))
    cae *= self.cae

    sc = self.score(Z, inpts, addon=cae)
    return np.array([sc, cae])

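# Added note, not part of the original code: assuming Dsigmoid(h) gives
# h*(1 - h), the `cae` term above is the batch-averaged squared Frobenius
# norm of the encoder Jacobian,
#     ||dh/dx||_F^2 = sum_j (h_j*(1 - h_j))**2 * sum_v w_vj**2,
# i.e. the penalty of the contractive autoencoder (Rifai et al., 2011),
# weighted by the coefficient self.cae.
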
def pt_score(self, params, inpts, **kwargs):
    hddn = logistic(gpu.dot(inpts, params[:self.m_end].reshape(self.shape))
                    + params[self.m_end:self.m_end + self.shape[1]])
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]

    if self.rho_hat is None:
        self.rho_hat = hddn.mean(axis=0)
    else:
        self.rho_hat *= 0.9
        self.rho_hat += 0.1 * hddn.mean(axis=0)

    sparsity = self.beta * gpu.sum(bKL(self.rho, self.rho_hat))
    sc = self.score(Z, inpts, addon=sparsity)
    return np.array([sc, sc - sparsity, sparsity, gpu.mean(self.rho_hat)])

def cd1_3way_grad(self, params, inputs, **kwargs):
    """CD-1 gradient for the factored three-way model (x and y gate each other)."""
    g = gzeros(params.shape)
    x, y = inputs
    n, _ = x.shape

    weights_xf = params[:self.xf_sz].reshape(self.xfshape)
    weights_yf = params[self.xf_sz:self._cum_xy].reshape(self.yfshape)
    weights_fh = params[self._cum_xy:self._cum_xyh].reshape(self.fhshape)
    bias_h = params[self._cum_xyh:self.size]
    bias_x = params[self.size:-self.shape[0][1]]
    bias_y = params[-self.shape[0][1]:]

    # Normalize weights: pull the filter norms towards
    # slowly adapting running averages.
    sq_xf = weights_xf * weights_xf
    norm_xf = gpu.sqrt(sq_xf.sum(axis=0)) + SMALL
    sq_yf = weights_yf * weights_yf
    norm_yf = gpu.sqrt(sq_yf.sum(axis=0)) + SMALL
    norm_xyf = (norm_xf.mean() + norm_yf.mean())/2.
    self.avg_nxyf *= 0.95
    self.avg_nxyf += (0.05 * norm_xyf)
    weights_xf *= (self.avg_nxyf / norm_xf)
    weights_yf *= (self.avg_nxyf / norm_yf)

    sq_fh = weights_fh*weights_fh
    norm_fh = gpu.sqrt(sq_fh.sum(axis=1)) + SMALL
    self.avg_nfh *= 0.95
    self.avg_nfh += (0.05 * norm_fh.mean())
    weights_fh *= (self.avg_nfh / norm_fh[:, gpu.newaxis])
    # normalization done

    # Positive phase.
    factors_x = gdot(x, weights_xf)
    factors_y = gdot(y, weights_yf)
    factors = factors_x * factors_y
    h, h_sampled = bernoulli(factors, wm=weights_fh, bias=bias_h, sampling=True)
    factors_h = gdot(h_sampled, weights_fh.T)

    g[:self.xf_sz] = -gdot(x.T, factors_y*factors_h).ravel()
    g[self.xf_sz:self._cum_xy] = -gdot(y.T, factors_x*factors_h).ravel()
    g[self._cum_xy:self._cum_xyh] = -gdot(factors.T, h_sampled).ravel()
    g[self._cum_xyh:self.size] = -h.sum(axis=0)
    g[self.size:-self.shape[0][1]] = -x.sum(axis=0)
    g[-self.shape[0][1]:] = -y.sum(axis=0)

    # 3-way CD: reconstruct in random order, so neither
    # modality is systematically favored.
    way = np.random.rand() > 0.5
    if way:
        # reconstruct y (output) first ...
        tmp = factors_x * factors_h
        y1, _ = self.V(tmp, wm=weights_yf.T, bias=bias_y)
        factors_y[:] = gdot(y1, weights_yf)
        # ... then reconstruct x (input).
        tmp = factors_y * factors_h
        x1, _ = self.V(tmp, wm=weights_xf.T, bias=bias_x)
        factors_x[:] = gdot(x1, weights_xf)
    else:
        # reconstruct x (input) first ...
        tmp = factors_y * factors_h
        x1, _ = self.V(tmp, wm=weights_xf.T, bias=bias_x)
        factors_x[:] = gdot(x1, weights_xf)
        # ... then reconstruct y (output).
        tmp = factors_x * factors_h
        y1, _ = self.V(tmp, wm=weights_yf.T, bias=bias_y)
        factors_y[:] = gdot(y1, weights_yf)

    # Negative phase.
    factors[:] = factors_x * factors_y
    h1, _ = bernoulli(factors, wm=weights_fh, bias=bias_h)
    factors_h[:] = gdot(h1, weights_fh.T)

    g[:self.xf_sz] += gdot(x1.T, factors_y*factors_h).ravel()
    g[:self.xf_sz] *= 1./n
    g[self.xf_sz:self._cum_xy] += gdot(y1.T, factors_x*factors_h).ravel()
    g[self.xf_sz:self._cum_xy] *= 1./n
    g[self._cum_xy:self._cum_xyh] += gdot(factors.T, h1).ravel()
    g[self._cum_xy:self._cum_xyh] *= 1./n
    g[self._cum_xyh:self.size] += h1.sum(axis=0)
    g[self._cum_xyh:self.size] *= 1./n
    g[self.size:-self.shape[0][1]] += x1.sum(axis=0)
    g[self.size:-self.shape[0][1]] *= 1./n
    g[-self.shape[0][1]:] += y1.sum(axis=0)
    g[-self.shape[0][1]:] *= 1./n

    return g

def fward(self, params, data):
    return gdot(data, params[:self.m_end].reshape(self.shape)) + params[self.m_end:]

def fprop(self, params, data):
    self.data = data
    self.Z = gdot(data, params[:self.m_end].reshape(self.shape)) + params[self.m_end:]
    return self.Z

def fprop_spike(self, params, data):
    self.data = data
    self.Z = self.activ(gdot(data, params[:self.m_end].reshape(self.shape)) + params[self.m_end:])
    # sample binary spikes from the activations
    spike = self.Z > gpu.rand(self.Z.shape)
    return spike

def fprop_dropout(self, params, data):
    self.data = data
    self.Z = self.activ(gdot(data, params[:self.m_end].reshape(self.shape)) + params[self.m_end:])
    # keep each unit with probability (1 - self.dropout)
    self.drop = gpu.rand(self.Z.shape) > self.dropout
    self.Z *= self.drop
    return self.Z

def fprop(self, params, data):
    self.data = data
    self.Z = self.activ(gdot(data, params[:self.m_end].reshape(self.shape)) + params[self.m_end:])
    return self.Z

def fward_spike(self, params, data):
    Z = self.activ(gdot(data, params[:self.m_end].reshape(self.shape)) + params[self.m_end:])
    spike = Z > gpu.rand(Z.shape)
    return spike

def fward_dropout(self, params, data):
    # scale by the keep probability to match the expected training activation
    return (1 - self.dropout) * self.activ(gdot(data, params[:self.m_end].reshape(self.shape))
                                           + params[self.m_end:])

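# Added note, not part of the original code: fprop_dropout keeps each
# unit with probability (1 - self.dropout) during training, and
# fward_dropout scales the deterministic activations by the same factor
# at prediction time, so the expected input to the next layer matches
# between the two paths: E[mask * a] = (1 - self.dropout) * a.
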
def fward(self, params, data):
    return self.activ(gdot(data, params[:self.m_end].reshape(self.shape)) + params[self.m_end:])

def pt_score(self, params, noisy_inpts, targets, l2=0., **kwargs):
    # fprop in tied AE
    hddn = self.activ(gpu.dot(noisy_inpts, params[:self.m_end].reshape(self.shape))
                      + params[self.m_end:self.m_end + self.shape[1]])
    Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]
    sc = self.score(Z, targets)
    return sc

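# Added sketch, not part of the original code: feeding the denoising
# pt_score/pt_grad pair above. Mask-out ('zeroing') corruption is one
# common choice; the corruption scheme actually used elsewhere in the
# code base is an assumption here.
def denoising_step(layer, params, inpts, noise=0.3):
    mask = gpu.rand(inpts.shape) > noise   # keep each input with prob. (1 - noise)
    noisy = inpts * mask
    sc = layer.pt_score(params, noisy, inpts)
    grad = layer.pt_grad(params, noisy, inpts)
    return sc, grad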