Example #1
    def pt_grad(self, params, inpts, **kwargs):
        g = gzeros(params.shape)
        m, _ = inpts.shape

        hddn = logistic(
            gpu.dot(inpts, params[: self.m_end].reshape(self.shape)) + params[self.m_end : self.m_end + self.shape[1]]
        )
        Z = gdot(hddn, params[: self.m_end].reshape(self.shape).T) + params[-self.shape[0] :]

        w = params[: self.m_end].reshape(self.shape)
        cae = gpu.sum(gpu.mean(Dsigmoid(hddn) ** 2, axis=0) * gpu.sum(w ** 2, axis=0))
        cae *= self.cae

        _, delta = self.score(Z, inpts, error=True, addon=cae)

        g[: self.m_end] = gdot(delta.T, hddn).ravel()
        g[-self.shape[0] :] = delta.sum(axis=0)

        cae_grad = gpu.mean(Dsigmoid(hddn) ** 2, axis=0) * w
        cae_grad += gdot(inpts.T, (Dsigmoid(hddn) ** 2 * (1 - 2 * hddn))) / m * gpu.sum(w ** 2, axis=0)
        g[: self.m_end] += self.cae * 2 * cae_grad.ravel()

        dsc_dha = Dsigmoid(hddn) * gdot(delta, params[: self.m_end].reshape(self.shape))

        g[: self.m_end] += gdot(inpts.T, dsc_dha).ravel()

        g[self.m_end : -self.shape[0]] = dsc_dha.sum(axis=0)
        # clean up
        del delta, hddn, Z
        return g
Example #2
    def pt_grad(self, params, inpts, **kwargs):
        g = gzeros(params.shape)
        m, _ = inpts.shape

        hddn = logistic(
            gpu.dot(inpts, params[:self.m_end].reshape(self.shape)) +
            params[self.m_end:self.m_end + self.shape[1]])
        Z = gdot(hddn, params[:self.m_end].reshape(
            self.shape).T) + params[-self.shape[0]:]

        w = params[:self.m_end].reshape(self.shape)
        cae = gpu.sum(
            gpu.mean(Dsigmoid(hddn)**2, axis=0) * gpu.sum(w**2, axis=0))
        cae *= self.cae

        _, delta = self.score(Z, inpts, error=True, addon=cae)

        g[:self.m_end] = gdot(delta.T, hddn).ravel()
        g[-self.shape[0]:] = delta.sum(axis=0)

        cae_grad = gpu.mean(Dsigmoid(hddn)**2, axis=0) * w
        cae_grad += (gdot(inpts.T, (Dsigmoid(hddn)**2 * (1 - 2 * hddn))) / m *
                     gpu.sum(w**2, axis=0))
        g[:self.m_end] += self.cae * 2 * cae_grad.ravel()

        dsc_dha = Dsigmoid(hddn) * gdot(
            delta, params[:self.m_end].reshape(self.shape))

        g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()

        g[self.m_end:-self.shape[0]] = dsc_dha.sum(axis=0)
        # clean up
        del delta, hddn, Z
        return g
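Note: the `cae` term in Examples #1 and #2 is the contractive penalty, i.e. the squared Frobenius norm of the encoder Jacobian, scaled by the coefficient `self.cae`. A minimal NumPy sketch of the penalty itself, assuming a logistic encoder (the identity `Dsigmoid(h) = h * (1 - h)` is what makes the per-unit factorization work):

    import numpy as np

    def contractive_penalty(inpts, w, bias):
        # Squared Frobenius norm of the encoder Jacobian, averaged over
        # the minibatch -- a NumPy sketch of the `cae` term above.
        hddn = 1.0 / (1.0 + np.exp(-(inpts.dot(w) + bias)))  # logistic encoder
        d_hddn = hddn * (1.0 - hddn)                         # Dsigmoid
        # For logistic units the Jacobian row of hidden unit j is
        # Dsigmoid(h_j) * w[:, j], so the norm factorizes per unit.
        return np.sum(np.mean(d_hddn ** 2, axis=0) * np.sum(w ** 2, axis=0))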
Example #3
    def pt_grad(self, params, inpts, **kwargs):
        g = gzeros(params.shape)
        m, _ = inpts.shape

        hddn = logistic(gdot(inpts, params[:self.m_end].reshape(self.shape)) + params[self.m_end:self.size])
        Z = gdot(hddn, params[self.size:-self.shape[0]].reshape(self.Tshape)) + params[-self.shape[0]:]

        if self.rho_hat_grad is None:
            self.rho_hat_grad = hddn.mean(axis=0)
        else:
            self.rho_hat_grad *= 0.9
            self.rho_hat_grad += 0.1*hddn.mean(axis=0)

        rho_hat = self.rho_hat_grad
        rho = self.rho
        sparsity = self.beta * gpu.sum(bKL(rho, rho_hat))
 
        _, delta = self.score(Z, inpts, error=True, addon=sparsity)

        g[self.size:-self.shape[0]] = gdot(hddn.T, delta).ravel()
        g[-self.shape[0]:] = delta.sum(axis=0)

        diff = Dsigmoid(hddn)
        dsparse_dha = -rho/rho_hat + (1-rho)/(1-rho_hat)
        dsc_dha = diff * (gdot(delta, params[:self.m_end].reshape(self.shape)) + self.beta*dsparse_dha/m)

        g[:self.m_end] = gdot(inpts.T, dsc_dha).ravel()

        g[self.m_end:self.size] = dsc_dha.sum(axis=0)
        # clean up
        del delta, hddn, Z
        return g
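`bKL` is used in Examples #3 and #6 but not defined on this page; from its arguments and from the derivative `-rho/rho_hat + (1-rho)/(1-rho_hat)` taken below it, it is presumably the elementwise KL divergence between Bernoulli distributions. A sketch under that assumption:

    import numpy as np

    def bKL(rho, rho_hat):
        # Hypothetical reimplementation: elementwise KL(rho || rho_hat)
        # for Bernoulli distributions with means rho and rho_hat.
        return (rho * np.log(rho / rho_hat)
                + (1 - rho) * np.log((1 - rho) / (1 - rho_hat)))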
Example #4
    def pt_init(self,
                init_var=1e-2,
                init_bias=0.,
                rho=0.5,
                lmbd=0.,
                l2=0.,
                SI=15,
                **kwargs):
        """
        """
        # 2*self.shape[0]: precision parameters have size shape[0]
        pt_params = gzeros(self.m_end + self.shape[1] + 2 * self.shape[0])
        if init_var is None:
            pt_params[:self.m_end] = gpu.garray(
                init_SI(self.shape, sparsity=SI)).ravel()
        else:
            pt_params[:self.m_end] = init_var * gpu.randn(self.m_end)
        pt_params[self.m_end:-self.shape[0]] = init_bias
        pt_params[-self.shape[0]:] = 1.

        self.pt_score = self.reconstruction
        self.pt_grad = self.grad_cd1

        self.l2 = l2

        self.rho = rho
        self.lmbd = lmbd
        self.rho_hat = None

        return pt_params
Example #5
    def pt_init(self,
                H=bernoulli,
                V=bernoulli,
                init_var=1e-2,
                init_bias=0.,
                rho=0.5,
                lmbd=0.,
                l2=0.,
                **kwargs):
        pt_params = gzeros(self.m_end + self.shape[1] + self.shape[0])
        if init_var is None:
            init_heur = 4 * np.sqrt(6. / (self.shape[0] + self.shape[1]))
            pt_params[:self.m_end] = gpu.rand(self.m_end)
            pt_params[:self.m_end] *= 2
            pt_params[:self.m_end] -= 1
            pt_params[:self.m_end] *= init_heur
        else:
            pt_params[:self.m_end] = init_var * gpu.randn(self.m_end)
        pt_params[self.m_end:] = init_bias

        self.H = H
        self.V = V
        self.activ = match_table[H]

        self.pt_score = self.reconstruction
        self.pt_grad = self.grad_cd1

        self.l2 = l2

        self.rho = rho
        self.lmbd = lmbd
        self.rho_hat = None

        return pt_params
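The `init_var is None` branch above draws weights uniformly from [-r, r] with r = 4 * sqrt(6 / (fan_in + fan_out)), the Glorot-style heuristic with the factor 4 commonly recommended for logistic units. An equivalent NumPy sketch (the layer shape is a made-up example):

    import numpy as np

    fan_in, fan_out = 784, 512  # hypothetical layer shape
    r = 4 * np.sqrt(6. / (fan_in + fan_out))
    # Same distribution as the rand / *2 / -1 / *init_heur sequence above:
    weights = np.random.uniform(-r, r, size=fan_in * fan_out)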
Example #6
    def pt_grad(self, params, inpts, **kwargs):
        g = gzeros(params.shape)
        m, _ = inpts.shape

        hddn = logistic(gpu.dot(inpts, params[:self.m_end].reshape(self.shape)) + params[self.m_end:self.m_end+self.shape[1]])
        Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]

        if self.rho_hat_grad is None:
            self.rho_hat_grad = hddn.mean(axis=0)
        else:
            self.rho_hat_grad *= 0.9
            self.rho_hat_grad += 0.1*hddn.mean(axis=0)

        rho_hat = self.rho_hat_grad
        rho = self.rho
        sparsity = self.beta * gpu.sum(bKL(rho, rho_hat))

        _, delta = self.score(Z, inpts, error=True, addon=sparsity)

        g[:self.m_end] = gdot(delta.T, hddn).ravel()
        g[-self.shape[0]:] = delta.sum(axis=0)

        diff = Dsigmoid(hddn)
        dsparse_dha = -rho/rho_hat + (1-rho)/(1-rho_hat)
        dsc_dha = diff * (gdot(delta, params[:self.m_end].reshape(self.shape)) + self.beta*dsparse_dha/m)

        g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()

        g[self.m_end:-self.shape[0]] = dsc_dha.sum(axis=0)
        # clean up
        del delta, hddn, Z
        return g
Example #7
    def grad(self, params, inputs, targets, **kwargs):
        data = inputs
        for layer, (c1, c2) in izip(self.encoder,
                                    izip(self.enc[:-1], self.enc[1:])):
            data = layer.fprop(self.params[c1:c2], data)

        # possible spot for semisupervision?

        for layer, (c1, c2) in izip(self.decoder,
                                    izip(self.dec[:-1], self.dec[1:])):
            data = layer.fprop(self.params[c1:c2], data)

        _, delta = self._score(data, inputs, error=True)

        g = gzeros(self.psize)
        for layer, (c1, c2) in izip(self.decoder[::-1],
                                    izip(self.dec[-2::-1], self.dec[:0:-1])):
            delta = layer.bprop(params=params[c1:c2],
                                grad=g[c1:c2],
                                delta=delta)

        # in case: fuse in gradient from semisupervision

        for layer, (c1, c2) in izip(self.encoder[::-1],
                                    izip(self.enc[-2::-1], self.enc[:0:-1])):
            delta = layer.bprop(params=params[c1:c2],
                                grad=g[c1:c2],
                                delta=delta)
        return g
Example #8
    def pt_grad(self, params, inpts, **kwargs):
        g = gzeros(params.shape)

        hddn = self.activ(
            gpu.dot(inpts, params[: self.m_end].reshape(self.shape)) + params[self.m_end : self.m_end + self.shape[1]]
        )
        _hddn = hddn.as_numpy_array()
        idxs = np.argsort(_hddn, axis=1)
        _hddn[range(_hddn.shape[0]), idxs[:, self.ak :].T] = 0
        hddn = gpu.garray(_hddn)
        Z = gdot(hddn, params[: self.m_end].reshape(self.shape).T) + params[-self.shape[0] :]

        _, delta = self.score(Z, inpts, error=True)

        g[: self.m_end] = gdot(delta.T, hddn).ravel()
        g[-self.shape[0] :] = delta.sum(axis=0)

        dsc_dha = gdot(delta, params[: self.m_end].reshape(self.shape)) * diff_table[self.activ](hddn)

        g[: self.m_end] += gdot(inpts.T, dsc_dha).ravel()

        g[self.m_end : -self.shape[0]] = dsc_dha.sum(axis=0)
        # clean up
        del delta
        return g
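Example #8 is a k-sparse autoencoder gradient: after encoding, only the largest activations per row survive, so `delta` flows back only through the kept units. A NumPy sketch of that masking step; note the example indexes the ascending argsort with `idxs[:, self.ak:]`, so `ak` there is a cut point into the sorted order rather than the count of kept units assumed here:

    import numpy as np

    def ksparse_mask(hddn, k):
        # Keep the k largest activations per row, zero everything else
        # (sketch of the masking step in Example #8).
        idxs = np.argsort(hddn, axis=1)          # ascending, per row
        out = hddn.copy()
        rows = np.arange(hddn.shape[0])[:, None]
        out[rows, idxs[:, :-k]] = 0.             # zero all but the top k
        return out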
Example #9
File: rbm.py Project: osdf/gpustack
    def pt_init(self, H=bernoulli, V=bernoulli, init_var=1e-2, init_bias=0., 
            rho=0.5, lmbd=0., l2=0., **kwargs):
        pt_params = gzeros(self.m_end + self.shape[1] + self.shape[0])
        if init_var is None:
            init_heur = 4*np.sqrt(6./(self.shape[0]+self.shape[1]))
            pt_params[:self.m_end] = gpu.rand(self.m_end)
            pt_params[:self.m_end] *= 2
            pt_params[:self.m_end] -= 1
            pt_params[:self.m_end] *= init_heur
        else:
            pt_params[:self.m_end] = init_var * gpu.randn(self.m_end)
        pt_params[self.m_end:] = init_bias

        self.H = H
        self.V = V 
        self.activ = match_table[H]

        self.pt_score = self.reconstruction
        self.pt_grad = self.grad_cd1

        self.l2 = l2

        self.rho = rho
        self.lmbd = lmbd
        self.rho_hat = None

        return pt_params
Example #10
 def reload(self, _pt_params):
     """
     """
     if self.p is None:
         self.p = gzeros(self.size)
     pt_params = gpu.as_garray(_pt_params)
     self.prep_layer(pt_params)
     del pt_params
Example #12
    def grad_cd1(self, params, inputs, **kwargs):
        """
        """
        g = gzeros(params.shape)

        n, _ = inputs.shape

        m_end = self.m_end
        V = self.shape[0]
        H = self.shape[1]
        wm = params[:m_end].reshape(self.shape)
        prec = params[-V:][:, gpu.newaxis]

        h1, h_sampled = self.H(inputs, wm=prec*wm, bias=params[m_end:m_end+H], sampling=True)
        v2, v_sampled = gauss(h_sampled, wm=(wm/prec).T, bias=params[-(2*V):-V], prec=prec.T, sampling=True)
        h2, _ = self.H(v2, wm=prec*wm, bias=params[m_end:m_end+H])

        # Note the negative sign: the gradient is 
        # supposed to point in the 'wrong' direction.
        g[:m_end] = -gdot(inputs.T*prec, h1).ravel()
        g[:m_end] += gdot(v_sampled.T*prec, h2).ravel()
        g[:m_end] *= 1./n
        g[:m_end] += self.l2*params[:m_end]

        g[m_end:m_end+H] = -h1.sum(axis=0)
        g[m_end:m_end+H] += h2.sum(axis=0)
        g[m_end:m_end+H] *= 1./n

        g[-2*V:-V] = -inputs.sum(axis=0)
        g[-2*V:-V] += v_sampled.sum(axis=0)
        g[-2*V:-V] *= 1./n
        g[-2*V:-V] *= (prec**2).T

        # Gradient for square root of precision
        g[-V:] = -gsum(2*prec.T*inputs*(params[-2*V:-V] - inputs/2), axis=0) + gsum(gdot(inputs.T, h1)*wm, axis=1)
        g[-V:] += (gsum(2*prec.T*v_sampled*(params[-2*V:-V] - v_sampled/2), axis=0) + gsum(gdot(v_sampled.T, h2)*wm, axis=1))
        g[-V:] *= 1./n

        if self.lmbd > 0.:
            if self.rho_hat is None:
                self.rho_hat = h1.mean(axis=0)
            else:
                self.rho_hat *= 0.9
                self.rho_hat += 0.1 * h1.mean(axis=0)
            dKL_drho_hat = (self.rho - self.rho_hat)/(self.rho_hat*(1-self.rho_hat))
            h1_1mh1 = h1*(1 - h1)
            g[m_end:m_end+H] -= self.lmbd/n * gsum(h1_1mh1, axis=0) * dKL_drho_hat
            g[:m_end] -= self.lmbd/n * (gdot(inputs.T * prec, h1_1mh1) * dKL_drho_hat).ravel()

        return g
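The `gauss` helper called in Example #12 is not shown on this page. From its call site it samples Gaussian visible units whose mean is the usual linear activation and whose noise is scaled by the inverse precision; a sketch with the signature inferred from the call (an assumption, not the project's actual code):

    import numpy as np

    def gauss(h, wm, bias, prec, sampling=False):
        # Hypothetical Gaussian visible layer: linear mean activation,
        # optionally perturbed by unit noise divided by the precision.
        mean = h.dot(wm) + bias
        sample = mean + np.random.randn(*mean.shape) / prec if sampling else mean
        return mean, sample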
Example #13
 def pt_grad(self, params, inputs, targets, l2=0, **kwargs):
     g = gzeros(params.shape)
     Z = self.activ(gpu.dot(inputs, params[:self.m_end].reshape(self.shape)) + params[self.m_end:])
     _, delta = self.score(Z, targets, error=True)
     # necessary?
     delta = self.C * delta
     g[:self.m_end] = gdot(inputs.T, delta).ravel() + params[:self.m_end]
     g[self.m_end:] = delta.sum(axis=0)
     # clean up
     del delta
     return g
Example #14
    def grad(self, params, inputs, targets, **kwargs):
        data = inputs
        for layer, (c1, c2) in izip(self, izip(self.cuts[:-1], self.cuts[1:])):
            data = layer.fprop(self.params[c1:c2], data)

        _, delta = self._score(data, targets, error=True)

        g = gzeros(self.psize)
        for layer, (c1, c2) in izip(self[::-1], izip(self.cuts[-2::-1], self.cuts[:0:-1])):
            delta = layer.bprop(params=params[c1:c2], grad=g[c1:c2], delta=delta)
        return g
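Examples #7, #14 and #27 all address one flat parameter vector through a `cuts` array: `cuts[i]:cuts[i+1]` is layer i's slice. A toy NumPy sketch of that bookkeeping, assuming a stack of two dense layers with per-layer weights plus biases:

    import numpy as np

    layer_shapes = [(784, 512), (512, 10)]  # hypothetical stack
    cuts = np.cumsum([0] + [m * n + n for m, n in layer_shapes])
    params = np.zeros(cuts[-1])              # one flat parameter vector
    for (m, n), (c1, c2) in zip(layer_shapes, zip(cuts[:-1], cuts[1:])):
        w = params[c1:c2][:m * n].reshape(m, n)  # this layer's weights
        b = params[c1:c2][m * n:]                # this layer's biases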
Example #15
    def pt_init(self, score=None, init_var=1e-2, init_bias=0., l2=0., SI=15, **kwargs):
        pt_params = gzeros(self.m_end + self.shape[1] + self.shape[0])
        if init_var is None:
            pt_params[:self.m_end] = gpu.garray(init_SI(self.shape, sparsity=SI)).ravel()
        else:
            pt_params[:self.m_end] = init_var * gpu.randn(self.m_end)

        pt_params[self.m_end:] = init_bias
        self.score = score

        self.l2 = l2
        return pt_params
Example #16
    def pt_grad(self, params, inputs, targets, l2=0, **kwargs):
        g = gzeros(params.shape)
        
        Z = self.activ(gpu.dot(inputs, params[:self.m_end].reshape(self.shape)) + params[self.m_end:])
        _, delta = self.score(Z, targets, error=True)

        g[:self.m_end] = gdot(inputs.T, delta).ravel()
        
        g[self.m_end:] = delta.sum(axis=0)
        # clean up
        del delta
        return g
Example #17
    def pt_init(self, score=None, init_var=1e-2, init_bias=0., l2=0., SI=15, **kwargs):
        pt_params = gzeros(self.m_end + self.shape[0])
        if init_var is None:
            pt_params[:self.m_end] = gpu.garray(init_SI(self.shape, sparsity=SI)).ravel()
        else:
            pt_params[:self.m_end] = init_var * gpu.randn(self.m_end)

        pt_params[self.m_end:] = init_bias
        self.score = score

        self.l2 = l2
        return pt_params
Example #18
    def grad_cd1(self, params, inputs, **kwargs):
        """
        """
        g = gzeros(params.shape)

        n, _ = inputs.shape

        m_end = self.m_end
        V = self.shape[0]
        H = self.shape[1]
        wm = params[:m_end].reshape(self.shape)

        h1, h_sampled = self.H(inputs,
                               wm=wm,
                               bias=params[m_end:-V],
                               sampling=True)
        v2, _ = self.V(h_sampled, wm=wm.T, bias=params[-V:])
        h2, _ = self.H(v2, wm=wm, bias=params[m_end:-V])

        # Note the negative sign: the gradient is
        # supposed to point in the 'wrong' direction,
        # because the optimizer minimizes.
        g[:m_end] = -gdot(inputs.T, h1).ravel()
        g[:m_end] += gdot(v2.T, h2).ravel()
        g[:m_end] *= 1. / n
        g[:m_end] += self.l2 * params[:m_end]

        g[m_end:-V] = -h1.mean(axis=0)
        g[m_end:-V] += h2.mean(axis=0)

        g[-V:] = -inputs.mean(axis=0)
        g[-V:] += v2.mean(axis=0)

        if self.rho_hat is None:
            self.rho_hat = h1.mean(axis=0)
        else:
            self.rho_hat *= 0.9
            self.rho_hat += 0.1 * h1.mean(axis=0)
        dKL_drho_hat = (self.rho - self.rho_hat) / (self.rho_hat *
                                                    (1 - self.rho_hat))
        h1_1mh1 = h1 * (1 - h1)
        g[m_end:-V] -= self.lmbd / n * gsum(h1_1mh1, axis=0) * dKL_drho_hat
        g[:m_end] -= self.lmbd / n * (gdot(inputs.T, h1_1mh1) *
                                      dKL_drho_hat).ravel()

        return g
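The `self.H`/`self.V` callables (`bernoulli` above) are used throughout the CD-1 examples but defined elsewhere in the project. Judging from the call sites, they return the mean activation and, when `sampling=True`, a binary sample from it; a sketch under that assumption:

    import numpy as np

    def bernoulli(inputs, wm, bias, sampling=False):
        # Hypothetical bernoulli unit, signature inferred from the call
        # sites above: logistic mean plus an optional binary sample.
        mean = 1. / (1. + np.exp(-(inputs.dot(wm) + bias)))
        sample = (np.random.rand(*mean.shape) < mean) * 1. if sampling else mean
        return mean, sample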
Example #19
    def pt_grad(self, params, inpts, **kwargs):
        g = gzeros(params.shape)

        hddn = self.activ(gpu.dot(inpts, params[:self.m_end].reshape(self.shape)), self.theta)
        Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]

        _, delta = self.score(Z, inpts, error=True)

        g[:self.m_end] = gdot(delta.T, hddn).ravel()
        g[-self.shape[0]:] = delta.sum(axis=0)

        dsc_dha = gdot(delta, params[:self.m_end].reshape(self.shape)) * diff_table[self.activ](hddn)

        g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()

        # clean up
        del delta
        return g
Example #20
    def pt_init(self, init_var=1e-2, init_bias=0., avg_nxyf=0.1, avg_nfh=0.1, rho=0.5, lmbd=0., l2=0., **kwargs):
        """
        """
        pt_params = gzeros(self.size + self.shape[0][0] + self.shape[0][1])
        pt_params[:self._cum_xyh] = init_var * gpu.randn(self._cum_xyh)

        self.pt_score = self.reconstruction
        self.pt_grad = self.cd1_3way_grad

        self.avg_nxyf = avg_nxyf
        self.avg_nfh = avg_nfh

        self.l2 = l2
        self.rho = rho
        self.lmbd = lmbd
        self.rho_hat = None
        
        return pt_params
Example #21
    def pretrain(self, schedule):
        super(DAE, self).pretrain(schedule=schedule)

        p = self.params.as_numpy_array()

        pretrained = schedule["pretrained"]

        # How many parameters in the unrolled model?
        _dec = []
        _enc = [0]
        self.psize = 0
        for layer in self:
            _enc.append(layer.shape[0] * layer.shape[1] + layer.shape[1])
            _dec.append(layer.shape[0] * layer.shape[1] + layer.shape[0])
            self.psize += _enc[-1] + _dec[-1]
        self.enc = np.cumsum(_enc)
        _dec.append(0)
        _dec.reverse()
        self.dec = np.cumsum(_dec) + self.enc[-1]

        # Build up encoder and decoder
        self.encoder = []
        self.params = gzeros(self.psize)
        for layer, (c1, c2) in izip(self, izip(self.enc[:-1], self.enc[1:])):
            self.encoder.append(layer)
            self.params[c1:c2] = p[c1:c2]
            layer.p = self.params[c1:c2]
        self.decoder = []
        for layer, (c1, c2) in izip(self[-1::-1],
                                    izip(self.dec[:-1], self.dec[1:])):
            l = layer.transpose(self.params[c1:c2])
            if pretrained:
                l.p[:l.m_end] = layer.p[:layer.m_end].reshape(
                    layer.shape).T.ravel()
            self.decoder.append(l)

        # Fix missing activations of decoder
        for i, layer in enumerate(self[-2::-1]):
            self.decoder[i].activ = layer.activ
        self.decoder[-1].activ = idnty

        msg = {"msg": "DAE unrolled: %s" % self}
        munk.taggify(self.logging, "pretty").send(msg)
Example #22
    def pt_grad(self, params, noisy_inpts, targets, l2=0., **kwargs):
        g = gzeros(params.shape)

        hddn = self.activ(gpu.dot(noisy_inpts, params[:self.m_end].reshape(self.shape)) + params[self.m_end:self.m_end+self.shape[1]])
        Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]

        _, delta = self.score(Z, targets, error=True)

        g[:self.m_end] = gdot(delta.T, hddn).ravel()
        g[-self.shape[0]:] = delta.sum(axis=0)

        dsc_dha = gdot(delta, params[:self.m_end].reshape(self.shape)) * diff_table[self.activ](hddn)

        g[:self.m_end] += gdot(noisy_inpts.T, dsc_dha).ravel()

        g[self.m_end:-self.shape[0]] = dsc_dha.sum(axis=0)
        # clean up
        del delta
        return g
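Example #22 is a denoising-autoencoder gradient: it encodes `noisy_inpts` but scores the reconstruction `Z` against the clean `targets`. The corruption itself happens outside this method; a typical masking-noise sketch (an assumed corruption, not taken from this project):

    import numpy as np

    def mask_noise(inpts, p=0.25, rng=np.random):
        # Zero each input component independently with probability p,
        # a common corruption for denoising autoencoders.
        return inpts * (rng.rand(*inpts.shape) >= p)

    # e.g.: pt_grad(params, mask_noise(clean), targets=clean)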
Example #23
    def pt_grad(self, params, inpts, **kwargs):
        g = gzeros(params.shape)
        m, _ = inpts.shape

        hddn = self.activ(gdot(inpts, params[:self.m_end].reshape(self.shape)) + params[self.m_end:self.size])
        Z = gdot(hddn, params[self.size:-self.shape[0]].reshape(self.Tshape)) + params[-self.shape[0]:]

        _, delta = self.score(Z, inpts, error=True)

        g[self.size:-self.shape[0]] = gdot(hddn.T, delta).ravel()
        g[-self.shape[0]:] = delta.sum(axis=0)

        diff = diff_table[self.activ](hddn)
        dsc_dha = diff * gdot(delta, params[:self.m_end].reshape(self.shape))

        g[:self.m_end] = gdot(inpts.T, dsc_dha).ravel()
        g[self.m_end:self.size] = dsc_dha.sum(axis=0)
        # clean up
        del delta, hddn, Z
        return g
Example #24
File: rbm.py Project: osdf/gpustack
    def grad_cd1(self, params, inputs, **kwargs):
        """
        """
        g = gzeros(params.shape)

        n, _ = inputs.shape

        m_end = self.m_end
        V = self.shape[0]
        H = self.shape[1]
        wm = params[:m_end].reshape(self.shape)

        h1, h_sampled = self.H(inputs, wm=wm, bias=params[m_end:-V], sampling=True)
        v2, _ = self.V(h_sampled, wm=wm.T, bias=params[-V:])
        h2, _ = self.H(v2, wm=wm, bias=params[m_end:-V])

        # Note the negative sign: the gradient is 
        # supposed to point in the 'wrong' direction,
        # because the optimizer minimizes.
        g[:m_end] = -gdot(inputs.T, h1).ravel()
        g[:m_end] += gdot(v2.T, h2).ravel()
        g[:m_end] *= 1./n
        g[:m_end] += self.l2*params[:m_end]

        g[m_end:-V] = -h1.mean(axis=0)
        g[m_end:-V] += h2.mean(axis=0)

        g[-V:] = -inputs.mean(axis=0)
        g[-V:] += v2.mean(axis=0)

        if self.rho_hat is None:
            self.rho_hat = h1.mean(axis=0)
        else:
            self.rho_hat *= 0.9
            self.rho_hat += 0.1 * h1.mean(axis=0)
        dKL_drho_hat = (self.rho - self.rho_hat)/(self.rho_hat*(1-self.rho_hat))
        h1_1mh1 = h1*(1 - h1)
        g[m_end:-V] -= self.lmbd/n * gsum(h1_1mh1, axis=0) * dKL_drho_hat
        g[:m_end] -= self.lmbd/n * (gdot(inputs.T, h1_1mh1) * dKL_drho_hat).ravel()

        return g
Example #25
    def pretrain(self, schedule):
        super(DAE, self).pretrain(schedule=schedule)

        p = self.params.as_numpy_array()

        pretrained = schedule["pretrained"]

        # How many parameters in the unrolled model?
        _dec = []
        _enc = [0]
        self.psize = 0
        for layer in self:
            _enc.append(layer.shape[0] * layer.shape[1] + layer.shape[1])
            _dec.append(layer.shape[0] * layer.shape[1] + layer.shape[0])
            self.psize += _enc[-1] + _dec[-1]
        self.enc = np.cumsum(_enc)
        _dec.append(0)
        _dec.reverse()
        self.dec = np.cumsum(_dec) + self.enc[-1]

        # Build up encoder and decoder
        self.encoder = []
        self.params = gzeros(self.psize)
        for layer, (c1, c2) in izip(self, izip(self.enc[:-1], self.enc[1:])):
            self.encoder.append(layer)
            self.params[c1:c2] = p[c1:c2]
            layer.p = self.params[c1:c2]
        self.decoder = []
        for layer, (c1, c2) in izip(self[-1::-1], izip(self.dec[:-1], self.dec[1:])):
            l = layer.transpose(self.params[c1:c2])
            if pretrained:
                l.p[: l.m_end] = layer.p[: layer.m_end].reshape(layer.shape).T.ravel()
            self.decoder.append(l)

        # Fix missing activations of decoder
        for i, layer in enumerate(self[-2::-1]):
            self.decoder[i].activ = layer.activ
        self.decoder[-1].activ = idnty

        msg = {"msg": "DAE unrolled: %s" % self}
        munk.taggify(self.logging, "pretty").send(msg)
Example #26
 def pt_init(self, score=None, init_var=1e-2, init_bias=0., **kwargs):
     pt_params = gzeros(self.size + self.m_end + self.shape[0])
     if init_var is None:
         init_heur = 4*np.sqrt(6./(self.shape[0]+self.shape[1]))
         pt_params[:self.m_end] = gpu.rand(self.m_end)
         pt_params[:self.m_end] *= 2
         pt_params[:self.m_end] -= 1
         pt_params[:self.m_end] *= init_heur
         
         pt_params[self.size:-self.shape[0]] = gpu.rand(self.m_end)
         pt_params[self.size:-self.shape[0]] *= 2
         pt_params[self.size:-self.shape[0]] -= 1
         pt_params[self.size:-self.shape[0]] *= init_heur
     else: 
         pt_params[:self.m_end] = init_var * gpu.randn(self.m_end)
         pt_params[self.size:-self.shape[0]] = init_var * gpu.randn(self.m_end)
     
     pt_params[self.m_end:self.size] = init_bias
     pt_params[-self.shape[0]:] = init_bias
     self.score = score
     return pt_params
Example #27
    def grad(self, params, inputs, targets, **kwargs):
        data = inputs
        for layer, (c1, c2) in izip(self.encoder, izip(self.enc[:-1], self.enc[1:])):
            data = layer.fprop(self.params[c1:c2], data)

        # possible spot for semisupervision?

        for layer, (c1, c2) in izip(self.decoder, izip(self.dec[:-1], self.dec[1:])):
            data = layer.fprop(self.params[c1:c2], data)

        _, delta = self._score(data, inputs, error=True)

        g = gzeros(self.psize)
        for layer, (c1, c2) in izip(self.decoder[::-1], izip(self.dec[-2::-1], self.dec[:0:-1])):
            delta = layer.bprop(params=params[c1:c2], grad=g[c1:c2], delta=delta)

        # in case: fuse in gradient from semisupervision

        for layer, (c1, c2) in izip(self.encoder[::-1], izip(self.enc[-2::-1], self.enc[:0:-1])):
            delta = layer.bprop(params=params[c1:c2], grad=g[c1:c2], delta=delta)
        return g
Example #28
    def pt_grad(self, params, inpts, **kwargs):
        g = gzeros(params.shape)

        hddn = self.activ(
            gpu.dot(inpts, params[:self.m_end].reshape(self.shape)),
            self.theta)
        Z = gdot(hddn, params[:self.m_end].reshape(
            self.shape).T) + params[-self.shape[0]:]

        _, delta = self.score(Z, inpts, error=True)

        g[:self.m_end] = gdot(delta.T, hddn).ravel()
        g[-self.shape[0]:] = delta.sum(axis=0)

        dsc_dha = gdot(delta, params[:self.m_end].reshape(
            self.shape)) * diff_table[self.activ](hddn)

        g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()

        # clean up
        del delta
        return g
Example #29
    def pt_grad(self, params, inpts, **kwargs):
        g = gzeros(params.shape)

        hddn = self.activ(gpu.dot(inpts, params[:self.m_end].reshape(self.shape)) + params[self.m_end:self.m_end+self.shape[1]])
        _hddn = hddn.as_numpy_array()
        idxs = np.argsort(_hddn, axis=1)
        _hddn[range(_hddn.shape[0]), idxs[:, self.ak:].T] = 0
        hddn = gpu.garray(_hddn)
        Z = gdot(hddn, params[:self.m_end].reshape(self.shape).T) + params[-self.shape[0]:]

        _, delta = self.score(Z, inpts, error=True)

        g[:self.m_end] = gdot(delta.T, hddn).ravel()
        g[-self.shape[0]:] = delta.sum(axis=0)

        dsc_dha = gdot(delta, params[:self.m_end].reshape(self.shape)) * diff_table[self.activ](hddn)

        g[:self.m_end] += gdot(inpts.T, dsc_dha).ravel()

        g[self.m_end:-self.shape[0]] = dsc_dha.sum(axis=0)
        # clean up
        del delta
        return g
Example #30
    def pt_init(self, init_var=1e-2, init_bias=0., rho=0.5, lmbd=0., 
            l2=0., SI=15, **kwargs):
        """
        """
        # 2*self.shape[0]: precision parameters have size shape[0]
        pt_params = gzeros(self.m_end + self.shape[1] + 2*self.shape[0])
        if init_var is None:
            pt_params[:self.m_end] = gpu.garray(init_SI(self.shape, sparsity=SI)).ravel()
        else:
            pt_params[:self.m_end] = init_var * gpu.randn(self.m_end)
        pt_params[self.m_end:-self.shape[0]] = init_bias
        pt_params[-self.shape[0]:] = 1.

        self.pt_score = self.reconstruction
        self.pt_grad = self.grad_cd1

        self.l2 = l2

        self.rho = rho
        self.lmbd = lmbd
        self.rho_hat = None

        return pt_params
Example #31
 def __init__(self, ind, schedule):
     gpu.seed_rand(seed=None)
     self.logging = schedule["logging"]
     self.psize = 0
     cuts = [0]
     self.stack = schedule["stack"]
     for layer in self.stack:
         ltype = layer["type"]
         units = layer["units"]
         l = ltype.__new__(ltype)
         l.__init__(shape=(ind, units), **layer)
         self.psize += l.size
         self.append(l)
         cuts.append(l.size)
         ind = units
     self.params = gzeros(self.psize)
     self.cuts = np.cumsum(cuts)
     for layer, (c1, c2) in izip(self, izip(self.cuts[:-1], self.cuts[1:])):
         layer.p = self.params[c1:c2]
     if "score" in schedule:
         self._score = schedule["score"]
     else:
         print("You may have a problem: _score_ is NONE")
         self._score = None
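Example #31 configures the stack from a `schedule` dictionary. The exact schema is not shown here; a hypothetical schedule consistent with the keys the constructor reads (`logging`, `stack` with per-layer `type` and `units`, optional `score`), with stand-in placeholders for the project's own layer types and score functions:

    class DenseLayer(object):               # stand-in for a gpustack layer type
        def __init__(self, shape, **kwargs):
            self.shape = shape
            self.size = shape[0] * shape[1] + shape[1]

    schedule = {
        "logging": None,                    # a munk logging channel in practice
        "score": lambda Z, t, error: None,  # placeholder score function
        "stack": [
            {"type": DenseLayer, "units": 512},
            {"type": DenseLayer, "units": 128},
        ],
    }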
Example #32
    def grad_cd1(self, params, inputs, **kwargs):
        """
        """
        g = gzeros(params.shape)

        n, _ = inputs.shape

        m_end = self.m_end
        V = self.shape[0]
        H = self.shape[1]
        wm = params[:m_end].reshape(self.shape)
        prec = params[-V:][:, gpu.newaxis]

        h1, h_sampled = self.H(inputs,
                               wm=prec * wm,
                               bias=params[m_end:m_end + H],
                               sampling=True)
        v2, v_sampled = gauss(h_sampled,
                              wm=(wm / prec).T,
                              bias=params[-(2 * V):-V],
                              prec=prec.T,
                              sampling=True)
        h2, _ = self.H(v2, wm=prec * wm, bias=params[m_end:m_end + H])

        # Note the negative sign: the gradient is
        # supposed to point in the 'wrong' direction.
        g[:m_end] = -gdot(inputs.T * prec, h1).ravel()
        g[:m_end] += gdot(v_sampled.T * prec, h2).ravel()
        g[:m_end] *= 1. / n
        g[:m_end] += self.l2 * params[:m_end]

        g[m_end:m_end + H] = -h1.sum(axis=0)
        g[m_end:m_end + H] += h2.sum(axis=0)
        g[m_end:m_end + H] *= 1. / n

        g[-2 * V:-V] = -inputs.sum(axis=0)
        g[-2 * V:-V] += v_sampled.sum(axis=0)
        g[-2 * V:-V] *= 1. / n
        g[-2 * V:-V] *= (prec**2).T

        # Gradient for square root of precision
        g[-V:] = -gsum(2 * prec.T * inputs * (params[-2 * V:-V] - inputs / 2),
                       axis=0) + gsum(gdot(inputs.T, h1) * wm, axis=1)
        g[-V:] += (gsum(2 * prec.T * v_sampled *
                        (params[-2 * V:-V] - v_sampled / 2),
                        axis=0) + gsum(gdot(v_sampled.T, h2) * wm, axis=1))
        g[-V:] *= 1. / n

        if self.lmbd > 0.:
            if self.rho_hat is None:
                self.rho_hat = h1.mean(axis=0)
            else:
                self.rho_hat *= 0.9
                self.rho_hat += 0.1 * h1.mean(axis=0)
            dKL_drho_hat = (self.rho - self.rho_hat) / (self.rho_hat *
                                                        (1 - self.rho_hat))
            h1_1mh1 = h1 * (1 - h1)
            g[m_end:m_end +
              H] -= self.lmbd / n * gsum(h1_1mh1, axis=0) * dKL_drho_hat
            g[:m_end] -= self.lmbd / n * (gdot(inputs.T * prec, h1_1mh1) *
                                          dKL_drho_hat).ravel()

        return g
Example #33
    def cd1_3way_grad(self, params, inputs, **kwargs):
        """
        """
        g = gzeros(params.shape)
        x, y = inputs
        n, _ = x.shape


        weights_xf = params[:self.xf_sz].reshape(self.xfshape)
        weights_yf = params[self.xf_sz:self._cum_xy].reshape(self.yfshape)
        weights_fh = params[self._cum_xy:self._cum_xyh].reshape(self.fhshape)
        bias_h = params[self._cum_xyh:self.size]
        bias_x = params[self.size:-self.shape[0][1]]
        bias_y = params[-self.shape[0][1]:]

        # normalize weights
        sq_xf = weights_xf * weights_xf
        norm_xf = gpu.sqrt(sq_xf.sum(axis=0)) + SMALL
        sq_yf = weights_yf * weights_yf
        norm_yf = gpu.sqrt(sq_yf.sum(axis=0)) + SMALL
 
        norm_xyf = (norm_xf.mean() + norm_yf.mean())/2.
        self.avg_nxyf *= 0.95
        self.avg_nxyf += (0.05 * norm_xyf)
        weights_xf *= (self.avg_nxyf / norm_xf)
        weights_yf *= (self.avg_nxyf / norm_yf)

        sq_fh = weights_fh*weights_fh
        norm_fh = gpu.sqrt(sq_fh.sum(axis=1)) + SMALL
        self.avg_nfh *= 0.95
        self.avg_nfh += (0.05 * norm_fh.mean())
        weights_fh *= (self.avg_nfh / norm_fh[:, gpu.newaxis])
        # normalization done

        factors_x = gdot(x, weights_xf) 
        factors_y = gdot(y, weights_yf)
        factors = factors_x * factors_y

        h, h_sampled = bernoulli(factors, wm=weights_fh, bias=bias_h, sampling=True)
        factors_h = gdot(h_sampled, weights_fh.T)

        g[:self.xf_sz] = -gdot(x.T, factors_y*factors_h).ravel()
        g[self.xf_sz:self._cum_xy] = -gdot(y.T, factors_x*factors_h).ravel()
        g[self._cum_xy:self._cum_xyh] = -gdot(factors.T, h_sampled).ravel()
        g[self._cum_xyh:self.size] = -h.sum(axis=0)
        g[self.size:-self.shape[0][1]] = -x.sum(axis=0) 
        g[-self.shape[0][1]:] = -y.sum(axis=0)

        # 3way cd
        way = np.random.rand() > 0.5
        if way:
            # reconstruct y (output) first.
            tmp = factors_x * factors_h
            y1, _ = self.V(tmp, wm=weights_yf.T, bias=bias_y)
            factors_y[:] = gdot(y1, weights_yf)
            # then reconstruct x (input).
            tmp = factors_y * factors_h
            x1, _ = self.V(tmp, wm=weights_xf.T, bias=bias_x)
            factors_x[:] = gdot(x1, weights_xf)
        else:
            # reconstruct x (input) first.
            tmp = factors_y * factors_h
            x1, _ = self.V(tmp, wm=weights_xf.T, bias=bias_x)
            factors_x[:] = gdot(x1, weights_xf)
            # then reconstruct y (output).
            tmp = factors_x * factors_h
            y1, _ = self.V(tmp, wm=weights_yf.T, bias=bias_y)
            factors_y[:] = gdot(y1, weights_yf)

        factors[:] = factors_x * factors_y
        h1, _ = bernoulli(factors, wm=weights_fh, bias=bias_h)
        factors_h[:] = gdot(h1, weights_fh.T)

        g[:self.xf_sz] += gdot(x1.T, factors_y*factors_h).ravel()
        g[:self.xf_sz] *= 1./n

        g[self.xf_sz:self._cum_xy] += gdot(y1.T, factors_x*factors_h).ravel()
        g[self.xf_sz:self._cum_xy] *= 1./n

        g[self._cum_xy:self._cum_xyh] += gdot(factors.T, h1).ravel()
        g[self._cum_xy:self._cum_xyh] *= 1./n

        g[self._cum_xyh:self.size] += h1.sum(axis=0)
        g[self._cum_xyh:self.size] *= 1./n

        g[self.size:-self.shape[0][1]] += x1.sum(axis=0)
        g[self.size:-self.shape[0][1]] *= 1./n

        g[-self.shape[0][1]:] += y1.sum(axis=0)
        g[-self.shape[0][1]:] *= 1./n

        return g
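The normalization block near the top of Example #33 rescales every filter column toward an exponential running average of the column norms, which keeps the three factor matrices on a comparable scale during CD training. Isolated as a NumPy sketch:

    import numpy as np

    SMALL = 1e-8  # numerical floor, assumed to match the project's constant

    def normalize_columns(w, avg_norm, decay=0.95):
        # Pull each column of w toward a running-average norm
        # (sketch of the weight normalization in Example #33).
        norms = np.sqrt((w * w).sum(axis=0)) + SMALL
        avg_norm = decay * avg_norm + (1. - decay) * norms.mean()
        return w * (avg_norm / norms), avg_norm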