Example #1
0
    def __init__(self, get_optimizer, theano_warning='raise'):

        v = self.v
        w = self.w
        theanofunction = lazytheanofunc('warn', mode='FAST_RUN')
        theanofunction_silent = lazytheanofunc('ignore', mode='FAST_RUN')

        # Create theano expressions
        x, z = ndict.ordereddicts(self.variables())
        self.var_x, self.var_z, = x, z

        # Helper variables
        A = T.fmatrix('A')
        self.var_A = A

        # Get gradient symbols
        allvars = list(x.values()) + list(
            z.values()) + [A]  # note: '+' concatenates lists

        # TODO: more beautiful/standardized way of setting distributions
        # (should be even simpler than this)
        self.dist_qz = {}
        self.dist_px = {}
        self.dist_pz = {}

        logpx, logpz, logqz = self.factors(x, z, A)

        if get_optimizer is None:

            def get_optimizer(w, g):
                from collections import OrderedDict
                updates = OrderedDict()
                for i in w:
                    updates[w[i]] = w[i]
                return updates

        # Log-likelihood lower bound
        self.f_L = theanofunction(allvars, [logpx, logpz, logqz])
        L = (logpx + logpz - logqz).sum()
        g = T.grad(L, list(v.values()) + list(w.values()))
        gv, gw = dict(list(zip(list(v.keys()), g[0:len(v)]))), dict(
            list(zip(list(w.keys()), g[len(v):len(v) + len(w)])))
        updates = get_optimizer(v, gv)
        updates.update(get_optimizer(w, gw))

        #self.profmode = theano.ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker())
        #self.f_evalAndUpdate = theano.function(allvars, [logpx + logpz - logqz], updates=updates_w, mode=self.profmode)
        #theano.printing.debugprint(self.f_evalAndUpdate)

        self.f_eval = theanofunction(allvars, [logpx + logpz - logqz])
        self.f_evalAndUpdate = theanofunction(allvars, [logpx + logpz - logqz],
                                              updates=updates)
Example #2
0
    def __init__(self, get_optimizer, theano_warning="raise"):

        v = self.v
        w = self.w
        theanofunction = lazytheanofunc("warn", mode="FAST_RUN")
        theanofunction_silent = lazytheanofunc("ignore", mode="FAST_RUN")

        # Create theano expressions
        x, z = ndict.ordereddicts(self.variables())
        self.var_x, self.var_z, = x, z

        # Helper variables
        A = T.fmatrix("A")
        self.var_A = A

        # Get gradient symbols
        allvars = x.values() + z.values() + [A]  # note: '+' concatenates lists

        # TODO: more beautiful/standardized way of setting distributions
        # (should be even simpler than this)
        self.dist_qz = {}
        self.dist_px = {}
        self.dist_pz = {}

        logpx, logpz, logqz = self.factors(x, z, A)

        if get_optimizer is None:

            def get_optimizer(w, g):
                from collections import OrderedDict

                updates = OrderedDict()
                for i in w:
                    updates[w[i]] = w[i]
                return updates

        # Log-likelihood lower bound
        self.f_L = theanofunction(allvars, [logpx, logpz, logqz])
        L = (logpx + logpz - logqz).sum()
        g = T.grad(L, v.values() + w.values())
        gv, gw = dict(zip(v.keys(), g[0 : len(v)])), dict(zip(w.keys(), g[len(v) : len(v) + len(w)]))
        updates = get_optimizer(v, gv)
        updates.update(get_optimizer(w, gw))

        # self.profmode = theano.ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker())
        # self.f_evalAndUpdate = theano.function(allvars, [logpx + logpz - logqz], updates=updates_w, mode=self.profmode)
        # theano.printing.debugprint(self.f_evalAndUpdate)

        self.f_eval = theanofunction(allvars, [logpx + logpz - logqz])
        self.f_evalAndUpdate = theanofunction(allvars, [logpx + logpz - logqz], updates=updates)
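
This example is the same constructor as Example #1 in a Python 2 style (dict views are concatenated directly); the optimizer sketch given after Example #1 applies here unchanged.
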
Example #3
0
    def __init__(self, theano_warning='raise'):

        theanofunction = lazytheanofunc('warn', mode='FAST_RUN')
        theanofunction_silent = lazytheanofunc('ignore', mode='FAST_RUN')

        # Create theano expressions
        v, w, x, z = ndict.ordereddicts(self.variables())
        self.var_v, self.var_w, self.var_x, self.var_z, = v, w, x, z

        # Helper variables
        A = T.dmatrix('A')
        self.var_A = A

        # Get gradient symbols
        allvars = list(v.values()) + list(w.values()) + list(
            x.values()) + list(z.values()) + [A]  # note: '+' concatenates lists

        # TODO: more beautiful/standardized way of setting distributions
        # (should be even simpler than this)
        self.dist_qz = {}
        self.dist_px = {}
        self.dist_pz = {}

        logpv, logpw, logpx, logpz, logqz = self.factors(v, w, x, z, A)

        # Log-likelihood lower bound
        self.f_L = theanofunction(allvars, [logpx, logpz, logqz])
        L = (logpx + logpz - logqz).sum()
        dL_dw = T.grad(L, list(v.values()) + list(w.values()))
        self.f_dL_dw = theanofunction(allvars, [logpx, logpz, logqz] + dL_dw)

        weights = T.dmatrix()
        dL_weighted_dw = T.grad((weights * (logpx + logpz - logqz)).sum(),
                                list(v.values()) + list(w.values()))
        self.f_dL_weighted_dw = theanofunction(
            allvars + [weights],
            [logpx + logpz - logqz, weights *
             (logpx + logpz - logqz)] + dL_weighted_dw)

        # prior
        dlogpw_dw = T.grad(logpv + logpw,
                           list(v.values()) + list(w.values()),
                           disconnected_inputs='ignore')
        self.f_logpw = theanofunction(
            list(v.values()) + list(w.values()), [logpv, logpw])
        self.f_dlogpw_dw = theanofunction(
            list(v.values()) + list(w.values()), [logpv, logpw] + dlogpw_dw)
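
f_dL_dw returns [logpx, logpz, logqz] followed by one gradient array per parameter, in the order list(v.values()) + list(w.values()). A small helper (names illustrative) for turning that flat output list back into per-parameter dictionaries:

    def unpack_dL_dw(outputs, v_keys, w_keys):
        # outputs: list returned by f_dL_dw(...):
        #   [logpx, logpz, logqz, grad_v_1, ..., grad_v_k, grad_w_1, ..., grad_w_m]
        logpx, logpz, logqz = outputs[:3]
        grads = outputs[3:]
        gv = dict(zip(v_keys, grads[:len(v_keys)]))
        gw = dict(zip(w_keys, grads[len(v_keys):len(v_keys) + len(w_keys)]))
        return logpx, logpz, logqz, gv, gw
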
Example #4
0
 def factors(self, w, x, z, A):
     # Define logp(w)
     logpw = 0
     for i in range(len(self.n_units)-1):
         logpw += ap.logpdfs.normal(w['w'+str(i)], 0, self.prior_sd).sum()
         logpw += ap.logpdfs.normal(w['b'+str(i)], 0, self.prior_sd).sum()
     
     if self.nonlinearity == 'tanh':
         f = T.tanh
     elif self.nonlinearity == 'sigmoid':
         f = T.nnet.sigmoid
     elif self.nonlinearity == 'softplus':
         f = T.nnet.softplus
     else:
         raise Exception("Unknown nonlinarity "+self.nonlinearity)
     
     # Define logp(x)
     hiddens  = [T.dot(w['w0'], x['x']) + T.dot(w['b0'], A)]
     for i in range(1, len(self.n_units)-1):
         hiddens.append(T.dot(w['w'+str(i)], f(hiddens[-1])) + T.dot(w['b'+str(i)], A))
     
     self.p = T.nnet.softmax(hiddens[-1].T).T
     self.entropy = T.nnet.categorical_crossentropy(self.p.T, self.p.T).T
     
     logpx = (- T.nnet.categorical_crossentropy(self.p.T, x['y'].T).T).reshape((1,-1))
     
     # function for distribution q(z|x)
     theanofunc = lazytheanofunc('ignore', mode='FAST_RUN')
     self.dist_px['y'] = theanofunc([x['x']] + w.values() + [A], self.p)
     
     logpz = 0 * A
     return logpw, logpx, logpz
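
In this classifier variant, logpx is minus the categorical cross-entropy against the 1-of-K coded targets x['y'], i.e. the log-probability assigned to the observed class. A quick numpy check of that identity (values are illustrative):

    import numpy as np

    p = np.array([0.7, 0.2, 0.1])    # softmax output for one sample
    y = np.array([1.0, 0.0, 0.0])    # 1-of-K coded target
    cross_entropy = -(y * np.log(p)).sum()
    logpx = -cross_entropy           # equals np.log(0.7), the log-prob of the true class
    assert np.isclose(logpx, np.log(0.7))
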
Example #5
0
 def factors(self, w, x, z, A):
     # Define logp(w)
     logpw = 0
     for i in range(len(self.n_units)-1):
         logpw += ap.logpdfs.normal(w['w'+str(i)], 0, self.prior_sd).sum()
         logpw += ap.logpdfs.normal(w['b'+str(i)], 0, self.prior_sd).sum()
     
     if self.nonlinearity == 'tanh':
         f = T.tanh
     elif self.nonlinearity == 'sigmoid':
         f = T.nnet.sigmoid
     elif self.nonlinearity == 'softplus':
         f = T.nnet.softplus
     else:
         raise Exception("Unknown nonlinarity "+self.nonlinearity)
     
     # Define logp(x)
     hiddens  = [T.dot(w['w0'], x['x']) + T.dot(w['b0'], A)]
     for i in range(1, len(self.n_units)-1):
         hiddens.append(T.dot(w['w'+str(i)], f(hiddens[-1])) + T.dot(w['b'+str(i)], A))
     
     self.p = T.nnet.softmax(hiddens[-1].T).T
     self.entropy = T.nnet.categorical_crossentropy(self.p.T, self.p.T).T
     
     logpx = (- T.nnet.categorical_crossentropy(self.p.T, x['y'].T).T).reshape((1,-1))
     
     # function for distribution q(z|x)
     theanofunc = lazytheanofunc('ignore', mode='FAST_RUN')
     self.dist_px['y'] = theanofunc([x['x']] + list(w.values()) + [A], self.p)
     
     logpz = 0 * A
     return logpw, logpx, logpz
Example #6
0
 def __init__(self, theano_warning='raise'):
     
     theanofunction = lazytheanofunc('warn', mode='FAST_RUN')
     theanofunction_silent = lazytheanofunc('ignore', mode='FAST_RUN')
     
     # Create theano expressions
     v, w, x, z = ndict.ordereddicts(self.variables())
     self.var_v, self.var_w, self.var_x, self.var_z, = v, w, x, z
     
     # Helper variables
     A = T.dmatrix('A')
     self.var_A = A
     
     # Get gradient symbols
     allvars = v.values() + w.values() + x.values() + z.values() + [A] # note: '+' concatenates lists
     
     # TODO: more beautiful/standardized way of setting distributions
     # (should be even simpler than this) 
     self.dist_qz = {}
     self.dist_px = {}
     self.dist_pz = {}
     
     logpv, logpw, logpx, logpz, logqz = self.factors(v, w, x, z, A)
     
     # Log-likelihood lower bound
     self.f_L = theanofunction(allvars, [logpx, logpz, logqz])
     L = (logpx + logpz - logqz).sum()
     dL_dw = T.grad(L, v.values() + w.values())
     self.f_dL_dw = theanofunction(allvars, [logpx, logpz, logqz] + dL_dw)
     
     weights = T.dmatrix()
     dL_weighted_dw = T.grad((weights * (logpx + logpz - logqz)).sum(), v.values() + w.values())
     self.f_dL_weighted_dw = theanofunction(allvars + [weights], [logpx + logpz - logqz, weights*(logpx + logpz - logqz)] + dL_weighted_dw)
     
     # prior
     dlogpw_dw = T.grad(logpv + logpw, v.values() + w.values(), disconnected_inputs='ignore')
     self.f_logpw = theanofunction(v.values() + w.values(), [logpv, logpw])
     self.f_dlogpw_dw = theanofunction(v.values() + w.values(), [logpv, logpw] + dlogpw_dw)  # Python 2: dict views concatenate with '+'
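
f_dL_weighted_dw differentiates a per-sample weighted bound, (weights * (logpx + logpz - logqz)).sum(). The weighting itself is an element-wise product over the row vector of per-sample bounds; a numpy sketch of the weighted objective with made-up numbers:

    import numpy as np

    per_sample_bound = np.array([[-120.3, -98.7, -110.1]])  # logpx + logpz - logqz, one column per sample
    weights = np.array([[0.2, 0.5, 0.3]])                    # e.g. importance weights
    weighted_objective = (weights * per_sample_bound).sum()  # the scalar that T.grad differentiates above
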
Example #7
0
    def factors(self, v, w, x, z, A):
        '''
        z['eps'] are the independent epsilons (Gaussian with unit variance)
        x['x'] is the data
        x['y'] is categorical data (1-of-K coded)

        The names of dict z[...] may be confusing here: the latent variable z is not included in the dict z[...],
        but implicitly computed from epsilon and parameters in w.

        z is computed with g(.) from eps and variational parameters
        let logpx be the generative model density: log p(y) + log p(x|y,z) where z=g(.)
        let logpz be the prior of Z plus the entropy of q(z|x,y): logp(z) + H_q(z|x)
        So the lower bound L(x) = logpx + logpz
        
        let logpv and logpw be the (prior) density of the parameters
        '''
        
        def f_softplus(x): return T.log(T.exp(x) + 1)# - np.log(2)
        def f_rectlin(x): return x*(x>0)
        def f_rectlin2(x): return x*(x>0) + 0.01 * x
        nonlinear = {'tanh': T.tanh, 'sigmoid': T.nnet.sigmoid, 'softplus': f_softplus, 'rectlin': f_rectlin, 'rectlin2': f_rectlin2}
        nonlinear_q = nonlinear[self.nonlinear_q]
        nonlinear_p = nonlinear[self.nonlinear_p]
        
        # Compute q(z|x)
        hidden_q = [nonlinear_q(T.dot(v['w0'], x['x']) + T.dot(v['b0'], A))]
        for i in range(1, len(self.n_hidden_q)):
            hidden_q.append(nonlinear_q(T.dot(v['w'+str(i)], hidden_q[-1]) + T.dot(v['b'+str(i)], A)))
        
        q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
        if self.type_qz == 'gaussian' or self.type_qz == 'gaussianmarg':
            q_logvar = T.dot(v['logvar_w'], hidden_q[-1]) + T.dot(v['logvar_b'], A)
        else: raise Exception()
        
        # function for distribution q(z|x)
        theanofunc = lazytheanofunc('ignore', mode='FAST_RUN')
        self.dist_qz['z'] = theanofunc([x['x']] + v.values() + [A], [q_mean, q_logvar])
        
        # Compute virtual sample
        _z = q_mean + T.exp(0.5 * q_logvar) * z['eps']
        
        # Compute log p(x|z)
        hidden_p = [nonlinear_p(T.dot(w['w0'], _z) + T.dot(w['b0'], A))]
        for i in range(1, len(self.n_hidden_p)):
            hidden_p.append(nonlinear_p(T.dot(w['w'+str(i)], hidden_p[-1]) + T.dot(w['b'+str(i)], A)))
        
        if self.type_px == 'bernoulli':
            px = T.nnet.sigmoid(T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
            _logpx = - T.nnet.binary_crossentropy(px, x['x'])
            self.dist_px['x'] = theanofunc([_z] + w.values() + [A], px)
        elif self.type_px == 'gaussian' or self.type_px == 'sigmoidgaussian':
            x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
            if self.type_px == 'sigmoidgaussian':
                x_mean = T.nnet.sigmoid(x_mean)
            x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(w['out_logvar_b'], A)
            _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
            self.dist_px['x'] = theanofunc([_z] + w.values() + [A], [x_mean, x_logvar])
        else: raise Exception("")
        
        # Note: logpx is a row vector (one element per sample)
        logpx = T.dot(np.ones((1, self.n_x)), _logpx) # logpx = logp(x|z,w) + logp(y|x,w)
        
        # log p(y|x)
        _logpy = T.dot(w['out_w_y'], hidden_q[-1]) + T.dot(w['out_b_y'], A)
        py = T.nnet.softmax(_logpy.T).T
        logpy = (- T.nnet.categorical_crossentropy(py.T, x['y'].T).T).reshape((1,-1))
        self.dist_px['y'] = theanofunc([x['x']] + v.values() + w.values() + [A], py)
        
        # log p(z) (prior of z)
        if self.type_pz == 'gaussianmarg':
            logpz = -0.5 * (np.log(2 * np.pi) + (q_mean**2 + T.exp(q_logvar))).sum(axis=0, keepdims=True)
        elif self.type_pz == 'gaussian':
            logpz = ap.logpdfs.standard_normal(_z).sum(axis=0, keepdims=True)
        elif self.type_pz == 'laplace':
            logpz = ap.logpdfs.standard_laplace(_z).sum(axis=0, keepdims=True)
        elif self.type_pz == 'studentt':
            logpz = ap.logpdfs.studentt(_z, T.dot(T.exp(w['logv']), A)).sum(axis=0, keepdims=True)
        else:
            raise Exception("Unknown type_pz")
        
        # log q(z|x) (entropy of z)
        if self.type_qz == 'gaussianmarg':
            logqz = - 0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(axis=0, keepdims=True)
        elif self.type_qz == 'gaussian':
            logqz = ap.logpdfs.normal2(_z, q_mean, q_logvar).sum(axis=0, keepdims=True)
        else: raise Exception()
        
        # Note: logpv and logpw are scalars
        def f_prior(_w, prior_sd=self.prior_sd):
            return ap.logpdfs.normal(_w, 0, prior_sd).sum()
        logpv = 0
        logpv += f_prior(v['w0'])
        for i in range(1, len(self.n_hidden_q)):
            logpv += f_prior(v['w'+str(i)])
        logpv += f_prior(v['mean_w'])
        if self.type_qz in ['gaussian','gaussianmarg']:
            logpv += f_prior(v['logvar_w'])
        
        logpw = 0
        logpw += f_prior(w['w0'])
        for i in range(1, len(self.n_hidden_p)):
            logpw += f_prior(w['w'+str(i)])
        logpw += f_prior(w['out_w'])
        logpw += f_prior(w['out_w_y'])
        if self.type_px in ['sigmoidgaussian', 'gaussian']:
            logpw += f_prior(w['out_logvar_w'])
        if self.type_pz == 'studentt':
            logpw += f_prior(w['logv'])
        
        # Custom grad, for when only 'x' is given (and not 'y')
        theanofunction = lazytheanofunc('warn', mode='FAST_RUN')
        dL2_dw = T.grad((logpx + logpz - logqz).sum(), v.values() + w.values(), disconnected_inputs = 'ignore')
        allvars = self.var_v.values() + self.var_w.values() + [x['x']] + z.values() + [A]
        self.f_dL2_dw = theanofunction(allvars, [logpx, logpz, logqz] + dL2_dw)
        
        return logpv, logpw, logpx+logpy, logpz, logqz
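
When both type_pz and type_qz are 'gaussianmarg', logpz - logqz reduces to the closed-form negative KL divergence between the diagonal Gaussian q(z|x) and the standard-normal prior, -KL = 0.5 * sum(1 + logvar - mean^2 - exp(logvar)). A numpy check of that identity with arbitrary values:

    import numpy as np

    q_mean = np.array([[0.3], [-1.2]])      # shape (n_z, n_batch), one sample here
    q_logvar = np.array([[0.1], [-0.5]])

    logpz = -0.5 * (np.log(2 * np.pi) + (q_mean**2 + np.exp(q_logvar))).sum(axis=0, keepdims=True)
    logqz = -0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(axis=0, keepdims=True)
    neg_kl = 0.5 * (1 + q_logvar - q_mean**2 - np.exp(q_logvar)).sum(axis=0, keepdims=True)
    assert np.allclose(logpz - logqz, neg_kl)
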
Example #8
0
    def factors(self, x, z, A):
        
        v = self.v
        w = self.w
        
        '''
        z is unused
        x['x'] is the data
        
        The names of dict z[...] may be confusing here: the latent variable z is not included in the dict z[...],
        but implicitly computed from epsilon and parameters in w.

        z is computed with g(.) from eps and variational parameters
        let logpx be the generative model density: log p(x|z) where z=g(.)
        let logpz be the prior of Z plus the entropy of q(z|x): logp(z) + H_q(z|x)
        So the lower bound L(x) = logpx + logpz
        
        let logpv and logpw be the (prior) density of the parameters
        '''
        
        def f_softplus(x): return T.log(T.exp(x) + 1)# - np.log(2)
        def f_rectlin(x): return x*(x>0)
        def f_rectlin2(x): return x*(x>0) + 0.01 * x
        nonlinear = {'tanh': T.tanh, 'sigmoid': T.nnet.sigmoid, 'softplus': f_softplus, 'rectlin': f_rectlin, 'rectlin2': f_rectlin2}
        nonlinear_q = nonlinear[self.nonlinear_q]
        nonlinear_p = nonlinear[self.nonlinear_p]
        
        #rng = rng_curand.CURAND_RandomStreams(0)
        import theano.tensor.shared_randomstreams
        rng = theano.tensor.shared_randomstreams.RandomStreams(0)
        
        # Compute q(z|x,y)
        hidden_q = [nonlinear_q(T.dot(v['w0x'], x['x']) + T.dot(v['w0y'], x['y']) + T.dot(v['b0'], A))]
        for i in range(1, len(self.n_hidden_q)):
            hidden_q.append(nonlinear_q(T.dot(v['w'+str(i)], hidden_q[-1]) + T.dot(v['b'+str(i)], A)))
        
        q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
        if self.type_qz == 'gaussian' or self.type_qz == 'gaussianmarg':
            q_logvar = T.dot(v['logvar_w'], hidden_q[-1]) + T.dot(v['logvar_b'], A)
        else: raise Exception()
        
        # function for distribution q(z|x)
        theanofunc = lazytheanofunc('warn', mode='FAST_RUN')
        self.dist_qz['z'] = theanofunc([x['x'], x['y']] + [A], [q_mean, q_logvar])
        
        # Compute virtual sample
        eps = rng.normal(size=q_mean.shape, dtype='float32')
        _z = q_mean + T.exp(0.5 * q_logvar) * eps
        
        # Compute log p(x|z)
        hidden_p = [nonlinear_p(T.dot(w['w0y'], x['y']) + T.dot(w['w0z'], _z) + T.dot(w['b0'], A))]
        for i in range(1, len(self.n_hidden_p)):
            hidden_p.append(nonlinear_p(T.dot(w['w'+str(i)], hidden_p[-1]) + T.dot(w['b'+str(i)], A)))
            if self.dropout:
                hidden_p[-1] *= 2. * (rng.uniform(size=hidden_p[-1].shape, dtype='float32') > .5)
        
        if self.type_px == 'bernoulli':
            p = T.nnet.sigmoid(T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
            _logpx = - T.nnet.binary_crossentropy(p, x['x'])
            self.dist_px['x'] = theanofunc([x['y'], _z] + [A], p)
        elif self.type_px == 'gaussian':
            x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
            x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(w['out_logvar_b'], A)
            _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
            self.dist_px['x'] = theanofunc([x['y'], _z] + [A], [x_mean, x_logvar])
        elif self.type_px == 'laplace':
            x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
            x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(w['out_logvar_b'], A)
            _logpx = ap.logpdfs.laplace(x['x'], x_mean, x_logvar)
            self.dist_px['x'] = theanofunc([x['y'], _z] + [A], [x_mean, x_logvar])
            
        else: raise Exception("")
            
        # Note: logpx is a row vector (one element per sample)
        logpx = T.dot(shared32(np.ones((1, self.n_x))), _logpx) # logpx = log p(x|z,w)
        
        # log p(y) (prior of y)
        #_logpy = w['logpy']
        #if self.uniform_y: _logpy *= 0
        #py_model = T.nnet.softmax(T.dot(_logpy, A).T).T
        #logpy = (- T.nnet.categorical_crossentropy(py_model.T, x['y'].T).T).reshape((1,-1))
        #logpx += logpy
        #self.dist_px['y'] = theanofunc([A], py_model)

        # log p(z) (prior of z)
        if self.type_pz == 'gaussianmarg':
            logpz = -0.5 * (np.log(2 * np.pi) + (q_mean**2 + T.exp(q_logvar))).sum(axis=0, keepdims=True)
        elif self.type_pz == 'gaussian':
            logpz = ap.logpdfs.standard_normal(_z).sum(axis=0, keepdims=True)
        elif self.type_pz == 'mog':
            pz = 0
            for i in range(self.n_mixture):
                pz += T.exp(ap.logpdfs.normal2(_z, T.dot(w['mog_mean'+str(i)], A), T.dot(w['mog_logvar'+str(i)], A)))
            logpz = T.log(pz).sum(axis=0, keepdims=True) - self.n_z * np.log(float(self.n_mixture))
        elif self.type_pz == 'laplace':
            logpz = ap.logpdfs.standard_laplace(_z).sum(axis=0, keepdims=True)
        elif self.type_pz == 'studentt':
            logpz = ap.logpdfs.studentt(_z, T.dot(T.exp(w['logv']), A)).sum(axis=0, keepdims=True)
        else:
            raise Exception("Unknown type_pz")
        
        # log q(z|x) (entropy of z)
        if self.type_qz == 'gaussianmarg':
            logqz = - 0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(axis=0, keepdims=True)
        elif self.type_qz == 'gaussian':
            logqz = ap.logpdfs.normal2(_z, q_mean, q_logvar).sum(axis=0, keepdims=True)
        else: raise Exception()
                        
        # Note: logpv and logpw are scalars
        def f_prior(_w, prior_sd=self.prior_sd):
            return ap.logpdfs.normal(_w, 0, prior_sd).sum()
        
        logpv = 0
        logpv += f_prior(v['w0x'])
        logpv += f_prior(v['w0y'])
        for i in range(1, len(self.n_hidden_q)):
            logpv += f_prior(v['w'+str(i)])
        logpv += f_prior(v['mean_w'])
        if self.type_qz in ['gaussian','gaussianmarg']:
            logpv += f_prior(v['logvar_w'])
        
        logpw = 0
        logpw += f_prior(w['w0y'])
        logpw += f_prior(w['w0z'])
        for i in range(1, len(self.n_hidden_p)):
            logpw += f_prior(w['w'+str(i)])
        logpw += f_prior(w['out_w'])
        if self.type_px in ['sigmoidgaussian', 'gaussian','laplace']:
            logpw += f_prior(w['out_logvar_w'])
        if self.type_pz == 'studentt':
            logpw += f_prior(w['logv'])
            
        #return logpv, logpw, logpx, logpz, logqz
        return logpx, logpz, logqz
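
The dropout lines above use the "scale at train time" convention: each hidden unit is multiplied by 2. * (uniform > .5), i.e. dropped with probability 0.5 and doubled when kept, so the expected activation is unchanged and no rescaling is needed at test time. The same mask in numpy (shape and seed are arbitrary):

    import numpy as np

    rng = np.random.RandomState(0)
    hidden = rng.randn(4, 3).astype('float32')            # (n_units, n_batch), illustrative values
    mask = 2.0 * (rng.uniform(size=hidden.shape) > 0.5)   # keep with prob 0.5, scale kept units by 2
    hidden_dropped = hidden * mask                         # E[hidden_dropped] == hidden
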
Example #9
0
    def __init__(self, theano_warning='raise', hessian=True):

        theanofunction = lazytheanofunc('warn', mode='FAST_RUN')
        theanofunction_silent = lazytheanofunc('ignore', mode='FAST_RUN')

        # Create theano expressions
        w, x, z = ndict.ordereddicts(self.variables())
        self.var_w, self.var_x, self.var_z, = w, x, z

        # Helper variables
        A = T.dmatrix('A')
        self.var_A = A

        # Get gradient symbols
        self.allvars = list(w.values()) + list(x.values()) + list(
            z.values()) + [A]  # note: '+' concatenates lists
        self.allvars_keys = list(w.keys()) + list(x.keys()) + list(
            z.keys()) + ['A']

        if False:
            # Put test values
            # needs fully implemented gen_xz(), which is not always the case
            # Also, the FD has no test values
            theano.config.compute_test_value = 'raise'
            _w = self.init_w()
            for i in _w:
                w[i].tag.test_value = _w[i]
            _x, _z, _ = self.gen_xz(_w, {}, {}, 10)
            _x, _z = self.xz_to_theano(_x, _z)
            for i in _x:
                x[i].tag.test_value = _x[i]
            for i in _z:
                z[i].tag.test_value = _z[i]

        # TODO: more beautiful/standardized way of setting distributions
        # (should be even simpler than this)
        self.dist_px = {}
        self.dist_pz = {}

        logpw, logpx, logpz = self.factors(w, x, z, A)
        self.var_logpw, self.var_logpx, self.var_logpz = logpw, logpx, logpz

        # Complete-data likelihood estimate
        logpxz = logpx.sum() + logpz.sum()
        self.f_logpxz = theanofunction(self.allvars, [logpx, logpz])

        dlogpxz_dwz = T.grad(logpxz, list(w.values()) + list(z.values()))
        self.f_dlogpxz_dwz = theanofunction(self.allvars,
                                            [logpx, logpz] + dlogpxz_dwz)
        #self.f_dlogpxz_dw = theanofunction(allvars, [logpxz] + dlogpxz_dw)
        #self.f_dlogpxz_dz = theanofunction(allvars, [logpxz] + dlogpxz_dz)

        # prior
        dlogpw_dw = T.grad(logpw,
                           list(w.values()),
                           disconnected_inputs='ignore')
        self.f_logpw = theanofunction(list(w.values()), logpw)
        self.f_dlogpw_dw = theanofunction(list(w.values()),
                                          [logpw] + dlogpw_dw)

        if False:
            # MC-LIKELIHOOD
            logpx_max = logpx.max()
            logpxmc = T.log(T.exp(logpx - logpx_max).mean()) + logpx_max
            self.f_logpxmc = theanofunction(self.allvars, logpxmc)
            dlogpxmc_dw = T.grad(logpxmc,
                                 list(w.values()),
                                 disconnected_inputs=theano_warning)
            self.f_dlogpxmc_dw = theanofunction(self.allvars,
                                                [logpxmc] + dlogpxmc_dw)

        if True and len(z) > 0:
            # Fisher divergence (FD)
            gz = T.grad(logpxz, list(z.values()))
            gz2 = [T.dmatrix() for _ in gz]
            fd = 0
            for i in range(len(gz)):
                fd += T.sum((gz[i] - gz2[i])**2)
            dfd_dw = T.grad(fd, list(w.values()))
            self.f_dfd_dw = theanofunction(self.allvars + gz2,
                                           [logpx, logpz, fd] + dfd_dw)

        if False and hessian:
            # Hessian of logpxz wrt z (works best with n_batch=1)
            hessian_z = theano.gradient.hessian(logpxz, z_concat)
            self.f_hessian_z = theanofunction(self.allvars, hessian_z)
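
The (disabled) MC-likelihood block estimates log p(x) from per-sample log-weights with the usual log-sum-exp shift: subtracting logpx_max before exponentiating keeps the mean numerically stable, and adding it back afterwards leaves the result unchanged. The same computation in numpy (values are illustrative):

    import numpy as np

    logpx = np.array([-1050.0, -1052.3, -1049.1])   # per-sample log-weights
    logpx_max = logpx.max()
    logpxmc = np.log(np.exp(logpx - logpx_max).mean()) + logpx_max
    # the naive np.log(np.exp(logpx).mean()) would underflow to -inf here
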
Example #10
0
 def __init__(self, theano_warning='raise', hessian=True):
     
     theanofunction = lazytheanofunc('warn', mode='FAST_RUN')
     theanofunction_silent = lazytheanofunc('ignore', mode='FAST_RUN')
     
     # Create theano expressions
     w, x, z = ndict.ordereddicts(self.variables())
     self.var_w, self.var_x, self.var_z, = w, x, z
     
     # Helper variables
     A = T.dmatrix('A')
     self.var_A = A
     
     # Get gradient symbols
     self.allvars = w.values()  + x.values() + z.values() + [A] # note: '+' concatenates lists
     self.allvars_keys = w.keys() + x.keys() + z.keys() + ['A']
     
     if False:
         # Put test values
         # needs fully implemented gen_xz(), which is not always the case
         # Also, the FD has no test values
         theano.config.compute_test_value = 'raise'
         _w = self.init_w()
         for i in _w: w[i].tag.test_value = _w[i]
         _x, _z, _ = self.gen_xz(_w, {}, {}, 10)
         _x, _z = self.xz_to_theano(_x, _z)
         for i in _x: x[i].tag.test_value = _x[i]
         for i in _z: z[i].tag.test_value = _z[i]
     
     # TODO: more beautiful/standardized way of setting distributions
     # (should be even simpler than this)
     self.dist_px = {}
     self.dist_pz = {}
     
     logpw, logpx, logpz = self.factors(w, x, z, A)
     self.var_logpw, self.var_logpx, self.var_logpz = logpw, logpx, logpz
     
     # Complete-data likelihood estimate
     logpxz = logpx.sum() + logpz.sum()
     self.f_logpxz = theanofunction(self.allvars, [logpx, logpz])
     
     dlogpxz_dwz = T.grad(logpxz, w.values() + z.values())
     self.f_dlogpxz_dwz = theanofunction(self.allvars, [logpx, logpz] + dlogpxz_dwz)
     #self.f_dlogpxz_dw = theanofunction(allvars, [logpxz] + dlogpxz_dw)
     #self.f_dlogpxz_dz = theanofunction(allvars, [logpxz] + dlogpxz_dz)
     
     # prior
     dlogpw_dw = T.grad(logpw, w.values(), disconnected_inputs='ignore')
     self.f_logpw = theanofunction(w.values(), logpw)
     self.f_dlogpw_dw = theanofunction(w.values(), [logpw] + dlogpw_dw)
     
     if False:
         # MC-LIKELIHOOD
         logpx_max = logpx.max()
         logpxmc = T.log(T.exp(logpx - logpx_max).mean()) + logpx_max
         self.f_logpxmc = theanofunction(self.allvars, logpxmc)
         dlogpxmc_dw = T.grad(logpxmc, w.values(), disconnected_inputs=theano_warning)
         self.f_dlogpxmc_dw = theanofunction(self.allvars, [logpxmc] + dlogpxmc_dw)
     
     if True and len(z) > 0:
         # Fisher divergence (FD)
         gz = T.grad(logpxz, z.values())
         gz2 = [T.dmatrix() for _ in gz]
         fd = 0
         for i in range(len(gz)):
             fd += T.sum((gz[i]-gz2[i])**2)
         dfd_dw = T.grad(fd, w.values())
         self.f_dfd_dw = theanofunction(self.allvars + gz2, [logpx, logpz, fd] + dfd_dw)
         
     if False and hessian:
         # Hessian of logpxz wrt z (works best with n_batch=1)
         hessian_z = theano.gradient.hessian(logpxz, z_concat)
         self.f_hessian_z = theanofunction(self.allvars, hessian_z)
Example #11
0
    def factors(self, x, z, A):
        
        v = self.v
        w = self.w
        
        '''
        z is unused
        x['x'] is the data
        
        The names of dict z[...] may be confusing here: the latent variable z is not included in the dict z[...],
        but implicitly computed from epsilon and parameters in w.

        z is computed with g(.) from eps and variational parameters
        let logpx be the generative model density: log p(x|z) where z=g(.)
        let logpz be the prior of Z plus the entropy of q(z|x): logp(z) + H_q(z|x)
        So the lower bound L(x) = logpx + logpz
        
        let logpv and logpw be the (prior) density of the parameters
        '''
        
        # Compute q(z|x)
        hidden_q = [x['x']]
        
        def f_softplus(x): return T.log(T.exp(x) + 1)# - np.log(2)
        def f_rectlin(x): return x*(x>0)
        def f_rectlin2(x): return x*(x>0) + 0.01 * x
        nonlinear = {'tanh': T.tanh, 'sigmoid': T.nnet.sigmoid, 'softplus': f_softplus, 'rectlin': f_rectlin, 'rectlin2': f_rectlin2}
        nonlinear_q = nonlinear[self.nonlinear_q]
        nonlinear_p = nonlinear[self.nonlinear_p]
        
        #rng = rng_curand.CURAND_RandomStreams(0)
        import theano.tensor.shared_randomstreams
        rng = theano.tensor.shared_randomstreams.RandomStreams(0)
        
        # TOTAL HACK
        #hidden_q.append(nonlinear_q(T.dot(v['scale0'], A) * T.dot(w['out_w'].T, hidden_q[-1]) + T.dot(v['b0'], A)))
        #hidden_q.append(nonlinear_q(T.dot(v['scale1'], A) * T.dot(w['w1'].T, hidden_q[-1]) + T.dot(v['b1'], A)))
        for i in range(len(self.n_hidden_q)):
            hidden_q.append(nonlinear_q(T.dot(v['w'+str(i)], hidden_q[-1]) + T.dot(v['b'+str(i)], A)))
            if self.dropout:
                hidden_q[-1] *= 2. * (rng.uniform(size=hidden_q[-1].shape, dtype='float32') > .5)
                
        q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
        if self.type_qz == 'gaussian' or self.type_qz == 'gaussianmarg':
            q_logvar = T.dot(v['logvar_w'], hidden_q[-1]) + T.dot(v['logvar_b'], A)
        else: raise Exception()
        
        # function for distribution q(z|x)
        theanofunc = lazytheanofunc('warn', mode='FAST_RUN')
        self.dist_qz['z'] = theanofunc([x['x']] + [A], [q_mean, q_logvar])
        
        # Compute virtual sample
        eps = rng.normal(size=q_mean.shape, dtype='float32')
        _z = q_mean + T.exp(0.5 * q_logvar) * eps
        
        # Compute log p(x|z)
        hidden_p = [_z]
        for i in range(len(self.n_hidden_p)):
            hidden_p.append(nonlinear_p(T.dot(w['w'+str(i)], hidden_p[-1]) + T.dot(w['b'+str(i)], A)))
            if self.dropout:
                hidden_p[-1] *= 2. * (rng.uniform(size=hidden_p[-1].shape, dtype='float32') > .5)
        
        if self.type_px == 'bernoulli':
            p = T.nnet.sigmoid(T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
            _logpx = - T.nnet.binary_crossentropy(p, x['x'])
            self.dist_px['x'] = theanofunc([_z] + [A], p)
        elif self.type_px == 'gaussian':
            x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
            x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(w['out_logvar_b'], A)
            _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
            self.dist_px['x'] = theanofunc([_z] + [A], [x_mean, x_logvar])
        elif self.type_px == 'bounded01':
            x_mean = T.nnet.sigmoid(T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
            x_logvar = T.dot(w['out_logvar_b'], A)
            _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
            # Make it a mixture between uniform and Gaussian
            w_unif = T.nnet.sigmoid(T.dot(w['out_unif'], A))
            _logpx = T.log(w_unif + (1-w_unif) * T.exp(_logpx))
            self.dist_px['x'] = theanofunc([_z] + [A], [x_mean, x_logvar])
        else: raise Exception("")
            
        # Note: logpx is a row vector (one element per sample)
        logpx = T.dot(shared32(np.ones((1, self.n_x))), _logpx) # logpx = log p(x|z,w)
        
        # log p(z) (prior of z)
        if self.type_pz == 'gaussianmarg':
            logpz = -0.5 * (np.log(2 * np.pi) + (q_mean**2 + T.exp(q_logvar))).sum(axis=0, keepdims=True)
        elif self.type_pz == 'gaussian':
            logpz = ap.logpdfs.standard_normal(_z).sum(axis=0, keepdims=True)
        elif self.type_pz == 'mog':
            pz = 0
            for i in range(self.n_mixture):
                pz += T.exp(ap.logpdfs.normal2(_z, T.dot(w['mog_mean'+str(i)], A), T.dot(w['mog_logvar'+str(i)], A)))
            logpz = T.log(pz).sum(axis=0, keepdims=True) - self.n_z * np.log(float(self.n_mixture))
        elif self.type_pz == 'laplace':
            logpz = ap.logpdfs.standard_laplace(_z).sum(axis=0, keepdims=True)
        elif self.type_pz == 'studentt':
            logpz = ap.logpdfs.studentt(_z, T.dot(T.exp(w['logv']), A)).sum(axis=0, keepdims=True)
        else:
            raise Exception("Unknown type_pz")
        
        # log q(z|x) (entropy of z)
        if self.type_qz == 'gaussianmarg':
            logqz = - 0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(axis=0, keepdims=True)
        elif self.type_qz == 'gaussian':
            logqz = ap.logpdfs.normal2(_z, q_mean, q_logvar).sum(axis=0, keepdims=True)
        else: raise Exception()
                        
        # [new part] Fisher divergence of latent variables
        if self.var_smoothing > 0:
            dlogq_dz = T.grad(logqz.sum(), _z) # gives error when using gaussianmarg instead of gaussian
            dlogp_dz = T.grad((logpx + logpz).sum(), _z)
            FD = 0.5 * ((dlogq_dz - dlogp_dz)**2).sum(axis=0, keepdims=True)
            # [end new part]
            logqz -= self.var_smoothing * FD
        
        # Note: logpv and logpw are scalars
        if True:
            def f_prior(_w, prior_sd=self.prior_sd):
                return ap.logpdfs.normal(_w, 0, prior_sd).sum()
        else:
            def f_prior(_w, prior_sd=self.prior_sd):
                return ap.logpdfs.standard_laplace(_w / prior_sd).sum()
            
        return logpx, logpz, logqz
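
The 'bounded01' branch turns the Gaussian log-density into a mixture with a uniform component on [0, 1]: the uniform density there is 1, so the mixture density is w_unif + (1 - w_unif) * exp(_logpx) and the code stores its log. A numpy version for a single pixel, using the standard Gaussian log-pdf with a log-variance parameter (all numbers are made up):

    import numpy as np

    def gaussian_logpdf(x, mean, logvar):
        return -0.5 * (np.log(2 * np.pi) + logvar + (x - mean)**2 / np.exp(logvar))

    x = 0.8
    w_unif = 0.05                                  # mixture weight of the uniform component
    logpx_gauss = gaussian_logpdf(x, 0.75, -4.0)   # Gaussian component, illustrative parameters
    logpx_mix = np.log(w_unif + (1 - w_unif) * np.exp(logpx_gauss))
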
Example #12
0
    def __init__(self, get_optimizer, theano_warning='raise'):

        v = self.v
        w = self.w
        theanofunction = lazytheanofunc('warn', mode='FAST_RUN')
        theanofunction_silent = lazytheanofunc('ignore', mode='FAST_RUN')

        # Create theano expressions
        x, z = ndict.ordereddicts(self.variables())
        self.var_x, self.var_z, = x, z

        # Helper variables
        A = T.fmatrix('A')
        self.var_A = A
        '''
        # Get gradient symbols
        print 'model, x'
        for (d, xx) in x.items():
          print d
          print xx.shape
          
        print x.values()
        '''

        allvars = x.values() + z.values() + [A]  # note: '+' concatenates lists

        # TODO: more beautiful/standardized way of setting distributions
        # (should be even simpler than this)
        self.dist_qz = {}
        self.dist_px = {}
        self.dist_pz = {}

        factors = self.factors(x, z, A)
        if len(factors) == 3:
            (logpx, logpz, logqz) = factors
            cost = 0
            sparsity_penalty = 0
        else:
            (logpx, logpz, logqz, cost, sparsity_penalty) = factors

        if get_optimizer is None:

            def get_optimizer(w, g):
                from collections import OrderedDict
                updates = OrderedDict()
                for i in w:
                    updates[w[i]] = w[i]
                return updates

        # Log-likelihood lower bound
        self.f_L = theanofunction(
            allvars, [logpx, logpz, logqz, cost, sparsity_penalty])
        L = (logpx + logpz - logqz).sum() - cost - sparsity_penalty

        g = T.grad(L, v.values() + w.values())
        gv, gw = dict(zip(v.keys(), g[0:len(v)])), dict(
            zip(w.keys(), g[len(v):len(v) + len(w)]))
        updates = get_optimizer(v, gv)
        updates.update(get_optimizer(w, gw))

        #self.profmode = theano.ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker())
        #self.f_evalAndUpdate = theano.function(allvars, [logpx + logpz - logqz], updates=updates_w, mode=self.profmode)
        #theano.printing.debugprint(self.f_evalAndUpdate)

        self.f_eval_test = theanofunction(
            allvars, [logpx + logpz - logqz, logpx, logpz, -logqz])
        self.f_eval = theanofunction(allvars, [logpx + logpz - logqz])
        self.f_evalAndUpdate = theanofunction(allvars, [logpx + logpz - logqz],
                                              updates=updates)
        self.f_eval_for_classcondition_prior = theanofunction(
            allvars, [logpx - logqz])
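
Usage note (hedged): the compiled functions take their inputs in the order of allvars (the x variables, then the z variables, then A). Throughout these examples the biases enter as T.dot(b, A), which suggests A is a (1, n_batch) helper matrix of ones that tiles each bias column across the minibatch; the call below is hypothetical and depends on which x/z variables the model actually defines.

    import numpy as np

    n_batch = 100
    A_val = np.ones((1, n_batch), dtype=np.float32)    # broadcasting helper, passed as the last input
    # lower_bound, = model.f_eval(x_minibatch, A_val)  # hypothetical call for a model whose only
    #                                                  # input variable is x['x'] and whose z dict is empty
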
Example #13
0
    def factors(self, x, z, A):
        
        v = self.v         # parameters of recognition model. 
        w = self.w         # parameters of generative model. 
        
        '''
        z is unused
        x['x'] is the data
        
        The names of dict z[...] may be confusing here: the latent variable z is not included in the dict z[...],
        but implicitly computed from epsilon and parameters in w.

        z is computed with g(.) from eps and variational parameters
        let logpx be the generative model density: log p(x|z) where z=g(.)
        let logpz be the prior of Z plus the entropy of q(z|x): logp(z) + H_q(z|x)
        So the lower bound L(x) = logpx + logpz
        
        let logpv and logpw be the (prior) density of the parameters
        '''
        
        # Compute q(z|x)
        hidden_q = [x['x']]
        hidden_q_s = [x['x']]
        
        def f_softplus(x): return T.log(T.exp(x) + 1)# - np.log(2)
        def f_rectlin(x): return x*(x>0)
        def f_rectlin2(x): return x*(x>0) + 0.01 * x
        nonlinear = {'tanh': T.tanh, 'sigmoid': T.nnet.sigmoid, 'softplus': f_softplus, 'rectlin': f_rectlin, 'rectlin2': f_rectlin2}
        nonlinear_q = nonlinear[self.nonlinear_q]
        nonlinear_p = nonlinear[self.nonlinear_p]
        
        #rng = rng_curand.CURAND_RandomStreams(0)
        import theano.tensor.shared_randomstreams
        rng = theano.tensor.shared_randomstreams.RandomStreams(0)
        
        # TOTAL HACK
        #hidden_q.append(nonlinear_q(T.dot(v['scale0'], A) * T.dot(w['out_w'].T, hidden_q[-1]) + T.dot(v['b0'], A)))
        #hidden_q.append(nonlinear_q(T.dot(v['scale1'], A) * T.dot(w['w1'].T, hidden_q[-1]) + T.dot(v['b1'], A)))
        for i in range(len(self.n_hidden_q)):
            hidden_q.append(nonlinear_q(T.dot(v['w'+str(i)], hidden_q[-1]) + T.dot(v['b'+str(i)], A)))
            hidden_q_s.append(T.nnet.sigmoid(T.dot(v['w'+str(i)], hidden_q_s[-1]) + T.dot(v['b'+str(i)], A)))
            if self.dropout:
                hidden_q[-1] *= 2. * (rng.uniform(size=hidden_q[-1].shape, dtype='float32') > .5)
                hidden_q_s[-1] *= 2. * (rng.uniform(size=hidden_q_s[-1].shape, dtype='float32') > .5)
        
        '''
        print 'mm_model'
        for (d, xx) in x.items():
          print d
        '''
        
        #print 'x', x['mean_prior'].type
        #print 'T', (T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)).type
        
        if not self.train_residual:
            q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
        else:
            q_mean = x['mean_prior'] + T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
        #q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
        
        if self.type_qz == 'gaussian' or self.type_qz == 'gaussianmarg':
            q_logvar = T.dot(v['logvar_w'], hidden_q[-1]) + T.dot(v['logvar_b'], A)
        else: raise Exception()
        
        ell = cast32(self.ell)
        self.param_c = shared32(0)
        sv = self.sv

        a_a = cast32(self.average_activation)
        s_w = cast32(self.sparsity_weight)
        
        def activate():
            res = 0
            if self.super_to_mean:
                lenw = len(v['W'].get_value())
                res += T.dot(v['W'][:-1,:].T, q_mean)
                res += T.dot(v['W'][lenw-1:lenw,:].T, A)
            else:
                lenw = len(v['W'].get_value())
                for (hi, hidden) in enumerate(hidden_q[1+sv:]):
                    res += T.dot(v['W'][sum(self.n_hidden_q[sv:sv+hi]):sum(self.n_hidden_q[sv:sv+hi+1]),:].T, hidden)
                res += T.dot(v['W'][lenw-1:lenw,:].T, A)
            return res
        predy = T.argmax(activate(), axis=0)

        # function for distribution q(z|x)
        theanofunc = lazytheanofunc('warn', mode='FAST_RUN')
        self.dist_qz['z'] = theanofunc([x['x'], x['mean_prior']] + [A], [q_mean, q_logvar])
        self.dist_qz['hidden'] = theanofunc([x['x'], x['mean_prior']] + [A], hidden_q[1:])
        self.dist_qz['predy'] = theanofunc([x['x'], x['mean_prior']] + [A], predy)
        
        # compute cost (posterior regularization).
        true_resp = (activate() * x['y']).sum(axis=0, keepdims=True)
        T.addbroadcast(true_resp, 0)

        cost = self.param_c * (ell * (1-x['y']) + activate() - true_resp).max(axis=0).sum()  \
                        + self.Lambda * (v['W'] * v['W']).sum()
        
        # compute the sparsity penalty
        sparsity_penalty = 0
        for i in range(1, len(hidden_q_s)):
            sparsity_penalty += (a_a*T.log(a_a/(hidden_q_s[i].mean(axis=1))) + (1-a_a)*T.log((1-a_a)/(1-(hidden_q_s[i].mean(axis=1))))).sum(axis=0)
        sparsity_penalty *= s_w

        # Compute virtual sample
        eps = rng.normal(size=q_mean.shape, dtype='float32')
        _z = q_mean + T.exp(0.5 * q_logvar) * eps
        
        # Compute log p(x|z)
        hidden_p = [_z]
        for i in range(len(self.n_hidden_p)):
            hidden_p.append(nonlinear_p(T.dot(w['w'+str(i)], hidden_p[-1]) + T.dot(w['b'+str(i)], A)))
            if self.dropout:
                hidden_p[-1] *= 2. * (rng.uniform(size=hidden_p[-1].shape, dtype='float32') > .5)
        
        if self.type_px == 'bernoulli':
            p = T.nnet.sigmoid(T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
            _logpx = - T.nnet.binary_crossentropy(p, x['x'])
            self.dist_px['x'] = theanofunc([_z] + [A], p)
        elif self.type_px == 'gaussian':
            x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
            x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(w['out_logvar_b'], A)
            _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
            self.dist_px['x'] = theanofunc([_z] + [A], [x_mean, x_logvar])
        elif self.type_px == 'bounded01':
            x_mean = T.nnet.sigmoid(T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
            x_logvar = T.dot(w['out_logvar_b'], A)
            _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
            # Make it a mixture between uniform and Gaussian
            w_unif = T.nnet.sigmoid(T.dot(w['out_unif'], A))
            _logpx = T.log(w_unif + (1-w_unif) * T.exp(_logpx))
            self.dist_px['x'] = theanofunc([_z] + [A], [x_mean, x_logvar])
        else: raise Exception("")
            
        # Note: logpx is a row vector (one element per sample)
        logpx = T.dot(shared32(np.ones((1, self.n_x))), _logpx) # logpx = log p(x|z,w)
        
        # log p(z) (prior of z)
        if self.type_pz == 'gaussianmarg':
            if not self.train_residual:
                logpz = -0.5 * (np.log(2 * np.pi * self.sigma_square) + ((q_mean-x['mean_prior'])**2 + T.exp(q_logvar))/self.sigma_square).sum(axis=0, keepdims=True)
            else:
                logpz = -0.5 * (np.log(2 * np.pi * self.sigma_square) + (q_mean**2 + T.exp(q_logvar))/self.sigma_square).sum(axis=0, keepdims=True)
        elif self.type_pz == 'gaussian':
            logpz = ap.logpdfs.standard_normal(_z).sum(axis=0, keepdims=True)
        elif self.type_pz == 'mog':
            pz = 0
            for i in range(self.n_mixture):
                pz += T.exp(ap.logpdfs.normal2(_z, T.dot(w['mog_mean'+str(i)], A), T.dot(w['mog_logvar'+str(i)], A)))
            logpz = T.log(pz).sum(axis=0, keepdims=True) - self.n_z * np.log(float(self.n_mixture))
        elif self.type_pz == 'laplace':
            logpz = ap.logpdfs.standard_laplace(_z).sum(axis=0, keepdims=True)
        elif self.type_pz == 'studentt':
            logpz = ap.logpdfs.studentt(_z, T.dot(T.exp(w['logv']), A)).sum(axis=0, keepdims=True)
        else:
            raise Exception("Unknown type_pz")
        
        # log q(z|x) (entropy of z)
        if self.type_qz == 'gaussianmarg':
            logqz = - 0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(axis=0, keepdims=True)
        elif self.type_qz == 'gaussian':
            logqz = ap.logpdfs.normal2(_z, q_mean, q_logvar).sum(axis=0, keepdims=True)
        else: raise Exception()
                        
        # [new part] Fisher divergence of latent variables
        if self.var_smoothing > 0:
            dlogq_dz = T.grad(logqz.sum(), _z) # gives error when using gaussianmarg instead of gaussian
            dlogp_dz = T.grad((logpx + logpz).sum(), _z)
            FD = 0.5 * ((dlogq_dz - dlogp_dz)**2).sum(axis=0, keepdims=True)
            # [end new part]
            logqz -= self.var_smoothing * FD
        
        # Note: logpv and logpw are scalars
        if True:
            def f_prior(_w, prior_sd=self.prior_sd):
                return ap.logpdfs.normal(_w, 0, prior_sd).sum()
        else:
            def f_prior(_w, prior_sd=self.prior_sd):
                return ap.logpdfs.standard_laplace(_w / prior_sd).sum()
            
        return logpx, logpz, logqz, cost, sparsity_penalty
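
The sparsity penalty above is the KL divergence between a target Bernoulli with mean average_activation and the empirical mean activation of each sigmoid unit, summed over units and scaled by sparsity_weight (the familiar sparse-autoencoder penalty). A numpy version for one layer, with an illustrative target and activations:

    import numpy as np

    a_a = 0.05                                # target average activation
    rho = np.array([0.03, 0.07, 0.05])        # per-unit mean activations over the minibatch
    penalty = (a_a * np.log(a_a / rho) +
               (1 - a_a) * np.log((1 - a_a) / (1 - rho))).sum()
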
Example #14
0
    def factors(self, v, w, x, z, A):
        '''
        z['eps'] are the independent epsilons (Gaussian with unit variance)
        x['x'] is the data
        x['y'] is categorical data (1-of-K coded)

        The names of list z[...] may be confusing here: the latent variable z is not included in the list z[...],
        but implicitly computed from epsilon and parameters in w.

        z is computed with g(.) from eps and variational parameters
        let logpx be the generative model density: log p(y) + log p(x|y,z) where z=g(.)
        let logpz be the prior of Z plus the entropy of q(z|x,y): logp(z) + H_q(z|x)
        So the lower bound L(x) = logpx + logpz

        let logpv and logpw be the (prior) density of the parameters
        '''
        def f_softplus(x):
            return T.log(T.exp(x) + 1)  # - np.log(2)

        def f_rectlin(x):
            return x * (x > 0)

        def f_rectlin2(x):
            return x * (x > 0) + 0.01 * x

        nonlinear = {
            'tanh': T.tanh,
            'sigmoid': T.nnet.sigmoid,
            'softplus': f_softplus,
            'rectlin': f_rectlin,
            'rectlin2': f_rectlin2
        }
        nonlinear_q = nonlinear[self.nonlinear_q]
        nonlinear_p = nonlinear[self.nonlinear_p]

        # Compute q(z|x,y)
        hidden_q = [
            nonlinear_q(
                T.dot(v['w0x'], x['x']) + T.dot(v['w0y'], x['y']) +
                T.dot(v['b0'], A))
        ]
        for i in range(1, len(self.n_hidden_q)):
            hidden_q.append(
                nonlinear_q(
                    T.dot(v['w' + str(i)], hidden_q[-1]) +
                    T.dot(v['b' + str(i)], A)))

        q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
        if self.type_qz == 'gaussian' or self.type_qz == 'gaussianmarg':
            q_logvar = T.dot(v['logvar_w'], hidden_q[-1]) + T.dot(
                v['logvar_b'], A)
        else:
            raise Exception()

        # function for distribution q(z|x)
        theanofunc = lazytheanofunc('ignore', mode='FAST_RUN')
        self.dist_qz['z'] = theanofunc([x['x'], x['y']] + list(v.values()) +
                                       [A], [q_mean, q_logvar])

        # Compute virtual sample
        _z = q_mean + T.exp(0.5 * q_logvar) * z['eps']

        # Compute log p(x|y,z)
        hidden_p = [
            nonlinear_p(
                T.dot(w['w0y'], x['y']) + T.dot(w['w0z'], _z) +
                T.dot(w['b0'], A))
        ]
        for i in range(1, len(self.n_hidden_p)):
            hidden_p.append(
                nonlinear_p(
                    T.dot(w['w' + str(i)], hidden_p[-1]) +
                    T.dot(w['b' + str(i)], A)))

        if self.type_px == 'bernoulli':
            p = T.nnet.sigmoid(
                T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
            _logpx = -T.nnet.binary_crossentropy(p, x['x'])
            self.dist_px['x'] = theanofunc([x['y'], _z] + list(w.values()) +
                                           [A], p)
        elif self.type_px == 'gaussian' or self.type_px == 'sigmoidgaussian':
            x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
            if self.type_px == 'sigmoidgaussian':
                x_mean = T.nnet.sigmoid(x_mean)
            x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(
                w['out_logvar_b'], A)
            _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
            self.dist_px['x'] = theanofunc([x['y'], _z] + list(w.values()) +
                                           [A], [x_mean, x_logvar])
        else:
            raise Exception("")

        # Note: logpx is a row vector (one element per sample)
        logpx = T.dot(np.ones((1, self.n_x)),
                      _logpx)  # logpx = logp(y|w) + logp(x|z,w)

        # log p(y) (prior of y)
        _logpy = w['logpy']
        if self.uniform_y: _logpy *= 0
        py_model = T.nnet.softmax(T.dot(_logpy, A).T).T
        logpy = (
            -T.nnet.categorical_crossentropy(py_model.T, x['y'].T).T).reshape(
                (1, -1))
        logpx += logpy
        self.dist_px['y'] = theanofunc(list(w.values()) + [A], py_model)

        # log p(z) (prior of z)
        if self.type_pz == 'gaussianmarg':
            logpz = -0.5 * (np.log(2 * np.pi) +
                            (q_mean**2 + T.exp(q_logvar))).sum(axis=0,
                                                               keepdims=True)
        elif self.type_pz == 'gaussian':
            logpz = ap.logpdfs.standard_normal(_z).sum(axis=0, keepdims=True)
        elif self.type_pz == 'laplace':
            logpz = ap.logpdfs.standard_laplace(_z).sum(axis=0, keepdims=True)
        elif self.type_pz == 'studentt':
            logpz = ap.logpdfs.studentt(_z, T.dot(T.exp(w['logv']),
                                                  A)).sum(axis=0,
                                                          keepdims=True)
        else:
            raise Exception("Unknown type_pz")

        # log q(z|x) (entropy of z)
        if self.type_qz == 'gaussianmarg':
            logqz = -0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(
                axis=0, keepdims=True)
        elif self.type_qz == 'gaussian':
            logqz = ap.logpdfs.normal2(_z, q_mean, q_logvar).sum(axis=0,
                                                                 keepdims=True)
        else:
            raise Exception()

        # Note: logpv and logpw are scalars
        def f_prior(_w, prior_sd=self.prior_sd):
            return ap.logpdfs.normal(_w, 0, prior_sd).sum()

        logpv = 0
        logpv += f_prior(v['w0x'])
        logpv += f_prior(v['w0y'])
        for i in range(1, len(self.n_hidden_q)):
            logpv += f_prior(v['w' + str(i)])
        logpv += f_prior(v['mean_w'])
        if self.type_qz in ['gaussian', 'gaussianmarg']:
            logpv += f_prior(v['logvar_w'])

        logpw = 0
        logpw += f_prior(w['w0y'])
        logpw += f_prior(w['w0z'])
        for i in range(1, len(self.n_hidden_p)):
            logpw += f_prior(w['w' + str(i)])
        logpw += f_prior(w['out_w'])
        if self.type_px in ['sigmoidgaussian', 'gaussian']:
            logpw += f_prior(w['out_logvar_w'])
        if self.type_pz == 'studentt':
            logpw += f_prior(w['logv'])

        return logpv, logpw, logpx, logpz, logqz
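
f_prior places an isotropic Gaussian prior on each parameter matrix; logpv and logpw accumulate the summed log-densities log N(w; 0, prior_sd^2), assuming prior_sd is a standard deviation. A numpy equivalent of one such log-prior term (the weight matrix and prior_sd are illustrative):

    import numpy as np

    def gaussian_logprior(W, prior_sd):
        # sum of log N(w_ij; 0, prior_sd^2) over all entries of W
        return (-0.5 * np.log(2 * np.pi * prior_sd**2) - W**2 / (2 * prior_sd**2)).sum()

    W = np.random.RandomState(0).randn(5, 3) * 0.1
    logpw_term = gaussian_logprior(W, prior_sd=0.1)
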
Example #15
0
    def factors(self, x, z, A):

        v = self.v
        w = self.w
        '''
        z is unused
        x['x'] is the data
        
        The names of dict z[...] may be confusing here: the latent variable z is not included in the dict z[...],
        but implicitly computed from epsilon and parameters in w.

        z is computed with g(.) from eps and variational parameters
        let logpx be the generative model density: log p(x|z) where z=g(.)
        let logpz be the prior of Z plus the entropy of q(z|x): logp(z) + H_q(z|x)
        So the lower bound L(x) = logpx + logpz
        
        let logpv and logpw be the (prior) density of the parameters
        '''
        def f_softplus(x):
            return T.log(T.exp(x) + 1)  # - np.log(2)

        def f_rectlin(x):
            return x * (x > 0)

        def f_rectlin2(x):
            return x * (x > 0) + 0.01 * x

        nonlinear = {
            'tanh': T.tanh,
            'sigmoid': T.nnet.sigmoid,
            'softplus': f_softplus,
            'rectlin': f_rectlin,
            'rectlin2': f_rectlin2
        }
        nonlinear_q = nonlinear[self.nonlinear_q]
        nonlinear_p = nonlinear[self.nonlinear_p]

        #rng = rng_curand.CURAND_RandomStreams(0)
        import theano.tensor.shared_randomstreams
        rng = theano.tensor.shared_randomstreams.RandomStreams(0)

        # Compute q(z|x,y)
        hidden_q = [
            nonlinear_q(
                T.dot(v['w0x'], x['x']) + T.dot(v['w0y'], x['y']) +
                T.dot(v['b0'], A))
        ]
        for i in range(1, len(self.n_hidden_q)):
            hidden_q.append(
                nonlinear_q(
                    T.dot(v['w' + str(i)], hidden_q[-1]) +
                    T.dot(v['b' + str(i)], A)))

        q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
        if self.type_qz == 'gaussian' or self.type_qz == 'gaussianmarg':
            q_logvar = T.dot(v['logvar_w'], hidden_q[-1]) + T.dot(
                v['logvar_b'], A)
        else:
            raise Exception()

        # function for distribution q(z|x)
        theanofunc = lazytheanofunc('warn', mode='FAST_RUN')
        self.dist_qz['z'] = theanofunc([x['x'], x['y']] + [A],
                                       [q_mean, q_logvar])

        # Compute virtual sample
        eps = rng.normal(size=q_mean.shape, dtype='float32')
        _z = q_mean + T.exp(0.5 * q_logvar) * eps

        # Compute log p(x|z)
        hidden_p = [
            nonlinear_p(
                T.dot(w['w0y'], x['y']) + T.dot(w['w0z'], _z) +
                T.dot(w['b0'], A))
        ]
        for i in range(1, len(self.n_hidden_p)):
            hidden_p.append(
                nonlinear_p(
                    T.dot(w['w' + str(i)], hidden_p[-1]) +
                    T.dot(w['b' + str(i)], A)))
            if self.dropout:
                hidden_p[-1] *= 2. * (rng.uniform(size=hidden_p[-1].shape,
                                                  dtype='float32') > .5)

        if self.type_px == 'bernoulli':
            p = T.nnet.sigmoid(
                T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
            _logpx = -T.nnet.binary_crossentropy(p, x['x'])
            self.dist_px['x'] = theanofunc([x['y'], _z] + [A], p)
        elif self.type_px == 'gaussian':
            x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
            x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(
                w['out_logvar_b'], A)
            _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
            self.dist_px['x'] = theanofunc([x['y'], _z] + [A],
                                           [x_mean, x_logvar])
        elif self.type_px == 'laplace':
            x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
            x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(
                w['out_logvar_b'], A)
            _logpx = ap.logpdfs.laplace(x['x'], x_mean, x_logvar)
            self.dist_px['x'] = theanofunc([x['y'], _z] + [A],
                                           [x_mean, x_logvar])

        else:
            raise Exception("")

        # Note: logpx is a row vector (one element per sample)
        logpx = T.dot(shared32(np.ones((1, self.n_x))),
                      _logpx)  # logpx = log p(x|z,w)

        # log p(y) (prior of y)
        #_logpy = w['logpy']
        #if self.uniform_y: _logpy *= 0
        #py_model = T.nnet.softmax(T.dot(_logpy, A).T).T
        #logpy = (- T.nnet.categorical_crossentropy(py_model.T, x['y'].T).T).reshape((1,-1))
        #logpx += logpy
        #self.dist_px['y'] = theanofunc([A], py_model)

        # log p(z) (prior of z)
        if self.type_pz == 'gaussianmarg':
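            # 'gaussianmarg': E_q[log p(z)] with a standard-normal prior in closed form,
            # -0.5 * (log(2*pi) + mu^2 + sigma^2) summed over latent dimensions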
            logpz = -0.5 * (np.log(2 * np.pi) +
                            (q_mean**2 + T.exp(q_logvar))).sum(axis=0,
                                                               keepdims=True)
        elif self.type_pz == 'gaussian':
            logpz = ap.logpdfs.standard_normal(_z).sum(axis=0, keepdims=True)
        elif self.type_pz == 'mog':
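            # mixture-of-Gaussians prior with uniform weights: average the K component
            # densities per dimension; '- n_z * log(K)' is the log of the mixture weight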
            pz = 0
            for i in range(self.n_mixture):
                pz += T.exp(
                    ap.logpdfs.normal2(_z, T.dot(w['mog_mean' + str(i)], A),
                                       T.dot(w['mog_logvar' + str(i)], A)))
            logpz = T.log(pz).sum(axis=0, keepdims=True) - self.n_z * np.log(
                float(self.n_mixture))
        elif self.type_pz == 'laplace':
            logpz = ap.logpdfs.standard_laplace(_z).sum(axis=0, keepdims=True)
        elif self.type_pz == 'studentt':
            logpz = ap.logpdfs.studentt(_z, T.dot(T.exp(w['logv']),
                                                  A)).sum(axis=0,
                                                          keepdims=True)
        else:
            raise Exception("Unknown type_pz")

        # log q(z|x) (entropy of z)
        if self.type_qz == 'gaussianmarg':
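            # 'gaussianmarg': E_q[log q(z|x)] of the diagonal Gaussian posterior, i.e.
            # its negative entropy -0.5 * (log(2*pi) + 1 + logvar) per dimension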
            logqz = -0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(
                axis=0, keepdims=True)
        elif self.type_qz == 'gaussian':
            logqz = ap.logpdfs.normal2(_z, q_mean, q_logvar).sum(axis=0,
                                                                 keepdims=True)
        else:
            raise Exception()

        # Note: logpv and logpw are scalars
        def f_prior(_w, prior_sd=self.prior_sd):
            return ap.logpdfs.normal(_w, 0, prior_sd).sum()

        logpv = 0
        logpv += f_prior(v['w0x'])
        logpv += f_prior(v['w0y'])
        for i in range(1, len(self.n_hidden_q)):
            logpv += f_prior(v['w' + str(i)])
        logpv += f_prior(v['mean_w'])
        if self.type_qz in ['gaussian', 'gaussianmarg']:
            logpv += f_prior(v['logvar_w'])

        logpw = 0
        logpw += f_prior(w['w0y'])
        logpw += f_prior(w['w0z'])
        for i in range(1, len(self.n_hidden_p)):
            logpw += f_prior(w['w' + str(i)])
        logpw += f_prior(w['out_w'])
        if self.type_px in ['sigmoidgaussian', 'gaussian', 'laplace']:
            logpw += f_prior(w['out_logvar_w'])
        if self.type_pz == 'studentt':
            logpw += f_prior(w['logv'])

        #return logpv, logpw, logpx, logpz, logqz
        return logpx, logpz, logqz
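A small NumPy sketch of how the three factors returned above combine into the estimated lower bound, using the closed-form 'gaussianmarg' branches; the helper name and toy shapes are illustrative assumptions, not part of the original code.

import numpy as np

def lower_bound_terms(q_mean, q_logvar, logpx):
    # E_q[log p(z)] under a standard-normal prior ('gaussianmarg' branch above)
    logpz = -0.5 * (np.log(2 * np.pi) + q_mean ** 2 + np.exp(q_logvar)).sum(axis=0)
    # E_q[log q(z|x)] of the diagonal Gaussian posterior ('gaussianmarg' branch above)
    logqz = -0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(axis=0)
    # estimated lower bound per data point
    return logpx + logpz - logqz

# toy check: when q(z|x) equals the prior, the two terms cancel and L equals log p(x|z)
q_mean, q_logvar = np.zeros((2, 3)), np.zeros((2, 3))
print(lower_bound_terms(q_mean, q_logvar, np.array([-10.0, -12.0, -9.0])))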
Example #16
0
    def factors(self, v, w, x, z, A):
        '''
        z['eps'] is the independent epsilons (Gaussian with unit variance)
        x['x'] is the data
        
        The names of dict z[...] may be confusing here: the latent variable z is not included in the dict z[...],
        but implicitly computed from epsilon and parameters in w.

        z is computed with g(.) from eps and variational parameters
        let logpx be the generative model density: log p(x|z) where z=g(.)
        let logpz be the log-prior log p(z) and logqz be log q(z|x) (so -logqz is the entropy term)
        So the lower bound L(x) = logpx + logpz - logqz
        
        let logpv and logpw be the (prior) density of the parameters
        '''

        # Compute q(z|x)
        hidden_q = [x['x']]

        def f_softplus(x):
            return T.log(T.exp(x) + 1)  # - np.log(2)

        def f_rectlin(x):
            return x * (x > 0)

        def f_rectlin2(x):
            return x * (x > 0) + 0.01 * x

        nonlinear = {
            'tanh': T.tanh,
            'sigmoid': T.nnet.sigmoid,
            'softplus': f_softplus,
            'rectlin': f_rectlin,
            'rectlin2': f_rectlin2
        }
        nonlinear_q = nonlinear[self.nonlinear_q]
        nonlinear_p = nonlinear[self.nonlinear_p]

        for i in range(len(self.n_hidden_q)):
            hidden_q.append(
                nonlinear_q(
                    T.dot(v['w' + str(i)], hidden_q[-1]) +
                    T.dot(v['b' + str(i)], A)))

        q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
        if self.type_qz == 'gaussian' or self.type_qz == 'gaussianmarg':
            q_logvar = T.dot(v['logvar_w'], hidden_q[-1]) + T.dot(
                v['logvar_b'], A)
        else:
            raise Exception()

        # function for distribution q(z|x)
        theanofunc = lazytheanofunc('warn', mode='FAST_RUN')
        self.dist_qz['z'] = theanofunc([x['x']] + v.values() + [A],
                                       [q_mean, q_logvar])

        # Compute virtual sample
        _z = q_mean + T.exp(0.5 * q_logvar) * z['eps']

        # Compute log p(x|z)
        hidden_p = [_z]
        for i in range(len(self.n_hidden_p)):
            hidden_p.append(
                nonlinear_p(
                    T.dot(w['w' + str(i)], hidden_p[-1]) +
                    T.dot(w['b' + str(i)], A)))

        if self.type_px == 'bernoulli':
            p = T.nnet.sigmoid(
                T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
            _logpx = -T.nnet.binary_crossentropy(p, x['x'])
            self.dist_px['x'] = theanofunc([_z] + w.values() + [A], p)
        elif self.type_px == 'gaussian':
            x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
            x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(
                w['out_logvar_b'], A)
            _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
            self.dist_px['x'] = theanofunc([_z] + w.values() + [A],
                                           [x_mean, x_logvar])
        elif self.type_px == 'bounded01':
            x_mean = T.nnet.sigmoid(
                T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
            x_logvar = T.dot(w['out_logvar_b'], A)
            _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
            # Make it a mixture between uniform and Gaussian
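            # (the uniform component has density 1 on [0, 1], hence the bare w_unif term)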
            w_unif = T.nnet.sigmoid(T.dot(w['out_unif'], A))
            _logpx = T.log(w_unif + (1 - w_unif) * T.exp(_logpx))
            self.dist_px['x'] = theanofunc([_z] + w.values() + [A],
                                           [x_mean, x_logvar])
        else:
            raise Exception("")

        # Note: logpx is a row vector (one element per sample)
        logpx = T.dot(np.ones((1, self.n_x)), _logpx)  # logpx = log p(x|z,w)

        # log p(z) (prior of z)
        if self.type_pz == 'gaussianmarg':
            logpz = -0.5 * (np.log(2 * np.pi) +
                            (q_mean**2 + T.exp(q_logvar))).sum(axis=0,
                                                               keepdims=True)
        elif self.type_pz == 'gaussian':
            logpz = ap.logpdfs.standard_normal(_z).sum(axis=0, keepdims=True)
        elif self.type_pz == 'laplace':
            logpz = ap.logpdfs.standard_laplace(_z).sum(axis=0, keepdims=True)
        elif self.type_pz == 'studentt':
            logpz = ap.logpdfs.studentt(_z, T.dot(T.exp(w['logv']),
                                                  A)).sum(axis=0,
                                                          keepdims=True)
        else:
            raise Exception("Unknown type_pz")

        # log q(z|x) (entropy of z)
        if self.type_qz == 'gaussianmarg':
            logqz = -0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(
                axis=0, keepdims=True)
        elif self.type_qz == 'gaussian':
            logqz = ap.logpdfs.normal2(_z, q_mean, q_logvar).sum(axis=0,
                                                                 keepdims=True)
        else:
            raise Exception()

        # [new part] Fisher divergence of latent variables
        if self.var_smoothing > 0:
            dlogq_dz = T.grad(
                logqz.sum(),
                _z)  # gives error when using gaussianmarg instead of gaussian
            dlogp_dz = T.grad((logpx + logpz).sum(), _z)
            FD = 0.5 * ((dlogq_dz - dlogp_dz)**2).sum(axis=0, keepdims=True)
            # [end new part]
            logqz -= self.var_smoothing * FD

        # Note: logpv and logpw are scalars
        if True:

            def f_prior(_w, prior_sd=self.prior_sd):
                return ap.logpdfs.normal(_w, 0, prior_sd).sum()
        else:

            def f_prior(_w, prior_sd=self.prior_sd):
                return ap.logpdfs.standard_laplace(_w / prior_sd).sum()

        logpv = 0
        for i in range(len(self.n_hidden_q)):
            logpv += f_prior(v['w' + str(i)])
        logpv += f_prior(v['mean_w'])
        if self.type_qz in ['gaussian', 'gaussianmarg']:
            logpv += f_prior(v['logvar_w'])

        logpw = 0
        for i in range(len(self.n_hidden_p)):
            logpw += f_prior(w['w' + str(i)])
        logpw += f_prior(w['out_w'])
        if self.type_px == 'gaussian':
            logpw += f_prior(w['out_logvar_w'])
        if self.type_pz == 'studentt':
            logpw += f_prior(w['logv'])

        return logpv, logpw, logpx, logpz, logqz
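The var_smoothing branch above penalizes the Fisher divergence between q(z|x) and the unnormalized posterior. A hedged NumPy sketch of the same quantity for a 'gaussian' q and a standard-normal prior; the likelihood gradient dlogpx_dz is assumed to be given (in the Theano code T.grad produces it from the symbolic graph).

import numpy as np

def fisher_divergence(z, q_mean, q_logvar, dlogpx_dz):
    dlogq_dz = -(z - q_mean) / np.exp(q_logvar)   # score of the diagonal Gaussian q(z|x)
    dlogpz_dz = -z                                # score of the standard-normal prior
    dlogp_dz = dlogpx_dz + dlogpz_dz
    return 0.5 * ((dlogq_dz - dlogp_dz) ** 2).sum(axis=0, keepdims=True)

# toy usage: with q_mean = 0, q_logvar = 0 and a flat likelihood the two scores match,
# so the divergence is zero
z = np.random.randn(2, 3)
print(fisher_divergence(z, np.zeros((2, 3)), np.zeros((2, 3)), np.zeros((2, 3))))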
Example #17
0
    def factors(self, x, z, A):

        v = self.v  # parameters of recognition model.
        w = self.w  # parameters of generative model.
        '''
        z is unused
        x['x'] is the data
        
        The names of dict z[...] may be confusing here: the latent variable z is not included in the dict z[...],
        but implicitly computed from epsilon and parameters in w.

        z is computed with g(.) from eps and variational parameters
        let logpx be the generative model density: log p(x|z) where z=g(.)
        let logpz be the log-prior log p(z) and logqz be log q(z|x) (so -logqz is the entropy term)
        So the lower bound L(x) = logpx + logpz - logqz
        
        let logpv and logpw be the (prior) density of the parameters
        '''

        # Compute q(z|x)
        hidden_q = [x['x']]
        hidden_q_s = [x['x']]

        def f_softplus(x):
            return T.log(T.exp(x) + 1)  # - np.log(2)

        def f_rectlin(x):
            return x * (x > 0)

        def f_rectlin2(x):
            return x * (x > 0) + 0.01 * x

        nonlinear = {
            'tanh': T.tanh,
            'sigmoid': T.nnet.sigmoid,
            'softplus': f_softplus,
            'rectlin': f_rectlin,
            'rectlin2': f_rectlin2
        }
        nonlinear_q = nonlinear[self.nonlinear_q]
        nonlinear_p = nonlinear[self.nonlinear_p]

        #rng = rng_curand.CURAND_RandomStreams(0)
        import theano.tensor.shared_randomstreams
        rng = theano.tensor.shared_randomstreams.RandomStreams(0)

        # TOTAL HACK
        #hidden_q.append(nonlinear_q(T.dot(v['scale0'], A) * T.dot(w['out_w'].T, hidden_q[-1]) + T.dot(v['b0'], A)))
        #hidden_q.append(nonlinear_q(T.dot(v['scale1'], A) * T.dot(w['w1'].T, hidden_q[-1]) + T.dot(v['b1'], A)))
        for i in range(len(self.n_hidden_q)):
            hidden_q.append(
                nonlinear_q(
                    T.dot(v['w' + str(i)], hidden_q[-1]) +
                    T.dot(v['b' + str(i)], A)))
            hidden_q_s.append(
                T.nnet.sigmoid(
                    T.dot(v['w' + str(i)], hidden_q_s[-1]) +
                    T.dot(v['b' + str(i)], A)))
            if self.dropout:
                hidden_q[-1] *= 2. * (rng.uniform(size=hidden_q[-1].shape,
                                                  dtype='float32') > .5)
                hidden_q_s[-1] *= 2. * (rng.uniform(size=hidden_q_s[-1].shape,
                                                    dtype='float32') > .5)
        '''
        print 'mm_model'
        for (d, xx) in x.items():
          print d
        '''

        #print 'x', x['mean_prior'].type
        #print 'T', (T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)).type

        if not self.train_residual:
            q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
        else:
            q_mean = x['mean_prior'] + T.dot(
                v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
        #q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)

        if self.type_qz == 'gaussian' or self.type_qz == 'gaussianmarg':
            q_logvar = T.dot(v['logvar_w'], hidden_q[-1]) + T.dot(
                v['logvar_b'], A)
        else:
            raise Exception()

        ell = cast32(self.ell)
        self.param_c = shared32(0)
        sv = self.sv

        a_a = cast32(self.average_activation)
        s_w = cast32(self.sparsity_weight)

        def activate():
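            # Linear classifier scores: blocks of v['W'] applied either to q_mean
            # (super_to_mean) or to the recognition-net hidden layers, plus a bias
            # row (the last row of v['W']) multiplied by A.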
            res = 0
            if self.super_to_mean:
                lenw = len(v['W'].get_value())
                res += T.dot(v['W'][:-1, :].T, q_mean)
                res += T.dot(v['W'][lenw - 1:lenw, :].T, A)
            else:
                lenw = len(v['W'].get_value())
                for (hi, hidden) in enumerate(hidden_q[1 + sv:]):
                    res += T.dot(
                        v['W'][sum(self.n_hidden_q[sv:sv + hi]
                                   ):sum(self.n_hidden_q[sv:sv + hi +
                                                         1]), :].T, hidden)
                res += T.dot(v['W'][lenw - 1:lenw, :].T, A)
            return res

        predy = T.argmax(activate(), axis=0)

        # function for distribution q(z|x)
        theanofunc = lazytheanofunc('warn', mode='FAST_RUN')
        self.dist_qz['z'] = theanofunc([x['x'], x['mean_prior']] + [A],
                                       [q_mean, q_logvar])
        self.dist_qz['hidden'] = theanofunc([x['x'], x['mean_prior']] + [A],
                                            hidden_q[1:])
        self.dist_qz['predy'] = theanofunc([x['x'], x['mean_prior']] + [A],
                                           predy)

        # compute cost (posterior regularization).
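        # Multiclass max-margin penalty: per sample, max over classes of
        # [ell * (1 - y_onehot) + score] minus the true-class score (zero when the
        # true class wins by margin ell), plus an L2 penalty on v['W'].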
        true_resp = (activate() * x['y']).sum(axis=0, keepdims=True)
        true_resp = T.addbroadcast(true_resp, 0)  # make the summed class axis broadcastable

        cost = self.param_c * (ell * (1-x['y']) + activate() - true_resp).max(axis=0).sum()  \
                        + self.Lambda * (v['W'] * v['W']).sum()

        # compute the sparsity penalty
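        # (KL between a Bernoulli with target mean a_a and a Bernoulli with each
        #  sigmoid unit's mean activation, summed over units and layers, times s_w)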
        sparsity_penalty = 0
        for i in range(1, len(hidden_q_s)):
            sparsity_penalty += (a_a * T.log(a_a /
                                             (hidden_q_s[i].mean(axis=1))) +
                                 (1 - a_a) * T.log(
                                     (1 - a_a) /
                                     (1 - (hidden_q_s[i].mean(axis=1))))).sum(
                                         axis=0)
        sparsity_penalty *= s_w

        # Compute virtual sample
        eps = rng.normal(size=q_mean.shape, dtype='float32')
        _z = q_mean + T.exp(0.5 * q_logvar) * eps

        # Compute log p(x|z)
        hidden_p = [_z]
        for i in range(len(self.n_hidden_p)):
            hidden_p.append(
                nonlinear_p(
                    T.dot(w['w' + str(i)], hidden_p[-1]) +
                    T.dot(w['b' + str(i)], A)))
            if self.dropout:
                hidden_p[-1] *= 2. * (rng.uniform(size=hidden_p[-1].shape,
                                                  dtype='float32') > .5)

        if self.type_px == 'bernoulli':
            p = T.nnet.sigmoid(
                T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
            _logpx = -T.nnet.binary_crossentropy(p, x['x'])
            self.dist_px['x'] = theanofunc([_z] + [A], p)
        elif self.type_px == 'gaussian':
            x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
            x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(
                w['out_logvar_b'], A)
            _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
            self.dist_px['x'] = theanofunc([_z] + [A], [x_mean, x_logvar])
        elif self.type_px == 'bounded01':
            x_mean = T.nnet.sigmoid(
                T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
            x_logvar = T.dot(w['out_logvar_b'], A)
            _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
            # Make it a mixture between uniform and Gaussian
            w_unif = T.nnet.sigmoid(T.dot(w['out_unif'], A))
            _logpx = T.log(w_unif + (1 - w_unif) * T.exp(_logpx))
            self.dist_px['x'] = theanofunc([_z] + [A], [x_mean, x_logvar])
        else:
            raise Exception("")

        # Note: logpx is a row vector (one element per sample)
        logpx = T.dot(shared32(np.ones((1, self.n_x))),
                      _logpx)  # logpx = log p(x|z,w)

        # log p(z) (prior of z)
        if self.type_pz == 'gaussianmarg':
            if not self.train_residual:
                logpz = -0.5 * (np.log(2 * np.pi * self.sigma_square) + (
                    (q_mean - x['mean_prior'])**2 + T.exp(q_logvar)) /
                                self.sigma_square).sum(axis=0, keepdims=True)
            else:
                logpz = -0.5 * (np.log(2 * np.pi * self.sigma_square) +
                                (q_mean**2 + T.exp(q_logvar)) /
                                self.sigma_square).sum(axis=0, keepdims=True)
        elif self.type_pz == 'gaussian':
            logpz = ap.logpdfs.standard_normal(_z).sum(axis=0, keepdims=True)
        elif self.type_pz == 'mog':
            pz = 0
            for i in range(self.n_mixture):
                pz += T.exp(
                    ap.logpdfs.normal2(_z, T.dot(w['mog_mean' + str(i)], A),
                                       T.dot(w['mog_logvar' + str(i)], A)))
            logpz = T.log(pz).sum(axis=0, keepdims=True) - self.n_z * np.log(
                float(self.n_mixture))
        elif self.type_pz == 'laplace':
            logpz = ap.logpdfs.standard_laplace(_z).sum(axis=0, keepdims=True)
        elif self.type_pz == 'studentt':
            logpz = ap.logpdfs.studentt(_z, T.dot(T.exp(w['logv']),
                                                  A)).sum(axis=0,
                                                          keepdims=True)
        else:
            raise Exception("Unknown type_pz")

        # log q(z|x) (entropy of z)
        if self.type_qz == 'gaussianmarg':
            logqz = -0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(
                axis=0, keepdims=True)
        elif self.type_qz == 'gaussian':
            logqz = ap.logpdfs.normal2(_z, q_mean, q_logvar).sum(axis=0,
                                                                 keepdims=True)
        else:
            raise Exception()

        # [new part] Fisher divergence of latent variables
        if self.var_smoothing > 0:
            dlogq_dz = T.grad(
                logqz.sum(),
                _z)  # gives error when using gaussianmarg instead of gaussian
            dlogp_dz = T.grad((logpx + logpz).sum(), _z)
            FD = 0.5 * ((dlogq_dz - dlogp_dz)**2).sum(axis=0, keepdims=True)
            # [end new part]
            logqz -= self.var_smoothing * FD

        # Note: logpv and logpw are scalars
        if True:

            def f_prior(_w, prior_sd=self.prior_sd):
                return ap.logpdfs.normal(_w, 0, prior_sd).sum()
        else:

            def f_prior(_w, prior_sd=self.prior_sd):
                return ap.logpdfs.standard_laplace(_w / prior_sd).sum()

        return logpx, logpz, logqz, cost, sparsity_penalty
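The sparsity penalty above is the usual KL-between-Bernoullis regularizer from sparse autoencoders. A NumPy sketch of the same computation; the helper name and the default values for a_a and s_w are illustrative assumptions.

import numpy as np

def sparsity_penalty(hidden_q_s, a_a=0.05, s_w=1.0):
    penalty = 0.0
    for h in hidden_q_s:          # each h: (n_units, n_samples), sigmoid activations in (0, 1)
        rho = h.mean(axis=1)      # mean activation of each unit over the minibatch
        penalty += (a_a * np.log(a_a / rho) +
                    (1.0 - a_a) * np.log((1.0 - a_a) / (1.0 - rho))).sum()
    return s_w * penalty

# toy usage: activations already at the target mean give (near-)zero penalty
print(sparsity_penalty([np.full((4, 10), 0.05)]))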
Example #18
0
    def factors(self, x, z, A):

        v = self.v
        w = self.w
        '''
        z is unused
        x['x'] is the data
        
        The names of dict z[...] may be confusing here: the latent variable z is not included in the dict z[...],
        but implicitly computed from epsilon and parameters in w.

        z is computed with g(.) from eps and variational parameters
        let logpx be the generative model density: log p(x|z) where z=g(.)
        let logpz be the log-prior log p(z) and logqz be log q(z|x) (so -logqz is the entropy term)
        So the lower bound L(x) = logpx + logpz - logqz
        
        let logpv and logpw be the (prior) density of the parameters
        '''

        # Compute q(z|x)
        hidden_q = [x['x']]

        def f_softplus(x):
            return T.log(T.exp(x) + 1)  # - np.log(2)

        def f_rectlin(x):
            return x * (x > 0)

        def f_rectlin2(x):
            return x * (x > 0) + 0.01 * x

        nonlinear = {
            'tanh': T.tanh,
            'sigmoid': T.nnet.sigmoid,
            'softplus': f_softplus,
            'rectlin': f_rectlin,
            'rectlin2': f_rectlin2
        }
        nonlinear_q = nonlinear[self.nonlinear_q]
        nonlinear_p = nonlinear[self.nonlinear_p]

        #rng = rng_curand.CURAND_RandomStreams(0)
        import theano.tensor.shared_randomstreams
        rng = theano.tensor.shared_randomstreams.RandomStreams(0)

        # TOTAL HACK
        #hidden_q.append(nonlinear_q(T.dot(v['scale0'], A) * T.dot(w['out_w'].T, hidden_q[-1]) + T.dot(v['b0'], A)))
        #hidden_q.append(nonlinear_q(T.dot(v['scale1'], A) * T.dot(w['w1'].T, hidden_q[-1]) + T.dot(v['b1'], A)))
        for i in range(len(self.n_hidden_q)):
            hidden_q.append(
                nonlinear_q(
                    T.dot(v['w' + str(i)], hidden_q[-1]) +
                    T.dot(v['b' + str(i)], A)))
            if self.dropout:
                hidden_q[-1] *= 2. * (rng.uniform(size=hidden_q[-1].shape,
                                                  dtype='float32') > .5)

        if not self.train_residual:
            q_mean = T.dot(v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)
        else:
            q_mean = x['mean_prior'] + T.dot(
                v['mean_w'], hidden_q[-1]) + T.dot(v['mean_b'], A)

        if self.type_qz == 'gaussian' or self.type_qz == 'gaussianmarg':
            q_logvar = T.dot(v['logvar_w'], hidden_q[-1]) + T.dot(
                v['logvar_b'], A)
        else:
            raise Exception()

        # function for distribution q(z|x)
        theanofunc = lazytheanofunc('warn', mode='FAST_RUN')
        self.dist_qz['z'] = theanofunc([x['x'], x['mean_prior']] + [A],
                                       [q_mean, q_logvar])
        self.dist_qz['hidden'] = theanofunc([x['x'], x['mean_prior']] + [A],
                                            hidden_q[1:])

        # Compute virtual sample
        eps = rng.normal(size=q_mean.shape, dtype='float32')
        _z = q_mean + T.exp(0.5 * q_logvar) * eps

        # Compute log p(x|z)
        hidden_p = [_z]
        for i in range(len(self.n_hidden_p)):
            hidden_p.append(
                nonlinear_p(
                    T.dot(w['w' + str(i)], hidden_p[-1]) +
                    T.dot(w['b' + str(i)], A)))
            if self.dropout:
                hidden_p[-1] *= 2. * (rng.uniform(size=hidden_p[-1].shape,
                                                  dtype='float32') > .5)

        if self.type_px == 'bernoulli':
            p = T.nnet.sigmoid(
                T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
            _logpx = -T.nnet.binary_crossentropy(p, x['x'])
            self.dist_px['x'] = theanofunc([_z] + [A], p)
        elif self.type_px == 'gaussian':
            x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
            x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(
                w['out_logvar_b'], A)
            _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
            self.dist_px['x'] = theanofunc([_z] + [A], [x_mean, x_logvar])
        elif self.type_px == 'bounded01':
            x_mean = T.nnet.sigmoid(
                T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A))
            x_logvar = T.dot(w['out_logvar_b'], A)
            _logpx = ap.logpdfs.normal2(x['x'], x_mean, x_logvar)
            # Make it a mixture between uniform and Gaussian
            w_unif = T.nnet.sigmoid(T.dot(w['out_unif'], A))
            _logpx = T.log(w_unif + (1 - w_unif) * T.exp(_logpx))
            self.dist_px['x'] = theanofunc([_z] + [A], [x_mean, x_logvar])
        elif self.type_px == 'exponential':
            log_x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
            _logpx = ap.logpdfs.exp(x['x'], log_x_mean)
            self.dist_px['x'] = theanofunc([_z] + [A], [log_x_mean])
        elif self.type_px == 'mixture':
            x_mean = T.dot(w['out_w'], hidden_p[-1]) + T.dot(w['out_b'], A)
            x_logvar = T.dot(w['out_logvar_w'], hidden_p[-1]) + T.dot(
                w['out_logvar_b'], A)

            normal_list = np.asarray([1, 6, 10, 14, 18])
            exponential_list = np.asarray(
                [0, 3, 5, 9, 13, 17, 21, 22, 23, 24, 25, 26, 27])
            uniform_list = np.asarray([2, 4, 7, 11, 15, 19])
            threemodal_list = np.asarray([8, 12, 16, 20])

            #print type(x['x'])

            _logpx_normal = ap.logpdfs.normal2(x['x'][normal_list, :],
                                               x_mean[normal_list, :],
                                               x_logvar[normal_list, :])

            #print type(x_mean)

            _logpx_exponential = ap.logpdfs.exp(x['x'][exponential_list, :],
                                                x_mean[exponential_list, :])
            _logpx = _logpx_normal + _logpx_exponential
            self.dist_px['x'] = theanofunc([_z] + [A], [x_mean, x_logvar])

        else:
            raise Exception("")

        # Note: logpx is a row vector (one element per sample)
        logpx = T.dot(shared32(np.ones((1, self.n_x))),
                      _logpx)  # logpx = log p(x|z,w)

        # log p(z) (prior of z)
        if self.type_pz == 'gaussianmarg':
            if not self.train_residual:
                logpz = -0.5 * (np.log(2 * np.pi * self.sigma_square) + (
                    (q_mean - x['mean_prior'])**2 + T.exp(q_logvar)) /
                                self.sigma_square).sum(axis=0, keepdims=True)
            else:
                logpz = -0.5 * (np.log(2 * np.pi * self.sigma_square) +
                                (q_mean**2 + T.exp(q_logvar)) /
                                self.sigma_square).sum(axis=0, keepdims=True)
        elif self.type_pz == 'gaussian':
            logpz = ap.logpdfs.standard_normal(_z).sum(axis=0, keepdims=True)
        elif self.type_pz == 'mog':
            pz = 0
            for i in range(self.n_mixture):
                pz += T.exp(
                    ap.logpdfs.normal2(_z, T.dot(w['mog_mean' + str(i)], A),
                                       T.dot(w['mog_logvar' + str(i)], A)))
            logpz = T.log(pz).sum(axis=0, keepdims=True) - self.n_z * np.log(
                float(self.n_mixture))
        elif self.type_pz == 'laplace':
            logpz = ap.logpdfs.standard_laplace(_z).sum(axis=0, keepdims=True)
        elif self.type_pz == 'studentt':
            logpz = ap.logpdfs.studentt(_z, T.dot(T.exp(w['logv']),
                                                  A)).sum(axis=0,
                                                          keepdims=True)
        else:
            raise Exception("Unknown type_pz")

        # log q(z|x) (entropy of z)
        if self.type_qz == 'gaussianmarg':
            logqz = -0.5 * (np.log(2 * np.pi) + 1 + q_logvar).sum(
                axis=0, keepdims=True)
        elif self.type_qz == 'gaussian':
            logqz = ap.logpdfs.normal2(_z, q_mean, q_logvar).sum(axis=0,
                                                                 keepdims=True)
        else:
            raise Exception()

        # [new part] Fisher divergence of latent variables
        if self.var_smoothing > 0:
            dlogq_dz = T.grad(
                logqz.sum(),
                _z)  # gives error when using gaussianmarg instead of gaussian
            dlogp_dz = T.grad((logpx + logpz).sum(), _z)
            FD = 0.5 * ((dlogq_dz - dlogp_dz)**2).sum(axis=0, keepdims=True)
            # [end new part]
            logqz -= self.var_smoothing * FD

        # Note: logpv and logpw are scalars
        if True:

            def f_prior(_w, prior_sd=self.prior_sd):
                return ap.logpdfs.normal(_w, 0, prior_sd).sum()
        else:

            def f_prior(_w, prior_sd=self.prior_sd):
                return ap.logpdfs.standard_laplace(_w / prior_sd).sum()

        return logpx, logpz, logqz
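In the 'mixture' branch each observed feature row gets its own likelihood family, selected by fixed index lists. A hedged NumPy sketch of the per-sample log-likelihood; interpreting the network output for exponential features as a log-mean is an assumption about ap.logpdfs.exp, and the sketch reduces each group over its own rows before adding the contributions.

import numpy as np

def mixture_logpx(x, x_mean, x_logvar, normal_idx, exponential_idx):
    # Gaussian log-density for the feature rows in normal_idx
    logpx_normal = -0.5 * (np.log(2 * np.pi) + x_logvar[normal_idx, :]
                           + (x[normal_idx, :] - x_mean[normal_idx, :]) ** 2
                           / np.exp(x_logvar[normal_idx, :]))
    # Exponential log-density for the rows in exponential_idx, assuming the network
    # output is the log of the mean (so the rate is exp(-log_mean))
    log_mean = x_mean[exponential_idx, :]
    logpx_exp = -log_mean - x[exponential_idx, :] * np.exp(-log_mean)
    # reduce each group over its own rows, then add the per-sample contributions
    return logpx_normal.sum(axis=0) + logpx_exp.sum(axis=0)

# toy usage: 4 features (rows 0-1 Gaussian, rows 2-3 exponential), 5 samples
x = np.abs(np.random.randn(4, 5))
print(mixture_logpx(x, np.zeros((4, 5)), np.zeros((4, 5)), [0, 1], [2, 3]))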
Example #19
0
    def __init__(self, get_optimizer, theano_warning='raise'):
        
        v = self.v
        w = self.w
        theanofunction = lazytheanofunc('warn', mode='FAST_RUN')
        theanofunction_silent = lazytheanofunc('ignore', mode='FAST_RUN')
        
        # Create theano expressions
        x, z = ndict.ordereddicts(self.variables())
        self.var_x, self.var_z, = x, z
        
        # Helper variables
        A = T.fmatrix('A')
        self.var_A = A
        
        '''
        # Get gradient symbols
        print 'model, x'
        for (d, xx) in x.items():
          print d
          print xx.shape
          
        print x.values()
        '''
        
        allvars = x.values() + z.values() + [A] # note: '+' concatenates lists
        
        # TODO: more beautiful/standardized way of setting distributions
        # (should be even simpler than this) 
        self.dist_qz = {}
        self.dist_px = {}
        self.dist_pz = {}
        
        factors = self.factors(x, z, A)
        if len(factors) == 3:
            (logpx, logpz, logqz) = factors
            cost = 0
            sparsity_penalty = 0
        else:
            (logpx, logpz, logqz, cost, sparsity_penalty) = factors

        if get_optimizer == None:
            def get_optimizer(w, g):
                from collections import OrderedDict
                updates = OrderedDict()
                for i in w: updates[w[i]] = w[i]
                return updates

        # Log-likelihood lower bound
        self.f_L = theanofunction(allvars, [logpx, logpz, logqz, cost, sparsity_penalty])
        L = (logpx + logpz - logqz).sum() - cost - sparsity_penalty

        g = T.grad(L, v.values() + w.values())
        gv, gw = dict(zip(v.keys(), g[0:len(v)])), dict(zip(w.keys(), g[len(v):len(v)+len(w)]))
        updates = get_optimizer(v, gv)
        updates.update(get_optimizer(w, gw))
        
        #self.profmode = theano.ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker())
        #self.f_evalAndUpdate = theano.function(allvars, [logpx + logpz - logqz], updates=updates_w, mode=self.profmode)
        #theano.printing.debugprint(self.f_evalAndUpdate)
        
        self.f_eval_test = theanofunction(allvars, [logpx + logpz - logqz, logpx, logpz, -logqz])
        self.f_eval = theanofunction(allvars, [logpx + logpz - logqz])
        self.f_evalAndUpdate = theanofunction(allvars, [logpx + logpz - logqz], updates=updates)
        self.f_eval_for_classcondition_prior = theanofunction(allvars, [logpx - logqz])
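The constructors above fall back to identity updates when get_optimizer is None. A minimal sketch of a compatible optimizer, assuming plain gradient ascent on the lower bound with a fixed (illustrative) learning rate; params would be the dict of shared variables (v or w) and grads the matching dict of gradient expressions (gv or gw), and the function would be passed as the get_optimizer argument.

from collections import OrderedDict

def sgd_optimizer(params, grads, learning_rate=1e-3):
    updates = OrderedDict()
    for name in params:
        # L is a lower bound to be maximized, hence the '+' sign (gradient ascent)
        updates[params[name]] = params[name] + learning_rate * grads[name]
    return updates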