def build_model(self, train_set, test_set, validation_set=None):
        """
        Compile the Theano training, test and validation functions of the model.
        :param train_set: Train set, forwarded to the parent build_model.
        :param test_set: Test set, forwarded to the parent build_model.
        :param validation_set: Optional validation set; when None no validation function is compiled.
        :return: train, test, validation function and dicts of arguments.
        """
        super(NADE, self).build_model(train_set, test_set, validation_set)

        # Objective: binary cross entropy between the model output and its input,
        # summed over axis 1 and averaged over the remaining (batch) axis.
        reconstruction = get_output(self.model, self.sym_x)
        log_likelihood = -binary_crossentropy(reconstruction, self.sym_x)
        loss = -(log_likelihood.sum(axis=1).mean())
        sgd_updates = sgd(loss, get_all_params(self.model), self.sym_lr)

        # Training function: slice a mini-batch and binarize it by drawing every entry
        # from a binomial with n=1 whose probability is the data value itself.
        minibatch = self.sh_train_x[self.batch_slice]
        sampled_minibatch = self._srng.binomial(size=minibatch.shape, n=1, p=minibatch,
                                                dtype=theano.config.floatX)
        f_train = theano.function([self.sym_index, self.sym_batchsize, self.sym_lr], [loss],
                                  updates=sgd_updates,
                                  givens={self.sym_x: sampled_minibatch})

        # Evaluation only takes a subset, in order not to receive memory errors.
        subset = 1000

        def compile_eval(shared_x):
            # Loss on the first `subset` rows of the given shared data set.
            return theano.function([], [loss], givens={self.sym_x: shared_x[:subset]})

        f_test = compile_eval(self.sh_test_x)
        f_validate = compile_eval(self.sh_valid_x) if validation_set is not None else None

        # Default training inputs and the output format shared by all three functions.
        for key, value in (('batchsize', 100), ('learningrate', 1e-2)):
            self.train_args['inputs'][key] = value
        for args in (self.train_args, self.test_args, self.validate_args):
            args['outputs']['like.'] = '%0.6f'
        return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
Example #2
0
 def _classification_error(self, x, t):
     """
     Build the symbolic classification error, in percent, of the classifier output.
     :param x: Symbolic input fed to the network ending in self.l_y.
     :param t: Symbolic target matrix; the class of a row is its argmax over axis 1.
     :return: Number of rows whose predicted class differs from the target class,
              divided by the number of rows of t and multiplied by 100.
     """
     float_type = theano.config.floatX
     # Deterministic forward pass; mean over samples (axes 1 and 2).
     predictions = get_output(self.l_y, x, deterministic=True).mean(axis=(1, 2))
     n_wrong = T.sum(T.neq(T.argmax(predictions, axis=1), T.argmax(t, axis=1)))
     n_total = t.shape[0]
     return (n_wrong.astype(float_type) / n_total.astype(float_type)) * 100.
    def __init__(self, n_x, n_a, n_z, n_y, qa_hid, qz_hid, qy_hid, px_hid, pa_hid, nonlinearity=rectify,
                 px_nonlinearity=None, x_dist='bernoulli', batchnorm=False, seed=1234):
        """
        Initialize a skip deep generative model consisting of
        discriminative classifier q(y|a,x),
        generative model P p(a|z,y) and p(x|a,z,y),
        inference model Q q(a|x) and q(z|a,x,y).
        Deterministic weights are initialized using the Glorot and Bengio (2010) initialization scheme;
        the mu/logvar layers of the stochastic layers use a normal initializer.
        :param n_x: Number of inputs.
        :param n_a: Number of auxiliary.
        :param n_z: Number of latent.
        :param n_y: Number of classes.
        :param qa_hid: List of number of deterministic hidden q(a|x).
        :param qz_hid: List of number of deterministic hidden q(z|a,x,y).
        :param qy_hid: List of number of deterministic hidden q(y|a,x).
        :param px_hid: List of number of deterministic hidden p(x|a,z,y).
        :param pa_hid: List of number of deterministic hidden p(a|z,y).
        :param nonlinearity: The transfer function used in the deterministic layers.
        :param px_nonlinearity: Nonlinearity of the mu and logvar layers of p(x|a,z,y); only used when
                                x_dist is 'gaussian'.
        :param x_dist: The x distribution, 'bernoulli', 'multinomial', or 'gaussian'.
        :param batchnorm: Boolean value for batch normalization.
        :param seed: The random seed.
        """
        super(SDGMSSL, self).__init__(n_x, qz_hid + px_hid, n_a + n_z, nonlinearity)
        self.x_dist = x_dist
        self.n_y = n_y
        self.n_x = n_x
        self.n_a = n_a
        self.n_z = n_z
        self.batchnorm = batchnorm
        self._srng = RandomStreams(seed)

        # Decide Glorot initialization of weights.
        # init_w is the argument of every init.Normal used below; hid_w is the argument of every
        # init.GlorotNormal used for the hidden projections.
        # NOTE(review): hid_w stays an empty string unless the nonlinearity is rectify or softplus --
        # confirm that the initializer accepts "" as its argument.
        init_w = 1e-3
        hid_w = ""
        if nonlinearity == rectify or nonlinearity == softplus:
            hid_w = "relu"

        # Define symbolic variables for theano functions.
        # NOTE(review): sym_x_l and sym_x_u are distinct variables that share the name 'x'.
        self.sym_beta = T.scalar('beta')  # scaling constant beta
        self.sym_x_l = T.matrix('x')  # labeled inputs
        self.sym_t_l = T.matrix('t')  # labeled targets
        self.sym_x_u = T.matrix('x')  # unlabeled inputs
        self.sym_bs_l = T.iscalar('bs_l')  # number of labeled data
        self.sym_samples = T.iscalar('samples')  # MC samples
        self.sym_z = T.matrix('z')  # latent variable z
        self.sym_a = T.matrix('a')  # auxiliary variable a

        # Assist methods for collecting the layers
        def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal):
            # Linear dense layer, optional batch normalization, then the transfer function self.transf
            # (presumably set by the parent constructor from `nonlinearity` -- TODO confirm).
            dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w), None)
            if batchnorm:
                dense = BatchNormLayer(dense)
            return NonlinearityLayer(dense, self.transf)

        def stochastic_layer(layer_in, n, samples, nonlin=None):
            # Two parallel dense layers (mu and logvar) feeding a SampleLayer.
            # Returns the sample layer together with its mu and logvar layers.
            mu = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
            logvar = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
            return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar

        # Input layers
        l_x_in = InputLayer((None, n_x))
        l_y_in = InputLayer((None, n_y))

        # Auxiliary q(a|x)
        l_qa_x = l_x_in
        for hid in qa_hid:
            l_qa_x = dense_layer(l_qa_x, hid)
        l_qa_x, l_qa_x_mu, l_qa_x_logvar = stochastic_layer(l_qa_x, n_a, self.sym_samples)

        # Classifier q(y|a,x)
        # The projection of a is reshaped to 4D with the sample count on axis 1, the projection of x gets
        # two broadcastable axes; both are summed and flattened back to 2D before the nonlinearity.
        l_qa_to_qy = DenseLayer(l_qa_x, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        l_qa_to_qy = ReshapeLayer(l_qa_to_qy, (-1, self.sym_samples, 1, qy_hid[0]))
        l_x_to_qy = DenseLayer(l_x_in, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        self.l_x_to_qy = l_x_to_qy  # keep a handle to the x -> q(y) projection (before the dimshuffle)
        l_x_to_qy = DimshuffleLayer(l_x_to_qy, (0, 'x', 'x', 1))
        l_qy_xa = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qy, l_x_to_qy]), (-1, qy_hid[0]))
        if batchnorm:
            l_qy_xa = BatchNormLayer(l_qy_xa)
        l_qy_xa = NonlinearityLayer(l_qy_xa, self.transf)
        if len(qy_hid) > 1:
            for hid in qy_hid[1:]:
                l_qy_xa = dense_layer(l_qy_xa, hid)
        l_qy_xa = DenseLayer(l_qy_xa, n_y, init.GlorotNormal(), init.Normal(init_w), softmax)

        # Recognition q(z|x,a,y)
        # Same merge pattern as the classifier, with an additional projection of y.
        l_qa_to_qz = DenseLayer(l_qa_x, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        l_qa_to_qz = ReshapeLayer(l_qa_to_qz, (-1, self.sym_samples, 1, qz_hid[0]))
        l_x_to_qz = DenseLayer(l_x_in, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        l_x_to_qz = DimshuffleLayer(l_x_to_qz, (0, 'x', 'x', 1))
        l_y_to_qz = DenseLayer(l_y_in, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        l_y_to_qz = DimshuffleLayer(l_y_to_qz, (0, 'x', 'x', 1))
        l_qz_axy = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qz, l_x_to_qz, l_y_to_qz]), (-1, qz_hid[0]))
        if batchnorm:
            l_qz_axy = BatchNormLayer(l_qz_axy)
        l_qz_axy = NonlinearityLayer(l_qz_axy, self.transf)
        if len(qz_hid) > 1:
            for hid in qz_hid[1:]:
                l_qz_axy = dense_layer(l_qz_axy, hid)
        l_qz_axy, l_qz_axy_mu, l_qz_axy_logvar = stochastic_layer(l_qz_axy, n_z, 1)

        # Generative p(a|z,y)
        l_y_to_pa = DenseLayer(l_y_in, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        l_y_to_pa = DimshuffleLayer(l_y_to_pa, (0, 'x', 'x', 1))
        l_qz_to_pa = DenseLayer(l_qz_axy, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        l_qz_to_pa = ReshapeLayer(l_qz_to_pa, (-1, self.sym_samples, 1, pa_hid[0]))
        l_pa_zy = ReshapeLayer(ElemwiseSumLayer([l_qz_to_pa, l_y_to_pa]), [-1, pa_hid[0]])
        if batchnorm:
            l_pa_zy = BatchNormLayer(l_pa_zy)
        l_pa_zy = NonlinearityLayer(l_pa_zy, self.transf)
        if len(pa_hid) > 1:
            for hid in pa_hid[1:]:
                l_pa_zy = dense_layer(l_pa_zy, hid)
        l_pa_zy, l_pa_zy_mu, l_pa_zy_logvar = stochastic_layer(l_pa_zy, n_a, 1)

        # Generative p(x|a,z,y)
        l_qa_to_px = DenseLayer(l_qa_x, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        l_qa_to_px = ReshapeLayer(l_qa_to_px, (-1, self.sym_samples, 1, px_hid[0]))
        l_y_to_px = DenseLayer(l_y_in, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        l_y_to_px = DimshuffleLayer(l_y_to_px, (0, 'x', 'x', 1))
        l_qz_to_px = DenseLayer(l_qz_axy, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        l_qz_to_px = ReshapeLayer(l_qz_to_px, (-1, self.sym_samples, 1, px_hid[0]))
        l_px_azy = ReshapeLayer(ElemwiseSumLayer([l_qa_to_px, l_qz_to_px, l_y_to_px]), [-1, px_hid[0]])
        if batchnorm:
            l_px_azy = BatchNormLayer(l_px_azy)
        l_px_azy = NonlinearityLayer(l_px_azy, self.transf)
        if len(px_hid) > 1:
            for hid in px_hid[1:]:
                l_px_azy = dense_layer(l_px_azy, hid)

        # Output layer of p(x|a,z,y), chosen by the x distribution.
        # NOTE(review): there is no else branch -- for any other x_dist value no output layer is added
        # and no error is raised here.
        if x_dist == 'bernoulli':
            l_px_azy = DenseLayer(l_px_azy, n_x, init.GlorotNormal(), init.Normal(init_w), sigmoid)
        elif x_dist == 'multinomial':
            l_px_azy = DenseLayer(l_px_azy, n_x, init.GlorotNormal(), init.Normal(init_w), softmax)
        elif x_dist == 'gaussian':
            l_px_azy, l_px_zy_mu, l_px_zy_logvar = stochastic_layer(l_px_azy, n_x, 1, px_nonlinearity)

        # Reshape all the model layers to have the same size
        self.l_x_in = l_x_in
        self.l_y_in = l_y_in
        self.l_a_in = l_qa_x  # the sample layer of q(a|x), stored so that a can be supplied as an input

        self.l_qa = ReshapeLayer(l_qa_x, (-1, self.sym_samples, 1, n_a))
        self.l_qa_mu = DimshuffleLayer(l_qa_x_mu, (0, 'x', 'x', 1))
        self.l_qa_logvar = DimshuffleLayer(l_qa_x_logvar, (0, 'x', 'x', 1))

        self.l_qz = ReshapeLayer(l_qz_axy, (-1, self.sym_samples, 1, n_z))
        self.l_qz_mu = ReshapeLayer(l_qz_axy_mu, (-1, self.sym_samples, 1, n_z))
        self.l_qz_logvar = ReshapeLayer(l_qz_axy_logvar, (-1, self.sym_samples, 1, n_z))

        self.l_qy = ReshapeLayer(l_qy_xa, (-1, self.sym_samples, 1, n_y))

        self.l_pa = ReshapeLayer(l_pa_zy, (-1, self.sym_samples, 1, n_a))
        self.l_pa_mu = ReshapeLayer(l_pa_zy_mu, (-1, self.sym_samples, 1, n_a))
        self.l_pa_logvar = ReshapeLayer(l_pa_zy_logvar, (-1, self.sym_samples, 1, n_a))

        # l_px_zy_mu and l_px_zy_logvar only exist in the gaussian case; the conditional expressions
        # below do not evaluate them otherwise.
        self.l_px = ReshapeLayer(l_px_azy, (-1, self.sym_samples, 1, n_x))
        self.l_px_mu = ReshapeLayer(l_px_zy_mu, (-1, self.sym_samples, 1, n_x)) if x_dist == "gaussian" else None
        self.l_px_logvar = ReshapeLayer(l_px_zy_logvar,
                                        (-1, self.sym_samples, 1, n_x)) if x_dist == "gaussian" else None

        # Predefined functions
        # f_qy: (x, samples) -> deterministic output of q(y|a,x), averaged over axes 1 and 2.
        inputs = [self.sym_x_l, self.sym_samples]
        outputs = get_output(self.l_qy, self.sym_x_l, deterministic=True).mean(axis=(1, 2))
        self.f_qy = theano.function(inputs, outputs)

        # f_qa: (x, samples) -> deterministic output of q(a|x), averaged over axes 1 and 2.
        inputs = [self.sym_x_l, self.sym_samples]
        outputs = get_output(self.l_qa, self.sym_x_l, deterministic=True).mean(axis=(1, 2))
        self.f_qa = theano.function(inputs, outputs)

        # f_pa: (z, t, samples) -> deterministic output of p(a|z,y), with z substituted for the q(z) layer.
        inputs = {l_qz_axy: self.sym_z, l_y_in: self.sym_t_l}
        outputs = get_output(self.l_pa, inputs, deterministic=True)
        self.f_pa = theano.function([self.sym_z, self.sym_t_l, self.sym_samples], outputs)

        # f_px: (a, z, t, samples) -> deterministic output of p(x|a,z,y), with a and z substituted for
        # the q(a) and q(z) layers.
        inputs = {l_qa_x: self.sym_a, l_qz_axy: self.sym_z, l_y_in: self.sym_t_l}
        outputs = get_output(self.l_px, inputs, deterministic=True)
        self.f_px = theano.function([self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs)

        # Define model parameters
        self.model_params = get_all_params([self.l_qy, self.l_pa, self.l_px])
        self.trainable_model_params = get_all_params([self.l_qy, self.l_pa, self.l_px], trainable=True)
    def build_model(self, train_set_unlabeled, train_set_labeled, test_set, validation_set=None):
        """
        Build the auxiliary deep generative model from the initialized hyperparameters.
        Define the lower bound term and compile it into a training function.
        :param train_set_unlabeled: Unlabeled train set containing variables x, t.
        :param train_set_labeled: Labeled train set containing variables x, t (x at index 0, t at index 1).
        :param test_set: Test set containing variables x, t.
        :param validation_set: Validation set containing variables x, t. When None, no validation
                               function is compiled.
        :return: train, test, validation function (None without a validation set) and dicts of arguments.
        """
        # The unlabeled set is handed to the parent as the train set (presumably this is what
        # populates self.sh_train_x, self.sh_test_x, ... used below -- TODO confirm).
        super(SDGMSSL, self).build_model(train_set_unlabeled, test_set, validation_set)

        sh_train_x_l = theano.shared(np.asarray(train_set_labeled[0], dtype=theano.config.floatX), borrow=True)
        sh_train_t_l = theano.shared(np.asarray(train_set_labeled[1], dtype=theano.config.floatX), borrow=True)
        n = self.sh_train_x.shape[0].astype(theano.config.floatX)  # no. of data points
        n_l = sh_train_x_l.shape[0].astype(theano.config.floatX)  # no. of labeled data points

        # Define the layers for the density estimation used in the lower bound.
        l_log_qa = GaussianLogDensityLayer(self.l_qa, self.l_qa_mu, self.l_qa_logvar)
        l_log_qz = GaussianLogDensityLayer(self.l_qz, self.l_qz_mu, self.l_qz_logvar)
        l_log_qy = MultinomialLogDensityLayer(self.l_qy, self.l_y_in, eps=1e-8)

        l_log_pz = StandardNormalLogDensityLayer(self.l_qz)
        # The sample from q(a|x) is evaluated under the mu/logvar of p(a|z,y).
        l_log_pa = GaussianLogDensityLayer(self.l_qa, self.l_pa_mu, self.l_pa_logvar)
        # NOTE(review): l_log_px is only bound for the three x_dist values below.
        if self.x_dist == 'bernoulli':
            l_log_px = BernoulliLogDensityLayer(self.l_px, self.l_x_in)
        elif self.x_dist == 'multinomial':
            l_log_px = MultinomialLogDensityLayer(self.l_px, self.l_x_in)
        elif self.x_dist == 'gaussian':
            l_log_px = GaussianLogDensityLayer(self.l_x_in, self.l_px_mu, self.l_px_logvar)

        def lower_bound(log_pa, log_qa, log_pz, log_qz, log_py, log_px):
            # Sum of the generative log densities minus the inference log densities.
            lb = log_px + log_py + log_pz + log_pa - log_qa - log_qz
            return lb

        # Lower bound for labeled data
        out_layers = [l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px, l_log_qy]
        inputs = {self.l_x_in: self.sym_x_l, self.l_y_in: self.sym_t_l}
        out = get_output(out_layers, inputs, batch_norm_update_averages=False, batch_norm_use_averages=False)
        log_pa_l, log_pz_l, log_qa_x_l, log_qz_axy_l, log_px_zy_l, log_qy_ax_l = out
        # Prior p(y) expecting that all classes are evenly distributed
        py_l = softmax(T.zeros((self.sym_x_l.shape[0], self.n_y)))
        log_py_l = -categorical_crossentropy(py_l, self.sym_t_l).reshape((-1, 1)).dimshuffle((0, 'x', 'x', 1))
        lb_l = lower_bound(log_pa_l, log_qa_x_l, log_pz_l, log_qz_axy_l, log_py_l, log_px_zy_l)
        lb_l = lb_l.mean(axis=(1, 2))  # Mean over the sampling dimensions
        log_qy_ax_l *= (self.sym_beta * (n / n_l))  # Scale the supervised term with alpha = beta * (n / n_l)
        lb_l -= log_qy_ax_l.mean(axis=(1, 2))  # Collect the lower bound term and mean over sampling dimensions

        # Lower bound for unlabeled data
        bs_u = self.sym_x_u.shape[0]

        # For the integrating out approach, we repeat the input matrix x, and construct a target (bs * n_y) x n_y
        # Example of input and target matrix for a 3 class problem and batch_size=2. 2D tensors of the form
        #               x_repeat                     t_repeat
        #  [[x[0,0], x[0,1], ..., x[0,n_x]]         [[1, 0, 0]
        #   [x[1,0], x[1,1], ..., x[1,n_x]]          [1, 0, 0]
        #   [x[0,0], x[0,1], ..., x[0,n_x]]          [0, 1, 0]
        #   [x[1,0], x[1,1], ..., x[1,n_x]]          [0, 1, 0]
        #   [x[0,0], x[0,1], ..., x[0,n_x]]          [0, 0, 1]
        #   [x[1,0], x[1,1], ..., x[1,n_x]]]         [0, 0, 1]]
        t_eye = T.eye(self.n_y, k=0)
        t_u = t_eye.reshape((self.n_y, 1, self.n_y)).repeat(bs_u, axis=1).reshape((-1, self.n_y))
        x_u = self.sym_x_u.reshape((1, bs_u, self.n_x)).repeat(self.n_y, axis=0).reshape((-1, self.n_x))

        # Since the expectation of var a is outside the integration we calculate E_q(a|x) first
        # The sampled a is then repeated once per class and fed in place of the q(a|x) sample layer.
        a_x_u = get_output(self.l_qa, self.sym_x_u, batch_norm_update_averages=True, batch_norm_use_averages=False)
        a_x_u_rep = a_x_u.reshape((1, bs_u * self.sym_samples, self.n_a)).repeat(self.n_y, axis=0).reshape(
            (-1, self.n_a))
        out_layers = [l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px]
        inputs = {self.l_x_in: x_u, self.l_y_in: t_u, self.l_a_in: a_x_u_rep}
        out = get_output(out_layers, inputs, batch_norm_update_averages=False, batch_norm_use_averages=False)
        log_pa_u, log_pz_u, log_qa_x_u, log_qz_axy_u, log_px_zy_u = out
        # Prior p(y) expecting that all classes are evenly distributed
        py_u = softmax(T.zeros((bs_u * self.n_y, self.n_y)))
        log_py_u = -categorical_crossentropy(py_u, t_u).reshape((-1, 1)).dimshuffle((0, 'x', 'x', 1))
        lb_u = lower_bound(log_pa_u, log_qa_x_u, log_pz_u, log_qz_axy_u, log_py_u, log_px_zy_u)
        # Rearrange to one row per unlabeled example and one column per class.
        lb_u = lb_u.reshape((self.n_y, 1, 1, bs_u)).transpose(3, 1, 2, 0).mean(axis=(1, 2))
        inputs = {self.l_x_in: self.sym_x_u, self.l_a_in: a_x_u.reshape((-1, self.n_a))}
        y_u = get_output(self.l_qy, inputs, batch_norm_update_averages=True, batch_norm_use_averages=False).mean(
            axis=(1, 2))
        y_u += 1e-8  # Ensure that we get no NANs when calculating the entropy
        y_u /= T.sum(y_u, axis=1, keepdims=True)
        # Weight the per-class lower bound by q(y|a,x) and add the entropy of q(y|a,x).
        lb_u = (y_u * (lb_u - T.log(y_u))).sum(axis=1)

        if self.batchnorm:
            # TODO: implement the BN layer correctly.
            # NOTE(review): the result of this get_output call is not used anywhere below.
            inputs = {self.l_x_in: self.sym_x_u, self.l_y_in: y_u, self.l_a_in: a_x_u}
            get_output(out_layers, inputs, weighting=None, batch_norm_update_averages=True,
                       batch_norm_use_averages=False)

        # Regularizing with weight priors p(theta|N(0,1)), collecting and clipping gradients
        # Only parameters whose string representation contains 'W' contribute to the prior.
        weight_priors = 0.0
        for p in self.trainable_model_params:
            if 'W' not in str(p):
                continue
            weight_priors += log_normal(p, 0, 1).sum()

        # Collect the lower bound and scale it with the weight priors.
        # Dividing by -n negates the bound, so elbo is the quantity that is minimized.
        elbo = ((lb_l.mean() + lb_u.mean()) * n + weight_priors) / -n
        lb_labeled = -lb_l.mean()
        lb_unlabeled = -lb_u.mean()

        # Gradients are first rescaled to a total norm of at most max_norm and then clipped
        # elementwise to [-clip_grad, clip_grad] before being handed to adam.
        grads_collect = T.grad(elbo, self.trainable_model_params)
        params_collect = self.trainable_model_params
        sym_beta1 = T.scalar('beta1')
        sym_beta2 = T.scalar('beta2')
        clip_grad, max_norm = 1, 5
        mgrads = total_norm_constraint(grads_collect, max_norm=max_norm)
        mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
        updates = adam(mgrads, params_collect, self.sym_lr, sym_beta1, sym_beta2)

        # Training function
        # Labeled examples are drawn at random without replacement; unlabeled examples are taken
        # from self.batch_slice.
        indices = self._srng.choice(size=[self.sym_bs_l], a=sh_train_x_l.shape[0], replace=False)
        x_batch_l = sh_train_x_l[indices]
        t_batch_l = sh_train_t_l[indices]
        x_batch_u = self.sh_train_x[self.batch_slice]
        if self.x_dist == 'bernoulli':  # Sample bernoulli input.
            x_batch_u = self._srng.binomial(size=x_batch_u.shape, n=1, p=x_batch_u, dtype=theano.config.floatX)
            x_batch_l = self._srng.binomial(size=x_batch_l.shape, n=1, p=x_batch_l, dtype=theano.config.floatX)

        givens = {self.sym_x_l: x_batch_l,
                  self.sym_x_u: x_batch_u,
                  self.sym_t_l: t_batch_l}
        inputs = [self.sym_index, self.sym_batchsize, self.sym_bs_l, self.sym_beta,
                  self.sym_lr, sym_beta1, sym_beta2, self.sym_samples]
        outputs = [elbo, lb_labeled, lb_unlabeled]
        f_train = theano.function(inputs=inputs, outputs=outputs, givens=givens, updates=updates)

        # Default training args. Note that these can be changed during or prior to training.
        self.train_args['inputs']['batchsize_unlabeled'] = 100
        self.train_args['inputs']['batchsize_labeled'] = 100
        self.train_args['inputs']['beta'] = 0.1
        self.train_args['inputs']['learningrate'] = 3e-4
        self.train_args['inputs']['beta1'] = 0.9
        self.train_args['inputs']['beta2'] = 0.999
        self.train_args['inputs']['samples'] = 1
        self.train_args['outputs']['lb'] = '%0.4f'
        self.train_args['outputs']['lb-labeled'] = '%0.4f'
        self.train_args['outputs']['lb-unlabeled'] = '%0.4f'

        # Validation and test function
        # Both report the classification error in percent on the complete test/validation set.
        y = get_output(self.l_qy, self.sym_x_l, deterministic=True).mean(axis=(1, 2))
        class_err = (1. - categorical_accuracy(y, self.sym_t_l).mean()) * 100
        givens = {self.sym_x_l: self.sh_test_x,
                  self.sym_t_l: self.sh_test_t}
        f_test = theano.function(inputs=[self.sym_samples], outputs=[class_err], givens=givens)

        # Test args.  Note that these can be changed during or prior to training.
        self.test_args['inputs']['samples'] = 1
        self.test_args['outputs']['test'] = '%0.2f%%'

        f_validate = None
        if validation_set is not None:
            givens = {self.sym_x_l: self.sh_valid_x,
                      self.sym_t_l: self.sh_valid_t}
            f_validate = theano.function(inputs=[self.sym_samples], outputs=[class_err], givens=givens)
        # Default validation args. Note that these can be changed during or prior to training.
        self.validate_args['inputs']['samples'] = 1
        self.validate_args['outputs']['validation'] = '%0.2f%%'

        return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
Example #5
0
    def __init__(self, n_x, n_a, n_z, n_y, qa_hid, qz_hid, qy_hid, px_hid, pa_hid, nonlinearity=rectify,
                 px_nonlinearity=None, x_dist='bernoulli', batchnorm=False, seed=1234):
        """
        Initialize a skip deep generative model consisting of
        discriminative classifier q(y|a,x),
        generative model P p(a|z,y) and p(x|a,z,y),
        inference model Q q(a|x) and q(z|a,x,y).
        Deterministic weights are initialized using the Glorot and Bengio (2010) initialization scheme;
        the mu/logvar layers of the stochastic layers use a normal initializer.
        :param n_x: Number of inputs.
        :param n_a: Number of auxiliary.
        :param n_z: Number of latent.
        :param n_y: Number of classes.
        :param qa_hid: List of number of deterministic hidden q(a|x).
        :param qz_hid: List of number of deterministic hidden q(z|a,x,y).
        :param qy_hid: List of number of deterministic hidden q(y|a,x).
        :param px_hid: List of number of deterministic hidden p(x|a,z,y).
        :param pa_hid: List of number of deterministic hidden p(a|z,y).
        :param nonlinearity: The transfer function used in the deterministic layers.
        :param px_nonlinearity: Nonlinearity of the mu and logvar layers of p(x|a,z,y); only used when
                                x_dist is 'gaussian'.
        :param x_dist: The x distribution, 'bernoulli', 'multinomial', or 'gaussian'.
        :param batchnorm: Boolean value for batch normalization.
        :param seed: The random seed.
        """
        super(SDGMSSL, self).__init__(n_x, qz_hid + px_hid, n_a + n_z, nonlinearity)
        self.x_dist = x_dist
        self.n_y = n_y
        self.n_x = n_x
        self.n_a = n_a
        self.n_z = n_z
        self.batchnorm = batchnorm
        self._srng = RandomStreams(seed)

        # Decide Glorot initialization of weights.
        # init_w is the argument of every init.Normal used below; hid_w is the argument of every
        # init.GlorotNormal used for the hidden projections.
        # NOTE(review): hid_w stays an empty string unless the nonlinearity is rectify or softplus --
        # confirm that the initializer accepts "" as its argument.
        init_w = 1e-3
        hid_w = ""
        if nonlinearity == rectify or nonlinearity == softplus:
            hid_w = "relu"

        # Define symbolic variables for theano functions.
        # NOTE(review): sym_x_l and sym_x_u are distinct variables that share the name 'x'.
        self.sym_beta = T.scalar('beta')  # scaling constant beta
        self.sym_x_l = T.matrix('x')  # labeled inputs
        self.sym_t_l = T.matrix('t')  # labeled targets
        self.sym_x_u = T.matrix('x')  # unlabeled inputs
        self.sym_bs_l = T.iscalar('bs_l')  # number of labeled data
        self.sym_samples = T.iscalar('samples')  # MC samples
        self.sym_z = T.matrix('z')  # latent variable z
        self.sym_a = T.matrix('a')  # auxiliary variable a

        # Assist methods for collecting the layers
        def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal):
            # Linear dense layer, optional batch normalization, then the transfer function self.transf
            # (presumably set by the parent constructor from `nonlinearity` -- TODO confirm).
            dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w), None)
            if batchnorm:
                dense = BatchNormLayer(dense)
            return NonlinearityLayer(dense, self.transf)

        def stochastic_layer(layer_in, n, samples, nonlin=None):
            # Two parallel dense layers (mu and logvar) feeding a SampleLayer.
            # Returns the sample layer together with its mu and logvar layers.
            mu = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
            logvar = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
            return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar

        # Input layers
        l_x_in = InputLayer((None, n_x))
        l_y_in = InputLayer((None, n_y))

        # Auxiliary q(a|x)
        l_qa_x = l_x_in
        for hid in qa_hid:
            l_qa_x = dense_layer(l_qa_x, hid)
        l_qa_x, l_qa_x_mu, l_qa_x_logvar = stochastic_layer(l_qa_x, n_a, self.sym_samples)

        # Classifier q(y|a,x)
        # The projection of a is reshaped to 4D with the sample count on axis 1, the projection of x gets
        # two broadcastable axes; both are summed and flattened back to 2D before the nonlinearity.
        l_qa_to_qy = DenseLayer(l_qa_x, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        l_qa_to_qy = ReshapeLayer(l_qa_to_qy, (-1, self.sym_samples, 1, qy_hid[0]))
        l_x_to_qy = DenseLayer(l_x_in, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        l_x_to_qy = DimshuffleLayer(l_x_to_qy, (0, 'x', 'x', 1))
        l_qy_xa = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qy, l_x_to_qy]), (-1, qy_hid[0]))
        if batchnorm:
            l_qy_xa = BatchNormLayer(l_qy_xa)
        l_qy_xa = NonlinearityLayer(l_qy_xa, self.transf)
        if len(qy_hid) > 1:
            for hid in qy_hid[1:]:
                l_qy_xa = dense_layer(l_qy_xa, hid)
        l_qy_xa = DenseLayer(l_qy_xa, n_y, init.GlorotNormal(), init.Normal(init_w), softmax)

        # Recognition q(z|x,a,y)
        # Same merge pattern as the classifier, with an additional projection of y.
        l_qa_to_qz = DenseLayer(l_qa_x, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        l_qa_to_qz = ReshapeLayer(l_qa_to_qz, (-1, self.sym_samples, 1, qz_hid[0]))
        l_x_to_qz = DenseLayer(l_x_in, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        l_x_to_qz = DimshuffleLayer(l_x_to_qz, (0, 'x', 'x', 1))
        l_y_to_qz = DenseLayer(l_y_in, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        l_y_to_qz = DimshuffleLayer(l_y_to_qz, (0, 'x', 'x', 1))
        l_qz_axy = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qz, l_x_to_qz, l_y_to_qz]), (-1, qz_hid[0]))
        if batchnorm:
            l_qz_axy = BatchNormLayer(l_qz_axy)
        l_qz_axy = NonlinearityLayer(l_qz_axy, self.transf)
        if len(qz_hid) > 1:
            for hid in qz_hid[1:]:
                l_qz_axy = dense_layer(l_qz_axy, hid)
        l_qz_axy, l_qz_axy_mu, l_qz_axy_logvar = stochastic_layer(l_qz_axy, n_z, 1)

        # Generative p(a|z,y)
        l_y_to_pa = DenseLayer(l_y_in, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        l_y_to_pa = DimshuffleLayer(l_y_to_pa, (0, 'x', 'x', 1))
        l_qz_to_pa = DenseLayer(l_qz_axy, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        l_qz_to_pa = ReshapeLayer(l_qz_to_pa, (-1, self.sym_samples, 1, pa_hid[0]))
        l_pa_zy = ReshapeLayer(ElemwiseSumLayer([l_qz_to_pa, l_y_to_pa]), [-1, pa_hid[0]])
        if batchnorm:
            l_pa_zy = BatchNormLayer(l_pa_zy)
        l_pa_zy = NonlinearityLayer(l_pa_zy, self.transf)
        if len(pa_hid) > 1:
            for hid in pa_hid[1:]:
                l_pa_zy = dense_layer(l_pa_zy, hid)
        l_pa_zy, l_pa_zy_mu, l_pa_zy_logvar = stochastic_layer(l_pa_zy, n_a, 1)

        # Generative p(x|a,z,y)
        l_qa_to_px = DenseLayer(l_qa_x, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        l_qa_to_px = ReshapeLayer(l_qa_to_px, (-1, self.sym_samples, 1, px_hid[0]))
        l_y_to_px = DenseLayer(l_y_in, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        l_y_to_px = DimshuffleLayer(l_y_to_px, (0, 'x', 'x', 1))
        l_qz_to_px = DenseLayer(l_qz_axy, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
        l_qz_to_px = ReshapeLayer(l_qz_to_px, (-1, self.sym_samples, 1, px_hid[0]))
        l_px_azy = ReshapeLayer(ElemwiseSumLayer([l_qa_to_px, l_qz_to_px, l_y_to_px]), [-1, px_hid[0]])
        if batchnorm:
            l_px_azy = BatchNormLayer(l_px_azy)
        l_px_azy = NonlinearityLayer(l_px_azy, self.transf)
        if len(px_hid) > 1:
            for hid in px_hid[1:]:
                l_px_azy = dense_layer(l_px_azy, hid)

        # Output layer of p(x|a,z,y), chosen by the x distribution.
        # NOTE(review): there is no else branch -- for any other x_dist value no output layer is added
        # and no error is raised here.
        if x_dist == 'bernoulli':
            l_px_azy = DenseLayer(l_px_azy, n_x, init.GlorotNormal(), init.Normal(init_w), sigmoid)
        elif x_dist == 'multinomial':
            l_px_azy = DenseLayer(l_px_azy, n_x, init.GlorotNormal(), init.Normal(init_w), softmax)
        elif x_dist == 'gaussian':
            l_px_azy, l_px_zy_mu, l_px_zy_logvar = stochastic_layer(l_px_azy, n_x, 1, px_nonlinearity)

        # Reshape all the model layers to have the same size
        self.l_x_in = l_x_in
        self.l_y_in = l_y_in
        self.l_a_in = l_qa_x  # the sample layer of q(a|x), stored so that a can be supplied as an input

        self.l_qa = ReshapeLayer(l_qa_x, (-1, self.sym_samples, 1, n_a))
        self.l_qa_mu = DimshuffleLayer(l_qa_x_mu, (0, 'x', 'x', 1))
        self.l_qa_logvar = DimshuffleLayer(l_qa_x_logvar, (0, 'x', 'x', 1))

        self.l_qz = ReshapeLayer(l_qz_axy, (-1, self.sym_samples, 1, n_z))
        self.l_qz_mu = ReshapeLayer(l_qz_axy_mu, (-1, self.sym_samples, 1, n_z))
        self.l_qz_logvar = ReshapeLayer(l_qz_axy_logvar, (-1, self.sym_samples, 1, n_z))

        self.l_qy = ReshapeLayer(l_qy_xa, (-1, self.sym_samples, 1, n_y))

        self.l_pa = ReshapeLayer(l_pa_zy, (-1, self.sym_samples, 1, n_a))
        self.l_pa_mu = ReshapeLayer(l_pa_zy_mu, (-1, self.sym_samples, 1, n_a))
        self.l_pa_logvar = ReshapeLayer(l_pa_zy_logvar, (-1, self.sym_samples, 1, n_a))

        # l_px_zy_mu and l_px_zy_logvar only exist in the gaussian case; the conditional expressions
        # below do not evaluate them otherwise.
        self.l_px = ReshapeLayer(l_px_azy, (-1, self.sym_samples, 1, n_x))
        self.l_px_mu = ReshapeLayer(l_px_zy_mu, (-1, self.sym_samples, 1, n_x)) if x_dist == "gaussian" else None
        self.l_px_logvar = ReshapeLayer(l_px_zy_logvar,
                                        (-1, self.sym_samples, 1, n_x)) if x_dist == "gaussian" else None

        # Predefined functions
        # f_qy: (x, samples) -> deterministic output of q(y|a,x), averaged over axes 1 and 2.
        inputs = [self.sym_x_l, self.sym_samples]
        outputs = get_output(self.l_qy, self.sym_x_l, deterministic=True).mean(axis=(1, 2))
        self.f_qy = theano.function(inputs, outputs)

        # f_qa: (x, samples) -> deterministic output of q(a|x), averaged over axes 1 and 2.
        inputs = [self.sym_x_l, self.sym_samples]
        outputs = get_output(self.l_qa, self.sym_x_l, deterministic=True).mean(axis=(1, 2))
        self.f_qa = theano.function(inputs, outputs)

        # f_pa: (z, t, samples) -> deterministic output of p(a|z,y), with z substituted for the q(z) layer.
        inputs = {l_qz_axy: self.sym_z, l_y_in: self.sym_t_l}
        outputs = get_output(self.l_pa, inputs, deterministic=True)
        self.f_pa = theano.function([self.sym_z, self.sym_t_l, self.sym_samples], outputs)

        # f_px: (a, z, t, samples) -> deterministic output of p(x|a,z,y), with a and z substituted for
        # the q(a) and q(z) layers.
        inputs = {l_qa_x: self.sym_a, l_qz_axy: self.sym_z, l_y_in: self.sym_t_l}
        outputs = get_output(self.l_px, inputs, deterministic=True)
        self.f_px = theano.function([self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs)

        # Define model parameters
        self.model_params = get_all_params([self.l_qy, self.l_pa, self.l_px])
        self.trainable_model_params = get_all_params([self.l_qy, self.l_pa, self.l_px], trainable=True)
    def build_model(self, train_set, test_set, validation_set=None):
        """
        Build the (warmed-up) lower bound of the model and compile the theano functions
        used for training, testing and validation.
        :param train_set: Train set, forwarded to the parent build_model.
        :param test_set: Test set, forwarded to the parent build_model.
        :param validation_set: Optional validation set, forwarded to the parent build_model.
        :return: train, test, validation function and dicts of arguments. The validation
            function is None when no validation set is given.
        :raises ValueError: If self.x_dist is not one of 'bernoulli', 'multinomial',
            'gaussian' or 'linear'.
        """
        super(CVAE, self).build_model(train_set, test_set, validation_set)

        n = self.sh_train_x.shape[0].astype(
            theano.config.floatX)  # no. of data points

        # Define the layers for the density estimation used in the lower bound.
        l_log_qz = GaussianLogDensityLayer(self.l_qz, self.l_qz_mu,
                                           self.l_qz_logvar)
        l_log_pz = StandardNormalLogDensityLayer(self.l_qz)

        # Flatten the input so it can be compared with the flattened reconstruction.
        l_x_in = ReshapeLayer(self.l_x_in, (-1, self.seq_length * self.n_x))
        flat_shape = (-1, self.sym_samples, 1, self.seq_length * self.n_x)
        if self.x_dist == 'bernoulli':
            l_px = ReshapeLayer(self.l_px, flat_shape)
            l_log_px = BernoulliLogDensityLayer(l_px, l_x_in)
        elif self.x_dist == 'multinomial':
            l_px = ReshapeLayer(self.l_px, flat_shape)
            l_log_px = MultinomialLogDensityLayer(l_px, l_x_in)
        elif self.x_dist == 'gaussian':
            l_px_mu = ReshapeLayer(self.l_px_mu, flat_shape)
            l_px_logvar = ReshapeLayer(self.l_px_logvar, flat_shape)
            l_log_px = GaussianLogDensityLayer(l_x_in, l_px_mu, l_px_logvar)
        elif self.x_dist == 'linear':
            # No density layer: the raw decoder output is turned into a
            # reconstruction error further below.
            l_log_px = self.l_px
        else:
            # Previously this fell through and failed later with an unbound local.
            raise ValueError("Unsupported x_dist: %r" % (self.x_dist,))

        self.sym_warmup = T.fscalar('warmup')

        def lower_bound(log_pz, log_qz, log_px):
            # NOTE(review): the KL term is weighted by (1 - warmup - 0.1), so with
            # a non-negative warmup the weight never reaches 1 -- confirm the
            # constant 0.1 offset is intended.
            return log_px + (log_pz - log_qz) * (1. - self.sym_warmup - 0.1)

        # Lower bound
        out_layers = [l_log_pz, l_log_qz, l_log_px]
        inputs = {self.l_x_in: self.sym_x}
        out = get_output(out_layers,
                         inputs,
                         batch_norm_update_averages=False,
                         batch_norm_use_averages=False)
        log_pz, log_qz, log_px = out

        # If the decoder output is linear we need the reconstruction error
        if self.x_dist == 'linear':
            log_px = -aggregate(squared_error(log_px.mean(axis=(1, 2)),
                                              self.sym_x),
                                mode='mean')

        lb = lower_bound(log_pz, log_qz, log_px)
        lb = lb.mean(axis=(1, 2))  # Mean over the sampling dimensions

        if self.batchnorm:
            # TODO: implement the BN layer correctly.
            # The result of this call is discarded; it is not part of the cost graph.
            inputs = {self.l_x_in: self.sym_x}
            get_output(out_layers,
                       inputs,
                       weighting=None,
                       batch_norm_update_averages=True,
                       batch_norm_use_averages=False)

        # Regularizing with weight priors p(theta|N(0,1)), collecting and clipping gradients
        weight_priors = 0.0
        for p in self.trainable_model_params:
            # Only parameters whose string representation contains 'W' get a prior.
            if 'W' not in str(p):
                continue
            weight_priors += log_normal(p, 0, 1).sum()

        # Collect the lower bound and scale it with the weight priors.
        elbo = lb.mean()
        cost = (elbo * n + weight_priors) / -n

        grads_collect = T.grad(cost, self.trainable_model_params)
        sym_beta1 = T.scalar('beta1')
        sym_beta2 = T.scalar('beta2')
        clip_grad, max_norm = 1, 5
        mgrads = total_norm_constraint(grads_collect, max_norm=max_norm)
        mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
        updates = adam(mgrads, self.trainable_model_params, self.sym_lr,
                       sym_beta1, sym_beta2)
        # updates = rmsprop(mgrads, self.trainable_model_params, self.sym_lr + (0*sym_beta1*sym_beta2))

        # Training function
        x_batch = self.sh_train_x[self.batch_slice]
        if self.x_dist == 'bernoulli':  # Sample bernoulli input.
            x_batch = self._srng.binomial(size=x_batch.shape,
                                          n=1,
                                          p=x_batch,
                                          dtype=theano.config.floatX)

        givens = {self.sym_x: x_batch}
        inputs = [
            self.sym_index, self.sym_batchsize, self.sym_lr, sym_beta1,
            sym_beta2, self.sym_samples, self.sym_warmup
        ]
        outputs = [
            log_px.mean(),
            log_pz.mean(),
            log_qz.mean(), elbo, self.sym_warmup
        ]
        f_train = theano.function(inputs=inputs,
                                  outputs=outputs,
                                  givens=givens,
                                  updates=updates)

        # Default training args. Note that these can be changed during or prior to training.
        self.train_args['inputs']['batchsize'] = 100
        self.train_args['inputs']['learningrate'] = 1e-4
        self.train_args['inputs']['beta1'] = 0.9
        self.train_args['inputs']['beta2'] = 0.999
        self.train_args['inputs']['samples'] = 1
        self.train_args['inputs']['warmup'] = 0.1
        self.train_args['outputs']['log p(x)'] = '%0.6f'
        self.train_args['outputs']['log p(z)'] = '%0.6f'
        self.train_args['outputs']['log q(z)'] = '%0.6f'
        self.train_args['outputs']['elbo train'] = '%0.6f'
        self.train_args['outputs']['warmup'] = '%0.3f'

        # Validation and test function
        givens = {self.sym_x: self.sh_test_x}
        f_test = theano.function(inputs=[self.sym_samples, self.sym_warmup],
                                 outputs=[elbo],
                                 givens=givens)

        # Test args.  Note that these can be changed during or prior to training.
        self.test_args['inputs']['samples'] = 1
        self.test_args['inputs']['warmup'] = 0.1
        self.test_args['outputs']['elbo test'] = '%0.6f'

        f_validate = None
        if validation_set is not None:
            givens = {self.sym_x: self.sh_valid_x}
            # The elbo depends on the warmup scalar, so it has to be an input here
            # as well (as for the test function); otherwise theano cannot compile
            # the function because of the missing graph input.
            f_validate = theano.function(
                inputs=[self.sym_samples, self.sym_warmup],
                outputs=[elbo],
                givens=givens)
            # Default validation args. Note that these can be changed during or prior to training.
            self.validate_args['inputs']['samples'] = 1
            self.validate_args['inputs']['warmup'] = 0.1
            self.validate_args['outputs']['elbo validation'] = '%0.6f'

        return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
    def build_model(self, train_set, test_set, validation_set=None):
        """
        Compile the theano functions that train and evaluate the model on its mean
        squared reconstruction error.
        :param train_set: Train set, forwarded to the parent build_model.
        :param test_set: Test set, forwarded to the parent build_model.
        :param validation_set: Optional validation set, forwarded to the parent build_model.
        :return: train, test, validation function and dicts of arguments. The validation
            function is None when no validation set is given.
        """
        super(RAE, self).build_model(train_set, test_set, validation_set)

        # Cost: mean squared error between the network output and its own input.
        # (Alternative kept for reference: pass batch_norm_update_averages=False and
        # batch_norm_use_averages=False to get_output; an l2 penalty via
        # regularize_network_params(self.l_px, l2) is also disabled.)
        reconstruction = get_output(self.l_px, {self.l_x_in: self.sym_x})
        cost = aggregate(squared_error(reconstruction, self.sym_x),
                         mode='mean')

        # Constrain the total gradient norm, then clip every gradient elementwise.
        raw_grads = T.grad(cost, self.trainable_model_params)
        sym_beta1 = T.scalar('beta1')
        sym_beta2 = T.scalar('beta2')
        max_norm = 5
        clip_grad = 1
        clipped_grads = [
            T.clip(grad, -clip_grad, clip_grad)
            for grad in total_norm_constraint(raw_grads, max_norm=max_norm)
        ]
        # Alternative optimizer kept for reference:
        # updates = rmsprop(mgrads, self.trainable_model_params, self.sym_lr + (0*sym_beta1*sym_beta2))
        updates = adam(clipped_grads, self.trainable_model_params,
                       self.sym_lr, sym_beta1, sym_beta2)

        def compile_evaluation(shared_x):
            # Argument-free function returning the cost on the given shared data.
            return theano.function(inputs=[],
                                   outputs=[cost],
                                   givens={self.sym_x: shared_x})

        # Training function: evaluates the cost on one slice of the shared train
        # data and applies the adam updates.
        f_train = theano.function(
            inputs=[
                self.sym_index, self.sym_batchsize, self.sym_lr, sym_beta1,
                sym_beta2
            ],
            outputs=[cost],
            givens={self.sym_x: self.sh_train_x[self.batch_slice]},
            updates=updates)

        # Default training args. Note that these can be changed during or prior to training.
        train_inputs = self.train_args['inputs']
        train_inputs['batchsize'] = 100
        train_inputs['learningrate'] = 3e-3
        train_inputs['beta1'] = 0.9
        train_inputs['beta2'] = 0.999
        self.train_args['outputs']['cost train'] = '%0.6f'

        # Test function and its args (can be changed during or prior to training).
        f_test = compile_evaluation(self.sh_test_x)
        self.test_args['outputs']['cost test'] = '%0.6f'

        # Validation function and its args, only when a validation set is given.
        if validation_set is None:
            f_validate = None
        else:
            f_validate = compile_evaluation(self.sh_valid_x)
            self.validate_args['outputs']['cost val'] = '%0.6f'

        return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
Example #8
0
    def __init__(self,
                 n_c,
                 n_l,
                 n_a,
                 n_z,
                 n_y,
                 qa_hid,
                 qz_hid,
                 qy_hid,
                 px_hid,
                 pa_hid,
                 filters,
                 nonlinearity=rectify,
                 px_nonlinearity=None,
                 x_dist='bernoulli',
                 batchnorm=False,
                 seed=1234):
        """
        Initialize a convolutional skip deep generative model consisting of
        discriminative classifier q(y|a,x),
        generative model P p(a|z,y) and p(x|a,z,y),
        inference model Q q(a|x) and q(z|a,x,y).
        Deterministic dense and convolutional weights use Glorot initialization; the mu and
        log-variance layers of the stochastic layers use a small normal initialization.
        :param n_c: Number of input channels.
        :param n_l: Number of lengths.
        :param n_a: Number of auxiliary.
        :param n_z: Number of latent.
        :param n_y: Number of classes.
        :param qa_hid: List of number of deterministic hidden q(a|x).
        :param qz_hid: List of number of deterministic hidden q(z|a,x,y).
        :param qy_hid: List of number of deterministic hidden q(y|a,x).
        :param px_hid: List of number of deterministic hidden p(x|a,z,y); only px_hid[0] is
            used by the decoder built here.
        :param pa_hid: List of number of deterministic hidden p(a|z,y).
        :param filters: Sequence of (number of filters, stride, pool size) triples, one per
            convolution of the encoder; the decoder walks the sequence in reverse.
        :param nonlinearity: The transfer function used in the deterministic layers.
        :param px_nonlinearity: Nonlinearity of the mu/log-variance layers of a gaussian
            p(x|a,z,y).
        :param x_dist: The x distribution, 'bernoulli', 'multinomial', 'gaussian' or 'linear'.
        :param batchnorm: Boolean value for batch normalization.
        :param seed: The random seed.
        """
        super(CSDGM, self).__init__(n_c, qz_hid + px_hid, n_a + n_z,
                                    nonlinearity)
        self.x_dist = x_dist
        self.n_y = n_y
        self.n_c = n_c
        self.n_l = n_l
        self.n_a = n_a
        self.n_z = n_z
        self.batchnorm = batchnorm
        self._srng = RandomStreams(seed)

        # Decide Glorot initialization of weights.
        init_w = 1e-3
        hid_w = ""
        if nonlinearity == rectify or nonlinearity == softplus:
            hid_w = "relu"

        # Filled by conv_layer with every pooling layer of the encoder, in
        # creation order, so that the decoder can invert them.
        pool_layers = []

        # Define symbolic variables for theano functions.
        self.sym_beta = T.scalar('beta')  # scaling constant beta
        self.sym_x_l = T.tensor3('x')  # labeled inputs
        self.sym_t_l = T.matrix('t')  # labeled targets
        self.sym_x_u = T.tensor3('x')  # unlabeled inputs
        self.sym_bs_l = T.iscalar('bs_l')  # number of labeled data
        self.sym_samples = T.iscalar('samples')  # MC samples
        self.sym_z = T.matrix('z')  # latent variable z
        self.sym_a = T.matrix('a')  # auxiliary variable a
        self.sym_warmup = T.fscalar('warmup')  # warmup to scale KL term

        # Assist methods for collecting the layers
        def dense_layer(layer_in,
                        n,
                        dist_w=init.GlorotNormal,
                        dist_b=init.Normal):
            # Linear dense layer, optional batch normalization, then the
            # transfer function (self.transf is presumably set by the parent
            # constructor -- it is not assigned in this method).
            dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w),
                               None)
            if batchnorm:
                dense = BatchNormLayer(dense)
            return NonlinearityLayer(dense, self.transf)

        def stochastic_layer(layer_in, n, samples, nonlin=None):
            # Returns the sample layer together with its mu and logvar layers.
            mu = DenseLayer(layer_in, n, init.Normal(init_w),
                            init.Normal(init_w), nonlin)
            logvar = DenseLayer(layer_in, n, init.Normal(init_w),
                                init.Normal(init_w), nonlin)
            return SampleLayer(mu, logvar, eq_samples=samples,
                               iw_samples=1), mu, logvar

        def conv_layer(layer_in,
                       n_filters,
                       stride=(1, 1),
                       pool=1,
                       name='conv',
                       dist_w=init.GlorotNormal,
                       dist_b=init.Normal):
            # Convolution, followed by max pooling when pool > 1. Pooling
            # layers are recorded in pool_layers for the decoder.
            l_conv = Conv2DLayer(layer_in,
                                 num_filters=n_filters,
                                 filter_size=(3, 1),
                                 stride=stride,
                                 pad='full',
                                 W=dist_w(hid_w),
                                 b=dist_b(init_w),
                                 name=name)
            if pool > 1:
                l_conv = MaxPool2DLayer(l_conv, pool_size=(pool, 1))
                pool_layers.append(l_conv)
            return l_conv

        # Input layers
        l_y_in = InputLayer((None, n_y))
        l_x_in = InputLayer((None, n_l, n_c), name='Input')

        # Reshape input
        l_x_in_reshp = ReshapeLayer(l_x_in, (-1, 1, n_l, n_c))
        print("l_x_in_reshp", l_x_in_reshp.output_shape)

        # CNN encoder implementation
        l_conv_enc = l_x_in_reshp
        for n_filters, stride, pool in filters:
            l_conv_enc = conv_layer(l_conv_enc, n_filters, stride, pool)
            print("l_conv_enc", l_conv_enc.output_shape)

        # Pool along last 2 axes
        l_global_pool_enc = GlobalPoolLayer(l_conv_enc, pool_function=T.mean)
        l_enc = dense_layer(l_global_pool_enc, n_z)
        print("l_enc", l_enc.output_shape)

        # Auxiliary q(a|x)
        l_qa_x = l_enc
        for hid in qa_hid:
            l_qa_x = dense_layer(l_qa_x, hid)
        l_qa_x, l_qa_x_mu, l_qa_x_logvar = stochastic_layer(
            l_qa_x, n_a, self.sym_samples)

        # Classifier q(y|a,x)
        l_qa_to_qy = DenseLayer(l_qa_x, qy_hid[0], init.GlorotNormal(hid_w),
                                init.Normal(init_w), None)
        l_qa_to_qy = ReshapeLayer(l_qa_to_qy,
                                  (-1, self.sym_samples, 1, qy_hid[0]))
        l_x_to_qy = DenseLayer(l_enc, qy_hid[0], init.GlorotNormal(hid_w),
                               init.Normal(init_w), None)
        l_x_to_qy = DimshuffleLayer(l_x_to_qy, (0, 'x', 'x', 1))
        l_qy_xa = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qy, l_x_to_qy]),
                               (-1, qy_hid[0]))
        if batchnorm:
            l_qy_xa = BatchNormLayer(l_qy_xa)
        l_qy_xa = NonlinearityLayer(l_qy_xa, self.transf)
        if len(qy_hid) > 1:
            for hid in qy_hid[1:]:
                l_qy_xa = dense_layer(l_qy_xa, hid)
        l_qy_xa = DenseLayer(l_qy_xa, n_y, init.GlorotNormal(),
                             init.Normal(init_w), softmax)

        # Recognition q(z|x,a,y)
        l_qa_to_qz = DenseLayer(l_qa_x, qz_hid[0], init.GlorotNormal(hid_w),
                                init.Normal(init_w), None)
        l_qa_to_qz = ReshapeLayer(l_qa_to_qz,
                                  (-1, self.sym_samples, 1, qz_hid[0]))
        l_x_to_qz = DenseLayer(l_enc, qz_hid[0], init.GlorotNormal(hid_w),
                               init.Normal(init_w), None)
        l_x_to_qz = DimshuffleLayer(l_x_to_qz, (0, 'x', 'x', 1))
        l_y_to_qz = DenseLayer(l_y_in, qz_hid[0], init.GlorotNormal(hid_w),
                               init.Normal(init_w), None)
        l_y_to_qz = DimshuffleLayer(l_y_to_qz, (0, 'x', 'x', 1))
        l_qz_axy = ReshapeLayer(
            ElemwiseSumLayer([l_qa_to_qz, l_x_to_qz, l_y_to_qz]),
            (-1, qz_hid[0]))
        if batchnorm:
            l_qz_axy = BatchNormLayer(l_qz_axy)
        l_qz_axy = NonlinearityLayer(l_qz_axy, self.transf)
        if len(qz_hid) > 1:
            for hid in qz_hid[1:]:
                l_qz_axy = dense_layer(l_qz_axy, hid)
        l_qz_axy, l_qz_axy_mu, l_qz_axy_logvar = stochastic_layer(
            l_qz_axy, n_z, 1)

        # Generative p(a|z,y)
        l_y_to_pa = DenseLayer(l_y_in, pa_hid[0], init.GlorotNormal(hid_w),
                               init.Normal(init_w), None)
        l_y_to_pa = DimshuffleLayer(l_y_to_pa, (0, 'x', 'x', 1))
        l_qz_to_pa = DenseLayer(l_qz_axy, pa_hid[0], init.GlorotNormal(hid_w),
                                init.Normal(init_w), None)
        l_qz_to_pa = ReshapeLayer(l_qz_to_pa,
                                  (-1, self.sym_samples, 1, pa_hid[0]))
        l_pa_zy = ReshapeLayer(ElemwiseSumLayer([l_qz_to_pa, l_y_to_pa]),
                               [-1, pa_hid[0]])
        if batchnorm:
            l_pa_zy = BatchNormLayer(l_pa_zy)
        l_pa_zy = NonlinearityLayer(l_pa_zy, self.transf)
        if len(pa_hid) > 1:
            for hid in pa_hid[1:]:
                l_pa_zy = dense_layer(l_pa_zy, hid)
        l_pa_zy, l_pa_zy_mu, l_pa_zy_logvar = stochastic_layer(l_pa_zy, n_a, 1)

        # Generative p(x|a,z,y)
        l_qa_to_px = DenseLayer(l_qa_x, px_hid[0], init.GlorotNormal(hid_w),
                                init.Normal(init_w), None)
        l_qa_to_px = ReshapeLayer(l_qa_to_px,
                                  (-1, self.sym_samples, 1, px_hid[0]))
        l_y_to_px = DenseLayer(l_y_in, px_hid[0], init.GlorotNormal(hid_w),
                               init.Normal(init_w), None)
        l_y_to_px = DimshuffleLayer(l_y_to_px, (0, 'x', 'x', 1))
        l_qz_to_px = DenseLayer(l_qz_axy, px_hid[0], init.GlorotNormal(hid_w),
                                init.Normal(init_w), None)
        l_qz_to_px = ReshapeLayer(l_qz_to_px,
                                  (-1, self.sym_samples, 1, px_hid[0]))
        l_px_azy = ReshapeLayer(
            ElemwiseSumLayer([l_qa_to_px, l_qz_to_px, l_y_to_px]),
            [-1, px_hid[0]])
        if batchnorm:
            l_px_azy = BatchNormLayer(l_px_azy)
        l_px_azy = NonlinearityLayer(l_px_azy, self.transf)

        # Note that px_hid[0] has to be equal to the number filters in the first convolution. Otherwise add a
        # dense layers here.

        # Inverse pooling
        l_global_depool = InverseLayer(l_px_azy, l_global_pool_enc)
        print("l_global_depool", l_global_depool.output_shape)

        # Walk the encoder pooling layers in reverse order. They are consumed
        # one at a time, only for filter entries that actually pooled, because
        # pool_layers holds no entry for convolutions with pool == 1 and can
        # therefore not be indexed by the position in `filters`.
        remaining_pool_layers = iter(pool_layers[::-1])

        # Decode
        # NOTE(review): the encoder passes `stride` straight to Conv2DLayer while
        # the decoder wraps it as (stride, 1) -- confirm which form `filters`
        # is expected to carry.
        l_deconv = l_global_depool
        for n_filters, stride, pool in filters[::-1]:
            if pool > 1:
                l_deconv = InverseLayer(l_deconv, next(remaining_pool_layers))
            l_deconv = Conv2DLayer(l_deconv,
                                   num_filters=n_filters,
                                   filter_size=(3, 1),
                                   stride=(stride, 1),
                                   W=init.GlorotNormal('relu'))
            print("l_deconv", l_deconv.output_shape)

        # The last l_conv layer should give us the input shape
        l_px_azy = Conv2DLayer(l_deconv,
                               num_filters=1,
                               filter_size=(3, 1),
                               pad='same',
                               nonlinearity=None)
        print("l_dec", l_px_azy.output_shape)

        # Flatten first two dimensions
        l_px_azy = ReshapeLayer(l_px_azy, (-1, n_c))

        if x_dist == 'bernoulli':
            l_px_azy = DenseLayer(l_px_azy, n_c, init.GlorotNormal(),
                                  init.Normal(init_w), sigmoid)
        elif x_dist == 'multinomial':
            l_px_azy = DenseLayer(l_px_azy, n_c, init.GlorotNormal(),
                                  init.Normal(init_w), softmax)
        elif x_dist == 'gaussian':
            l_px_azy, l_px_zy_mu, l_px_zy_logvar = stochastic_layer(
                l_px_azy, n_c, self.sym_samples, px_nonlinearity)
        elif x_dist == 'linear':
            l_px_azy = DenseLayer(l_px_azy, n_c, nonlinearity=None)

        # Reshape all the model layers to have the same size
        self.l_x_in = l_x_in
        self.l_y_in = l_y_in
        self.l_a_in = l_qa_x

        self.l_qa = ReshapeLayer(l_qa_x, (-1, self.sym_samples, 1, n_a))
        self.l_qa_mu = DimshuffleLayer(l_qa_x_mu, (0, 'x', 'x', 1))
        self.l_qa_logvar = DimshuffleLayer(l_qa_x_logvar, (0, 'x', 'x', 1))

        self.l_qz = ReshapeLayer(l_qz_axy, (-1, self.sym_samples, 1, n_z))
        self.l_qz_mu = ReshapeLayer(l_qz_axy_mu,
                                    (-1, self.sym_samples, 1, n_z))
        self.l_qz_logvar = ReshapeLayer(l_qz_axy_logvar,
                                        (-1, self.sym_samples, 1, n_z))

        self.l_qy = ReshapeLayer(l_qy_xa, (-1, self.sym_samples, 1, n_y))

        self.l_pa = ReshapeLayer(l_pa_zy, (-1, self.sym_samples, 1, n_a))
        self.l_pa_mu = ReshapeLayer(l_pa_zy_mu, (-1, self.sym_samples, 1, n_a))
        self.l_pa_logvar = ReshapeLayer(l_pa_zy_logvar,
                                        (-1, self.sym_samples, 1, n_a))

        # Here we assume that we pass (batch size * segment length, number of features) to the sample layer from
        # which we then get (batch size * segment length, samples, IW samples, features)
        self.l_px = ReshapeLayer(l_px_azy, (-1, n_l, self.sym_samples, 1, n_c))
        self.l_px_mu = ReshapeLayer(l_px_zy_mu, (-1, n_l, self.sym_samples, 1, n_c)) \
            if x_dist == "gaussian" else None
        self.l_px_logvar = ReshapeLayer(l_px_zy_logvar, (-1, n_l, self.sym_samples, 1, n_c)) \
            if x_dist == "gaussian" else None

        # Predefined functions
        inputs = {l_x_in: self.sym_x_l}
        outputs = get_output(self.l_qy, inputs,
                             deterministic=True).mean(axis=(1, 2))
        self.f_qy = theano.function([self.sym_x_l, self.sym_samples], outputs)

        outputs = get_output(l_qa_x, inputs, deterministic=True)
        self.f_qa = theano.function([self.sym_x_l, self.sym_samples], outputs)

        inputs = {l_x_in: self.sym_x_l, l_y_in: self.sym_t_l}
        outputs = get_output(l_qz_axy, inputs, deterministic=True)
        self.f_qz = theano.function(
            [self.sym_x_l, self.sym_t_l, self.sym_samples], outputs)

        inputs = {l_qz_axy: self.sym_z, l_y_in: self.sym_t_l}
        outputs = get_output(self.l_pa, inputs,
                             deterministic=True).mean(axis=(1, 2))
        self.f_pa = theano.function(
            [self.sym_z, self.sym_t_l, self.sym_samples], outputs)

        inputs = {
            l_x_in: self.sym_x_l,
            l_qa_x: self.sym_a,
            l_qz_axy: self.sym_z,
            l_y_in: self.sym_t_l
        }
        px_function_inputs = [
            self.sym_x_l, self.sym_a, self.sym_z, self.sym_t_l,
            self.sym_samples
        ]
        outputs = get_output(self.l_px, inputs,
                             deterministic=True).mean(axis=(2, 3))
        self.f_px = theano.function(px_function_inputs, outputs)

        # The mu / log-variance layers only exist for a gaussian p(x|a,z,y).
        # For every other x_dist they are None, and asking for their output
        # would fail, so the two functions are only compiled in the gaussian
        # case and are None otherwise.
        self.f_mu = None
        self.f_var = None
        if x_dist == "gaussian":
            outputs = get_output(self.l_px_mu, inputs,
                                 deterministic=True).mean(axis=(2, 3))
            self.f_mu = theano.function(px_function_inputs, outputs)

            outputs = get_output(self.l_px_logvar, inputs,
                                 deterministic=True).mean(axis=(2, 3))
            self.f_var = theano.function(px_function_inputs, outputs)

        # Define model parameters
        self.model_params = get_all_params([self.l_qy, self.l_pa, self.l_px])
        self.trainable_model_params = get_all_params(
            [self.l_qy, self.l_pa, self.l_px], trainable=True)
Example #9
0
    def __init__(self,
                 n_x,
                 n_a,
                 n_z,
                 n_y,
                 a_hidden,
                 z_hidden,
                 xhat_hidden,
                 y_hidden,
                 trans_func=rectify,
                 x_dist='bernoulli'):
        """
        Initialize an auxiliary deep generative model consisting of
        discriminative classifier q(y|a,x),
        generative model P p(xhat|z,y),
        inference model Q q(a|x) and q(z|x,y).
        All weights are initialized using the Bengio and Glorot (2010) initialization scheme.
        :param n_x: Number of inputs.
        :param n_a: Number of auxiliary.
        :param n_z: Number of latent.
        :param n_y: Number of classes.
        :param a_hidden: List of number of deterministic hidden q(a|x).
        :param z_hidden: List of number of deterministic hidden q(z|x,y).
        :param xhat_hidden: List of number of deterministic hidden p(xhat|z,y).
        :param y_hidden: List of number of deterministic hidden q(y|a,x).
        :param trans_func: The transfer function used in the deterministic layers.
        :param x_dist: The x distribution, 'bernoulli' or 'gaussian'.
        """
        super(ADGMSSL, self).__init__(n_x, a_hidden + z_hidden + xhat_hidden,
                                      n_a + n_z, trans_func)
        self.y_hidden = y_hidden
        self.x_dist = x_dist
        self.n_y = n_y
        self.n_x = n_x
        self.n_a = n_a
        self.n_z = n_z

        self._srng = RandomStreams()

        self.sym_beta = T.scalar(
            'beta')  # symbolic upscaling of the discriminative term.
        self.sym_x_l = T.matrix('x')  # symbolic labeled inputs
        self.sym_t_l = T.matrix('t')  # symbolic labeled targets
        self.sym_x_u = T.matrix('x')  # symbolic unlabeled inputs
        self.sym_bs_l = T.iscalar(
            'bs_l'
        )  # symbolic number of labeled data_preparation points in batch
        self.sym_samples = T.iscalar(
            'samples')  # symbolic number of Monte Carlo samples
        self.sym_y = T.matrix('y')
        self.sym_z = T.matrix('z')

        ### Input layers ###
        l_x_in = InputLayer((None, n_x))
        l_y_in = InputLayer((None, n_y))

        ### Auxiliary q(a|x) ###
        l_a_x = l_x_in
        for hid in a_hidden:
            l_a_x = DenseLayer(l_a_x, hid, init.GlorotNormal('relu'),
                               init.Normal(1e-3), self.transf)
        l_a_x_mu = DenseLayer(l_a_x, n_a, init.GlorotNormal(),
                              init.Normal(1e-3), None)
        l_a_x_logvar = DenseLayer(l_a_x, n_a, init.GlorotNormal(),
                                  init.Normal(1e-3), None)
        l_a_x = SampleLayer(l_a_x_mu,
                            l_a_x_logvar,
                            eq_samples=self.sym_samples)
        # Reshape all layers to align them for multiple samples in the lower bound calculation.
        l_a_x_reshaped = ReshapeLayer(l_a_x, (-1, self.sym_samples, 1, n_a))
        l_a_x_mu_reshaped = DimshuffleLayer(l_a_x_mu, (0, 'x', 'x', 1))
        l_a_x_logvar_reshaped = DimshuffleLayer(l_a_x_logvar, (0, 'x', 'x', 1))

        ### Classifier q(y|a,x) ###
        # Concatenate the input x and the output of the auxiliary MLP.
        l_a_to_y = DenseLayer(l_a_x, y_hidden[0], init.GlorotNormal('relu'),
                              init.Normal(1e-3), None)
        l_a_to_y = ReshapeLayer(l_a_to_y,
                                (-1, self.sym_samples, 1, y_hidden[0]))
        l_x_to_y = DenseLayer(l_x_in, y_hidden[0], init.GlorotNormal('relu'),
                              init.Normal(1e-3), None)
        l_x_to_y = DimshuffleLayer(l_x_to_y, (0, 'x', 'x', 1))
        l_y_xa = ReshapeLayer(ElemwiseSumLayer([l_a_to_y, l_x_to_y]),
                              (-1, y_hidden[0]))
        l_y_xa = NonlinearityLayer(l_y_xa, self.transf)

        if len(y_hidden) > 1:
            for hid in y_hidden[1:]:
                l_y_xa = DenseLayer(l_y_xa, hid, init.GlorotUniform('relu'),
                                    init.Normal(1e-3), self.transf)
        l_y_xa = DenseLayer(l_y_xa, n_y, init.GlorotUniform(),
                            init.Normal(1e-3), softmax)
        l_y_xa_reshaped = ReshapeLayer(l_y_xa, (-1, self.sym_samples, 1, n_y))

        ### Recognition q(z|x,y) ###
        # Concatenate the input x and y.
        l_x_to_z = DenseLayer(l_x_in, z_hidden[0], init.GlorotNormal('relu'),
                              init.Normal(1e-3), None)
        l_x_to_z = DimshuffleLayer(l_x_to_z, (0, 'x', 'x', 1))
        l_y_to_z = DenseLayer(l_y_in, z_hidden[0], init.GlorotNormal('relu'),
                              init.Normal(1e-3), None)
        l_y_to_z = DimshuffleLayer(l_y_to_z, (0, 'x', 'x', 1))
        l_z_xy = ReshapeLayer(ElemwiseSumLayer([l_x_to_z, l_y_to_z]),
                              [-1, z_hidden[0]])
        l_z_xy = NonlinearityLayer(l_z_xy, self.transf)

        if len(z_hidden) > 1:
            for hid in z_hidden[1:]:
                l_z_xy = DenseLayer(l_z_xy, hid, init.GlorotNormal('relu'),
                                    init.Normal(1e-3), self.transf)
        l_z_axy_mu = DenseLayer(l_z_xy, n_z, init.GlorotNormal(),
                                init.Normal(1e-3), None)
        l_z_axy_logvar = DenseLayer(l_z_xy, n_z, init.GlorotNormal(),
                                    init.Normal(1e-3), None)
        l_z_xy = SampleLayer(l_z_axy_mu,
                             l_z_axy_logvar,
                             eq_samples=self.sym_samples)
        # Reshape all layers to align them for multiple samples in the lower bound calculation.
        l_z_axy_mu_reshaped = DimshuffleLayer(l_z_axy_mu, (0, 'x', 'x', 1))
        l_z_axy_logvar_reshaped = DimshuffleLayer(l_z_axy_logvar,
                                                  (0, 'x', 'x', 1))
        l_z_axy_reshaped = ReshapeLayer(l_z_xy, (-1, self.sym_samples, 1, n_z))

        ### Generative p(xhat|z,y) ###
        # Concatenate the input x and y.
        l_y_to_xhat = DenseLayer(l_y_in, xhat_hidden[0],
                                 init.GlorotNormal('relu'), init.Normal(1e-3),
                                 None)
        l_y_to_xhat = DimshuffleLayer(l_y_to_xhat, (0, 'x', 'x', 1))
        l_z_to_xhat = DenseLayer(l_z_xy, xhat_hidden[0],
                                 init.GlorotNormal('relu'), init.Normal(1e-3),
                                 None)
        l_z_to_xhat = ReshapeLayer(l_z_to_xhat,
                                   (-1, self.sym_samples, 1, xhat_hidden[0]))
        l_xhat_zy = ReshapeLayer(ElemwiseSumLayer([l_z_to_xhat, l_y_to_xhat]),
                                 [-1, xhat_hidden[0]])
        l_xhat_zy = NonlinearityLayer(l_xhat_zy, self.transf)
        if len(xhat_hidden) > 1:
            for hid in xhat_hidden[1:]:
                l_xhat_zy = DenseLayer(l_xhat_zy, hid,
                                       init.GlorotNormal('relu'),
                                       init.Normal(1e-3), self.transf)
        if x_dist == 'bernoulli':
            l_xhat_zy_mu_reshaped = None
            l_xhat_zy_logvar_reshaped = None
            l_xhat_zy = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(),
                                   init.Normal(1e-3), sigmoid)
        elif x_dist == 'multinomial':
            l_xhat_zy_mu_reshaped = None
            l_xhat_zy_logvar_reshaped = None
            l_xhat_zy = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(),
                                   init.Normal(1e-3), softmax)
        elif x_dist == 'gaussian':
            l_xhat_zy_mu = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(),
                                      init.Normal(1e-3), None)
            l_xhat_zy_logvar = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(),
                                          init.Normal(1e-3), None)
            l_xhat_zy = SampleLayer(l_xhat_zy_mu,
                                    l_xhat_zy_logvar,
                                    eq_samples=1)
            l_xhat_zy_mu_reshaped = ReshapeLayer(
                l_xhat_zy_mu, (-1, self.sym_samples, 1, n_x))
            l_xhat_zy_logvar_reshaped = ReshapeLayer(
                l_xhat_zy_logvar, (-1, self.sym_samples, 1, n_x))
        l_xhat_zy_reshaped = ReshapeLayer(l_xhat_zy,
                                          (-1, self.sym_samples, 1, n_x))

        ### Various class variables ###
        self.l_x_in = l_x_in
        self.l_y_in = l_y_in
        self.l_a_mu = l_a_x_mu_reshaped
        self.l_a_logvar = l_a_x_logvar_reshaped
        self.l_a = l_a_x_reshaped
        self.l_z_mu = l_z_axy_mu_reshaped
        self.l_z_logvar = l_z_axy_logvar_reshaped
        self.l_z = l_z_axy_reshaped
        self.l_y = l_y_xa_reshaped
        self.l_xhat_mu = l_xhat_zy_mu_reshaped
        self.l_xhat_logvar = l_xhat_zy_logvar_reshaped
        self.l_xhat = l_xhat_zy_reshaped

        self.model_params = get_all_params([self.l_xhat, self.l_y])

        ### Calculate networks shapes for documentation ###
        self.qa_shapes = self.get_model_shape(get_all_params(l_a_x))
        self.qy_shapes = self.get_model_shape(
            get_all_params(l_y_xa))[len(self.qa_shapes) - 1:]
        self.qz_shapes = self.get_model_shape(get_all_params(l_z_xy))
        self.px_shapes = self.get_model_shape(
            get_all_params(l_xhat_zy))[(len(self.qz_shapes) - 1):]

        ### Predefined functions for generating xhat and y ###
        inputs = {l_z_xy: self.sym_z, self.l_y_in: self.sym_y}
        outputs = get_output(self.l_xhat, inputs,
                             deterministic=True).mean(axis=(1, 2))
        inputs = [self.sym_z, self.sym_y, self.sym_samples]
        self.f_xhat = theano.function(inputs, outputs)

        inputs = [self.sym_x_l, self.sym_samples]
        outputs = get_output(self.l_y, self.sym_x_l,
                             deterministic=True).mean(axis=(1, 2))
        self.f_y = theano.function(inputs, outputs)

        self.y_params = get_all_params(
            self.l_y, trainable=True)[(len(a_hidden) + 2) * 2::]
        self.xhat_params = get_all_params(self.l_xhat, trainable=True)
Example #10
0
    def build_model(self, train_set, test_set, validation_set=None):
        """
        Build the auxiliary deep generative model from the initialized hyperparameters.
        Define the lower bound for labeled and unlabeled data and compile the
        training, test and (optional) validation functions.
        :param train_set: Train set containing variables x, t.
        For the unlabeled data in the train set, we define 0's in t.
        :param test_set: Test set containing variables x, t.
        :param validation_set: Validation set containing variables x, t.
        :return: train, test, validation function and dicts of arguments.
        The validation function is None when no validation set is given.
        :raises ValueError: if self.x_dist is not 'bernoulli', 'multinomial' or 'gaussian'.
        """
        super(ADGMSSL, self).build_model(train_set, test_set, validation_set)

        # Define the layers for the density estimation used in the lower bound.
        l_log_pa = GaussianMarginalLogDensityLayer(self.l_a_mu,
                                                   self.l_a_logvar)
        l_log_pz = GaussianMarginalLogDensityLayer(self.l_z_mu,
                                                   self.l_z_logvar)
        l_log_qa_x = GaussianMarginalLogDensityLayer(1, self.l_a_logvar)
        l_log_qz_xy = GaussianMarginalLogDensityLayer(1, self.l_z_logvar)
        l_log_qy_ax = MultinomialLogDensityLayer(self.l_y,
                                                 self.l_y_in,
                                                 eps=1e-8)
        if self.x_dist == 'bernoulli':
            l_px_zy = BernoulliLogDensityLayer(self.l_xhat, self.l_x_in)
        elif self.x_dist == 'multinomial':
            l_px_zy = MultinomialLogDensityLayer(self.l_xhat, self.l_x_in)
        elif self.x_dist == 'gaussian':
            l_px_zy = GaussianLogDensityLayer(self.l_x_in, self.l_xhat_mu,
                                              self.l_xhat_logvar)
        else:
            # Fail with a clear message instead of an UnboundLocalError below.
            raise ValueError(
                "Unsupported x_dist %r; expected 'bernoulli', 'multinomial' "
                "or 'gaussian'." % (self.x_dist,))

        ### Compute lower bound for labeled data ###
        out_layers = [
            l_log_pa, l_log_pz, l_log_qa_x, l_log_qz_xy, l_px_zy, l_log_qy_ax
        ]
        inputs = {self.l_x_in: self.sym_x_l, self.l_y_in: self.sym_t_l}
        log_pa_l, log_pz_l, log_qa_x_l, log_qz_axy_l, log_px_zy_l, log_qy_ax_l = get_output(
            out_layers, inputs)
        py_l = softmax(T.zeros(
            (self.sym_x_l.shape[0], self.n_y)))  # non-informative prior
        log_py_l = -categorical_crossentropy(py_l, self.sym_t_l).reshape(
            (-1, 1)).dimshuffle((0, 'x', 'x', 1))
        lb_l = log_pa_l + log_pz_l + log_py_l + log_px_zy_l - log_qa_x_l - log_qz_axy_l
        # Upscale the discriminative term with a weight.
        log_qy_ax_l *= self.sym_beta
        # The generative gradients are taken before the classifier term is
        # added to the bound; the classifier gradients use only that term.
        xhat_grads_l = T.grad(lb_l.mean(axis=(1, 2)).sum(), self.xhat_params)
        y_grads_l = T.grad(log_qy_ax_l.mean(axis=(1, 2)).sum(), self.y_params)
        lb_l += log_qy_ax_l
        lb_l = lb_l.mean(axis=(1, 2))

        ### Compute lower bound for unlabeled data ###
        bs_u = self.sym_x_u.shape[0]  # size of the unlabeled data.
        t_eye = T.eye(self.n_y,
                      k=0)  # ones in diagonal and 0's elsewhere (n_y x n_y).
        # repeat unlabeled t the number of classes for integration (bs * n_y) x n_y.
        t_u = t_eye.reshape((self.n_y, 1, self.n_y)).repeat(bs_u,
                                                            axis=1).reshape(
                                                                (-1, self.n_y))
        # repeat unlabeled x the number of classes for integration (bs * n_y) x n_x
        x_u = self.sym_x_u.reshape(
            (1, bs_u, self.n_x)).repeat(self.n_y, axis=0).reshape(
                (-1, self.n_x))
        out_layers = [l_log_pa, l_log_pz, l_log_qa_x, l_log_qz_xy, l_px_zy]
        inputs = {self.l_x_in: x_u, self.l_y_in: t_u}
        log_pa_u, log_pz_u, log_qa_x_u, log_qz_axy_u, log_px_zy_u = get_output(
            out_layers, inputs)
        py_u = softmax(T.zeros(
            (bs_u * self.n_y, self.n_y)))  # non-informative prior.
        log_py_u = -categorical_crossentropy(py_u, t_u).reshape(
            (-1, 1)).dimshuffle((0, 'x', 'x', 1))
        lb_u = log_pa_u + log_pz_u + log_py_u + log_px_zy_u - log_qa_x_u - log_qz_axy_u
        lb_u = lb_u.reshape(
            (self.n_y, self.sym_samples, 1,
             bs_u)).transpose(3, 1, 2,
                              0).mean(axis=(1, 2))  # mean over samples.
        y_ax_u = get_output(self.l_y, self.sym_x_u)
        y_ax_u = y_ax_u.mean(axis=(1, 2))  # bs x n_y
        y_ax_u += 1e-8  # ensure that we get no NANs.
        y_ax_u /= T.sum(y_ax_u, axis=1, keepdims=True)
        xhat_grads_u = T.grad((y_ax_u * lb_u).sum(axis=1).sum(),
                              self.xhat_params)
        # Weight the per-class bound by q(y|x) and add the entropy of q(y|x).
        lb_u = (y_ax_u * (lb_u - T.log(y_ax_u))).sum(axis=1)
        y_grads_u = T.grad(lb_u.sum(), self.y_params)

        # Loss - regularizing with weight priors p(theta|N(0,1)) and clipping gradients
        def weight_prior(params):
            # Sum of log N(0, 1) densities over every parameter whose string
            # representation contains 'W' (biases are skipped).
            prior = 0.0
            for p in params:
                if 'W' in str(p):
                    prior += log_normal(p, 0, 1).sum()
            return prior

        y_weight_priors = weight_prior(self.y_params)
        y_weight_priors_grad = T.grad(y_weight_priors,
                                      self.y_params,
                                      disconnected_inputs='ignore')

        xhat_weight_priors = weight_prior(self.xhat_params)
        xhat_weight_priors_grad = T.grad(xhat_weight_priors,
                                         self.xhat_params,
                                         disconnected_inputs='ignore')

        n = self.sh_train_x.shape[0].astype(
            theano.config.floatX
        )  # no. of data points in train set
        n_b = n / self.sym_batchsize.astype(
            theano.config.floatX)  # no. of batches in train set

        def combine_grads(grads_l, grads_u, prior_grads):
            # Per parameter: scale the batch gradient up to the full train set,
            # add the weight-prior gradient, then negate and normalize by n.
            return [((g_l + g_u) * n_b + g_p) / -n
                    for g_l, g_u, g_p in zip(grads_l, grads_u, prior_grads)]

        y_grads = combine_grads(y_grads_l, y_grads_u, y_weight_priors_grad)
        xhat_grads = combine_grads(xhat_grads_l, xhat_grads_u,
                                   xhat_weight_priors_grad)

        params = self.y_params + self.xhat_params
        grads = y_grads + xhat_grads

        # Collect the lower bound and scale it with the weight priors.
        # The result is negated and normalized by n, matching the gradients.
        elbo = ((lb_l.sum() + lb_u.sum()) * n_b + y_weight_priors +
                xhat_weight_priors) / -n

        # Avoid vanishing and exploding gradients.
        clip_grad, max_norm = 1, 5
        mgrads = total_norm_constraint(grads, max_norm=max_norm)
        mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
        sym_beta1 = T.scalar('beta1')
        sym_beta2 = T.scalar('beta2')
        updates = adam(mgrads, params, self.sym_lr, sym_beta1, sym_beta2)

        ### Compile training function ###
        # The first sym_bs_l rows of each batch slice are the labeled part,
        # the remaining rows are the unlabeled part.
        x_batch_l = self.sh_train_x[self.batch_slice][:self.sym_bs_l]
        x_batch_u = self.sh_train_x[self.batch_slice][self.sym_bs_l:]
        t_batch_l = self.sh_train_t[self.batch_slice][:self.sym_bs_l]
        if self.x_dist == 'bernoulli':  # Sample bernoulli input.
            x_batch_u = self._srng.binomial(size=x_batch_u.shape,
                                            n=1,
                                            p=x_batch_u,
                                            dtype=theano.config.floatX)
            x_batch_l = self._srng.binomial(size=x_batch_l.shape,
                                            n=1,
                                            p=x_batch_l,
                                            dtype=theano.config.floatX)
        givens = {
            self.sym_x_l: x_batch_l,
            self.sym_x_u: x_batch_u,
            self.sym_t_l: t_batch_l
        }
        inputs = [
            self.sym_index, self.sym_batchsize, self.sym_bs_l, self.sym_beta,
            self.sym_lr, sym_beta1, sym_beta2, self.sym_samples
        ]
        f_train = theano.function(inputs=inputs,
                                  outputs=[elbo],
                                  givens=givens,
                                  updates=updates)
        # Default training args. Note that these can be changed during or prior to training.
        self.train_args['inputs']['batchsize'] = 200
        self.train_args['inputs']['batchsize_labeled'] = 100
        self.train_args['inputs']['beta'] = 1200.
        self.train_args['inputs']['learningrate'] = 3e-4
        self.train_args['inputs']['beta1'] = 0.9
        self.train_args['inputs']['beta2'] = 0.999
        self.train_args['inputs']['samples'] = 1
        self.train_args['outputs']['lb'] = '%0.4f'

        ### Compile testing function ###
        class_err_test = self._classification_error(self.sym_x_l, self.sym_t_l)
        givens = {self.sym_x_l: self.sh_test_x, self.sym_t_l: self.sh_test_t}
        f_test = theano.function(inputs=[self.sym_samples],
                                 outputs=[class_err_test],
                                 givens=givens)
        # Testing args.  Note that these can be changed during or prior to training.
        self.test_args['inputs']['samples'] = 1
        self.test_args['outputs']['err'] = '%0.2f%%'

        ### Compile validation function ###
        f_validate = None
        if validation_set is not None:
            class_err_valid = self._classification_error(
                self.sym_x_l, self.sym_t_l)
            givens = {
                self.sym_x_l: self.sh_valid_x,
                self.sym_t_l: self.sh_valid_t
            }
            f_validate = theano.function(inputs=[self.sym_samples],
                                         outputs=[class_err_valid],
                                         givens=givens)
        # Default validation args. Note that these can be changed during or prior to training.
        # They are set even when no validation function was compiled.
        self.validate_args['inputs']['samples'] = 1
        self.validate_args['outputs']['err'] = '%0.2f%%'

        return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
    def __init__(self,
                 n_c,
                 n_z,
                 qz_hid,
                 px_hid,
                 enc_rnn=256,
                 dec_rnn=256,
                 n_l=28,
                 nonlinearity=rectify,
                 px_nonlinearity=None,
                 x_dist='bernoulli',
                 batchnorm=False,
                 seed=1234):
        """
        Initialize a recurrent variational auto-encoder with a bidirectional
        LSTM encoder, a recognition model q(z|x) and a bidirectional LSTM
        decoder for the generative model p(x|z).
        Weights are initialized using the Bengio and Glorot (2010) initialization scheme.
        :param n_c: Number of inputs per sequence step.
        :param n_z: Number of latent.
        :param qz_hid: List of number of deterministic hidden q(z|x).
        :param px_hid: List of number of deterministic hidden p(x|z).
        :param enc_rnn: Number of units in each encoder LSTM and in the dense layer following them.
        :param dec_rnn: Number of units in each decoder LSTM and in the dense layer following them.
        :param n_l: The sequence length of the input.
        :param nonlinearity: The transfer function used in the deterministic layers.
        :param px_nonlinearity: The transfer function applied to mu and logvar of a 'gaussian' output.
        :param x_dist: The x distribution, 'bernoulli', 'multinomial', 'gaussian' or 'linear'.
        :param batchnorm: Boolean value for batch normalization.
        :param seed: The random seed.
        """
        super(RVAE, self).__init__(n_c, qz_hid + px_hid, n_z, nonlinearity)
        self.x_dist = x_dist
        self.n_x = n_c
        self.seq_length = n_l
        self.n_z = n_z
        self.batchnorm = batchnorm
        self._srng = RandomStreams(seed)

        # Decide Glorot initialization of weights.
        init_w = 1e-3
        # Gain handed to GlorotNormal: 'relu' for ReLU-like units, otherwise
        # the plain numeric gain 1.0 (an empty string is not a usable gain).
        hid_w = 1.0
        if nonlinearity == rectify or nonlinearity == softplus:
            hid_w = "relu"

        # Define symbolic variables for theano functions.
        self.sym_x = T.tensor3('x')  # inputs
        self.sym_z = T.matrix('z')
        self.sym_samples = T.iscalar('samples')  # MC samples
        self.sym_warmup = T.fscalar('warmup')

        # Assist methods for collecting the layers
        def dense_layer(layer_in,
                        n,
                        dist_w=init.GlorotNormal,
                        dist_b=init.Normal):
            # Linear layer -> optional batch normalization -> self.transf.
            dense = DenseLayer(layer_in,
                               num_units=n,
                               W=dist_w(hid_w),
                               b=dist_b(init_w),
                               nonlinearity=None)
            if batchnorm:
                dense = BatchNormLayer(dense)
            return NonlinearityLayer(dense, self.transf)

        def stochastic_layer(layer_in, n, samples, nonlin=None):
            # Two parallel dense heads (mu, logvar) feeding a SampleLayer.
            mu = DenseLayer(layer_in,
                            n,
                            W=init.Normal(init_w, mean=.0),
                            b=init.Normal(init_w),
                            nonlinearity=nonlin)
            logvar = DenseLayer(layer_in,
                                n,
                                W=init.Normal(init_w, mean=.0),
                                b=init.Normal(init_w),
                                nonlinearity=nonlin)
            return SampleLayer(mu, logvar, eq_samples=samples,
                               iw_samples=1), mu, logvar

        def lstm_layer(layer_in,
                       nunits,
                       return_final,
                       backwards=False,
                       name='LSTM'):
            # LSTM without peepholes; the forget gate bias starts at 5.0.
            ingate = Gate(W_in=init.Uniform(0.01),
                          W_hid=init.Uniform(0.01),
                          b=init.Constant(0.0))
            forgetgate = Gate(W_in=init.Uniform(0.01),
                              W_hid=init.Uniform(0.01),
                              b=init.Constant(5.0))
            cell = Gate(
                W_cell=None,
                nonlinearity=T.tanh,
                W_in=init.Uniform(0.01),
                W_hid=init.Uniform(0.01),
            )
            outgate = Gate(W_in=init.Uniform(0.01),
                           W_hid=init.Uniform(0.01),
                           b=init.Constant(0.0))

            return LSTMLayer(layer_in,
                             num_units=nunits,
                             backwards=backwards,
                             peepholes=False,
                             ingate=ingate,
                             forgetgate=forgetgate,
                             cell=cell,
                             outgate=outgate,
                             name=name,
                             only_return_final=return_final)

        # RNN encoder implementation
        l_x_in = InputLayer((None, n_l, n_c))
        l_enc_forward = lstm_layer(l_x_in,
                                   enc_rnn,
                                   return_final=True,
                                   backwards=False,
                                   name='enc_forward')
        l_enc_backward = lstm_layer(l_x_in,
                                    enc_rnn,
                                    return_final=True,
                                    backwards=True,
                                    name='enc_backward')
        l_enc_concat = ConcatLayer([l_enc_forward, l_enc_backward], axis=-1)
        l_enc = dense_layer(l_enc_concat, enc_rnn)

        # Recognition q(z|x)
        l_qz = l_enc
        for hid in qz_hid:
            l_qz = dense_layer(l_qz, hid)

        # Reparameterisation and sample
        l_qz_mu = DenseLayer(l_qz,
                             n_z,
                             W=init.Normal(init_w, mean=1.0),
                             b=init.Normal(init_w),
                             nonlinearity=None)
        l_qz_logvar = DenseLayer(l_qz,
                                 n_z,
                                 init.Normal(init_w),
                                 init.Normal(init_w),
                                 nonlinearity=None)
        l_qz = SampleLayer(l_qz_mu,
                           l_qz_logvar,
                           eq_samples=self.sym_samples,
                           iw_samples=1)

        # Generative p(x|z)
        l_qz_repeat = RepeatLayer(l_qz, n=n_l)

        # Skip connection to encoder until warmup threshold is reached
        # NOTE(review): T.ge builds a symbolic expression, so this Python `if`
        # is evaluated once while the graph is constructed and cannot follow
        # the runtime value of sym_warmup. Presumably the branch is always
        # taken - confirm and replace with a symbolic switch if the skip
        # connection is really meant to depend on warmup.
        if T.ge(self.sym_warmup, 0.4):
            l_skip_enc_repeat = RepeatLayer(l_enc, n=n_l)
            l_qz_repeat = ConcatLayer([l_qz_repeat, l_skip_enc_repeat],
                                      axis=-1)

        l_dec_forward = lstm_layer(l_qz_repeat,
                                   dec_rnn,
                                   return_final=False,
                                   backwards=False,
                                   name='dec_forward')
        l_dec_backward = lstm_layer(l_qz_repeat,
                                    dec_rnn,
                                    return_final=False,
                                    backwards=True,
                                    name='dec_backward')
        l_dec_concat = ConcatLayer([l_dec_forward, l_dec_backward], axis=-1)
        # Flatten to rows of 2 * dec_rnn features so dense layers can be applied.
        l_dec = ReshapeLayer(l_dec_concat, (-1, 2 * dec_rnn))
        l_dec = dense_layer(l_dec, dec_rnn)

        # Add additional dense layers
        l_px = l_dec
        for hid in px_hid:
            l_px = dense_layer(l_px, hid)

        # Reshape the last dimension and perhaps model with a distribution
        if x_dist == 'bernoulli':
            l_px = DenseLayer(l_px, n_c, init.GlorotNormal(),
                              init.Normal(init_w), sigmoid)
        elif x_dist == 'multinomial':
            l_px = DenseLayer(l_px, n_c, init.GlorotNormal(),
                              init.Normal(init_w), softmax)
        elif x_dist == 'gaussian':
            l_px, l_px_mu, l_px_logvar = stochastic_layer(
                l_px, n_c, self.sym_samples, nonlin=px_nonlinearity)
        elif x_dist == 'linear':
            l_px = DenseLayer(l_px, n_c, nonlinearity=None)

        # Reshape all the model layers to have the same size
        self.l_x_in = l_x_in

        self.l_qz = ReshapeLayer(l_qz, (-1, self.sym_samples, 1, n_z))
        self.l_qz_mu = DimshuffleLayer(l_qz_mu, (0, 'x', 'x', 1))
        self.l_qz_logvar = DimshuffleLayer(l_qz_logvar, (0, 'x', 'x', 1))

        def to_sample_layout(layer):
            # Reshape to (-1, n_l, samples, 1, n_c), then move the sequence
            # axis behind the two sample axes.
            return DimshuffleLayer(
                ReshapeLayer(layer, (-1, n_l, self.sym_samples, 1, n_c)),
                (0, 2, 3, 1, 4))

        self.l_px = to_sample_layout(l_px)
        if x_dist == "gaussian":
            self.l_px_mu = to_sample_layout(l_px_mu)
            self.l_px_logvar = to_sample_layout(l_px_logvar)
        else:
            # mu and logvar layers only exist for the gaussian output.
            self.l_px_mu = None
            self.l_px_logvar = None

        # Predefined functions
        inputs = {self.l_x_in: self.sym_x}
        outputs = get_output(l_qz, inputs, deterministic=True)
        self.f_qz = theano.function([self.sym_x, self.sym_samples],
                                    outputs,
                                    on_unused_input='warn')

        # Feed sym_z in place of the sampled latent for the decoder functions.
        inputs = {l_qz: self.sym_z, self.l_x_in: self.sym_x}
        outputs = get_output(self.l_px, inputs,
                             deterministic=True).mean(axis=(1, 2))
        self.f_px = theano.function([self.sym_x, self.sym_z, self.sym_samples],
                                    outputs,
                                    on_unused_input='warn')

        if x_dist == "gaussian":
            outputs = get_output(self.l_px_mu, inputs,
                                 deterministic=True).mean(axis=(1, 2))
            self.f_mu = theano.function(
                [self.sym_x, self.sym_z, self.sym_samples],
                outputs,
                on_unused_input='ignore')

            outputs = get_output(self.l_px_logvar, inputs,
                                 deterministic=True).mean(axis=(1, 2))
            self.f_var = theano.function(
                [self.sym_x, self.sym_z, self.sym_samples],
                outputs,
                on_unused_input='ignore')

        # Define model parameters
        self.model_params = get_all_params([self.l_px])
        self.trainable_model_params = get_all_params([self.l_px],
                                                     trainable=True)
    def __init__(self,
                 n_l,
                 n_c,
                 n_a,
                 n_z,
                 n_y,
                 qa_hid,
                 qz_hid,
                 qy_hid,
                 px_hid,
                 pa_hid,
                 enc_rnn=256,
                 dec_rnn=256,
                 nonlinearity=rectify,
                 px_nonlinearity=None,
                 x_dist='bernoulli',
                 batchnorm=False,
                 seed=1234):
        """
        Initialize an skip deep generative model consisting of
        discriminative classifier q(y|a,x),
        generative model P p(a|z,y) and p(x|a,z,y),
        inference model Q q(a|x) and q(z|a,x,y).
        Weights are initialized using the Bengio and Glorot (2010) initialization scheme.
        :param n_c: Number of inputs.
        :param n_a: Number of auxiliary.
        :param n_z: Number of latent.
        :param n_y: Number of classes.
        :param qa_hid: List of number of deterministic hidden q(a|x).
        :param qz_hid: List of number of deterministic hidden q(z|a,x,y).
        :param qy_hid: List of number of deterministic hidden q(y|a,x).
        :param px_hid: List of number of deterministic hidden p(a|z,y) & p(x|z,y).
        :param nonlinearity: The transfer function used in the deterministic layers.
        :param x_dist: The x distribution, 'bernoulli', 'multinomial', or 'gaussian'.
        :param batchnorm: Boolean value for batch normalization.
        :param seed: The random seed.
        """
        super(RSDGM, self).__init__(n_c, qz_hid + px_hid, n_a + n_z,
                                    nonlinearity)
        self.x_dist = x_dist
        self.n_y = n_y
        self.n_c = n_c
        self.n_a = n_a
        self.n_z = n_z
        self.n_l = n_l
        self.batchnorm = batchnorm
        self._srng = RandomStreams(seed)

        # Decide Glorot initializaiton of weights.
        init_w = 1e-3
        hid_w = ""
        if nonlinearity == rectify or nonlinearity == softplus:
            hid_w = "relu"

        # Define symbolic variables for theano functions.
        self.sym_beta = T.scalar('beta')  # scaling constant beta
        self.sym_x_l = T.tensor3('x_l')  # labeled inputs
        self.sym_t_l = T.matrix('t')  # labeled targets
        self.sym_x_u = T.tensor3('x_u')  # unlabeled inputs
        self.sym_bs_l = T.iscalar('bs_l')  # number of labeled data
        self.sym_samples = T.iscalar('samples')  # MC samples
        self.sym_z = T.matrix('z')  # latent variable z
        self.sym_a = T.matrix('a')  # auxiliary variable a
        self.sym_warmup = T.fscalar('warmup')  # warmup to dampen KL term

        # Assist methods for collecting the layers
        def dense_layer(layer_in,
                        n,
                        dist_w=init.GlorotNormal,
                        dist_b=init.Normal):
            dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w),
                               None)
            if batchnorm:
                dense = BatchNormLayer(dense)
            return NonlinearityLayer(dense, self.transf)

        def stochastic_layer(layer_in, n, samples, nonlin=None):
            mu = DenseLayer(layer_in, n, init.Normal(init_w),
                            init.Normal(init_w), nonlin)
            logvar = DenseLayer(layer_in, n, init.Normal(init_w),
                                init.Normal(init_w), nonlin)
            return SampleLayer(mu, logvar, eq_samples=samples,
                               iw_samples=1), mu, logvar

        def lstm_layer(input,
                       nunits,
                       return_final,
                       backwards=False,
                       name='LSTM'):
            ingate = Gate(W_in=init.Uniform(0.01),
                          W_hid=init.Uniform(0.01),
                          b=init.Constant(0.0))
            forgetgate = Gate(W_in=init.Uniform(0.01),
                              W_hid=init.Uniform(0.01),
                              b=init.Constant(5.0))
            cell = Gate(
                W_cell=None,
                nonlinearity=T.tanh,
                W_in=init.Uniform(0.01),
                W_hid=init.Uniform(0.01),
            )
            outgate = Gate(W_in=init.Uniform(0.01),
                           W_hid=init.Uniform(0.01),
                           b=init.Constant(0.0))

            lstm = LSTMLayer(input,
                             num_units=nunits,
                             backwards=backwards,
                             peepholes=False,
                             ingate=ingate,
                             forgetgate=forgetgate,
                             cell=cell,
                             outgate=outgate,
                             name=name,
                             only_return_final=return_final)

            rec = RecurrentLayer(input,
                                 nunits,
                                 W_in_to_hid=init.GlorotNormal('relu'),
                                 W_hid_to_hid=init.GlorotNormal('relu'),
                                 backwards=backwards,
                                 nonlinearity=rectify,
                                 only_return_final=return_final,
                                 name=name)
            return lstm

        # Input layers
        l_y_in = InputLayer((None, n_y))
        l_x_in = InputLayer((None, n_l, n_c))

        # RNN encoder implementation
        l_enc_forward = lstm_layer(l_x_in,
                                   enc_rnn,
                                   return_final=True,
                                   backwards=False,
                                   name='enc_forward')
        l_enc_backward = lstm_layer(l_x_in,
                                    enc_rnn,
                                    return_final=True,
                                    backwards=True,
                                    name='enc_backward')
        l_enc_concat = ConcatLayer([l_enc_forward, l_enc_backward])
        l_enc = dense_layer(l_enc_concat, enc_rnn)

        # Auxiliary q(a|x)
        l_qa_x = l_enc
        for hid in qa_hid:
            l_qa_x = dense_layer(l_qa_x, hid)
        l_qa_x, l_qa_x_mu, l_qa_x_logvar = stochastic_layer(
            l_qa_x, n_a, self.sym_samples)

        # Classifier q(y|a,x)
        l_qa_to_qy = DenseLayer(l_qa_x, qy_hid[0], init.GlorotNormal(hid_w),
                                init.Normal(init_w), None)
        l_qa_to_qy = ReshapeLayer(l_qa_to_qy,
                                  (-1, self.sym_samples, 1, qy_hid[0]))
        l_x_to_qy = DenseLayer(l_enc, qy_hid[0], init.GlorotNormal(hid_w),
                               init.Normal(init_w), None)
        l_x_to_qy = DimshuffleLayer(l_x_to_qy, (0, 'x', 'x', 1))
        l_qy_xa = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qy, l_x_to_qy]),
                               (-1, qy_hid[0]))
        if batchnorm:
            l_qy_xa = BatchNormLayer(l_qy_xa)
        l_qy_xa = NonlinearityLayer(l_qy_xa, self.transf)
        if len(qy_hid) > 1:
            for hid in qy_hid[1:]:
                l_qy_xa = dense_layer(l_qy_xa, hid)
        l_qy_xa = DenseLayer(l_qy_xa, n_y, init.GlorotNormal(),
                             init.Normal(init_w), softmax)

        # Recognition q(z|x,a,y)
        l_qa_to_qz = DenseLayer(l_qa_x, qz_hid[0], init.GlorotNormal(hid_w),
                                init.Normal(init_w), None)
        l_qa_to_qz = ReshapeLayer(l_qa_to_qz,
                                  (-1, self.sym_samples, 1, qz_hid[0]))
        l_x_to_qz = DenseLayer(l_enc, qz_hid[0], init.GlorotNormal(hid_w),
                               init.Normal(init_w), None)
        l_x_to_qz = DimshuffleLayer(l_x_to_qz, (0, 'x', 'x', 1))
        l_y_to_qz = DenseLayer(l_y_in, qz_hid[0], init.GlorotNormal(hid_w),
                               init.Normal(init_w), None)
        l_y_to_qz = DimshuffleLayer(l_y_to_qz, (0, 'x', 'x', 1))
        l_qz_axy = ReshapeLayer(
            ElemwiseSumLayer([l_qa_to_qz, l_x_to_qz, l_y_to_qz]),
            (-1, qz_hid[0]))
        if batchnorm:
            l_qz_axy = BatchNormLayer(l_qz_axy)
        l_qz_axy = NonlinearityLayer(l_qz_axy, self.transf)
        if len(qz_hid) > 1:
            for hid in qz_hid[1:]:
                l_qz_axy = dense_layer(l_qz_axy, hid)
        l_qz_axy, l_qz_axy_mu, l_qz_axy_logvar = stochastic_layer(
            l_qz_axy, n_z, 1)

        # Generative p(a|z,y)
        l_y_to_pa = DenseLayer(l_y_in, pa_hid[0], init.GlorotNormal(hid_w),
                               init.Normal(init_w), None)
        l_y_to_pa = DimshuffleLayer(l_y_to_pa, (0, 'x', 'x', 1))
        l_qz_to_pa = DenseLayer(l_qz_axy, pa_hid[0], init.GlorotNormal(hid_w),
                                init.Normal(init_w), None)
        l_qz_to_pa = ReshapeLayer(l_qz_to_pa,
                                  (-1, self.sym_samples, 1, pa_hid[0]))
        l_pa_zy = ReshapeLayer(ElemwiseSumLayer([l_qz_to_pa, l_y_to_pa]),
                               [-1, pa_hid[0]])
        if batchnorm:
            l_pa_zy = BatchNormLayer(l_pa_zy)
        l_pa_zy = NonlinearityLayer(l_pa_zy, self.transf)
        if len(pa_hid) > 1:
            for hid in pa_hid[1:]:
                l_pa_zy = dense_layer(l_pa_zy, hid)
        l_pa_zy, l_pa_zy_mu, l_pa_zy_logvar = stochastic_layer(l_pa_zy, n_a, 1)

        # Generative p(x|a,z,y)
        l_qa_to_px = DenseLayer(l_qa_x, px_hid[0], init.GlorotNormal(hid_w),
                                init.Normal(init_w), None)
        l_qa_to_px = ReshapeLayer(l_qa_to_px,
                                  (-1, self.sym_samples, 1, px_hid[0]))
        l_y_to_px = DenseLayer(l_y_in, px_hid[0], init.GlorotNormal(hid_w),
                               init.Normal(init_w), None)
        l_y_to_px = DimshuffleLayer(l_y_to_px, (0, 'x', 'x', 1))
        l_qz_to_px = DenseLayer(l_qz_axy, px_hid[0], init.GlorotNormal(hid_w),
                                init.Normal(init_w), None)
        l_qz_to_px = ReshapeLayer(l_qz_to_px,
                                  (-1, self.sym_samples, 1, px_hid[0]))
        l_px_azy = ReshapeLayer(
            ElemwiseSumLayer([l_qa_to_px, l_qz_to_px, l_y_to_px]),
            [-1, px_hid[0]])
        if batchnorm:
            l_px_azy = BatchNormLayer(l_px_azy)
        l_px_azy = NonlinearityLayer(l_px_azy, self.transf)

        # RNN decoder implementation
        l_px_azy_repeat = RepeatLayer(l_px_azy, n=n_l)
        l_dec_forward = lstm_layer(l_px_azy_repeat,
                                   dec_rnn,
                                   return_final=False,
                                   backwards=False,
                                   name='dec_forward')
        l_dec_backward = lstm_layer(l_px_azy_repeat,
                                    dec_rnn,
                                    return_final=False,
                                    backwards=True,
                                    name='dec_backward')
        l_dec_concat = ConcatLayer([l_dec_forward, l_dec_backward], axis=-1)
        l_dec = ReshapeLayer(l_dec_concat, (-1, 2 * dec_rnn))
        l_dec = dense_layer(l_dec, dec_rnn)

        l_px_azy = l_dec
        if len(px_hid) > 1:
            for hid in px_hid[1:]:
                l_px_azy = dense_layer(l_px_azy, hid)

        if x_dist == 'bernoulli':
            l_px_azy = DenseLayer(l_px_azy, n_c, init.GlorotNormal(),
                                  init.Normal(init_w), sigmoid)
        elif x_dist == 'multinomial':
            l_px_azy = DenseLayer(l_px_azy, n_c, init.GlorotNormal(),
                                  init.Normal(init_w), softmax)
        elif x_dist == 'gaussian':
            l_px_azy, l_px_zy_mu, l_px_zy_logvar = stochastic_layer(
                l_px_azy, n_c, self.sym_samples, px_nonlinearity)

        # Reshape all the model layers to have the same size
        self.l_x_in = l_x_in
        self.l_y_in = l_y_in
        self.l_a_in = l_qa_x

        self.l_qa = ReshapeLayer(l_qa_x, (-1, self.sym_samples, 1, n_a))
        self.l_qa_mu = DimshuffleLayer(l_qa_x_mu, (0, 'x', 'x', 1))
        self.l_qa_logvar = DimshuffleLayer(l_qa_x_logvar, (0, 'x', 'x', 1))

        self.l_qz = ReshapeLayer(l_qz_axy, (-1, self.sym_samples, 1, n_z))
        self.l_qz_mu = ReshapeLayer(l_qz_axy_mu,
                                    (-1, self.sym_samples, 1, n_z))
        self.l_qz_logvar = ReshapeLayer(l_qz_axy_logvar,
                                        (-1, self.sym_samples, 1, n_z))

        self.l_qy = ReshapeLayer(l_qy_xa, (-1, self.sym_samples, 1, n_y))

        self.l_pa = ReshapeLayer(l_pa_zy, (-1, self.sym_samples, 1, n_a))
        self.l_pa_mu = ReshapeLayer(l_pa_zy_mu, (-1, self.sym_samples, 1, n_a))
        self.l_pa_logvar = ReshapeLayer(l_pa_zy_logvar,
                                        (-1, self.sym_samples, 1, n_a))

        self.l_px = ReshapeLayer(l_px_azy, (-1, n_l, self.sym_samples, 1, n_c))
        self.l_px_mu = ReshapeLayer(l_px_zy_mu, (-1, n_l, self.sym_samples, 1, n_c)) \
            if x_dist == "gaussian" else None
        self.l_px_logvar = ReshapeLayer(l_px_zy_logvar, (-1, n_l, self.sym_samples, 1, n_c)) \
            if x_dist == "gaussian" else None

        # Predefined functions
        inputs = [self.sym_x_l, self.sym_samples]
        outputs = get_output(self.l_qy, self.sym_x_l,
                             deterministic=True).mean(axis=(1, 2))
        self.f_qy = theano.function(inputs, outputs)

        inputs = [self.sym_x_l, self.sym_samples]
        outputs = get_output(self.l_qa, self.sym_x_l,
                             deterministic=True).mean(axis=(1, 2))
        self.f_qa = theano.function(inputs, outputs)

        inputs = {l_x_in: self.sym_x_l, l_y_in: self.sym_t_l}
        outputs = get_output(l_qz_axy, inputs, deterministic=True)
        self.f_qz = theano.function(
            [self.sym_x_l, self.sym_t_l, self.sym_samples], outputs)

        inputs = {l_qz_axy: self.sym_z, l_y_in: self.sym_t_l}
        outputs = get_output(self.l_pa, inputs, deterministic=True)
        self.f_pa = theano.function(
            [self.sym_z, self.sym_t_l, self.sym_samples], outputs)

        inputs = {
            l_qa_x: self.sym_a,
            l_qz_axy: self.sym_z,
            l_y_in: self.sym_t_l
        }
        outputs = get_output(self.l_px, inputs,
                             deterministic=True).mean(axis=(2, 3))
        self.f_px = theano.function(
            [self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs)

        outputs = get_output(self.l_px_mu, inputs,
                             deterministic=True).mean(axis=(2, 3))
        self.f_mu = theano.function(
            [self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs)

        outputs = get_output(self.l_px_logvar, inputs,
                             deterministic=True).mean(axis=(2, 3))
        self.f_var = theano.function(
            [self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs)

        # Define model parameters
        self.model_params = get_all_params([self.l_qy, self.l_pa, self.l_px])
        self.trainable_model_params = get_all_params(
            [self.l_qy, self.l_pa, self.l_px], trainable=True)
 def get_output(self, x):
     """Return the output of ``self.model`` for ``x`` with ``deterministic=True``.

     The call inside the body resolves to the module-scope ``get_output``
     function, not to this method.
     """
     model_output = get_output(self.model, x, deterministic=True)
     return model_output
Example #14
0
    def build_model(self, train_set, test_set, validation_set=None):
        """
        Build the lower bound of the VAE and compile the Theano functions.

        :param train_set: Train set, handed on to the parent build_model.
        :param test_set: Test set, handed on to the parent build_model.
        :param validation_set: Optional validation set; when it is None no
        validation function is compiled and None is returned in its place.
        :return: train, test, validation function and dicts of arguments.
        """
        super(VAE, self).build_model(train_set, test_set, validation_set)

        # Log-density layers: prior on z, approximate posterior, reconstruction.
        prior_density = StandardNormalLogDensityLayer(self.l_z)
        posterior_density = GaussianLogDensityLayer(self.l_z, self.l_z_mu,
                                                    self.l_z_logvar)
        if self.x_dist == 'bernoulli':
            recon_density = BernoulliLogDensityLayer(self.l_xhat, self.l_x_in)
        elif self.x_dist == 'gaussian':
            recon_density = GaussianLogDensityLayer(self.l_x_in,
                                                    self.l_xhat_mu,
                                                    self.l_xhat_logvar)

        log_pz, log_qz_x, log_px_z = get_output(
            [prior_density, posterior_density, recon_density],
            {self.l_x_in: self.sym_x})
        # Negated bound: mean over axis 1, then mean over all remaining entries.
        lb = -(log_pz + log_px_z - log_qz_x).mean(axis=1).mean()

        trainable = get_all_params(self.l_xhat, trainable=True)
        sym_beta1 = T.scalar('beta1')
        sym_beta2 = T.scalar('beta2')
        updates = adam(lb, trainable, self.sym_lr, sym_beta1, sym_beta2)

        # Training batch; binarized by sampling when the output is bernoulli.
        train_x = self.sh_train_x[self.batch_slice]
        if self.x_dist == 'bernoulli':
            train_x = self._srng.binomial(size=train_x.shape,
                                          n=1,
                                          p=train_x,
                                          dtype=theano.config.floatX)
        f_train = theano.function(
            inputs=[
                self.sym_index, self.sym_batchsize, self.sym_lr, sym_beta1,
                sym_beta2, self.sym_samples
            ],
            outputs=[lb],
            givens={self.sym_x: train_x},
            updates=updates)
        # Training args
        train_inputs = self.train_args['inputs']
        train_inputs['batchsize'] = 100
        train_inputs['learningrate'] = 3e-4
        train_inputs['beta1'] = 0.9
        train_inputs['beta2'] = 0.999
        train_inputs['samples'] = 1
        self.train_args['outputs']['lb'] = '%0.4f'

        def compile_eval(shared_x):
            # Function of the sample count that evaluates lb on a whole shared set.
            return theano.function(inputs=[self.sym_samples],
                                   outputs=[lb],
                                   givens={self.sym_x: shared_x})

        f_test = compile_eval(self.sh_test_x)
        # Testing args
        self.test_args['inputs']['samples'] = 1
        self.test_args['outputs']['lb'] = '%0.4f'

        f_validate = None
        if validation_set is not None:
            f_validate = compile_eval(self.sh_valid_x)
            # Validation args
            self.validate_args['inputs']['samples'] = 1
            self.validate_args['outputs']['lb'] = '%0.4f'

        return (f_train, f_test, f_validate, self.train_args, self.test_args,
                self.validate_args)
def run_adgmssl_mnist():
    """
    Evaluate a trained auxiliary deep generative model on the mnist dataset with 100 evenly distributed labels.

    Loads a stored model, prints the test classification error in percent (estimated with 100 samples),
    plots the sorted per-unit log p/q differences of the auxiliary and latent units to output/diff.png and
    writes a table of generated images with a fixed class per table row to output/mnist.png.
    """
    n_a, n_z, n_y = 100, 100, 10  # Units in the auxiliary, latent and class layers.
    img_side = 28  # Height and width of one generated image.

    # Load the mnist supervised dataset for evaluation.
    (train_x, train_t), (test_x, test_t), (valid_x, valid_t) = mnist.load_supervised(filter_std=0.0,
                                                                                     train_valid_combine=True)

    # Initialize the auxiliary deep generative model.
    model = ADGMSSL(n_x=train_x.shape[-1], n_a=n_a, n_z=n_z, n_y=n_y, a_hidden=[500, 500],
                    z_hidden=[500, 500], xhat_hidden=[500, 500], y_hidden=[500, 500],
                    trans_func=rectify, x_dist='bernoulli')

    model_id = 20151209002003  # Insert the trained model id here.
    model.load_model(model_id)  # Load trained model. See configurations in the log file.

    # Evaluate the test error of the ADGM.
    mean_evals = model.get_output(test_x, 100)  # 100 MC to get a good estimate for the auxiliary unit.
    t_class = np.argmax(test_t, axis=1)
    y_class = np.argmax(mean_evals, axis=1)
    # Percentage of misclassified examples; scales with the actual test set size instead of assuming 10000.
    class_err = 100. * np.mean(y_class != t_class)
    print("test set 100-samples: %0.2f%%." % class_err)

    # Evaluate the active units in the auxiliary and latent distribution.
    f_a_mu_logvar = theano.function([model.sym_x_l], get_output([model.l_a_mu, model.l_a_logvar], model.sym_x_l))
    q_a_mu, q_a_logvar = f_a_mu_logvar(test_x)
    log_pa = -0.5 * (np.log(2 * np.pi) + (q_a_mu ** 2 + np.exp(q_a_logvar)))
    log_qa_x = -0.5 * (np.log(2 * np.pi) + 1 + q_a_logvar)
    diff_pa_qa_x = (log_pa - log_qa_x).mean(axis=(1, 2))
    mean_diff_pa_qa_x = np.abs(np.mean(diff_pa_qa_x, axis=0))

    inputs = {model.l_x_in: model.sym_x_l, model.l_y_in: model.sym_t_l}
    f_z_mu_logvar = theano.function([model.sym_x_l, model.sym_t_l],
                                    get_output([model.l_z_mu, model.l_z_logvar], inputs))
    q_z_mu, q_z_logvar = f_z_mu_logvar(test_x, test_t)
    log_pz = -0.5 * (np.log(2 * np.pi) + (q_z_mu ** 2 + np.exp(q_z_logvar)))
    log_qz_x = -0.5 * (np.log(2 * np.pi) + 1 + q_z_logvar)
    diff_pz_qz_x = (log_pz - log_qz_x).mean(axis=(1, 2))
    mean_diff_pz_qz_x = np.abs(np.mean(diff_pz_qz_x, axis=0))

    # Plot the per-unit differences, largest first.
    plt.figure()
    plt.subplot(111, axisbg='white')
    plt.plot(sorted(mean_diff_pa_qa_x)[::-1], color="#c0392b", label=r"$\log \frac{p(a_i)}{q(a_i|x)}$")
    plt.plot(sorted(mean_diff_pz_qz_x)[::-1], color="#9b59b6", label=r"$\log \frac{p(z_i)}{q(z_i|x)}$")
    plt.grid(color='0.9', linestyle='dashed', axis="y")
    plt.xlabel("stochastic units")
    plt.ylabel(r"$\log \frac{p(\cdot)}{q(\cdot)}$")
    plt.ylim((0, 2.7))
    plt.legend()
    plt.savefig("output/diff.png", format="png")

    # Draw table_size ** 2 latent samples and generate xhat with a fixed class y per table row.
    # NOTE(review): np.random.random_sample draws uniformly from [0, 1); the earlier comment spoke of
    # normal distributed samples - confirm which distribution is intended before changing the call.
    table_size = 10
    samples = 1
    z = np.random.random_sample((table_size ** 2, n_z))
    # One-hot targets: table_size consecutive rows per class (needs n_y == table_size to fill the table).
    y = np.eye(n_y, k=0).reshape(n_y, 1, n_y).repeat(table_size, axis=1).reshape((-1, n_y))
    xhat = model.f_xhat(z, y, samples)

    plt.figure(figsize=(20, 20), dpi=300)
    img_out = np.zeros((img_side * table_size, img_side * table_size))
    for row in range(table_size):
        for col in range(table_size):
            top, bottom = row * img_side, (row + 1) * img_side
            left, right = col * img_side, (col + 1) * img_side
            # Generated images are laid out row by row, so one table row holds one class.
            im = np.reshape(xhat[row * table_size + col], (img_side, img_side))
            img_out[top:bottom, left:right] = im
    plt.matshow(img_out, cmap=plt.cm.binary)
    plt.xticks(np.array([]))
    plt.yticks(np.array([]))
    plt.savefig("output/mnist.png", format="png")
Example #16
0
    def __init__(self,
                 n_x,
                 n_z,
                 z_hidden,
                 xhat_hidden,
                 trans_func=rectify,
                 init_w=1e-3,
                 x_dist='gaussian',
                 batchnorm=False):
        """
        Set up the layers of the variational auto-encoder and compile its helper functions.

        :param n_x: Number of input units.
        :param n_z: Number of latent units.
        :param z_hidden: Hidden layer sizes of the inference network q(z|x).
        :param xhat_hidden: Hidden layer sizes of the generative network p(xhat|z).
        :param trans_func: Transfer function handed to the parent constructor.
        :param init_w: Standard deviation of the normal initializers of all dense layers.
        :param x_dist: Output distribution, 'bernoulli' or 'gaussian'.
        :param batchnorm: Stored on the instance; not used elsewhere in this constructor.
        """
        super(VAE, self).__init__(n_x, z_hidden + xhat_hidden, n_z, trans_func)
        self.n_x, self.n_z = n_x, n_z
        self.x_dist = x_dist
        self.batchnorm = batchnorm
        # Symbolic inputs.
        self.sym_x = T.matrix('x')
        self.sym_z = T.matrix('z')
        self.sym_samples = T.iscalar('samples')
        self._srng = RandomStreams()

        def dense(incoming, units, nonlin):
            # Dense layer whose weights and biases are both normal-initialized with std init_w.
            return DenseLayer(incoming, units, init.Normal(std=init_w),
                              init.Normal(std=init_w), nonlin)

        def gaussian_head(incoming, units, samples, nonlin=None):
            # Mean and log-variance layers on top of `incoming`, plus a layer sampling from them.
            mean_layer = dense(incoming, units, nonlin)
            logvar_layer = dense(incoming, units, nonlin)
            sampler = SampleLayer(mean_layer,
                                  logvar_layer,
                                  eq_samples=samples,
                                  iw_samples=1)
            return sampler, mean_layer, logvar_layer

        # Input
        l_x_in = InputLayer((None, n_x))

        # Inference q(z|x)
        encoder = l_x_in
        for width in z_hidden:
            encoder = dense(encoder, width, self.transf)
        l_z_x, z_mean, z_logvar = gaussian_head(encoder, n_z,
                                                self.sym_samples)

        # Reshape for density layers
        z_3d = ReshapeLayer(l_z_x, (-1, self.sym_samples, n_z))
        z_mean_3d = DimshuffleLayer(z_mean, (0, 'x', 1))
        z_logvar_3d = DimshuffleLayer(z_logvar, (0, 'x', 1))

        # Generative p(xhat|z)
        decoder = l_z_x
        for width in xhat_hidden:
            decoder = dense(decoder, width, self.transf)
        sample_shape = (-1, self.sym_samples, 1, n_x)
        if x_dist == 'bernoulli':
            # No separate mean / log-variance layers for a bernoulli output.
            xhat_mean_4d = xhat_logvar_4d = None
            decoder = dense(decoder, n_x, sigmoid)
        elif x_dist == 'gaussian':
            decoder, xhat_mean, xhat_logvar = gaussian_head(
                decoder, n_x, self.sym_samples)
            xhat_mean_4d = ReshapeLayer(xhat_mean, sample_shape)
            xhat_logvar_4d = ReshapeLayer(xhat_logvar, sample_shape)
        xhat_4d = ReshapeLayer(decoder, sample_shape)

        # Init class variables
        self.l_x_in = l_x_in
        self.l_xhat_mu = xhat_mean_4d
        self.l_xhat_logvar = xhat_logvar_4d
        self.l_xhat = xhat_4d
        self.l_z = z_3d
        self.l_z_mu = z_mean_3d
        self.l_z_logvar = z_logvar_3d
        self.model_params = get_all_params(self.l_xhat)

        # f_qz: latent output for x, averaged over axis 1.
        qz = get_output(self.l_z, self.sym_x, deterministic=True).mean(axis=1)
        self.f_qz = theano.function([self.sym_x, self.sym_samples], qz)

        # f_px: output for a given z (fed in place of the sampling layer), averaged over axes 1 and 2.
        px = get_output(self.l_xhat, {l_z_x: self.sym_z},
                        deterministic=True).mean(axis=(1, 2))
        self.f_px = theano.function([self.sym_z, self.sym_samples], px)
    def build_model(self, train_set, test_set, validation_set=None):
        """
        Build the auxiliary deep generative model from the initialized hyperparameters.
        Define the lower bound term and compile it into a training function.

        The gradients of the labeled and unlabeled bounds are computed separately for the
        parameters in self.y_params and self.xhat_params, combined with the gradients of the
        weight priors, norm-constrained, clipped and handed to adam. The test and validation
        functions report the classification error instead of the bound.

        :param train_set: Train set containing variables x, t.
        for the unlabeled data in the train set, we define 0's in t.
        :param test_set: Test set containing variables x, t.
        :param validation_set: Validation set containing variables x, t.
        When None, no validation function is compiled and None is returned in its place.
        :return: train, test, validation function and dicts of arguments.
        """
        super(ADGMSSL, self).build_model(train_set, test_set, validation_set)

        # Define the layers for the density estimation used in the lower bound.
        l_log_pa = GaussianMarginalLogDensityLayer(self.l_a_mu, self.l_a_logvar)
        l_log_pz = GaussianMarginalLogDensityLayer(self.l_z_mu, self.l_z_logvar)
        # NOTE(review): the literal 1 is passed where the p-terms above pass the mean layer; how the
        # density layer interprets it is not visible in this method - confirm against its definition.
        l_log_qa_x = GaussianMarginalLogDensityLayer(1, self.l_a_logvar)
        l_log_qz_xy = GaussianMarginalLogDensityLayer(1, self.l_z_logvar)
        l_log_qy_ax = MultinomialLogDensityLayer(self.l_y, self.l_y_in, eps=1e-8)
        # Reconstruction term; an x_dist outside these three leaves l_px_zy undefined.
        if self.x_dist == 'bernoulli':
            l_px_zy = BernoulliLogDensityLayer(self.l_xhat, self.l_x_in)
        elif self.x_dist == 'multinomial':
            l_px_zy = MultinomialLogDensityLayer(self.l_xhat, self.l_x_in)
        elif self.x_dist == 'gaussian':
            l_px_zy = GaussianLogDensityLayer(self.l_x_in, self.l_xhat_mu, self.l_xhat_logvar)

        ### Compute lower bound for labeled data ###
        out_layers = [l_log_pa, l_log_pz, l_log_qa_x, l_log_qz_xy, l_px_zy, l_log_qy_ax]
        inputs = {self.l_x_in: self.sym_x_l, self.l_y_in: self.sym_t_l}
        log_pa_l, log_pz_l, log_qa_x_l, log_qz_axy_l, log_px_zy_l, log_qy_ax_l = get_output(out_layers, inputs)
        # Softmax over all-zero logits gives the same probability to every class.
        py_l = softmax(T.zeros((self.sym_x_l.shape[0], self.n_y)))  # non-informative prior
        log_py_l = -categorical_crossentropy(py_l, self.sym_t_l).reshape((-1, 1)).dimshuffle((0, 'x', 'x', 1))
        lb_l = log_pa_l + log_pz_l + log_py_l + log_px_zy_l - log_qa_x_l - log_qz_axy_l
        # Upscale the discriminative term with a weight.
        log_qy_ax_l *= self.sym_beta
        # Gradients are taken before the discriminative term is added to lb_l: xhat_params receive the
        # gradient of the bound only, y_params the gradient of the weighted discriminative term only.
        xhat_grads_l = T.grad(lb_l.mean(axis=(1, 2)).sum(), self.xhat_params)
        y_grads_l = T.grad(log_qy_ax_l.mean(axis=(1, 2)).sum(), self.y_params)
        lb_l += log_qy_ax_l
        lb_l = lb_l.mean(axis=(1, 2))

        ### Compute lower bound for unlabeled data ###
        bs_u = self.sym_x_u.shape[0]  # size of the unlabeled data.
        t_eye = T.eye(self.n_y, k=0)  # ones in diagonal and 0's elsewhere (n_y x n_y).
        # repeat unlabeled t the number of classes for integration (bs * n_y) x n_y.
        t_u = t_eye.reshape((self.n_y, 1, self.n_y)).repeat(bs_u, axis=1).reshape((-1, self.n_y))
        # repeat unlabeled x the number of classes for integration (bs * n_y) x n_x
        x_u = self.sym_x_u.reshape((1, bs_u, self.n_x)).repeat(self.n_y, axis=0).reshape((-1, self.n_x))
        out_layers = [l_log_pa, l_log_pz, l_log_qa_x, l_log_qz_xy, l_px_zy]
        inputs = {self.l_x_in: x_u, self.l_y_in: t_u}
        log_pa_u, log_pz_u, log_qa_x_u, log_qz_axy_u, log_px_zy_u = get_output(out_layers, inputs)
        py_u = softmax(T.zeros((bs_u * self.n_y, self.n_y)))  # non-informative prior.
        log_py_u = -categorical_crossentropy(py_u, t_u).reshape((-1, 1)).dimshuffle((0, 'x', 'x', 1))
        lb_u = log_pa_u + log_pz_u + log_py_u + log_px_zy_u - log_qa_x_u - log_qz_axy_u
        # Reshape to (n_y, samples, 1, bs_u), move the batch axis first and the class axis last.
        lb_u = lb_u.reshape((self.n_y, self.sym_samples, 1, bs_u)).transpose(3, 1, 2, 0).mean(
            axis=(1, 2))  # mean over samples.
        y_ax_u = get_output(self.l_y, self.sym_x_u)
        y_ax_u = y_ax_u.mean(axis=(1, 2))  # bs x n_y
        y_ax_u += 1e-8  # ensure that we get no NANs.
        y_ax_u /= T.sum(y_ax_u, axis=1, keepdims=True)  # renormalize each row after adding the epsilon.
        # For xhat_params the bound is weighted with the class probabilities, without the entropy term.
        xhat_grads_u = T.grad((y_ax_u * lb_u).sum(axis=1).sum(), self.xhat_params)
        # Sum over classes of the probability-weighted bound minus y * log(y).
        lb_u = (y_ax_u * (lb_u - T.log(y_ax_u))).sum(axis=1)
        y_grads_u = T.grad(lb_u.sum(), self.y_params)

        # Loss - regularizing with weight priors p(theta|N(0,1)) and clipping gradients
        y_weight_priors = 0.0
        for p in self.y_params:
            # Only parameters whose string form contains 'W' contribute to the prior.
            if 'W' not in str(p):
                continue
            y_weight_priors += log_normal(p, 0, 1).sum()
        # Parameters skipped above are disconnected from the prior; 'ignore' avoids an error for them.
        y_weight_priors_grad = T.grad(y_weight_priors, self.y_params, disconnected_inputs='ignore')

        xhat_weight_priors = 0.0
        for p in self.xhat_params:
            if 'W' not in str(p):
                continue
            xhat_weight_priors += log_normal(p, 0, 1).sum()
        xhat_weight_priors_grad = T.grad(xhat_weight_priors, self.xhat_params, disconnected_inputs='ignore')

        n = self.sh_train_x.shape[0].astype(theano.config.floatX)  # no. of data points in train set
        n_b = n / self.sym_batchsize.astype(theano.config.floatX)  # no. of batches in train set
        # The zero tensors are placeholders; every entry is overwritten in the loop below.
        y_grads = [T.zeros(p.shape) for p in self.y_params]
        for i in range(len(y_grads)):
            y_grads[i] = (y_grads_l[i] + y_grads_u[i])
            y_grads[i] *= n_b
            y_grads[i] += y_weight_priors_grad[i]
            # Dividing by -n both scales by the train set size and flips the sign of the gradient.
            y_grads[i] /= -n

        xhat_grads = [T.zeros(p.shape) for p in self.xhat_params]
        for i in range(len(xhat_grads)):
            xhat_grads[i] = (xhat_grads_l[i] + xhat_grads_u[i])
            xhat_grads[i] *= n_b
            xhat_grads[i] += xhat_weight_priors_grad[i]
            xhat_grads[i] /= -n

        # Keep parameters and gradients in the same order (y first, then xhat).
        params = self.y_params + self.xhat_params
        grads = y_grads + xhat_grads

        # Collect the lower bound and scale it with the weight priors.
        elbo = ((lb_l.sum() + lb_u.sum()) * n_b + y_weight_priors + xhat_weight_priors) / -n

        # Avoid vanishing and exploding gradients.
        clip_grad, max_norm = 1, 5
        mgrads = total_norm_constraint(grads, max_norm=max_norm)
        mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
        sym_beta1 = T.scalar('beta1')
        sym_beta2 = T.scalar('beta2')
        # adam is given the precomputed gradients, not the scalar elbo.
        updates = adam(mgrads, params, self.sym_lr, sym_beta1, sym_beta2)

        ### Compile training function ###
        # The first sym_bs_l rows of each batch slice are used as labeled data, the rest as unlabeled.
        x_batch_l = self.sh_train_x[self.batch_slice][:self.sym_bs_l]
        x_batch_u = self.sh_train_x[self.batch_slice][self.sym_bs_l:]
        t_batch_l = self.sh_train_t[self.batch_slice][:self.sym_bs_l]
        if self.x_dist == 'bernoulli':  # Sample bernoulli input.
            x_batch_u = self._srng.binomial(size=x_batch_u.shape, n=1, p=x_batch_u, dtype=theano.config.floatX)
            x_batch_l = self._srng.binomial(size=x_batch_l.shape, n=1, p=x_batch_l, dtype=theano.config.floatX)
        givens = {self.sym_x_l: x_batch_l,
                  self.sym_x_u: x_batch_u,
                  self.sym_t_l: t_batch_l}
        inputs = [self.sym_index, self.sym_batchsize, self.sym_bs_l, self.sym_beta,
                  self.sym_lr, sym_beta1, sym_beta2, self.sym_samples]
        f_train = theano.function(inputs=inputs, outputs=[elbo], givens=givens, updates=updates)
        # Default training args. Note that these can be changed during or prior to training.
        self.train_args['inputs']['batchsize'] = 200
        self.train_args['inputs']['batchsize_labeled'] = 100
        self.train_args['inputs']['beta'] = 1200.
        self.train_args['inputs']['learningrate'] = 3e-4
        self.train_args['inputs']['beta1'] = 0.9
        self.train_args['inputs']['beta2'] = 0.999
        self.train_args['inputs']['samples'] = 1
        self.train_args['outputs']['lb'] = '%0.4f'

        ### Compile testing function ###
        # The test function reports the classification error on the whole test set.
        class_err_test = self._classification_error(self.sym_x_l, self.sym_t_l)
        givens = {self.sym_x_l: self.sh_test_x,
                  self.sym_t_l: self.sh_test_t}
        f_test = theano.function(inputs=[self.sym_samples], outputs=[class_err_test], givens=givens)
        # Testing args.  Note that these can be changed during or prior to training.
        self.test_args['inputs']['samples'] = 1
        self.test_args['outputs']['err'] = '%0.2f%%'

        ### Compile validation function ###
        f_validate = None
        if validation_set is not None:
            class_err_valid = self._classification_error(self.sym_x_l, self.sym_t_l)
            givens = {self.sym_x_l: self.sh_valid_x,
                      self.sym_t_l: self.sh_valid_t}
            # NOTE(review): this list is assigned but the call below builds its own inputs list.
            inputs = [self.sym_samples]
            f_validate = theano.function(inputs=[self.sym_samples], outputs=[class_err_valid], givens=givens)
        # Default validation args. Note that these can be changed during or prior to training.
        # They are set even when no validation set is given (this code is outside the if above).
        self.validate_args['inputs']['samples'] = 1
        self.validate_args['outputs']['err'] = '%0.2f%%'

        return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
Example #18
0
    def build_model(self,
                    train_set_unlabeled,
                    train_set_labeled,
                    test_set,
                    validation_set=None):
        """
        Build the auxiliary deep generative model from the initialized hyperparameters.
        Define the lower bound term and compile it into a training function.
        :param train_set_unlabeled: Unlabeled train set containing variables x, t.
        :param train_set_labeled: Unlabeled train set containing variables x, t.
        :param test_set: Test set containing variables x, t.
        :param validation_set: Validation set containing variables x, t.
        :return: train, test, validation function and dicts of arguments.
        """
        super(CSDGM, self).build_model(train_set_unlabeled, test_set,
                                       validation_set)

        sh_train_x_l = theano.shared(np.asarray(train_set_labeled[0],
                                                dtype=theano.config.floatX),
                                     borrow=True)
        sh_train_t_l = theano.shared(np.asarray(train_set_labeled[1],
                                                dtype=theano.config.floatX),
                                     borrow=True)
        n = self.sh_train_x.shape[0].astype(
            theano.config.floatX)  # no. of data points
        n_l = sh_train_x_l.shape[0].astype(
            theano.config.floatX)  # no. of labeled data points

        # Define the layers for the density estimation used in the lower bound.
        l_log_qa = GaussianLogDensityLayer(self.l_qa, self.l_qa_mu,
                                           self.l_qa_logvar)
        l_log_qz = GaussianLogDensityLayer(self.l_qz, self.l_qz_mu,
                                           self.l_qz_logvar)
        l_log_qy = MultinomialLogDensityLayer(self.l_qy, self.l_y_in, eps=1e-8)

        l_log_pz = StandardNormalLogDensityLayer(self.l_qz)
        l_log_pa = GaussianLogDensityLayer(self.l_qa, self.l_pa_mu,
                                           self.l_pa_logvar)

        l_x_in = ReshapeLayer(self.l_x_in, (-1, self.n_l * self.n_c))
        l_px = DimshuffleLayer(self.l_px, (0, 3, 1, 2, 4))
        l_px = ReshapeLayer(l_px, (-1, self.sym_samples, 1, self.n_c))
        if self.x_dist == 'bernoulli':
            l_log_px = BernoulliLogDensityLayer(self.l_px, self.l_x_in)
        elif self.x_dist == 'multinomial':
            l_log_px = MultinomialLogDensityLayer(l_px, l_x_in)
            l_log_px = ReshapeLayer(l_log_px, (-1, self.n_l, 1, 1, 1))
            l_log_px = MeanLayer(l_log_px, axis=1)
        elif self.x_dist == 'gaussian':
            l_px_mu = ReshapeLayer(
                DimshuffleLayer(self.l_px_mu, (0, 2, 3, 1, 4)),
                (-1, self.sym_samples, 1, self.n_l * self.n_c))
            l_px_logvar = ReshapeLayer(
                DimshuffleLayer(self.l_px_logvar, (0, 2, 3, 1, 4)),
                (-1, self.sym_samples, 1, self.n_l * self.n_c))
            l_log_px = GaussianLogDensityLayer(l_x_in, l_px_mu, l_px_logvar)

        def lower_bound(log_pa, log_qa, log_pz, log_qz, log_py, log_px):
            lb = log_px + log_py + (log_pz + log_pa - log_qa -
                                    log_qz) * (1.1 - self.sym_warmup)
            return lb

        # Lower bound for labeled data
        out_layers = [
            l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px, l_log_qy
        ]
        inputs = {self.l_x_in: self.sym_x_l, self.l_y_in: self.sym_t_l}
        out = get_output(out_layers,
                         inputs,
                         batch_norm_update_averages=False,
                         batch_norm_use_averages=False)
        log_pa_l, log_pz_l, log_qa_x_l, log_qz_axy_l, log_px_zy_l, log_qy_ax_l = out

        # Prior p(y) expecting that all classes are evenly distributed
        py_l = softmax(T.zeros((self.sym_x_l.shape[0], self.n_y)))
        log_py_l = -categorical_crossentropy(py_l, self.sym_t_l).reshape(
            (-1, 1)).dimshuffle((0, 'x', 'x', 1))
        lb_l = lower_bound(log_pa_l, log_qa_x_l, log_pz_l, log_qz_axy_l,
                           log_py_l, log_px_zy_l)
        lb_l = lb_l.mean(axis=(1, 2))  # Mean over the sampling dimensions
        log_qy_ax_l *= (
            self.sym_beta * (n / n_l)
        )  # Scale the supervised cross entropy with the alpha constant
        lb_l += log_qy_ax_l.mean(axis=(
            1, 2
        ))  # Collect the lower bound term and mean over sampling dimensions

        # Lower bound for unlabeled data
        bs_u = self.sym_x_u.shape[0]

        # For the integrating out approach, we repeat the input matrix x, and construct a target (bs * n_y) x n_y
        # Example of input and target matrix for a 3 class problem and batch_size=2. 2D tensors of the form
        #               x_repeat                     t_repeat
        #  [[x[0,0], x[0,1], ..., x[0,n_x]]         [[1, 0, 0]
        #   [x[1,0], x[1,1], ..., x[1,n_x]]          [1, 0, 0]
        #   [x[0,0], x[0,1], ..., x[0,n_x]]          [0, 1, 0]
        #   [x[1,0], x[1,1], ..., x[1,n_x]]          [0, 1, 0]
        #   [x[0,0], x[0,1], ..., x[0,n_x]]          [0, 0, 1]
        #   [x[1,0], x[1,1], ..., x[1,n_x]]]         [0, 0, 1]]
        t_eye = T.eye(self.n_y, k=0)
        t_u = t_eye.reshape((self.n_y, 1, self.n_y)).repeat(bs_u,
                                                            axis=1).reshape(
                                                                (-1, self.n_y))
        x_u = self.sym_x_u.reshape(
            (1, bs_u, self.n_l, self.n_c)).repeat(self.n_y, axis=0).reshape(
                (-1, self.n_l, self.n_c))

        # Since the expectation of var a is outside the integration we calculate E_q(a|x) first
        a_x_u = get_output(self.l_qa,
                           self.sym_x_u,
                           batch_norm_update_averages=True,
                           batch_norm_use_averages=False)
        a_x_u_rep = a_x_u.reshape(
            (1, bs_u * self.sym_samples, self.n_a)).repeat(self.n_y,
                                                           axis=0).reshape(
                                                               (-1, self.n_a))
        out_layers = [l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px]
        inputs = {self.l_x_in: x_u, self.l_y_in: t_u, self.l_a_in: a_x_u_rep}
        out = get_output(out_layers,
                         inputs,
                         batch_norm_update_averages=False,
                         batch_norm_use_averages=False)
        log_pa_u, log_pz_u, log_qa_x_u, log_qz_axy_u, log_px_zy_u = out

        # Prior p(y) expecting that all classes are evenly distributed
        py_u = softmax(T.zeros((bs_u * self.n_y, self.n_y)))
        log_py_u = -categorical_crossentropy(py_u, t_u).reshape(
            (-1, 1)).dimshuffle((0, 'x', 'x', 1))
        lb_u = lower_bound(log_pa_u, log_qa_x_u, log_pz_u, log_qz_axy_u,
                           log_py_u, log_px_zy_u)
        lb_u = lb_u.reshape(
            (self.n_y, 1, 1, bs_u)).transpose(3, 1, 2, 0).mean(axis=(1, 2))
        inputs = {
            self.l_x_in: self.sym_x_u,
            self.l_a_in: a_x_u.reshape((-1, self.n_a))
        }
        y_u = get_output(self.l_qy,
                         inputs,
                         batch_norm_update_averages=True,
                         batch_norm_use_averages=False).mean(axis=(1, 2))
        y_u += 1e-8  # Ensure that we get no NANs when calculating the entropy
        y_u /= T.sum(y_u, axis=1, keepdims=True)
        lb_u = (y_u * (lb_u - T.log(y_u))).sum(axis=1)

        # Regularizing with weight priors p(theta|N(0,1)), collecting and clipping gradients
        weight_priors = 0.0
        for p in self.trainable_model_params:
            if 'W' not in str(p):
                continue
            weight_priors += log_normal(p, 0, 1).sum()

        # Collect the lower bound and scale it with the weight priors.
        elbo = ((lb_l.mean() + lb_u.mean()) * n + weight_priors) / -n
        lb_labeled = -lb_l.mean()
        lb_unlabeled = -lb_u.mean()
        log_px = log_px_zy_l.mean() + log_px_zy_u.mean()
        log_pz = log_pz_l.mean() + log_pz_u.mean()
        log_qz = log_qz_axy_l.mean() + log_qz_axy_u.mean()
        log_pa = log_pa_l.mean() + log_pa_u.mean()
        log_qa = log_qa_x_l.mean() + log_qa_x_u.mean()

        grads_collect = T.grad(elbo, self.trainable_model_params)
        params_collect = self.trainable_model_params
        sym_beta1 = T.scalar('beta1')
        sym_beta2 = T.scalar('beta2')
        clip_grad, max_norm = 1, 5
        mgrads = total_norm_constraint(grads_collect, max_norm=max_norm)
        mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
        updates = adam(mgrads, params_collect, self.sym_lr, sym_beta1,
                       sym_beta2)

        # Training function
        indices = self._srng.choice(size=[self.sym_bs_l],
                                    a=sh_train_x_l.shape[0],
                                    replace=False)
        x_batch_l = sh_train_x_l[indices]
        t_batch_l = sh_train_t_l[indices]
        x_batch_u = self.sh_train_x[self.batch_slice]
        if self.x_dist == 'bernoulli':  # Sample bernoulli input.
            x_batch_u = self._srng.binomial(size=x_batch_u.shape,
                                            n=1,
                                            p=x_batch_u,
                                            dtype=theano.config.floatX)
            x_batch_l = self._srng.binomial(size=x_batch_l.shape,
                                            n=1,
                                            p=x_batch_l,
                                            dtype=theano.config.floatX)

        givens = {
            self.sym_x_l: x_batch_l,
            self.sym_x_u: x_batch_u,
            self.sym_t_l: t_batch_l
        }
        inputs = [
            self.sym_index, self.sym_batchsize, self.sym_bs_l, self.sym_beta,
            self.sym_lr, sym_beta1, sym_beta2, self.sym_samples,
            self.sym_warmup
        ]
        outputs = [
            elbo, lb_labeled, lb_unlabeled, log_px, log_pz, log_qz, log_pa,
            log_qa
        ]
        f_train = theano.function(inputs=inputs,
                                  outputs=outputs,
                                  givens=givens,
                                  updates=updates)

        # Default training args. Note that these can be changed during or prior to training.
        self.train_args['inputs']['batchsize_unlabeled'] = 100
        self.train_args['inputs']['batchsize_labeled'] = 100
        self.train_args['inputs']['beta'] = 0.1
        self.train_args['inputs']['learningrate'] = 3e-4
        self.train_args['inputs']['beta1'] = 0.9
        self.train_args['inputs']['beta2'] = 0.999
        self.train_args['inputs']['samples'] = 1
        self.train_args['inputs']['warmup'] = 0.1
        self.train_args['outputs']['lb'] = '%0.3f'
        self.train_args['outputs']['lb-l'] = '%0.3f'
        self.train_args['outputs']['lb-u'] = '%0.3f'
        self.train_args['outputs']['px'] = '%0.3f'
        self.train_args['outputs']['pz'] = '%0.3f'
        self.train_args['outputs']['qz'] = '%0.3f'
        self.train_args['outputs']['pa'] = '%0.3f'
        self.train_args['outputs']['qa'] = '%0.3f'

        # Validation and test function
        y = get_output(self.l_qy, self.sym_x_l,
                       deterministic=True).mean(axis=(1, 2))
        class_err = (1. - categorical_accuracy(y, self.sym_t_l).mean()) * 100
        givens = {self.sym_x_l: self.sh_test_x, self.sym_t_l: self.sh_test_t}
        f_test = theano.function(inputs=[self.sym_samples],
                                 outputs=[class_err],
                                 givens=givens)

        # Test args.  Note that these can be changed during or prior to training.
        self.test_args['inputs']['samples'] = 1
        self.test_args['outputs']['test'] = '%0.2f%%'

        f_validate = None
        if validation_set is not None:
            givens = {
                self.sym_x_l: self.sh_valid_x,
                self.sym_t_l: self.sh_valid_t
            }
            f_validate = theano.function(inputs=[self.sym_samples],
                                         outputs=[class_err],
                                         givens=givens)
            # Default validation args. Note that these can be changed during or prior to training.
            self.validate_args['inputs']['samples'] = 1
            self.validate_args['outputs']['validation'] = '%0.2f%%'

        return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
    def __init__(self, n_x, n_a, n_z, n_y, a_hidden, z_hidden, xhat_hidden, y_hidden, trans_func=rectify,
                 x_dist='bernoulli'):
        """
        Initialize an auxiliary deep generative model consisting of
        discriminative classifier q(y|a,x),
        generative model P p(xhat|z,y),
        inference model Q q(a|x) and q(z|x,y).
        All weights are initialized using the Glorot and Bengio (2010) initialization scheme.
        :param n_x: Number of inputs.
        :param n_a: Number of auxiliary.
        :param n_z: Number of latent.
        :param n_y: Number of classes.
        :param a_hidden: List of number of deterministic hidden q(a|x).
        :param z_hidden: List of number of deterministic hidden q(z|x,y). Must be non-empty.
        :param xhat_hidden: List of number of deterministic hidden p(xhat|z,y). Must be non-empty.
        :param y_hidden: List of number of deterministic hidden q(y|a,x). Must be non-empty.
        :param trans_func: The transfer function used in the deterministic layers.
        :param x_dist: The x distribution, 'bernoulli', 'multinomial' or 'gaussian'.
        :raises ValueError: If x_dist is not one of the supported distributions.
        """
        # Fail fast: an unknown distribution would otherwise skip every output branch below and only
        # surface later as an UnboundLocalError when the (never assigned) mu/logvar layers are stored.
        if x_dist not in ('bernoulli', 'multinomial', 'gaussian'):
            raise ValueError("Unsupported x_dist %r; expected 'bernoulli', 'multinomial' or 'gaussian'." % (x_dist,))

        super(ADGMSSL, self).__init__(n_x, a_hidden + z_hidden + xhat_hidden, n_a + n_z, trans_func)
        self.y_hidden = y_hidden
        self.x_dist = x_dist
        self.n_y = n_y
        self.n_x = n_x
        self.n_a = n_a
        self.n_z = n_z

        self._srng = RandomStreams()

        self.sym_beta = T.scalar('beta')  # symbolic upscaling of the discriminative term.
        self.sym_x_l = T.matrix('x')  # symbolic labeled inputs
        self.sym_t_l = T.matrix('t')  # symbolic labeled targets
        self.sym_x_u = T.matrix('x')  # symbolic unlabeled inputs
        self.sym_bs_l = T.iscalar('bs_l')  # symbolic number of labeled data_preparation points in batch
        self.sym_samples = T.iscalar('samples')  # symbolic number of Monte Carlo samples
        self.sym_y = T.matrix('y')
        self.sym_z = T.matrix('z')

        ### Input layers ###
        l_x_in = InputLayer((None, n_x))
        l_y_in = InputLayer((None, n_y))

        ### Auxiliary q(a|x) ###
        # NOTE(review): self.transf is not assigned in this method; presumably the base class
        # sets it from trans_func - confirm.
        l_a_x = l_x_in
        for hid in a_hidden:
            l_a_x = DenseLayer(l_a_x, hid, init.GlorotNormal('relu'), init.Normal(1e-3), self.transf)
        l_a_x_mu = DenseLayer(l_a_x, n_a, init.GlorotNormal(), init.Normal(1e-3), None)
        l_a_x_logvar = DenseLayer(l_a_x, n_a, init.GlorotNormal(), init.Normal(1e-3), None)
        l_a_x = SampleLayer(l_a_x_mu, l_a_x_logvar, eq_samples=self.sym_samples)
        # Reshape all layers to align them for multiple samples in the lower bound calculation.
        l_a_x_reshaped = ReshapeLayer(l_a_x, (-1, self.sym_samples, 1, n_a))
        l_a_x_mu_reshaped = DimshuffleLayer(l_a_x_mu, (0, 'x', 'x', 1))
        l_a_x_logvar_reshaped = DimshuffleLayer(l_a_x_logvar, (0, 'x', 'x', 1))

        ### Classifier q(y|a,x) ###
        # Combine the input x and the sampled auxiliary a by summing a separate linear projection
        # of each; the x projection is broadcast over the sample axes before the sum.
        l_a_to_y = DenseLayer(l_a_x, y_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None)
        l_a_to_y = ReshapeLayer(l_a_to_y, (-1, self.sym_samples, 1, y_hidden[0]))
        l_x_to_y = DenseLayer(l_x_in, y_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None)
        l_x_to_y = DimshuffleLayer(l_x_to_y, (0, 'x', 'x', 1))
        l_y_xa = ReshapeLayer(ElemwiseSumLayer([l_a_to_y, l_x_to_y]), (-1, y_hidden[0]))
        l_y_xa = NonlinearityLayer(l_y_xa, self.transf)

        # Remaining hidden layers (the slice is empty when only one size was given).
        for hid in y_hidden[1:]:
            l_y_xa = DenseLayer(l_y_xa, hid, init.GlorotUniform('relu'), init.Normal(1e-3), self.transf)
        l_y_xa = DenseLayer(l_y_xa, n_y, init.GlorotUniform(), init.Normal(1e-3), softmax)
        l_y_xa_reshaped = ReshapeLayer(l_y_xa, (-1, self.sym_samples, 1, n_y))

        ### Recognition q(z|x,y) ###
        # Combine the input x and y by summing a separate linear projection of each.
        l_x_to_z = DenseLayer(l_x_in, z_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None)
        l_x_to_z = DimshuffleLayer(l_x_to_z, (0, 'x', 'x', 1))
        l_y_to_z = DenseLayer(l_y_in, z_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None)
        l_y_to_z = DimshuffleLayer(l_y_to_z, (0, 'x', 'x', 1))
        l_z_xy = ReshapeLayer(ElemwiseSumLayer([l_x_to_z, l_y_to_z]), [-1, z_hidden[0]])
        l_z_xy = NonlinearityLayer(l_z_xy, self.transf)

        for hid in z_hidden[1:]:
            l_z_xy = DenseLayer(l_z_xy, hid, init.GlorotNormal('relu'), init.Normal(1e-3), self.transf)
        l_z_axy_mu = DenseLayer(l_z_xy, n_z, init.GlorotNormal(), init.Normal(1e-3), None)
        l_z_axy_logvar = DenseLayer(l_z_xy, n_z, init.GlorotNormal(), init.Normal(1e-3), None)
        l_z_xy = SampleLayer(l_z_axy_mu, l_z_axy_logvar, eq_samples=self.sym_samples)
        # Reshape all layers to align them for multiple samples in the lower bound calculation.
        l_z_axy_mu_reshaped = DimshuffleLayer(l_z_axy_mu, (0, 'x', 'x', 1))
        l_z_axy_logvar_reshaped = DimshuffleLayer(l_z_axy_logvar, (0, 'x', 'x', 1))
        l_z_axy_reshaped = ReshapeLayer(l_z_xy, (-1, self.sym_samples, 1, n_z))

        ### Generative p(xhat|z,y) ###
        # Combine the sampled z and y by summing a separate linear projection of each.
        l_y_to_xhat = DenseLayer(l_y_in, xhat_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None)
        l_y_to_xhat = DimshuffleLayer(l_y_to_xhat, (0, 'x', 'x', 1))
        l_z_to_xhat = DenseLayer(l_z_xy, xhat_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None)
        l_z_to_xhat = ReshapeLayer(l_z_to_xhat, (-1, self.sym_samples, 1, xhat_hidden[0]))
        l_xhat_zy = ReshapeLayer(ElemwiseSumLayer([l_z_to_xhat, l_y_to_xhat]), [-1, xhat_hidden[0]])
        l_xhat_zy = NonlinearityLayer(l_xhat_zy, self.transf)
        for hid in xhat_hidden[1:]:
            l_xhat_zy = DenseLayer(l_xhat_zy, hid, init.GlorotNormal('relu'), init.Normal(1e-3), self.transf)
        # Output layer; only the gaussian case exposes separate mu/logvar layers.
        if x_dist == 'bernoulli':
            l_xhat_zy_mu_reshaped = None
            l_xhat_zy_logvar_reshaped = None
            l_xhat_zy = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(), init.Normal(1e-3), sigmoid)
        elif x_dist == 'multinomial':
            l_xhat_zy_mu_reshaped = None
            l_xhat_zy_logvar_reshaped = None
            l_xhat_zy = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(), init.Normal(1e-3), softmax)
        elif x_dist == 'gaussian':
            l_xhat_zy_mu = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(), init.Normal(1e-3), None)
            l_xhat_zy_logvar = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(), init.Normal(1e-3), None)
            l_xhat_zy = SampleLayer(l_xhat_zy_mu, l_xhat_zy_logvar, eq_samples=1)
            l_xhat_zy_mu_reshaped = ReshapeLayer(l_xhat_zy_mu, (-1, self.sym_samples, 1, n_x))
            l_xhat_zy_logvar_reshaped = ReshapeLayer(l_xhat_zy_logvar, (-1, self.sym_samples, 1, n_x))
        l_xhat_zy_reshaped = ReshapeLayer(l_xhat_zy, (-1, self.sym_samples, 1, n_x))

        ### Various class variables ###
        self.l_x_in = l_x_in
        self.l_y_in = l_y_in
        self.l_a_mu = l_a_x_mu_reshaped
        self.l_a_logvar = l_a_x_logvar_reshaped
        self.l_a = l_a_x_reshaped
        self.l_z_mu = l_z_axy_mu_reshaped
        self.l_z_logvar = l_z_axy_logvar_reshaped
        self.l_z = l_z_axy_reshaped
        self.l_y = l_y_xa_reshaped
        self.l_xhat_mu = l_xhat_zy_mu_reshaped
        self.l_xhat_logvar = l_xhat_zy_logvar_reshaped
        self.l_xhat = l_xhat_zy_reshaped

        self.model_params = get_all_params([self.l_xhat, self.l_y])

        ### Calculate networks shapes for documentation ###
        # NOTE(review): the slices below presumably strip the shapes of the upstream network
        # (q(a|x) resp. q(z|x,y)) from the combined parameter list - confirm against get_model_shape.
        self.qa_shapes = self.get_model_shape(get_all_params(l_a_x))
        self.qy_shapes = self.get_model_shape(get_all_params(l_y_xa))[len(self.qa_shapes) - 1:]
        self.qz_shapes = self.get_model_shape(get_all_params(l_z_xy))
        self.px_shapes = self.get_model_shape(get_all_params(l_xhat_zy))[(len(self.qz_shapes) - 1):]

        ### Predefined functions for generating xhat and y ###
        # f_xhat feeds z directly in place of the q(z|x,y) sample layer and averages over the sample axes.
        inputs = {l_z_xy: self.sym_z, self.l_y_in: self.sym_y}
        outputs = get_output(self.l_xhat, inputs, deterministic=True).mean(axis=(1, 2))
        inputs = [self.sym_z, self.sym_y, self.sym_samples]
        self.f_xhat = theano.function(inputs, outputs)

        # f_y maps inputs to class probabilities averaged over the sample axes.
        inputs = [self.sym_x_l, self.sym_samples]
        outputs = get_output(self.l_y, self.sym_x_l, deterministic=True).mean(axis=(1, 2))
        self.f_y = theano.function(inputs, outputs)

        # Skip the first (len(a_hidden) + 2) * 2 trainable parameters.
        # NOTE(review): presumably the W/b pairs of the q(a|x) hidden, mu and logvar layers - verify ordering.
        self.y_params = get_all_params(self.l_y, trainable=True)[(len(a_hidden) + 2) * 2::]
        self.xhat_params = get_all_params(self.l_xhat, trainable=True)
    def __init__(self,
                 n_c,
                 px_hid,
                 enc_rnn=256,
                 dec_rnn=256,
                 n_l=50,
                 nonlinearity=rectify,
                 batchnorm=False,
                 seed=1234):
        """
        Build a recurrent autoencoder: a bidirectional LSTM encoder, a
        bidirectional LSTM decoder and a stack of dense output layers.
        Weights are initialized using the Glorot and Bengio (2010) initialization scheme.
        :param n_c: Number of input features per time step (also the output size).
        :param px_hid: List of number of deterministic hidden units after the decoder.
        :param enc_rnn: Number of units in each encoder LSTM and in the encoding layer.
        :param dec_rnn: Number of units in each decoder LSTM and in the first decoder dense layer.
        :param n_l: Number of time steps the encoding is repeated for in the decoder.
        :param nonlinearity: The transfer function used in the deterministic layers.
        :param batchnorm: Boolean value for batch normalization.
        :param seed: The random seed.
        """
        super(RAE, self).__init__(n_c, px_hid, enc_rnn, nonlinearity)
        self.n_x = n_c
        self.max_seq_length = n_l
        self.batchnorm = batchnorm
        self._srng = RandomStreams(seed)

        # Decide Glorot initialization of weights: use the 'relu' gain for
        # rectifier-like units and the neutral gain 1.0 otherwise (an empty
        # string is not a valid gain and breaks weight sampling).
        init_w = 1e-3
        hid_w = "relu" if nonlinearity in (rectify, softplus) else 1.0

        # Define symbolic variables for theano functions.
        self.sym_x = T.tensor3('x')  # inputs

        # Assist methods for collecting the layers
        def dense_layer(layer_in,
                        n,
                        dist_w=init.GlorotNormal,
                        dist_b=init.Normal):
            """Linear dense layer, optional batch norm, then the transfer function."""
            # NOTE(review): self.transf is not assigned in this method;
            # presumably the base class sets it from nonlinearity - confirm.
            dense = DenseLayer(layer_in,
                               n,
                               dist_w(hid_w),
                               dist_b(init_w),
                               nonlinearity=None)
            if batchnorm:
                dense = BatchNormLayer(dense)
            return NonlinearityLayer(dense, self.transf)

        def lstm_layer(layer_in,
                       nunits,
                       return_final,
                       backwards=False,
                       name='LSTM'):
            """LSTM without peepholes; the forget gate bias starts at 5.0."""
            ingate = Gate(W_in=init.Uniform(0.01),
                          W_hid=init.Uniform(0.01),
                          b=init.Constant(0.0))
            forgetgate = Gate(W_in=init.Uniform(0.01),
                              W_hid=init.Uniform(0.01),
                              b=init.Constant(5.0))
            cell = Gate(
                W_cell=None,
                nonlinearity=T.tanh,
                W_in=init.Uniform(0.01),
                W_hid=init.Uniform(0.01),
            )
            outgate = Gate(W_in=init.Uniform(0.01),
                           W_hid=init.Uniform(0.01),
                           b=init.Constant(0.0))

            return LSTMLayer(layer_in,
                             num_units=nunits,
                             backwards=backwards,
                             peepholes=False,
                             ingate=ingate,
                             forgetgate=forgetgate,
                             cell=cell,
                             outgate=outgate,
                             name=name,
                             only_return_final=return_final)

        # RNN encoder implementation: forward and backward LSTMs over the
        # input, keeping only their final states.
        l_x_in = InputLayer((None, None, n_c))
        l_enc_forward = lstm_layer(l_x_in,
                                   enc_rnn,
                                   return_final=True,
                                   backwards=False,
                                   name='enc_forward')
        l_enc_backward = lstm_layer(l_x_in,
                                    enc_rnn,
                                    return_final=True,
                                    backwards=True,
                                    name='enc_backward')
        l_enc_concat = ConcatLayer([l_enc_forward, l_enc_backward], axis=-1)
        l_enc = dense_layer(l_enc_concat, enc_rnn)

        # RNN decoder implementation: the encoding is repeated n_l times and
        # run through forward and backward LSTMs that return every step.
        l_dec_repeat = RepeatLayer(l_enc, n=n_l)
        l_dec_forward = lstm_layer(l_dec_repeat,
                                   dec_rnn,
                                   return_final=False,
                                   backwards=False,
                                   name='dec_forward')
        l_dec_backward = lstm_layer(l_dec_repeat,
                                    dec_rnn,
                                    return_final=False,
                                    backwards=True,
                                    name='dec_backward')
        l_dec_concat = ConcatLayer([l_dec_forward, l_dec_backward], axis=-1)
        # Flatten batch and time so the dense layers act on every step.
        l_dec = ReshapeLayer(l_dec_concat, (-1, 2 * dec_rnn))
        l_dec = dense_layer(l_dec, dec_rnn)

        # Generative p(x_hat|x)
        l_px = l_dec
        for hid in px_hid:
            l_px = dense_layer(l_px, hid)

        # Output
        self.l_enc = l_enc

        l_px = DenseLayer(l_px, n_c, nonlinearity=None)
        self.l_px = ReshapeLayer(l_px, (-1, n_l, n_c))
        self.l_x_in = l_x_in

        inputs = {l_x_in: self.sym_x}
        outputs = get_output(self.l_px, inputs, deterministic=True)
        self.f_px = theano.function([self.sym_x],
                                    outputs,
                                    on_unused_input='warn')

        # Define model parameters
        # NOTE(review): encoder_params holds the values returned by
        # get_all_param_values at construction time, not the parameter
        # variables themselves - confirm this is what callers expect.
        self.encoder_params = get_all_param_values(self.l_enc)
        self.model_params = get_all_params(self.l_px)
        self.trainable_model_params = get_all_params(self.l_px, trainable=True)
 def _classification_error(self, x, t):
     """
     Build the symbolic classification error in percent.
     :param x: Symbolic input passed through the classifier self.l_y.
     :param t: Symbolic targets; the class is taken as the argmax along axis 1.
     :return: Symbolic scalar, misclassified rows divided by t.shape[0], times 100.
     """
     float_x = theano.config.floatX
     # Deterministic forward pass, averaged over axes 1 and 2 (mean over samples).
     probs = get_output(self.l_y, x, deterministic=True).mean(axis=(1, 2))
     expected = T.argmax(t, axis=1)
     predicted = T.argmax(probs, axis=1)
     n_wrong = T.sum(T.neq(predicted, expected)).astype(float_x)
     n_total = t.shape[0].astype(float_x)
     return (n_wrong / n_total) * 100.
    def __init__(self,
                 n_x,
                 n_z,
                 qz_hid,
                 px_hid,
                 filters,
                 seq_length=50,
                 nonlinearity=rectify,
                 px_nonlinearity=None,
                 x_dist='linear',
                 batchnorm=False,
                 seed=1234):
        """
        Build a convolutional variational autoencoder over sequences.
        Weights are initialized using the Glorot and Bengio (2010) initialization scheme.
        :param n_x: Number of inputs (features per time step).
        :param n_z: Number of latent.
        :param qz_hid: List of number of deterministic hidden q(z|x).
        :param px_hid: List of hidden sizes for p(x|z); only forwarded to the base class here.
        :param filters: Sequence of (num_filters, stride, pool) triples, one per
            encoder convolution; the decoder walks them in reverse order.
        :param seq_length: Number of time steps in every input sequence.
        :param nonlinearity: The transfer function used in the deterministic layers.
        :param px_nonlinearity: Nonlinearity of the mu/logvar layers when x_dist is 'gaussian'.
        :param x_dist: The x distribution, 'bernoulli', 'multinomial', 'gaussian'
            or 'linear'. Any other value adds no final dense output layer.
        :param batchnorm: Boolean value for batch normalization.
        :param seed: The random seed.
        """
        super(CVAE, self).__init__(n_x, qz_hid + px_hid, n_z, nonlinearity)
        self.x_dist = x_dist
        self.n_x = n_x
        self.seq_length = seq_length
        self.n_z = n_z
        self.batchnorm = batchnorm
        self._srng = RandomStreams(seed)

        # Pool layer cache, filled by conv_layer in encoder order.
        pool_layers = []

        # Decide Glorot initialization of weights: use the 'relu' gain for
        # rectifier-like units and the neutral gain 1.0 otherwise (an empty
        # string is not a valid gain and breaks weight sampling).
        init_w = 1e-3
        hid_w = "relu" if nonlinearity in (rectify, softplus) else 1.0

        # Define symbolic variables for theano functions.
        self.sym_x = T.tensor3('x')  # inputs
        self.sym_z = T.matrix('z')
        self.sym_samples = T.iscalar('samples')  # MC samples

        # Assist methods for collecting the layers
        def dense_layer(layer_in,
                        n,
                        dist_w=init.GlorotNormal,
                        dist_b=init.Normal):
            """Linear dense layer, optional batch norm, then the transfer function."""
            # NOTE(review): self.transf is not assigned in this method;
            # presumably the base class sets it from nonlinearity - confirm.
            dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w),
                               None)
            if batchnorm:
                dense = bn(dense)
            return NonlinearityLayer(dense, self.transf)

        def stochastic_layer(layer_in, n, samples, nonlin=None):
            """Return (sample layer, mu layer, logvar layer) on top of layer_in."""
            mu = DenseLayer(layer_in, n, init.Normal(init_w),
                            init.Normal(init_w), nonlin)
            logvar = DenseLayer(layer_in, n, init.Normal(init_w),
                                init.Normal(init_w), nonlin)
            return SampleLayer(mu, logvar, eq_samples=samples,
                               iw_samples=1), mu, logvar

        def conv_layer(layer_in, n_filters, stride=(1, 1), pool=1, name='conv'):
            """(3, 1) convolution, followed by max pooling when pool > 1."""
            l_conv = Conv2DLayer(layer_in,
                                 num_filters=n_filters,
                                 filter_size=(3, 1),
                                 stride=stride,
                                 pad='full',
                                 name=name)
            if pool > 1:
                l_conv = MaxPool2DLayer(l_conv, pool_size=(pool, 1))
                # Remember the pooling layer so the decoder can invert it.
                pool_layers.append(l_conv)
            return l_conv

        # Reshape input
        l_x_in = InputLayer((None, seq_length, n_x), name='Input')
        l_x_in_reshp = ReshapeLayer(l_x_in, (-1, 1, seq_length, n_x))
        print("l_x_in_reshp", l_x_in_reshp.output_shape)

        # CNN encoder implementation
        l_conv_enc = l_x_in_reshp
        for n_filters, stride, pool in filters:
            l_conv_enc = conv_layer(l_conv_enc, n_filters, stride, pool)
            print("l_conv_enc", l_conv_enc.output_shape)

        # Pool along last 2 axes
        l_global_pool_enc = GlobalPoolLayer(l_conv_enc)
        l_enc = dense_layer(l_global_pool_enc, n_z)
        print("l_enc", l_enc.output_shape)

        # Recognition q(z|x)
        l_qz = l_enc
        for hid in qz_hid:
            l_qz = dense_layer(l_qz, hid)
        l_qz, l_qz_mu, l_qz_logvar = stochastic_layer(l_qz, n_z,
                                                      self.sym_samples)
        print("l_qz", l_qz.output_shape)

        # Inverse pooling
        l_global_depool = InverseLayer(l_qz, l_global_pool_enc)
        print("l_global_depool", l_global_depool.output_shape)

        # Walk the pooling layers in reverse order. An iterator (rather than
        # the filter index) keeps them aligned with the filters that actually
        # pooled, even when some filters use pool <= 1.
        pool_iter = iter(pool_layers[::-1])

        # Decode
        l_deconv = l_global_depool
        for n_filters, stride, pool in filters[::-1]:
            if pool > 1:
                l_deconv = InverseLayer(l_deconv, next(pool_iter))
            l_deconv = Conv2DLayer(l_deconv,
                                   num_filters=n_filters,
                                   filter_size=(3, 1),
                                   stride=(stride, 1),
                                   W=init.GlorotNormal('relu'))
            print("l_deconv", l_deconv.output_shape)

        # The last l_conv layer should give us the input shape
        l_dec = Conv2DLayer(l_deconv,
                            num_filters=1,
                            filter_size=(3, 1),
                            pad='same',
                            nonlinearity=None)
        print("l_dec", l_dec.output_shape)

        # Flatten first two dimensions
        l_dec = ReshapeLayer(l_dec, (-1, n_x))

        l_px = l_dec
        if x_dist == 'bernoulli':
            l_px = DenseLayer(l_px, n_x, init.GlorotNormal(),
                              init.Normal(init_w), sigmoid)
        elif x_dist == 'multinomial':
            l_px = DenseLayer(l_px, n_x, init.GlorotNormal(),
                              init.Normal(init_w), softmax)
        elif x_dist == 'gaussian':
            l_px, l_px_mu, l_px_logvar = stochastic_layer(
                l_px, n_x, self.sym_samples, px_nonlinearity)
        elif x_dist == 'linear':
            l_px = DenseLayer(l_px, n_x, nonlinearity=None)

        # Reshape all the model layers to have the same size
        self.l_x_in = l_x_in

        self.l_qz = ReshapeLayer(l_qz, (-1, self.sym_samples, 1, n_z))
        self.l_qz_mu = DimshuffleLayer(l_qz_mu, (0, 'x', 'x', 1))
        self.l_qz_logvar = DimshuffleLayer(l_qz_logvar, (0, 'x', 'x', 1))

        px_shape = (-1, seq_length, self.sym_samples, 1, n_x)
        px_pattern = (0, 2, 3, 1, 4)
        self.l_px = DimshuffleLayer(ReshapeLayer(l_px, px_shape), px_pattern)
        # Separate mu/logvar layers only exist for the gaussian output.
        if x_dist == "gaussian":
            self.l_px_mu = DimshuffleLayer(
                ReshapeLayer(l_px_mu, px_shape), px_pattern)
            self.l_px_logvar = DimshuffleLayer(
                ReshapeLayer(l_px_logvar, px_shape), px_pattern)
        else:
            self.l_px_mu = None
            self.l_px_logvar = None

        # Predefined functions
        inputs = {self.l_x_in: self.sym_x}
        outputs = get_output(l_qz, inputs, deterministic=True)
        self.f_qz = theano.function([self.sym_x, self.sym_samples], outputs)

        # The decoder functions feed z directly in place of the q(z|x) sample
        # layer and average over the two sample axes.
        inputs = {l_qz: self.sym_z}
        outputs = get_output(self.l_px, inputs,
                             deterministic=True).mean(axis=(1, 2))
        self.f_px = theano.function([self.sym_z, self.sym_samples], outputs)

        # f_mu / f_var need the mu/logvar layers; without them there is no
        # layer to evaluate, so the attributes are set to None instead.
        if x_dist == "gaussian":
            outputs = get_output(self.l_px_mu, inputs,
                                 deterministic=True).mean(axis=(1, 2))
            self.f_mu = theano.function([self.sym_z, self.sym_samples],
                                        outputs)

            outputs = get_output(self.l_px_logvar, inputs,
                                 deterministic=True).mean(axis=(1, 2))
            self.f_var = theano.function([self.sym_z, self.sym_samples],
                                         outputs)
        else:
            self.f_mu = None
            self.f_var = None

        # Define model parameters
        self.model_params = get_all_params([self.l_px])
        self.trainable_model_params = get_all_params([self.l_px],
                                                     trainable=True)
Example #23
0
def run_adgmssl_mnist():
    """
    Evaluate a auxiliary deep generative model on the mnist dataset with 100 evenly distributed labels.

    The function loads a previously trained model (see ``model_id`` below) and:

    1. prints the classification error on the test set, in percent;
    2. plots, per stochastic unit, the absolute mean difference between the
       prior and approximate-posterior log terms for the auxiliary (a) and
       latent (z) variables, saved to ``output/diff.png``;
    3. generates a ``table_size`` x ``table_size`` grid of digits from random
       latent codes with fixed class labels, saved to ``output/mnist.png``.

    Nothing is returned; all results are printed or written to disk. The
    ``output`` directory is not created here.
    """
    n_z = 100  # Size of the latent variable z.
    n_y = 10  # Number of classes.

    # Load the mnist supervised dataset for evaluation.
    (train_x, train_t), (test_x, test_t), (valid_x, valid_t) = \
        mnist.load_supervised(filter_std=0.0, train_valid_combine=True)

    # Initialize the auxiliary deep generative model.
    model = ADGMSSL(n_x=train_x.shape[-1],
                    n_a=100,
                    n_z=n_z,
                    n_y=n_y,
                    a_hidden=[500, 500],
                    z_hidden=[500, 500],
                    xhat_hidden=[500, 500],
                    y_hidden=[500, 500],
                    trans_func=rectify,
                    x_dist='bernoulli')

    model_id = 20151209002003  # Insert the trained model id here.
    # Load trained model. See configurations in the log file.
    model.load_model(model_id)

    # Evaluate the test error of the ADGM.
    # 100 MC samples to get a good estimate for the auxiliary unit.
    mean_evals = model.get_output(test_x, 100)
    t_class = np.argmax(test_t, axis=1)
    y_class = np.argmax(mean_evals, axis=1)
    # Percentage of misclassified examples. Using the mean (instead of
    # dividing the error count by a fixed 100) keeps this a percentage for
    # any test-set size, not only for 10,000 examples.
    class_err = 100. * np.mean(y_class != t_class)
    # Parenthesised single-argument form prints identically under the
    # Python 2 print statement and the Python 3 print function.
    print("test set 100-samples: %0.2f%%." % class_err)

    # Evaluate the active units in the auxiliary and latent distribution.
    # Per unit, log_p - log_q below equals the negative KL divergence between
    # N(mu, exp(logvar)) and a standard normal, in closed form.
    f_a_mu_logvar = theano.function(
        [model.sym_x_l],
        get_output([model.l_a_mu, model.l_a_logvar], model.sym_x_l))
    q_a_mu, q_a_logvar = f_a_mu_logvar(test_x)
    log_pa = -0.5 * (np.log(2 * np.pi) + (q_a_mu**2 + np.exp(q_a_logvar)))
    log_qa_x = -0.5 * (np.log(2 * np.pi) + 1 + q_a_logvar)
    # NOTE(review): averaging over axes (1, 2) assumes the layer outputs are
    # 4-D with the units on the last axis -- confirm against the model.
    diff_pa_qa_x = (log_pa - log_qa_x).mean(axis=(1, 2))
    mean_diff_pa_qa_x = np.abs(np.mean(diff_pa_qa_x, axis=0))

    inputs = {model.l_x_in: model.sym_x_l, model.l_y_in: model.sym_t_l}
    f_z_mu_logvar = theano.function(
        [model.sym_x_l, model.sym_t_l],
        get_output([model.l_z_mu, model.l_z_logvar], inputs))
    q_z_mu, q_z_logvar = f_z_mu_logvar(test_x, test_t)
    log_pz = -0.5 * (np.log(2 * np.pi) + (q_z_mu**2 + np.exp(q_z_logvar)))
    log_qz_x = -0.5 * (np.log(2 * np.pi) + 1 + q_z_logvar)
    diff_pz_qz_x = (log_pz - log_qz_x).mean(axis=(1, 2))
    mean_diff_pz_qz_x = np.abs(np.mean(diff_pz_qz_x, axis=0))

    # Plot the units sorted from most to least active.
    plt.figure()
    # NOTE(review): `axisbg` was superseded by `facecolor` in Matplotlib 2.0;
    # kept as-is to match the Matplotlib version this file targets.
    plt.subplot(111, axisbg='white')
    plt.plot(sorted(mean_diff_pa_qa_x)[::-1],
             color="#c0392b",
             label=r"$\log \frac{p(a_i)}{q(a_i|x)}$")
    plt.plot(sorted(mean_diff_pz_qz_x)[::-1],
             color="#9b59b6",
             label=r"$\log \frac{p(z_i)}{q(z_i|x)}$")
    plt.grid(color='0.9', linestyle='dashed', axis="y")
    plt.xlabel("stochastic units")
    plt.ylabel(r"$\log \frac{p(\cdot)}{q(\cdot)}$")
    plt.ylim((0, 2.7))
    plt.legend()
    plt.savefig("output/diff.png", format="png")

    # Sample 100 random normal distributed samples with fixed class y in the latent space and generate xhat.
    table_size = 10
    samples = 1
    # Draw z from a standard normal, matching the prior used in log_pz above
    # (random_sample would give uniform values in [0, 1) instead).
    z = np.random.standard_normal((table_size**2, n_z))
    # One-hot labels: `table_size` consecutive rows per class, so every row
    # of the image grid below shows a single class.
    labels = np.eye(n_y, k=0).reshape(n_y, 1, n_y) \
        .repeat(table_size, axis=1).reshape((-1, n_y))
    xhat = model.f_xhat(z, labels, samples)

    # Tile the generated digits row-major into one big image.
    tile = 28  # Side length of one digit image in pixels.
    plt.figure(figsize=(20, 20), dpi=300)
    img_out = np.zeros((tile * table_size, tile * table_size))
    for idx in range(table_size**2):
        row, col = divmod(idx, table_size)
        img_out[row * tile:(row + 1) * tile,
                col * tile:(col + 1) * tile] = np.reshape(xhat[idx],
                                                          (tile, tile))
    plt.matshow(img_out, cmap=plt.cm.binary)
    plt.xticks(np.array([]))
    plt.yticks(np.array([]))
    plt.savefig("output/mnist.png", format="png")