def build_model(self, train_set, test_set, validation_set=None):
    super(NADE, self).build_model(train_set, test_set, validation_set)

    xhat = get_output(self.model, self.sym_x)
    loss = -((-binary_crossentropy(xhat, self.sym_x)).sum(axis=1)).mean()
    updates = sgd(loss, get_all_params(self.model), self.sym_lr)

    # Training function on Bernoulli-resampled inputs.
    inputs = [self.sym_index, self.sym_batchsize, self.sym_lr]
    x_batch = self.sh_train_x[self.batch_slice]
    x_batch = self._srng.binomial(size=x_batch.shape, n=1, p=x_batch, dtype=theano.config.floatX)
    givens = {self.sym_x: x_batch}
    f_train = theano.function(inputs, [loss], updates=updates, givens=givens)

    subset = 1000  # Only take a subset in order not to run into memory errors.
    givens = {self.sym_x: self.sh_test_x[:subset]}
    f_test = theano.function([], [loss], givens=givens)

    f_validate = None
    if validation_set is not None:
        givens = {self.sym_x: self.sh_valid_x[:subset]}
        f_validate = theano.function([], [loss], givens=givens)

    self.train_args['inputs']['batchsize'] = 100
    self.train_args['inputs']['learningrate'] = 1e-2
    self.train_args['outputs']['like.'] = '%0.6f'
    self.test_args['outputs']['like.'] = '%0.6f'
    self.validate_args['outputs']['like.'] = '%0.6f'

    return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
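# --- Standalone sketch (illustrative, not part of the model) ----------------
# The `loss` compiled above is the mean negative log-likelihood of binary
# inputs: binary cross-entropy summed over input dimensions and averaged over
# the batch. Function and variable names below are hypothetical.
import numpy as np

def nade_nll(xhat, x, eps=1e-8):
    """Mean negative log-likelihood of binary x under predicted probabilities xhat."""
    xhat = np.clip(xhat, eps, 1. - eps)  # avoid log(0)
    ll = x * np.log(xhat) + (1. - x) * np.log(1. - xhat)
    return -ll.sum(axis=1).mean()  # sum over dimensions, mean over batch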
def _classification_error(self, x, t):
    y = get_output(self.l_y, x, deterministic=True).mean(axis=(1, 2))  # Mean over samples.
    t_class = T.argmax(t, axis=1)
    y_class = T.argmax(y, axis=1)
    misclass = T.sum(T.neq(y_class, t_class))
    return (misclass.astype(theano.config.floatX) / t.shape[0].astype(theano.config.floatX)) * 100.
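# --- Standalone sketch (illustrative, not part of the model) ----------------
# Numpy version of the classification error above: the percentage of argmax
# mismatches between sample-averaged class probabilities and one-hot targets.
# Names are hypothetical.
import numpy as np

def classification_error(y_prob, t_onehot):
    """y_prob: (batch, n_classes) probabilities; t_onehot: (batch, n_classes) one-hot targets."""
    return 100. * np.mean(y_prob.argmax(axis=1) != t_onehot.argmax(axis=1))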
def __init__(self, n_x, n_a, n_z, n_y, qa_hid, qz_hid, qy_hid, px_hid, pa_hid, nonlinearity=rectify, px_nonlinearity=None, x_dist='bernoulli', batchnorm=False, seed=1234): """ Initialize an skip deep generative model consisting of discriminative classifier q(y|a,x), generative model P p(a|z,y) and p(x|a,z,y), inference model Q q(a|x) and q(z|a,x,y). Weights are initialized using the Bengio and Glorot (2010) initialization scheme. :param n_x: Number of inputs. :param n_a: Number of auxiliary. :param n_z: Number of latent. :param n_y: Number of classes. :param qa_hid: List of number of deterministic hidden q(a|x). :param qz_hid: List of number of deterministic hidden q(z|a,x,y). :param qy_hid: List of number of deterministic hidden q(y|a,x). :param px_hid: List of number of deterministic hidden p(a|z,y) & p(x|z,y). :param nonlinearity: The transfer function used in the deterministic layers. :param x_dist: The x distribution, 'bernoulli', 'multinomial', or 'gaussian'. :param batchnorm: Boolean value for batch normalization. :param seed: The random seed. """ super(SDGMSSL, self).__init__(n_x, qz_hid + px_hid, n_a + n_z, nonlinearity) self.x_dist = x_dist self.n_y = n_y self.n_x = n_x self.n_a = n_a self.n_z = n_z self.batchnorm = batchnorm self._srng = RandomStreams(seed) # Decide Glorot initializaiton of weights. init_w = 1e-3 hid_w = "" if nonlinearity == rectify or nonlinearity == softplus: hid_w = "relu" # Define symbolic variables for theano functions. self.sym_beta = T.scalar('beta') # scaling constant beta self.sym_x_l = T.matrix('x') # labeled inputs self.sym_t_l = T.matrix('t') # labeled targets self.sym_x_u = T.matrix('x') # unlabeled inputs self.sym_bs_l = T.iscalar('bs_l') # number of labeled data self.sym_samples = T.iscalar('samples') # MC samples self.sym_z = T.matrix('z') # latent variable z self.sym_a = T.matrix('a') # auxiliary variable a # Assist methods for collecting the layers def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal): dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w), None) if batchnorm: dense = BatchNormLayer(dense) return NonlinearityLayer(dense, self.transf) def stochastic_layer(layer_in, n, samples, nonlin=None): mu = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin) logvar = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin) return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar # Input layers l_x_in = InputLayer((None, n_x)) l_y_in = InputLayer((None, n_y)) # Auxiliary q(a|x) l_qa_x = l_x_in for hid in qa_hid: l_qa_x = dense_layer(l_qa_x, hid) l_qa_x, l_qa_x_mu, l_qa_x_logvar = stochastic_layer(l_qa_x, n_a, self.sym_samples) # Classifier q(y|a,x) l_qa_to_qy = DenseLayer(l_qa_x, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qa_to_qy = ReshapeLayer(l_qa_to_qy, (-1, self.sym_samples, 1, qy_hid[0])) l_x_to_qy = DenseLayer(l_x_in, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) self.l_x_to_qy = l_x_to_qy l_x_to_qy = DimshuffleLayer(l_x_to_qy, (0, 'x', 'x', 1)) l_qy_xa = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qy, l_x_to_qy]), (-1, qy_hid[0])) if batchnorm: l_qy_xa = BatchNormLayer(l_qy_xa) l_qy_xa = NonlinearityLayer(l_qy_xa, self.transf) if len(qy_hid) > 1: for hid in qy_hid[1:]: l_qy_xa = dense_layer(l_qy_xa, hid) l_qy_xa = DenseLayer(l_qy_xa, n_y, init.GlorotNormal(), init.Normal(init_w), softmax) # Recognition q(z|x,a,y) l_qa_to_qz = DenseLayer(l_qa_x, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) 
l_qa_to_qz = ReshapeLayer(l_qa_to_qz, (-1, self.sym_samples, 1, qz_hid[0])) l_x_to_qz = DenseLayer(l_x_in, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_x_to_qz = DimshuffleLayer(l_x_to_qz, (0, 'x', 'x', 1)) l_y_to_qz = DenseLayer(l_y_in, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_y_to_qz = DimshuffleLayer(l_y_to_qz, (0, 'x', 'x', 1)) l_qz_axy = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qz, l_x_to_qz, l_y_to_qz]), (-1, qz_hid[0])) if batchnorm: l_qz_axy = BatchNormLayer(l_qz_axy) l_qz_axy = NonlinearityLayer(l_qz_axy, self.transf) if len(qz_hid) > 1: for hid in qz_hid[1:]: l_qz_axy = dense_layer(l_qz_axy, hid) l_qz_axy, l_qz_axy_mu, l_qz_axy_logvar = stochastic_layer(l_qz_axy, n_z, 1) # Generative p(a|z,y) l_y_to_pa = DenseLayer(l_y_in, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_y_to_pa = DimshuffleLayer(l_y_to_pa, (0, 'x', 'x', 1)) l_qz_to_pa = DenseLayer(l_qz_axy, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qz_to_pa = ReshapeLayer(l_qz_to_pa, (-1, self.sym_samples, 1, pa_hid[0])) l_pa_zy = ReshapeLayer(ElemwiseSumLayer([l_qz_to_pa, l_y_to_pa]), [-1, pa_hid[0]]) if batchnorm: l_pa_zy = BatchNormLayer(l_pa_zy) l_pa_zy = NonlinearityLayer(l_pa_zy, self.transf) if len(pa_hid) > 1: for hid in pa_hid[1:]: l_pa_zy = dense_layer(l_pa_zy, hid) l_pa_zy, l_pa_zy_mu, l_pa_zy_logvar = stochastic_layer(l_pa_zy, n_a, 1) # Generative p(x|a,z,y) l_qa_to_px = DenseLayer(l_qa_x, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qa_to_px = ReshapeLayer(l_qa_to_px, (-1, self.sym_samples, 1, px_hid[0])) l_y_to_px = DenseLayer(l_y_in, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_y_to_px = DimshuffleLayer(l_y_to_px, (0, 'x', 'x', 1)) l_qz_to_px = DenseLayer(l_qz_axy, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qz_to_px = ReshapeLayer(l_qz_to_px, (-1, self.sym_samples, 1, px_hid[0])) l_px_azy = ReshapeLayer(ElemwiseSumLayer([l_qa_to_px, l_qz_to_px, l_y_to_px]), [-1, px_hid[0]]) if batchnorm: l_px_azy = BatchNormLayer(l_px_azy) l_px_azy = NonlinearityLayer(l_px_azy, self.transf) if len(px_hid) > 1: for hid in px_hid[1:]: l_px_azy = dense_layer(l_px_azy, hid) if x_dist == 'bernoulli': l_px_azy = DenseLayer(l_px_azy, n_x, init.GlorotNormal(), init.Normal(init_w), sigmoid) elif x_dist == 'multinomial': l_px_azy = DenseLayer(l_px_azy, n_x, init.GlorotNormal(), init.Normal(init_w), softmax) elif x_dist == 'gaussian': l_px_azy, l_px_zy_mu, l_px_zy_logvar = stochastic_layer(l_px_azy, n_x, 1, px_nonlinearity) # Reshape all the model layers to have the same size self.l_x_in = l_x_in self.l_y_in = l_y_in self.l_a_in = l_qa_x self.l_qa = ReshapeLayer(l_qa_x, (-1, self.sym_samples, 1, n_a)) self.l_qa_mu = DimshuffleLayer(l_qa_x_mu, (0, 'x', 'x', 1)) self.l_qa_logvar = DimshuffleLayer(l_qa_x_logvar, (0, 'x', 'x', 1)) self.l_qz = ReshapeLayer(l_qz_axy, (-1, self.sym_samples, 1, n_z)) self.l_qz_mu = ReshapeLayer(l_qz_axy_mu, (-1, self.sym_samples, 1, n_z)) self.l_qz_logvar = ReshapeLayer(l_qz_axy_logvar, (-1, self.sym_samples, 1, n_z)) self.l_qy = ReshapeLayer(l_qy_xa, (-1, self.sym_samples, 1, n_y)) self.l_pa = ReshapeLayer(l_pa_zy, (-1, self.sym_samples, 1, n_a)) self.l_pa_mu = ReshapeLayer(l_pa_zy_mu, (-1, self.sym_samples, 1, n_a)) self.l_pa_logvar = ReshapeLayer(l_pa_zy_logvar, (-1, self.sym_samples, 1, n_a)) self.l_px = ReshapeLayer(l_px_azy, (-1, self.sym_samples, 1, n_x)) self.l_px_mu = ReshapeLayer(l_px_zy_mu, (-1, self.sym_samples, 1, n_x)) if x_dist == "gaussian" else None 
    self.l_px_logvar = ReshapeLayer(l_px_zy_logvar, (-1, self.sym_samples, 1, n_x)) \
        if x_dist == "gaussian" else None

    # Predefined functions
    inputs = [self.sym_x_l, self.sym_samples]
    outputs = get_output(self.l_qy, self.sym_x_l, deterministic=True).mean(axis=(1, 2))
    self.f_qy = theano.function(inputs, outputs)

    inputs = [self.sym_x_l, self.sym_samples]
    outputs = get_output(self.l_qa, self.sym_x_l, deterministic=True).mean(axis=(1, 2))
    self.f_qa = theano.function(inputs, outputs)

    inputs = {l_qz_axy: self.sym_z, l_y_in: self.sym_t_l}
    outputs = get_output(self.l_pa, inputs, deterministic=True)
    self.f_pa = theano.function([self.sym_z, self.sym_t_l, self.sym_samples], outputs)

    inputs = {l_qa_x: self.sym_a, l_qz_axy: self.sym_z, l_y_in: self.sym_t_l}
    outputs = get_output(self.l_px, inputs, deterministic=True)
    self.f_px = theano.function([self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs)

    # Define model parameters
    self.model_params = get_all_params([self.l_qy, self.l_pa, self.l_px])
    self.trainable_model_params = get_all_params([self.l_qy, self.l_pa, self.l_px], trainable=True)
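# --- Standalone sketch (illustrative, not the SampleLayer implementation) ---
# Each stochastic_layer above parameterizes a diagonal Gaussian with (mu, logvar)
# heads and samples it with the reparameterization trick; this is the idea in
# plain numpy. Names are hypothetical.
import numpy as np

def reparameterize(mu, logvar, eq_samples=1, rng=np.random):
    """Draw eq_samples per row: z = mu + sigma * eps with eps ~ N(0, I)."""
    mu = np.repeat(mu, eq_samples, axis=0)
    logvar = np.repeat(logvar, eq_samples, axis=0)
    eps = rng.standard_normal(mu.shape)
    return mu + np.exp(0.5 * logvar) * eps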
def build_model(self, train_set_unlabeled, train_set_labeled, test_set, validation_set=None):
    """
    Build the skip deep generative model from the initialized hyperparameters.
    Define the lower bound term and compile it into a training function.
    :param train_set_unlabeled: Unlabeled train set containing variables x, t.
    :param train_set_labeled: Labeled train set containing variables x, t.
    :param test_set: Test set containing variables x, t.
    :param validation_set: Validation set containing variables x, t.
    :return: train, test, validation function and dicts of arguments.
    """
    super(SDGMSSL, self).build_model(train_set_unlabeled, test_set, validation_set)

    sh_train_x_l = theano.shared(np.asarray(train_set_labeled[0], dtype=theano.config.floatX), borrow=True)
    sh_train_t_l = theano.shared(np.asarray(train_set_labeled[1], dtype=theano.config.floatX), borrow=True)
    n = self.sh_train_x.shape[0].astype(theano.config.floatX)  # no. of data points
    n_l = sh_train_x_l.shape[0].astype(theano.config.floatX)  # no. of labeled data points

    # Define the layers for the density estimation used in the lower bound.
    l_log_qa = GaussianLogDensityLayer(self.l_qa, self.l_qa_mu, self.l_qa_logvar)
    l_log_qz = GaussianLogDensityLayer(self.l_qz, self.l_qz_mu, self.l_qz_logvar)
    l_log_qy = MultinomialLogDensityLayer(self.l_qy, self.l_y_in, eps=1e-8)
    l_log_pz = StandardNormalLogDensityLayer(self.l_qz)
    l_log_pa = GaussianLogDensityLayer(self.l_qa, self.l_pa_mu, self.l_pa_logvar)
    if self.x_dist == 'bernoulli':
        l_log_px = BernoulliLogDensityLayer(self.l_px, self.l_x_in)
    elif self.x_dist == 'multinomial':
        l_log_px = MultinomialLogDensityLayer(self.l_px, self.l_x_in)
    elif self.x_dist == 'gaussian':
        l_log_px = GaussianLogDensityLayer(self.l_x_in, self.l_px_mu, self.l_px_logvar)

    def lower_bound(log_pa, log_qa, log_pz, log_qz, log_py, log_px):
        lb = log_px + log_py + log_pz + log_pa - log_qa - log_qz
        return lb

    # Lower bound for labeled data
    out_layers = [l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px, l_log_qy]
    inputs = {self.l_x_in: self.sym_x_l, self.l_y_in: self.sym_t_l}
    out = get_output(out_layers, inputs, batch_norm_update_averages=False, batch_norm_use_averages=False)
    log_pa_l, log_pz_l, log_qa_x_l, log_qz_axy_l, log_px_zy_l, log_qy_ax_l = out
    # Prior p(y) expecting that all classes are evenly distributed
    py_l = softmax(T.zeros((self.sym_x_l.shape[0], self.n_y)))
    log_py_l = -categorical_crossentropy(py_l, self.sym_t_l).reshape((-1, 1)).dimshuffle((0, 'x', 'x', 1))
    lb_l = lower_bound(log_pa_l, log_qa_x_l, log_pz_l, log_qz_axy_l, log_py_l, log_px_zy_l)
    lb_l = lb_l.mean(axis=(1, 2))  # Mean over the sampling dimensions
    log_qy_ax_l *= (self.sym_beta * (n / n_l))  # Scale the supervised cross entropy with the beta constant
    lb_l -= log_qy_ax_l.mean(axis=(1, 2))  # Collect the lower bound term and mean over sampling dimensions

    # Lower bound for unlabeled data
    bs_u = self.sym_x_u.shape[0]

    # For the integrating-out approach, we repeat the input matrix x and construct a target of shape
    # (bs * n_y) x n_y.
    # Example of input and target matrix for a 3-class problem and batch_size=2.
2D tensors of the form # x_repeat t_repeat # [[x[0,0], x[0,1], ..., x[0,n_x]] [[1, 0, 0] # [x[1,0], x[1,1], ..., x[1,n_x]] [1, 0, 0] # [x[0,0], x[0,1], ..., x[0,n_x]] [0, 1, 0] # [x[1,0], x[1,1], ..., x[1,n_x]] [0, 1, 0] # [x[0,0], x[0,1], ..., x[0,n_x]] [0, 0, 1] # [x[1,0], x[1,1], ..., x[1,n_x]]] [0, 0, 1]] t_eye = T.eye(self.n_y, k=0) t_u = t_eye.reshape((self.n_y, 1, self.n_y)).repeat(bs_u, axis=1).reshape((-1, self.n_y)) x_u = self.sym_x_u.reshape((1, bs_u, self.n_x)).repeat(self.n_y, axis=0).reshape((-1, self.n_x)) # Since the expectation of var a is outside the integration we calculate E_q(a|x) first a_x_u = get_output(self.l_qa, self.sym_x_u, batch_norm_update_averages=True, batch_norm_use_averages=False) a_x_u_rep = a_x_u.reshape((1, bs_u * self.sym_samples, self.n_a)).repeat(self.n_y, axis=0).reshape( (-1, self.n_a)) out_layers = [l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px] inputs = {self.l_x_in: x_u, self.l_y_in: t_u, self.l_a_in: a_x_u_rep} out = get_output(out_layers, inputs, batch_norm_update_averages=False, batch_norm_use_averages=False) log_pa_u, log_pz_u, log_qa_x_u, log_qz_axy_u, log_px_zy_u = out # Prior p(y) expecting that all classes are evenly distributed py_u = softmax(T.zeros((bs_u * self.n_y, self.n_y))) log_py_u = -categorical_crossentropy(py_u, t_u).reshape((-1, 1)).dimshuffle((0, 'x', 'x', 1)) lb_u = lower_bound(log_pa_u, log_qa_x_u, log_pz_u, log_qz_axy_u, log_py_u, log_px_zy_u) lb_u = lb_u.reshape((self.n_y, 1, 1, bs_u)).transpose(3, 1, 2, 0).mean(axis=(1, 2)) inputs = {self.l_x_in: self.sym_x_u, self.l_a_in: a_x_u.reshape((-1, self.n_a))} y_u = get_output(self.l_qy, inputs, batch_norm_update_averages=True, batch_norm_use_averages=False).mean( axis=(1, 2)) y_u += 1e-8 # Ensure that we get no NANs when calculating the entropy y_u /= T.sum(y_u, axis=1, keepdims=True) lb_u = (y_u * (lb_u - T.log(y_u))).sum(axis=1) if self.batchnorm: # TODO: implement the BN layer correctly. inputs = {self.l_x_in: self.sym_x_u, self.l_y_in: y_u, self.l_a_in: a_x_u} get_output(out_layers, inputs, weighting=None, batch_norm_update_averages=True, batch_norm_use_averages=False) # Regularizing with weight priors p(theta|N(0,1)), collecting and clipping gradients weight_priors = 0.0 for p in self.trainable_model_params: if 'W' not in str(p): continue weight_priors += log_normal(p, 0, 1).sum() # Collect the lower bound and scale it with the weight priors. elbo = ((lb_l.mean() + lb_u.mean()) * n + weight_priors) / -n lb_labeled = -lb_l.mean() lb_unlabeled = -lb_u.mean() grads_collect = T.grad(elbo, self.trainable_model_params) params_collect = self.trainable_model_params sym_beta1 = T.scalar('beta1') sym_beta2 = T.scalar('beta2') clip_grad, max_norm = 1, 5 mgrads = total_norm_constraint(grads_collect, max_norm=max_norm) mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads] updates = adam(mgrads, params_collect, self.sym_lr, sym_beta1, sym_beta2) # Training function indices = self._srng.choice(size=[self.sym_bs_l], a=sh_train_x_l.shape[0], replace=False) x_batch_l = sh_train_x_l[indices] t_batch_l = sh_train_t_l[indices] x_batch_u = self.sh_train_x[self.batch_slice] if self.x_dist == 'bernoulli': # Sample bernoulli input. 
x_batch_u = self._srng.binomial(size=x_batch_u.shape, n=1, p=x_batch_u, dtype=theano.config.floatX) x_batch_l = self._srng.binomial(size=x_batch_l.shape, n=1, p=x_batch_l, dtype=theano.config.floatX) givens = {self.sym_x_l: x_batch_l, self.sym_x_u: x_batch_u, self.sym_t_l: t_batch_l} inputs = [self.sym_index, self.sym_batchsize, self.sym_bs_l, self.sym_beta, self.sym_lr, sym_beta1, sym_beta2, self.sym_samples] outputs = [elbo, lb_labeled, lb_unlabeled] f_train = theano.function(inputs=inputs, outputs=outputs, givens=givens, updates=updates) # Default training args. Note that these can be changed during or prior to training. self.train_args['inputs']['batchsize_unlabeled'] = 100 self.train_args['inputs']['batchsize_labeled'] = 100 self.train_args['inputs']['beta'] = 0.1 self.train_args['inputs']['learningrate'] = 3e-4 self.train_args['inputs']['beta1'] = 0.9 self.train_args['inputs']['beta2'] = 0.999 self.train_args['inputs']['samples'] = 1 self.train_args['outputs']['lb'] = '%0.4f' self.train_args['outputs']['lb-labeled'] = '%0.4f' self.train_args['outputs']['lb-unlabeled'] = '%0.4f' # Validation and test function y = get_output(self.l_qy, self.sym_x_l, deterministic=True).mean(axis=(1, 2)) class_err = (1. - categorical_accuracy(y, self.sym_t_l).mean()) * 100 givens = {self.sym_x_l: self.sh_test_x, self.sym_t_l: self.sh_test_t} f_test = theano.function(inputs=[self.sym_samples], outputs=[class_err], givens=givens) # Test args. Note that these can be changed during or prior to training. self.test_args['inputs']['samples'] = 1 self.test_args['outputs']['test'] = '%0.2f%%' f_validate = None if validation_set is not None: givens = {self.sym_x_l: self.sh_valid_x, self.sym_t_l: self.sh_valid_t} f_validate = theano.function(inputs=[self.sym_samples], outputs=[class_err], givens=givens) # Default validation args. Note that these can be changed during or prior to training. self.validate_args['inputs']['samples'] = 1 self.validate_args['outputs']['validation'] = '%0.2f%%' return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
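# --- Standalone sketch (illustrative, not part of the model) ----------------
# The unlabeled objective above integrates y out of the labeled bound,
#   U(x) = sum_y q(y|a,x) * (L(x, y) - log q(y|a,x)),
# which is the `lb_u = (y_u * (lb_u - T.log(y_u))).sum(axis=1)` line, in numpy.
# Names are hypothetical.
import numpy as np

def unlabeled_bound(lb_per_class, q_y, eps=1e-8):
    """lb_per_class: (batch, n_y) labeled bounds, one per hypothesized class.
    q_y: (batch, n_y) classifier probabilities q(y|a,x)."""
    q_y = (q_y + eps) / (q_y + eps).sum(axis=1, keepdims=True)  # avoid log(0)
    return (q_y * (lb_per_class - np.log(q_y))).sum(axis=1)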
def build_model(self, train_set, test_set, validation_set=None):
    """
    :param train_set: Train set containing variables x, t.
    :param test_set: Test set containing variables x, t.
    :param validation_set: Validation set containing variables x, t.
    :return: train, test, validation function and dicts of arguments.
    """
    super(CVAE, self).build_model(train_set, test_set, validation_set)

    n = self.sh_train_x.shape[0].astype(theano.config.floatX)  # no. of data points

    # Define the layers for the density estimation used in the lower bound.
    l_log_qz = GaussianLogDensityLayer(self.l_qz, self.l_qz_mu, self.l_qz_logvar)
    l_log_pz = StandardNormalLogDensityLayer(self.l_qz)
    l_x_in = ReshapeLayer(self.l_x_in, (-1, self.seq_length * self.n_x))
    if self.x_dist == 'bernoulli':
        l_px = ReshapeLayer(self.l_px, (-1, self.sym_samples, 1, self.seq_length * self.n_x))
        l_log_px = BernoulliLogDensityLayer(l_px, l_x_in)
    elif self.x_dist == 'multinomial':
        l_px = ReshapeLayer(self.l_px, (-1, self.sym_samples, 1, self.seq_length * self.n_x))
        l_log_px = MultinomialLogDensityLayer(l_px, l_x_in)
    elif self.x_dist == 'gaussian':
        l_px_mu = ReshapeLayer(self.l_px_mu, (-1, self.sym_samples, 1, self.seq_length * self.n_x))
        l_px_logvar = ReshapeLayer(self.l_px_logvar, (-1, self.sym_samples, 1, self.seq_length * self.n_x))
        l_log_px = GaussianLogDensityLayer(l_x_in, l_px_mu, l_px_logvar)
    elif self.x_dist == 'linear':
        l_log_px = self.l_px

    self.sym_warmup = T.fscalar('warmup')

    def lower_bound(log_pz, log_qz, log_px):
        return log_px + (log_pz - log_qz) * (1. - self.sym_warmup - 0.1)

    # Lower bound
    out_layers = [l_log_pz, l_log_qz, l_log_px]
    inputs = {self.l_x_in: self.sym_x}
    out = get_output(out_layers, inputs, batch_norm_update_averages=False, batch_norm_use_averages=False)
    log_pz, log_qz, log_px = out

    # If the decoder output is linear we need the reconstruction error.
    if self.x_dist == 'linear':
        log_px = -aggregate(squared_error(log_px.mean(axis=(1, 2)), self.sym_x), mode='mean')

    lb = lower_bound(log_pz, log_qz, log_px)
    lb = lb.mean(axis=(1, 2))  # Mean over the sampling dimensions

    if self.batchnorm:
        # TODO: implement the BN layer correctly.
        inputs = {self.l_x_in: self.sym_x}
        get_output(out_layers, inputs, weighting=None, batch_norm_update_averages=True,
                   batch_norm_use_averages=False)

    # Regularizing with weight priors p(theta|N(0,1)), collecting and clipping gradients
    weight_priors = 0.0
    for p in self.trainable_model_params:
        if 'W' not in str(p):
            continue
        weight_priors += log_normal(p, 0, 1).sum()

    # Collect the lower bound and scale it with the weight priors.
    elbo = lb.mean()
    cost = (elbo * n + weight_priors) / -n

    grads_collect = T.grad(cost, self.trainable_model_params)
    sym_beta1 = T.scalar('beta1')
    sym_beta2 = T.scalar('beta2')
    clip_grad, max_norm = 1, 5
    mgrads = total_norm_constraint(grads_collect, max_norm=max_norm)
    mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    updates = adam(mgrads, self.trainable_model_params, self.sym_lr, sym_beta1, sym_beta2)
    # updates = rmsprop(mgrads, self.trainable_model_params, self.sym_lr + (0*sym_beta1*sym_beta2))

    # Training function
    x_batch = self.sh_train_x[self.batch_slice]
    if self.x_dist == 'bernoulli':  # Sample bernoulli input.
        x_batch = self._srng.binomial(size=x_batch.shape, n=1, p=x_batch, dtype=theano.config.floatX)
    givens = {self.sym_x: x_batch}

    inputs = [self.sym_index, self.sym_batchsize, self.sym_lr, sym_beta1, sym_beta2,
              self.sym_samples, self.sym_warmup]
    outputs = [log_px.mean(), log_pz.mean(), log_qz.mean(), elbo, self.sym_warmup]
    f_train = theano.function(inputs=inputs, outputs=outputs, givens=givens, updates=updates)

    # Default training args. Note that these can be changed during or prior to training.
    self.train_args['inputs']['batchsize'] = 100
    self.train_args['inputs']['learningrate'] = 1e-4
    self.train_args['inputs']['beta1'] = 0.9
    self.train_args['inputs']['beta2'] = 0.999
    self.train_args['inputs']['samples'] = 1
    self.train_args['inputs']['warmup'] = 0.1
    self.train_args['outputs']['log p(x)'] = '%0.6f'
    self.train_args['outputs']['log p(z)'] = '%0.6f'
    self.train_args['outputs']['log q(z)'] = '%0.6f'
    self.train_args['outputs']['elbo train'] = '%0.6f'
    self.train_args['outputs']['warmup'] = '%0.3f'

    # Validation and test function
    givens = {self.sym_x: self.sh_test_x}
    f_test = theano.function(inputs=[self.sym_samples, self.sym_warmup], outputs=[elbo], givens=givens)

    # Test args. Note that these can be changed during or prior to training.
    self.test_args['inputs']['samples'] = 1
    self.test_args['inputs']['warmup'] = 0.1
    self.test_args['outputs']['elbo test'] = '%0.6f'

    f_validate = None
    if validation_set is not None:
        givens = {self.sym_x: self.sh_valid_x}
        # The bound depends on sym_warmup, so it must be supplied here as well.
        f_validate = theano.function(inputs=[self.sym_samples, self.sym_warmup], outputs=[elbo], givens=givens)
        # Default validation args. Note that these can be changed during or prior to training.
        self.validate_args['inputs']['samples'] = 1
        self.validate_args['inputs']['warmup'] = 0.1
        self.validate_args['outputs']['elbo validation'] = '%0.6f'

    return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
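# --- Standalone sketch (assumption: how the warm-up input might be driven) ---
# The bound above weights the KL term by (1 - warmup - 0.1), so feeding a large
# `warmup` early in training and letting it decay gradually switches the KL
# penalty on. The schedule below is hypothetical; the training loop itself is
# not shown in this section.
import numpy as np

def warmup_value(epoch, n_warmup_epochs=100, start=0.9):
    """Linearly anneal the warm-up scalar from `start` down to 0."""
    return float(np.clip(start * (1. - epoch / float(n_warmup_epochs)), 0., start))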
def build_model(self, train_set, test_set, validation_set=None):
    """
    :param train_set: Train set containing variables x, t.
    :param test_set: Test set containing variables x, t.
    :param validation_set: Validation set containing variables x, t.
    :return: train, test, validation function and dicts of arguments.
    """
    super(RAE, self).build_model(train_set, test_set, validation_set)

    # Cost
    inputs = {self.l_x_in: self.sym_x}
    # px = get_output(self.l_px, inputs, batch_norm_update_averages=False, batch_norm_use_averages=False)
    px = get_output(self.l_px, inputs)
    cost = aggregate(squared_error(px, self.sym_x), mode='mean')
    # cost += 1e-4 * regularize_network_params(self.l_px, l2)

    grads_collect = T.grad(cost, self.trainable_model_params)
    sym_beta1 = T.scalar('beta1')
    sym_beta2 = T.scalar('beta2')
    clip_grad, max_norm = 1, 5
    mgrads = total_norm_constraint(grads_collect, max_norm=max_norm)
    mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads]
    updates = adam(mgrads, self.trainable_model_params, self.sym_lr, sym_beta1, sym_beta2)
    # updates = rmsprop(mgrads, self.trainable_model_params, self.sym_lr + (0*sym_beta1*sym_beta2))

    # Training function
    x_batch = self.sh_train_x[self.batch_slice]
    givens = {self.sym_x: x_batch}
    inputs = [self.sym_index, self.sym_batchsize, self.sym_lr, sym_beta1, sym_beta2]
    outputs = [cost]
    f_train = theano.function(inputs=inputs, outputs=outputs, givens=givens, updates=updates)

    # Default training args. Note that these can be changed during or prior to training.
    self.train_args['inputs']['batchsize'] = 100
    self.train_args['inputs']['learningrate'] = 3e-3
    self.train_args['inputs']['beta1'] = 0.9
    self.train_args['inputs']['beta2'] = 0.999
    self.train_args['outputs']['cost train'] = '%0.6f'

    # Validation and test function
    givens = {self.sym_x: self.sh_test_x}
    f_test = theano.function(inputs=[], outputs=[cost], givens=givens)

    # Test args. Note that these can be changed during or prior to training.
    self.test_args['outputs']['cost test'] = '%0.6f'

    f_validate = None
    if validation_set is not None:
        givens = {self.sym_x: self.sh_valid_x}
        f_validate = theano.function(inputs=[], outputs=[cost], givens=givens)
        # Default validation args. Note that these can be changed during or prior to training.
        self.validate_args['outputs']['cost val'] = '%0.6f'

    return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
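# --- Standalone sketch (illustrative, not part of the model) ----------------
# The gradient treatment above first rescales the gradient list to a maximum
# global norm (total_norm_constraint) and then clips each element (T.clip);
# this is the same effect in numpy, with hypothetical names.
import numpy as np

def clip_gradients(grads, max_norm=5.0, clip_grad=1.0):
    global_norm = np.sqrt(sum((g ** 2).sum() for g in grads))
    scale = min(1.0, max_norm / (global_norm + 1e-7))  # only shrink, never grow
    return [np.clip(g * scale, -clip_grad, clip_grad) for g in grads]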
def __init__(self, n_c, n_l, n_a, n_z, n_y, qa_hid, qz_hid, qy_hid, px_hid, pa_hid, filters, nonlinearity=rectify, px_nonlinearity=None, x_dist='bernoulli', batchnorm=False, seed=1234): """ Initialize an skip deep generative model consisting of discriminative classifier q(y|a,x), generative model P p(a|z,y) and p(x|a,z,y), inference model Q q(a|x) and q(z|a,x,y). Weights are initialized using the Bengio and Glorot (2010) initialization scheme. :param n_c: Number of input channels. :param n_l: Number of lengths. :param n_a: Number of auxiliary. :param n_z: Number of latent. :param n_y: Number of classes. :param qa_hid: List of number of deterministic hidden q(a|x). :param qz_hid: List of number of deterministic hidden q(z|a,x,y). :param qy_hid: List of number of deterministic hidden q(y|a,x). :param px_hid: List of number of deterministic hidden p(a|z,y) & p(x|z,y). :param nonlinearity: The transfer function used in the deterministic layers. :param x_dist: The x distribution, 'bernoulli', 'multinomial', or 'gaussian'. :param batchnorm: Boolean value for batch normalization. :param seed: The random seed. """ super(CSDGM, self).__init__(n_c, qz_hid + px_hid, n_a + n_z, nonlinearity) self.x_dist = x_dist self.n_y = n_y self.n_c = n_c self.n_l = n_l self.n_a = n_a self.n_z = n_z self.batchnorm = batchnorm self._srng = RandomStreams(seed) # Decide Glorot initializaiton of weights. init_w = 1e-3 hid_w = "" if nonlinearity == rectify or nonlinearity == softplus: hid_w = "relu" pool_layers = [] # Define symbolic variables for theano functions. self.sym_beta = T.scalar('beta') # scaling constant beta self.sym_x_l = T.tensor3('x') # labeled inputs self.sym_t_l = T.matrix('t') # labeled targets self.sym_x_u = T.tensor3('x') # unlabeled inputs self.sym_bs_l = T.iscalar('bs_l') # number of labeled data self.sym_samples = T.iscalar('samples') # MC samples self.sym_z = T.matrix('z') # latent variable z self.sym_a = T.matrix('a') # auxiliary variable a self.sym_warmup = T.fscalar('warmup') # warmup to scale KL term # Assist methods for collecting the layers def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal): dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w), None) if batchnorm: dense = BatchNormLayer(dense) return NonlinearityLayer(dense, self.transf) def stochastic_layer(layer_in, n, samples, nonlin=None): mu = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin) logvar = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin) return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar def conv_layer(layer_in, filter, stride=(1, 1), pool=1, name='conv', dist_w=init.GlorotNormal, dist_b=init.Normal): l_conv = Conv2DLayer(layer_in, num_filters=filter, filter_size=(3, 1), stride=stride, pad='full', W=dist_w(hid_w), b=dist_b(init_w), name=name) if pool > 1: l_conv = MaxPool2DLayer(l_conv, pool_size=(pool, 1)) pool_layers.append(l_conv) return l_conv # Input layers l_y_in = InputLayer((None, n_y)) l_x_in = InputLayer((None, n_l, n_c), name='Input') # Reshape input l_x_in_reshp = ReshapeLayer(l_x_in, (-1, 1, n_l, n_c)) print("l_x_in_reshp", l_x_in_reshp.output_shape) # CNN encoder implementation l_conv_enc = l_x_in_reshp for filter, stride, pool in filters: l_conv_enc = conv_layer(l_conv_enc, filter, stride, pool) print("l_conv_enc", l_conv_enc.output_shape) # Pool along last 2 axes l_global_pool_enc = GlobalPoolLayer(l_conv_enc, pool_function=T.mean) l_enc = dense_layer(l_global_pool_enc, n_z) print("l_enc", 
l_enc.output_shape) # Auxiliary q(a|x) l_qa_x = l_enc for hid in qa_hid: l_qa_x = dense_layer(l_qa_x, hid) l_qa_x, l_qa_x_mu, l_qa_x_logvar = stochastic_layer( l_qa_x, n_a, self.sym_samples) # Classifier q(y|a,x) l_qa_to_qy = DenseLayer(l_qa_x, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qa_to_qy = ReshapeLayer(l_qa_to_qy, (-1, self.sym_samples, 1, qy_hid[0])) l_x_to_qy = DenseLayer(l_enc, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_x_to_qy = DimshuffleLayer(l_x_to_qy, (0, 'x', 'x', 1)) l_qy_xa = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qy, l_x_to_qy]), (-1, qy_hid[0])) if batchnorm: l_qy_xa = BatchNormLayer(l_qy_xa) l_qy_xa = NonlinearityLayer(l_qy_xa, self.transf) if len(qy_hid) > 1: for hid in qy_hid[1:]: l_qy_xa = dense_layer(l_qy_xa, hid) l_qy_xa = DenseLayer(l_qy_xa, n_y, init.GlorotNormal(), init.Normal(init_w), softmax) # Recognition q(z|x,a,y) l_qa_to_qz = DenseLayer(l_qa_x, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qa_to_qz = ReshapeLayer(l_qa_to_qz, (-1, self.sym_samples, 1, qz_hid[0])) l_x_to_qz = DenseLayer(l_enc, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_x_to_qz = DimshuffleLayer(l_x_to_qz, (0, 'x', 'x', 1)) l_y_to_qz = DenseLayer(l_y_in, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_y_to_qz = DimshuffleLayer(l_y_to_qz, (0, 'x', 'x', 1)) l_qz_axy = ReshapeLayer( ElemwiseSumLayer([l_qa_to_qz, l_x_to_qz, l_y_to_qz]), (-1, qz_hid[0])) if batchnorm: l_qz_axy = BatchNormLayer(l_qz_axy) l_qz_axy = NonlinearityLayer(l_qz_axy, self.transf) if len(qz_hid) > 1: for hid in qz_hid[1:]: l_qz_axy = dense_layer(l_qz_axy, hid) l_qz_axy, l_qz_axy_mu, l_qz_axy_logvar = stochastic_layer( l_qz_axy, n_z, 1) # Generative p(a|z,y) l_y_to_pa = DenseLayer(l_y_in, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_y_to_pa = DimshuffleLayer(l_y_to_pa, (0, 'x', 'x', 1)) l_qz_to_pa = DenseLayer(l_qz_axy, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qz_to_pa = ReshapeLayer(l_qz_to_pa, (-1, self.sym_samples, 1, pa_hid[0])) l_pa_zy = ReshapeLayer(ElemwiseSumLayer([l_qz_to_pa, l_y_to_pa]), [-1, pa_hid[0]]) if batchnorm: l_pa_zy = BatchNormLayer(l_pa_zy) l_pa_zy = NonlinearityLayer(l_pa_zy, self.transf) if len(pa_hid) > 1: for hid in pa_hid[1:]: l_pa_zy = dense_layer(l_pa_zy, hid) l_pa_zy, l_pa_zy_mu, l_pa_zy_logvar = stochastic_layer(l_pa_zy, n_a, 1) # Generative p(x|a,z,y) l_qa_to_px = DenseLayer(l_qa_x, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qa_to_px = ReshapeLayer(l_qa_to_px, (-1, self.sym_samples, 1, px_hid[0])) l_y_to_px = DenseLayer(l_y_in, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_y_to_px = DimshuffleLayer(l_y_to_px, (0, 'x', 'x', 1)) l_qz_to_px = DenseLayer(l_qz_axy, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qz_to_px = ReshapeLayer(l_qz_to_px, (-1, self.sym_samples, 1, px_hid[0])) l_px_azy = ReshapeLayer( ElemwiseSumLayer([l_qa_to_px, l_qz_to_px, l_y_to_px]), [-1, px_hid[0]]) if batchnorm: l_px_azy = BatchNormLayer(l_px_azy) l_px_azy = NonlinearityLayer(l_px_azy, self.transf) # Note that px_hid[0] has to be equal to the number filters in the first convolution. Otherwise add a # dense layers here. 
# Inverse pooling l_global_depool = InverseLayer(l_px_azy, l_global_pool_enc) print("l_global_depool", l_global_depool.output_shape) # Reverse pool layer order pool_layers = pool_layers[::-1] # Decode l_deconv = l_global_depool for idx, filter in enumerate(filters[::-1]): filter, stride, pool = filter if pool > 1: l_deconv = InverseLayer(l_deconv, pool_layers[idx]) l_deconv = Conv2DLayer(l_deconv, num_filters=filter, filter_size=(3, 1), stride=(stride, 1), W=init.GlorotNormal('relu')) print("l_deconv", l_deconv.output_shape) # The last l_conv layer should give us the input shape l_px_azy = Conv2DLayer(l_deconv, num_filters=1, filter_size=(3, 1), pad='same', nonlinearity=None) print("l_dec", l_px_azy.output_shape) # Flatten first two dimensions l_px_azy = ReshapeLayer(l_px_azy, (-1, n_c)) if x_dist == 'bernoulli': l_px_azy = DenseLayer(l_px_azy, n_c, init.GlorotNormal(), init.Normal(init_w), sigmoid) elif x_dist == 'multinomial': l_px_azy = DenseLayer(l_px_azy, n_c, init.GlorotNormal(), init.Normal(init_w), softmax) elif x_dist == 'gaussian': l_px_azy, l_px_zy_mu, l_px_zy_logvar = stochastic_layer( l_px_azy, n_c, self.sym_samples, px_nonlinearity) elif x_dist == 'linear': l_px_azy = DenseLayer(l_px_azy, n_c, nonlinearity=None) # Reshape all the model layers to have the same size self.l_x_in = l_x_in self.l_y_in = l_y_in self.l_a_in = l_qa_x self.l_qa = ReshapeLayer(l_qa_x, (-1, self.sym_samples, 1, n_a)) self.l_qa_mu = DimshuffleLayer(l_qa_x_mu, (0, 'x', 'x', 1)) self.l_qa_logvar = DimshuffleLayer(l_qa_x_logvar, (0, 'x', 'x', 1)) self.l_qz = ReshapeLayer(l_qz_axy, (-1, self.sym_samples, 1, n_z)) self.l_qz_mu = ReshapeLayer(l_qz_axy_mu, (-1, self.sym_samples, 1, n_z)) self.l_qz_logvar = ReshapeLayer(l_qz_axy_logvar, (-1, self.sym_samples, 1, n_z)) self.l_qy = ReshapeLayer(l_qy_xa, (-1, self.sym_samples, 1, n_y)) self.l_pa = ReshapeLayer(l_pa_zy, (-1, self.sym_samples, 1, n_a)) self.l_pa_mu = ReshapeLayer(l_pa_zy_mu, (-1, self.sym_samples, 1, n_a)) self.l_pa_logvar = ReshapeLayer(l_pa_zy_logvar, (-1, self.sym_samples, 1, n_a)) # Here we assume that we pass (batch size * segment length, number of features) to the sample layer from # which we then get (batch size * segment length, samples, IW samples, features) self.l_px = ReshapeLayer(l_px_azy, (-1, n_l, self.sym_samples, 1, n_c)) self.l_px_mu = ReshapeLayer(l_px_zy_mu, (-1, n_l, self.sym_samples, 1, n_c)) \ if x_dist == "gaussian" else None self.l_px_logvar = ReshapeLayer(l_px_zy_logvar, (-1, n_l, self.sym_samples, 1, n_c)) \ if x_dist == "gaussian" else None # Predefined functions inputs = {l_x_in: self.sym_x_l} outputs = get_output(self.l_qy, inputs, deterministic=True).mean(axis=(1, 2)) self.f_qy = theano.function([self.sym_x_l, self.sym_samples], outputs) outputs = get_output(l_qa_x, inputs, deterministic=True) self.f_qa = theano.function([self.sym_x_l, self.sym_samples], outputs) inputs = {l_x_in: self.sym_x_l, l_y_in: self.sym_t_l} outputs = get_output(l_qz_axy, inputs, deterministic=True) self.f_qz = theano.function( [self.sym_x_l, self.sym_t_l, self.sym_samples], outputs) inputs = {l_qz_axy: self.sym_z, l_y_in: self.sym_t_l} outputs = get_output(self.l_pa, inputs, deterministic=True).mean(axis=(1, 2)) self.f_pa = theano.function( [self.sym_z, self.sym_t_l, self.sym_samples], outputs) inputs = { l_x_in: self.sym_x_l, l_qa_x: self.sym_a, l_qz_axy: self.sym_z, l_y_in: self.sym_t_l } outputs = get_output(self.l_px, inputs, deterministic=True).mean(axis=(2, 3)) self.f_px = theano.function([ self.sym_x_l, self.sym_a, self.sym_z, 
                                self.sym_t_l, self.sym_samples], outputs)

    outputs = get_output(self.l_px_mu, inputs, deterministic=True).mean(axis=(2, 3))
    self.f_mu = theano.function(
        [self.sym_x_l, self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs)

    outputs = get_output(self.l_px_logvar, inputs, deterministic=True).mean(axis=(2, 3))
    self.f_var = theano.function(
        [self.sym_x_l, self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs)

    # Define model parameters
    self.model_params = get_all_params([self.l_qy, self.l_pa, self.l_px])
    self.trainable_model_params = get_all_params([self.l_qy, self.l_pa, self.l_px], trainable=True)
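# --- Standalone sketch (illustrative, not part of the model) ----------------
# The (mu, logvar) heads exposed through f_mu / f_var above parameterize a
# diagonal Gaussian; its log-density, summed over features, is what the
# Gaussian density layers contribute to the lower bound. Names are hypothetical.
import numpy as np

def diag_gaussian_logpdf(x, mu, logvar):
    c = -0.5 * np.log(2. * np.pi)
    return np.sum(c - 0.5 * logvar - (x - mu) ** 2 / (2. * np.exp(logvar)), axis=-1)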
def __init__(self, n_x, n_a, n_z, n_y, a_hidden, z_hidden, xhat_hidden, y_hidden, trans_func=rectify, x_dist='bernoulli'): """ Initialize an auxiliary deep generative model consisting of discriminative classifier q(y|a,x), generative model P p(xhat|z,y), inference model Q q(a|x) and q(z|x,y). All weights are initialized using the Bengio and Glorot (2010) initialization scheme. :param n_x: Number of inputs. :param n_a: Number of auxiliary. :param n_z: Number of latent. :param n_y: Number of classes. :param a_hidden: List of number of deterministic hidden q(a|x). :param z_hidden: List of number of deterministic hidden q(z|x,y). :param xhat_hidden: List of number of deterministic hidden p(xhat|z,y). :param y_hidden: List of number of deterministic hidden q(y|a,x). :param trans_func: The transfer function used in the deterministic layers. :param x_dist: The x distribution, 'bernoulli' or 'gaussian'. """ super(ADGMSSL, self).__init__(n_x, a_hidden + z_hidden + xhat_hidden, n_a + n_z, trans_func) self.y_hidden = y_hidden self.x_dist = x_dist self.n_y = n_y self.n_x = n_x self.n_a = n_a self.n_z = n_z self._srng = RandomStreams() self.sym_beta = T.scalar( 'beta') # symbolic upscaling of the discriminative term. self.sym_x_l = T.matrix('x') # symbolic labeled inputs self.sym_t_l = T.matrix('t') # symbolic labeled targets self.sym_x_u = T.matrix('x') # symbolic unlabeled inputs self.sym_bs_l = T.iscalar( 'bs_l' ) # symbolic number of labeled data_preparation points in batch self.sym_samples = T.iscalar( 'samples') # symbolic number of Monte Carlo samples self.sym_y = T.matrix('y') self.sym_z = T.matrix('z') ### Input layers ### l_x_in = InputLayer((None, n_x)) l_y_in = InputLayer((None, n_y)) ### Auxiliary q(a|x) ### l_a_x = l_x_in for hid in a_hidden: l_a_x = DenseLayer(l_a_x, hid, init.GlorotNormal('relu'), init.Normal(1e-3), self.transf) l_a_x_mu = DenseLayer(l_a_x, n_a, init.GlorotNormal(), init.Normal(1e-3), None) l_a_x_logvar = DenseLayer(l_a_x, n_a, init.GlorotNormal(), init.Normal(1e-3), None) l_a_x = SampleLayer(l_a_x_mu, l_a_x_logvar, eq_samples=self.sym_samples) # Reshape all layers to align them for multiple samples in the lower bound calculation. l_a_x_reshaped = ReshapeLayer(l_a_x, (-1, self.sym_samples, 1, n_a)) l_a_x_mu_reshaped = DimshuffleLayer(l_a_x_mu, (0, 'x', 'x', 1)) l_a_x_logvar_reshaped = DimshuffleLayer(l_a_x_logvar, (0, 'x', 'x', 1)) ### Classifier q(y|a,x) ### # Concatenate the input x and the output of the auxiliary MLP. l_a_to_y = DenseLayer(l_a_x, y_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None) l_a_to_y = ReshapeLayer(l_a_to_y, (-1, self.sym_samples, 1, y_hidden[0])) l_x_to_y = DenseLayer(l_x_in, y_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None) l_x_to_y = DimshuffleLayer(l_x_to_y, (0, 'x', 'x', 1)) l_y_xa = ReshapeLayer(ElemwiseSumLayer([l_a_to_y, l_x_to_y]), (-1, y_hidden[0])) l_y_xa = NonlinearityLayer(l_y_xa, self.transf) if len(y_hidden) > 1: for hid in y_hidden[1:]: l_y_xa = DenseLayer(l_y_xa, hid, init.GlorotUniform('relu'), init.Normal(1e-3), self.transf) l_y_xa = DenseLayer(l_y_xa, n_y, init.GlorotUniform(), init.Normal(1e-3), softmax) l_y_xa_reshaped = ReshapeLayer(l_y_xa, (-1, self.sym_samples, 1, n_y)) ### Recognition q(z|x,y) ### # Concatenate the input x and y. 
l_x_to_z = DenseLayer(l_x_in, z_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None) l_x_to_z = DimshuffleLayer(l_x_to_z, (0, 'x', 'x', 1)) l_y_to_z = DenseLayer(l_y_in, z_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None) l_y_to_z = DimshuffleLayer(l_y_to_z, (0, 'x', 'x', 1)) l_z_xy = ReshapeLayer(ElemwiseSumLayer([l_x_to_z, l_y_to_z]), [-1, z_hidden[0]]) l_z_xy = NonlinearityLayer(l_z_xy, self.transf) if len(z_hidden) > 1: for hid in z_hidden[1:]: l_z_xy = DenseLayer(l_z_xy, hid, init.GlorotNormal('relu'), init.Normal(1e-3), self.transf) l_z_axy_mu = DenseLayer(l_z_xy, n_z, init.GlorotNormal(), init.Normal(1e-3), None) l_z_axy_logvar = DenseLayer(l_z_xy, n_z, init.GlorotNormal(), init.Normal(1e-3), None) l_z_xy = SampleLayer(l_z_axy_mu, l_z_axy_logvar, eq_samples=self.sym_samples) # Reshape all layers to align them for multiple samples in the lower bound calculation. l_z_axy_mu_reshaped = DimshuffleLayer(l_z_axy_mu, (0, 'x', 'x', 1)) l_z_axy_logvar_reshaped = DimshuffleLayer(l_z_axy_logvar, (0, 'x', 'x', 1)) l_z_axy_reshaped = ReshapeLayer(l_z_xy, (-1, self.sym_samples, 1, n_z)) ### Generative p(xhat|z,y) ### # Concatenate the input x and y. l_y_to_xhat = DenseLayer(l_y_in, xhat_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None) l_y_to_xhat = DimshuffleLayer(l_y_to_xhat, (0, 'x', 'x', 1)) l_z_to_xhat = DenseLayer(l_z_xy, xhat_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None) l_z_to_xhat = ReshapeLayer(l_z_to_xhat, (-1, self.sym_samples, 1, xhat_hidden[0])) l_xhat_zy = ReshapeLayer(ElemwiseSumLayer([l_z_to_xhat, l_y_to_xhat]), [-1, xhat_hidden[0]]) l_xhat_zy = NonlinearityLayer(l_xhat_zy, self.transf) if len(xhat_hidden) > 1: for hid in xhat_hidden[1:]: l_xhat_zy = DenseLayer(l_xhat_zy, hid, init.GlorotNormal('relu'), init.Normal(1e-3), self.transf) if x_dist == 'bernoulli': l_xhat_zy_mu_reshaped = None l_xhat_zy_logvar_reshaped = None l_xhat_zy = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(), init.Normal(1e-3), sigmoid) elif x_dist == 'multinomial': l_xhat_zy_mu_reshaped = None l_xhat_zy_logvar_reshaped = None l_xhat_zy = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(), init.Normal(1e-3), softmax) elif x_dist == 'gaussian': l_xhat_zy_mu = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(), init.Normal(1e-3), None) l_xhat_zy_logvar = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(), init.Normal(1e-3), None) l_xhat_zy = SampleLayer(l_xhat_zy_mu, l_xhat_zy_logvar, eq_samples=1) l_xhat_zy_mu_reshaped = ReshapeLayer( l_xhat_zy_mu, (-1, self.sym_samples, 1, n_x)) l_xhat_zy_logvar_reshaped = ReshapeLayer( l_xhat_zy_logvar, (-1, self.sym_samples, 1, n_x)) l_xhat_zy_reshaped = ReshapeLayer(l_xhat_zy, (-1, self.sym_samples, 1, n_x)) ### Various class variables ### self.l_x_in = l_x_in self.l_y_in = l_y_in self.l_a_mu = l_a_x_mu_reshaped self.l_a_logvar = l_a_x_logvar_reshaped self.l_a = l_a_x_reshaped self.l_z_mu = l_z_axy_mu_reshaped self.l_z_logvar = l_z_axy_logvar_reshaped self.l_z = l_z_axy_reshaped self.l_y = l_y_xa_reshaped self.l_xhat_mu = l_xhat_zy_mu_reshaped self.l_xhat_logvar = l_xhat_zy_logvar_reshaped self.l_xhat = l_xhat_zy_reshaped self.model_params = get_all_params([self.l_xhat, self.l_y]) ### Calculate networks shapes for documentation ### self.qa_shapes = self.get_model_shape(get_all_params(l_a_x)) self.qy_shapes = self.get_model_shape( get_all_params(l_y_xa))[len(self.qa_shapes) - 1:] self.qz_shapes = self.get_model_shape(get_all_params(l_z_xy)) self.px_shapes = self.get_model_shape( get_all_params(l_xhat_zy))[(len(self.qz_shapes) - 1):] 
    ### Predefined functions for generating xhat and y ###
    inputs = {l_z_xy: self.sym_z, self.l_y_in: self.sym_y}
    outputs = get_output(self.l_xhat, inputs, deterministic=True).mean(axis=(1, 2))
    inputs = [self.sym_z, self.sym_y, self.sym_samples]
    self.f_xhat = theano.function(inputs, outputs)

    inputs = [self.sym_x_l, self.sym_samples]
    outputs = get_output(self.l_y, self.sym_x_l, deterministic=True).mean(axis=(1, 2))
    self.f_y = theano.function(inputs, outputs)

    self.y_params = get_all_params(self.l_y, trainable=True)[(len(a_hidden) + 2) * 2::]
    self.xhat_params = get_all_params(self.l_xhat, trainable=True)
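# --- Standalone sketch (illustrative, not part of the model) ----------------
# The slice above skips the parameters of the auxiliary network q(a|x) when
# collecting classifier-specific parameters: each DenseLayer owns a (W, b)
# pair, and q(a|x) has len(a_hidden) hidden layers plus the mu and logvar
# output layers, hence (len(a_hidden) + 2) * 2 leading parameters.
def n_aux_params(a_hidden):
    return (len(a_hidden) + 2) * 2

# e.g. a_hidden = [500, 500] gives 8 leading parameters (4 weight matrices, 4 biases).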
def build_model(self, train_set, test_set, validation_set=None): """ Build the auxiliary deep generative model from the initialized hyperparameters. Define the lower bound term and compile it into a training function. :param train_set: Train set containing variables x, t. for the unlabeled data_preparation in the train set, we define 0's in t. :param test_set: Test set containing variables x, t. :param validation_set: Validation set containing variables x, t. :return: train, test, validation function and dicts of arguments. """ super(ADGMSSL, self).build_model(train_set, test_set, validation_set) # Define the layers for the density estimation used in the lower bound. l_log_pa = GaussianMarginalLogDensityLayer(self.l_a_mu, self.l_a_logvar) l_log_pz = GaussianMarginalLogDensityLayer(self.l_z_mu, self.l_z_logvar) l_log_qa_x = GaussianMarginalLogDensityLayer(1, self.l_a_logvar) l_log_qz_xy = GaussianMarginalLogDensityLayer(1, self.l_z_logvar) l_log_qy_ax = MultinomialLogDensityLayer(self.l_y, self.l_y_in, eps=1e-8) if self.x_dist == 'bernoulli': l_px_zy = BernoulliLogDensityLayer(self.l_xhat, self.l_x_in) elif self.x_dist == 'multinomial': l_px_zy = MultinomialLogDensityLayer(self.l_xhat, self.l_x_in) elif self.x_dist == 'gaussian': l_px_zy = GaussianLogDensityLayer(self.l_x_in, self.l_xhat_mu, self.l_xhat_logvar) ### Compute lower bound for labeled data_preparation ### out_layers = [ l_log_pa, l_log_pz, l_log_qa_x, l_log_qz_xy, l_px_zy, l_log_qy_ax ] inputs = {self.l_x_in: self.sym_x_l, self.l_y_in: self.sym_t_l} log_pa_l, log_pz_l, log_qa_x_l, log_qz_axy_l, log_px_zy_l, log_qy_ax_l = get_output( out_layers, inputs) py_l = softmax(T.zeros( (self.sym_x_l.shape[0], self.n_y))) # non-informative prior log_py_l = -categorical_crossentropy(py_l, self.sym_t_l).reshape( (-1, 1)).dimshuffle((0, 'x', 'x', 1)) lb_l = log_pa_l + log_pz_l + log_py_l + log_px_zy_l - log_qa_x_l - log_qz_axy_l # Upscale the discriminative term with a weight. log_qy_ax_l *= self.sym_beta xhat_grads_l = T.grad(lb_l.mean(axis=(1, 2)).sum(), self.xhat_params) y_grads_l = T.grad(log_qy_ax_l.mean(axis=(1, 2)).sum(), self.y_params) lb_l += log_qy_ax_l lb_l = lb_l.mean(axis=(1, 2)) ### Compute lower bound for unlabeled data_preparation ### bs_u = self.sym_x_u.shape[0] # size of the unlabeled data_preparation. t_eye = T.eye(self.n_y, k=0) # ones in diagonal and 0's elsewhere (bs x n_y). # repeat unlabeled t the number of classes for integration (bs * n_y) x n_y. t_u = t_eye.reshape((self.n_y, 1, self.n_y)).repeat(bs_u, axis=1).reshape( (-1, self.n_y)) # repeat unlabeled x the number of classes for integration (bs * n_y) x n_x x_u = self.sym_x_u.reshape( (1, bs_u, self.n_x)).repeat(self.n_y, axis=0).reshape( (-1, self.n_x)) out_layers = [l_log_pa, l_log_pz, l_log_qa_x, l_log_qz_xy, l_px_zy] inputs = {self.l_x_in: x_u, self.l_y_in: t_u} log_pa_u, log_pz_u, log_qa_x_u, log_qz_axy_u, log_px_zy_u = get_output( out_layers, inputs) py_u = softmax(T.zeros( (bs_u * self.n_y, self.n_y))) # non-informative prior. log_py_u = -categorical_crossentropy(py_u, t_u).reshape( (-1, 1)).dimshuffle((0, 'x', 'x', 1)) lb_u = log_pa_u + log_pz_u + log_py_u + log_px_zy_u - log_qa_x_u - log_qz_axy_u lb_u = lb_u.reshape( (self.n_y, self.sym_samples, 1, bs_u)).transpose(3, 1, 2, 0).mean(axis=(1, 2)) # mean over samples. y_ax_u = get_output(self.l_y, self.sym_x_u) y_ax_u = y_ax_u.mean(axis=(1, 2)) # bs x n_y y_ax_u += 1e-8 # ensure that we get no NANs. 
y_ax_u /= T.sum(y_ax_u, axis=1, keepdims=True) xhat_grads_u = T.grad((y_ax_u * lb_u).sum(axis=1).sum(), self.xhat_params) lb_u = (y_ax_u * (lb_u - T.log(y_ax_u))).sum(axis=1) y_grads_u = T.grad(lb_u.sum(), self.y_params) # Loss - regularizing with weight priors p(theta|N(0,1)) and clipping gradients y_weight_priors = 0.0 for p in self.y_params: if 'W' not in str(p): continue y_weight_priors += log_normal(p, 0, 1).sum() y_weight_priors_grad = T.grad(y_weight_priors, self.y_params, disconnected_inputs='ignore') xhat_weight_priors = 0.0 for p in self.xhat_params: if 'W' not in str(p): continue xhat_weight_priors += log_normal(p, 0, 1).sum() xhat_weight_priors_grad = T.grad(xhat_weight_priors, self.xhat_params, disconnected_inputs='ignore') n = self.sh_train_x.shape[0].astype( theano.config.floatX ) # no. of data_preparation points in train set n_b = n / self.sym_batchsize.astype( theano.config.floatX) # no. of batches in train set y_grads = [T.zeros(p.shape) for p in self.y_params] for i in range(len(y_grads)): y_grads[i] = (y_grads_l[i] + y_grads_u[i]) y_grads[i] *= n_b y_grads[i] += y_weight_priors_grad[i] y_grads[i] /= -n xhat_grads = [T.zeros(p.shape) for p in self.xhat_params] for i in range(len(xhat_grads)): xhat_grads[i] = (xhat_grads_l[i] + xhat_grads_u[i]) xhat_grads[i] *= n_b xhat_grads[i] += xhat_weight_priors_grad[i] xhat_grads[i] /= -n params = self.y_params + self.xhat_params grads = y_grads + xhat_grads # Collect the lower bound and scale it with the weight priors. elbo = ((lb_l.sum() + lb_u.sum()) * n_b + y_weight_priors + xhat_weight_priors) / -n # Avoid vanishing and exploding gradients. clip_grad, max_norm = 1, 5 mgrads = total_norm_constraint(grads, max_norm=max_norm) mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads] sym_beta1 = T.scalar('beta1') sym_beta2 = T.scalar('beta2') updates = adam(mgrads, params, self.sym_lr, sym_beta1, sym_beta2) ### Compile training function ### x_batch_l = self.sh_train_x[self.batch_slice][:self.sym_bs_l] x_batch_u = self.sh_train_x[self.batch_slice][self.sym_bs_l:] t_batch_l = self.sh_train_t[self.batch_slice][:self.sym_bs_l] if self.x_dist == 'bernoulli': # Sample bernoulli input. x_batch_u = self._srng.binomial(size=x_batch_u.shape, n=1, p=x_batch_u, dtype=theano.config.floatX) x_batch_l = self._srng.binomial(size=x_batch_l.shape, n=1, p=x_batch_l, dtype=theano.config.floatX) givens = { self.sym_x_l: x_batch_l, self.sym_x_u: x_batch_u, self.sym_t_l: t_batch_l } inputs = [ self.sym_index, self.sym_batchsize, self.sym_bs_l, self.sym_beta, self.sym_lr, sym_beta1, sym_beta2, self.sym_samples ] f_train = theano.function(inputs=inputs, outputs=[elbo], givens=givens, updates=updates) # Default training args. Note that these can be changed during or prior to training. self.train_args['inputs']['batchsize'] = 200 self.train_args['inputs']['batchsize_labeled'] = 100 self.train_args['inputs']['beta'] = 1200. self.train_args['inputs']['learningrate'] = 3e-4 self.train_args['inputs']['beta1'] = 0.9 self.train_args['inputs']['beta2'] = 0.999 self.train_args['inputs']['samples'] = 1 self.train_args['outputs']['lb'] = '%0.4f' ### Compile testing function ### class_err_test = self._classification_error(self.sym_x_l, self.sym_t_l) givens = {self.sym_x_l: self.sh_test_x, self.sym_t_l: self.sh_test_t} f_test = theano.function(inputs=[self.sym_samples], outputs=[class_err_test], givens=givens) # Testing args. Note that these can be changed during or prior to training. 
self.test_args['inputs']['samples'] = 1 self.test_args['outputs']['err'] = '%0.2f%%' ### Compile validation function ### f_validate = None if validation_set is not None: class_err_valid = self._classification_error( self.sym_x_l, self.sym_t_l) givens = { self.sym_x_l: self.sh_valid_x, self.sym_t_l: self.sh_valid_t } inputs = [self.sym_samples] f_validate = theano.function(inputs=[self.sym_samples], outputs=[class_err_valid], givens=givens) # Default validation args. Note that these can be changed during or prior to training. self.validate_args['inputs']['samples'] = 1 self.validate_args['outputs']['err'] = '%0.2f%%' return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
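The unlabeled bound above integrates out the class variable: each unlabeled example is repeated once per class and paired with the matching one-hot target, the per-class bounds are weighted by q(y|a,x), and the entropy of q(y|a,x) is added. A minimal NumPy sketch of the repeat construction (the sizes and names here are illustrative only, not taken from the model):

import numpy as np

# Hypothetical sizes: 2 unlabeled examples, 3 classes, 4 input features.
bs_u, n_y, n_x = 2, 3, 4
x_u = np.arange(bs_u * n_x, dtype='float32').reshape(bs_u, n_x)

# Targets: (n_y, 1, n_y) -> (bs_u * n_y, n_y), one one-hot row per (example, class) pair.
t_rep = np.eye(n_y).reshape(n_y, 1, n_y).repeat(bs_u, axis=1).reshape(-1, n_y)

# Inputs repeated the same way: (1, bs_u, n_x) -> (bs_u * n_y, n_x).
x_rep = x_u.reshape(1, bs_u, n_x).repeat(n_y, axis=0).reshape(-1, n_x)

# Row i pairs example (i % bs_u) with class (i // bs_u), matching the t_u / x_u layout above.
assert x_rep.shape == (bs_u * n_y, n_x) and t_rep.shape == (bs_u * n_y, n_y)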
def __init__(self, n_c, n_z, qz_hid, px_hid, enc_rnn=256, dec_rnn=256, n_l=28, nonlinearity=rectify, px_nonlinearity=None, x_dist='bernoulli', batchnorm=False, seed=1234): """ Weights are initialized using the Bengio and Glorot (2010) initialization scheme. :param n_c: Number of inputs. :param n_z: Number of latent. :param qz_hid: List of number of deterministic hidden q(z|a,x,y). :param px_hid: List of number of deterministic hidden p(a|z,y) & p(x|z,y). :param nonlinearity: The transfer function used in the deterministic layers. :param x_dist: The x distribution, 'bernoulli', 'multinomial', or 'gaussian'. :param batchnorm: Boolean value for batch normalization. :param seed: The random seed. """ super(RVAE, self).__init__(n_c, qz_hid + px_hid, n_z, nonlinearity) self.x_dist = x_dist self.n_x = n_c self.seq_length = n_l self.n_z = n_z self.batchnorm = batchnorm self._srng = RandomStreams(seed) # Decide Glorot initializaiton of weights. init_w = 1e-3 hid_w = "" if nonlinearity == rectify or nonlinearity == softplus: hid_w = "relu" # Define symbolic variables for theano functions. self.sym_x = T.tensor3('x') # inputs self.sym_z = T.matrix('z') self.sym_samples = T.iscalar('samples') # MC samples self.sym_warmup = T.fscalar('warmup') # Assist methods for collecting the layers def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal): dense = DenseLayer(layer_in, num_units=n, W=dist_w(hid_w), b=dist_b(init_w), nonlinearity=None) if batchnorm: dense = BatchNormLayer(dense) return NonlinearityLayer(dense, self.transf) def stochastic_layer(layer_in, n, samples, nonlin=None): mu = DenseLayer(layer_in, n, W=init.Normal(init_w, mean=.0), b=init.Normal(init_w), nonlinearity=nonlin) logvar = DenseLayer(layer_in, n, W=init.Normal(init_w, mean=.0), b=init.Normal(init_w), nonlinearity=nonlin) # logvar = ConstrainLayer(logvar, scale=1, max=T.log(-0.999 * self.sym_warmup + 1.0999)) return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar def lstm_layer(input, nunits, return_final, backwards=False, name='LSTM'): ingate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), b=init.Constant(0.0)) forgetgate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), b=init.Constant(5.0)) cell = Gate( W_cell=None, nonlinearity=T.tanh, W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), ) outgate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), b=init.Constant(0.0)) lstm = LSTMLayer(input, num_units=nunits, backwards=backwards, peepholes=False, ingate=ingate, forgetgate=forgetgate, cell=cell, outgate=outgate, name=name, only_return_final=return_final) return lstm # RNN encoder implementation l_x_in = InputLayer((None, n_l, n_c)) l_enc_forward = lstm_layer(l_x_in, enc_rnn, return_final=True, backwards=False, name='enc_forward') l_enc_backward = lstm_layer(l_x_in, enc_rnn, return_final=True, backwards=True, name='enc_backward') l_enc_concat = ConcatLayer([l_enc_forward, l_enc_backward], axis=-1) l_enc = dense_layer(l_enc_concat, enc_rnn) # # Overwrite encoder # l_enc = dense_layer(l_x_in, enc_rnn) # Recognition q(z|x) l_qz = l_enc for hid in qz_hid: l_qz = dense_layer(l_qz, hid) # Reparameterisation and sample l_qz_mu = DenseLayer(l_qz, n_z, W=init.Normal(init_w, mean=1.0), b=init.Normal(init_w), nonlinearity=None) l_qz_logvar = DenseLayer(l_qz, n_z, init.Normal(init_w), init.Normal(init_w), nonlinearity=None) l_qz = SampleLayer(l_qz_mu, l_qz_logvar, eq_samples=self.sym_samples, iw_samples=1) # Generative p(x|z) l_qz_repeat = RepeatLayer(l_qz, n=n_l) # Skip connection 
to encoder until warmup threshold is reached if T.ge(self.sym_warmup, 0.4): l_skip_enc_repeat = RepeatLayer(l_enc, n=n_l) l_qz_repeat = ConcatLayer([l_qz_repeat, l_skip_enc_repeat], axis=-1) l_dec_forward = lstm_layer(l_qz_repeat, dec_rnn, return_final=False, backwards=False, name='dec_forward') l_dec_backward = lstm_layer(l_qz_repeat, dec_rnn, return_final=False, backwards=True, name='dec_backward') l_dec_concat = ConcatLayer([l_dec_forward, l_dec_backward], axis=-1) l_dec = ReshapeLayer(l_dec_concat, (-1, 2 * dec_rnn)) l_dec = dense_layer(l_dec, dec_rnn) # # Overwrite decoder # l_dec = dense_layer(l_qz, n_l) # Add additional dense layers l_px = l_dec for hid in px_hid: l_px = dense_layer(l_px, hid) # Reshape the last dimension and perhaps model with a distribution if x_dist == 'bernoulli': l_px = DenseLayer(l_px, n_c, init.GlorotNormal(), init.Normal(init_w), sigmoid) elif x_dist == 'multinomial': l_px = DenseLayer(l_px, n_c, init.GlorotNormal(), init.Normal(init_w), softmax) elif x_dist == 'gaussian': l_px, l_px_mu, l_px_logvar = stochastic_layer( l_px, n_c, self.sym_samples, nonlin=px_nonlinearity) elif x_dist == 'linear': l_px = DenseLayer(l_px, n_c, nonlinearity=None) # Reshape all the model layers to have the same size self.l_x_in = l_x_in self.l_qz = ReshapeLayer(l_qz, (-1, self.sym_samples, 1, n_z)) self.l_qz_mu = DimshuffleLayer(l_qz_mu, (0, 'x', 'x', 1)) self.l_qz_logvar = DimshuffleLayer(l_qz_logvar, (0, 'x', 'x', 1)) self.l_px = DimshuffleLayer( ReshapeLayer(l_px, (-1, n_l, self.sym_samples, 1, n_c)), (0, 2, 3, 1, 4)) self.l_px_mu = DimshuffleLayer(ReshapeLayer(l_px_mu, (-1, n_l, self.sym_samples, 1, n_c)), (0, 2, 3, 1, 4)) \ if x_dist == "gaussian" else None self.l_px_logvar = DimshuffleLayer(ReshapeLayer(l_px_logvar, (-1, n_l, self.sym_samples, 1, n_c)), (0, 2, 3, 1, 4)) \ if x_dist == "gaussian" else None # Predefined functions inputs = {self.l_x_in: self.sym_x} outputs = get_output(l_qz, inputs, deterministic=True) self.f_qz = theano.function([self.sym_x, self.sym_samples], outputs, on_unused_input='warn') inputs = {l_qz: self.sym_z, self.l_x_in: self.sym_x} outputs = get_output(self.l_px, inputs, deterministic=True).mean(axis=(1, 2)) self.f_px = theano.function([self.sym_x, self.sym_z, self.sym_samples], outputs, on_unused_input='warn') if x_dist == "gaussian": outputs = get_output(self.l_px_mu, inputs, deterministic=True).mean(axis=(1, 2)) self.f_mu = theano.function( [self.sym_x, self.sym_z, self.sym_samples], outputs, on_unused_input='ignore') outputs = get_output(self.l_px_logvar, inputs, deterministic=True).mean(axis=(1, 2)) self.f_var = theano.function( [self.sym_x, self.sym_z, self.sym_samples], outputs, on_unused_input='ignore') # Define model parameters self.model_params = get_all_params([self.l_px]) self.trainable_model_params = get_all_params([self.l_px], trainable=True)
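Before the bidirectional decoder LSTMs, the sampled latent code is tiled across all n_l decoding steps via RepeatLayer(l_qz, n=n_l), optionally concatenated with a repeated copy of the encoder output as a skip connection. A small NumPy sketch of that tiling, with purely illustrative shapes and assuming RepeatLayer tiles along a new time axis:

import numpy as np

batch, n_z, n_l = 4, 16, 28
z = np.random.rand(batch, n_z).astype('float32')

# Tile the same latent code at every decoding timestep -> (batch, n_l, n_z).
z_repeated = np.repeat(z[:, None, :], n_l, axis=1)
assert z_repeated.shape == (batch, n_l, n_z)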
def __init__(self, n_l, n_c, n_a, n_z, n_y, qa_hid, qz_hid, qy_hid, px_hid, pa_hid, enc_rnn=256, dec_rnn=256, nonlinearity=rectify, px_nonlinearity=None, x_dist='bernoulli', batchnorm=False, seed=1234): """ Initialize an skip deep generative model consisting of discriminative classifier q(y|a,x), generative model P p(a|z,y) and p(x|a,z,y), inference model Q q(a|x) and q(z|a,x,y). Weights are initialized using the Bengio and Glorot (2010) initialization scheme. :param n_c: Number of inputs. :param n_a: Number of auxiliary. :param n_z: Number of latent. :param n_y: Number of classes. :param qa_hid: List of number of deterministic hidden q(a|x). :param qz_hid: List of number of deterministic hidden q(z|a,x,y). :param qy_hid: List of number of deterministic hidden q(y|a,x). :param px_hid: List of number of deterministic hidden p(a|z,y) & p(x|z,y). :param nonlinearity: The transfer function used in the deterministic layers. :param x_dist: The x distribution, 'bernoulli', 'multinomial', or 'gaussian'. :param batchnorm: Boolean value for batch normalization. :param seed: The random seed. """ super(RSDGM, self).__init__(n_c, qz_hid + px_hid, n_a + n_z, nonlinearity) self.x_dist = x_dist self.n_y = n_y self.n_c = n_c self.n_a = n_a self.n_z = n_z self.n_l = n_l self.batchnorm = batchnorm self._srng = RandomStreams(seed) # Decide Glorot initializaiton of weights. init_w = 1e-3 hid_w = "" if nonlinearity == rectify or nonlinearity == softplus: hid_w = "relu" # Define symbolic variables for theano functions. self.sym_beta = T.scalar('beta') # scaling constant beta self.sym_x_l = T.tensor3('x_l') # labeled inputs self.sym_t_l = T.matrix('t') # labeled targets self.sym_x_u = T.tensor3('x_u') # unlabeled inputs self.sym_bs_l = T.iscalar('bs_l') # number of labeled data self.sym_samples = T.iscalar('samples') # MC samples self.sym_z = T.matrix('z') # latent variable z self.sym_a = T.matrix('a') # auxiliary variable a self.sym_warmup = T.fscalar('warmup') # warmup to dampen KL term # Assist methods for collecting the layers def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal): dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w), None) if batchnorm: dense = BatchNormLayer(dense) return NonlinearityLayer(dense, self.transf) def stochastic_layer(layer_in, n, samples, nonlin=None): mu = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin) logvar = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin) return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar def lstm_layer(input, nunits, return_final, backwards=False, name='LSTM'): ingate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), b=init.Constant(0.0)) forgetgate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), b=init.Constant(5.0)) cell = Gate( W_cell=None, nonlinearity=T.tanh, W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), ) outgate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), b=init.Constant(0.0)) lstm = LSTMLayer(input, num_units=nunits, backwards=backwards, peepholes=False, ingate=ingate, forgetgate=forgetgate, cell=cell, outgate=outgate, name=name, only_return_final=return_final) rec = RecurrentLayer(input, nunits, W_in_to_hid=init.GlorotNormal('relu'), W_hid_to_hid=init.GlorotNormal('relu'), backwards=backwards, nonlinearity=rectify, only_return_final=return_final, name=name) return lstm # Input layers l_y_in = InputLayer((None, n_y)) l_x_in = InputLayer((None, n_l, n_c)) # RNN encoder implementation l_enc_forward = 
lstm_layer(l_x_in, enc_rnn, return_final=True, backwards=False, name='enc_forward') l_enc_backward = lstm_layer(l_x_in, enc_rnn, return_final=True, backwards=True, name='enc_backward') l_enc_concat = ConcatLayer([l_enc_forward, l_enc_backward]) l_enc = dense_layer(l_enc_concat, enc_rnn) # Auxiliary q(a|x) l_qa_x = l_enc for hid in qa_hid: l_qa_x = dense_layer(l_qa_x, hid) l_qa_x, l_qa_x_mu, l_qa_x_logvar = stochastic_layer( l_qa_x, n_a, self.sym_samples) # Classifier q(y|a,x) l_qa_to_qy = DenseLayer(l_qa_x, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qa_to_qy = ReshapeLayer(l_qa_to_qy, (-1, self.sym_samples, 1, qy_hid[0])) l_x_to_qy = DenseLayer(l_enc, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_x_to_qy = DimshuffleLayer(l_x_to_qy, (0, 'x', 'x', 1)) l_qy_xa = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qy, l_x_to_qy]), (-1, qy_hid[0])) if batchnorm: l_qy_xa = BatchNormLayer(l_qy_xa) l_qy_xa = NonlinearityLayer(l_qy_xa, self.transf) if len(qy_hid) > 1: for hid in qy_hid[1:]: l_qy_xa = dense_layer(l_qy_xa, hid) l_qy_xa = DenseLayer(l_qy_xa, n_y, init.GlorotNormal(), init.Normal(init_w), softmax) # Recognition q(z|x,a,y) l_qa_to_qz = DenseLayer(l_qa_x, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qa_to_qz = ReshapeLayer(l_qa_to_qz, (-1, self.sym_samples, 1, qz_hid[0])) l_x_to_qz = DenseLayer(l_enc, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_x_to_qz = DimshuffleLayer(l_x_to_qz, (0, 'x', 'x', 1)) l_y_to_qz = DenseLayer(l_y_in, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_y_to_qz = DimshuffleLayer(l_y_to_qz, (0, 'x', 'x', 1)) l_qz_axy = ReshapeLayer( ElemwiseSumLayer([l_qa_to_qz, l_x_to_qz, l_y_to_qz]), (-1, qz_hid[0])) if batchnorm: l_qz_axy = BatchNormLayer(l_qz_axy) l_qz_axy = NonlinearityLayer(l_qz_axy, self.transf) if len(qz_hid) > 1: for hid in qz_hid[1:]: l_qz_axy = dense_layer(l_qz_axy, hid) l_qz_axy, l_qz_axy_mu, l_qz_axy_logvar = stochastic_layer( l_qz_axy, n_z, 1) # Generative p(a|z,y) l_y_to_pa = DenseLayer(l_y_in, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_y_to_pa = DimshuffleLayer(l_y_to_pa, (0, 'x', 'x', 1)) l_qz_to_pa = DenseLayer(l_qz_axy, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qz_to_pa = ReshapeLayer(l_qz_to_pa, (-1, self.sym_samples, 1, pa_hid[0])) l_pa_zy = ReshapeLayer(ElemwiseSumLayer([l_qz_to_pa, l_y_to_pa]), [-1, pa_hid[0]]) if batchnorm: l_pa_zy = BatchNormLayer(l_pa_zy) l_pa_zy = NonlinearityLayer(l_pa_zy, self.transf) if len(pa_hid) > 1: for hid in pa_hid[1:]: l_pa_zy = dense_layer(l_pa_zy, hid) l_pa_zy, l_pa_zy_mu, l_pa_zy_logvar = stochastic_layer(l_pa_zy, n_a, 1) # Generative p(x|a,z,y) l_qa_to_px = DenseLayer(l_qa_x, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qa_to_px = ReshapeLayer(l_qa_to_px, (-1, self.sym_samples, 1, px_hid[0])) l_y_to_px = DenseLayer(l_y_in, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_y_to_px = DimshuffleLayer(l_y_to_px, (0, 'x', 'x', 1)) l_qz_to_px = DenseLayer(l_qz_axy, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None) l_qz_to_px = ReshapeLayer(l_qz_to_px, (-1, self.sym_samples, 1, px_hid[0])) l_px_azy = ReshapeLayer( ElemwiseSumLayer([l_qa_to_px, l_qz_to_px, l_y_to_px]), [-1, px_hid[0]]) if batchnorm: l_px_azy = BatchNormLayer(l_px_azy) l_px_azy = NonlinearityLayer(l_px_azy, self.transf) # RNN decoder implementation l_px_azy_repeat = RepeatLayer(l_px_azy, n=n_l) l_dec_forward = lstm_layer(l_px_azy_repeat, dec_rnn, 
return_final=False, backwards=False, name='dec_forward') l_dec_backward = lstm_layer(l_px_azy_repeat, dec_rnn, return_final=False, backwards=True, name='dec_backward') l_dec_concat = ConcatLayer([l_dec_forward, l_dec_backward], axis=-1) l_dec = ReshapeLayer(l_dec_concat, (-1, 2 * dec_rnn)) l_dec = dense_layer(l_dec, dec_rnn) l_px_azy = l_dec if len(px_hid) > 1: for hid in px_hid[1:]: l_px_azy = dense_layer(l_px_azy, hid) if x_dist == 'bernoulli': l_px_azy = DenseLayer(l_px_azy, n_c, init.GlorotNormal(), init.Normal(init_w), sigmoid) elif x_dist == 'multinomial': l_px_azy = DenseLayer(l_px_azy, n_c, init.GlorotNormal(), init.Normal(init_w), softmax) elif x_dist == 'gaussian': l_px_azy, l_px_zy_mu, l_px_zy_logvar = stochastic_layer( l_px_azy, n_c, self.sym_samples, px_nonlinearity) # Reshape all the model layers to have the same size self.l_x_in = l_x_in self.l_y_in = l_y_in self.l_a_in = l_qa_x self.l_qa = ReshapeLayer(l_qa_x, (-1, self.sym_samples, 1, n_a)) self.l_qa_mu = DimshuffleLayer(l_qa_x_mu, (0, 'x', 'x', 1)) self.l_qa_logvar = DimshuffleLayer(l_qa_x_logvar, (0, 'x', 'x', 1)) self.l_qz = ReshapeLayer(l_qz_axy, (-1, self.sym_samples, 1, n_z)) self.l_qz_mu = ReshapeLayer(l_qz_axy_mu, (-1, self.sym_samples, 1, n_z)) self.l_qz_logvar = ReshapeLayer(l_qz_axy_logvar, (-1, self.sym_samples, 1, n_z)) self.l_qy = ReshapeLayer(l_qy_xa, (-1, self.sym_samples, 1, n_y)) self.l_pa = ReshapeLayer(l_pa_zy, (-1, self.sym_samples, 1, n_a)) self.l_pa_mu = ReshapeLayer(l_pa_zy_mu, (-1, self.sym_samples, 1, n_a)) self.l_pa_logvar = ReshapeLayer(l_pa_zy_logvar, (-1, self.sym_samples, 1, n_a)) self.l_px = ReshapeLayer(l_px_azy, (-1, n_l, self.sym_samples, 1, n_c)) self.l_px_mu = ReshapeLayer(l_px_zy_mu, (-1, n_l, self.sym_samples, 1, n_c)) \ if x_dist == "gaussian" else None self.l_px_logvar = ReshapeLayer(l_px_zy_logvar, (-1, n_l, self.sym_samples, 1, n_c)) \ if x_dist == "gaussian" else None # Predefined functions inputs = [self.sym_x_l, self.sym_samples] outputs = get_output(self.l_qy, self.sym_x_l, deterministic=True).mean(axis=(1, 2)) self.f_qy = theano.function(inputs, outputs) inputs = [self.sym_x_l, self.sym_samples] outputs = get_output(self.l_qa, self.sym_x_l, deterministic=True).mean(axis=(1, 2)) self.f_qa = theano.function(inputs, outputs) inputs = {l_x_in: self.sym_x_l, l_y_in: self.sym_t_l} outputs = get_output(l_qz_axy, inputs, deterministic=True) self.f_qz = theano.function( [self.sym_x_l, self.sym_t_l, self.sym_samples], outputs) inputs = {l_qz_axy: self.sym_z, l_y_in: self.sym_t_l} outputs = get_output(self.l_pa, inputs, deterministic=True) self.f_pa = theano.function( [self.sym_z, self.sym_t_l, self.sym_samples], outputs) inputs = { l_qa_x: self.sym_a, l_qz_axy: self.sym_z, l_y_in: self.sym_t_l } outputs = get_output(self.l_px, inputs, deterministic=True).mean(axis=(2, 3)) self.f_px = theano.function( [self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs) outputs = get_output(self.l_px_mu, inputs, deterministic=True).mean(axis=(2, 3)) self.f_mu = theano.function( [self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs) outputs = get_output(self.l_px_logvar, inputs, deterministic=True).mean(axis=(2, 3)) self.f_var = theano.function( [self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs) # Define model parameters self.model_params = get_all_params([self.l_qy, self.l_pa, self.l_px]) self.trainable_model_params = get_all_params( [self.l_qy, self.l_pa, self.l_px], trainable=True)
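Given a constructed RSDGM instance (constructor arguments omitted here), the compiled f_qy helper returns the class posterior q(y|a,x) averaged over the requested number of Monte Carlo samples of a. A hedged prediction sketch with illustrative shapes:

import numpy as np

# Hypothetical labeled sequences: 16 examples, n_l=50 timesteps, n_c=20 features.
x = np.random.rand(16, 50, 20).astype('float32')

probs = model.f_qy(x, 10)        # q(y|a,x) averaged over 10 MC samples -> (16, n_y)
y_pred = probs.argmax(axis=1)    # predicted class indices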
def get_output(self, x): return get_output(self.model, x, deterministic=True)
def build_model(self, train_set, test_set, validation_set=None): super(VAE, self).build_model(train_set, test_set, validation_set) # Density estimations l_log_pz = StandardNormalLogDensityLayer(self.l_z) l_log_qz_x = GaussianLogDensityLayer(self.l_z, self.l_z_mu, self.l_z_logvar) if self.x_dist == 'bernoulli': l_px_z = BernoulliLogDensityLayer(self.l_xhat, self.l_x_in) elif self.x_dist == 'gaussian': l_px_z = GaussianLogDensityLayer(self.l_x_in, self.l_xhat_mu, self.l_xhat_logvar) out_layers = [l_log_pz, l_log_qz_x, l_px_z] inputs = {self.l_x_in: self.sym_x} log_pz, log_qz_x, log_px_z = get_output(out_layers, inputs) lb = -(log_pz + log_px_z - log_qz_x).mean(axis=1).mean() all_params = get_all_params(self.l_xhat, trainable=True) sym_beta1 = T.scalar('beta1') sym_beta2 = T.scalar('beta2') updates = adam(lb, all_params, self.sym_lr, sym_beta1, sym_beta2) x_batch = self.sh_train_x[self.batch_slice] if self.x_dist == 'bernoulli': x_batch = self._srng.binomial(size=x_batch.shape, n=1, p=x_batch, dtype=theano.config.floatX) givens = {self.sym_x: x_batch} inputs = [ self.sym_index, self.sym_batchsize, self.sym_lr, sym_beta1, sym_beta2, self.sym_samples ] outputs = [lb] f_train = theano.function(inputs=inputs, outputs=outputs, givens=givens, updates=updates) # Training args self.train_args['inputs']['batchsize'] = 100 self.train_args['inputs']['learningrate'] = 3e-4 self.train_args['inputs']['beta1'] = 0.9 self.train_args['inputs']['beta2'] = 0.999 self.train_args['inputs']['samples'] = 1 self.train_args['outputs']['lb'] = '%0.4f' givens = {self.sym_x: self.sh_test_x} inputs = [self.sym_samples] outputs = [lb] f_test = theano.function(inputs=inputs, outputs=outputs, givens=givens) # Testing args self.test_args['inputs']['samples'] = 1 self.test_args['outputs']['lb'] = '%0.4f' f_validate = None if validation_set is not None: givens = {self.sym_x: self.sh_valid_x} inputs = [self.sym_samples] outputs = [lb] f_validate = theano.function(inputs=inputs, outputs=outputs, givens=givens) # Validation args self.validate_args['inputs']['samples'] = 1 self.validate_args['outputs']['lb'] = '%0.4f' return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
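The compiled f_train takes a batch index followed by the hyperparameters listed in inputs, and f_test takes only the number of MC samples. The training driver itself is not shown in this file; a hedged manual loop using the default values from train_args might look as follows (n_train, train_set, test_set and validation_set are placeholders for the caller's data):

import numpy as np

f_train, f_test, f_validate, train_args, test_args, validate_args = \
    model.build_model(train_set, test_set, validation_set)

batchsize, lr = 100, 3e-4
beta1, beta2, samples = 0.9, 0.999, 1
n_batches = n_train // batchsize   # n_train: number of rows in the shared training matrix

for epoch in range(10):
    lbs = [f_train(index, batchsize, lr, beta1, beta2, samples)[0] for index in range(n_batches)]
    print("epoch %d: train lb %0.4f, test lb %0.4f" % (epoch, np.mean(lbs), f_test(samples)[0]))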
def run_adgmssl_mnist(): """ Evaluate a auxiliary deep generative model on the mnist dataset with 100 evenly distributed labels. """ # Load the mnist supervised dataset for evaluation. (train_x, train_t), (test_x, test_t), (valid_x, valid_t) = mnist.load_supervised(filter_std=0.0, train_valid_combine=True) # Initialize the auxiliary deep generative model. model = ADGMSSL(n_x=train_x.shape[-1], n_a=100, n_z=100, n_y=10, a_hidden=[500, 500], z_hidden=[500, 500], xhat_hidden=[500, 500], y_hidden=[500, 500], trans_func=rectify, x_dist='bernoulli') model_id = 20151209002003 # Insert the trained model id here. model.load_model(model_id) # Load trained model. See configurations in the log file. # Evaluate the test error of the ADGM. mean_evals = model.get_output(test_x, 100) # 100 MC to get a good estimate for the auxiliary unit. t_class = np.argmax(test_t, axis=1) y_class = np.argmax(mean_evals, axis=1) class_err = np.sum(y_class != t_class) / 100. print "test set 100-samples: %0.2f%%." % class_err # Evaluate the active units in the auxiliary and latent distribution. f_a_mu_logvar = theano.function([model.sym_x_l], get_output([model.l_a_mu, model.l_a_logvar], model.sym_x_l)) q_a_mu, q_a_logvar = f_a_mu_logvar(test_x) log_pa = -0.5 * (np.log(2 * np.pi) + (q_a_mu ** 2 + np.exp(q_a_logvar))) log_qa_x = -0.5 * (np.log(2 * np.pi) + 1 + q_a_logvar) diff_pa_qa_x = (log_pa - log_qa_x).mean(axis=(1, 2)) mean_diff_pa_qa_x = np.abs(np.mean(diff_pa_qa_x, axis=0)) inputs = {model.l_x_in: model.sym_x_l, model.l_y_in: model.sym_t_l} f_z_mu_logvar = theano.function([model.sym_x_l, model.sym_t_l], get_output([model.l_z_mu, model.l_z_logvar], inputs)) q_z_mu, q_z_logvar = f_z_mu_logvar(test_x, test_t) log_pz = -0.5 * (np.log(2 * np.pi) + (q_z_mu ** 2 + np.exp(q_z_logvar))) log_qz_x = -0.5 * (np.log(2 * np.pi) + 1 + q_z_logvar) diff_pz_qz_x = (log_pz - log_qz_x).mean(axis=(1, 2)) mean_diff_pz_qz_x = np.abs(np.mean(diff_pz_qz_x, axis=0)) plt.figure() plt.subplot(111, axisbg='white') plt.plot(sorted(mean_diff_pa_qa_x)[::-1], color="#c0392b", label=r"$\log \frac{p(a_i)}{q(a_i|x)}$") plt.plot(sorted(mean_diff_pz_qz_x)[::-1], color="#9b59b6", label=r"$\log \frac{p(z_i)}{q(z_i|x)}$") plt.grid(color='0.9', linestyle='dashed', axis="y") plt.xlabel("stochastic units") plt.ylabel(r"$\log \frac{p(\cdot)}{q(\cdot)}$") plt.ylim((0, 2.7)) plt.legend() plt.savefig("output/diff.png", format="png") # Sample 100 random normal distributed samples with fixed class y in the latent space and generate xhat. table_size = 10 samples = 1 z = np.random.random_sample((table_size ** 2, 100)) y = np.eye(10, k=0).reshape(10, 1, 10).repeat(10, axis=1).reshape((-1, 10)) xhat = model.f_xhat(z, y, samples) plt.figure(figsize=(20, 20), dpi=300) i = 0 img_out = np.zeros((28 * table_size, 28 * table_size)) for x in range(table_size): for y in range(table_size): xa, xb = x * 28, (x + 1) * 28 ya, yb = y * 28, (y + 1) * 28 im = np.reshape(xhat[i], (28, 28)) img_out[xa:xb, ya:yb] = im i += 1 plt.matshow(img_out, cmap=plt.cm.binary) plt.xticks(np.array([])) plt.yticks(np.array([])) plt.savefig("output/mnist.png", format="png")
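Note that class_err above divides the miscount by a hard-coded 100., which only yields a percentage when the test set holds exactly 10,000 examples; a size-agnostic variant of the same computation would be:

class_err = 100.0 * np.mean(y_class != t_class)  # misclassification rate in percent for any test-set size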
def __init__(self, n_x, n_z, z_hidden, xhat_hidden, trans_func=rectify, init_w=1e-3, x_dist='gaussian', batchnorm=False): super(VAE, self).__init__(n_x, z_hidden + xhat_hidden, n_z, trans_func) self.n_x = n_x self.n_z = n_z self.x_dist = x_dist self.batchnorm = batchnorm self.sym_x = T.matrix('x') # symbolic inputs self.sym_z = T.matrix('z') self.sym_samples = T.iscalar('samples') self._srng = RandomStreams() def stochastic_layer(layer_in, n, samples, nonlin=None): mu = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin) logvar = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin) return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar # Input l_x_in = InputLayer((None, n_x)) # Inference q(z|x) l_z_x = l_x_in for hid in z_hidden: l_z_x = DenseLayer(l_z_x, hid, init.Normal(std=init_w), init.Normal(std=init_w), self.transf) l_z_x, l_z_x_mu, l_z_x_logvar = stochastic_layer( l_z_x, n_z, self.sym_samples) # Reshape for density layers l_z_x_reshaped = ReshapeLayer(l_z_x, (-1, self.sym_samples, n_z)) l_z_x_mu_reshaped = DimshuffleLayer(l_z_x_mu, (0, 'x', 1)) l_z_x_logvar_reshaped = DimshuffleLayer(l_z_x_logvar, (0, 'x', 1)) # Generative p(xhat|z) l_xhat_z = l_z_x for hid in xhat_hidden: l_xhat_z = DenseLayer(l_xhat_z, hid, init.Normal(std=init_w), init.Normal(std=init_w), self.transf) if x_dist == 'bernoulli': l_xhat_z_mu_reshaped = None l_xhat_z_logvar_reshaped = None l_xhat_z = DenseLayer(l_xhat_z, n_x, init.Normal(std=init_w), init.Normal(std=init_w), sigmoid) elif x_dist == 'gaussian': l_xhat_z, l_xhat_z_mu, l_xhat_z_logvar = stochastic_layer( l_xhat_z, n_x, self.sym_samples) l_xhat_z_mu_reshaped = ReshapeLayer(l_xhat_z_mu, (-1, self.sym_samples, 1, n_x)) l_xhat_z_logvar_reshaped = ReshapeLayer( l_xhat_z_logvar, (-1, self.sym_samples, 1, n_x)) l_xhat_z_reshaped = ReshapeLayer(l_xhat_z, (-1, self.sym_samples, 1, n_x)) # Init class variables self.l_x_in = l_x_in self.l_xhat_mu = l_xhat_z_mu_reshaped self.l_xhat_logvar = l_xhat_z_logvar_reshaped self.l_xhat = l_xhat_z_reshaped self.l_z = l_z_x_reshaped self.l_z_mu = l_z_x_mu_reshaped self.l_z_logvar = l_z_x_logvar_reshaped self.model_params = get_all_params(self.l_xhat) inputs = [self.sym_x, self.sym_samples] outputs = get_output(self.l_z, self.sym_x, deterministic=True).mean(axis=1) self.f_qz = theano.function(inputs, outputs) inputs = {l_z_x: self.sym_z} outputs = get_output(self.l_xhat, inputs, deterministic=True).mean(axis=(1, 2)) inputs = [self.sym_z, self.sym_samples] self.f_px = theano.function(inputs, outputs)
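A hedged end-to-end sketch of the helpers compiled above: f_qz encodes a batch into latent codes (averaged over MC samples) and f_px decodes latent codes back to input space. The sizes are illustrative and it is assumed the module's dependencies (base Model class, SampleLayer, etc.) are importable:

import numpy as np

# Hypothetical MNIST-like setup: 784 inputs, 32 latent units.
model = VAE(n_x=784, n_z=32, z_hidden=[256, 256], xhat_hidden=[256, 256], x_dist='bernoulli')

x = np.random.rand(5, 784).astype('float32')
z = model.f_qz(x, 1)        # encode: q(z|x) sample, averaged over 1 MC sample -> (5, 32)
x_hat = model.f_px(z, 1)    # decode: mean reconstruction -> (5, 784)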
def build_model(self, train_set, test_set, validation_set=None): """ Build the auxiliary deep generative model from the initialized hyperparameters. Define the lower bound term and compile it into a training function. :param train_set: Train set containing variables x, t. for the unlabeled data_preparation in the train set, we define 0's in t. :param test_set: Test set containing variables x, t. :param validation_set: Validation set containing variables x, t. :return: train, test, validation function and dicts of arguments. """ super(ADGMSSL, self).build_model(train_set, test_set, validation_set) # Define the layers for the density estimation used in the lower bound. l_log_pa = GaussianMarginalLogDensityLayer(self.l_a_mu, self.l_a_logvar) l_log_pz = GaussianMarginalLogDensityLayer(self.l_z_mu, self.l_z_logvar) l_log_qa_x = GaussianMarginalLogDensityLayer(1, self.l_a_logvar) l_log_qz_xy = GaussianMarginalLogDensityLayer(1, self.l_z_logvar) l_log_qy_ax = MultinomialLogDensityLayer(self.l_y, self.l_y_in, eps=1e-8) if self.x_dist == 'bernoulli': l_px_zy = BernoulliLogDensityLayer(self.l_xhat, self.l_x_in) elif self.x_dist == 'multinomial': l_px_zy = MultinomialLogDensityLayer(self.l_xhat, self.l_x_in) elif self.x_dist == 'gaussian': l_px_zy = GaussianLogDensityLayer(self.l_x_in, self.l_xhat_mu, self.l_xhat_logvar) ### Compute lower bound for labeled data_preparation ### out_layers = [l_log_pa, l_log_pz, l_log_qa_x, l_log_qz_xy, l_px_zy, l_log_qy_ax] inputs = {self.l_x_in: self.sym_x_l, self.l_y_in: self.sym_t_l} log_pa_l, log_pz_l, log_qa_x_l, log_qz_axy_l, log_px_zy_l, log_qy_ax_l = get_output(out_layers, inputs) py_l = softmax(T.zeros((self.sym_x_l.shape[0], self.n_y))) # non-informative prior log_py_l = -categorical_crossentropy(py_l, self.sym_t_l).reshape((-1, 1)).dimshuffle((0, 'x', 'x', 1)) lb_l = log_pa_l + log_pz_l + log_py_l + log_px_zy_l - log_qa_x_l - log_qz_axy_l # Upscale the discriminative term with a weight. log_qy_ax_l *= self.sym_beta xhat_grads_l = T.grad(lb_l.mean(axis=(1, 2)).sum(), self.xhat_params) y_grads_l = T.grad(log_qy_ax_l.mean(axis=(1, 2)).sum(), self.y_params) lb_l += log_qy_ax_l lb_l = lb_l.mean(axis=(1, 2)) ### Compute lower bound for unlabeled data_preparation ### bs_u = self.sym_x_u.shape[0] # size of the unlabeled data_preparation. t_eye = T.eye(self.n_y, k=0) # ones in diagonal and 0's elsewhere (bs x n_y). # repeat unlabeled t the number of classes for integration (bs * n_y) x n_y. t_u = t_eye.reshape((self.n_y, 1, self.n_y)).repeat(bs_u, axis=1).reshape((-1, self.n_y)) # repeat unlabeled x the number of classes for integration (bs * n_y) x n_x x_u = self.sym_x_u.reshape((1, bs_u, self.n_x)).repeat(self.n_y, axis=0).reshape((-1, self.n_x)) out_layers = [l_log_pa, l_log_pz, l_log_qa_x, l_log_qz_xy, l_px_zy] inputs = {self.l_x_in: x_u, self.l_y_in: t_u} log_pa_u, log_pz_u, log_qa_x_u, log_qz_axy_u, log_px_zy_u = get_output(out_layers, inputs) py_u = softmax(T.zeros((bs_u * self.n_y, self.n_y))) # non-informative prior. log_py_u = -categorical_crossentropy(py_u, t_u).reshape((-1, 1)).dimshuffle((0, 'x', 'x', 1)) lb_u = log_pa_u + log_pz_u + log_py_u + log_px_zy_u - log_qa_x_u - log_qz_axy_u lb_u = lb_u.reshape((self.n_y, self.sym_samples, 1, bs_u)).transpose(3, 1, 2, 0).mean( axis=(1, 2)) # mean over samples. y_ax_u = get_output(self.l_y, self.sym_x_u) y_ax_u = y_ax_u.mean(axis=(1, 2)) # bs x n_y y_ax_u += 1e-8 # ensure that we get no NANs. 
y_ax_u /= T.sum(y_ax_u, axis=1, keepdims=True) xhat_grads_u = T.grad((y_ax_u * lb_u).sum(axis=1).sum(), self.xhat_params) lb_u = (y_ax_u * (lb_u - T.log(y_ax_u))).sum(axis=1) y_grads_u = T.grad(lb_u.sum(), self.y_params) # Loss - regularizing with weight priors p(theta|N(0,1)) and clipping gradients y_weight_priors = 0.0 for p in self.y_params: if 'W' not in str(p): continue y_weight_priors += log_normal(p, 0, 1).sum() y_weight_priors_grad = T.grad(y_weight_priors, self.y_params, disconnected_inputs='ignore') xhat_weight_priors = 0.0 for p in self.xhat_params: if 'W' not in str(p): continue xhat_weight_priors += log_normal(p, 0, 1).sum() xhat_weight_priors_grad = T.grad(xhat_weight_priors, self.xhat_params, disconnected_inputs='ignore') n = self.sh_train_x.shape[0].astype(theano.config.floatX) # no. of data_preparation points in train set n_b = n / self.sym_batchsize.astype(theano.config.floatX) # no. of batches in train set y_grads = [T.zeros(p.shape) for p in self.y_params] for i in range(len(y_grads)): y_grads[i] = (y_grads_l[i] + y_grads_u[i]) y_grads[i] *= n_b y_grads[i] += y_weight_priors_grad[i] y_grads[i] /= -n xhat_grads = [T.zeros(p.shape) for p in self.xhat_params] for i in range(len(xhat_grads)): xhat_grads[i] = (xhat_grads_l[i] + xhat_grads_u[i]) xhat_grads[i] *= n_b xhat_grads[i] += xhat_weight_priors_grad[i] xhat_grads[i] /= -n params = self.y_params + self.xhat_params grads = y_grads + xhat_grads # Collect the lower bound and scale it with the weight priors. elbo = ((lb_l.sum() + lb_u.sum()) * n_b + y_weight_priors + xhat_weight_priors) / -n # Avoid vanishing and exploding gradients. clip_grad, max_norm = 1, 5 mgrads = total_norm_constraint(grads, max_norm=max_norm) mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads] sym_beta1 = T.scalar('beta1') sym_beta2 = T.scalar('beta2') updates = adam(mgrads, params, self.sym_lr, sym_beta1, sym_beta2) ### Compile training function ### x_batch_l = self.sh_train_x[self.batch_slice][:self.sym_bs_l] x_batch_u = self.sh_train_x[self.batch_slice][self.sym_bs_l:] t_batch_l = self.sh_train_t[self.batch_slice][:self.sym_bs_l] if self.x_dist == 'bernoulli': # Sample bernoulli input. x_batch_u = self._srng.binomial(size=x_batch_u.shape, n=1, p=x_batch_u, dtype=theano.config.floatX) x_batch_l = self._srng.binomial(size=x_batch_l.shape, n=1, p=x_batch_l, dtype=theano.config.floatX) givens = {self.sym_x_l: x_batch_l, self.sym_x_u: x_batch_u, self.sym_t_l: t_batch_l} inputs = [self.sym_index, self.sym_batchsize, self.sym_bs_l, self.sym_beta, self.sym_lr, sym_beta1, sym_beta2, self.sym_samples] f_train = theano.function(inputs=inputs, outputs=[elbo], givens=givens, updates=updates) # Default training args. Note that these can be changed during or prior to training. self.train_args['inputs']['batchsize'] = 200 self.train_args['inputs']['batchsize_labeled'] = 100 self.train_args['inputs']['beta'] = 1200. self.train_args['inputs']['learningrate'] = 3e-4 self.train_args['inputs']['beta1'] = 0.9 self.train_args['inputs']['beta2'] = 0.999 self.train_args['inputs']['samples'] = 1 self.train_args['outputs']['lb'] = '%0.4f' ### Compile testing function ### class_err_test = self._classification_error(self.sym_x_l, self.sym_t_l) givens = {self.sym_x_l: self.sh_test_x, self.sym_t_l: self.sh_test_t} f_test = theano.function(inputs=[self.sym_samples], outputs=[class_err_test], givens=givens) # Testing args. Note that these can be changed during or prior to training. 
self.test_args['inputs']['samples'] = 1 self.test_args['outputs']['err'] = '%0.2f%%' ### Compile validation function ### f_validate = None if validation_set is not None: class_err_valid = self._classification_error(self.sym_x_l, self.sym_t_l) givens = {self.sym_x_l: self.sh_valid_x, self.sym_t_l: self.sh_valid_t} inputs = [self.sym_samples] f_validate = theano.function(inputs=[self.sym_samples], outputs=[class_err_valid], givens=givens) # Default validation args. Note that these can be changed during or prior to training. self.validate_args['inputs']['samples'] = 1 self.validate_args['outputs']['err'] = '%0.2f%%' return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
def build_model(self, train_set_unlabeled, train_set_labeled, test_set, validation_set=None): """ Build the auxiliary deep generative model from the initialized hyperparameters. Define the lower bound term and compile it into a training function. :param train_set_unlabeled: Unlabeled train set containing variables x, t. :param train_set_labeled: Unlabeled train set containing variables x, t. :param test_set: Test set containing variables x, t. :param validation_set: Validation set containing variables x, t. :return: train, test, validation function and dicts of arguments. """ super(CSDGM, self).build_model(train_set_unlabeled, test_set, validation_set) sh_train_x_l = theano.shared(np.asarray(train_set_labeled[0], dtype=theano.config.floatX), borrow=True) sh_train_t_l = theano.shared(np.asarray(train_set_labeled[1], dtype=theano.config.floatX), borrow=True) n = self.sh_train_x.shape[0].astype( theano.config.floatX) # no. of data points n_l = sh_train_x_l.shape[0].astype( theano.config.floatX) # no. of labeled data points # Define the layers for the density estimation used in the lower bound. l_log_qa = GaussianLogDensityLayer(self.l_qa, self.l_qa_mu, self.l_qa_logvar) l_log_qz = GaussianLogDensityLayer(self.l_qz, self.l_qz_mu, self.l_qz_logvar) l_log_qy = MultinomialLogDensityLayer(self.l_qy, self.l_y_in, eps=1e-8) l_log_pz = StandardNormalLogDensityLayer(self.l_qz) l_log_pa = GaussianLogDensityLayer(self.l_qa, self.l_pa_mu, self.l_pa_logvar) l_x_in = ReshapeLayer(self.l_x_in, (-1, self.n_l * self.n_c)) l_px = DimshuffleLayer(self.l_px, (0, 3, 1, 2, 4)) l_px = ReshapeLayer(l_px, (-1, self.sym_samples, 1, self.n_c)) if self.x_dist == 'bernoulli': l_log_px = BernoulliLogDensityLayer(self.l_px, self.l_x_in) elif self.x_dist == 'multinomial': l_log_px = MultinomialLogDensityLayer(l_px, l_x_in) l_log_px = ReshapeLayer(l_log_px, (-1, self.n_l, 1, 1, 1)) l_log_px = MeanLayer(l_log_px, axis=1) elif self.x_dist == 'gaussian': l_px_mu = ReshapeLayer( DimshuffleLayer(self.l_px_mu, (0, 2, 3, 1, 4)), (-1, self.sym_samples, 1, self.n_l * self.n_c)) l_px_logvar = ReshapeLayer( DimshuffleLayer(self.l_px_logvar, (0, 2, 3, 1, 4)), (-1, self.sym_samples, 1, self.n_l * self.n_c)) l_log_px = GaussianLogDensityLayer(l_x_in, l_px_mu, l_px_logvar) def lower_bound(log_pa, log_qa, log_pz, log_qz, log_py, log_px): lb = log_px + log_py + (log_pz + log_pa - log_qa - log_qz) * (1.1 - self.sym_warmup) return lb # Lower bound for labeled data out_layers = [ l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px, l_log_qy ] inputs = {self.l_x_in: self.sym_x_l, self.l_y_in: self.sym_t_l} out = get_output(out_layers, inputs, batch_norm_update_averages=False, batch_norm_use_averages=False) log_pa_l, log_pz_l, log_qa_x_l, log_qz_axy_l, log_px_zy_l, log_qy_ax_l = out # Prior p(y) expecting that all classes are evenly distributed py_l = softmax(T.zeros((self.sym_x_l.shape[0], self.n_y))) log_py_l = -categorical_crossentropy(py_l, self.sym_t_l).reshape( (-1, 1)).dimshuffle((0, 'x', 'x', 1)) lb_l = lower_bound(log_pa_l, log_qa_x_l, log_pz_l, log_qz_axy_l, log_py_l, log_px_zy_l) lb_l = lb_l.mean(axis=(1, 2)) # Mean over the sampling dimensions log_qy_ax_l *= ( self.sym_beta * (n / n_l) ) # Scale the supervised cross entropy with the alpha constant lb_l += log_qy_ax_l.mean(axis=( 1, 2 )) # Collect the lower bound term and mean over sampling dimensions # Lower bound for unlabeled data bs_u = self.sym_x_u.shape[0] # For the integrating out approach, we repeat the input matrix x, and construct a target (bs * n_y) x n_y # Example of input and 
target matrix for a 3 class problem and batch_size=2. 2D tensors of the form # x_repeat t_repeat # [[x[0,0], x[0,1], ..., x[0,n_x]] [[1, 0, 0] # [x[1,0], x[1,1], ..., x[1,n_x]] [1, 0, 0] # [x[0,0], x[0,1], ..., x[0,n_x]] [0, 1, 0] # [x[1,0], x[1,1], ..., x[1,n_x]] [0, 1, 0] # [x[0,0], x[0,1], ..., x[0,n_x]] [0, 0, 1] # [x[1,0], x[1,1], ..., x[1,n_x]]] [0, 0, 1]] t_eye = T.eye(self.n_y, k=0) t_u = t_eye.reshape((self.n_y, 1, self.n_y)).repeat(bs_u, axis=1).reshape( (-1, self.n_y)) x_u = self.sym_x_u.reshape( (1, bs_u, self.n_l, self.n_c)).repeat(self.n_y, axis=0).reshape( (-1, self.n_l, self.n_c)) # Since the expectation of var a is outside the integration we calculate E_q(a|x) first a_x_u = get_output(self.l_qa, self.sym_x_u, batch_norm_update_averages=True, batch_norm_use_averages=False) a_x_u_rep = a_x_u.reshape( (1, bs_u * self.sym_samples, self.n_a)).repeat(self.n_y, axis=0).reshape( (-1, self.n_a)) out_layers = [l_log_pa, l_log_pz, l_log_qa, l_log_qz, l_log_px] inputs = {self.l_x_in: x_u, self.l_y_in: t_u, self.l_a_in: a_x_u_rep} out = get_output(out_layers, inputs, batch_norm_update_averages=False, batch_norm_use_averages=False) log_pa_u, log_pz_u, log_qa_x_u, log_qz_axy_u, log_px_zy_u = out # Prior p(y) expecting that all classes are evenly distributed py_u = softmax(T.zeros((bs_u * self.n_y, self.n_y))) log_py_u = -categorical_crossentropy(py_u, t_u).reshape( (-1, 1)).dimshuffle((0, 'x', 'x', 1)) lb_u = lower_bound(log_pa_u, log_qa_x_u, log_pz_u, log_qz_axy_u, log_py_u, log_px_zy_u) lb_u = lb_u.reshape( (self.n_y, 1, 1, bs_u)).transpose(3, 1, 2, 0).mean(axis=(1, 2)) inputs = { self.l_x_in: self.sym_x_u, self.l_a_in: a_x_u.reshape((-1, self.n_a)) } y_u = get_output(self.l_qy, inputs, batch_norm_update_averages=True, batch_norm_use_averages=False).mean(axis=(1, 2)) y_u += 1e-8 # Ensure that we get no NANs when calculating the entropy y_u /= T.sum(y_u, axis=1, keepdims=True) lb_u = (y_u * (lb_u - T.log(y_u))).sum(axis=1) # Regularizing with weight priors p(theta|N(0,1)), collecting and clipping gradients weight_priors = 0.0 for p in self.trainable_model_params: if 'W' not in str(p): continue weight_priors += log_normal(p, 0, 1).sum() # Collect the lower bound and scale it with the weight priors. elbo = ((lb_l.mean() + lb_u.mean()) * n + weight_priors) / -n lb_labeled = -lb_l.mean() lb_unlabeled = -lb_u.mean() log_px = log_px_zy_l.mean() + log_px_zy_u.mean() log_pz = log_pz_l.mean() + log_pz_u.mean() log_qz = log_qz_axy_l.mean() + log_qz_axy_u.mean() log_pa = log_pa_l.mean() + log_pa_u.mean() log_qa = log_qa_x_l.mean() + log_qa_x_u.mean() grads_collect = T.grad(elbo, self.trainable_model_params) params_collect = self.trainable_model_params sym_beta1 = T.scalar('beta1') sym_beta2 = T.scalar('beta2') clip_grad, max_norm = 1, 5 mgrads = total_norm_constraint(grads_collect, max_norm=max_norm) mgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads] updates = adam(mgrads, params_collect, self.sym_lr, sym_beta1, sym_beta2) # Training function indices = self._srng.choice(size=[self.sym_bs_l], a=sh_train_x_l.shape[0], replace=False) x_batch_l = sh_train_x_l[indices] t_batch_l = sh_train_t_l[indices] x_batch_u = self.sh_train_x[self.batch_slice] if self.x_dist == 'bernoulli': # Sample bernoulli input. 
x_batch_u = self._srng.binomial(size=x_batch_u.shape, n=1, p=x_batch_u, dtype=theano.config.floatX) x_batch_l = self._srng.binomial(size=x_batch_l.shape, n=1, p=x_batch_l, dtype=theano.config.floatX) givens = { self.sym_x_l: x_batch_l, self.sym_x_u: x_batch_u, self.sym_t_l: t_batch_l } inputs = [ self.sym_index, self.sym_batchsize, self.sym_bs_l, self.sym_beta, self.sym_lr, sym_beta1, sym_beta2, self.sym_samples, self.sym_warmup ] outputs = [ elbo, lb_labeled, lb_unlabeled, log_px, log_pz, log_qz, log_pa, log_qa ] f_train = theano.function(inputs=inputs, outputs=outputs, givens=givens, updates=updates) # Default training args. Note that these can be changed during or prior to training. self.train_args['inputs']['batchsize_unlabeled'] = 100 self.train_args['inputs']['batchsize_labeled'] = 100 self.train_args['inputs']['beta'] = 0.1 self.train_args['inputs']['learningrate'] = 3e-4 self.train_args['inputs']['beta1'] = 0.9 self.train_args['inputs']['beta2'] = 0.999 self.train_args['inputs']['samples'] = 1 self.train_args['inputs']['warmup'] = 0.1 self.train_args['outputs']['lb'] = '%0.3f' self.train_args['outputs']['lb-l'] = '%0.3f' self.train_args['outputs']['lb-u'] = '%0.3f' self.train_args['outputs']['px'] = '%0.3f' self.train_args['outputs']['pz'] = '%0.3f' self.train_args['outputs']['qz'] = '%0.3f' self.train_args['outputs']['pa'] = '%0.3f' self.train_args['outputs']['qa'] = '%0.3f' # Validation and test function y = get_output(self.l_qy, self.sym_x_l, deterministic=True).mean(axis=(1, 2)) class_err = (1. - categorical_accuracy(y, self.sym_t_l).mean()) * 100 givens = {self.sym_x_l: self.sh_test_x, self.sym_t_l: self.sh_test_t} f_test = theano.function(inputs=[self.sym_samples], outputs=[class_err], givens=givens) # Test args. Note that these can be changed during or prior to training. self.test_args['inputs']['samples'] = 1 self.test_args['outputs']['test'] = '%0.2f%%' f_validate = None if validation_set is not None: givens = { self.sym_x_l: self.sh_valid_x, self.sym_t_l: self.sh_valid_t } f_validate = theano.function(inputs=[self.sym_samples], outputs=[class_err], givens=givens) # Default validation args. Note that these can be changed during or prior to training. self.validate_args['inputs']['samples'] = 1 self.validate_args['outputs']['validation'] = '%0.2f%%' return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
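The prior p(y) used in both bounds is a softmax over a zero matrix, i.e. a uniform distribution over the n_y classes, so the -categorical_crossentropy term contributes a constant -log(n_y) per example. A small NumPy check of that identity (illustrative only):

import numpy as np

n_y = 5
py = np.exp(np.zeros(n_y)) / np.exp(np.zeros(n_y)).sum()   # softmax of zeros -> uniform 1/n_y
t = np.eye(n_y)[2]                                          # an arbitrary one-hot target
ce = -(t * np.log(py)).sum()                                # categorical cross-entropy vs the uniform prior
assert np.isclose(-ce, -np.log(n_y))                        # log p(y) = -log(n_y) regardless of the target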
def __init__(self, n_x, n_a, n_z, n_y, a_hidden, z_hidden, xhat_hidden, y_hidden, trans_func=rectify, x_dist='bernoulli'): """ Initialize an auxiliary deep generative model consisting of discriminative classifier q(y|a,x), generative model P p(xhat|z,y), inference model Q q(a|x) and q(z|x,y). All weights are initialized using the Bengio and Glorot (2010) initialization scheme. :param n_x: Number of inputs. :param n_a: Number of auxiliary. :param n_z: Number of latent. :param n_y: Number of classes. :param a_hidden: List of number of deterministic hidden q(a|x). :param z_hidden: List of number of deterministic hidden q(z|x,y). :param xhat_hidden: List of number of deterministic hidden p(xhat|z,y). :param y_hidden: List of number of deterministic hidden q(y|a,x). :param trans_func: The transfer function used in the deterministic layers. :param x_dist: The x distribution, 'bernoulli' or 'gaussian'. """ super(ADGMSSL, self).__init__(n_x, a_hidden + z_hidden + xhat_hidden, n_a + n_z, trans_func) self.y_hidden = y_hidden self.x_dist = x_dist self.n_y = n_y self.n_x = n_x self.n_a = n_a self.n_z = n_z self._srng = RandomStreams() self.sym_beta = T.scalar('beta') # symbolic upscaling of the discriminative term. self.sym_x_l = T.matrix('x') # symbolic labeled inputs self.sym_t_l = T.matrix('t') # symbolic labeled targets self.sym_x_u = T.matrix('x') # symbolic unlabeled inputs self.sym_bs_l = T.iscalar('bs_l') # symbolic number of labeled data_preparation points in batch self.sym_samples = T.iscalar('samples') # symbolic number of Monte Carlo samples self.sym_y = T.matrix('y') self.sym_z = T.matrix('z') ### Input layers ### l_x_in = InputLayer((None, n_x)) l_y_in = InputLayer((None, n_y)) ### Auxiliary q(a|x) ### l_a_x = l_x_in for hid in a_hidden: l_a_x = DenseLayer(l_a_x, hid, init.GlorotNormal('relu'), init.Normal(1e-3), self.transf) l_a_x_mu = DenseLayer(l_a_x, n_a, init.GlorotNormal(), init.Normal(1e-3), None) l_a_x_logvar = DenseLayer(l_a_x, n_a, init.GlorotNormal(), init.Normal(1e-3), None) l_a_x = SampleLayer(l_a_x_mu, l_a_x_logvar, eq_samples=self.sym_samples) # Reshape all layers to align them for multiple samples in the lower bound calculation. l_a_x_reshaped = ReshapeLayer(l_a_x, (-1, self.sym_samples, 1, n_a)) l_a_x_mu_reshaped = DimshuffleLayer(l_a_x_mu, (0, 'x', 'x', 1)) l_a_x_logvar_reshaped = DimshuffleLayer(l_a_x_logvar, (0, 'x', 'x', 1)) ### Classifier q(y|a,x) ### # Concatenate the input x and the output of the auxiliary MLP. l_a_to_y = DenseLayer(l_a_x, y_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None) l_a_to_y = ReshapeLayer(l_a_to_y, (-1, self.sym_samples, 1, y_hidden[0])) l_x_to_y = DenseLayer(l_x_in, y_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None) l_x_to_y = DimshuffleLayer(l_x_to_y, (0, 'x', 'x', 1)) l_y_xa = ReshapeLayer(ElemwiseSumLayer([l_a_to_y, l_x_to_y]), (-1, y_hidden[0])) l_y_xa = NonlinearityLayer(l_y_xa, self.transf) if len(y_hidden) > 1: for hid in y_hidden[1:]: l_y_xa = DenseLayer(l_y_xa, hid, init.GlorotUniform('relu'), init.Normal(1e-3), self.transf) l_y_xa = DenseLayer(l_y_xa, n_y, init.GlorotUniform(), init.Normal(1e-3), softmax) l_y_xa_reshaped = ReshapeLayer(l_y_xa, (-1, self.sym_samples, 1, n_y)) ### Recognition q(z|x,y) ### # Concatenate the input x and y. 
l_x_to_z = DenseLayer(l_x_in, z_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None) l_x_to_z = DimshuffleLayer(l_x_to_z, (0, 'x', 'x', 1)) l_y_to_z = DenseLayer(l_y_in, z_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None) l_y_to_z = DimshuffleLayer(l_y_to_z, (0, 'x', 'x', 1)) l_z_xy = ReshapeLayer(ElemwiseSumLayer([l_x_to_z, l_y_to_z]), [-1, z_hidden[0]]) l_z_xy = NonlinearityLayer(l_z_xy, self.transf) if len(z_hidden) > 1: for hid in z_hidden[1:]: l_z_xy = DenseLayer(l_z_xy, hid, init.GlorotNormal('relu'), init.Normal(1e-3), self.transf) l_z_axy_mu = DenseLayer(l_z_xy, n_z, init.GlorotNormal(), init.Normal(1e-3), None) l_z_axy_logvar = DenseLayer(l_z_xy, n_z, init.GlorotNormal(), init.Normal(1e-3), None) l_z_xy = SampleLayer(l_z_axy_mu, l_z_axy_logvar, eq_samples=self.sym_samples) # Reshape all layers to align them for multiple samples in the lower bound calculation. l_z_axy_mu_reshaped = DimshuffleLayer(l_z_axy_mu, (0, 'x', 'x', 1)) l_z_axy_logvar_reshaped = DimshuffleLayer(l_z_axy_logvar, (0, 'x', 'x', 1)) l_z_axy_reshaped = ReshapeLayer(l_z_xy, (-1, self.sym_samples, 1, n_z)) ### Generative p(xhat|z,y) ### # Concatenate the input x and y. l_y_to_xhat = DenseLayer(l_y_in, xhat_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None) l_y_to_xhat = DimshuffleLayer(l_y_to_xhat, (0, 'x', 'x', 1)) l_z_to_xhat = DenseLayer(l_z_xy, xhat_hidden[0], init.GlorotNormal('relu'), init.Normal(1e-3), None) l_z_to_xhat = ReshapeLayer(l_z_to_xhat, (-1, self.sym_samples, 1, xhat_hidden[0])) l_xhat_zy = ReshapeLayer(ElemwiseSumLayer([l_z_to_xhat, l_y_to_xhat]), [-1, xhat_hidden[0]]) l_xhat_zy = NonlinearityLayer(l_xhat_zy, self.transf) if len(xhat_hidden) > 1: for hid in xhat_hidden[1:]: l_xhat_zy = DenseLayer(l_xhat_zy, hid, init.GlorotNormal('relu'), init.Normal(1e-3), self.transf) if x_dist == 'bernoulli': l_xhat_zy_mu_reshaped = None l_xhat_zy_logvar_reshaped = None l_xhat_zy = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(), init.Normal(1e-3), sigmoid) elif x_dist == 'multinomial': l_xhat_zy_mu_reshaped = None l_xhat_zy_logvar_reshaped = None l_xhat_zy = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(), init.Normal(1e-3), softmax) elif x_dist == 'gaussian': l_xhat_zy_mu = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(), init.Normal(1e-3), None) l_xhat_zy_logvar = DenseLayer(l_xhat_zy, n_x, init.GlorotNormal(), init.Normal(1e-3), None) l_xhat_zy = SampleLayer(l_xhat_zy_mu, l_xhat_zy_logvar, eq_samples=1) l_xhat_zy_mu_reshaped = ReshapeLayer(l_xhat_zy_mu, (-1, self.sym_samples, 1, n_x)) l_xhat_zy_logvar_reshaped = ReshapeLayer(l_xhat_zy_logvar, (-1, self.sym_samples, 1, n_x)) l_xhat_zy_reshaped = ReshapeLayer(l_xhat_zy, (-1, self.sym_samples, 1, n_x)) ### Various class variables ### self.l_x_in = l_x_in self.l_y_in = l_y_in self.l_a_mu = l_a_x_mu_reshaped self.l_a_logvar = l_a_x_logvar_reshaped self.l_a = l_a_x_reshaped self.l_z_mu = l_z_axy_mu_reshaped self.l_z_logvar = l_z_axy_logvar_reshaped self.l_z = l_z_axy_reshaped self.l_y = l_y_xa_reshaped self.l_xhat_mu = l_xhat_zy_mu_reshaped self.l_xhat_logvar = l_xhat_zy_logvar_reshaped self.l_xhat = l_xhat_zy_reshaped self.model_params = get_all_params([self.l_xhat, self.l_y]) ### Calculate networks shapes for documentation ### self.qa_shapes = self.get_model_shape(get_all_params(l_a_x)) self.qy_shapes = self.get_model_shape(get_all_params(l_y_xa))[len(self.qa_shapes) - 1:] self.qz_shapes = self.get_model_shape(get_all_params(l_z_xy)) self.px_shapes = self.get_model_shape(get_all_params(l_xhat_zy))[(len(self.qz_shapes) - 1):] ### 
Predefined functions for generating xhat and y ### inputs = {l_z_xy: self.sym_z, self.l_y_in: self.sym_y} outputs = get_output(self.l_xhat, inputs, deterministic=True).mean(axis=(1, 2)) inputs = [self.sym_z, self.sym_y, self.sym_samples] self.f_xhat = theano.function(inputs, outputs) inputs = [self.sym_x_l, self.sym_samples] outputs = get_output(self.l_y, self.sym_x_l, deterministic=True).mean(axis=(1, 2)) self.f_y = theano.function(inputs, outputs) self.y_params = get_all_params(self.l_y, trainable=True)[(len(a_hidden) + 2) * 2::] self.xhat_params = get_all_params(self.l_xhat, trainable=True)
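Throughout the class, stochastic layers are reshaped to a common (batch, samples, 1, features) layout so that Monte Carlo expectations reduce to a mean over axes 1 and 2, as in the .mean(axis=(1, 2)) calls above. A small NumPy sketch of that convention with illustrative sizes:

import numpy as np

batch, samples, n = 4, 3, 8
flat = np.random.rand(batch * samples, n)        # samples stacked into the batch axis, as SampleLayer emits them
aligned = flat.reshape(batch, samples, 1, n)     # (batch, eq_samples, iw_samples=1, features)
expectation = aligned.mean(axis=(1, 2))          # per-example MC estimate -> (batch, n)
assert expectation.shape == (batch, n)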
def __init__(self, n_c, px_hid, enc_rnn=256, dec_rnn=256, n_l=50, nonlinearity=rectify, batchnorm=False, seed=1234): """ Weights are initialized using the Bengio and Glorot (2010) initialization scheme. :param n_c: Number of inputs. :param px_hid: List of number of deterministic hidden p(a|z,y) & p(x|z,y). :param nonlinearity: The transfer function used in the deterministic layers. :param x_dist: The x distribution, 'bernoulli', 'multinomial', or 'gaussian'. :param batchnorm: Boolean value for batch normalization. :param seed: The random seed. """ super(RAE, self).__init__(n_c, px_hid, enc_rnn, nonlinearity) self.n_x = n_c self.max_seq_length = n_l self.batchnorm = batchnorm self._srng = RandomStreams(seed) # Decide Glorot initializaiton of weights. init_w = 1e-3 hid_w = "" if nonlinearity == rectify or nonlinearity == softplus: hid_w = "relu" # Define symbolic variables for theano functions. self.sym_x = T.tensor3('x') # inputs # Assist methods for collecting the layers def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal): dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w), nonlinearity=None) if batchnorm: dense = BatchNormLayer(dense) return NonlinearityLayer(dense, self.transf) def lstm_layer(input, nunits, return_final, backwards=False, name='LSTM'): ingate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), b=init.Constant(0.0)) forgetgate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), b=init.Constant(5.0)) cell = Gate( W_cell=None, nonlinearity=T.tanh, W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), ) outgate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01), b=init.Constant(0.0)) lstm = LSTMLayer(input, num_units=nunits, backwards=backwards, peepholes=False, ingate=ingate, forgetgate=forgetgate, cell=cell, outgate=outgate, name=name, only_return_final=return_final) rec = RecurrentLayer(input, num_units=nunits, W_in_to_hid=init.GlorotNormal('relu'), W_hid_to_hid=init.GlorotNormal('relu'), backwards=backwards, nonlinearity=rectify, only_return_final=return_final, name=name) return lstm # RNN encoder implementation l_x_in = InputLayer((None, None, n_c)) l_enc_forward = lstm_layer(l_x_in, enc_rnn, return_final=True, backwards=False, name='enc_forward') l_enc_backward = lstm_layer(l_x_in, enc_rnn, return_final=True, backwards=True, name='enc_backward') l_enc_concat = ConcatLayer([l_enc_forward, l_enc_backward], axis=-1) l_enc = dense_layer(l_enc_concat, enc_rnn) # RNN decoder implementation l_dec_repeat = RepeatLayer(l_enc, n=n_l) l_dec_forward = lstm_layer(l_dec_repeat, dec_rnn, return_final=False, backwards=False, name='dec_forward') l_dec_backward = lstm_layer(l_dec_repeat, dec_rnn, return_final=False, backwards=True, name='dec_backward') l_dec_concat = ConcatLayer([l_dec_forward, l_dec_backward], axis=-1) l_dec = ReshapeLayer(l_dec_concat, (-1, 2 * dec_rnn)) l_dec = dense_layer(l_dec, dec_rnn) # Generative p(x_hat|x) l_px = l_dec for hid in px_hid: l_px = dense_layer(l_px, hid) # Output self.l_enc = l_enc l_px = DenseLayer(l_px, n_c, nonlinearity=None) self.l_px = ReshapeLayer(l_px, (-1, n_l, n_c)) self.l_x_in = l_x_in inputs = {l_x_in: self.sym_x} outputs = get_output(self.l_px, inputs, deterministic=True) self.f_px = theano.function([self.sym_x], outputs, on_unused_input='warn') # Define model parameters self.encoder_params = get_all_param_values(self.l_enc) self.model_params = get_all_params(self.l_px) self.trainable_model_params = get_all_params(self.l_px, trainable=True)
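A hedged round-trip sketch for the recurrent autoencoder: f_px maps a batch of sequences to their deterministic reconstructions of length n_l. The constructor arguments and shapes below are assumptions for illustration, and the module's dependencies are assumed importable:

import numpy as np

# Hypothetical setup: sequences of n_l=50 timesteps with n_c=12 features.
rae = RAE(n_c=12, px_hid=[64], enc_rnn=32, dec_rnn=32, n_l=50)

x = np.random.rand(8, 50, 12).astype('float32')
x_hat = rae.f_px(x)            # -> (8, 50, 12) reconstruction
assert x_hat.shape == x.shape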
def __init__(self, n_x, n_z, qz_hid, px_hid, filters, seq_length=50, nonlinearity=rectify,
             px_nonlinearity=None, x_dist='linear', batchnorm=False, seed=1234):
    """
    Initialize a convolutional variational autoencoder with a CNN encoder and an
    InverseLayer-based deconvolutional decoder.
    Weights are initialized using the Glorot and Bengio (2010) initialization scheme.

    :param n_x: Number of input features per time step.
    :param n_z: Number of latent units.
    :param qz_hid: List of number of deterministic hidden units in q(z|x).
    :param px_hid: List of number of deterministic hidden units in p(x|z).
    :param filters: List of (num_filters, stride, pool) tuples, one per convolutional block.
    :param seq_length: Length of the input sequences.
    :param nonlinearity: The transfer function used in the deterministic layers.
    :param px_nonlinearity: The output nonlinearity of the Gaussian x distribution.
    :param x_dist: The x distribution, 'bernoulli', 'multinomial', 'gaussian', or 'linear'.
    :param batchnorm: Boolean value for batch normalization.
    :param seed: The random seed.
    """
    super(CVAE, self).__init__(n_x, qz_hid + px_hid, n_z, nonlinearity)
    self.x_dist = x_dist
    self.n_x = n_x
    self.seq_length = seq_length
    self.n_z = n_z
    self.batchnorm = batchnorm
    self._srng = RandomStreams(seed)

    # Pool layer cache
    pool_layers = []

    # Decide Glorot initialization of weights.
    init_w = 1e-3
    hid_w = ""
    if nonlinearity == rectify or nonlinearity == softplus:
        hid_w = "relu"

    # Define symbolic variables for theano functions.
    self.sym_x = T.tensor3('x')  # inputs
    self.sym_z = T.matrix('z')
    self.sym_samples = T.iscalar('samples')  # MC samples

    # Assist methods for collecting the layers
    def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal):
        dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w), None)
        if batchnorm:
            dense = bn(dense)
        return NonlinearityLayer(dense, self.transf)

    def stochastic_layer(layer_in, n, samples, nonlin=None):
        mu = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
        logvar = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
        return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar

    def conv_layer(layer_in, filter, stride=(1, 1), pool=1, name='conv'):
        l_conv = Conv2DLayer(layer_in, num_filters=filter, filter_size=(3, 1), stride=stride,
                             pad='full', name=name)
        if pool > 1:
            l_conv = MaxPool2DLayer(l_conv, pool_size=(pool, 1))
        # Cache one layer per convolutional block so the decoder's reversed indexing lines up.
        pool_layers.append(l_conv)
        return l_conv

    # Reshape input
    l_x_in = InputLayer((None, seq_length, n_x), name='Input')
    l_x_in_reshp = ReshapeLayer(l_x_in, (-1, 1, seq_length, n_x))
    print("l_x_in_reshp", l_x_in_reshp.output_shape)

    # CNN encoder implementation
    l_conv_enc = l_x_in_reshp
    for filter, stride, pool in filters:
        l_conv_enc = conv_layer(l_conv_enc, filter, stride, pool)
        print("l_conv_enc", l_conv_enc.output_shape)

    # Pool along last 2 axes
    l_global_pool_enc = GlobalPoolLayer(l_conv_enc)
    l_enc = dense_layer(l_global_pool_enc, n_z)
    print("l_enc", l_enc.output_shape)

    # Recognition q(z|x)
    l_qz = l_enc
    for hid in qz_hid:
        l_qz = dense_layer(l_qz, hid)
    l_qz, l_qz_mu, l_qz_logvar = stochastic_layer(l_qz, n_z, self.sym_samples)
    print("l_qz", l_qz.output_shape)

    # Inverse pooling
    l_global_depool = InverseLayer(l_qz, l_global_pool_enc)
    print("l_global_depool", l_global_depool.output_shape)

    # Reverse pool layer order
    pool_layers = pool_layers[::-1]

    # Decode
    l_deconv = l_global_depool
    for idx, filter in enumerate(filters[::-1]):
        filter, stride, pool = filter
        if pool > 1:
            l_deconv = InverseLayer(l_deconv, pool_layers[idx])
        l_deconv = Conv2DLayer(l_deconv, num_filters=filter, filter_size=(3, 1),
                               stride=(stride, 1), W=init.GlorotNormal('relu'))
        print("l_deconv", l_deconv.output_shape)

    # The last conv layer should give us back the input shape
    l_dec = Conv2DLayer(l_deconv, num_filters=1, filter_size=(3, 1), pad='same', nonlinearity=None)
    print("l_dec", l_dec.output_shape)

    # Flatten the first two dimensions
    l_dec = ReshapeLayer(l_dec, (-1, n_x))

    l_px = l_dec
    if x_dist == 'bernoulli':
        l_px = DenseLayer(l_px, n_x, init.GlorotNormal(), init.Normal(init_w), sigmoid)
    elif x_dist == 'multinomial':
        l_px = DenseLayer(l_px, n_x, init.GlorotNormal(), init.Normal(init_w), softmax)
    elif x_dist == 'gaussian':
        l_px, l_px_mu, l_px_logvar = stochastic_layer(l_px, n_x, self.sym_samples, px_nonlinearity)
    elif x_dist == 'linear':
        l_px = DenseLayer(l_px, n_x, nonlinearity=None)

    # Reshape all the model layers to have the same size
    self.l_x_in = l_x_in
    self.l_qz = ReshapeLayer(l_qz, (-1, self.sym_samples, 1, n_z))
    self.l_qz_mu = DimshuffleLayer(l_qz_mu, (0, 'x', 'x', 1))
    self.l_qz_logvar = DimshuffleLayer(l_qz_logvar, (0, 'x', 'x', 1))

    self.l_px = DimshuffleLayer(
        ReshapeLayer(l_px, (-1, seq_length, self.sym_samples, 1, n_x)), (0, 2, 3, 1, 4))
    self.l_px_mu = DimshuffleLayer(
        ReshapeLayer(l_px_mu, (-1, seq_length, self.sym_samples, 1, n_x)), (0, 2, 3, 1, 4)) \
        if x_dist == "gaussian" else None
    self.l_px_logvar = DimshuffleLayer(
        ReshapeLayer(l_px_logvar, (-1, seq_length, self.sym_samples, 1, n_x)), (0, 2, 3, 1, 4)) \
        if x_dist == "gaussian" else None

    # Predefined functions
    inputs = {self.l_x_in: self.sym_x}
    outputs = get_output(l_qz, inputs, deterministic=True)
    self.f_qz = theano.function([self.sym_x, self.sym_samples], outputs)

    inputs = {l_qz: self.sym_z}
    outputs = get_output(self.l_px, inputs, deterministic=True).mean(axis=(1, 2))
    self.f_px = theano.function([self.sym_z, self.sym_samples], outputs)

    # The mean and variance functions are only defined for the Gaussian likelihood,
    # since l_px_mu and l_px_logvar are None otherwise.
    if x_dist == 'gaussian':
        outputs = get_output(self.l_px_mu, inputs, deterministic=True).mean(axis=(1, 2))
        self.f_mu = theano.function([self.sym_z, self.sym_samples], outputs)

        outputs = get_output(self.l_px_logvar, inputs, deterministic=True).mean(axis=(1, 2))
        self.f_var = theano.function([self.sym_z, self.sym_samples], outputs)

    # Define model parameters
    self.model_params = get_all_params([self.l_px])
    self.trainable_model_params = get_all_params([self.l_px], trainable=True)
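# --- Usage sketch (illustrative addition, not part of the original source) ---
# A minimal example of how the CVAE above might be exercised: encode a batch
# with the predefined f_qz, then decode the sampled latent codes with f_px.
# The filter specification and data shapes are assumptions, chosen so that the
# InverseLayer-based unpooling lines up: every conv block uses the same filter
# count, n_z matches that count, and a single MC sample is drawn.
import numpy as np
import theano

cvae = CVAE(n_x=3, n_z=32, qz_hid=[64], px_hid=[64],
            filters=[(32, 1, 2), (32, 1, 2)],  # (num_filters, stride, pool) per conv block
            seq_length=50, x_dist='linear')
x_batch = np.random.rand(8, 50, 3).astype(theano.config.floatX)  # (batch, seq_length, features)
z_sample = cvae.f_qz(x_batch, 1)  # one sample from q(z|x), shape (8, 32)
x_recon = cvae.f_px(z_sample, 1)  # decoded reconstruction, shape (8, 50, 3)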
def run_adgmssl_mnist():
    """
    Evaluate an auxiliary deep generative model on the MNIST dataset with 100 evenly distributed labels.
    """
    # Load the mnist supervised dataset for evaluation.
    (train_x, train_t), (test_x, test_t), (valid_x, valid_t) = mnist.load_supervised(
        filter_std=0.0, train_valid_combine=True)

    # Initialize the auxiliary deep generative model.
    model = ADGMSSL(n_x=train_x.shape[-1], n_a=100, n_z=100, n_y=10, a_hidden=[500, 500],
                    z_hidden=[500, 500], xhat_hidden=[500, 500], y_hidden=[500, 500],
                    trans_func=rectify, x_dist='bernoulli')

    model_id = 20151209002003  # Insert the trained model id here.
    model.load_model(model_id)  # Load trained model. See configurations in the log file.

    # Evaluate the test error of the ADGM.
    mean_evals = model.get_output(test_x, 100)  # 100 MC samples to get a good estimate for the auxiliary unit.
    t_class = np.argmax(test_t, axis=1)
    y_class = np.argmax(mean_evals, axis=1)
    class_err = np.sum(y_class != t_class) / 100.  # Misclassification rate in percent for the 10,000-image test set.
    print("test set 100-samples: %0.2f%%." % class_err)

    # Evaluate the active units in the auxiliary and latent distribution.
    f_a_mu_logvar = theano.function(
        [model.sym_x_l], get_output([model.l_a_mu, model.l_a_logvar], model.sym_x_l))
    q_a_mu, q_a_logvar = f_a_mu_logvar(test_x)
    log_pa = -0.5 * (np.log(2 * np.pi) + (q_a_mu ** 2 + np.exp(q_a_logvar)))
    log_qa_x = -0.5 * (np.log(2 * np.pi) + 1 + q_a_logvar)
    diff_pa_qa_x = (log_pa - log_qa_x).mean(axis=(1, 2))
    mean_diff_pa_qa_x = np.abs(np.mean(diff_pa_qa_x, axis=0))

    inputs = {model.l_x_in: model.sym_x_l, model.l_y_in: model.sym_t_l}
    f_z_mu_logvar = theano.function(
        [model.sym_x_l, model.sym_t_l], get_output([model.l_z_mu, model.l_z_logvar], inputs))
    q_z_mu, q_z_logvar = f_z_mu_logvar(test_x, test_t)
    log_pz = -0.5 * (np.log(2 * np.pi) + (q_z_mu ** 2 + np.exp(q_z_logvar)))
    log_qz_x = -0.5 * (np.log(2 * np.pi) + 1 + q_z_logvar)
    diff_pz_qz_x = (log_pz - log_qz_x).mean(axis=(1, 2))
    mean_diff_pz_qz_x = np.abs(np.mean(diff_pz_qz_x, axis=0))

    plt.figure()
    plt.subplot(111, axisbg='white')
    plt.plot(sorted(mean_diff_pa_qa_x)[::-1], color="#c0392b", label=r"$\log \frac{p(a_i)}{q(a_i|x)}$")
    plt.plot(sorted(mean_diff_pz_qz_x)[::-1], color="#9b59b6", label=r"$\log \frac{p(z_i)}{q(z_i|x)}$")
    plt.grid(color='0.9', linestyle='dashed', axis="y")
    plt.xlabel("stochastic units")
    plt.ylabel(r"$\log \frac{p(\cdot)}{q(\cdot)}$")
    plt.ylim((0, 2.7))
    plt.legend()
    plt.savefig("output/diff.png", format="png")

    # Sample 100 latent codes (drawn uniformly here) with fixed class y and generate xhat.
    table_size = 10
    samples = 1
    z = np.random.random_sample((table_size ** 2, 100))
    y = np.eye(10, k=0).reshape(10, 1, 10).repeat(10, axis=1).reshape((-1, 10))  # 10 copies of each one-hot class.
    xhat = model.f_xhat(z, y, samples)

    # Arrange the generated digits in a table_size x table_size image grid.
    plt.figure(figsize=(20, 20), dpi=300)
    i = 0
    img_out = np.zeros((28 * table_size, 28 * table_size))
    for row in range(table_size):
        for col in range(table_size):
            xa, xb = row * 28, (row + 1) * 28
            ya, yb = col * 28, (col + 1) * 28
            im = np.reshape(xhat[i], (28, 28))
            img_out[xa:xb, ya:yb] = im
            i += 1
    plt.matshow(img_out, cmap=plt.cm.binary)
    plt.xticks(np.array([]))
    plt.yticks(np.array([]))
    plt.savefig("output/mnist.png", format="png")
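# --- Diagnostic sketch (illustrative addition, not part of the original source) ---
# The "active units" plot above compares, per stochastic unit, the expected prior
# density E_q[log p(u_i)] with the expected posterior density E_q[log q(u_i|x)],
# both in closed form for diagonal Gaussians. The helper below repackages that
# computation for a generic (mu, logvar) pair; the function name and the 4-D
# shape convention are assumptions.
import numpy as np

def active_unit_scores(q_mu, q_logvar):
    """Per-unit |mean over data of E_q[log p(u)] - E_q[log q(u|x)]|.

    q_mu, q_logvar: arrays of shape (n_data, eq_samples, iw_samples, n_units),
    as produced by the mu/logvar outputs used in run_adgmssl_mnist.
    """
    log_p = -0.5 * (np.log(2 * np.pi) + q_mu ** 2 + np.exp(q_logvar))  # E_q[log N(u; 0, 1)]
    log_q = -0.5 * (np.log(2 * np.pi) + 1 + q_logvar)                  # E_q[log q(u|x)]
    diff = (log_p - log_q).mean(axis=(1, 2))                           # average the MC sample axes
    return np.abs(diff.mean(axis=0))                                   # average over data, per unit

# e.g. active_unit_scores(q_a_mu, q_a_logvar) reproduces mean_diff_pa_qa_x above.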