def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal):
    # Dense -> (optional) batch-norm -> nonlinearity building block.
    # NOTE(review): at module level the names `hid_w`, `init_w`, `batchnorm`
    # and `self` are unresolved — this looks like a stray copy of the
    # identically named closure defined inside each model's __init__ below.
    # Confirm whether this top-level version can be removed.
    dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w), None)
    if batchnorm:
        dense = BatchNormLayer(dense)
    return NonlinearityLayer(dense, self.transf)
def __init__(self, n_x, n_a, n_z, n_y, qa_hid, qz_hid, qy_hid, px_hid, pa_hid,
             nonlinearity=rectify, px_nonlinearity=None, x_dist='bernoulli',
             batchnorm=False, seed=1234):
    """
    Initialize a skip deep generative model consisting of
    discriminative classifier q(y|a,x),
    generative model P p(a|z,y) and p(x|a,z,y),
    inference model Q q(a|x) and q(z|a,x,y).
    Weights are initialized using the Bengio and Glorot (2010) initialization scheme.
    :param n_x: Number of inputs.
    :param n_a: Number of auxiliary.
    :param n_z: Number of latent.
    :param n_y: Number of classes.
    :param qa_hid: List of number of deterministic hidden q(a|x).
    :param qz_hid: List of number of deterministic hidden q(z|a,x,y).
    :param qy_hid: List of number of deterministic hidden q(y|a,x).
    :param px_hid: List of number of deterministic hidden p(a|z,y) & p(x|z,y).
    :param pa_hid: List of number of deterministic hidden p(a|z,y).
    :param nonlinearity: The transfer function used in the deterministic layers.
    :param px_nonlinearity: Output nonlinearity of the p(x|.) mean/logvar layers
        (only used when x_dist == 'gaussian').
    :param x_dist: The x distribution, 'bernoulli', 'multinomial', or 'gaussian'.
    :param batchnorm: Boolean value for batch normalization.
    :param seed: The random seed.
    """
    super(SDGMSSL, self).__init__(n_x, qz_hid + px_hid, n_a + n_z, nonlinearity)
    self.x_dist = x_dist
    self.n_y = n_y
    self.n_x = n_x
    self.n_a = n_a
    self.n_z = n_z
    self.batchnorm = batchnorm
    self._srng = RandomStreams(seed)

    # Decide Glorot initialization of weights: use the 'relu' gain for
    # rectifier-like transfer functions, default gain otherwise.
    init_w = 1e-3
    hid_w = ""
    if nonlinearity == rectify or nonlinearity == softplus:
        hid_w = "relu"

    # Define symbolic variables for theano functions.
    self.sym_beta = T.scalar('beta')  # scaling constant beta
    self.sym_x_l = T.matrix('x')  # labeled inputs
    self.sym_t_l = T.matrix('t')  # labeled targets
    self.sym_x_u = T.matrix('x')  # unlabeled inputs
    self.sym_bs_l = T.iscalar('bs_l')  # number of labeled data
    self.sym_samples = T.iscalar('samples')  # MC samples
    self.sym_z = T.matrix('z')  # latent variable z
    self.sym_a = T.matrix('a')  # auxiliary variable a

    # Assist methods for collecting the layers
    def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal):
        # Dense -> (optional) batch-norm -> nonlinearity building block.
        dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w), None)
        if batchnorm:
            dense = BatchNormLayer(dense)
        return NonlinearityLayer(dense, self.transf)

    def stochastic_layer(layer_in, n, samples, nonlin=None):
        # Diagonal-Gaussian layer: returns (sample layer, mu layer, logvar layer).
        mu = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
        logvar = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
        return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar

    # Input layers
    l_x_in = InputLayer((None, n_x))
    l_y_in = InputLayer((None, n_y))

    # Auxiliary q(a|x)
    l_qa_x = l_x_in
    for hid in qa_hid:
        l_qa_x = dense_layer(l_qa_x, hid)
    l_qa_x, l_qa_x_mu, l_qa_x_logvar = stochastic_layer(l_qa_x, n_a, self.sym_samples)

    # Classifier q(y|a,x): project a and x separately, broadcast-sum them,
    # then apply the nonlinearity (the Reshape/Dimshuffle pairs align the
    # MC-sample axis of a with the sample-free x branch).
    l_qa_to_qy = DenseLayer(l_qa_x, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qa_to_qy = ReshapeLayer(l_qa_to_qy, (-1, self.sym_samples, 1, qy_hid[0]))
    l_x_to_qy = DenseLayer(l_x_in, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_x_to_qy = DimshuffleLayer(l_x_to_qy, (0, 'x', 'x', 1))
    l_qy_xa = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qy, l_x_to_qy]), (-1, qy_hid[0]))
    if batchnorm:
        l_qy_xa = BatchNormLayer(l_qy_xa)
    l_qy_xa = NonlinearityLayer(l_qy_xa, self.transf)
    if len(qy_hid) > 1:
        for hid in qy_hid[1:]:
            l_qy_xa = dense_layer(l_qy_xa, hid)
    l_qy_xa = DenseLayer(l_qy_xa, n_y, init.GlorotNormal(), init.Normal(init_w), softmax)

    # Recognition q(z|x,a,y): same broadcast-sum pattern over a, x and y.
    l_qa_to_qz = DenseLayer(l_qa_x, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qa_to_qz = ReshapeLayer(l_qa_to_qz, (-1, self.sym_samples, 1, qz_hid[0]))
    l_x_to_qz = DenseLayer(l_x_in, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_x_to_qz = DimshuffleLayer(l_x_to_qz, (0, 'x', 'x', 1))
    l_y_to_qz = DenseLayer(l_y_in, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_y_to_qz = DimshuffleLayer(l_y_to_qz, (0, 'x', 'x', 1))
    l_qz_axy = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qz, l_x_to_qz, l_y_to_qz]), (-1, qz_hid[0]))
    if batchnorm:
        l_qz_axy = BatchNormLayer(l_qz_axy)
    l_qz_axy = NonlinearityLayer(l_qz_axy, self.transf)
    if len(qz_hid) > 1:
        for hid in qz_hid[1:]:
            l_qz_axy = dense_layer(l_qz_axy, hid)
    # eq_samples=1 here: MC sampling is already carried by the q(a|x) samples.
    l_qz_axy, l_qz_axy_mu, l_qz_axy_logvar = stochastic_layer(l_qz_axy, n_z, 1)

    # Generative p(a|z,y)
    l_y_to_pa = DenseLayer(l_y_in, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_y_to_pa = DimshuffleLayer(l_y_to_pa, (0, 'x', 'x', 1))
    l_qz_to_pa = DenseLayer(l_qz_axy, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qz_to_pa = ReshapeLayer(l_qz_to_pa, (-1, self.sym_samples, 1, pa_hid[0]))
    l_pa_zy = ReshapeLayer(ElemwiseSumLayer([l_qz_to_pa, l_y_to_pa]), [-1, pa_hid[0]])
    if batchnorm:
        l_pa_zy = BatchNormLayer(l_pa_zy)
    l_pa_zy = NonlinearityLayer(l_pa_zy, self.transf)
    if len(pa_hid) > 1:
        for hid in pa_hid[1:]:
            l_pa_zy = dense_layer(l_pa_zy, hid)
    l_pa_zy, l_pa_zy_mu, l_pa_zy_logvar = stochastic_layer(l_pa_zy, n_a, 1)

    # Generative p(x|a,z,y)
    l_qa_to_px = DenseLayer(l_qa_x, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qa_to_px = ReshapeLayer(l_qa_to_px, (-1, self.sym_samples, 1, px_hid[0]))
    l_y_to_px = DenseLayer(l_y_in, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_y_to_px = DimshuffleLayer(l_y_to_px, (0, 'x', 'x', 1))
    l_qz_to_px = DenseLayer(l_qz_axy, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qz_to_px = ReshapeLayer(l_qz_to_px, (-1, self.sym_samples, 1, px_hid[0]))
    l_px_azy = ReshapeLayer(ElemwiseSumLayer([l_qa_to_px, l_qz_to_px, l_y_to_px]), [-1, px_hid[0]])
    if batchnorm:
        l_px_azy = BatchNormLayer(l_px_azy)
    l_px_azy = NonlinearityLayer(l_px_azy, self.transf)
    if len(px_hid) > 1:
        for hid in px_hid[1:]:
            l_px_azy = dense_layer(l_px_azy, hid)
    # Output distribution head; note no 'else' branch — an unknown x_dist
    # silently leaves l_px_azy without an output layer.
    if x_dist == 'bernoulli':
        l_px_azy = DenseLayer(l_px_azy, n_x, init.GlorotNormal(), init.Normal(init_w), sigmoid)
    elif x_dist == 'multinomial':
        l_px_azy = DenseLayer(l_px_azy, n_x, init.GlorotNormal(), init.Normal(init_w), softmax)
    elif x_dist == 'gaussian':
        l_px_azy, l_px_zy_mu, l_px_zy_logvar = stochastic_layer(l_px_azy, n_x, 1, px_nonlinearity)

    # Reshape all the model layers to have the same size
    # (batch, eq_samples, iw_samples, features).
    self.l_x_in = l_x_in
    self.l_y_in = l_y_in
    self.l_a_in = l_qa_x
    self.l_qa = ReshapeLayer(l_qa_x, (-1, self.sym_samples, 1, n_a))
    self.l_qa_mu = DimshuffleLayer(l_qa_x_mu, (0, 'x', 'x', 1))
    self.l_qa_logvar = DimshuffleLayer(l_qa_x_logvar, (0, 'x', 'x', 1))
    self.l_qz = ReshapeLayer(l_qz_axy, (-1, self.sym_samples, 1, n_z))
    self.l_qz_mu = ReshapeLayer(l_qz_axy_mu, (-1, self.sym_samples, 1, n_z))
    self.l_qz_logvar = ReshapeLayer(l_qz_axy_logvar, (-1, self.sym_samples, 1, n_z))
    self.l_qy = ReshapeLayer(l_qy_xa, (-1, self.sym_samples, 1, n_y))
    self.l_pa = ReshapeLayer(l_pa_zy, (-1, self.sym_samples, 1, n_a))
    self.l_pa_mu = ReshapeLayer(l_pa_zy_mu, (-1, self.sym_samples, 1, n_a))
    self.l_pa_logvar = ReshapeLayer(l_pa_zy_logvar, (-1, self.sym_samples, 1, n_a))
    self.l_px = ReshapeLayer(l_px_azy, (-1, self.sym_samples, 1, n_x))
    # mu/logvar layers only exist for the Gaussian output distribution.
    self.l_px_mu = ReshapeLayer(l_px_zy_mu, (-1, self.sym_samples, 1, n_x)) if x_dist == "gaussian" else None
    self.l_px_logvar = ReshapeLayer(l_px_zy_logvar, (-1, self.sym_samples, 1, n_x)) if x_dist == "gaussian" else None

    # Predefined functions (deterministic forward passes, averaged over the
    # eq_samples and iw_samples axes where .mean(axis=(1, 2)) is applied).
    inputs = [self.sym_x_l, self.sym_samples]
    outputs = get_output(self.l_qy, self.sym_x_l, deterministic=True).mean(axis=(1, 2))
    self.f_qy = theano.function(inputs, outputs)

    inputs = [self.sym_x_l, self.sym_samples]
    outputs = get_output(self.l_qa, self.sym_x_l, deterministic=True).mean(axis=(1, 2))
    self.f_qa = theano.function(inputs, outputs)

    inputs = {l_qz_axy: self.sym_z, l_y_in: self.sym_t_l}
    outputs = get_output(self.l_pa, inputs, deterministic=True)
    self.f_pa = theano.function([self.sym_z, self.sym_t_l, self.sym_samples], outputs)

    inputs = {l_qa_x: self.sym_a, l_qz_axy: self.sym_z, l_y_in: self.sym_t_l}
    outputs = get_output(self.l_px, inputs, deterministic=True)
    self.f_px = theano.function([self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs)

    # Define model parameters
    self.model_params = get_all_params([self.l_qy, self.l_pa, self.l_px])
    self.trainable_model_params = get_all_params([self.l_qy, self.l_pa, self.l_px], trainable=True)
def __init__(self, n_c, n_l, n_a, n_z, n_y, qa_hid, qz_hid, qy_hid, px_hid, pa_hid,
             filters, nonlinearity=rectify, px_nonlinearity=None, x_dist='bernoulli',
             batchnorm=False, seed=1234):
    """
    Initialize a skip deep generative model consisting of
    discriminative classifier q(y|a,x),
    generative model P p(a|z,y) and p(x|a,z,y),
    inference model Q q(a|x) and q(z|a,x,y).
    Weights are initialized using the Bengio and Glorot (2010) initialization scheme.
    :param n_c: Number of input channels.
    :param n_l: Number of lengths.
    :param n_a: Number of auxiliary.
    :param n_z: Number of latent.
    :param n_y: Number of classes.
    :param qa_hid: List of number of deterministic hidden q(a|x).
    :param qz_hid: List of number of deterministic hidden q(z|a,x,y).
    :param qy_hid: List of number of deterministic hidden q(y|a,x).
    :param px_hid: List of number of deterministic hidden p(a|z,y) & p(x|z,y).
    :param pa_hid: List of number of deterministic hidden p(a|z,y).
    :param filters: List of (num_filters, stride, pool) tuples for the CNN encoder.
    :param nonlinearity: The transfer function used in the deterministic layers.
    :param px_nonlinearity: Output nonlinearity of the p(x|.) mean/logvar layers
        (only used when x_dist == 'gaussian').
    :param x_dist: The x distribution, 'bernoulli', 'multinomial', or 'gaussian'.
    :param batchnorm: Boolean value for batch normalization.
    :param seed: The random seed.
    """
    super(CSDGM, self).__init__(n_c, qz_hid + px_hid, n_a + n_z, nonlinearity)
    self.x_dist = x_dist
    self.n_y = n_y
    self.n_c = n_c
    self.n_l = n_l
    self.n_a = n_a
    self.n_z = n_z
    self.batchnorm = batchnorm
    self._srng = RandomStreams(seed)

    # Decide Glorot initialization of weights: 'relu' gain for rectifier-like
    # transfer functions, default gain otherwise.
    init_w = 1e-3
    hid_w = ""
    if nonlinearity == rectify or nonlinearity == softplus:
        hid_w = "relu"

    # Encoder pooling layers, recorded so the decoder can invert them.
    pool_layers = []

    # Define symbolic variables for theano functions.
    self.sym_beta = T.scalar('beta')  # scaling constant beta
    self.sym_x_l = T.tensor3('x')  # labeled inputs
    self.sym_t_l = T.matrix('t')  # labeled targets
    self.sym_x_u = T.tensor3('x')  # unlabeled inputs
    self.sym_bs_l = T.iscalar('bs_l')  # number of labeled data
    self.sym_samples = T.iscalar('samples')  # MC samples
    self.sym_z = T.matrix('z')  # latent variable z
    self.sym_a = T.matrix('a')  # auxiliary variable a
    self.sym_warmup = T.fscalar('warmup')  # warmup to scale KL term

    # Assist methods for collecting the layers
    def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal):
        # Dense -> (optional) batch-norm -> nonlinearity building block.
        dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w), None)
        if batchnorm:
            dense = BatchNormLayer(dense)
        return NonlinearityLayer(dense, self.transf)

    def stochastic_layer(layer_in, n, samples, nonlin=None):
        # Diagonal-Gaussian layer: returns (sample layer, mu layer, logvar layer).
        mu = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
        logvar = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
        return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar

    def conv_layer(layer_in, filter, stride=(1, 1), pool=1, name='conv',
                   dist_w=init.GlorotNormal, dist_b=init.Normal):
        # Conv (3x1 kernel, 'full' padding) with optional max-pooling.
        l_conv = Conv2DLayer(layer_in, num_filters=filter, filter_size=(3, 1), stride=stride,
                             pad='full', W=dist_w(hid_w), b=dist_b(init_w), name=name)
        if pool > 1:
            l_conv = MaxPool2DLayer(l_conv, pool_size=(pool, 1))
        # Record one entry per encoder block so the decoder can index
        # pool_layers[idx] in lock-step with the reversed filter list.
        # NOTE(review): reconstructed placement (outside the `if`) — confirm
        # against the original; inside the `if` would misalign the decoder's
        # pool_layers[idx] lookup whenever some blocks have pool == 1.
        pool_layers.append(l_conv)
        return l_conv

    # Input layers
    l_y_in = InputLayer((None, n_y))
    l_x_in = InputLayer((None, n_l, n_c), name='Input')

    # Reshape input to (batch, 1 channel, length, features) for Conv2DLayer.
    l_x_in_reshp = ReshapeLayer(l_x_in, (-1, 1, n_l, n_c))
    print("l_x_in_reshp", l_x_in_reshp.output_shape)

    # CNN encoder implementation
    l_conv_enc = l_x_in_reshp
    for filter, stride, pool in filters:
        l_conv_enc = conv_layer(l_conv_enc, filter, stride, pool)
        print("l_conv_enc", l_conv_enc.output_shape)

    # Pool along last 2 axes
    l_global_pool_enc = GlobalPoolLayer(l_conv_enc, pool_function=T.mean)
    l_enc = dense_layer(l_global_pool_enc, n_z)
    print("l_enc", l_enc.output_shape)

    # Auxiliary q(a|x)
    l_qa_x = l_enc
    for hid in qa_hid:
        l_qa_x = dense_layer(l_qa_x, hid)
    l_qa_x, l_qa_x_mu, l_qa_x_logvar = stochastic_layer(
        l_qa_x, n_a, self.sym_samples)

    # Classifier q(y|a,x): broadcast-sum of the a and x projections
    # (Reshape/Dimshuffle align the MC-sample axis of a with the encoder output).
    l_qa_to_qy = DenseLayer(l_qa_x, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qa_to_qy = ReshapeLayer(l_qa_to_qy, (-1, self.sym_samples, 1, qy_hid[0]))
    l_x_to_qy = DenseLayer(l_enc, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_x_to_qy = DimshuffleLayer(l_x_to_qy, (0, 'x', 'x', 1))
    l_qy_xa = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qy, l_x_to_qy]), (-1, qy_hid[0]))
    if batchnorm:
        l_qy_xa = BatchNormLayer(l_qy_xa)
    l_qy_xa = NonlinearityLayer(l_qy_xa, self.transf)
    if len(qy_hid) > 1:
        for hid in qy_hid[1:]:
            l_qy_xa = dense_layer(l_qy_xa, hid)
    l_qy_xa = DenseLayer(l_qy_xa, n_y, init.GlorotNormal(), init.Normal(init_w), softmax)

    # Recognition q(z|x,a,y)
    l_qa_to_qz = DenseLayer(l_qa_x, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qa_to_qz = ReshapeLayer(l_qa_to_qz, (-1, self.sym_samples, 1, qz_hid[0]))
    l_x_to_qz = DenseLayer(l_enc, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_x_to_qz = DimshuffleLayer(l_x_to_qz, (0, 'x', 'x', 1))
    l_y_to_qz = DenseLayer(l_y_in, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_y_to_qz = DimshuffleLayer(l_y_to_qz, (0, 'x', 'x', 1))
    l_qz_axy = ReshapeLayer(
        ElemwiseSumLayer([l_qa_to_qz, l_x_to_qz, l_y_to_qz]), (-1, qz_hid[0]))
    if batchnorm:
        l_qz_axy = BatchNormLayer(l_qz_axy)
    l_qz_axy = NonlinearityLayer(l_qz_axy, self.transf)
    if len(qz_hid) > 1:
        for hid in qz_hid[1:]:
            l_qz_axy = dense_layer(l_qz_axy, hid)
    # eq_samples=1: MC sampling is already carried by the q(a|x) samples.
    l_qz_axy, l_qz_axy_mu, l_qz_axy_logvar = stochastic_layer(
        l_qz_axy, n_z, 1)

    # Generative p(a|z,y)
    l_y_to_pa = DenseLayer(l_y_in, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_y_to_pa = DimshuffleLayer(l_y_to_pa, (0, 'x', 'x', 1))
    l_qz_to_pa = DenseLayer(l_qz_axy, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qz_to_pa = ReshapeLayer(l_qz_to_pa, (-1, self.sym_samples, 1, pa_hid[0]))
    l_pa_zy = ReshapeLayer(ElemwiseSumLayer([l_qz_to_pa, l_y_to_pa]), [-1, pa_hid[0]])
    if batchnorm:
        l_pa_zy = BatchNormLayer(l_pa_zy)
    l_pa_zy = NonlinearityLayer(l_pa_zy, self.transf)
    if len(pa_hid) > 1:
        for hid in pa_hid[1:]:
            l_pa_zy = dense_layer(l_pa_zy, hid)
    l_pa_zy, l_pa_zy_mu, l_pa_zy_logvar = stochastic_layer(l_pa_zy, n_a, 1)

    # Generative p(x|a,z,y)
    l_qa_to_px = DenseLayer(l_qa_x, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qa_to_px = ReshapeLayer(l_qa_to_px, (-1, self.sym_samples, 1, px_hid[0]))
    l_y_to_px = DenseLayer(l_y_in, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_y_to_px = DimshuffleLayer(l_y_to_px, (0, 'x', 'x', 1))
    l_qz_to_px = DenseLayer(l_qz_axy, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qz_to_px = ReshapeLayer(l_qz_to_px, (-1, self.sym_samples, 1, px_hid[0]))
    l_px_azy = ReshapeLayer(
        ElemwiseSumLayer([l_qa_to_px, l_qz_to_px, l_y_to_px]), [-1, px_hid[0]])
    if batchnorm:
        l_px_azy = BatchNormLayer(l_px_azy)
    l_px_azy = NonlinearityLayer(l_px_azy, self.transf)
    # Note that px_hid[0] has to be equal to the number filters in the first convolution.
    # Otherwise add a dense layers here.

    # Inverse pooling
    l_global_depool = InverseLayer(l_px_azy, l_global_pool_enc)
    print("l_global_depool", l_global_depool.output_shape)

    # Reverse pool layer order
    pool_layers = pool_layers[::-1]

    # Decode: walk the filter spec backwards, un-pooling then deconvolving.
    l_deconv = l_global_depool
    for idx, filter in enumerate(filters[::-1]):
        filter, stride, pool = filter
        if pool > 1:
            l_deconv = InverseLayer(l_deconv, pool_layers[idx])
        l_deconv = Conv2DLayer(l_deconv, num_filters=filter, filter_size=(3, 1),
                               stride=(stride, 1), W=init.GlorotNormal('relu'))
        print("l_deconv", l_deconv.output_shape)

    # The last l_conv layer should give us the input shape
    l_px_azy = Conv2DLayer(l_deconv, num_filters=1, filter_size=(3, 1), pad='same',
                           nonlinearity=None)
    print("l_dec", l_px_azy.output_shape)

    # Flatten first two dimensions
    l_px_azy = ReshapeLayer(l_px_azy, (-1, n_c))

    # Output distribution head; unknown x_dist values fall through silently.
    if x_dist == 'bernoulli':
        l_px_azy = DenseLayer(l_px_azy, n_c, init.GlorotNormal(), init.Normal(init_w), sigmoid)
    elif x_dist == 'multinomial':
        l_px_azy = DenseLayer(l_px_azy, n_c, init.GlorotNormal(), init.Normal(init_w), softmax)
    elif x_dist == 'gaussian':
        l_px_azy, l_px_zy_mu, l_px_zy_logvar = stochastic_layer(
            l_px_azy, n_c, self.sym_samples, px_nonlinearity)
    elif x_dist == 'linear':
        l_px_azy = DenseLayer(l_px_azy, n_c, nonlinearity=None)

    # Reshape all the model layers to have the same size
    # (batch, eq_samples, iw_samples, features).
    self.l_x_in = l_x_in
    self.l_y_in = l_y_in
    self.l_a_in = l_qa_x
    self.l_qa = ReshapeLayer(l_qa_x, (-1, self.sym_samples, 1, n_a))
    self.l_qa_mu = DimshuffleLayer(l_qa_x_mu, (0, 'x', 'x', 1))
    self.l_qa_logvar = DimshuffleLayer(l_qa_x_logvar, (0, 'x', 'x', 1))
    self.l_qz = ReshapeLayer(l_qz_axy, (-1, self.sym_samples, 1, n_z))
    self.l_qz_mu = ReshapeLayer(l_qz_axy_mu, (-1, self.sym_samples, 1, n_z))
    self.l_qz_logvar = ReshapeLayer(l_qz_axy_logvar, (-1, self.sym_samples, 1, n_z))
    self.l_qy = ReshapeLayer(l_qy_xa, (-1, self.sym_samples, 1, n_y))
    self.l_pa = ReshapeLayer(l_pa_zy, (-1, self.sym_samples, 1, n_a))
    self.l_pa_mu = ReshapeLayer(l_pa_zy_mu, (-1, self.sym_samples, 1, n_a))
    self.l_pa_logvar = ReshapeLayer(l_pa_zy_logvar, (-1, self.sym_samples, 1, n_a))
    # Here we assume that we pass (batch size * segment length, number of features) to the
    # sample layer from which we then get
    # (batch size * segment length, samples, IW samples, features)
    self.l_px = ReshapeLayer(l_px_azy, (-1, n_l, self.sym_samples, 1, n_c))
    # mu/logvar layers only exist for the Gaussian output distribution.
    self.l_px_mu = ReshapeLayer(l_px_zy_mu, (-1, n_l, self.sym_samples, 1, n_c)) \
        if x_dist == "gaussian" else None
    self.l_px_logvar = ReshapeLayer(l_px_zy_logvar, (-1, n_l, self.sym_samples, 1, n_c)) \
        if x_dist == "gaussian" else None

    # Predefined functions (deterministic passes; .mean(...) averages over
    # the MC/IW sample axes).
    inputs = {l_x_in: self.sym_x_l}
    outputs = get_output(self.l_qy, inputs, deterministic=True).mean(axis=(1, 2))
    self.f_qy = theano.function([self.sym_x_l, self.sym_samples], outputs)

    outputs = get_output(l_qa_x, inputs, deterministic=True)
    self.f_qa = theano.function([self.sym_x_l, self.sym_samples], outputs)

    inputs = {l_x_in: self.sym_x_l, l_y_in: self.sym_t_l}
    outputs = get_output(l_qz_axy, inputs, deterministic=True)
    self.f_qz = theano.function(
        [self.sym_x_l, self.sym_t_l, self.sym_samples], outputs)

    inputs = {l_qz_axy: self.sym_z, l_y_in: self.sym_t_l}
    outputs = get_output(self.l_pa, inputs, deterministic=True).mean(axis=(1, 2))
    self.f_pa = theano.function(
        [self.sym_z, self.sym_t_l, self.sym_samples], outputs)

    inputs = {
        l_x_in: self.sym_x_l,
        l_qa_x: self.sym_a,
        l_qz_axy: self.sym_z,
        l_y_in: self.sym_t_l
    }
    outputs = get_output(self.l_px, inputs, deterministic=True).mean(axis=(2, 3))
    self.f_px = theano.function([
        self.sym_x_l, self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples
    ], outputs)

    # NOTE(review): f_mu/f_var are compiled unconditionally but l_px_mu /
    # l_px_logvar are None unless x_dist == 'gaussian' — confirm these are
    # only called for Gaussian models.
    outputs = get_output(self.l_px_mu, inputs, deterministic=True).mean(axis=(2, 3))
    self.f_mu = theano.function([
        self.sym_x_l, self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples
    ], outputs)

    outputs = get_output(self.l_px_logvar, inputs, deterministic=True).mean(axis=(2, 3))
    self.f_var = theano.function([
        self.sym_x_l, self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples
    ], outputs)

    # Define model parameters
    self.model_params = get_all_params([self.l_qy, self.l_pa, self.l_px])
    self.trainable_model_params = get_all_params(
        [self.l_qy, self.l_pa, self.l_px], trainable=True)
def __init__(self, n_l, n_c, n_a, n_z, n_y, qa_hid, qz_hid, qy_hid, px_hid, pa_hid,
             enc_rnn=256, dec_rnn=256, nonlinearity=rectify, px_nonlinearity=None,
             x_dist='bernoulli', batchnorm=False, seed=1234):
    """
    Initialize a skip deep generative model consisting of
    discriminative classifier q(y|a,x),
    generative model P p(a|z,y) and p(x|a,z,y),
    inference model Q q(a|x) and q(z|a,x,y).
    Weights are initialized using the Bengio and Glorot (2010) initialization scheme.
    :param n_l: Number of lengths (sequence length of the input).
    :param n_c: Number of inputs.
    :param n_a: Number of auxiliary.
    :param n_z: Number of latent.
    :param n_y: Number of classes.
    :param qa_hid: List of number of deterministic hidden q(a|x).
    :param qz_hid: List of number of deterministic hidden q(z|a,x,y).
    :param qy_hid: List of number of deterministic hidden q(y|a,x).
    :param px_hid: List of number of deterministic hidden p(a|z,y) & p(x|z,y).
    :param pa_hid: List of number of deterministic hidden p(a|z,y).
    :param enc_rnn: Number of units in each direction of the LSTM encoder.
    :param dec_rnn: Number of units in each direction of the LSTM decoder.
    :param nonlinearity: The transfer function used in the deterministic layers.
    :param px_nonlinearity: Output nonlinearity of the p(x|.) mean/logvar layers
        (only used when x_dist == 'gaussian').
    :param x_dist: The x distribution, 'bernoulli', 'multinomial', or 'gaussian'.
    :param batchnorm: Boolean value for batch normalization.
    :param seed: The random seed.
    """
    super(RSDGM, self).__init__(n_c, qz_hid + px_hid, n_a + n_z, nonlinearity)
    self.x_dist = x_dist
    self.n_y = n_y
    self.n_c = n_c
    self.n_a = n_a
    self.n_z = n_z
    self.n_l = n_l
    self.batchnorm = batchnorm
    self._srng = RandomStreams(seed)

    # Decide Glorot initialization of weights: 'relu' gain for rectifier-like
    # transfer functions, default gain otherwise.
    init_w = 1e-3
    hid_w = ""
    if nonlinearity == rectify or nonlinearity == softplus:
        hid_w = "relu"

    # Define symbolic variables for theano functions.
    self.sym_beta = T.scalar('beta')  # scaling constant beta
    self.sym_x_l = T.tensor3('x_l')  # labeled inputs
    self.sym_t_l = T.matrix('t')  # labeled targets
    self.sym_x_u = T.tensor3('x_u')  # unlabeled inputs
    self.sym_bs_l = T.iscalar('bs_l')  # number of labeled data
    self.sym_samples = T.iscalar('samples')  # MC samples
    self.sym_z = T.matrix('z')  # latent variable z
    self.sym_a = T.matrix('a')  # auxiliary variable a
    self.sym_warmup = T.fscalar('warmup')  # warmup to dampen KL term

    # Assist methods for collecting the layers
    def dense_layer(layer_in, n, dist_w=init.GlorotNormal, dist_b=init.Normal):
        # Dense -> (optional) batch-norm -> nonlinearity building block.
        dense = DenseLayer(layer_in, n, dist_w(hid_w), dist_b(init_w), None)
        if batchnorm:
            dense = BatchNormLayer(dense)
        return NonlinearityLayer(dense, self.transf)

    def stochastic_layer(layer_in, n, samples, nonlin=None):
        # Diagonal-Gaussian layer: returns (sample layer, mu layer, logvar layer).
        mu = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
        logvar = DenseLayer(layer_in, n, init.Normal(init_w), init.Normal(init_w), nonlin)
        return SampleLayer(mu, logvar, eq_samples=samples, iw_samples=1), mu, logvar

    def lstm_layer(input, nunits, return_final, backwards=False, name='LSTM'):
        # LSTM with no peepholes and a high forget-gate bias (5.0), a common
        # trick to encourage remembering early in training.
        ingate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01),
                      b=init.Constant(0.0))
        forgetgate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01),
                          b=init.Constant(5.0))
        cell = Gate(
            W_cell=None,
            nonlinearity=T.tanh,
            W_in=init.Uniform(0.01),
            W_hid=init.Uniform(0.01),
        )
        outgate = Gate(W_in=init.Uniform(0.01), W_hid=init.Uniform(0.01),
                       b=init.Constant(0.0))
        lstm = LSTMLayer(input, num_units=nunits, backwards=backwards, peepholes=False,
                         ingate=ingate, forgetgate=forgetgate, cell=cell, outgate=outgate,
                         name=name, only_return_final=return_final)
        # NOTE(review): `rec` is built but never used or returned — it still
        # allocates parameters; confirm it is leftover experimentation and
        # can be removed.
        rec = RecurrentLayer(input, nunits, W_in_to_hid=init.GlorotNormal('relu'),
                             W_hid_to_hid=init.GlorotNormal('relu'), backwards=backwards,
                             nonlinearity=rectify, only_return_final=return_final, name=name)
        return lstm

    # Input layers
    l_y_in = InputLayer((None, n_y))
    l_x_in = InputLayer((None, n_l, n_c))

    # RNN encoder implementation: bidirectional LSTM, final states concatenated.
    l_enc_forward = lstm_layer(l_x_in, enc_rnn, return_final=True, backwards=False,
                               name='enc_forward')
    l_enc_backward = lstm_layer(l_x_in, enc_rnn, return_final=True, backwards=True,
                                name='enc_backward')
    l_enc_concat = ConcatLayer([l_enc_forward, l_enc_backward])
    l_enc = dense_layer(l_enc_concat, enc_rnn)

    # Auxiliary q(a|x)
    l_qa_x = l_enc
    for hid in qa_hid:
        l_qa_x = dense_layer(l_qa_x, hid)
    l_qa_x, l_qa_x_mu, l_qa_x_logvar = stochastic_layer(
        l_qa_x, n_a, self.sym_samples)

    # Classifier q(y|a,x): broadcast-sum of the a and x projections
    # (Reshape/Dimshuffle align the MC-sample axis of a with the encoder output).
    l_qa_to_qy = DenseLayer(l_qa_x, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qa_to_qy = ReshapeLayer(l_qa_to_qy, (-1, self.sym_samples, 1, qy_hid[0]))
    l_x_to_qy = DenseLayer(l_enc, qy_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_x_to_qy = DimshuffleLayer(l_x_to_qy, (0, 'x', 'x', 1))
    l_qy_xa = ReshapeLayer(ElemwiseSumLayer([l_qa_to_qy, l_x_to_qy]), (-1, qy_hid[0]))
    if batchnorm:
        l_qy_xa = BatchNormLayer(l_qy_xa)
    l_qy_xa = NonlinearityLayer(l_qy_xa, self.transf)
    if len(qy_hid) > 1:
        for hid in qy_hid[1:]:
            l_qy_xa = dense_layer(l_qy_xa, hid)
    l_qy_xa = DenseLayer(l_qy_xa, n_y, init.GlorotNormal(), init.Normal(init_w), softmax)

    # Recognition q(z|x,a,y)
    l_qa_to_qz = DenseLayer(l_qa_x, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qa_to_qz = ReshapeLayer(l_qa_to_qz, (-1, self.sym_samples, 1, qz_hid[0]))
    l_x_to_qz = DenseLayer(l_enc, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_x_to_qz = DimshuffleLayer(l_x_to_qz, (0, 'x', 'x', 1))
    l_y_to_qz = DenseLayer(l_y_in, qz_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_y_to_qz = DimshuffleLayer(l_y_to_qz, (0, 'x', 'x', 1))
    l_qz_axy = ReshapeLayer(
        ElemwiseSumLayer([l_qa_to_qz, l_x_to_qz, l_y_to_qz]), (-1, qz_hid[0]))
    if batchnorm:
        l_qz_axy = BatchNormLayer(l_qz_axy)
    l_qz_axy = NonlinearityLayer(l_qz_axy, self.transf)
    if len(qz_hid) > 1:
        for hid in qz_hid[1:]:
            l_qz_axy = dense_layer(l_qz_axy, hid)
    # eq_samples=1: MC sampling is already carried by the q(a|x) samples.
    l_qz_axy, l_qz_axy_mu, l_qz_axy_logvar = stochastic_layer(
        l_qz_axy, n_z, 1)

    # Generative p(a|z,y)
    l_y_to_pa = DenseLayer(l_y_in, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_y_to_pa = DimshuffleLayer(l_y_to_pa, (0, 'x', 'x', 1))
    l_qz_to_pa = DenseLayer(l_qz_axy, pa_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qz_to_pa = ReshapeLayer(l_qz_to_pa, (-1, self.sym_samples, 1, pa_hid[0]))
    l_pa_zy = ReshapeLayer(ElemwiseSumLayer([l_qz_to_pa, l_y_to_pa]), [-1, pa_hid[0]])
    if batchnorm:
        l_pa_zy = BatchNormLayer(l_pa_zy)
    l_pa_zy = NonlinearityLayer(l_pa_zy, self.transf)
    if len(pa_hid) > 1:
        for hid in pa_hid[1:]:
            l_pa_zy = dense_layer(l_pa_zy, hid)
    l_pa_zy, l_pa_zy_mu, l_pa_zy_logvar = stochastic_layer(l_pa_zy, n_a, 1)

    # Generative p(x|a,z,y)
    l_qa_to_px = DenseLayer(l_qa_x, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qa_to_px = ReshapeLayer(l_qa_to_px, (-1, self.sym_samples, 1, px_hid[0]))
    l_y_to_px = DenseLayer(l_y_in, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_y_to_px = DimshuffleLayer(l_y_to_px, (0, 'x', 'x', 1))
    l_qz_to_px = DenseLayer(l_qz_axy, px_hid[0], init.GlorotNormal(hid_w), init.Normal(init_w), None)
    l_qz_to_px = ReshapeLayer(l_qz_to_px, (-1, self.sym_samples, 1, px_hid[0]))
    l_px_azy = ReshapeLayer(
        ElemwiseSumLayer([l_qa_to_px, l_qz_to_px, l_y_to_px]), [-1, px_hid[0]])
    if batchnorm:
        l_px_azy = BatchNormLayer(l_px_azy)
    l_px_azy = NonlinearityLayer(l_px_azy, self.transf)

    # RNN decoder implementation: repeat the latent summary n_l times and run
    # a bidirectional LSTM over the repeated sequence.
    l_px_azy_repeat = RepeatLayer(l_px_azy, n=n_l)
    l_dec_forward = lstm_layer(l_px_azy_repeat, dec_rnn, return_final=False,
                               backwards=False, name='dec_forward')
    l_dec_backward = lstm_layer(l_px_azy_repeat, dec_rnn, return_final=False,
                                backwards=True, name='dec_backward')
    l_dec_concat = ConcatLayer([l_dec_forward, l_dec_backward], axis=-1)
    l_dec = ReshapeLayer(l_dec_concat, (-1, 2 * dec_rnn))
    l_dec = dense_layer(l_dec, dec_rnn)

    l_px_azy = l_dec
    if len(px_hid) > 1:
        for hid in px_hid[1:]:
            l_px_azy = dense_layer(l_px_azy, hid)

    # Output distribution head; unknown x_dist values fall through silently.
    if x_dist == 'bernoulli':
        l_px_azy = DenseLayer(l_px_azy, n_c, init.GlorotNormal(), init.Normal(init_w), sigmoid)
    elif x_dist == 'multinomial':
        l_px_azy = DenseLayer(l_px_azy, n_c, init.GlorotNormal(), init.Normal(init_w), softmax)
    elif x_dist == 'gaussian':
        l_px_azy, l_px_zy_mu, l_px_zy_logvar = stochastic_layer(
            l_px_azy, n_c, self.sym_samples, px_nonlinearity)

    # Reshape all the model layers to have the same size
    # (batch, eq_samples, iw_samples, features); l_px keeps the sequence axis.
    self.l_x_in = l_x_in
    self.l_y_in = l_y_in
    self.l_a_in = l_qa_x
    self.l_qa = ReshapeLayer(l_qa_x, (-1, self.sym_samples, 1, n_a))
    self.l_qa_mu = DimshuffleLayer(l_qa_x_mu, (0, 'x', 'x', 1))
    self.l_qa_logvar = DimshuffleLayer(l_qa_x_logvar, (0, 'x', 'x', 1))
    self.l_qz = ReshapeLayer(l_qz_axy, (-1, self.sym_samples, 1, n_z))
    self.l_qz_mu = ReshapeLayer(l_qz_axy_mu, (-1, self.sym_samples, 1, n_z))
    self.l_qz_logvar = ReshapeLayer(l_qz_axy_logvar, (-1, self.sym_samples, 1, n_z))
    self.l_qy = ReshapeLayer(l_qy_xa, (-1, self.sym_samples, 1, n_y))
    self.l_pa = ReshapeLayer(l_pa_zy, (-1, self.sym_samples, 1, n_a))
    self.l_pa_mu = ReshapeLayer(l_pa_zy_mu, (-1, self.sym_samples, 1, n_a))
    self.l_pa_logvar = ReshapeLayer(l_pa_zy_logvar, (-1, self.sym_samples, 1, n_a))
    self.l_px = ReshapeLayer(l_px_azy, (-1, n_l, self.sym_samples, 1, n_c))
    # mu/logvar layers only exist for the Gaussian output distribution.
    self.l_px_mu = ReshapeLayer(l_px_zy_mu, (-1, n_l, self.sym_samples, 1, n_c)) \
        if x_dist == "gaussian" else None
    self.l_px_logvar = ReshapeLayer(l_px_zy_logvar, (-1, n_l, self.sym_samples, 1, n_c)) \
        if x_dist == "gaussian" else None

    # Predefined functions (deterministic passes; .mean(...) averages over
    # the MC/IW sample axes).
    inputs = [self.sym_x_l, self.sym_samples]
    outputs = get_output(self.l_qy, self.sym_x_l, deterministic=True).mean(axis=(1, 2))
    self.f_qy = theano.function(inputs, outputs)

    inputs = [self.sym_x_l, self.sym_samples]
    outputs = get_output(self.l_qa, self.sym_x_l, deterministic=True).mean(axis=(1, 2))
    self.f_qa = theano.function(inputs, outputs)

    inputs = {l_x_in: self.sym_x_l, l_y_in: self.sym_t_l}
    outputs = get_output(l_qz_axy, inputs, deterministic=True)
    self.f_qz = theano.function(
        [self.sym_x_l, self.sym_t_l, self.sym_samples], outputs)

    inputs = {l_qz_axy: self.sym_z, l_y_in: self.sym_t_l}
    outputs = get_output(self.l_pa, inputs, deterministic=True)
    self.f_pa = theano.function(
        [self.sym_z, self.sym_t_l, self.sym_samples], outputs)

    inputs = {
        l_qa_x: self.sym_a,
        l_qz_axy: self.sym_z,
        l_y_in: self.sym_t_l
    }
    outputs = get_output(self.l_px, inputs, deterministic=True).mean(axis=(2, 3))
    self.f_px = theano.function(
        [self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs)

    # NOTE(review): f_mu/f_var are compiled unconditionally but l_px_mu /
    # l_px_logvar are None unless x_dist == 'gaussian' — confirm these are
    # only called for Gaussian models.
    outputs = get_output(self.l_px_mu, inputs, deterministic=True).mean(axis=(2, 3))
    self.f_mu = theano.function(
        [self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs)

    outputs = get_output(self.l_px_logvar, inputs, deterministic=True).mean(axis=(2, 3))
    self.f_var = theano.function(
        [self.sym_a, self.sym_z, self.sym_t_l, self.sym_samples], outputs)

    # Define model parameters
    self.model_params = get_all_params([self.l_qy, self.l_pa, self.l_px])
    self.trainable_model_params = get_all_params(
        [self.l_qy, self.l_pa, self.l_px], trainable=True)